In [None]:
import os
import sys
import gzip

import pandas as pd
import numpy as np

from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder

import tqdm
from joblib import Parallel, delayed
from multiprocessing import Manager

sys.path.append('../..')
from slp.slp_package.slp_functions import create_merged_game_data_df, prepare_data_for_training


In [None]:
# Load the merged game-data frame for the three listed sources.
df = create_merged_game_data_df(
    ['public', 'ranked', 'mango']
)

In [None]:
# Example usage: pick the sources, the row filters and the column used as label,
# then hand everything to prepare_data_for_training.
source_data = ['ranked', 'public']

general_features = dict(
    stage_name=['FOUNTAIN_OF_DREAMS', 'FINAL_DESTINATION', 'BATTLEFIELD', 'YOSHIS_STORY', 'POKEMON_STADIUM'],
    num_players=[2],
    conclusive=[True],
)

player_features = dict(
    # netplay_code=['MANG#0'],
    character_name=['FOX', 'FALCO', 'MARTH', 'CAPTAIN_FALCON', 'SHEIK', 'PEACH', 'JIGGLYPUFF'],
)

# No opposing-player filter is active; the alternatives are kept for reference.
opposing_player_features = dict(
    # character_name=['MARTH'],
    # netplay_code=['KOD#0', 'ZAIN#0'],
)

label_info = dict(
    source=['player'],  # Can be 'general', 'player'
    # feature=['netplay_code'],
    feature=['character_name'],
)

processed_df = prepare_data_for_training(
    source_data,
    general_features,
    player_features,
    opposing_player_features,
    label_info,
)


In [None]:
# Number of games per label.
label_counts = processed_df['labels'].value_counts()

# Two-column table of the counts. Naming the index and the values explicitly
# keeps the column names independent of how the installed pandas version names
# the Series returned by value_counts().
label_counts_df = label_counts.rename_axis('labels').reset_index(name='count')

print(label_counts_df)

In [None]:
# Total number of segments to draw for every label.
num_segments_per_label = 50_000
# Segments are 2 ** segment_length_power frames long.
segment_length_power = 10

In [None]:
# def number_of_segments_per_game(df, segment_length_power, num_segments_per_label):
#     """
    
#     :param df: column 'length' should have 123 less already
#     """
#     segment_length = 2 ** segment_length_power
#     unique_labels = df['labels'].unique()
    
#     # Eventually, I do want to modify the original dataframe.
#     df = df.copy()
#     df = df[df['length'] > segment_length]
#     df['float_num_segments'] = 0.
#     # df['int_num_segments'] = 0 
#     # df['frac_num_segments'] = 0.
    
#     label_info_list = []
    
#     for label in unique_labels:
#         label_indices = df['labels'] == label
#         adjusted_game_length = df.loc[label_indices, 'length'] - segment_length # A segment must start its own length before the end of the game.
#         game_length_sum = adjusted_game_length.sum()
#         shift_estimate = game_length_sum / num_segments_per_label # Idea: Put all the frame data in a (9,-) array, evenly space out segments.
#         # The number of segments we take from each game will be roughly round(adjusted_game_length / shift_estimate).
#         # df['int_num_segments'].sum() - num_segments_per_label =~ (number of games with this label) / 2
#         # If we simply took round(adjusted_game_length / shift_estimate) segments per game, we would be off by a little bit.
#         # Idea is to sort the games with this label decreasing by df['frac_num_segments'] and take one extra segment from the first
#         # however many games needed to get the right number of segments.
#         # Because we want exactly the right number of segments per label in each of test, train, and possibly val, we will calculate the
#         # number of segments we take from each game after we split the games into those sets.
#         df.loc[label_indices, 'float_num_segments'] = (adjusted_game_length / shift_estimate)
#         # df.loc[label_indices, 'int_num_segments'] = adjusted_game_length // shift_estimate 
#         # df.loc[label_indices, 'frac_num_segments'] = adjusted_game_length / shift_estimate - df.loc[label_indices, 'int_num_segments']
        
#         label_info_list.append([label, df.loc[label_indices].shape[0], round(shift_estimate)])
    
#     label_info = pd.DataFrame(label_info_list, columns=['Label', 'Count', 'Estimated Shift'])
    
#     # Sort the label_info DataFrame by 'Count' in descending order
#     label_info = label_info.sort_values(by='Count', ascending=False).reset_index(drop=True)

#     return df, label_info

# segments_per_game_df, label_info_df = number_of_segments_per_game(processed_df, segment_length_power, num_segments_per_label)
# print(segments_per_game_df.groupby('labels')['float_num_segments'].sum())
# print(segments_per_game_df.head())



In [None]:
import pandas as pd

def number_of_segments_per_game(df, segment_length_power, num_segments_per_label):
    """
    Compute, for every game, the fractional number of segments it should contribute
    so that each label totals `num_segments_per_label` segments.

    Games whose 'length' is not strictly greater than the segment length are dropped.
    Within a label, a game's share is proportional to `length - segment_length`
    (the number of positions at which a full segment can start).

    Parameters:
    df (DataFrame): Game data with at least the columns 'player_inputs_np_save_path',
        'length' and 'labels'. It is not modified.
    segment_length_power (int): The segment length is 2 ** segment_length_power.
    num_segments_per_label (int): Desired total number of segments per label; must be positive.

    Returns:
    DataFrame: The kept games with columns 'player_inputs_np_save_path', 'length',
        'labels' and 'float_num_segments'.
    DataFrame: One row per label with columns 'Label', 'Count' (number of kept games)
        and 'Shift' (rounded average spacing between segment starts), sorted by
        'Count' in descending order.

    Raises:
    ValueError: If `num_segments_per_label` is not positive.
    """
    if num_segments_per_label <= 0:
        raise ValueError("num_segments_per_label must be a positive number")

    segment_length = 2 ** segment_length_power

    # Filter first and copy afterwards: the copy is what makes the column
    # assignments below safe (no SettingWithCopyWarning) and keeps the caller's
    # frame untouched.
    df = df.loc[df['length'] > segment_length].copy()
    df['float_num_segments'] = 0.0

    label_info_list = []
    for label in df['labels'].unique():
        label_indices = df['labels'] == label

        # A segment must start at least one segment length before the end of the game.
        adjusted_game_length = df.loc[label_indices, 'length'] - segment_length

        # Average spacing between segment starts if all games of this label were
        # laid end to end and num_segments_per_label segments were spread evenly.
        shift_estimate = adjusted_game_length.sum() / num_segments_per_label

        df.loc[label_indices, 'float_num_segments'] = adjusted_game_length / shift_estimate

        label_info_list.append([label, adjusted_game_length.count(), round(shift_estimate)])

    label_info_df = pd.DataFrame(label_info_list, columns=['Label', 'Count', 'Shift'])
    label_info_df = label_info_df.sort_values(by='Count', ascending=False).reset_index(drop=True)

    return_columns = ['player_inputs_np_save_path', 'length', 'labels', 'float_num_segments']

    return df[return_columns], label_info_df

# Attach the fractional per-game segment counts and collect the per-label summary.
segments_per_game_df, label_info_df = number_of_segments_per_game(
    processed_df,
    segment_length_power=segment_length_power,
    num_segments_per_label=num_segments_per_label,
)
# print(segments_per_game_df.groupby('labels')['float_num_segments'].sum())
print(label_info_df)
segments_per_game_df.head()



In [None]:
def _apportion_segments(split_df, target):
    """Return a copy of `split_df` with an integer 'num_segments' column summing to `target`.

    Uses largest-remainder apportionment: the 'float_num_segments' values are rescaled so
    they sum to `target`, floored, and the remaining segments go to the games with the
    largest fractional parts. An empty split (or one whose weights sum to zero) gets zeros.
    """
    split_df = split_df.copy()
    weights = split_df['float_num_segments'].to_numpy(dtype=float)
    counts = np.zeros(len(weights), dtype=np.int64)
    target = int(target)
    weight_sum = weights.sum()
    if target > 0 and weight_sum > 0:
        quotas = weights * target / weight_sum
        counts = np.floor(quotas).astype(np.int64)
        shortfall = target - int(counts.sum())
        if shortfall > 0:
            # counts - quotas is the negated fractional part, so an ascending
            # stable sort puts the largest fractional parts first.
            by_fraction = np.argsort(counts - quotas, kind='stable')
            counts[by_fraction[:shortfall]] += 1
    split_df['num_segments'] = counts
    return split_df


def _concat_split(frames, columns):
    """Concatenate the non-empty per-label frames of one split, keeping only `columns`."""
    frames = [frame for frame in frames if not frame.empty]
    if not frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, ignore_index=True)[columns]


def divide_games(df, num_segments_per_label, test_ratio=0.15, val_ratio=0.15, val=True, random_state=None):
    """
    Splits the games into training, testing, and optionally validation sets based on the
    approximate number of segments per game, then fixes an integer number of segments
    to take from every game.

    For each label the games are shuffled and assigned in order: the test set takes games
    while the running sum of 'float_num_segments' stays within the test target (but always
    at least one game), the validation set continues up to the test + validation target,
    and the remaining games go to training. Within each split the per-game counts are
    apportioned so that they sum exactly to that split's target whenever the split
    contains at least one game.

    Parameters:
    df (DataFrame): The output of number_of_segments_per_game; must contain
        'player_inputs_np_save_path', 'length', 'labels' and 'float_num_segments'.
        It is not modified.
    num_segments_per_label (int): Total number of segments desired per label.
    test_ratio (float): The proportion of segments to be used for the test set.
    val_ratio (float): The proportion of segments to be used for the validation set.
    val (bool): Whether to create a validation set.
    random_state (int, numpy.random.Generator or None): Seed for the per-label shuffle.
        None keeps the previous behaviour (pandas' default, unseeded shuffle).

    Returns:
    test_df (DataFrame): Data for testing.
    val_df (DataFrame): Data for validation (if val is True, otherwise an empty DataFrame).
    train_df (DataFrame): Data for training.
    Each frame has the columns 'player_inputs_np_save_path', 'length', 'num_segments',
    'labels' and 'encoded_labels'.

    Raises:
    ValueError: If 'float_num_segments' is missing from `df`.
    """
    if 'float_num_segments' not in df.columns:
        raise ValueError("'float_num_segments' column is missing in df")

    # Segment targets of each split for a single label.
    num_segments_test = round(num_segments_per_label * test_ratio)
    num_segments_val = round(num_segments_per_label * val_ratio) if val else 0
    num_segments_train = num_segments_per_label - num_segments_test - num_segments_val

    # One generator shared by all labels, so a seed fixes the whole split.
    rng = None if random_state is None else np.random.default_rng(random_state)

    test_dfs, val_dfs, train_dfs = [], [], []

    for label in df['labels'].unique():
        label_df = df[df['labels'] == label].sample(frac=1, random_state=rng).reset_index(drop=True)

        # The running sum is non-decreasing, so counting the entries within a
        # target gives the number of leading games that fit into it.
        num_segments_cumsum = label_df['float_num_segments'].cumsum()
        test_end = max(int((num_segments_cumsum <= num_segments_test).sum()), 1)
        val_end = max(int((num_segments_cumsum <= num_segments_test + num_segments_val).sum()), test_end)

        test_dfs.append(_apportion_segments(label_df.iloc[:test_end], num_segments_test))
        if val:
            val_dfs.append(_apportion_segments(label_df.iloc[test_end:val_end], num_segments_val))
        train_dfs.append(_apportion_segments(label_df.iloc[val_end:], num_segments_train))

    # One label -> integer mapping for every split (sorted unique labels, the same
    # ordering sklearn's LabelEncoder produces). Refitting an encoder per split
    # would give different codes to the same label whenever a split lacks a label.
    code_by_label = {label: code for code, label in enumerate(sorted(df['labels'].unique()))}

    def encode(frame):
        # assign() returns a new frame, so no column is written onto a slice.
        return frame.assign(encoded_labels=frame['labels'].map(code_by_label).astype('int64'))

    return_columns = ['player_inputs_np_save_path', 'length', 'num_segments', 'labels']
    test_df = encode(_concat_split(test_dfs, return_columns))
    val_df = encode(_concat_split(val_dfs, return_columns))
    train_df = encode(_concat_split(train_dfs, return_columns))

    return test_df, val_df, train_df

test_df, val_df, train_df = divide_games(
    segments_per_game_df,
    num_segments_per_label,
    test_ratio=.15,
    val_ratio=.15,
    val=True,
)

# Sum 'num_segments' for each 'label' in every split; the validation split is
# only reported when it is not empty.
for split_df in (test_df, val_df, train_df):
    if split_df is val_df and val_df.empty:
        continue
    print(split_df.groupby('labels')['num_segments'].sum())
train_df.head()


In [None]:
import numpy as np
import pandas as pd

def create_training_dataframe(df, segment_length_power):
    """
    Generate a DataFrame that lists the segments for training, where each row corresponds to a segment.

    Every input row is repeated 'num_segments' times (rows with 0 segments disappear).
    Segment k of a game starts at k * ((length - segment_length) // num_segments).

    Parameters:
    df (DataFrame): DataFrame containing the output from `divide_games`, which includes
        'num_segments' and 'length'. It is not modified. An empty frame, or one whose
        'num_segments' are all zero, yields an empty result.
    segment_length_power (int): The power of 2 used to determine the segment length.

    Returns:
    DataFrame: A new DataFrame where each row represents a segment. 'length' and
        'num_segments' are dropped; 'segment_start_index' and 'segment_index' are added.
    """
    segment_length = 2 ** segment_length_power

    # How many times each game row has to be repeated.
    repeats = df['num_segments'].to_numpy(dtype=np.int64)

    # Repeat by position rather than by index label, so the result does not
    # depend on the frame having a unique index.
    row_positions = np.repeat(np.arange(len(df)), repeats)
    df_repeated = df.iloc[row_positions].reset_index(drop=True)

    # 0, 1, ..., n-1 within each game's run of repeated rows: global position minus
    # the position at which that game's run begins. Unlike concatenating per-game
    # aranges this also works when there is nothing to repeat, and the int64
    # result cannot overflow for games with many segments.
    run_starts = np.cumsum(repeats) - repeats
    segment_indices = np.arange(int(repeats.sum()), dtype=np.int64) - np.repeat(run_starts, repeats)

    # Spacing between consecutive segment starts, identical for all segments of a game.
    segment_shift = (df_repeated['length'] - segment_length) // df_repeated['num_segments']
    df_repeated['segment_start_index'] = segment_shift * segment_indices

    # These columns are no longer needed once the start index is known.
    df_repeated = df_repeated.drop(columns=['length', 'num_segments'])

    df_repeated['segment_index'] = segment_indices

    return df_repeated


# Expand every split into one row per segment. The segment length comes from
# segment_length_power, so it always matches the value used when the number of
# segments per game was computed.
train_segments_df = create_training_dataframe(train_df, segment_length_power)
print(train_segments_df.value_counts('labels'))

test_segments_df = create_training_dataframe(test_df, segment_length_power)
print(test_segments_df.value_counts('labels'))

# The validation split only exists when divide_games was asked for one.
if not val_df.empty:
    val_segments_df = create_training_dataframe(val_df, segment_length_power)
    print(val_segments_df.value_counts('labels'))

train_segments_df.head()

In [None]:

def _extract_game_segments(path, label, length, num_segments, segment_length):
    """Load one gzip-compressed game array and cut `num_segments` evenly spaced segments from it.

    Returns the segments as a (num_segments, 9, segment_length) float32 array together
    with a list holding `label` once per segment.
    """
    with gzip.open(path, 'rb') as f:
        inputs_array = np.load(f)

    # The 9 rows are taken as given by the original code -- presumably one row per
    # input channel of the saved array; TODO confirm against the writer of these files.
    segments_array = np.empty((num_segments, 9, segment_length), dtype=np.single)

    # Spacing between the starting points of consecutive segments.
    segment_shift = (length - segment_length) // num_segments

    for i in range(num_segments):
        start_index = segment_shift * i
        segments_array[i, :, :] = inputs_array[:, start_index:start_index + segment_length]

    return segments_array, [label] * num_segments


def create_training_numpy(df, segment_length_power):
    """
    Creates a NumPy array containing all the segments from the dataframe, with parallel processing for efficiency.

    Each worker returns its segments and joblib collects them, so the output follows
    the row order of `df`; no shared state between processes is involved.

    Parameters:
    df (DataFrame): The DataFrame containing game data, must be one of the outputs from `divide_games`
        (columns 'player_inputs_np_save_path', 'labels', 'length' and 'num_segments' are read).
    segment_length_power (int): The power of 2 that defines the length of each segment.

    Returns:
    tuple: A tuple containing two elements; the first is a NumPy array of input arrays
           with shape (total_segments, 9, segment_length), and the second is a NumPy array
           with the corresponding value of the 'labels' column for every segment.
           When no game contributes a segment both arrays are empty.
    """
    segment_length = 2 ** segment_length_power

    # Games that contribute no segment are skipped up front, so their files are never opened.
    tasks = [
        (path, label, int(length), int(num_segments))
        for path, label, length, num_segments in zip(
            df['player_inputs_np_save_path'], df['labels'], df['length'], df['num_segments']
        )
        if num_segments > 0
    ]

    if not tasks:
        return np.empty((0, 9, segment_length), dtype=np.single), np.array([])

    # Process each game in parallel; results come back in task order.
    results = Parallel(n_jobs=-1, verbose=0)(
        delayed(_extract_game_segments)(path, label, length, num_segments, segment_length)
        for path, label, length, num_segments in tqdm.tqdm(tasks)
    )

    input_arrays, label_lists = zip(*results)

    # Combine all segment arrays into one array and all labels into one array.
    input_array = np.concatenate(input_arrays, axis=0)
    labels = np.concatenate(label_lists)

    return input_array, labels

# Build the segment arrays of every split and report their shapes. The segment
# length comes from segment_length_power so it matches the rest of the notebook.
# input_array / labels are overwritten on every pass, so after this cell they
# hold the arrays of the last split that was processed.
splits_to_build = [train_df, test_df]
if not val_df.empty:
    splits_to_build.append(val_df)

for split_df in splits_to_build:
    input_array, labels = create_training_numpy(split_df, segment_length_power)
    print(input_array.shape)
    print(labels.shape)