<h2> Outline</h2>
In this notebook we extract the controller input data from Fox vs. Sheik games. We split each game into fixed-length segments of n seconds each to get more training examples.

In [None]:
import os as os
import numpy as np
import pandas as pd
import tqdm
import slippi as slp
from joblib import Parallel, delayed
from multiprocessing import Manager

<h2> Initialize some variables</h2>
We can change the number of seconds per segment.

In [None]:
# Segment length: each game is cut into consecutive windows of this many seconds.
seconds_per_segment = 15
# Replays are indexed by frame; 60 is the frames-per-second factor used to
# convert the segment length from seconds to frames.
frames_per_segment = seconds_per_segment * 60

# Column names of the output DataFrame, in the order each row is appended.
# NOTE(review): 'spl_file' looks like a typo for 'slp_file'; left unchanged
# because code consuming the saved data may rely on this name.
game_data_columns = ['spl_file', 'game_segment', 'is_sheik', 'input_data']

dataset_path = '../Slippi_Public_Dataset_v3/'

# Keep only replays whose file name mentions both characters. os.listdir()
# returns entries in arbitrary order, so sort to get a reproducible file list
# (and therefore a reproducible subset when only some files are processed).
slp_files = sorted(
    file
    for file in os.listdir(dataset_path)
    if file.endswith('.slp') and 'Sheik' in file and 'Fox' in file
)

<h2> Preliminary Functions </h2>
We use these functions to one-hot encode the button bitmask and get the frame input data for a given port number and frames object.

In [None]:
def one_hot_encode(bitmask):
    """Expand a button bitmask into a list of twelve 0/1 flags.

    The output order is DPAD_LEFT, DPAD_RIGHT, DPAD_DOWN, DPAD_UP, Z, R, L,
    A, B, X, Y, START. A position holds 1 when that button's bit is set in
    ``bitmask`` and 0 otherwise. Bit 128 has no entry in the table, so it is
    ignored.
    """
    # (button name, bit value) pairs; their order fixes the output order.
    button_bits = (
        ('DPAD_LEFT', 1),
        ('DPAD_RIGHT', 2),
        ('DPAD_DOWN', 4),
        ('DPAD_UP', 8),
        ('Z', 16),
        ('R', 32),
        ('L', 64),
        ('A', 256),
        ('B', 512),
        ('X', 1024),
        ('Y', 2048),
        ('START', 4096),
    )
    return [1 if bitmask & bit else 0 for _name, bit in button_bits]

# Create a numpy array that is the correct size and fill it with a loop
def get_frame_data(frames, port):
    """Collect one player's controller inputs for a segment of frames.

    Args:
        frames: Iterable of frame objects; for each one the pre-frame data
            at ``frame.ports[port].leader.pre`` is read.
        port: Index into ``frame.ports`` of the player whose inputs are read.

    Returns:
        A float array of shape ``(frames_per_segment, 18)``. Row ``i`` holds
        the 12 one-hot button flags from ``one_hot_encode`` followed by
        joystick x/y, c-stick x/y and the physical L/R trigger values of the
        ``i``-th frame. Rows beyond ``len(frames)`` are all zeros.

    Raises:
        IndexError: If ``frames`` yields more than ``frames_per_segment``
            frames.
    """
    # Zero-filled instead of np.empty so that a segment shorter than
    # frames_per_segment never exposes uninitialized memory as input data.
    inputs = np.zeros((frames_per_segment, 18))
    for i, frame in enumerate(frames):
        pre = frame.ports[port].leader.pre  # look up the pre-frame data once
        buttons = one_hot_encode(pre.buttons.physical.value)
        inputs[i] = buttons + [
            pre.joystick.x,
            pre.joystick.y,
            pre.cstick.x,
            pre.cstick.y,
            pre.triggers.physical.l,
            pre.triggers.physical.r,
        ]

    return inputs


<h2>Process SLP function</h2>
The function that will be called for each SLP file we are interested in.

In [None]:
def process_slp_file(slp_file, dataset_path, game_data):
    """Extract per-segment controller inputs for both players of one replay.

    The replay is parsed, the first 123 frames are skipped, and the rest is
    cut into consecutive windows of ``frames_per_segment`` frames. For every
    full window two rows are appended to ``game_data``:
    ``[slp_file, segment_index, 1, sheik_inputs]`` and
    ``[slp_file, segment_index, 0, fox_inputs]``.

    Nothing is appended when the game is too short for one segment, when it
    does not have exactly two players, or when either player is a CPU.

    Args:
        slp_file: File name of the replay inside ``dataset_path``.
        dataset_path: Directory containing the replay.
        game_data: List-like object with ``append``; receives the rows.

    Returns:
        None. Any exception is caught and reported with ``print`` so that one
        bad file does not abort the whole extraction.
    """
    # Number of leading frames skipped before the first segment.
    # NOTE(review): presumably the pre-"GO" countdown frames — confirm.
    skipped_frames = 123

    try:
        file_path = os.path.join(dataset_path, slp_file)
        game = slp.Game(file_path)
        frames = game.frames

        # Check game is long enough to contain at least one full segment
        game_length = game.metadata.duration
        if game_length < skipped_frames + frames_per_segment:
            return

        # Find the ports the players are using
        occupied_ports = [i for i, player in enumerate(game.start.players) if player is not None]
        if len(occupied_ports) != 2:  # Ignore games that aren't singles
            return

        # NOTE(review): only the first player's character is inspected; the
        # other player is assumed to be the remaining character of the pair.
        if game.start.players[occupied_ports[0]].character.name == 'SHEIK':
            sheik_port, fox_port = occupied_ports
        else:
            fox_port, sheik_port = occupied_ports

        # Is one of the players a CPU? If a player is a computer, ignore the game
        # game.start.players[port].type.value returns 0 if human and 1 if cpu
        if game.start.players[sheik_port].type.value or game.start.players[fox_port].type.value:
            return

        # Only count segments that fit entirely after the skipped frames, so
        # every slice below contains exactly frames_per_segment frames.
        usable_frames = min(game_length, len(frames)) - skipped_frames
        num_game_segments = usable_frames // frames_per_segment

        # To get more training data we take all full segments of the game
        for i in range(num_game_segments):
            start = skipped_frames + i * frames_per_segment
            segment_frames = frames[start:start + frames_per_segment]

            # Get Sheik data for the ith game segment
            sheik_input_data = get_frame_data(segment_frames, sheik_port)
            game_data.append([slp_file, i, 1, sheik_input_data])

            # Get Fox data for the ith game segment
            fox_input_data = get_frame_data(segment_frames, fox_port)
            game_data.append([slp_file, i, 0, fox_input_data])

    except Exception as e:
        print(f"Error processing {slp_file}: {str(e)}")

<h2> Multiprocess data extraction </h2>
We use joblib to speed up the extraction of data by processing the SLP files in parallel.

In [None]:
# How many files we want to extract data from. Use None to process every
# file; a negative value such as -1 would make slp_files[:num_files] drop
# files from the end of the list instead of selecting all of them.
num_files = None

# Create a Manager-backed list that is shared with the worker processes;
# each worker appends its rows to it.
manager = Manager()
game_data_list = manager.list()

# Use joblib to parallelize processing of SLP files
Parallel(n_jobs=-1, verbose=1)(
    delayed(process_slp_file)(slp_file, dataset_path, game_data_list)
    for slp_file in tqdm.tqdm(slp_files[:num_files])
)

# Make the data frame
game_data_df = pd.DataFrame(list(game_data_list), columns=game_data_columns)
print(game_data_df.shape)  # Check the shape to make sure we actually did something

<h2>Save the extracted input data</h2>
Save the data as a pickle file. Pickle is not the best format, but it seems to be the only one that works with numpy arrays in the data frame.

In [None]:
# # Specify the file path to save the pickle file
# pickle_file_path = '../data/Sheik_vs_Fox_15_second_segments.pkl'

# # Save the game data as a pickle file
# game_data_df.to_pickle(pickle_file_path)

## Check to see if it is saved correctly
# df = pd.read_pickle(pickle_file_path)