<h2> Outline</h2>
In this notebook we extract the input data from Fox vs Sheik games. We split each game into n second segments to get more training examples.

In [1]:
import os as os
import sys
import numpy as np
import pandas as pd
import tqdm
import slippi as slp
import gzip
from joblib import Parallel, delayed
from multiprocessing import Manager
from numba import njit, prange
# import cupy as cp
import gzip

<h2> Initialize some variables</h2>
We can change the number of seconds per segment.

In [2]:

# seconds_per_segment = 16
frames_per_segment = 1024
# frames_per_segment = seconds_per_segment * 60
# frames_per_segment = 900
game_data_columns = ['slp_file', 'game_segment', 'character', 'input_data']
dataset_path = '../../Slippi_Public_Dataset_v3/'
slp_files = [file for file in os.listdir(dataset_path) if file.endswith('.slp') and 
             ('Fox' in file or
              'Falco' in file or
              'Marth' in file or
              'Sheik' in file or
              'Jigglypuff' in file
              )]
# data_type_inputs = np.half
data_type_inputs = np.single
data_type_character_encode = np.bool_

<h2> Preliminary Functions </h2>
We use these functions to one-hot encode the button bitmask and get the frame input data for a given port number and frames object.

In [3]:
# Takes encoded buttons as bitmask and returns binary array of buttons pressed
def one_hot_encode_buttons(bitmask):
    labels = ['DPAD_LEFT', 'DPAD_RIGHT', 'DPAD_DOWN', 'DPAD_UP', 'Z', 'R', 'L', 'A', 'B', 'X', 'Y', 'START']
    encoded_values = [1, 2, 4, 8, 16, 32, 64, 256, 512, 1024, 2048, 4096]

    # Create a dictionary mapping labels to their encoded values
    label_to_value = dict(zip(labels, encoded_values))

    # Initialize a list to store the one-hot encoded values
    one_hot_encoded = [0] * len(labels)

    # Iterate through labels and set the corresponding one-hot encoded value
    for label, value in label_to_value.items():
        if bitmask & value:
            one_hot_encoded[labels.index(label)] = 1

    return one_hot_encoded

def one_hot_encode_characters(character):
    if character == 'FOX':
        return np.array([1, 0, 0, 0, 0],dtype=data_type_character_encode)
    elif character == 'FALCO':
        return np.array([0, 1, 0, 0, 0],dtype=data_type_character_encode)
    elif character == 'MARTH':
        return np.array([0, 0, 1, 0, 0],dtype=data_type_character_encode)
    elif character == 'SHEIK':
        return np.array([0, 0, 0, 1, 0],dtype=data_type_character_encode)
    elif character == 'JIGGLYPUFF':
        return np.array([0, 0, 0, 0, 1],dtype=data_type_character_encode)
    else:
        return None

# Create a numpy list that is the correct size and fill it with a loop
def get_frame_data(frames, port):
    inputs = np.empty((frames_per_segment, 18),dtype=data_type_inputs)  # Initialize an empty Numpy array
    for i, frame in enumerate(frames):  
        buttons = one_hot_encode_buttons(frame.ports[port].leader.pre.buttons.physical.value)
        j_x = frame.ports[port].leader.pre.joystick.x
        j_y = frame.ports[port].leader.pre.joystick.y
        c_x = frame.ports[port].leader.pre.cstick.x
        c_y = frame.ports[port].leader.pre.cstick.y
        t_l = frame.ports[port].leader.pre.triggers.physical.l
        t_r = frame.ports[port].leader.pre.triggers.physical.r

        frame_data = buttons + [j_x, j_y, c_x, c_y, t_l, t_r]
        inputs[i] = frame_data

    return inputs


<h2>Process SLP function</h2>
The function that will be called for each SLP file we are interested in.

In [4]:
# Each iteration of this creates a row of the dataframe and appends it to the dataframe.
def process_slp_file(slp_file, dataset_path, fox_game_data_list, falco_game_data_list, marth_game_data_list, sheik_game_data_list, jigglypuff_game_data_list):
    try:
        file_path = os.path.join(dataset_path, slp_file)
        game = slp.Game(file_path)
        frames = game.frames
         
        # Check game is long enough
        game_length = game.metadata.duration
        if game_length < 123 + frames_per_segment:          # TODO: Create a package and define constants like 123 with meaningful names
            return
        
        # Find the ports the players are using
        occupied_ports = [i for i, port in enumerate(game.start.players) if port is not None]
        # Ignore games that aren't singles
        if len(occupied_ports) > 2:  
            return
        # Ignore games with CPUs
        if game.start.players[occupied_ports[0]].type.value or game.start.players[occupied_ports[1]].type.value:
            return
        
        num_game_segments = game_length // frames_per_segment   # To get more training data we take all 15s segments of the game
        for i in range(num_game_segments):                      # Iterate over each of the 15s segments of the game
            # Get any frame data for the ith game segment
            for j in occupied_ports:
                # character = one_hot_encode_characters(game.start.players[j].character.name)
                #     if character is not None:
                #         frame_data = get_frame_data(frames[123 + i * frames_per_segment: 123 + (i + 1) * frames_per_segment], j)
                #         game_data.append([slp_file, i, character, frame_data])
                
                character = game.start.players[j].character.name
                if character == 'FOX':
                    frame_data = get_frame_data(frames[123 + i * frames_per_segment: 123 + (i + 1) * frames_per_segment], j)
                    # fox_game_data_list.append([slp_file, i, character, frame_data])
                    fox_game_data_list.append(frame_data)
                elif character == 'FALCO':
                    frame_data = get_frame_data(frames[123 + i * frames_per_segment: 123 + (i + 1) * frames_per_segment], j)
                    # falco_game_data_list.append([slp_file, i, character, frame_data])
                    falco_game_data_list.append(frame_data)
                elif character == 'MARTH':
                    frame_data = get_frame_data(frames[123 + i * frames_per_segment: 123 + (i + 1) * frames_per_segment], j)
                    # marth_game_data_list.append([slp_file, i, character, frame_data])
                    marth_game_data_list.append(frame_data)
                elif character == 'SHEIK':
                    frame_data = get_frame_data(frames[123 + i * frames_per_segment: 123 + (i + 1) * frames_per_segment], j)
                    # sheik_game_data_list.append([slp_file, i, character, frame_data])
                    sheik_game_data_list.append(frame_data)
                elif character == 'JIGGLYPUFF':
                    frame_data = get_frame_data(frames[123 + i * frames_per_segment: 123 + (i + 1) * frames_per_segment], j)
                    # jigglypuff_game_data_list.append([slp_file, i, character, frame_data])
                    jigglypuff_game_data_list.append(frame_data)
                # else:
                #     return None
            
    except Exception as e:
        print(f"Error processing {slp_file}: {str(e)}")

<h2> Multiprocess data extraction </h2>
We use joblib to speed the extraction of data.

In [5]:
num_files =  -1# How many files we want to extract data from

# Create shared lists to store results
manager = Manager()
fox_game_data_list = manager.list()
falco_game_data_list = manager.list()
marth_game_data_list = manager.list()
sheik_game_data_list = manager.list()
jigglypuff_game_data_list = manager.list()



# Use joblib to parallelize processing of SLP files
Parallel(n_jobs=-1, verbose=0)(delayed(process_slp_file)(slp_file, dataset_path, fox_game_data_list, falco_game_data_list, marth_game_data_list, sheik_game_data_list, jigglypuff_game_data_list) for slp_file in tqdm.tqdm(slp_files[:num_files]))


# Make the data frame
# game_data_df = pd.DataFrame(list(game_data_list),columns = game_data_columns)
# game_data_df # Check the shape to make sure we actually did something

100%|██████████| 57791/57791 [21:49<00:00, 44.13it/s]


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

<h1> Function to Process and Save Data </h1>

In [6]:
#Find another way to create batches. I worry about memory
def create_batches(data_list, n):
    return [data_list[i:i + n] for i in range(0, len(data_list), n)]

def process_game_data_list(game_data_list, character, batch, path):
    game_data_array = np.stack(game_data_list, axis=0)
    game_data_array = game_data_array.reshape(game_data_array.shape[0], 18, frames_per_segment)
    game_data_array[:, 0] = np.maximum(game_data_array[:, 9], game_data_array[:, 10])
    game_data_array[:, 1] = np.maximum(game_data_array[:, 16], game_data_array[:, 17])
    game_data_array = np.delete(game_data_array, [2,3,5,6,9,10,11,16,17], axis=1)
    
    filename = f"{character}_batch_{batch}_1024_frames.npy.gz"
    full_path = os.path.join(path, filename)
    with gzip.open(full_path, 'wb') as f:
        np.save(f, game_data_array)


def parallel_process(data_list, character, batch_size, path):
    batches = create_batches(data_list, batch_size)
    Parallel(n_jobs=-1, verbose=0)(delayed(process_game_data_list)(batch, character, i, path) for i, batch in tqdm.tqdm(enumerate(batches)))


# def batch_generator(data_list, n):
#     """Yield successive n-sized chunks from data_list."""
#     for i in range(0, len(data_list), n):
#         yield data_list[i:i + n]



<h1> Save Data </h1>

In [7]:
# Example usage
save_path = '../../data/character_batches/'
batch_size = 1000
parallel_process(fox_game_data_list, 'FOX', batch_size, save_path)
parallel_process(falco_game_data_list, 'FALCO', batch_size, save_path)
parallel_process(marth_game_data_list, 'MARTH', batch_size, save_path)
parallel_process(sheik_game_data_list, 'SHEIK', batch_size, save_path)
parallel_process(jigglypuff_game_data_list, 'JIGGLYPUFF', batch_size, save_path)

297it [01:23,  3.56it/s]
180it [00:28,  6.39it/s]
148it [00:21,  6.75it/s]
74it [00:06, 11.78it/s]
25it [00:00, 25073.55it/s]
