<h1> Data Extraction </h1>
We extract all the data from all the replay files and store it in a sensible way. We use a modified version of py-slippi that handles the lack of metadata in the ranked dataset.

In [None]:
import os as os
from pathlib import Path
import numpy as np
import pandas as pd
# import polars as pl
import tqdm
import slippi as slp
from joblib import Parallel, delayed
from multiprocessing import Manager
import pyarrow as pa
# import pyarrow.parquet as pq
import gzip
import pickle
import feather
import uuid
import time
import tables
import shutil
import random

<h2>  Functions for Handling Paths and Files </h2>

In [None]:
# Function to create necessary directories
def create_directories(path):
    """Create ``path`` (and any missing parent directories).

    Nothing happens when the directory already exists. ``exist_ok=True`` is
    used instead of a separate ``os.path.exists`` check so that several
    worker processes may ask for the same directory at the same time without
    one of them failing with ``FileExistsError``.

    Args:
        path: Directory to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)
        
# Split a list of paths into the common path and the rest of the path
def split_paths(paths):
    """Split ``paths`` into their shared directory prefix and the remainders.

    Args:
        paths: List of path strings.

    Returns:
        A ``(common_path, relative_paths)`` tuple. ``common_path`` is the
        deepest directory shared by every path and ends with a separator, or
        is the empty string when the paths share no directory (this includes
        an empty ``paths`` list). ``relative_paths`` has one entry per input
        path, and ``common_path + relative_paths[i] == paths[i]`` always holds.
    """
    # commonprefix compares character by character, so it can stop in the
    # middle of a file or directory name; dirname trims it back to the last
    # complete directory.
    common_dir = os.path.dirname(os.path.commonprefix(paths))

    # Only add a separator when there is a directory to terminate and it is
    # not already terminated (e.g. a filesystem root). Adding it blindly
    # would strip real characters from every path below.
    separators = (os.sep,) if os.altsep is None else (os.sep, os.altsep)
    if common_dir and not common_dir.endswith(separators):
        common_path = common_dir + os.sep
    else:
        common_path = common_dir

    prefix_length = len(common_path)
    relative_paths = [path[prefix_length:] for path in paths]

    return common_path, relative_paths


# Record, for every file, where it lives and which dataset it came from, in the
# form that process_slp_file expects.
def add_to_triples(file_paths, common_path, source, list_of_paths):
    """Append one ``(common_path, file, source)`` tuple per file.

    ``list_of_paths`` is extended in place and the same list object is
    returned.
    """
    list_of_paths.extend(
        (common_path, relative_file, source) for relative_file in file_paths
    )
    return list_of_paths

def delete_folder(path):
    """Remove the directory tree at ``path`` and print the outcome.

    Failures (for example, the folder not existing) are reported on stdout
    instead of being raised.
    """
    try:
        shutil.rmtree(path)
    except Exception as err:
        outcome = f"Error deleting the folder '{path}': {err}"
    else:
        outcome = f"The folder '{path}' has been deleted successfully."
    print(outcome)

<h2> Define paths and list files </h2>
We define paths to our datasets and list all files in the directory.

In [None]:
def make_path_list(num_files):
    """Build the list of replay files to process from the three datasets.

    Args:
        num_files: Maximum number of ``.slp`` files to take from each
            dataset. ``None`` or any negative number means "no limit".
            (A negative number is not used as a slice bound, because
            ``files[:-1]`` would silently drop the last file.)

    Returns:
        A list of ``(common_path, relative_file, source)`` tuples where
        ``source`` is ``'ranked'``, ``'public'`` or ``'mango'``.
    """
    common_ranked_path = 'D:\\ranked\\ranked-anonymized-1-116248\\ranked-anonymized'
    common_public_path = 'C:\\Users\\jaspa\\Grant ML\\Slippi_Public_Dataset_v3'
    mango_root = 'D:\\Mango'

    limit = None if num_files is None or num_files < 0 else num_files

    # The ranked and public datasets are flat directories.
    ranked_files = [file for file in os.listdir(common_ranked_path) if file.endswith('.slp')]
    public_files = [file for file in os.listdir(common_public_path) if file.endswith('.slp')]

    # The mango dataset has sub folders, so walk the whole tree and collect
    # full paths.
    mango_files = [
        os.path.join(dirpath, filename)
        for dirpath, _dirnames, filenames in os.walk(mango_root)
        for filename in filenames
        if filename.endswith('.slp')
    ]

    # Reduce the full mango paths to a shared prefix plus per-file remainders
    # so they have the same (prefix, file) form as the other datasets.
    common_mango_path, sub_mango_paths = split_paths(mango_files)

    list_of_paths = []
    list_of_paths = add_to_triples(ranked_files[:limit], common_ranked_path, 'ranked', list_of_paths)
    list_of_paths = add_to_triples(public_files[:limit], common_public_path, 'public', list_of_paths)
    list_of_paths = add_to_triples(sub_mango_paths[:limit], common_mango_path, 'mango', list_of_paths)

    return list_of_paths



<h2> Extract Simple Data </h2>
Extract the data that will be stored in the data frame. We need to handle the cases where the data is None or missing, especially the metadata.

In [None]:
# Get the overall metadata from the start of the game
def get_event_start_data(game,occupied_ports):
    """Collect the game-start information into a flat dictionary.

    Args:
        game: Parsed replay; ``game.start`` is read.
        occupied_ports: Indices into ``game.start.players`` of the ports that
            hold a player. The n-th entry is reported as ``player_{n+1}_*``.

    Returns:
        dict with ``is_teams``, the per-player keys (port, character name,
        type name, stocks, costume, team value, UCF shield drop name, tag,
        display name) and the game-wide keys ``random_seed``, ``slippi``
        (version as a string), ``stage_name``, ``is_pal`` and
        ``is_frozen_ps``. ``player_{n}_team_value`` is left out when the
        team cannot be read from the player.
    """
    start = game.start
    start_metadata_dict = {
        'is_teams': start.is_teams, #bool # We only take two player games
    }

    for i, port in enumerate(occupied_ports):
        player = start.players[port]
        prefix = f'player_{i+1}'
        start_metadata_dict.update({
            f'{prefix}_port': port, #int
            f'{prefix}_character_name': player.character.name, #string
            f'{prefix}_type_name': player.type.name, #string
            f'{prefix}_stocks': player.stocks, #int
            f'{prefix}_costume': player.costume, #int
        })
        try:
            team = player.team
            # Compare against None explicitly: an enum member whose value is
            # 0 is falsy, and a truthiness test would report it as "no team".
            start_metadata_dict[f'{prefix}_team_value'] = team.value if team is not None else None #int
        except AttributeError:
            # Team information is not available for this player; skip the key.
            pass
        start_metadata_dict.update({
            f'{prefix}_ucf_shield_drop_name': player.ucf.shield_drop.name, #string
            f'{prefix}_tag': player.tag, #string
            f'{prefix}_display_name': player.display_name #string
        })

    start_metadata_dict.update({
        'random_seed': start.random_seed, #int
        'slippi': str(start.slippi.version), # version stored as its string form
        'stage_name': start.stage.name, #string
        'is_pal': start.is_pal, #bool
        'is_frozen_ps': start.is_frozen_ps, #bool
    })
    return start_metadata_dict

# Decide the winner of a two player game from the final frame.
def _two_player_winner(last_frame, player_1_port, player_2_port):
    """Return 1 or 2 for the winning player, or 0 when it is a full tie.

    More remaining stocks wins; with equal stocks, lower damage wins.
    """
    post_1 = last_frame.ports[player_1_port].leader.post
    post_2 = last_frame.ports[player_2_port].leader.post
    if post_1.stocks != post_2.stocks:
        return 1 if post_1.stocks > post_2.stocks else 2
    if post_1.damage != post_2.damage:
        return 1 if post_1.damage < post_2.damage else 2
    return 0


# Extract all the data from the slippi.event.end module
# Determine winner if it is a conclusive two player game
def get_event_end_data(game, occupied_ports):
    """Collect the game-end information and, when possible, the winner.

    Args:
        game: Parsed replay; ``game.end``, ``game.start.is_teams`` and
            ``game.frames`` are read. ``game.end`` must not be None.
        occupied_ports: Port indices of the players; the first entry is
            player 1 and the second is player 2.

    Returns:
        dict that always has ``end_method_name`` and ``lras_initiator``.
        For non-team games with exactly two players it also has
        ``conclusive`` and ``winning_player`` (1, 2, or 0 when there is no
        winner), plus ``player_1_win`` / ``player_2_win`` when a winner was
        found. A winner is only looked for when the end method is
        ``CONCLUSIVE``, ``TIME`` or ``GAME``.
    """
    end_method = game.end.method.name
    end_data_dict = {
        'end_method_name' : end_method, #string
        'lras_initiator' : game.end.lras_initiator #int
    }

    # Winners are only determined for one-on-one games.
    if game.start.is_teams or len(occupied_ports) != 2:
        return end_data_dict

    winner = 0
    # Without any frame there is no final state to compare, so the game is
    # reported as having no winner.
    if end_method in ('CONCLUSIVE', 'TIME', 'GAME') and len(game.frames) > 0:
        winner = _two_player_winner(game.frames[-1], occupied_ports[0], occupied_ports[1])

    if winner:
        end_data_dict.update({
            'conclusive': True,
            'winning_player': winner,
            'player_1_win': winner == 1,
            'player_2_win': winner == 2
        })
    else:
        end_data_dict.update({
            'conclusive': False,
            'winning_player': 0,
        })

    return end_data_dict


# Some games won't have metadata
def get_metadata(game, occupied_ports):
    """Collect the replay metadata into a flat dictionary.

    Every field other than ``date`` and ``duration`` is optional: a value
    that cannot be looked up is stored as None (platform) or its key is left
    out (netplay code/name, console name) instead of failing the whole game.

    Args:
        game: Parsed replay; ``game.metadata`` must not be None.
        occupied_ports: Port indices of the players; the n-th entry is
            reported as ``player_{n+1}_*``.

    Returns:
        dict with ``date``, ``duration``, ``platform`` and, when available,
        ``player_{n}_netplay_code``, ``player_{n}_netplay_name`` and
        ``console_name``.
    """
    # Errors that mean "this piece of metadata is absent" (missing attribute,
    # None in the chain, port not present in the players container).
    lookup_errors = (AttributeError, IndexError, KeyError, TypeError)

    metadata = game.metadata
    metadata_dict = {
       'date': metadata.date, #datetime
       'duration': metadata.duration, #int
       # platform may be None, in which case there is no name to read
       'platform': getattr(metadata.platform, 'name', None) #string
        }

    for i, port in enumerate(occupied_ports):
        try:
            metadata_dict[f'player_{i+1}_netplay_code'] = metadata.players[port].netplay.code # String
        except lookup_errors:
            pass
        try:
            player = metadata.players[port]
            metadata_dict[f'player_{i+1}_netplay_name'] = player.netplay.name if player is not None else None #string
        except lookup_errors:
            pass

    try:
        metadata_dict['console_name'] = metadata.console_name
    except AttributeError:
        pass

    return metadata_dict


<h2> Frame Data Functions </h2>
These are the functions that we will call to extract all the frame data and one-hot encode some of it. The outputs are numpy arrays. I am not removing the first 123 frames.

In [None]:
# # Function to extract frames.pre data
# def get_frames_pre_data(frames, port):
#     # self.state = state #: :py:class:`slippi.id.ActionState` | int: Character's action state
#     # self.position = position #: :py:class:`Position`: Character's position
#     # self.direction = direction #: :py:class:`Direction`: Direction the character is facing
#     # self.joystick = joystick #: :py:class:`Position`: Processed analog joystick position
#     # self.cstick = cstick #: :py:class:`Position`: Processed analog c-stick position
#     # self.triggers = triggers #: :py:class:`Triggers`: Trigger state
#     # self.buttons = buttons #: :py:class:`Buttons`: Button state
#     # self.random_seed = random_seed #: int: Random seed at this point
#     # self.raw_analog_x = raw_analog_x #: int | None: `added(1.2.0)` Raw x analog controller input (for UCF)
#     # self.damage = damage #: float | None: `added(1.4.0)` Current damage percent
    
#     pre_data = np.empty((4,len(frames)),dtype = np.float)
#     # self.state = state #: :py:class:`slippi.id.ActionState` | int: Character's action state
#     # self.position = position #: :py:class:`Position`: Character's position
#     # self.direction = direction #: :py:class:`Direction`: Direction the character is facing
#     # self.damage = damage #: float | None: `added(1.4.0)` Current damage percent
    
#     pre_input_data = np.empty((4,len(frames)),dtype = np.float)
    
#     for i, frame in enumerate(frames):
#         integer_data[0,i] = frame.ports[port].leader.pre.state.value
#         integer_data[1,i] = frame.ports[port].leader.pre.direction.value
#         integer_data[2,i] = frame.ports[port].leader.pre.buttons.logical.value
#         integer_data[3,i] = frame.ports[port].leader.pre.buttons.physical.value
        
#         float_data[0,i] = frame.ports[port].leader.pre.position.x
#         float_data[1,i] = frame.ports[port].leader.pre.position.y
#         float_data[2,i] = frame.ports[port].leader.pre.joystick.x
#         float_data[3,i] = frame.ports[port].leader.pre.joystick.y
#         float_data[4,i] = frame.ports[port].leader.pre.cstick.x
#         float_data[5,i] = frame.ports[port].leader.pre.cstick.y  
#         float_data[6,i] = frame.ports[port].leader.pre.trigger.logical
#         float_data[7,i] = frame.ports[port].leader.pre.trigger.physical.l
#         float_data[8,i] = frame.ports[port].leader.pre.trigger.physical.r
    
#     return integer_data, float_data

# # return np containing floats and np containing int.
# def get_frames_post_data(frames,port):
#     class Post(
#         character, #int
#         state, #slippi.id.ActionState: int 0-382
#         position_x, #float
#         position_y,#float
#         direction, #int
#         damage, #float
#         shield, #float
#         stocks, #int
#         last_attack_landed, #int
#         last_hit_by, #int
#         combo_count, #int
#         state_age=None, #float | none
#         flags=None, #int 16-549755813888 int64
#         hit_stun=None,#float
#         airborne=None,#bool
#         ground=None,#int
#         jumps=None, #int
#         l_cancel=None # success = 1, failure = 2
#         )

        

<h2> Get Frames </h2>
A function to put all the frame data from a single port into a pandas df.

In [None]:
# Return the raw number behind an enum-like value.
def _enum_value(item):
    """Return ``item.value`` when present, ``item`` itself otherwise.

    None stays None and plain ints pass through unchanged, so fields that
    may hold either an enum member or a bare int are handled uniformly.
    """
    if item is None:
        return None
    return getattr(item, 'value', item)


# Function to extract frame data
def get_frames_df(frames, port):
    """Put all per-frame data of one port into pandas DataFrames.

    For every frame the pre-frame data (state, position, direction, joystick,
    c-stick, triggers, buttons, random seed, raw analog x, damage) and the
    post-frame data (character, state, position, direction, damage, shield,
    stocks, last attack landed, last hit by, combo count, state age, flags,
    hit stun, airborne, ground, jumps, l-cancel) of the port's leader are
    read.

    Optional post-frame fields are stored as they are: None stays None and
    real zero / False values are kept (they are not turned into None).

    Args:
        frames: Iterable of frame objects of one game.
        port: Index into ``frame.ports`` of the player to extract.

    Returns:
        ``(all_data_df, input_data_df)``. ``all_data_df`` has one row per
        frame and one column per field listed above plus ``frame_index``.
        ``input_data_df`` has ``frame_index``, the joystick, c-stick and
        trigger columns, and one boolean column per logical button (named
        after the button) that tells whether it is pressed on that frame.
    """
    # NOTE: 'post_sheild' and 'post_airbourn' are misspelled, but they are the
    # column names of the saved files, so they are kept as they are.
    column_names = ['frame_index',
                    #
                    'pre_state', 'pre_position_x','pre_position_y','pre_direction',
                    'pre_joystick_x','pre_joystick_y', 'pre_cstick_x', 'pre_cstick_y',
                    'pre_triggers_logical','pre_triggers_physical_l','pre_triggers_physical_r',
                    'pre_buttons_logical','pre_buttons_physical',
                    'pre_random_seed','pre_raw_analog_x', 'pre_damage',
                    #
                    'post_character',
                    'post_state','post_position_x','post_position_y','post_direction',
                    'post_damage','post_sheild','post_stocks',
                    'post_last_attack_landed','post_last_hit_by','post_combo_count',
                    'post_state_age','post_flags','post_hit_stun',
                    'post_airbourn', 'post_ground','post_jumps','post_l_cancel'
                    ]

    frame_data = []

    for frame in frames:
        leader = frame.ports[port].leader
        pre = leader.pre
        post = leader.post
        frame_data.append([
            frame.index, # presumably starts at -123 (pre-game frames) - confirm against the replay format
            # Pre
            pre.state, # stored as is: may be an ActionState member or a bare int
            pre.position.x,
            pre.position.y,
            pre.direction.value,
            #
            pre.joystick.x,
            pre.joystick.y,
            pre.cstick.x,
            pre.cstick.y,
            #
            pre.triggers.logical,
            pre.triggers.physical.l,
            pre.triggers.physical.r,
            #
            pre.buttons.logical.value,
            pre.buttons.physical.value,
            #
            pre.random_seed,
            pre.raw_analog_x,
            pre.damage,
            # Post
            post.character.value,
            #
            post.state, # stored as is, like pre.state
            post.position.x,
            post.position.y,
            post.direction.value,
            #
            post.damage,
            post.shield,
            post.stocks,
            #
            _enum_value(post.last_attack_landed),
            post.last_hit_by,
            post.combo_count,
            #
            post.state_age,
            _enum_value(post.flags),
            post.hit_stun,
            #
            post.airborne,
            post.ground,
            post.jumps,
            _enum_value(post.l_cancel),
            ])

    input_list = ['frame_index',
                  'pre_joystick_x','pre_joystick_y', 'pre_cstick_x', 'pre_cstick_y',
                  'pre_triggers_logical','pre_triggers_physical_l','pre_triggers_physical_r']

    all_data_df = pd.DataFrame(frame_data, columns=column_names)
    # Explicit copy: columns are added below, and that must neither touch
    # all_data_df nor trigger pandas' chained-assignment warning.
    input_data_df = all_data_df[input_list].copy()

    button_states = all_data_df['pre_buttons_logical'].values

    # For each button, apply the bitmask and add a column to the DataFrame
    for button in slp.event.Buttons.Logical:
        if button == slp.event.Buttons.Logical.NONE:  # Skip the NONE value to avoid an unnecessary column
            continue
        # The bitwise AND is applied to the whole column at once and gives a
        # one-hot (boolean) column for this button.
        input_data_df[button.name] = (button_states & button.value) != 0

    return all_data_df, input_data_df

def inputs_to_np(input_data_df, skip_frames=123):
    """Convert the controller inputs of one player to a float32 array.

    Args:
        input_data_df: DataFrame with the columns ``pre_joystick_x``,
            ``pre_joystick_y``, ``pre_cstick_x``, ``pre_cstick_y``,
            ``pre_triggers_logical``, ``Z``, ``A``, ``B``, ``X`` and ``Y``.
        skip_frames: Number of leading rows to drop (presumably the pre-game
            frames with a negative frame index - confirm). Defaults to 123.

    Returns:
        Array of shape ``(9, rows - skip_frames)`` and dtype float32. Rows 0-7
        are the first eight columns listed above, in that order; row 8 is the
        maximum of ``X`` and ``Y`` (i.e. 1.0 when either is pressed). When
        the frame has ``skip_frames`` rows or fewer, the result has shape
        ``(9, 0)``.
    """
    main_columns = ['pre_joystick_x', 'pre_joystick_y', 'pre_cstick_x', 'pre_cstick_y',
                    'pre_triggers_logical', 'Z', 'A', 'B']
    kept_rows = input_data_df.iloc[skip_frames:]

    inputs_array = np.empty((len(kept_rows), 9), dtype=np.float32)
    inputs_array[:, :8] = kept_rows[main_columns].to_numpy(dtype=np.float32)
    # X and Y are merged into a single column.
    inputs_array[:, 8] = kept_rows[['X', 'Y']].to_numpy(dtype=np.float32).max(axis=1)
    return np.transpose(inputs_array)

# def inputs_to_np(input_data_df):
#     inputs_array = input_data_df.to_numpy(dtype= np.float32)
#     return np.transpose(inputs_array)

    

<h2> Process Function </h2>
A function to process a .slp file path. It saves the frame data in a character subfolder of public, ranked, or mango, depending on which dataset the game comes from. We want each frame data file to be saved with a unique name that is relatively short (I think there is a package to generate unique codes, and then we would append an underscore followed by the port). We will save all the data, even if we don't think we will use it.

In [None]:



# Each call of this creates one row of game data and appends it to the shared lists.
def process_slp_file(path, common_frame_data_path,common_inputs_df_path, common_inputs_np_path, no_teams_1_players, no_teams_2_players, no_teams_3_players,
                     no_teams_4_players, teams_1_players, teams_2_players, teams_3_players, teams_4_players, all_data_list, load_time_list,error_list):
    """Parse one replay, save its per-player frame data and record the game.

    For every occupied port three files are written, each under
    ``<common path>/<source>/<character name>/`` with a random UUID name:
    all frame data (gzip parquet), the input frame (gzip parquet) and, for
    games with more than 123 frames, the input array (gzipped ``.npy``).

    Any exception is caught and reported through ``error_list``; files
    already written for that game are left on disk.

    Args:
        path: ``(common_path, relative_file, source)`` triple of the replay.
        common_frame_data_path: Root folder for the full frame data.
        common_inputs_df_path: Root folder for the input DataFrames.
        common_inputs_np_path: Root folder for the input numpy arrays.
        no_teams_1_players ... teams_4_players: Lists that receive the game
            record, selected by team mode and number of players.
        all_data_list: List that receives every game record.
        load_time_list: List that receives the replay load time in seconds.
        error_list: List that receives one message per failed replay.

    Returns:
        None. Results are delivered through the list arguments.
    """
    try:
        slp_file_path = os.path.join(path[0], path[1])

        start_time = time.time()
        game = slp.Game(slp_file_path)
        load_time_list.append(time.time() - start_time)

        frames = game.frames
        occupied_ports = [i for i, player in enumerate(game.start.players) if player is not None]
        game_data_dict = {'source':path[2], 'source_path_prefix': path[0], 'source_path_suffix': path[1], 'length': len(frames), 'num_players': len(occupied_ports)}
        # Get the game's start data
        game_data_dict.update(get_event_start_data(game,occupied_ports))
        # Get the game's end data
        if game.end is not None:
            game_data_dict.update(get_event_end_data(game,occupied_ports))
        # Get the game's metadata, handle the case that there is no metadata
        if game.metadata is not None:
            game_data_dict.update(get_metadata(game, occupied_ports))

        game_data_dict.update({'all_data_df_common_path': common_frame_data_path, 'inputs_df_common_path': common_inputs_df_path, 'inputs_np_common_path': common_inputs_np_path})

        for i, port in enumerate(occupied_ports):
            all_data_df, input_data_df = get_frames_df(frames, port)
            character = game.start.players[port].character.name
            subpath = os.path.join(path[2], character)
            player_key = f'player_{i+1}'

            # exist_ok=True: other workers may be creating the same
            # source/character folder at this very moment.
            os.makedirs(os.path.join(common_frame_data_path, subpath), exist_ok=True)
            all_data_df_sub_path = os.path.join(subpath, str(uuid.uuid4()) + '.parquet')
            all_data_df_save_path = os.path.join(common_frame_data_path, all_data_df_sub_path)
            all_data_df.to_parquet(all_data_df_save_path, engine = 'pyarrow', compression='gzip')
            game_data_dict.update({f'{player_key}_all_data_df_sub_path': all_data_df_sub_path,f'{player_key}_all_data_df_save_path': all_data_df_save_path})

            os.makedirs(os.path.join(common_inputs_df_path, subpath), exist_ok=True)
            input_df_sub_path = os.path.join(subpath, str(uuid.uuid4()) + '.parquet')
            input_df_save_path = os.path.join(common_inputs_df_path, input_df_sub_path)
            input_data_df.to_parquet(input_df_save_path, engine = 'pyarrow', compression='gzip')
            game_data_dict.update({f'{player_key}_inputs_df_sub_path': input_df_sub_path,f'{player_key}_inputs_df_save_path': input_df_save_path})

            # inputs_to_np drops the first 123 frames, so shorter games have
            # no array to save.
            if len(frames) > 123:
                input_data_np = inputs_to_np(input_data_df)
                os.makedirs(os.path.join(common_inputs_np_path, subpath), exist_ok=True)
                input_data_np_sub_path = os.path.join(subpath, f"{uuid.uuid4()}.npy.gz")
                input_data_np_save_path = os.path.join(common_inputs_np_path, input_data_np_sub_path)
                with gzip.open(input_data_np_save_path, 'wb') as f:
                    np.save(f, input_data_np)
                game_data_dict.update({f'{player_key}_inputs_np_sub_path': input_data_np_sub_path,f'{player_key}_inputs_np_save_path': input_data_np_save_path})

        # Pick the list for this team mode / player count. Games with any
        # other number of players only go to all_data_list.
        buckets = {
            (True, 1): teams_1_players,
            (True, 2): teams_2_players,
            (True, 3): teams_3_players,
            (True, 4): teams_4_players,
            (False, 1): no_teams_1_players,
            (False, 2): no_teams_2_players,
            (False, 3): no_teams_3_players,
            (False, 4): no_teams_4_players,
        }
        bucket = buckets.get((bool(game.start.is_teams), len(occupied_ports)))
        if bucket is not None:
            bucket.append(game_data_dict)

        all_data_list.append(game_data_dict)

        return

    except Exception as e:
        # One bad replay must not stop the whole run; report it instead.
        error_list.append(f"Error processing {path}: {str(e)}")

In [None]:
# Make a list of paths to process.
# None asks for every file: the count is used as a slice bound, so a value
# such as -1 would silently leave out the last file of each dataset.
path_list = make_path_list(None)

# Root folders for the three kinds of output:
# all frame data (DataFrames), input data (DataFrames), input data (numpy)
common_frame_data_path = 'C:\\Users\\jaspa\\Grant ML\\frame_data'
common_inputs_df_path = 'C:\\Users\\jaspa\\Grant ML\\input_df'
common_inputs_np_path = 'C:\\Users\\jaspa\\Grant ML\\input_np'

# WARNING: everything from a previous run is deleted before the folders are
# created again, one sub folder per dataset.
for output_root in (common_frame_data_path, common_inputs_df_path, common_inputs_np_path):
    delete_folder(output_root)
    for dataset_name in ('ranked', 'public', 'mango'):
        create_directories(os.path.join(output_root, dataset_name))

# Create lists to send to process_slp_file. They are manager lists so that
# the worker processes can append to them.
manager = Manager()
no_teams_1_players = manager.list()
no_teams_2_players = manager.list()
no_teams_3_players = manager.list()
no_teams_4_players = manager.list()
teams_1_players = manager.list()
teams_2_players = manager.list()
teams_3_players = manager.list()
teams_4_players = manager.list()
all_data_list = manager.list()
load_time_list = manager.list()
error_list = manager.list()

# The average time to process a game varies between the data sets.
# Shuffle the order we process them in so that the estimated time to complete in the
# progress bar is more accurate.
random.shuffle(path_list)

# Use joblib to parallelize processing of SLP files
Parallel(n_jobs=-1, verbose=1)(
    delayed(process_slp_file)(
        path, common_frame_data_path, common_inputs_df_path, common_inputs_np_path,
        no_teams_1_players, no_teams_2_players, no_teams_3_players, no_teams_4_players,
        teams_1_players, teams_2_players, teams_3_players, teams_4_players,
        all_data_list, load_time_list, error_list,
    )
    for path in tqdm.tqdm(path_list)
)


In [None]:
# Show what the workers collected: the records of the two player games
# without teams, then the records of every game.
for shared_records in (no_teams_2_players, all_data_list):
    print(shared_records)

# Number of recorded games, followed by the total time (in seconds) spent
# loading replay files.
print(len(all_data_list))
print(sum(load_time_list))

In [None]:
# Turn a shared list of game records into a DataFrame (one row per game).
def _records_to_df(shared_records):
    return pd.DataFrame(list(shared_records))


# One DataFrame per team mode / player count ...
no_teams_1_players_df = _records_to_df(no_teams_1_players)
no_teams_2_players_df = _records_to_df(no_teams_2_players)
no_teams_3_players_df = _records_to_df(no_teams_3_players)
no_teams_4_players_df = _records_to_df(no_teams_4_players)
teams_1_players_df = _records_to_df(teams_1_players)
teams_2_players_df = _records_to_df(teams_2_players)
teams_3_players_df = _records_to_df(teams_3_players)
teams_4_players_df = _records_to_df(teams_4_players)
# ... and one with every game.
all_game_data_df = _records_to_df(all_data_list)

print(all_game_data_df.shape)
all_game_data_df

In [None]:
save_path = 'C:\\Users\\jaspa\\Grant ML\\slp\\data'
# Make sure the target folder exists, so that the results of the whole
# extraction run are not lost to a missing directory.
os.makedirs(save_path, exist_ok=True)

# Every game
file_path = os.path.join(save_path, 'all_game_data_df' + '.parquet')
all_game_data_df.to_parquet(file_path, engine = 'pyarrow', compression='gzip')

# Two player games without teams
file_path = os.path.join(save_path, 'no_teams_2_players_df' + '.parquet')
no_teams_2_players_df.to_parquet(file_path, engine = 'pyarrow', compression='gzip')

# The DataFrames of the other team mode / player count combinations are
# currently not saved.



In [None]:
# Number of recorded games per dataset
source_counts = all_game_data_df['source'].value_counts()
print(source_counts)

# Number of replays that failed, followed by one message per failure
error_messages = list(error_list)
print(len(error_messages))
for message in error_messages:
    print(message)

In [None]:
# Number of recorded games per dataset
games_per_source = all_game_data_df['source'].value_counts()
print(games_per_source)