In [1]:
import pandas as pd
import numpy as np
from typing import Optional, List, Callable, Any, Union, Dict
from itertools import product
from statistics import mean
from pathlib import Path
import gzip
import os
import matplotlib.pyplot as plt
#import seaborn as sns
import re
from scipy.stats import f_oneway

# Read dataset

In [2]:
def read_ds_gzip(path: Optional[Path]=None, ds: str = "TRAIN") -> pd.DataFrame:
    """Args:
        path (Optional[Path], optional): the path to read the dataset file. Defaults to /kaggle/input/the-insa-starcraft-2-player-prediction-challenge/{ds}.CSV.gz.
        ds (str, optional): the part to read (TRAIN or TEST), to use when path is None. Defaults to "TRAIN".

    Returns:
        pd.DataFrame:
    """
    with gzip.open(f'/kaggle/input/the-insa-starcraft-2-player-prediction-challenge/{ds}.CSV.gz' if path is None else path) as f:
        max_actions = max(( len( str(c).split(",")) for c in f.readlines() ))
        f.seek(0)
        _names = ["battleneturl", "played_race"] if "TRAIN" in ds else ["played_race"]
        _names.extend(range(max_actions - len(_names)))
        return pd.read_csv(f, names=_names, dtype= str)

def read_ds_train(path: Optional[Path]=None, ds: str = "TRAIN"):
    """Args:
        path (Optional[Path], optional): the path to read the dataset file. Defaults to /kaggle/input/the-insa-starcraft-2-player-prediction-challenge/{ds}.CSV.gz.
        ds (str, optional): the part to read (TRAIN or TEST), to use when path is None. Defaults to "TRAIN".

    Returns:
        pd.DataFrame:
    """
    with open(f'/kaggle/input/train-sc2-keystrokes/{ds}.CSV' if path is None else path) as f:
        max_actions = max(( len( str(c).split(",")) for c in f.readlines() ))
        f.seek(0)
        _names = ["battleneturl", "played_race"] if "TRAIN" in ds else ["played_race"]
        _names.extend(range(max_actions - len(_names)))
        return pd.read_csv(f, names=_names, dtype= str)

In [3]:
features_train = read_ds_train(Path(os.path.abspath('')) / "data/train.csv") 
features_train.shape 

(3052, 10539)

In [4]:
features_train.head(3)

Unnamed: 0,battleneturl,played_race,0,1,2,3,4,5,6,7,...,10527,10528,10529,10530,10531,10532,10533,10534,10535,10536
0,http://eu.battle.net/sc2/en/profile/4234852/1/...,Protoss,Base,s,s,s,s,s,t5,Base,...,,,,,,,,,,
1,http://eu.battle.net/sc2/en/profile/3074362/1/...,Protoss,s,Base,s,s,Base,s,s,Base,...,,,,,,,,,,
2,http://eu.battle.net/sc2/en/profile/4234852/1/...,Protoss,Base,s,s,s,Base,s,hotkey30,hotkey00,...,,,,,,,,,,


In [5]:
unique_battleneturls = features_train['battleneturl'].unique()

print(f"Total unique battleneturl values: {len(unique_battleneturls)}")
print(unique_battleneturls)

Total unique battleneturl values: 200
['http://eu.battle.net/sc2/en/profile/4234852/1/First/'
 'http://eu.battle.net/sc2/en/profile/3074362/1/Stardust/'
 'http://eu.battle.net/sc2/en/profile/3401218/1/Welmu/'
 'http://eu.battle.net/sc2/en/profile/2896854/1/MǂForGG/'
 'http://eu.battle.net/sc2/en/profile/3538115/1/Golden/'
 'http://eu.battle.net/sc2/en/profile/250458/1/VortiX/'
 'http://eu.battle.net/sc2/en/profile/3973341/1/yoeFWSan/'
 'http://eu.battle.net/sc2/en/profile/2452136/1/MinChul/'
 'http://eu.battle.net/sc2/en/profile/2222468/1/dTefel/'
 'http://eu.battle.net/sc2/en/profile/4341883/1/Patience/'
 'http://eu.battle.net/sc2/en/profile/950504/1/Grubby/'
 'http://eu.battle.net/sc2/en/profile/2898004/1/MMA/'
 'http://eu.battle.net/sc2/en/profile/251061/1/LiveZerg/'
 'http://eu.battle.net/sc2/en/profile/1021189/1/Dayshi/'
 'http://eu.battle.net/sc2/en/profile/326029/1/LiquidTLO/'
 'http://eu.battle.net/sc2/en/profile/1058669/1/Happy/'
 'http://eu.battle.net/sc2/en/profile/1139573/1

# Pre-processing and feature creation

Function to create the dummie features for played race

In [6]:
def get_dummies(df: pd.DataFrame):
    """Converts textual columns to one-hot encoded vectors (one column per possible value)"""
    df = pd.get_dummies(df, columns=["played_race"])
    return df

Convertion of players urls to codes:

In [7]:
def to_categories(df: pd.DataFrame, col: str="battleneturl") -> None:
    """Convert col of df to a categorical column"""
    df["battleneturl"] = pd.Categorical(df["battleneturl"])
    df[[col]] = df[[col]].apply(lambda x: x.cat.codes)

### Calculate the features: s_ratio, base_ratio and mineral_ratio:

In [8]:
def calculate_action_ratios(row):
    # Define the columns of actions (columns 0 to 10536, excluding the last column 'num_actions')
    #action_columns = [str(i) for i in range(0, 10537)]
    
    # Extract action values from the row
    actions = row.values
    
    # Count occurrences of 's', 'Base', and 'SingleMineral'
    s_count = sum(1 for action in actions if action == 's')
    base_count = sum(1 for action in actions if action == 'Base')
    mineral_count = sum(1 for action in actions if action == 'SingleMineral')
    
    # Extract the total number of actions from the 'num_actions' column
    total_actions = sum(1 for action in actions)
    
    # Calculate the ratios
    ratio_s = s_count / total_actions if total_actions > 0 else 0
    ratio_base = base_count / total_actions if total_actions > 0 else 0
    ratio_mineral = mineral_count / total_actions if total_actions > 0 else 0
    
    # Return the ratios as a dictionary
    return pd.Series({'ratio_s': ratio_s, 'ratio_base': ratio_base, 'ratio_mineral': ratio_mineral})

### Calculate the feature number of actions per interval

In [9]:
def calculate_action_mean(row):
    """
    Calculate the mean number of actions per 5 seconds for a given row.

    Args:
        row (pd.Series): The row to process.

    Returns:
        float: The mean number of actions per 5 seconds.
    """
    counts = []
    count = 0

    for value in row:
        if pd.isna(value):  # Stop if the value is NaN
            break
        elif isinstance(value, str) and re.match(r"^t\d+$", value):  # Found a time window
            if count > 0:
                counts.append(count)  # Save the current count
            count = 0  # Restart the count
        else:  # Count non-NaN, non-time values
            count += 1

    if count > 0:  # Add any remaining count
        counts.append(count)

    return np.mean(counts) if counts else 0  # Calculate the mean, default to 0 if no counts

In [10]:
def count_values(row):
    return row.value_counts()

### Calculate ratio of kinds of hotkeys

In [11]:
def count_hotkeys_xx(row):
    # Find all hotkeyXX patterns
    hotkeys = re.findall(r"hotkey\d{2}", " ".join(row.astype(str)))
    
    # Count occurrences of all hotkeys
    hotkey_counts = pd.Series(hotkeys).value_counts()
    
    # Calculate the total number of hotkeys
    total_hotkeys = hotkey_counts.sum()
    
    # Filter for hotkeyX0 patterns
    hotkey_x0_counts = hotkey_counts[hotkey_counts.index.str.endswith("0")].sum()
    # Filter for hotkeyX1 patterns
    hotkey_x1_counts = hotkey_counts[hotkey_counts.index.str.endswith("1")].sum()
    # Filter for hotkeyX2 patterns
    hotkey_x2_counts = hotkey_counts[hotkey_counts.index.str.endswith("2")].sum()
    
    # Calculate the ratio of hotkeyX0 to all hotkeys
    ratio_x0 = hotkey_x0_counts / total_hotkeys if total_hotkeys > 0 else 0
    # Calculate the ratio of hotkeyX0 to all hotkeys
    ratio_x1 = hotkey_x1_counts / total_hotkeys if total_hotkeys > 0 else 0
    # Calculate the ratio of hotkeyX0 to all hotkeys
    ratio_x2 = hotkey_x2_counts / total_hotkeys if total_hotkeys > 0 else 0
   # Return the ratios as a dictionary
    return pd.Series({'ratio_x0': ratio_x0, 'ratio_x1': ratio_x1, 'ratio_x2': ratio_x2})



Justify the presence of 0s in the ratio of hotkeyX1 by looking at the graph.

### Number of all keys pressed per second

def count_values(row):
    # Extract all values of the format "t<number>"
    t_values = [int(value[1:]) for value in row if re.match(r"^t\d+$", str(value))]
    
    # Determine the last "t<number>" value, default to 165 if not found
    divisor = t_values[-1] if t_values else 5
    
    # Filter out "t<number>" values
    filtered_values = [value for value in row if not re.match(r"^t\d+$", str(value))]
    
    # Count occurrences of each value in the filtered list
    value_counts = pd.Series(filtered_values).value_counts()
    
    # Divide by the determined divisor
    return value_counts / divisor



In [12]:

# Define the list of action keys to be counted- present in training dataset
action_keys = {
    'Base', 'SingleMineral', 'hotkey00', 'hotkey01', 'hotkey02', 
    'hotkey10', 'hotkey11', 'hotkey12', 'hotkey20', 'hotkey21', 'hotkey22', 
    'hotkey30', 'hotkey31', 'hotkey32', 'hotkey40', 'hotkey41', 'hotkey42', 
    'hotkey50', 'hotkey51', 'hotkey52', 'hotkey60', 'hotkey61', 'hotkey62', 
    'hotkey70', 'hotkey71', 'hotkey72', 'hotkey80', 'hotkey81', 'hotkey82', 
    'hotkey90', 'hotkey91', 'hotkey92', 's'
}

def count_values(row):
    # Extract all values of the format "t<number>"
    t_values = [int(value[1:]) for value in row if re.match(r"^t\d+$", str(value))]
    
    # Determine the last "t<number>" value, default to 5 if not found
    divisor = t_values[-1] if t_values else 5

    # Separate values: count only those in action_keys, print others
    filtered_values = []
    
    for value in row:
        str_value = str(value)
        
        if re.match(r"^t\d+$", str_value):
            continue  # Skip "t<number>" values

        if str_value in action_keys:
            filtered_values.append(str_value)  # Count only valid actions
        #else:
            #print("Unknown action key")
            #print(str_value, end=" ")  # Print values not in action_keys
    
    if filtered_values:  # Avoid creating an empty Series warning
        value_counts = pd.Series(filtered_values).value_counts() / divisor
    else:
        value_counts = pd.Series(dtype=float)  # Return an empty Series if nothing is counted

    return value_counts


### Count of all keys pressed in the first 5 seconds:


old function  to count values up to the first occurrence of "t5"
def count_values_t5(row):
    # Slice the row up to (but not including) the first "t5"
    sliced_row = row[:row.tolist().index("t5")] if "t5" in row.tolist() else row
    # Count occurrences of each value in the sliced row
    value_counts = sliced_row.value_counts().to_dict()
    return value_counts

def count5(data):
    value_counts_df = pd.json_normalize(data.apply(count_values_t5, axis=1))
    value_counts_df = value_counts_df.fillna(0)
    value_counts_df = value_counts_df.add_suffix("_f5")
    return value_counts_df

#### New function

###### Function to count values up to the first occurrence of "t5"
def count_values_t5(row):
    # Convert row to a list
    row_list = row.tolist()
    
    # Slice the row up to (but not including) the first "t5"
    sliced_row = row_list[:row_list.index("t5")] if "t5" in row_list else row_list

    # Separate values: count only those in action_keys, print others
    filtered_values = []
    
    for value in sliced_row:
        str_value = str(value)
        
        if re.match(r"^t\d+$", str_value):
            continue  # Skip "t<number>" values
        
        if str_value in action_keys:
            filtered_values.append(str_value)  # Count only valid actions
        #else:
            #print("Unknown action key")
            #print(str_value, end=" ")  # Print values not in action_keys
    
    if filtered_values:  # Avoid creating an empty Series warning
        value_counts = pd.Series(filtered_values).value_counts().to_dict()
    else:
        value_counts = {}  # Return an empty dict if nothing is counted

    return value_counts

##### Apply count_values_t5 function to the DataFrame
def count5(data):
    value_counts_df = pd.json_normalize(data.apply(count_values_t5, axis=1))
    value_counts_df = value_counts_df.fillna(0)
    value_counts_df = value_counts_df.add_suffix("_f5")
    return value_counts_df

In [13]:
action_keys_f5 = [
    'hotkey00', 'hotkey02', 'hotkey10', 'hotkey11', 'hotkey12', 
    'hotkey20', 'hotkey21', 'hotkey22', 'hotkey30', 'hotkey31', 
    'hotkey32', 'hotkey40', 'hotkey41', 'hotkey42', 'hotkey50', 
    'hotkey51', 'hotkey52', 'hotkey60', 'hotkey61', 'hotkey62', 
    'hotkey70', 'hotkey71', 'hotkey72', 'hotkey80', 'hotkey82', 
    'hotkey90', 'hotkey92', 's', 'Base', 'SingleMineral'
]

In [14]:

# Function to count values up to the first occurrence of "t5"
def count_values_t5(row):
    row_list = row.tolist()  # Convert row to a list

    # Slice the row up to (but not including) the first "t5"
    sliced_row = row_list[:row_list.index("t5")] if "t5" in row_list else row_list

    # Filter and count only valid actions
    filtered_values = [str(value) for value in sliced_row if str(value) in action_keys_f5]

    # Create a dictionary of counts
    value_counts = pd.Series(filtered_values).value_counts().to_dict() if filtered_values else {}

    # Ensure all expected keys are present, even if zero
    return {key: value_counts.get(key, 0) for key in action_keys_f5}

# Function to apply count_values_t5 and normalize the result into a DataFrame
def count5(data):
    value_counts_df = pd.json_normalize(data.apply(count_values_t5, axis=1)).fillna(0)
    value_counts_df = value_counts_df.add_suffix("_f5")  # Add suffix to all column names
    return value_counts_df


### Creation of features

In [15]:
def create_features(dataset,is_train: bool=True):
    """
    Create features for the given dataset by applying feature engineering functions.

    Args:
        dataset (pd.DataFrame): The dataset to process.
        data (str)

    Returns:
        pd.DataFrame: The dataset with new features added.
    """
    # Ensure a copy of the dataset is used to avoid modifying the original
    #dff_features = dataset.copy()
    if is_train:
        df_temp= dataset.drop(['battleneturl','played_race'],axis=1, errors='ignore') 
        df_features= dataset[['battleneturl','played_race']].copy()
    else:
         df_temp= dataset.drop(['played_race'],axis=1, errors='ignore')
         df_features= dataset[['played_race']].copy()
    

    # Apply the calculate_action_ratios function
    action_ratios = df_temp.apply(calculate_action_ratios, axis=1)
    df_features[['ratio_s', 'ratio_base', 'ratio_mineral']] = action_ratios

    # Apply the calculate_action_mean function
    df_features['action_per_5_seconds'] = df_temp.iloc[:, 2:].apply(calculate_action_mean, axis=1)

    # Apply the count_hotkeys_xx function
    hotkeys_ratios = df_temp.apply(count_hotkeys_xx, axis=1)
    df_features[['ratio_x0', 'ratio_x1', 'ratio_x2']] = hotkeys_ratios

    #Number of all keys pressed per second
    df_features[['Base', 'SingleMineral', 'hotkey00', 'hotkey01', 'hotkey02', 'hotkey10',
       'hotkey11', 'hotkey12', 'hotkey20', 'hotkey21', 'hotkey22', 'hotkey30',
       'hotkey31', 'hotkey32', 'hotkey40', 'hotkey41', 'hotkey42', 'hotkey50',
       'hotkey51', 'hotkey52', 'hotkey60', 'hotkey61', 'hotkey62', 'hotkey70',
       'hotkey71', 'hotkey72', 'hotkey80', 'hotkey81', 'hotkey82', 'hotkey90',
       'hotkey91', 'hotkey92', 's']] = df_temp.apply(count_values, axis=1).fillna(0)

   

    
    # Count of all keys pressed in the first 5 seconds:
    df_features[['hotkey00_f5', 'hotkey02_f5', 'hotkey10_f5', 'hotkey11_f5', 'hotkey12_f5', 
    'hotkey20_f5', 'hotkey21_f5', 'hotkey22_f5', 'hotkey30_f5', 'hotkey31_f5', 
    'hotkey32_f5', 'hotkey40_f5', 'hotkey41_f5', 'hotkey42_f5', 'hotkey50_f5', 
    'hotkey51_f5', 'hotkey52_f5', 'hotkey60_f5', 'hotkey61_f5', 'hotkey62_f5', 
    'hotkey70_f5', 'hotkey71_f5', 'hotkey72_f5', 'hotkey80_f5', 'hotkey82_f5', 
    'hotkey90_f5', 'hotkey92_f5', 's_f5', 'Base_f5', 'SingleMineral_f5']]= count5(df_temp)
    

    return df_features

In [44]:
def preprocess(df: pd.DataFrame,  is_train: bool=True, convert_race: bool=True):
    """Calls FeatureGetter on the dataframe, applying preprocessing steps before
    Args:
        df (pd.DataFrame)
        min_ (int)
        max_ (int)
        is_train (bool, optional): whether the current dataframe contains training data 
            (to preprocess dependent variable or not). Defaults to True.
        convert_race (bool, optional): whether to convert race attribute to dummies. Defaults to True.

    Returns:
        _type_: _description_
    """
    if 0 in df.columns:
        df = df.dropna(subset=[0])

    else:
        print("Column named '0' does not exist in the DataFrame.")

    df_feat = create_features(df,is_train)
    #df_cat= df[['battleneturl','played_race']]
    if convert_race: 
        df_feat = get_dummies(df_feat)
    df_feat.columns = df_feat.columns.astype(str)
    if is_train:
        to_categories(df_feat)

    df_feat
    #merged_df = df_feat.merge(df, left_index=True, right_index=True)
    return df_feat 
    

# Preprocess train set

In [45]:
features_train_final= preprocess(features_train)

In [53]:
features_train_final

Unnamed: 0,battleneturl,ratio_s,ratio_base,ratio_mineral,action_per_5_seconds,ratio_x0,ratio_x1,ratio_x2,Base,SingleMineral,...,hotkey80_f5,hotkey82_f5,hotkey90_f5,hotkey92_f5,s_f5,Base_f5,SingleMineral_f5,played_race_Protoss,played_race_Terran,played_race_Zerg
0,53,0.063965,0.006264,0.000475,6.990596,0.082717,0.000000,0.917283,0.039879,0.003021,...,0.0,0.0,0.0,0.0,5.0,1.0,0.0,True,False,False
1,29,0.051058,0.001139,0.000000,6.384615,0.061559,0.001965,0.936477,0.007251,0.000000,...,0.0,0.0,0.0,0.0,6.0,3.0,0.0,True,False,False
2,53,0.040809,0.004460,0.000285,6.867347,0.049539,0.000000,0.950461,0.046535,0.002970,...,0.0,0.0,0.0,0.0,4.0,2.0,0.0,True,False,False
3,29,0.037677,0.001993,0.000000,6.861386,0.059794,0.001031,0.939175,0.020896,0.000000,...,0.0,0.0,0.0,0.0,5.0,2.0,0.0,True,False,False
4,53,0.019740,0.000190,0.000000,5.820755,0.051345,0.000000,0.948655,0.003704,0.000000,...,0.0,0.0,0.0,0.0,4.0,2.0,0.0,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3047,192,0.054285,0.000000,0.000380,12.346591,0.036273,0.000000,0.963727,0.000000,0.004545,...,,,,,,,,False,True,False
3048,196,0.073550,0.000000,0.002752,14.739316,0.033245,0.000000,0.966755,0.000000,0.024681,...,,,,,,,,False,True,False
3049,192,0.085413,0.000000,0.000475,12.995745,0.042306,0.000000,0.957694,0.000000,0.004274,...,,,,,,,,False,True,False
3050,196,0.042896,0.000000,0.003986,20.008547,0.019470,0.000000,0.980530,0.000000,0.072414,...,,,,,,,,False,True,False


In [55]:
features_train_final.isna().sum()

battleneturl            0
ratio_s                 0
ratio_base              0
ratio_mineral           0
action_per_5_seconds    0
                       ..
Base_f5                 8
SingleMineral_f5        8
played_race_Protoss     0
played_race_Terran      0
played_race_Zerg        0
Length: 74, dtype: int64

In [56]:
features_train_final.columns

Index(['battleneturl', 'ratio_s', 'ratio_base', 'ratio_mineral',
       'action_per_5_seconds', 'ratio_x0', 'ratio_x1', 'ratio_x2', 'Base',
       'SingleMineral', 'hotkey00', 'hotkey01', 'hotkey02', 'hotkey10',
       'hotkey11', 'hotkey12', 'hotkey20', 'hotkey21', 'hotkey22', 'hotkey30',
       'hotkey31', 'hotkey32', 'hotkey40', 'hotkey41', 'hotkey42', 'hotkey50',
       'hotkey51', 'hotkey52', 'hotkey60', 'hotkey61', 'hotkey62', 'hotkey70',
       'hotkey71', 'hotkey72', 'hotkey80', 'hotkey81', 'hotkey82', 'hotkey90',
       'hotkey91', 'hotkey92', 's', 'hotkey00_f5', 'hotkey02_f5',
       'hotkey10_f5', 'hotkey11_f5', 'hotkey12_f5', 'hotkey20_f5',
       'hotkey21_f5', 'hotkey22_f5', 'hotkey30_f5', 'hotkey31_f5',
       'hotkey32_f5', 'hotkey40_f5', 'hotkey41_f5', 'hotkey42_f5',
       'hotkey50_f5', 'hotkey51_f5', 'hotkey52_f5', 'hotkey60_f5',
       'hotkey61_f5', 'hotkey62_f5', 'hotkey70_f5', 'hotkey71_f5',
       'hotkey72_f5', 'hotkey80_f5', 'hotkey82_f5', 'hotkey90_f5',
    

In [57]:
unique_battleneturls = features_train_final['battleneturl'].unique()

print(f"Total unique battleneturl values: {len(unique_battleneturls)}")
print(unique_battleneturls)

Total unique battleneturl values: 200
[ 53  29  35  24  40  17  49  16  11  54  61  25  18   0  32   2   3   9
  34   4  30  50  13  52  28  60  20  31  44   8  23  59  10   6  15  45
  21  26  58   5  38   7  14  37  48 177 157 169 151 165 163 152 164  64
 150 188 184 171 162 159 186 174 161 132  63 172  65 134  62 183 141 146
 156 143 158 138 142 182 144 153 135 160 176 179 155 133 136 166 140 175
 148 180 170  86  85 100  66  95  80 131  73 120  70 101 106  92 117 108
  89  74  67 122  81 128  82  96  99 116 105  98 121 123  77  68  90  78
  71 102  93 111  87 124 126 107  88 119 113 118 109  76  94 112  83  72
 130   1  19  46  12  27  43  42  33  47  39  57  51 187 149 173 145 185
 178 154 168 181  22  56  55  36  41 167 139 137 147 103  69 127  79  75
 104 114 125 115  91 129  84 110  97 199 191 190 195 198 197 193 196 194
 189 192]


In [26]:
#turn into csv
features_train_final.to_csv('./data/features_train_final.csv', index=False)

# Preprocess test set

In [27]:
def read_ds_test(path: Optional[Path]=None, ds: str = "TRAIN"):
    """Args:
        path (Optional[Path], optional): the path to read the dataset file. Defaults to /kaggle/input/the-insa-starcraft-2-player-prediction-challenge/{ds}.CSV.gz.
        ds (str, optional): the part to read (TRAIN or TEST), to use when path is None. Defaults to "TRAIN".

    Returns:
        pd.DataFrame:
    """
    with open(f'/kaggle/input/train-sc2-keystrokes/{ds}.CSV' if path is None else path) as f:
        max_actions = max(( len( str(c).split(",")) for c in f.readlines() ))
        f.seek(0)
        _names = ["played_race"] if "TRAIN" in ds else ["played_race"]
        _names.extend(range(max_actions - len(_names)))
        return pd.read_csv(f, names=_names, dtype= str)

In [28]:
#read test set 
features_test_final = read_ds_test("data/TEST.csv")
print(features_test_final.shape)
features_test_final.head(5)

(340, 6818)


Unnamed: 0,played_race,0,1,2,3,4,5,6,7,8,...,6807,6808,6809,6810,6811,6812,6813,6814,6815,6816
0,Zerg,s,s,s,hotkey10,s,hotkey60,s,hotkey00,s,...,,,,,,,,,,
1,Terran,s,hotkey30,hotkey00,t5,s,hotkey32,s,hotkey32,s,...,,,,,,,,,,
2,Protoss,Base,s,s,Base,s,hotkey20,hotkey30,hotkey60,hotkey40,...,,,,,,,,,,
3,Zerg,s,s,s,s,hotkey20,s,hotkey40,t5,t10,...,,,,,,,,,,
4,Protoss,Base,s,s,Base,s,hotkey50,hotkey30,t5,hotkey40,...,,,,,,,,,,


In [29]:
df_test = preprocess(features_test_final,is_train= False)
df_test.head(3)


Unnamed: 0,ratio_s,ratio_base,ratio_mineral,action_per_5_seconds,ratio_x0,ratio_x1,ratio_x2,Base,SingleMineral,hotkey00,...,hotkey80_f5,hotkey82_f5,hotkey90_f5,hotkey92_f5,s_f5,Base_f5,SingleMineral_f5,played_race_Protoss,played_race_Terran,played_race_Zerg
0,0.124395,0.002787,0.00044,18.674286,0.019583,0.013333,0.967083,0.021714,0.003429,0.016,...,0,0,1,0,6,0,0,False,False,True
1,0.083908,0.0,0.0,11.30102,0.032219,0.0,0.967781,0.0,0.0,0.00203,...,0,0,0,0,1,0,0,False,True,False
2,0.081561,0.002934,0.0,14.442149,0.046217,0.0,0.953783,0.016529,0.0,0.000826,...,1,0,0,0,5,2,0,True,False,False


In [30]:
#turn into csv
df_test.to_csv('./data/features_test_final.csv', index=False)