In [1]:
import pandas as pd
import numpy as np
from typing import Optional, List, Callable, Any, Union, Dict
from itertools import product
from statistics import mean
from pathlib import Path
import gzip
import os
import matplotlib.pyplot as plt
import seaborn as sns
import re
from scipy.stats import f_oneway

# Read dataset

In [3]:
def read_ds_gzip(path: Optional[Path]=None, ds: str = "TRAIN") -> pd.DataFrame:
    """Read a gzip-compressed, ragged action CSV into a DataFrame.

    Lines of the file have a varying number of comma-separated fields, so the
    file is scanned once to find the widest line and enough column names are
    generated to cover it before handing the stream to ``pd.read_csv``.

    Args:
        path (Optional[Path], optional): the path to read the dataset file. Defaults to /kaggle/input/the-insa-starcraft-2-player-prediction-challenge/{ds}.CSV.gz.
        ds (str, optional): the part to read (TRAIN or TEST). It selects the default file when path is None
            and always decides the leading column names: ``battleneturl`` and ``played_race`` when it
            contains "TRAIN", only ``played_race`` otherwise. Defaults to "TRAIN".

    Returns:
        pd.DataFrame: one row per line, read with ``dtype=str``; the named leading columns are followed
            by integer-labelled columns ``0..k-1``.

    Raises:
        ValueError: if the file contains no lines.
    """
    source = f'/kaggle/input/the-insa-starcraft-2-player-prediction-challenge/{ds}.CSV.gz' if path is None else path
    # Text mode yields str lines; the default binary mode would yield bytes.
    with gzip.open(source, "rt", encoding="utf-8") as f:
        # Stream the file line by line instead of materialising it with readlines().
        max_fields = max((line.count(",") + 1 for line in f), default=0)
        if max_fields == 0:
            raise ValueError(f"dataset file is empty: {source}")
        f.seek(0)  # rewind so read_csv parses from the first line
        _names = ["battleneturl", "played_race"] if "TRAIN" in ds else ["played_race"]
        _names.extend(range(max_fields - len(_names)))
        return pd.read_csv(f, names=_names, dtype=str)

def read_ds(path: Optional[Path]=None, ds: str = "TRAIN") -> pd.DataFrame:
    """Read an uncompressed, ragged action CSV into a DataFrame.

    Lines of the file have a varying number of comma-separated fields, so the
    file is scanned once to find the widest line and enough column names are
    generated to cover it before handing the stream to ``pd.read_csv``.

    Args:
        path (Optional[Path], optional): the path to read the dataset file. Defaults to /kaggle/input/train-sc2-keystrokes/{ds}.CSV.
        ds (str, optional): the part to read (TRAIN or TEST). It selects the default file when path is None
            and always decides the leading column names: ``battleneturl`` and ``played_race`` when it
            contains "TRAIN", only ``played_race`` otherwise. Defaults to "TRAIN".

    Returns:
        pd.DataFrame: one row per line, read with ``dtype=str``; the named leading columns are followed
            by integer-labelled columns ``0..k-1``.

    Raises:
        ValueError: if the file contains no lines.
    """
    source = f'/kaggle/input/train-sc2-keystrokes/{ds}.CSV' if path is None else path
    # Explicit encoding so the result does not depend on the platform's locale default.
    with open(source, encoding="utf-8") as f:
        # Stream the file line by line instead of materialising it with readlines().
        max_fields = max((line.count(",") + 1 for line in f), default=0)
        if max_fields == 0:
            raise ValueError(f"dataset file is empty: {source}")
        f.seek(0)  # rewind so read_csv parses from the first line
        _names = ["battleneturl", "played_race"] if "TRAIN" in ds else ["played_race"]
        _names.extend(range(max_fields - len(_names)))
        return pd.read_csv(f, names=_names, dtype=str)

In [4]:
# Load the training set from data/train.csv, resolved against the current working directory.
features_train = read_ds(Path(os.path.abspath('')) / "data/train.csv") # Replace with the correct path for your environment
# features_test = read_ds(ds="TEST")  # NOTE: the first positional argument of read_ds is `path`, so `ds` must be passed by keyword
features_train.shape #, features_test.shape

(3052, 10539)

In [12]:
# Display the loaded training frame (the notebook truncates the rendering of large frames).
features_train

Unnamed: 0,battleneturl,played_race,0,1,2,3,4,5,6,7,...,10527,10528,10529,10530,10531,10532,10533,10534,10535,10536
0,http://eu.battle.net/sc2/en/profile/4234852/1/...,Protoss,Base,s,s,s,s,s,t5,Base,...,,,,,,,,,,
1,http://eu.battle.net/sc2/en/profile/3074362/1/...,Protoss,s,Base,s,s,Base,s,s,Base,...,,,,,,,,,,
2,http://eu.battle.net/sc2/en/profile/4234852/1/...,Protoss,Base,s,s,s,Base,s,hotkey30,hotkey00,...,,,,,,,,,,
3,http://eu.battle.net/sc2/en/profile/3074362/1/...,Protoss,Base,s,s,Base,s,s,s,t5,...,,,,,,,,,,
4,http://eu.battle.net/sc2/en/profile/4234852/1/...,Protoss,Base,s,s,s,Base,s,hotkey30,hotkey00,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3047,http://xx.battle.net/sc2/en/profile/405/1/MMA/,Terran,s,s,s,s,s,hotkey10,hotkey20,hotkey30,...,,,,,,,,,,
3048,http://xx.battle.net/sc2/en/profile/410/1/STBo...,Terran,s,s,hotkey10,s,hotkey20,s,s,hotkey12,...,,,,,,,,,,
3049,http://xx.battle.net/sc2/en/profile/405/1/MMA/,Terran,s,s,s,hotkey10,hotkey20,hotkey30,hotkey40,hotkey50,...,,,,,,,,,,
3050,http://xx.battle.net/sc2/en/profile/410/1/STBo...,Terran,s,s,hotkey10,s,hotkey20,s,s,hotkey12,...,,,,,,,,,,


In [13]:
# Count the missing values in every column of the training frame.
features_train.isna().sum()

battleneturl       0
played_race        0
0                  0
1                  0
2                  0
                ... 
10532           3043
10533           3043
10534           3043
10535           3043
10536           3043
Length: 10539, dtype: int64

# Pre-processing and feature creation

Outlier removal step: outliers are defined as the rows (games) that have a null value in the first action column (0); these rows also had null values in the first 5 action columns. Resulting dataset: 3044 rows (games) × 10539 columns.

In [15]:
# Work on a copy so that the experiments below leave features_train untouched.
df_try= features_train.copy()

In [16]:
# Preview the first three rows of the working copy.
df_try.head(3)

Unnamed: 0,battleneturl,played_race,0,1,2,3,4,5,6,7,...,10527,10528,10529,10530,10531,10532,10533,10534,10535,10536
0,http://eu.battle.net/sc2/en/profile/4234852/1/...,Protoss,Base,s,s,s,s,s,t5,Base,...,,,,,,,,,,
1,http://eu.battle.net/sc2/en/profile/3074362/1/...,Protoss,s,Base,s,s,Base,s,s,Base,...,,,,,,,,,,
2,http://eu.battle.net/sc2/en/profile/4234852/1/...,Protoss,Base,s,s,s,Base,s,hotkey30,hotkey00,...,,,,,,,,,,


In [14]:
# Disabled outlier filter, kept for reference only.
# NOTE(review): with inplace=True, dropna() returns None, so `df_cleaned` would be None and the
# `.shape` line below would fail; remove `inplace=True` before re-enabling this cell.
# NOTE(review): the subset skips column 3 although the text above mentions the first 5 action columns — confirm.
#df_cleaned = features_train.dropna(subset=[0, 1, 2, 4],inplace= True)

#df_cleaned.shape


Function to create the dummy (one-hot encoded) features for the played race:

In [None]:
def get_dummies(df: pd.DataFrame):
    """One-hot encode the ``played_race`` column.

    The ``played_race`` column is replaced by one indicator column per distinct
    value; every other column is passed through unchanged. The input frame is
    not modified: the encoded result is returned as a new DataFrame.
    """
    return pd.get_dummies(data=df, columns=["played_race"])

Conversion of player URLs to integer codes:

In [None]:
def to_categories(df: pd.DataFrame, col: str="battleneturl") -> None:
    """Replace column ``col`` of ``df`` with integer category codes, in place.

    Args:
        df (pd.DataFrame): the frame to modify; it is mutated in place.
        col (str, optional): the column to encode. Defaults to "battleneturl".

    Returns:
        None: the encoded column is written back into ``df``.
    """
    # Encode the requested column rather than a hard-coded one, so that `col` is honoured.
    # Each distinct value maps to its integer code within the categorical built from this column.
    df[col] = pd.Categorical(df[col]).codes

Calculate the features `ratio_s`, `ratio_base` and `ratio_mineral`:

In [None]:
def calculate_action_ratios(row):
    """Compute the share of 's', 'Base' and 'SingleMineral' actions in one game.

    Args:
        row (pd.Series): one row of the dataset. Action columns are the entries whose
            label is a non-negative integer, given either as an int (``0``) or as a
            digit string (``"0"``). The row must also carry a ``num_actions`` entry,
            which is used as the denominator.

    Returns:
        pd.Series: ``ratio_s``, ``ratio_base`` and ``ratio_mineral``; each is the
            matching action count divided by ``num_actions``, or 0 when
            ``num_actions`` is not greater than zero.

    Raises:
        KeyError: if ``row`` has no ``num_actions`` entry.
    """
    # Detect the action columns from the labels instead of assuming a fixed number of
    # string-labelled columns, so any width and both int and str labels are supported.
    action_labels = [label for label in row.index if str(label).isdigit()]

    # A single pass over the actions; value_counts ignores missing values.
    action_counts = row.loc[action_labels].value_counts()
    s_count = action_counts.get('s', 0)
    base_count = action_counts.get('Base', 0)
    mineral_count = action_counts.get('SingleMineral', 0)

    # Extract the total number of actions from the 'num_actions' column
    total_actions = row['num_actions']

    # Guard against division by zero (a NaN total also falls back to 0)
    has_actions = total_actions > 0
    ratio_s = s_count / total_actions if has_actions else 0
    ratio_base = base_count / total_actions if has_actions else 0
    ratio_mineral = mineral_count / total_actions if has_actions else 0

    return pd.Series({'ratio_s': ratio_s, 'ratio_base': ratio_base, 'ratio_mineral': ratio_mineral})

Calculate the feature: mean number of actions per time interval

In [None]:
def calculate_action_mean(row):
    """
    Average the number of actions found between consecutive time markers.

    Values are scanned in order. A string of the form ``t<digits>`` closes the
    current window, any other non-missing value counts as one action, and the
    first missing value ends the scan. Windows without any action are left out
    of the average.

    Args:
        row (pd.Series): The sequence of action values to process.

    Returns:
        float: The mean window size, or 0 when no non-empty window was found.
    """
    window_sizes = []
    in_window = 0

    for value in row:
        # The first missing value ends the scan
        if pd.isna(value):
            break

        is_time_marker = isinstance(value, str) and re.match(r"^t\d+$", value) is not None
        if not is_time_marker:
            in_window += 1
            continue

        # A time marker closes the window; empty windows are not recorded
        if in_window > 0:
            window_sizes.append(in_window)
        in_window = 0

    # Keep the trailing window that was not closed by a marker
    if in_window > 0:
        window_sizes.append(in_window)

    if not window_sizes:
        return 0
    return np.mean(window_sizes)

In [None]:
def count_values(row):
    """Return the result of ``row.value_counts()``, i.e. the frequency of each distinct value."""
    frequencies = row.value_counts()
    return frequencies

In [17]:
# Share of hotkey actions whose two-digit code ends in 0 (hotkeyX0), computed per row (game).
def count_hotkeys_x0(row):
    """Return the ratio of ``hotkeyX0`` actions to all hotkey actions in a row.

    Args:
        row (pd.Series): one row of the dataset; every value is converted to text
            and searched for ``hotkey`` followed by two digits.

    Returns:
        float: the number of matches ending in "0" divided by the number of all
            matches, or 0 when the row contains no hotkey action.
    """
    # Find all hotkeyXX patterns
    hotkeys = re.findall(r"hotkey\d{2}", " ".join(row.astype(str)))

    # Count straight from the match list; this also keeps rows without any
    # hotkey away from string operations on an empty pandas index.
    total_hotkeys = len(hotkeys)
    if total_hotkeys == 0:
        return 0

    hotkey_x0_count = sum(1 for hotkey in hotkeys if hotkey.endswith("0"))
    return hotkey_x0_count / total_hotkeys



In [18]:
# Apply the function to every row (game) of df_try, the copy of the training frame, to get its hotkeyX0 ratio.
# NOTE(review): this rebinds df_try from the DataFrame to the resulting Series of ratios, so
# re-running this cell would operate on the Series instead of the original frame.
df_try = df_try.apply(count_hotkeys_x0, axis=1)
df_try

0       0.082717
1       0.061559
2       0.049539
3       0.059794
4       0.051345
          ...   
3047    0.036273
3048    0.033245
3049    0.042306
3050    0.019470
3051    0.033124
Length: 3044, dtype: float64

In [None]:
def create_features(dataset):
    """
    Create features for the given dataset by applying feature engineering functions.

    Adds ``ratio_s``, ``ratio_base`` and ``ratio_mineral`` (from
    ``calculate_action_ratios``) and ``action_per_5_seconds`` (from
    ``calculate_action_mean``) to a copy of the dataset.

    Args:
        dataset (pd.DataFrame): The dataset to process. Action columns are the ones
            whose label is a non-negative integer, given as int or as digit string.

    Returns:
        pd.DataFrame: A copy of the dataset with the new feature columns added.
    """
    # Ensure a copy of the dataset is used to avoid modifying the original
    processed_df = dataset.copy()

    # Select the action columns by label rather than by position. A positional slice
    # taken after new columns are appended would feed the freshly added feature values
    # to calculate_action_mean for rows without NaN padding, and it would drop the first
    # action of frames that have only one leading metadata column.
    action_columns = [col for col in processed_df.columns if str(col).isdigit()]

    # Compute both feature groups before writing anything back to the frame
    action_ratios = processed_df.apply(calculate_action_ratios, axis=1)
    action_means = processed_df[action_columns].apply(calculate_action_mean, axis=1)

    processed_df[['ratio_s', 'ratio_base', 'ratio_mineral']] = action_ratios
    processed_df['action_per_5_seconds'] = action_means

    return processed_df

In [None]:
def preprocess(df: pd.DataFrame, min_: int, max_: int, is_train: bool=True, convert_race: bool=True, **kwargs):
    """Build the feature frame from a raw dataset frame.

    Steps, in order: column labels are converted to strings, ``create_features`` adds
    the engineered features, ``played_race`` is optionally one-hot encoded, and for
    training data ``battleneturl`` is replaced by integer codes.

    Args:
        df (pd.DataFrame): the raw dataset frame; it is not modified.
        min_ (int): not used by the current implementation; kept for interface compatibility.
        max_ (int): not used by the current implementation; kept for interface compatibility.
        is_train (bool, optional): whether the current dataframe contains training data
            (to preprocess dependent variable or not). Defaults to True.
        convert_race (bool, optional): whether to convert race attribute to dummies. Defaults to True.
        **kwargs: accepted and ignored.

    Returns:
        pd.DataFrame: a new frame with string column labels and the added features.
    """
    # Convert labels to strings BEFORE feature creation: the feature code addresses
    # action columns by string label ("0", "1", ...) while the CSV readers generate
    # integer labels. rename() returns a new frame, so the caller's frame is untouched.
    df = df.rename(columns=str)
    df = create_features(df)
    if convert_race:
        df = get_dummies(df)
    # Keep the guarantee that every label is a string after all steps have run
    df.columns = df.columns.astype(str)
    if is_train:
        to_categories(df)
    return df