In [1]:
import pandas as pd
import numpy as np
from typing import Optional, List, Callable, Any, Union, Dict
from itertools import product
from statistics import mean
from pathlib import Path
import gzip
import os
import matplotlib.pyplot as plt

### Read datasets

In [2]:
def read_ds_gzip(path: Optional[Path]=None, ds: str = "TRAIN") -> pd.DataFrame:
    """Args:
        path (Optional[Path], optional): the path to read the dataset file. Defaults to /kaggle/input/the-insa-starcraft-2-player-prediction-challenge/{ds}.CSV.gz.
        ds (str, optional): the part to read (TRAIN or TEST), to use when path is None. Defaults to "TRAIN".

    Returns:
        pd.DataFrame:
    """
    with gzip.open(f'/kaggle/input/the-insa-starcraft-2-player-prediction-challenge/{ds}.CSV.gz' if path is None else path) as f:
        max_actions = max(( len( str(c).split(",")) for c in f.readlines() ))
        f.seek(0)
        _names = ["battleneturl", "played_race"] if "TRAIN" in ds else ["played_race"]
        _names.extend(range(max_actions - len(_names)))
        return pd.read_csv(f, names=_names, dtype= str)

def read_ds(path: Optional[Path]=None, ds: str = "TRAIN"):
    """Args:
        path (Optional[Path], optional): the path to read the dataset file. Defaults to /kaggle/input/the-insa-starcraft-2-player-prediction-challenge/{ds}.CSV.gz.
        ds (str, optional): the part to read (TRAIN or TEST), to use when path is None. Defaults to "TRAIN".

    Returns:
        pd.DataFrame:
    """
    with open(f'/kaggle/input/train-sc2-keystrokes/{ds}.CSV' if path is None else path) as f:
        max_actions = max(( len( str(c).split(",")) for c in f.readlines() ))
        f.seek(0)
        _names = ["battleneturl", "played_race"] if "TRAIN" in ds else ["played_race"]
        _names.extend(range(max_actions - len(_names)))
        return pd.read_csv(f, names=_names, dtype= str)

In [31]:
#read training data 
features_train = read_ds(Path(os.path.abspath('')) / "data/TRAIN.CSV")
features_train.shape

(3052, 10539)

In [32]:
features_train.head(5)

Unnamed: 0,battleneturl,played_race,0,1,2,3,4,5,6,7,...,10527,10528,10529,10530,10531,10532,10533,10534,10535,10536
0,http://eu.battle.net/sc2/en/profile/4234852/1/...,Protoss,Base,s,s,s,s,s,t5,Base,...,,,,,,,,,,
1,http://eu.battle.net/sc2/en/profile/3074362/1/...,Protoss,s,Base,s,s,Base,s,s,Base,...,,,,,,,,,,
2,http://eu.battle.net/sc2/en/profile/4234852/1/...,Protoss,Base,s,s,s,Base,s,hotkey30,hotkey00,...,,,,,,,,,,
3,http://eu.battle.net/sc2/en/profile/3074362/1/...,Protoss,Base,s,s,Base,s,s,s,t5,...,,,,,,,,,,
4,http://eu.battle.net/sc2/en/profile/4234852/1/...,Protoss,Base,s,s,s,Base,s,hotkey30,hotkey00,...,,,,,,,,,,


In [5]:
features_train.describe()

Unnamed: 0,battleneturl,played_race,0,1,2,3,4,5,6,7,...,10527,10528,10529,10530,10531,10532,10533,10534,10535,10536
count,3052,3052,3044,3044,3044,3044,3044,3044,3043,3042,...,1,1,1,1,1,1,1,1,1,1
unique,200,3,4,10,17,22,28,27,29,32,...,1,1,1,1,1,1,1,1,1,1
top,http://kr.battle.net/sc2/en/profile/2348639/1/...,Protoss,s,s,s,s,s,s,s,s,...,hotkey22,hotkey12,hotkey22,hotkey12,hotkey22,hotkey12,hotkey22,hotkey42,hotkey12,t5770
freq,58,1210,1839,2757,2569,1770,1610,1174,1046,888,...,1,1,1,1,1,1,1,1,1,1


### Dependent Variable
Our dependent variable is a categorical string; we can convert it to categories codes (number) with pd.Categorical

pd.Categorical doesn't directly modify the battleneturl to a number, instead it adds a cat.codes attribute to it. We can create a little function to convert the dependent variable from string to its category ID:

In [7]:
def to_categories(df: pd.DataFrame, col: str="battleneturl") -> None:
    """Convert col of df to a categorical column"""
    df["battleneturl"] = pd.Categorical(df["battleneturl"])
    df[[col]] = df[[col]].apply(lambda x: x.cat.codes)

In [33]:
#save original urls 
urls = features_train['battleneturl']

In [35]:
to_categories(features_train, col='battleneturl')
#features_train.head()

### Removing outliers

In [9]:
#Remove NA rows
df_cleaned = features_train.dropna(subset=[0])
print(features_train.shape)
print(df_cleaned.shape)

(3052, 10539)
(3044, 10539)


### Manual features

#### COUNT for the first 5 keys pressed

In [11]:
# Function to count values per row
def count_values(row):
    return row.value_counts()

def count5(df_cleaned):
    # Apply the counting function to each row for columns 2 to 5
    df_count5 = df_cleaned[df_cleaned.columns[2:8]].apply(count_values, axis=1).fillna(0)
    df_count5['battleneturl'] = df_cleaned['battleneturl']
    return df_count5

In [12]:
df_count5 = count5(df_cleaned)
df_count5.head(5)

Unnamed: 0,Base,SingleMineral,hotkey00,hotkey02,hotkey10,hotkey11,hotkey12,hotkey20,hotkey21,hotkey22,...,hotkey62,hotkey70,hotkey80,hotkey90,hotkey92,s,t10,t15,t5,battleneturl
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,53
1,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,29
2,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,53
3,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,29
4,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,53


#### COUNT all keys pressed in the run

In [13]:
#function to count all hotkeys
def count_all(df_cleaned):
    df_count = df_cleaned[df_cleaned.columns[2:]].apply(count_values, axis=1).fillna(0)
    df_count['battleneturl'] = df_cleaned['battleneturl']
    return df_count.iloc[:, :33]

In [14]:
df_count = count_all(df_cleaned)
df_count.head(5)

Unnamed: 0,Base,SingleMineral,hotkey00,hotkey01,hotkey02,hotkey10,hotkey11,hotkey12,hotkey20,hotkey21,...,hotkey70,hotkey71,hotkey72,hotkey80,hotkey81,hotkey82,hotkey90,hotkey91,hotkey92,s
0,66.0,5.0,5.0,0.0,41.0,63.0,0.0,350.0,37.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,674.0
1,12.0,0.0,2.0,0.0,83.0,65.0,3.0,448.0,15.0,0.0,...,0.0,0.0,0.0,1.0,0.0,16.0,1.0,0.0,18.0,538.0
2,47.0,3.0,3.0,0.0,18.0,17.0,0.0,130.0,9.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,430.0
3,21.0,0.0,2.0,0.0,39.0,32.0,1.0,259.0,14.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,14.0,397.0
4,2.0,0.0,1.0,0.0,0.0,10.0,0.0,52.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,208.0


#### Hotkey count

In [39]:
import re

In [41]:
# Assuming you already have the function to count keys pressed
def count_hotkeys_x0(row):
    # Find all hotkeyXX patterns
    hotkeys = re.findall(r"hotkey\d{2}", " ".join(row.astype(str)))
    
    # Count occurrences of all hotkeys
    hotkey_counts = pd.Series(hotkeys).value_counts()
    
    # Calculate the total number of hotkeys
    total_hotkeys = hotkey_counts.sum()
    
    # Filter for hotkeyX0 patterns
    hotkey_x0_counts = hotkey_counts[hotkey_counts.index.str.endswith("0")].sum()
    
    # Calculate the ratio of hotkeyX0 to all hotkeys
    ratio_x0 = hotkey_x0_counts / total_hotkeys if total_hotkeys > 0 else 0
    return ratio_x0

# Apply the function to the test set to calculate hotkey_x0_ratio for each row (game)
df_hotkey = df_cleaned.apply(count_hotkeys_x0, axis=1)
df_hotkey

# CHANGE THE TEST_SET SO AS TO ADD THE NEW FEATURE OF HOTKEY0
#test_set = pd.concat([df_count5_test, df_count_test, test_df[["hotkey_x0_ratio"]]], axis=1)

<bound method Series.max of 0       0.082717
1       0.061559
2       0.049539
3       0.059794
4       0.051345
          ...   
3047    0.036273
3048    0.033245
3049    0.042306
3050    0.019470
3051    0.033124
Length: 3044, dtype: float64>

In [47]:
# Find and print the maximum and minimum values of the hotkey_x0_ratio_mean
max_hotkey_x0_ratio_mean = test_df["hotkey_x0_ratio_mean"].max()
min_hotkey_x0_ratio_mean = test_df["hotkey_x0_ratio_mean"].min()

print(f"Maximum mean hotkeyX0 ratio: {max_hotkey_x0_ratio_mean}")
print(f"Minimum mean hotkeyX0 ratio: {min_hotkey_x0_ratio_mean}")

KeyError: 'hotkey_x0_ratio_mean'

In [46]:
print(max(df_hotkey))
min(df_hotkey)

max_hotkey_x0_ratio_mean=test_df

1.0


0.005487411233053583

### Getting features...

Building a mini framework to read our Dataframe and convert it to features.

Now we will create features out of the dataset.

FeaturesGetter iterates over an ActionsDataLoader (yield every actions between two 't[xx]') and apply a set of Feature contained in a FeaturePool. At the end, it gets metrics over the values registered by each features in the feature pool.

In [17]:
class CancelBatchException(Exception):
    """Used to cancel processing of a batch of data (when the keystroke sequence is fully read)"""

In [18]:
class Feature:
    def __init__(
        self, name: str, 
        lambda_: Callable[[List[str]], Union[int, float]]=None, 
        val_count: int=None, 
        max_iter: int=None, 
        predicate: Callable[[List[str]], bool]=None, 
        metric: Callable[[List[str]], Union[int, float]]=mean, 
        div: bool=True
    ):
        """If neither lambda_, val_count nor predicate are defined, the _lambda will just be the length of the given action range.

        Args:
            name (str): feature name
            lambda_ (Callable[[List[str]], Union[int, float]], optional): 
                lambda that'll be applied to compute metric value over action ranges. Defaults to None.
            val_count (int, optional): set feature's lambda to be the count of this value (if lambda_ is None). Defaults to None.
            max_iter (int, optional): when exceeding this iteration, the feature will no longer be computed. Defaults to None.
            predicate (Callable[[List[str]], bool], optional): define a predicate to compute lambda across one 
                action range (if lambda_ and val_count is None). Defaults to None.
            metric (Callable[[List[str]], Union[int, float]], optional): the metric used to aggregate feature's 
                values across all ranges. Defaults to mean.
            div (bool, optional): whether to divide the aggregated metric value. Defaults to True.
        """
        self.name, self.metric, self.max_iter, self.div = name, metric, max_iter, div
        self.reset()
        self._lambda: Callable[[List[str]], Union[int, float]]
        if   lambda_   is not None: 
            self._lambda = lambda_
        elif val_count is not None: 
            self._lambda = lambda x: x.count(val_count)
        elif predicate is not None: 
            self._lambda = lambda x: sum(1 for o in x if predicate(o))
        else: 
            self._lambda = lambda x: len(x)
    
    def reset(self):
        """Resets the value of the feature
        """
        self.vals: List[Union[int, float]] = []
        self.val, self.i = 0, 0
        
    def __call__(self, rng: List[str], *args):
        """Compute feature's value according to _lambda, for given action range. Extra *args are given to _lambda

        Args:
            rng (List[str]): range of action (given by ActionDataLoader)
        """
        if self.max_iter is None or self.i < self.max_iter:
            self.val = self._lambda(rng, *args)
            self.vals.append(self.val)
            self.i += 1
            
    @property
    def value(self) -> int | float:
        """Returns:
            int | float: the aggregated feature's value across all action ranges read until now
        """
        return self.metric(self.vals)

In [19]:
class ActionsDataLoader:
    """Iterates over two 'tXX', yielding actions between each time steps
    """
    def __init__(self, actions: pd.DataFrame, do_range: bool = True, max_t: Optional[int]=None):
        """Args:
            actions (List[Feature]): The raw dataframe
            do_range (bool): whether the data loader should iterate and yield each range 
                between two 'tXX', or just yield the whole sequence once then return. Defaults to True.
            max_t (Optional[int], optional): the t max to stop yielding. Defaults to None.
        """
        self.t_indx = [0] + [j for j, val in enumerate(actions) if isinstance(val, str) and val[0] == "t"]
        self.do_range = do_range
        if max_t and max_t < len(self.t_indx):
            self.t_indx = self.t_indx[:max_t]
            self.values = actions.values[: self.t_indx[max_t - 1]]
        else: 
            self.values = actions.values
        self.n_t = len(self.t_indx)
    
    def __len__(self): return 1 if self.do_range else (self.n_t or 1)
    
    def __iter__(self):
        if self.n_t == 0 or not self.do_range:
            self.start_indx = 0
            self.end_indx = self._get_first_nan_indx()
            yield self.values[self.start_indx:self.end_indx].tolist()
            return 
        for self.i in range(self.n_t):
            try:
                self._get_actions_range()
                yield self.values[self.start_indx:self.end_indx].tolist()
            except CancelBatchException: 
                return

    def _get_actions_range(self):
        """Computes the action range until a 'tXX' is met. If there are no more 'tXX', 
            it means we reached the end of the game, and the sequences finish with NaN 
            (or for the longest game, the full row is read).

        Raises:
            CancelBatchException: indicates that there is no more action to be read (next action is NaN).
        """
        self.start_indx = self.t_indx[self.i] + (1 if self.i >0 else 0)
        if  self.start_indx >= len(self.values) or pd.isna(self.values[self.start_indx]): 
            raise CancelBatchException
        self.end_indx = self.t_indx[self.i + 1] if (self.i + 1) < self.n_t else self._get_first_nan_indx()
    
    def _get_first_nan_indx(self) -> int:
        """Returns:
            int: the first index in values that is not NaN
        """
        nans = np.argwhere(pd.isna(self.values[self.start_indx:]))
        return len(self.values) if len(nans) == 0 else nans[0][0]
    
    def get_max_t(self):
        """Gets the last 'tXX' defined. If this data loader was defined with max_t not None, it returns this max_t
        """
        if self.n_t - 1 == 0:
            return 0
        return int(self.values[self.t_indx[self.n_t - 1]][1:]) if self.n_t > 0 else 0

In [20]:
class FeaturesGetter:
    def __init__(self, features: List[Feature], n_rows: int=3052, log: bool=False, **kwargs_dataloader):
        """Args:
            features (List[Feature]): the list of features to compute
            n_rows (int, optional): the number of row (used only in log). Defaults to 3052.
            log (bool, optional): whether to output log information when processing the df. Defaults to False.

            Accepts extra kwargs_dataloader that'll be passed to the dataloader
        """
        self.feature_pool, self.n_rows, self.log, self.kwargs_dataloader = features, n_rows, log, kwargs_dataloader
        self.game_l: int # game length
        self.reset()
        
    def reset(self):
        """Resets the value of each feature in the feature pool"""
        for feature in self.feature_pool: 
            feature.reset()
        self.game_l = 0
    
    def _log(self):
        """Print to stdout the current % of the df that have been processed"""
        global cnt
        cnt += 1
        print(f"{cnt * 100 / self.n_rows:.2f} %", end="\r")
    
    def _one_update(self):
        """Compute each feature's value for one batch (one action range yielded by the ActionDataLoader)"""
        for feature in self.feature_pool: 
            feature(self.actions_rng)
        
    def __call__(self, actions: pd.DataFrame) -> pd.Series:
        """Computes all features' values for each of the given actions, iterating over ADL with parameters defined in __init__
        
        Returns:
            pd.Series: the features' values as a Series. 
                Adds an extra feature which is the game length is max_t is not in __init__ kwargs
        """
        self.reset()
        if self.log:
            self._log()
        adl = ActionsDataLoader(actions, **self.kwargs_dataloader)
        for self.actions_rng in adl:
            self._one_update()
        activs = [f.value / len(adl) if f.div else f.value for f in self.feature_pool]
        self.game_l = (max_t := self.kwargs_dataloader.get("max_t", None)) or adl.get_max_t()
        return pd.Series( activs + ([self.game_l] if max_t is None else []) )

Defining lambdas to convert dataset to features
We create basic features, corresponding to the mean of each action played per timestamp plus the mean of all actions together

In [21]:
FEATURES_NAMES = ["s_mean", "base_mean", "mineral_mean", "hotkeys_mean", "actions_mean"]
ACTIONS = [ "s", "Base", "SingleMineral", "hotkey" ]

def get_base_features() -> List[Feature]:
    """Defines base features (mean of count of each action / hotkeys)
    """
    features = []
    for i, action in enumerate(ACTIONS[:-1]):
        features.append(Feature(FEATURES_NAMES[i], val_count=action))
    features.append(Feature(FEATURES_NAMES[-2], predicate=lambda x: x.startswith(ACTIONS[-1]))) # hotkeys
    features.append(Feature(FEATURES_NAMES[-1])) # all actions combined (no lambda_ means lambda_ is just the length)
    for i, j in product(range(10), range(3)):
        pass
    #TODO
        # OTHER FEATURES HERE
        #features.append(Feature('feature_name', 'test'))
    # OTHER FEATURES HERE
    # Guess what would be useful ?
    # set div, metric and lambda_ accordingly
    return features

Now it's ready to be put into a function that'll get all the features from the initial dataframe and return a new dataframe containing only those features. FeaturesGetter gets one extra feature from that we created, which is max_time, corresponding to the "xx" of the last "txx" seen.

In [22]:
features_getter = None
def create_features(
    df: pd.DataFrame, 
    min_: int, 
    max_: int, 
    drop: bool=False, 
    features: List[Feature]=get_base_features(), 
    **kwargs
) -> pd.DataFrame:
    """Compute features on given dataframe

    Args:
        df (pd.DataFrame)
        min_ (int): index of the first action to pass to the feature
        max_ (int): index of the last action to pass to the feature
        drop (bool, optional): whether to drop original columns of the dataframe. Defaults to False.
        features (List[Feature], optional). Defaults to get_base_features().

    Returns:
        pd.DataFrame: a dataframe containing features' values for each row
    """
    global features_getter
    features_getter = FeaturesGetter(features, **kwargs)
    final_df = df.loc[:,min_:max_].apply(features_getter, axis=1, result_type='expand')
    final_df.columns = [f.name for f in features_getter.feature_pool] + (["max_time"] if not kwargs.get("max_t") else [])
    if drop:
        df = df.drop(columns=[i for i in range(min_, max_ + 1)])
    final_df = pd.concat([df, final_df], axis=1)
    features_getter.reset()
    return final_df

#### Handling string
The race_played column can only take three values; instead of converting it to categorical as we did with our dependent variable, we will instead convert it to dummy variables: we one-hot encode each race. It will not add many columns to our dataframe (only three) but will allow the decision trees to split much faster on the race (on only one binary split).

In [23]:
def get_dummies(df: pd.DataFrame):
    """Converts textual columns to one-hot encoded vectors (one column per possible value)"""
    df = pd.get_dummies(df, columns=["played_race"])
    return df

Function preprocess creates a pipeline of all the function we just implemented: it create the features, converts the race to dummy variables and the dependent variable to category codes.

In [24]:
def preprocess(df: pd.DataFrame, min_: int, max_: int, is_train: bool=True, convert_race: bool=True, **kwargs):
    """Calls FeatureGetter on the dataframe, applying preprocessing steps before
    Args:
        df (pd.DataFrame)
        min_ (int)
        max_ (int)
        is_train (bool, optional): whether the current dataframe contains training data 
            (to preprocess dependent variable or not). Defaults to True.
        convert_race (bool, optional): whether to convert race attribute to dummies. Defaults to True.

    Returns:
        _type_: _description_
    """
    df = create_features(df, min_, max_, **kwargs)
    if convert_race: 
        df = get_dummies(df)
    df.columns = df.columns.astype(str)
    if is_train:
        to_categories(df)
    return df

In [25]:
%%time
cnt=0
processed_df = preprocess(df_cleaned, 0, features_train.shape[1]-3, drop=True, n_rows=3052, log=True) 
processed_df.head()

CPU times: total: 38.7 s
Wall time: 43.7 s


Unnamed: 0,battleneturl,s_mean,base_mean,mineral_mean,hotkeys_mean,actions_mean,max_time,played_race_Protoss,played_race_Terran,played_race_Zerg
0,53,2.036254,0.199396,0.015106,4.492447,6.743202,1655.0,True,False,False
1,29,1.620482,0.036145,0.0,4.596386,6.253012,1655.0,True,False,False
2,53,2.128713,0.232673,0.014851,4.29703,6.673267,1010.0,True,False,False
3,29,1.965347,0.10396,0.0,4.787129,6.856436,1005.0,True,False,False
4,53,1.925926,0.018519,0.0,3.787037,5.731481,540.0,True,False,False


### Combine and save features

In [15]:
own_features = pd.concat([df_count5, df_count], axis=1)
own_features =own_features.drop(['t10', 't15', 't5'], axis=1)
#own_features.to_csv('data/output/own_features.csv', index=False)

In [92]:
own_features.shape

(3044, 61)

### Get features for test data

In [20]:
#read test set 
test_df = read_ds("data/TEST.csv")
print(test_df.shape)
test_df.head(5)

(340, 6818)


Unnamed: 0,battleneturl,played_race,0,1,2,3,4,5,6,7,...,6806,6807,6808,6809,6810,6811,6812,6813,6814,6815
0,Zerg,s,s,s,hotkey10,s,hotkey60,s,hotkey00,s,...,,,,,,,,,,
1,Terran,s,hotkey30,hotkey00,t5,s,hotkey32,s,hotkey32,s,...,,,,,,,,,,
2,Protoss,Base,s,s,Base,s,hotkey20,hotkey30,hotkey60,hotkey40,...,,,,,,,,,,
3,Zerg,s,s,s,s,hotkey20,s,hotkey40,t5,t10,...,,,,,,,,,,
4,Protoss,Base,s,s,Base,s,hotkey50,hotkey30,t5,hotkey40,...,,,,,,,,,,


In [21]:
#create features on test set (COUNT first 5 keys pressed per run)
df_count5_test =test_df[test_df.columns[2:8]].apply(count_values, axis=1).fillna(0)
df_count5_test = df_count5_test.iloc[:, :27]
df_count5_test.head(5)

Unnamed: 0,Base,SingleMineral,hotkey00,hotkey02,hotkey10,hotkey11,hotkey12,hotkey20,hotkey21,hotkey22,...,hotkey51,hotkey52,hotkey60,hotkey61,hotkey62,hotkey70,hotkey80,hotkey90,hotkey92,s
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0


In [22]:
#create features on test set (COUNT all keys pressed per run)
df_count_test = test_df[test_df.columns[2:]].apply(count_values, axis=1).fillna(0)
df_count_test = df_count_test.iloc[:, :33]
df_count_test.head(5)

Unnamed: 0,Base,SingleMineral,hotkey00,hotkey01,hotkey02,hotkey10,hotkey11,hotkey12,hotkey20,hotkey21,...,hotkey70,hotkey71,hotkey72,hotkey80,hotkey81,hotkey82,hotkey90,hotkey91,hotkey92,s
0,19.0,3.0,14.0,0.0,847.0,7.0,31.0,352.0,10.0,0.0,...,4.0,0.0,43.0,0.0,0.0,0.0,6.0,0.0,29.0,847.0
1,0.0,0.0,2.0,0.0,33.0,14.0,0.0,336.0,31.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,47.0,571.0
2,19.0,0.0,1.0,0.0,0.0,10.0,0.0,479.0,53.0,0.0,...,3.0,0.0,32.0,5.0,0.0,49.0,0.0,0.0,0.0,556.0
3,17.0,2.0,0.0,0.0,0.0,68.0,89.0,525.0,13.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1587.0
4,20.0,0.0,3.0,0.0,0.0,4.0,7.0,74.0,8.0,0.0,...,1.0,0.0,3.0,4.0,0.0,1.0,0.0,0.0,0.0,236.0


In [23]:
#merge features and save
test_set = pd.concat([df_count5_test, df_count_test], axis=1)
print(test_set.shape)
t#est_set.to_csv('data/test_features.csv', index=False)

(340, 60)


### Save predictions

#### Translate categories back to battleneturls

In [25]:
# Compare column names
train_columns = set(own_features.columns)  # Columns from the training data
test_columns = set(test_set.columns)    # Columns from the test data

if train_columns != test_columns:
    print("Mismatch in columns!")
    print("Missing in test:", train_columns - test_columns)
    print("Extra in test:", test_columns - train_columns)

Mismatch in columns!
Missing in test: {'battleneturl'}
Extra in test: set()


In [36]:
#create translation file
translation = pd.concat([urls, features_train['battleneturl']] ,axis=1)
translation.columns = ['url','battleneturl']
translation.head(5)

Unnamed: 0,url,battleneturl
0,http://eu.battle.net/sc2/en/profile/4234852/1/...,53
1,http://eu.battle.net/sc2/en/profile/3074362/1/...,29
2,http://eu.battle.net/sc2/en/profile/4234852/1/...,53
3,http://eu.battle.net/sc2/en/profile/3074362/1/...,29
4,http://eu.battle.net/sc2/en/profile/4234852/1/...,53


In [37]:
#saved as translation
#translation.drop_duplicates().to_csv('translation.csv',sep=',', index=False)