In [21]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from typing import Optional, List, Callable, Any, Union, Dict
from itertools import product
from statistics import mean
from pathlib import Path
import gzip
import os

In [22]:
def read_ds_gzip(path: Optional[Path]=None, ds: str = "TRAIN") -> pd.DataFrame:
    """Load a gzip-compressed, variable-width CSV into a DataFrame.

    Rows may hold different numbers of comma-separated fields, so the file is
    scanned once to find the widest row and then parsed with enough column
    names to hold it.

    Args:
        path (Optional[Path], optional): the path to read the dataset file. Defaults to /kaggle/input/the-insa-starcraft-2-player-prediction-challenge/{ds}.CSV.gz.
        ds (str, optional): the part to read (TRAIN or TEST), used to build the default path when path is None.
            It also selects the leading column names even when path is given: if it contains "TRAIN" the
            first two columns are "battleneturl" and "played_race", otherwise only "played_race". Defaults to "TRAIN".

    Returns:
        pd.DataFrame: every column read as str; the named leading column(s) followed by
        integer-labelled columns 0..n-1 for the remaining fields.

    Raises:
        ValueError: if the file contains no lines.
    """
    source = f'/kaggle/input/the-insa-starcraft-2-player-prediction-challenge/{ds}.CSV.gz' if path is None else path
    with gzip.open(source) as f:
        # gzip.open defaults to binary mode, so fields are counted on raw bytes.
        # Iterating the handle streams the file instead of holding every line
        # in memory at once. The count is a plain comma split (quotes are not
        # interpreted), which only needs to be an upper bound on the row width.
        max_fields = max((line.count(b",") + 1 for line in f), default=0)
        if max_fields == 0:
            raise ValueError(f"no rows found in {source!s}")
        f.seek(0)  # rewind so pandas parses from the first row
        names = ["battleneturl", "played_race"] if "TRAIN" in ds else ["played_race"]
        names.extend(range(max_fields - len(names)))
        return pd.read_csv(f, names=names, dtype=str)

def read_ds(path: Optional[Path]=None, ds: str = "TRAIN") -> pd.DataFrame:
    """Load an uncompressed, variable-width CSV into a DataFrame.

    Rows may hold different numbers of comma-separated fields, so the file is
    scanned once to find the widest row and then parsed with enough column
    names to hold it.

    Args:
        path (Optional[Path], optional): the path to read the dataset file. Defaults to /kaggle/input/train-sc2-keystrokes/{ds}.CSV.
        ds (str, optional): the part to read (TRAIN or TEST), used to build the default path when path is None.
            It also selects the leading column names even when path is given: if it contains "TRAIN" the
            first two columns are "battleneturl" and "played_race", otherwise only "played_race". Defaults to "TRAIN".

    Returns:
        pd.DataFrame: every column read as str; the named leading column(s) followed by
        integer-labelled columns 0..n-1 for the remaining fields.

    Raises:
        ValueError: if the file contains no lines.
    """
    source = f'/kaggle/input/train-sc2-keystrokes/{ds}.CSV' if path is None else path
    with open(source) as f:
        # Iterating the handle streams the file instead of holding every line
        # in memory at once. The count is a plain comma split (quotes are not
        # interpreted), which only needs to be an upper bound on the row width.
        max_fields = max((len(line.split(",")) for line in f), default=0)
        if max_fields == 0:
            raise ValueError(f"no rows found in {source!s}")
        f.seek(0)  # rewind so pandas parses from the first row
        names = ["battleneturl", "played_race"] if "TRAIN" in ds else ["played_race"]
        names.extend(range(max_fields - len(names)))
        return pd.read_csv(f, names=names, dtype=str)

In [23]:
# Raw key-log export, resolved against the notebook's working directory.
# Adjust the relative location below if the data lives elsewhere.
features_train = read_ds(Path.cwd() / "data/TRAIN.CSV")
# To load the test split as well, pass its path and ds="TEST" so the column
# names match: features_test = read_ds(<path to TEST.CSV>, ds="TEST")
features_train.shape #, features_test.shape

(3052, 10539)

### Load and preprocess data

In [2]:
# Load the pre-computed feature table from disk and preview its first rows.
# NOTE(review): the 'Unnamed: 0' column in the preview looks like a saved
# DataFrame index — presumably the CSV was written without index=False; confirm.
df_features = pd.read_csv('data/processed_df.csv')
df_features.head()

Unnamed: 0.1,Unnamed: 0,battleneturl,s_mean,base_mean,mineral_mean,hotkeys_mean,actions_mean,max_time,played_race_Protoss,played_race_Terran,played_race_Zerg
0,0,53,2.036254,0.199396,0.015106,4.492447,6.743202,1655.0,True,False,False
1,1,29,1.620482,0.036145,0.0,4.596386,6.253012,1655.0,True,False,False
2,2,53,2.128713,0.232673,0.014851,4.29703,6.673267,1010.0,True,False,False
3,3,29,1.965347,0.10396,0.0,4.787129,6.856436,1005.0,True,False,False
4,4,53,1.925926,0.018519,0.0,3.787037,5.731481,540.0,True,False,False


In [13]:
# Remove the leftover 'Unnamed: 0' column picked up from the CSV.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# is already gone is a no-op instead of a KeyError.
df_features = df_features.drop(columns='Unnamed: 0', errors='ignore')

In [34]:
# Choose the table that feeds the model: the aggregated feature table is used;
# the commented line is the alternative of using the raw key-log frame.
# NOTE(review): features_train is read with dtype=str, so it presumably needs
# encoding before it can be passed to the classifier — confirm before switching.
data = df_features
#data = features_train

In [27]:
# Preview the first rows of the selected table.
# NOTE(review): the saved output shows raw key-log columns rather than the
# aggregated features, so it was presumably produced while `data` pointed at
# features_train — re-run to refresh.
data.head()

Unnamed: 0,battleneturl,played_race,0,1,2,3,4,5,6,7,...,10527,10528,10529,10530,10531,10532,10533,10534,10535,10536
0,http://eu.battle.net/sc2/en/profile/4234852/1/...,Protoss,Base,s,s,s,s,s,t5,Base,...,,,,,,,,,,
1,http://eu.battle.net/sc2/en/profile/3074362/1/...,Protoss,s,Base,s,s,Base,s,s,Base,...,,,,,,,,,,
2,http://eu.battle.net/sc2/en/profile/4234852/1/...,Protoss,Base,s,s,s,Base,s,hotkey30,hotkey00,...,,,,,,,,,,
3,http://eu.battle.net/sc2/en/profile/3074362/1/...,Protoss,Base,s,s,Base,s,s,s,t5,...,,,,,,,,,,
4,http://eu.battle.net/sc2/en/profile/4234852/1/...,Protoss,Base,s,s,s,Base,s,hotkey30,hotkey00,...,,,,,,,,,,


In [31]:
# Inspect column dtypes of the selected table.
# NOTE(review): the saved output lists 10539 object columns, which matches the
# raw key-log frame rather than df_features — presumably stale; re-run to refresh.
data.dtypes

battleneturl    object
played_race     object
0               object
1               object
2               object
                 ...  
10532           object
10533           object
10534           object
10535           object
10536           object
Length: 10539, dtype: object

In [35]:
# Target is the player identifier; every other column is a feature.
y = data['battleneturl']
X = data.drop(columns=['battleneturl'])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [36]:
# Give both splits string column labels so the estimator sees uniform
# feature names (the raw key-log frame mixes str and int labels).
for split_frame in (X_train, X_test):
    split_frame.columns = split_frame.columns.astype(str)

### Train the model

In [37]:
# Initialize the model: a 100-tree random forest with a fixed seed so
# repeated runs build the same trees.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model on the training split; the fitted estimator is the cell's
# last expression, so its repr is displayed.
rf_model.fit(X_train, y_train)

In [38]:
# Make predictions: one predicted battleneturl label per held-out row.
y_pred = rf_model.predict(X_test)

### Evaluate

In [39]:
# Accuracy: fraction of held-out rows whose player was predicted exactly.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Confusion Matrix (rows: true labels, columns: predicted labels)
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Classification Report
# Some players are never predicted, which makes their precision undefined.
# zero_division=0 reports those as 0.0 explicitly instead of emitting one
# UndefinedMetricWarning per metric; the reported numbers are unchanged.
class_report = classification_report(y_test, y_pred, zero_division=0)
print("Classification Report:")
print(class_report)

Accuracy: 0.37
Confusion Matrix:
[[3 0 0 ... 0 0 0]
 [0 0 0 ... 1 0 0]
 [0 1 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.75      0.86         4
           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00         2
           3       0.44      0.50      0.47         8
           4       0.50      0.50      0.50         4
           5       0.00      0.00      0.00         3
           6       0.00      0.00      0.00         1
           7       0.25      0.33      0.29         3
           8       0.00      0.00      0.00         4
           9       0.80      1.00      0.89         4
          10       1.00      1.00      1.00         1
          11       1.00      1.00      1.00         2
          12       0.00      0.00      0.00         2
          13       0.40      0.33      0.36         6
          14 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Feature importance (optional)

In [40]:
# Pair each feature column with the importance the fitted forest assigned to
# it, then rank from most to least important.
feature_names = X.columns
importances = rf_model.feature_importances_
importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
)
importance_df = importance_df.sort_values(by='Importance', ascending=False)

print("Feature Importances:")
print(importance_df)

Feature Importances:
               Feature  Importance
3         hotkeys_mean    0.190358
0               s_mean    0.184579
4         actions_mean    0.180466
1            base_mean    0.159960
5             max_time    0.140290
2         mineral_mean    0.127654
6  played_race_Protoss    0.007088
8     played_race_Zerg    0.004815
7   played_race_Terran    0.004789
