In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('dataset/starcraft_player_data.csv')
df

Unnamed: 0,GameID,LeagueIndex,Age,HoursPerWeek,TotalHours,APM,SelectByHotkeys,AssignToHotkeys,UniqueHotkeys,MinimapAttacks,MinimapRightClicks,NumberOfPACs,GapBetweenPACs,ActionLatency,ActionsInPAC,TotalMapExplored,WorkersMade,UniqueUnitsMade,ComplexUnitsMade,ComplexAbilitiesUsed
0,52,5,27,10,3000,143.7180,0.003515,0.000220,7,0.000110,0.000392,0.004849,32.6677,40.8673,4.7508,28,0.001397,6,0.000000,0.000000
1,55,5,23,10,5000,129.2322,0.003304,0.000259,4,0.000294,0.000432,0.004307,32.9194,42.3454,4.8434,22,0.001193,5,0.000000,0.000208
2,56,4,30,10,200,69.9612,0.001101,0.000336,4,0.000294,0.000461,0.002926,44.6475,75.3548,4.0430,22,0.000745,6,0.000000,0.000189
3,57,3,19,20,400,107.6016,0.001034,0.000213,1,0.000053,0.000543,0.003783,29.2203,53.7352,4.9155,19,0.000426,7,0.000000,0.000384
4,58,3,32,10,500,122.8908,0.001136,0.000327,2,0.000000,0.001329,0.002368,22.6885,62.0813,9.3740,15,0.001174,4,0.000000,0.000019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3390,10089,8,?,?,?,259.6296,0.020425,0.000743,9,0.000621,0.000146,0.004555,18.6059,42.8342,6.2754,46,0.000877,5,0.000000,0.000000
3391,10090,8,?,?,?,314.6700,0.028043,0.001157,10,0.000246,0.001083,0.004259,14.3023,36.1156,7.1965,16,0.000788,4,0.000000,0.000000
3392,10092,8,?,?,?,299.4282,0.028341,0.000860,7,0.000338,0.000169,0.004439,12.4028,39.5156,6.3979,19,0.001260,4,0.000000,0.000000
3393,10094,8,?,?,?,375.8664,0.036436,0.000594,5,0.000204,0.000780,0.004346,11.6910,34.8547,7.9615,15,0.000613,6,0.000000,0.000631


In [3]:
def sanitize(df: pd.DataFrame):
    """
    Helper utility for replacing the '?' in the Age, TotalHours, HoursPerWeek to an Integer.
    :param df:
    :return:
    """
    df['Age'] = df['Age'].str.replace('?', '0').astype(int)
    df['TotalHours'] = df['TotalHours'].str.replace('?', '0').astype(int)
    df['HoursPerWeek'] = df['HoursPerWeek'].str.replace('?', '0').astype(int)
    df['LeagueIndex'] -= 1

In [4]:
sanitize(df)
df = df.drop('GameID', axis=1)

  df['Age'] = df['Age'].str.replace('?', '0').astype(int)
  df['TotalHours'] = df['TotalHours'].str.replace('?', '0').astype(int)
  df['HoursPerWeek'] = df['HoursPerWeek'].str.replace('?', '0').astype(int)


In [5]:
train_cv, test = train_test_split(df, test_size=0.3)
train, cv = train_test_split(train_cv, test_size=1/7)

In [6]:
train_x = train.drop('LeagueIndex', axis=1)
train_y = train['LeagueIndex']

cv_x = cv.drop('LeagueIndex', axis=1)
cv_y = cv['LeagueIndex']

test_x = test.drop('LeagueIndex', axis=1)
test_y = test['LeagueIndex']

# XGBoost Trees

In [7]:
import xgboost as xgb
from mlxtend.feature_selection import SequentialFeatureSelector

In [8]:
num_features = 5

sfs = SequentialFeatureSelector(xgb.XGBClassifier(), k_features=num_features, forward=True, scoring='accuracy', cv=None)
selected_features = sfs.fit(train_x, train_y)

In [9]:
selected_columns = [train_x.columns[idx] for idx in selected_features.k_feature_idx_]
selected_columns

['HoursPerWeek',
 'SelectByHotkeys',
 'MinimapRightClicks',
 'ActionLatency',
 'WorkersMade']

In [10]:
clf = xgb.XGBClassifier()
clf.fit(X=train_x[selected_columns], y=train_y)
cv_y_predict = clf.predict(X=cv_x[selected_columns])

In [11]:
rmse_xgb = np.sqrt(((cv_y - cv_y_predict)**2).sum()/len(cv_y))
rmse_xgb

1.1918843212720427

In [12]:
accuracy_xgb = (cv_y == cv_y_predict).astype(int).sum() / len(cv_y)
accuracy_xgb

0.37058823529411766

## k-Nearest Neighbors

In [13]:
from sklearn.neighbors import KNeighborsClassifier

n_neighbors = 5

knn_clf = KNeighborsClassifier(n_neighbors=n_neighbors)
knn_clf.fit(X=train_x[selected_columns], y=train_y)

In [14]:
cv_y_predict = knn_clf.predict(X=cv_x[selected_columns])

In [15]:
accuracy_knn = (cv_y == cv_y_predict).astype(int).sum() / len(cv_y)
accuracy_knn

0.31176470588235294

In [16]:
rmse_knn = np.sqrt(((cv_y - cv_y_predict)**2).sum()/len(cv_y))
rmse_knn

1.443036015204219

## Neural Network

In [17]:
import torch
from torch import nn
import torch.optim as optim

In [18]:
class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(18, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 8)
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits


In [19]:
model = NeuralNetwork()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

In [20]:
x_tensor = torch.tensor(train_x.values)
y_tensor = torch.tensor(train_y.values)

In [21]:
num_iterations = 100

for i in range(num_iterations):
    optimizer.zero_grad()
    outputs = model(x_tensor.float())
    loss = criterion(outputs, y_tensor)
    loss.backward()
    optimizer.step()
    print(f'{loss.item()}')

31.649259567260742
47.256526947021484
470.9613342285156
878.7858276367188
66.7001724243164
2.7941336631774902
2.652486801147461
3.052873134613037
4.072297096252441
3.06057071685791
7.495682239532471
3.668062686920166
5.822879791259766
4.905445575714111
3.7339372634887695
2.566244125366211
1.9677746295928955
1.969374656677246
1.8673611879348755
1.749459981918335
1.7551307678222656
1.626258134841919
1.6231257915496826
1.5943822860717773
1.5583118200302124
1.5415680408477783
1.5203452110290527
1.4980368614196777
1.4904162883758545
1.4797449111938477
1.462666630744934
1.4516242742538452
1.4414016008377075
1.4273405075073242
1.4253885746002197
1.419634461402893
1.4164704084396362
1.3979496955871582
1.4002764225006104
1.3933809995651245
1.3953148126602173
1.3838762044906616
1.3817824125289917
1.3718081712722778
1.375815510749817
1.3615866899490356
1.360528588294983
1.3552179336547852
1.3511744737625122
1.3494795560836792
1.3427613973617554
1.3444873094558716
1.3396000862121582
1.337896943092

In [22]:
cv_x_tensor = torch.tensor(cv_x.values)
cv_y_tensor = torch.tensor(cv_y.values)
cv_y_predict = torch.argmax(model(cv_x_tensor.float()), dim=1)

In [23]:
accuracy_nn = (cv_y_tensor == cv_y_predict).sum() / len(cv_y_tensor)
accuracy_nn

tensor(0.3676)

In [24]:
rmse_nn = np.sqrt(((cv_y - cv_y_predict.numpy()) ** 2).sum() / len(cv_y))
rmse_nn

1.0131488480155786

## Random Forest

In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
parameters = {
    'criterion': ('gini', 'entropy', 'log_loss'),

}
rf_clf = RandomForestClassifier()
clf = GridSearchCV(rf_clf, parameters)
clf.fit(train_x, train_y)

In [26]:
cv_y_predict = clf.predict(cv_x)

In [27]:
accuracy_rf = (cv_y == cv_y_predict).astype(int).sum() / len(cv_y)
accuracy_rf

0.4147058823529412

In [28]:
rmse_rf = np.sqrt(((cv_y - cv_y_predict) ** 2).sum() / len(cv_y))
rmse_rf

1.0218207509435417

## Final Results

### Findings

Based on the accuracy results, we find that the random forest model performs the best. However, if more data is available, we would go with Neural Networks; since they eliminate the need for feature engineering and are more suitable for large training data set. This dataset is a small-to-medium sized dataset. We also find that if the `TotalHours`, `Age` or `HoursPerWeek` of the any player is `?` (Question Mark), then it is a strong indicator of that player belonging to the `Professional` League tier (Tier 7), which makes sense since many of these players prefer playing on "smurf accounts" and only log into their professional account when playing in tournaments. Most of the metrics (except for `ActionLatency`) should go up with skill.

In [29]:
# Test set accuracy for Random Forest
test_y_predict = clf.predict(test_x)
accuracy_rf_test = (test_y == test_y_predict).astype(int).sum() / len(test_y)
accuracy_rf_test

0.4180569185475957

###  Hypothetical: after seeing your work, your stakeholders come to you and say that they can collect more data, but want your guidance before starting. How would you advise them based on your EDA and model results?

While Random Forests seems to perform better in the EDA, as data would increase, Neural Networks will perform better and will also eliminate the need of feature engineering. We can also reduce the number of features since most of them are correlated.