# Imports

In [3]:
import time
from pathlib import Path
from datetime import datetime
import pandas as pd
import numpy as np
import os, sys
import seaborn as sns
import json
import sqlite3

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, explained_variance_score, max_error, mean_absolute_error, mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier, CatBoostRegressor

In [4]:
# Seed passed as `random_state` wherever the notebook shuffles, splits or fits.
RANDOM_STATE = 40

# Folder (relative to the notebook) the CSV files are read from.
ROOT_DATA_FOLDER = "../data"

# strptime/strftime pattern for ISO-8601 style UTC timestamps ("...Z").
API_DATE_FORMAT = "%Y-%m-%dT%H:%M:%SZ"

# Functions

In [5]:
def pad(l: list, n: int, value=0):
    """
    Return a new list made of `l` followed by enough copies of `value` to reach length `n`.
    :param l: list to pad (left untouched, a new list is returned)
    :param n: target length; when `l` already has `n` or more items nothing is appended
    :param value: filler appended to the right
    :return: the padded list
    """
    missing = n - len(l)
    filler = [value] * missing  # a non-positive count yields an empty list
    return l + filler

In [6]:
def get_stat_columns(stat_name, n_players: int = 12):
    """
    Get the columns of a stat from all players.
    :param stat_name: name of the stat, e.g. "kd"; used as the suffix of every column name
    :param n_players: number of players to generate a column for (players are numbered from 1)
    :return: list of column names "player_<i>_<stat_name>" for i in 1..n_players
    """
    return [f"player_{i}_{stat_name}" for i in range(1, n_players + 1)]

In [7]:
def get_cols_name(player_range_start: int, player_range_end: int):
    """
    Get columns of all stats from the range of players
    :param player_range_start: first player number of the range (inclusive)
    :param player_range_end: end of the range (exclusive)
    :return: flat numpy array of "player_<i>_<stat>" names, grouped player by player,
             with the stats of each player in STATS_NAME order
    """
    n_players = player_range_end - player_range_start

    names_per_player = []
    for player in range(player_range_start, player_range_end):
        names_per_player.append([f"player_{player}_{stat}" for stat in STATS_NAME])

    # Flatten the (n_players, N_STATS) grid row by row into a 1-D array.
    return np.array(names_per_player).reshape(n_players * N_STATS)

In [8]:
def get_player_columns(player_number):
    """
    Get all columns of a player.
    :param player_number: number of the player, inserted as-is in the column names
    :return: list of "player_<player_number>_<stat>" names, one per entry of STATS_NAME
    """
    prefix = f"player_{player_number}_"
    columns = []
    for stat in STATS_NAME:
        columns.append(f"{prefix}{stat}")
    return columns

In [9]:
def shuffle_winning_team(df):
    """
    Shuffle teams so that about half of the activities have `winner == 1`.

    The rows are shuffled, then for the first rows (labels 0..len(df) // 2 of the
    shuffled frame) the stats of team A (players 1-6) and team B (players 7-12)
    are exchanged and `winner` is set to 1. The rows are finally shuffled again so
    the swapped rows are spread over the whole frame.

    NOTE(review): `winner` is overwritten with 1 on the swapped rows, which is only
    a correct label flip if every input row has `winner == 0` - confirm upstream.

    :param df: frame holding every "player_<i>_<stat>" column and a "winner" column;
               it is not modified
    :return: a new, shuffled frame with a fresh 0..n-1 index
    """
    # First shuffle dataframe to ensure dates are shuffled as well
    df = df.sample(frac=1.0, random_state=RANDOM_STATE).reset_index(drop=True)
    if df.empty:
        # Nothing to swap; the label-based slice below would otherwise fail on 0 rows.
        return df

    mid = len(df) // 2
    player_cols_team_A = list(get_cols_name(1, 7))
    player_cols_team_B = list(get_cols_name(7, 13))

    # `.loc[:mid]` is label-based and inclusive: after reset_index it selects rows 0..mid.
    # Snapshot both teams as plain arrays first, so the two assignments below are a
    # positional swap and no column-label alignment is involved.
    team_A_stats = df.loc[:mid, player_cols_team_A].to_numpy(copy=True)
    team_B_stats = df.loc[:mid, player_cols_team_B].to_numpy(copy=True)
    df.loc[:mid, player_cols_team_A] = team_B_stats
    df.loc[:mid, player_cols_team_B] = team_A_stats
    df.loc[:mid, "winner"] = 1

    df = df.sample(frac=1.0, random_state=RANDOM_STATE + 1).reset_index(drop=True)
    return df

# Load data

In [10]:
%%time
# Read the training set, then parse the "period" timestamps into datetimes.
train_csv_path = os.path.join(ROOT_DATA_FOLDER, "train.csv")
train = pd.read_csv(train_csv_path)
train["period"] = pd.to_datetime(train["period"], format="%Y-%m-%d %H:%M:%S")
train

CPU times: total: 23.9 s
Wall time: 26.7 s


Unnamed: 0,player_1_activities_entered,player_1_combat_rating,player_1_kills_pga,player_1_assists_pga,player_1_deaths_pga,player_1_score_pga,player_1_win_ratio,player_1_kd,player_1_kda,player_2_activities_entered,...,player_12_score_pga,player_12_win_ratio,player_12_kd,player_12_kda,winner,mode,period,instance_id,win_score,loss_score
0,261.0,168.135969,8.103448,3.252874,8.176245,5.896552,0.444444,0.991097,1.388941,2417.0,...,0.000000,0.000000,0.000000,0.000000,0.0,71,2017-09-05 23:32:24,1885169,58,35
1,390.0,161.895219,11.684615,3.951282,8.828205,15.935897,0.551282,1.323555,1.771130,2417.0,...,0.000000,0.000000,0.000000,0.000000,0.0,71,2017-09-05 23:42:27,2024057,39,36
2,390.0,161.895219,11.684615,3.951282,8.828205,15.935897,0.551282,1.323555,1.771130,2417.0,...,0.000000,0.000000,0.000000,0.000000,0.0,73,2017-09-05 23:53:01,2159868,85,54
3,390.0,161.895219,11.684615,3.951282,8.828205,15.935897,0.551282,1.323555,1.771130,2417.0,...,0.000000,0.000000,0.000000,0.000000,0.0,73,2017-09-06 00:04:05,2269987,77,25
4,17.0,130.364239,10.176471,5.176471,10.764706,15.411765,0.352941,0.945355,1.426230,49.0,...,0.000000,0.000000,0.000000,0.000000,0.0,73,2017-09-06 05:42:33,7875392,95,51
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1730453,-1.0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.0,...,-1.000000,-1.000000,-1.000000,-1.000000,0.0,73,2022-12-28 18:58:17,12153675395,149,98
1730454,15174.0,170.628974,16.674839,4.118492,8.530513,24.418545,0.545341,1.954729,2.437524,2024.0,...,21.497872,0.451064,1.239148,1.604013,0.0,73,2022-12-28 19:16:05,12153780908,142,116
1730455,3852.0,153.905220,9.868640,1.928868,6.599429,7.002077,0.594756,1.495378,1.787656,15174.0,...,27.348982,0.592195,1.516892,1.976510,0.0,73,2022-12-28 19:32:36,12153874512,151,149
1730456,2196.0,163.112964,16.972678,4.117486,9.576503,26.020036,0.528233,1.772325,2.202282,1141.0,...,22.379874,0.483019,1.239597,1.707276,0.0,73,2022-12-28 19:45:04,12153943044,151,138


# Preprocessing

In [14]:
# Per-player stats; every stat has one column per player, named "player_<i>_<stat>".
STATS_NAME = [
    "activities_entered",
    "combat_rating",
    "kills_pga",
    "assists_pga",
    "deaths_pga",
    "score_pga",
    "win_ratio",
    "kd",
    "kda",
]
N_STATS = len(STATS_NAME)

# Flat array of the stat columns of players 1..12, grouped player by player.
PLAYERS_COLUMNS = np.array(
    [f"player_{i}_{stat}" for i in range(1, 13) for stat in STATS_NAME]
)

# Columns that describe the activity itself rather than a player.
OTHER_COLUMNS = ["instance_id", "period", "mode", "winner", "win_score", "loss_score"]

In [None]:
# Select rows without stats missing: keep a row only if every player stat is > -0.5
# (missing stats appear to be stored as -1 in the data - TODO confirm).
# A single vectorized mask replaces the previous eval() of a generated expression.
has_all_stats = (train[list(PLAYERS_COLUMNS)] > -0.5).all(axis=1)
train = train[has_all_stats].reset_index(drop=True)
train

In [20]:
# Score margin of each activity: absolute, then relative to the winning score.
train["delta_score"] = train["win_score"].sub(train["loss_score"])
train["delta_score_relative"] = train["delta_score"].div(train["win_score"])
train

Unnamed: 0,player_1_activities_entered,player_1_combat_rating,player_1_kills_pga,player_1_assists_pga,player_1_deaths_pga,player_1_score_pga,player_1_win_ratio,player_1_kd,player_1_kda,player_2_activities_entered,...,player_12_kd,player_12_kda,winner,mode,period,instance_id,win_score,loss_score,delta_score,delta_score_relative
0,261.0,168.135969,8.103448,3.252874,8.176245,5.896552,0.444444,0.991097,1.388941,2417.0,...,0.000000,0.000000,0.0,71,2017-09-05 23:32:24,1885169,58,35,23,0.396552
1,390.0,161.895219,11.684615,3.951282,8.828205,15.935897,0.551282,1.323555,1.771130,2417.0,...,0.000000,0.000000,0.0,71,2017-09-05 23:42:27,2024057,39,36,3,0.076923
2,390.0,161.895219,11.684615,3.951282,8.828205,15.935897,0.551282,1.323555,1.771130,2417.0,...,0.000000,0.000000,0.0,73,2017-09-05 23:53:01,2159868,85,54,31,0.364706
3,390.0,161.895219,11.684615,3.951282,8.828205,15.935897,0.551282,1.323555,1.771130,2417.0,...,0.000000,0.000000,0.0,73,2017-09-06 00:04:05,2269987,77,25,52,0.675325
4,17.0,130.364239,10.176471,5.176471,10.764706,15.411765,0.352941,0.945355,1.426230,49.0,...,0.000000,0.000000,0.0,73,2017-09-06 05:42:33,7875392,95,51,44,0.463158
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1730453,-1.0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.0,...,-1.000000,-1.000000,0.0,73,2022-12-28 18:58:17,12153675395,149,98,51,0.342282
1730454,15174.0,170.628974,16.674839,4.118492,8.530513,24.418545,0.545341,1.954729,2.437524,2024.0,...,1.239148,1.604013,0.0,73,2022-12-28 19:16:05,12153780908,142,116,26,0.183099
1730455,3852.0,153.905220,9.868640,1.928868,6.599429,7.002077,0.594756,1.495378,1.787656,15174.0,...,1.516892,1.976510,0.0,73,2022-12-28 19:32:36,12153874512,151,149,2,0.013245
1730456,2196.0,163.112964,16.972678,4.117486,9.576503,26.020036,0.528233,1.772325,2.202282,1141.0,...,1.239597,1.707276,0.0,73,2022-12-28 19:45:04,12153943044,151,138,13,0.086093


# Data selection

In [86]:
sel = train

# Optional filter on the relative score margin (disabled):
# sel = sel[sel["delta_score_relative"] > 0.3]

# Swap the teams on part of the rows so both `winner` values are present.
sel = shuffle_winning_team(sel).reset_index(drop=True)
sel

Unnamed: 0,player_1_activities_entered,player_1_combat_rating,player_1_kills_pga,player_1_assists_pga,player_1_deaths_pga,player_1_score_pga,player_1_win_ratio,player_1_kd,player_1_kda,player_2_activities_entered,...,player_12_kd,player_12_kda,winner,mode,period,instance_id,win_score,loss_score,delta_score,delta_score_relative
0,684.0,129.719438,13.621345,3.967836,9.986842,16.421053,0.504386,1.363929,1.761236,1079.0,...,0.147986,0.292091,1.0,73,2022-03-08 13:15:46,10357118750,101,45,56,0.554455
1,607.0,128.560847,10.056013,3.462932,9.672158,24.514003,0.507414,1.039687,1.397718,652.0,...,0.900839,1.168574,1.0,81,2022-05-04 15:12:19,10697285586,135,51,84,0.622222
2,598.0,141.466934,14.588629,2.867893,8.911371,30.474916,0.530100,1.637080,1.958904,1792.0,...,0.622037,0.884658,0.0,81,2022-02-26 22:53:43,10241670294,204,103,101,0.495098
3,5480.0,69.823546,6.185949,2.811314,7.910036,6.271168,0.448540,0.782038,1.137449,4046.0,...,0.000000,0.000000,1.0,80,2021-11-18 02:40:05,9619042430,5,4,1,0.200000
4,97.0,128.534587,9.463918,4.319588,7.587629,17.216495,0.597938,1.247283,1.816576,420.0,...,0.559621,0.787803,1.0,73,2022-03-02 22:49:36,10297754577,153,73,80,0.522876
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1730453,8576.0,145.038565,12.196945,3.788013,7.690648,21.850280,0.539412,1.585945,2.078493,5550.0,...,0.956754,1.298986,1.0,43,2022-02-19 22:20:39,10131025529,99,38,61,0.616162
1730454,71.0,136.904327,9.084507,4.873239,12.098592,24.760563,0.450704,0.750873,1.153667,1745.0,...,0.943489,1.228501,1.0,73,2022-10-07 19:56:16,11754116152,151,103,48,0.317881
1730455,1454.0,117.344981,10.724209,3.468363,9.207703,17.129986,0.502751,1.164700,1.541380,653.0,...,0.449231,0.665385,1.0,73,2022-06-04 22:16:47,10891536631,150,116,34,0.226667
1730456,1778.0,116.194666,8.974691,3.221035,12.407762,19.362767,0.443757,0.723313,0.982911,353.0,...,0.309339,0.536843,1.0,81,2022-08-02 21:04:24,11243188304,201,93,108,0.537313


# Catboost

In [93]:
# Keep every column in X: the feature columns are selected at fit/predict time,
# and the remaining columns are used later when analysing the predictions.
Y = sel["winner"]
X = sel  # [list(PLAYERS_COLUMNS) + ["mode"]]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=RANDOM_STATE,
)

In [99]:
%%time
# Features given to the model: every player stat plus the (categorical) game mode.
feature_cols = list(PLAYERS_COLUMNS) + ["mode"]

clf = CatBoostClassifier(verbose=False, task_type='GPU', random_state=RANDOM_STATE)
clf.fit(
    X_train[feature_cols],
    y_train,
    cat_features=["mode"])

y_pred = clf.predict(X_test[feature_cols])
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1-score:", f1_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

print("\nAccuracy per mode:")
for mode in X_test["mode"].unique():
    # X_test, y_test and y_pred share the same row order, so one positional mask
    # selects the rows of this mode in all of them; the predictions already
    # computed above are reused instead of running the model again per mode.
    in_mode = (X_test["mode"] == mode).to_numpy()
    print("Accuracy mode ", mode, ": ", accuracy_score(y_test[in_mode], y_pred[in_mode]))

Accuracy: 0.7217714750220557
F1-score: 0.7206203493623778
[[188419  71064]
 [ 73375 186280]]

Accuracy per mode:
Accuracy mode  84 :  0.7836557884813934
Accuracy mode  73 :  0.7071831959891661
Accuracy mode  44 :  0.7103025233514568
Accuracy mode  43 :  0.6976603955619874
Accuracy mode  37 :  0.6792766675379607
Accuracy mode  80 :  0.6854468709855019
Accuracy mode  62 :  0.6409701169337375
Accuracy mode  59 :  0.787379421221865
Accuracy mode  90 :  0.7198378883923933
Accuracy mode  89 :  0.6184062850729517
Accuracy mode  25 :  0.6404069079725574
Accuracy mode  81 :  0.6659210879484656
Accuracy mode  71 :  0.7273837059881778
Accuracy mode  88 :  0.6844262295081968
CPU times: total: 1min 4s
Wall time: 52.2 s


# Analysis

In [95]:
# Attach the predictions to a new frame: `df = X_test` would only alias X_test,
# so adding the "pred" column through it would also modify X_test itself.
df = X_test.assign(pred=y_pred)
df

Unnamed: 0,player_1_activities_entered,player_1_combat_rating,player_1_kills_pga,player_1_assists_pga,player_1_deaths_pga,player_1_score_pga,player_1_win_ratio,player_1_kd,player_1_kda,player_2_activities_entered,...,player_12_kda,winner,mode,period,instance_id,win_score,loss_score,delta_score,delta_score_relative,pred
1522636,1448.0,132.604234,11.895718,3.270718,9.470994,15.709945,0.482044,1.256016,1.601356,3598.0,...,0.000000,1.0,84,2021-03-12 22:45:12,8144251976,5,4,1,0.200000,0.0
336961,648.0,93.274126,9.171296,2.353395,10.689815,9.302469,0.404321,0.857947,1.078100,2507.0,...,1.037335,1.0,73,2021-11-14 06:53:05,9604621326,152,66,86,0.565789,1.0
1337930,10394.0,170.622477,14.731287,4.802386,7.213585,22.686069,0.596113,2.042159,2.707901,2858.0,...,1.122668,0.0,44,2022-09-12 15:23:44,11569694464,150,129,21,0.140000,0.0
968499,252.0,95.265262,8.492063,4.273810,9.892857,13.257937,0.420635,0.858404,1.290413,1412.0,...,2.142606,1.0,73,2022-06-13 06:40:05,10956530935,150,60,90,0.600000,1.0
426328,496.0,125.532057,11.401210,3.389113,10.270161,17.679435,0.520161,1.110130,1.440126,534.0,...,1.191873,0.0,73,2019-12-21 11:29:52,5406735344,145,119,26,0.179310,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345721,301.0,87.727411,8.544850,2.950166,10.099668,14.744186,0.478405,0.846053,1.138158,337.0,...,1.198707,1.0,81,2022-05-04 16:04:14,10697495642,201,128,73,0.363184,1.0
1609981,249.0,77.233649,6.622490,2.401606,9.787149,13.281124,0.405622,0.676652,0.922035,726.0,...,1.750868,0.0,73,2021-06-04 18:18:08,8605440745,93,29,64,0.688172,0.0
371389,124.0,115.970415,9.274194,3.927419,10.580645,13.983871,0.459677,0.876524,1.247713,-1.0,...,0.000000,0.0,37,2021-06-12 13:59:19,8653899611,4,1,3,0.750000,0.0
512195,897.0,145.990276,12.552954,4.774805,10.370123,24.351171,0.465998,1.210492,1.670931,389.0,...,-1.000000,0.0,43,2022-01-11 23:46:06,9931842138,151,83,68,0.450331,1.0


In [97]:
# Rows where the prediction differs from the label, and the distribution of their
# relative score margin.
is_wrong = df["pred"] != df["winner"]
wrong = df[is_wrong]
wrong["delta_score_relative"].describe()

count    144244.000000
mean          0.380394
std           0.271069
min           0.000000
25%           0.170370
50%           0.320000
75%           0.589404
max           1.000000
Name: delta_score_relative, dtype: float64

# Code saves

In [None]:
# print("\nAccuracy per mode:")
# for mode in X_test["mode"].unique():
#     x = X_test[X_test["mode"] == mode]
#     yp = clf.predict(x)
#     yt = y_test[x.index]
#     print("Accuracy mode ", mode, ": ", accuracy_score(yt, yp))

In [82]:
# dimensionality reduction test
# NOTE: this cell replaces X_train / X_test (DataFrames) with numpy arrays, so it
# can only be run once after the train/test split.
from sklearn.kernel_approximation import RBFSampler
from numpy.random import RandomState

team_A_cols = get_cols_name(1, 7)
team_B_cols = get_cols_name(7, 13)

prng = RandomState(RANDOM_STATE)
rbf_sampler = RBFSampler(gamma=1, random_state=RANDOM_STATE)

# Fit a single sampler on the stats of both teams stacked as rows (shuffled in place),
# so that team A and team B go through the same feature map.
rbf_train = np.concatenate(
    [X_train[team_A_cols].values, X_train[team_B_cols].values],
    axis=0,
)
prng.shuffle(rbf_train)
rbf_sampler.fit(rbf_train)

# Transform each team separately, then put the two blocks side by side.
X_train_team_A, X_train_team_B = (
    rbf_sampler.transform(X_train[cols]) for cols in (team_A_cols, team_B_cols)
)
X_train = np.concatenate([X_train_team_A, X_train_team_B], axis=1)

X_test_team_A, X_test_team_B = (
    rbf_sampler.transform(X_test[cols]) for cols in (team_A_cols, team_B_cols)
)
X_test = np.concatenate([X_test_team_A, X_test_team_B], axis=1)



In [75]:
# dimensionality reduction test
# NOTE: this cell replaces X_train / X_test (DataFrames) with numpy arrays, so it
# can only be run once after the train/test split.
from sklearn.decomposition import PCA
from numpy.random import RandomState

team_A_cols = get_cols_name(1, 7)
team_B_cols = get_cols_name(7, 13)

prng = RandomState(RANDOM_STATE)
pca = PCA(n_components=0.9, svd_solver='full')

# Fit a single PCA on the stats of both teams stacked as rows (shuffled in place),
# so that team A and team B are projected with the same components.
pca_train = np.concatenate(
    [X_train[team_A_cols].values, X_train[team_B_cols].values],
    axis=0,
)
prng.shuffle(pca_train)
pca.fit(pca_train)

# Project each team separately, then put the two blocks side by side.
X_train_team_A, X_train_team_B = (
    pca.transform(X_train[cols]) for cols in (team_A_cols, team_B_cols)
)
X_train = np.concatenate([X_train_team_A, X_train_team_B], axis=1)

X_test_team_A, X_test_team_B = (
    pca.transform(X_test[cols]) for cols in (team_A_cols, team_B_cols)
)
X_test = np.concatenate([X_test_team_A, X_test_team_B], axis=1)

