In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve, KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import balanced_accuracy_score

# classification models
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb

# evaluation metrics
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

In [3]:
data = pd.read_csv('players_feats.csv')

In [4]:
data.head()

Unnamed: 0,p1_id,p1_total_kills,p1_headshots,p1_total_deaths,p1_kd_ratio,p1_damage_per_round,p1_grenade_damage_per_round,p1_maps_played,p1_rounds_played,p1_kills_per_round,...,p5_kill_death_difference,p5_total_opening_kills,p5_total_opening_deaths,p5_opening_kill_ratio,p5_opening_kill_rating,p5_team_win_percent_after_first_kill,p5_first_kill_in_won_rounds,team_id,map_name,map_id
0,4954,90,42.2,112,0.8,76.3,5.9,6,156,0.58,...,5,25,12,2.08,1.28,84.0,25.0,6665,Ancient,635
1,5794,45,60.0,57,0.79,82.3,10.9,3,68,0.66,...,96,54,34,1.59,1.17,70.4,16.7,7532,Ancient,635
2,4954,156,51.9,167,0.93,63.5,3.4,10,265,0.59,...,22,26,19,1.37,1.1,88.5,20.5,6665,Dust2,583
3,5794,449,53.5,427,1.05,86.7,13.1,23,618,0.73,...,104,62,49,1.27,1.1,79.0,17.4,7532,Dust2,583
4,7998,173,32.9,130,1.33,82.4,2.9,9,225,0.77,...,19,27,25,1.08,1.08,81.5,16.2,4608,Dust2,439


In [5]:
train = pd.read_csv('train.csv')

In [6]:
train.isnull().values.any()

False

In [7]:
result = pd.merge(train, data, on=['map_id'])
result.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1426 entries, 0 to 1425
Columns: 132 entries, map_id to map_name_y
dtypes: float64(80), int64(50), object(2)
memory usage: 1.4+ MB


In [8]:
team1_df = result[:][::2]
team2_df = result[:][1::2]

In [9]:
final_df = pd.merge(team1_df, team2_df, on=['map_id'])

In [10]:
final_df.info()
final_df.isnull().values.any().sum()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 713 entries, 0 to 712
Columns: 263 entries, map_id to map_name_y_y
dtypes: float64(160), int64(99), object(4)
memory usage: 1.4+ MB


1

In [11]:
final_df.dropna(inplace=True)

In [12]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 697 entries, 0 to 712
Columns: 263 entries, map_id to map_name_y_y
dtypes: float64(160), int64(99), object(4)
memory usage: 1.4+ MB


In [13]:
final_df = pd.get_dummies(final_df, columns = ["map_name_x_x"] , prefix_sep = "_",drop_first = True)
final_df.head()

Unnamed: 0,map_id,team1_id_x,team2_id_x,who_win_x,p1_id_x,p1_total_kills_x,p1_headshots_x,p1_total_deaths_x,p1_kd_ratio_x,p1_damage_per_round_x,...,p5_team_win_percent_after_first_kill_y,p5_first_kill_in_won_rounds_y,team_id_y,map_name_y_y,map_name_x_x_Dust2,map_name_x_x_Inferno,map_name_x_x_Mirage,map_name_x_x_Nuke,map_name_x_x_Overpass,map_name_x_x_Vertigo
0,289,6665,7718,0,4954,258,36.0,293,0.88,71.1,...,76.2,19.0,7718,Ancient,0,0,0,0,0,0
1,715,4411,10577,0,8611,178,39.3,208,0.86,64.1,...,75.0,23.3,10577,Inferno,0,1,0,0,0,0
2,157,11251,9455,1,7938,494,52.8,397,1.24,94.0,...,53.1,15.5,9455,Nuke,0,0,0,1,0,0
3,524,4608,7532,0,7998,474,29.1,304,1.56,86.9,...,73.3,7.9,7532,Mirage,0,0,1,0,0,0
4,404,8637,6667,1,2898,217,55.8,248,0.88,64.8,...,80.0,12.4,6667,Overpass,0,0,0,0,1,0


In [18]:
y = pd.DataFrame(final_df['who_win_x'])
y.shape

(697, 1)

In [19]:
final_df.drop(['who_win_x', 'who_win_y','map_name_x_y', 'map_name_y_x', 'map_name_y_y'], inplace=True, axis=1)

In [20]:
final_df

Unnamed: 0,map_id,team1_id_x,team2_id_x,p1_id_x,p1_total_kills_x,p1_headshots_x,p1_total_deaths_x,p1_kd_ratio_x,p1_damage_per_round_x,p1_grenade_damage_per_round_x,...,p5_opening_kill_rating_y,p5_team_win_percent_after_first_kill_y,p5_first_kill_in_won_rounds_y,team_id_y,map_name_x_x_Dust2,map_name_x_x_Inferno,map_name_x_x_Mirage,map_name_x_x_Nuke,map_name_x_x_Overpass,map_name_x_x_Vertigo
0,289,6665,7718,4954,258,36.0,293,0.88,71.1,6.3,...,1.06,76.2,19.0,7718,0,0,0,0,0,0
1,715,4411,10577,8611,178,39.3,208,0.86,64.1,6.5,...,1.39,75.0,23.3,10577,0,1,0,0,0,0
2,157,11251,9455,7938,494,52.8,397,1.24,94.0,5.8,...,1.09,53.1,15.5,9455,0,0,0,1,0,0
3,524,4608,7532,7998,474,29.1,304,1.56,86.9,2.4,...,0.82,73.3,7.9,7532,0,0,1,0,0,0
4,404,8637,6667,2898,217,55.8,248,0.88,64.8,4.6,...,1.01,80.0,12.4,6667,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
708,709,6667,4773,429,320,47.8,396,0.81,71.2,11.4,...,1.12,57.1,13.8,4773,0,1,0,0,0,0
709,528,9215,5995,11154,203,50.2,131,1.55,82.1,3.4,...,0.00,0.0,0.0,5995,0,0,0,0,0,0
710,163,4869,9565,922,447,44.3,538,0.83,65.2,5.7,...,0.87,69.0,10.2,9565,0,0,1,0,0,0
711,96,10426,4991,973,499,43.1,472,1.06,73.8,2.7,...,0.90,86.1,12.7,4991,0,0,0,1,0,0


In [21]:
X = pd.DataFrame(final_df)
X.info()
X.isnull().values.any()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 697 entries, 0 to 712
Columns: 263 entries, map_id to map_name_x_x_Vertigo
dtypes: float64(160), int64(97), uint8(6)
memory usage: 1.4 MB


False

In [22]:
X.shape

(697, 263)

In [23]:
# X_new = SelectKBest(chi2, k=50).fit_transform(X, y)

In [24]:
def optimize_hyperparams(model, parameters, x_train, y_train):
    nfolds = 10
    cross_val = StratifiedKFold(nfolds)
    grid = GridSearchCV(model, parameters, cv=cross_val, refit=True, verbose=1, n_jobs=4)
    grid.fit(x_train, y_train)
    print(f'Accuracy : {grid.best_score_} with params {grid.best_estimator_}')
    return grid.best_estimator_

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1337)

In [26]:
rf_clf = RandomForestClassifier(criterion='gini')
print(X_train.shape, y_train.shape)
rf_params = {
    'n_estimators': [5, 10, 15, 20, 25],
    'max_depth': [5, 10, 15, 20, 25],
    'max_features': [16, 32, 64, 128, 256],
    'max_leaf_nodes': [5, 10],
    'bootstrap': [True]
}
best_rf = optimize_hyperparams(rf_clf, rf_params, X_train, y_train.values.ravel())

(557, 263) (557, 1)
Fitting 10 folds for each of 250 candidates, totalling 2500 fits
Accuracy : 0.5999025974025974 with params RandomForestClassifier(max_depth=5, max_features=256, max_leaf_nodes=5,
                       n_estimators=5)


In [16]:
import optuna

In [35]:
def objective(trial):
    x = X_train
    y = y_train.values.ravel()
    
    max_depth = trial.suggest_int('rf_max_depth', 2, 64, log=True)
    max_samples = trial.suggest_float('rf_max_sample', 0.2, 1)
    max_features = trial.suggest_int('rf_max_feauters', 16, 256)
    max_leaves = trial.suggest_int('xgb_max_leaves', 5, 20)
    n_estimators = trial.suggest_int('rf_n_estimators', 10, 50)
    learning_rate = trial.suggest_float('xgb_learning_rate', 0.01, 0.1)
    
    xgb_model = xgb.XGBClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        max_leaves=max_leaves
    )
    
    
#     rf_model = RandomForestClassifier(
#         max_depth= max_depth,
#         max_samples=max_samples,
#         max_features=max_features,
#         max_leaf_nodes=max_leaf_nodes,
#         n_estimators=n_estimators,
#         random_state=1337
#     )
    score = cross_val_score(xgb_model, x, y, cv=10).mean()
    return score

In [36]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials = 100)

trial = study.best_trial
print(f'best score = {trial.value}')
print(f'Best params: ')
for key, value in trial.params.items():
    print(f'{key} {value}')

[32m[I 2022-12-02 17:17:38,624][0m A new study created in memory with name: no-name-7885bd2c-6966-429a-ab0d-32073b5e7f7a[0m
[32m[I 2022-12-02 17:17:40,139][0m Trial 0 finished with value: 0.512012987012987 and parameters: {'rf_max_depth': 12, 'rf_max_sample': 0.9608208645897685, 'rf_max_feauters': 190, 'xgb_max_leaves': 7, 'rf_n_estimators': 15, 'xgb_learning_rate': 0.06721726210289394}. Best is trial 0 with value: 0.512012987012987.[0m
[32m[I 2022-12-02 17:17:42,291][0m Trial 1 finished with value: 0.5298051948051947 and parameters: {'rf_max_depth': 7, 'rf_max_sample': 0.590660755941969, 'rf_max_feauters': 57, 'xgb_max_leaves': 20, 'rf_n_estimators': 35, 'xgb_learning_rate': 0.07991685098908391}. Best is trial 1 with value: 0.5298051948051947.[0m
[32m[I 2022-12-02 17:17:43,248][0m Trial 2 finished with value: 0.5531818181818182 and parameters: {'rf_max_depth': 2, 'rf_max_sample': 0.9790784647980277, 'rf_max_feauters': 130, 'xgb_max_leaves': 12, 'rf_n_estimators': 36, 'xgb_l

[32m[I 2022-12-02 17:18:18,765][0m Trial 26 finished with value: 0.49912337662337664 and parameters: {'rf_max_depth': 5, 'rf_max_sample': 0.23098212324126993, 'rf_max_feauters': 85, 'xgb_max_leaves': 9, 'rf_n_estimators': 19, 'xgb_learning_rate': 0.096076787549579}. Best is trial 2 with value: 0.5531818181818182.[0m
[32m[I 2022-12-02 17:18:20,968][0m Trial 27 finished with value: 0.5477272727272727 and parameters: {'rf_max_depth': 9, 'rf_max_sample': 0.4413070278570474, 'rf_max_feauters': 126, 'xgb_max_leaves': 6, 'rf_n_estimators': 29, 'xgb_learning_rate': 0.0641762339276886}. Best is trial 2 with value: 0.5531818181818182.[0m
[32m[I 2022-12-02 17:18:23,426][0m Trial 28 finished with value: 0.5082142857142857 and parameters: {'rf_max_depth': 9, 'rf_max_sample': 0.5677001064531711, 'rf_max_feauters': 124, 'xgb_max_leaves': 13, 'rf_n_estimators': 34, 'xgb_learning_rate': 0.05031621399936785}. Best is trial 2 with value: 0.5531818181818182.[0m
[32m[I 2022-12-02 17:18:26,641][0

[32m[I 2022-12-02 17:19:09,232][0m Trial 52 finished with value: 0.5496103896103897 and parameters: {'rf_max_depth': 2, 'rf_max_sample': 0.7140980654231001, 'rf_max_feauters': 70, 'xgb_max_leaves': 18, 'rf_n_estimators': 31, 'xgb_learning_rate': 0.09990086623981992}. Best is trial 2 with value: 0.5531818181818182.[0m
[32m[I 2022-12-02 17:19:10,084][0m Trial 53 finished with value: 0.5477597402597403 and parameters: {'rf_max_depth': 2, 'rf_max_sample': 0.7037530291404694, 'rf_max_feauters': 70, 'xgb_max_leaves': 19, 'rf_n_estimators': 30, 'xgb_learning_rate': 0.09934223475393233}. Best is trial 2 with value: 0.5531818181818182.[0m
[32m[I 2022-12-02 17:19:10,974][0m Trial 54 finished with value: 0.5388636363636363 and parameters: {'rf_max_depth': 2, 'rf_max_sample': 0.8239231057529857, 'rf_max_feauters': 68, 'xgb_max_leaves': 19, 'rf_n_estimators': 34, 'xgb_learning_rate': 0.09884891198275389}. Best is trial 2 with value: 0.5531818181818182.[0m
[32m[I 2022-12-02 17:19:11,970][

[32m[I 2022-12-02 17:19:44,659][0m Trial 78 finished with value: 0.5424675324675325 and parameters: {'rf_max_depth': 2, 'rf_max_sample': 0.8230699389538674, 'rf_max_feauters': 76, 'xgb_max_leaves': 19, 'rf_n_estimators': 43, 'xgb_learning_rate': 0.0927727059716212}. Best is trial 60 with value: 0.5603246753246753.[0m
[32m[I 2022-12-02 17:19:47,264][0m Trial 79 finished with value: 0.5153896103896104 and parameters: {'rf_max_depth': 8, 'rf_max_sample': 0.3759069709880516, 'rf_max_feauters': 139, 'xgb_max_leaves': 17, 'rf_n_estimators': 39, 'xgb_learning_rate': 0.08946611718043658}. Best is trial 60 with value: 0.5603246753246753.[0m
[32m[I 2022-12-02 17:19:49,117][0m Trial 80 finished with value: 0.5133441558441558 and parameters: {'rf_max_depth': 4, 'rf_max_sample': 0.8556805162209682, 'rf_max_feauters': 45, 'xgb_max_leaves': 18, 'rf_n_estimators': 33, 'xgb_learning_rate': 0.08031088462419131}. Best is trial 60 with value: 0.5603246753246753.[0m
[32m[I 2022-12-02 17:19:50,040

best score = 0.5674675324675325
Best params: 
rf_max_depth 2
rf_max_sample 0.5879392068867081
rf_max_feauters 90
xgb_max_leaves 19
rf_n_estimators 36
xgb_learning_rate 0.09462556173008914


In [None]:
xgb_clf = xgb.XGBClassifier()
xgb_params = {
    'n_estimators': [50, 100, 150, 200],
    'max_leaves': [5, 10, 15],
    'max_depth': [5, 25, 75],
    'booster': ['gbtree', 'gblinear'],
    'learning_rate': [0.1, 0.2, 0.3]
}
best_xgb = optimize_hyperparams(xgb_clf, xgb_params, X_train, y_train)
train_and_test_model(best_xgb, X_train, y_train, X_test, y_test, kf, ntrain, ntest, nclass, NFOLDS, labels)

Fitting 10 folds for each of 216 candidates, totalling 2160 fits
