In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve, KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import balanced_accuracy_score

# classification models
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb

# evaluation metrics
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

In [3]:
data = pd.read_csv('players_feats.csv')

In [4]:
data.head()

Unnamed: 0,p1_id,p1_total_kills,p1_headshots,p1_total_deaths,p1_kd_ratio,p1_damage_per_round,p1_grenade_damage_per_round,p1_maps_played,p1_rounds_played,p1_kills_per_round,...,p5_kill_death_difference,p5_total_opening_kills,p5_total_opening_deaths,p5_opening_kill_ratio,p5_opening_kill_rating,p5_team_win_percent_after_first_kill,p5_first_kill_in_won_rounds,team_id,map_name,map_id
0,4954,90,42.2,112,0.8,76.3,5.9,6,156,0.58,...,5,25,12,2.08,1.28,84.0,25.0,6665,Ancient,635
1,5794,45,60.0,57,0.79,82.3,10.9,3,68,0.66,...,96,54,34,1.59,1.17,70.4,16.7,7532,Ancient,635
2,4954,156,51.9,167,0.93,63.5,3.4,10,265,0.59,...,22,26,19,1.37,1.1,88.5,20.5,6665,Dust2,583
3,5794,449,53.5,427,1.05,86.7,13.1,23,618,0.73,...,104,62,49,1.27,1.1,79.0,17.4,7532,Dust2,583
4,7998,173,32.9,130,1.33,82.4,2.9,9,225,0.77,...,19,27,25,1.08,1.08,81.5,16.2,4608,Dust2,439


In [5]:
train = pd.read_csv('train.csv')

In [6]:
train.isnull().values.any()

False

In [7]:
result = pd.merge(train, data, on=['map_id'])
result.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1426 entries, 0 to 1425
Columns: 132 entries, map_id to map_name_y
dtypes: float64(80), int64(50), object(2)
memory usage: 1.4+ MB


In [8]:
team1_df = result[:][::2]
team2_df = result[:][1::2]

In [9]:
final_df = pd.merge(team1_df, team2_df, on=['map_id'])

In [10]:
final_df.info()
final_df.isnull().values.any().sum()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 713 entries, 0 to 712
Columns: 263 entries, map_id to map_name_y_y
dtypes: float64(160), int64(99), object(4)
memory usage: 1.4+ MB


1

In [11]:
final_df.dropna(inplace=True)

In [12]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 697 entries, 0 to 712
Columns: 263 entries, map_id to map_name_y_y
dtypes: float64(160), int64(99), object(4)
memory usage: 1.4+ MB


In [13]:
final_df = pd.get_dummies(final_df, columns = ["map_name_x_x"] , prefix_sep = "_",drop_first = True)
final_df.head()

Unnamed: 0,map_id,team1_id_x,team2_id_x,who_win_x,p1_id_x,p1_total_kills_x,p1_headshots_x,p1_total_deaths_x,p1_kd_ratio_x,p1_damage_per_round_x,...,p5_team_win_percent_after_first_kill_y,p5_first_kill_in_won_rounds_y,team_id_y,map_name_y_y,map_name_x_x_Dust2,map_name_x_x_Inferno,map_name_x_x_Mirage,map_name_x_x_Nuke,map_name_x_x_Overpass,map_name_x_x_Vertigo
0,289,6665,7718,0,4954,258,36.0,293,0.88,71.1,...,76.2,19.0,7718,Ancient,0,0,0,0,0,0
1,715,4411,10577,0,8611,178,39.3,208,0.86,64.1,...,75.0,23.3,10577,Inferno,0,1,0,0,0,0
2,157,11251,9455,1,7938,494,52.8,397,1.24,94.0,...,53.1,15.5,9455,Nuke,0,0,0,1,0,0
3,524,4608,7532,0,7998,474,29.1,304,1.56,86.9,...,73.3,7.9,7532,Mirage,0,0,1,0,0,0
4,404,8637,6667,1,2898,217,55.8,248,0.88,64.8,...,80.0,12.4,6667,Overpass,0,0,0,0,1,0


In [18]:
y = pd.DataFrame(final_df['who_win_x'])
y.shape

(697, 1)

In [19]:
final_df.drop(['who_win_x', 'who_win_y','map_name_x_y', 'map_name_y_x', 'map_name_y_y'], inplace=True, axis=1)

In [20]:
final_df

Unnamed: 0,map_id,team1_id_x,team2_id_x,p1_id_x,p1_total_kills_x,p1_headshots_x,p1_total_deaths_x,p1_kd_ratio_x,p1_damage_per_round_x,p1_grenade_damage_per_round_x,...,p5_opening_kill_rating_y,p5_team_win_percent_after_first_kill_y,p5_first_kill_in_won_rounds_y,team_id_y,map_name_x_x_Dust2,map_name_x_x_Inferno,map_name_x_x_Mirage,map_name_x_x_Nuke,map_name_x_x_Overpass,map_name_x_x_Vertigo
0,289,6665,7718,4954,258,36.0,293,0.88,71.1,6.3,...,1.06,76.2,19.0,7718,0,0,0,0,0,0
1,715,4411,10577,8611,178,39.3,208,0.86,64.1,6.5,...,1.39,75.0,23.3,10577,0,1,0,0,0,0
2,157,11251,9455,7938,494,52.8,397,1.24,94.0,5.8,...,1.09,53.1,15.5,9455,0,0,0,1,0,0
3,524,4608,7532,7998,474,29.1,304,1.56,86.9,2.4,...,0.82,73.3,7.9,7532,0,0,1,0,0,0
4,404,8637,6667,2898,217,55.8,248,0.88,64.8,4.6,...,1.01,80.0,12.4,6667,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
708,709,6667,4773,429,320,47.8,396,0.81,71.2,11.4,...,1.12,57.1,13.8,4773,0,1,0,0,0,0
709,528,9215,5995,11154,203,50.2,131,1.55,82.1,3.4,...,0.00,0.0,0.0,5995,0,0,0,0,0,0
710,163,4869,9565,922,447,44.3,538,0.83,65.2,5.7,...,0.87,69.0,10.2,9565,0,0,1,0,0,0
711,96,10426,4991,973,499,43.1,472,1.06,73.8,2.7,...,0.90,86.1,12.7,4991,0,0,0,1,0,0


In [21]:
X = pd.DataFrame(final_df)
X.info()
X.isnull().values.any()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 697 entries, 0 to 712
Columns: 263 entries, map_id to map_name_x_x_Vertigo
dtypes: float64(160), int64(97), uint8(6)
memory usage: 1.4 MB


False

In [22]:
X.shape

(697, 263)

In [23]:
# X_new = SelectKBest(chi2, k=50).fit_transform(X, y)

In [24]:
def optimize_hyperparams(model, parameters, x_train, y_train):
    nfolds = 10
    cross_val = StratifiedKFold(nfolds)
    grid = GridSearchCV(model, parameters, cv=cross_val, refit=True, verbose=1, n_jobs=4)
    grid.fit(x_train, y_train)
    print(f'Accuracy : {grid.best_score_} with params {grid.best_estimator_}')
    return grid.best_estimator_

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1337)

In [26]:
rf_clf = RandomForestClassifier(criterion='gini')
print(X_train.shape, y_train.shape)
rf_params = {
    'n_estimators': [5, 10, 15, 20, 25],
    'max_depth': [5, 10, 15, 20, 25],
    'max_features': [16, 32, 64, 128, 256],
    'max_leaf_nodes': [5, 10],
    'bootstrap': [True]
}
best_rf = optimize_hyperparams(rf_clf, rf_params, X_train, y_train.values.ravel())

(557, 263) (557, 1)
Fitting 10 folds for each of 250 candidates, totalling 2500 fits
Accuracy : 0.5999025974025974 with params RandomForestClassifier(max_depth=5, max_features=256, max_leaf_nodes=5,
                       n_estimators=5)


In [16]:
import optuna

In [47]:
def objective(trial):
    x = X_train
    y = y_train.values.ravel()
    
    max_depth = trial.suggest_int('xgb_max_depth', 2, 64, log=True)
    max_leaves = trial.suggest_int('xgb_max_leaves', 5, 20)
    n_estimators = trial.suggest_int('xgb_n_estimators', 100, 200)
    learning_rate = trial.suggest_float('xgb_learning_rate', 0.01, 0.5)
    gamma = trial.suggest_float('xgb_gamma', 1, 9)
    reg_alpha = trial.suggest_int('xgb_reg_alpha', 40, 180)
    min_child_weight = trial.suggest_int('xgb_minchild_weight', 0, 10)
    
    xgb_model = xgb.XGBClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        max_leaves=max_leaves,
        gamma=gamma
    )
    
    
#     rf_model = RandomForestClassifier(
#         max_depth= max_depth,
#         max_samples=max_samples,
#         max_features=max_features,
#         max_leaf_nodes=max_leaf_nodes,
#         n_estimators=n_estimators,
#         random_state=1337
#     )
    score = cross_val_score(xgb_model, x, y, cv=5).mean()
    return score

In [48]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials = 100)

trial = study.best_trial
print(f'best score = {trial.value}')
print(f'Best params: ')
for key, value in trial.params.items():
    print(f'{key} {value}')

[32m[I 2022-12-02 17:49:37,345][0m A new study created in memory with name: no-name-1f5afa4d-018d-4d1a-8014-2f8e70051993[0m
[32m[I 2022-12-02 17:49:43,166][0m Trial 0 finished with value: 0.49734555984555984 and parameters: {'xgb_max_depth': 15, 'xgb_max_leaves': 17, 'xgb_n_estimators': 183, 'xgb_learning_rate': 0.297842491952613, 'xgb_gamma': 2.212870938906363, 'xgb_reg_alpha': 80, 'xgb_minchild_weight': 1}. Best is trial 0 with value: 0.49734555984555984.[0m
[32m[I 2022-12-02 17:49:46,469][0m Trial 1 finished with value: 0.5170688545688545 and parameters: {'xgb_max_depth': 11, 'xgb_max_leaves': 16, 'xgb_n_estimators': 101, 'xgb_learning_rate': 0.34046772727261604, 'xgb_gamma': 3.3213882378976542, 'xgb_reg_alpha': 50, 'xgb_minchild_weight': 5}. Best is trial 1 with value: 0.5170688545688545.[0m
[32m[I 2022-12-02 17:49:48,222][0m Trial 2 finished with value: 0.52253861003861 and parameters: {'xgb_max_depth': 2, 'xgb_max_leaves': 19, 'xgb_n_estimators': 176, 'xgb_learning_rat

[32m[I 2022-12-02 17:51:24,074][0m Trial 24 finished with value: 0.5332368082368083 and parameters: {'xgb_max_depth': 2, 'xgb_max_leaves': 5, 'xgb_n_estimators': 177, 'xgb_learning_rate': 0.12258192003811542, 'xgb_gamma': 6.702731087102576, 'xgb_reg_alpha': 123, 'xgb_minchild_weight': 4}. Best is trial 11 with value: 0.5475225225225225.[0m
[32m[I 2022-12-02 17:51:27,599][0m Trial 25 finished with value: 0.5188706563706564 and parameters: {'xgb_max_depth': 5, 'xgb_max_leaves': 7, 'xgb_n_estimators': 186, 'xgb_learning_rate': 0.02037801621869545, 'xgb_gamma': 4.12481934003872, 'xgb_reg_alpha': 106, 'xgb_minchild_weight': 6}. Best is trial 11 with value: 0.5475225225225225.[0m
[32m[I 2022-12-02 17:51:34,032][0m Trial 26 finished with value: 0.4919723294723295 and parameters: {'xgb_max_depth': 30, 'xgb_max_leaves': 10, 'xgb_n_estimators': 194, 'xgb_learning_rate': 0.4354831395155554, 'xgb_gamma': 5.268979764971886, 'xgb_reg_alpha': 138, 'xgb_minchild_weight': 1}. Best is trial 11 w

[32m[I 2022-12-02 17:53:26,468][0m Trial 48 finished with value: 0.5152670527670528 and parameters: {'xgb_max_depth': 13, 'xgb_max_leaves': 12, 'xgb_n_estimators': 196, 'xgb_learning_rate': 0.31613319498749115, 'xgb_gamma': 8.07835256866917, 'xgb_reg_alpha': 131, 'xgb_minchild_weight': 8}. Best is trial 45 with value: 0.5546975546975548.[0m
[32m[I 2022-12-02 17:53:32,647][0m Trial 49 finished with value: 0.5313867438867439 and parameters: {'xgb_max_depth': 24, 'xgb_max_leaves': 20, 'xgb_n_estimators': 174, 'xgb_learning_rate': 0.24760373674899175, 'xgb_gamma': 6.000921621436197, 'xgb_reg_alpha': 149, 'xgb_minchild_weight': 5}. Best is trial 45 with value: 0.5546975546975548.[0m
[32m[I 2022-12-02 17:53:36,972][0m Trial 50 finished with value: 0.5188545688545689 and parameters: {'xgb_max_depth': 38, 'xgb_max_leaves': 10, 'xgb_n_estimators': 118, 'xgb_learning_rate': 0.11953529649634889, 'xgb_gamma': 6.550325156069556, 'xgb_reg_alpha': 168, 'xgb_minchild_weight': 0}. Best is trial

[32m[I 2022-12-02 17:56:03,331][0m Trial 72 finished with value: 0.5241312741312741 and parameters: {'xgb_max_depth': 9, 'xgb_max_leaves': 12, 'xgb_n_estimators': 197, 'xgb_learning_rate': 0.18863680509593295, 'xgb_gamma': 5.759137043793956, 'xgb_reg_alpha': 147, 'xgb_minchild_weight': 2}. Best is trial 55 with value: 0.5548423423423424.[0m
[32m[I 2022-12-02 17:56:10,151][0m Trial 73 finished with value: 0.5188545688545689 and parameters: {'xgb_max_depth': 55, 'xgb_max_leaves': 8, 'xgb_n_estimators': 192, 'xgb_learning_rate': 0.010496353208464697, 'xgb_gamma': 5.122024106962755, 'xgb_reg_alpha': 130, 'xgb_minchild_weight': 3}. Best is trial 55 with value: 0.5548423423423424.[0m
[32m[I 2022-12-02 17:56:16,279][0m Trial 74 finished with value: 0.5402992277992278 and parameters: {'xgb_max_depth': 12, 'xgb_max_leaves': 10, 'xgb_n_estimators': 184, 'xgb_learning_rate': 0.22576321011493258, 'xgb_gamma': 5.5786126814148735, 'xgb_reg_alpha': 155, 'xgb_minchild_weight': 4}. Best is tria

[32m[I 2022-12-02 17:59:04,948][0m Trial 96 finished with value: 0.5296975546975548 and parameters: {'xgb_max_depth': 19, 'xgb_max_leaves': 8, 'xgb_n_estimators': 189, 'xgb_learning_rate': 0.1659253711695106, 'xgb_gamma': 8.35849653709927, 'xgb_reg_alpha': 131, 'xgb_minchild_weight': 2}. Best is trial 55 with value: 0.5548423423423424.[0m
[32m[I 2022-12-02 17:59:13,790][0m Trial 97 finished with value: 0.5242760617760618 and parameters: {'xgb_max_depth': 16, 'xgb_max_leaves': 8, 'xgb_n_estimators': 193, 'xgb_learning_rate': 0.19130258144927112, 'xgb_gamma': 7.367898523519252, 'xgb_reg_alpha': 176, 'xgb_minchild_weight': 3}. Best is trial 55 with value: 0.5548423423423424.[0m
[32m[I 2022-12-02 17:59:22,095][0m Trial 98 finished with value: 0.5403153153153153 and parameters: {'xgb_max_depth': 13, 'xgb_max_leaves': 11, 'xgb_n_estimators': 200, 'xgb_learning_rate': 0.29029290590327617, 'xgb_gamma': 8.797185014112916, 'xgb_reg_alpha': 145, 'xgb_minchild_weight': 3}. Best is trial 55

best score = 0.5548423423423424
Best params: 
xgb_max_depth 18
xgb_max_leaves 11
xgb_n_estimators 191
xgb_learning_rate 0.13503888827516713
xgb_gamma 7.56591636937328
xgb_reg_alpha 121
xgb_minchild_weight 4


In [49]:
xgb_clf = xgb.XGBClassifier()
xgb_params = {
    'n_estimators': [200, 250],
    'max_leaves': [5, 10, 15],
    'max_depth': [2, 5],
    'booster': ['gblinear'],
    'learning_rate': [0.3, 0.5]
}
best_xgb = optimize_hyperparams(xgb_clf, xgb_params, X_train, y_train)

Fitting 10 folds for each of 24 candidates, totalling 240 fits
Parameters: { "max_depth", "max_leaves" } are not used.

Accuracy : 0.5978246753246754 with params XGBClassifier(base_score=0.5, booster='gblinear', callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=-1, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.5, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=2, max_leaves=5,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=200, n_jobs=0, num_parallel_tree=None,
              predictor=None, random_state=0, ...)


In [50]:
optuna.visualization.plot_param_importances(study)

In [51]:
optuna.visualization.plot_optimization_history(study)