In [1]:
import pandas as pd
import optuna
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV,TimeSeriesSplit, cross_validate, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


# Render every column and every row of a DataFrame instead of eliding them with "...".
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# One seed reused as `random_state` throughout the notebook, for reproducibility.
RANDOM_SEED = 1

# Load the Data

In [60]:
# Read the preprocessed training split and preview its first rows.
# NOTE(review): the preview contains an "Unnamed: 0" column, i.e. a saved index
# is read back as a regular column; it is not among the selected features.
train = pd.read_csv("vct_data/train_preprocessed.csv")
train.head()

Unnamed: 0,Tournament,Stage,Match Type,Match Name,Map,Team A,Team A Score,Team A Attacker Score,Team A Defender Score,Team A Overtime Score,Team B,Team B Score,Team B Attacker Score,Team B Defender Score,Team B Overtime Score,Duration,Rating_TeamA,Average Combat Score_TeamA,Kills_TeamA,Deaths_TeamA,Assists_TeamA,Kills - Deaths (KD)_TeamA,"Kill, Assist, Trade, Survive %_TeamA",Average Damage Per Round_TeamA,Headshot %_TeamA,First Kills_TeamA,First Deaths_TeamA,Kills - Deaths (FKD)_TeamA,Rating_TeamB,Average Combat Score_TeamB,Kills_TeamB,Deaths_TeamB,Assists_TeamB,Kills - Deaths (KD)_TeamB,"Kill, Assist, Trade, Survive %_TeamB",Average Damage Per Round_TeamB,Headshot %_TeamB,First Kills_TeamB,First Deaths_TeamB,Kills - Deaths (FKD)_TeamB,Loadout Value_TeamA,Remaining Credits_TeamA,Type_TeamA,Loadout Value_TeamB,Remaining Credits_TeamB,Type_TeamB,2k_TeamA,3k_TeamA,4k_TeamA,5k_TeamA,1v1_TeamA,1v2_TeamA,1v3_TeamA,1v4_TeamA,1v5_TeamA,Econ_TeamA,Spike Plants_TeamA,Spike Defuses_TeamA,2k_TeamB,3k_TeamB,4k_TeamB,5k_TeamB,1v1_TeamB,1v2_TeamB,1v3_TeamB,1v4_TeamB,1v5_TeamB,Econ_TeamB,Spike Plants_TeamB,Spike Defuses_TeamB,Elimination_TeamA,Detonated_TeamA,Defused_TeamA,Time Expiry (No Plant)_TeamA,Eliminated_TeamA,Defused Failed_TeamA,Detonation Denied_TeamA,Time Expiry (Failed to Plant)_TeamA,Elimination_TeamB,Detonated_TeamB,Defused_TeamB,Time Expiry (No Plant)_TeamB,Eliminated_TeamB,Defused Failed_TeamB,Detonation Denied_TeamB,Time Expiry (Failed to Plant)_TeamB,KDA_TeamA,Round Win %_TeamA,First Blood %_TeamA,Clutches_TeamA,Attacker Win %_TeamA,Defender Win %_TeamA,Overtime Win %_TeamA,Rating_RollAvg_TeamA,Average Combat Score_RollAvg_TeamA,Average Damage Per Round_RollAvg_TeamA,KDA_RollAvg_TeamA,"Kill, Assist, Trade, Survive %_RollAvg_TeamA",Round Win %_RollAvg_TeamA,Attacker Win %_RollAvg_TeamA,Defender Win %_RollAvg_TeamA,Overtime Win %_RollAvg_TeamA,First Blood %_RollAvg_TeamA,Headshot %_RollAvg_TeamA,Clutches_RollAvg_TeamA,Econ_RollAvg_TeamA,Recent Win 
%_TeamA,KDA_TeamB,Round Win %_TeamB,First Blood %_TeamB,Clutches_TeamB,Attacker Win %_TeamB,Defender Win %_TeamB,Overtime Win %_TeamB,Rating_RollAvg_TeamB,Average Combat Score_RollAvg_TeamB,Average Damage Per Round_RollAvg_TeamB,KDA_RollAvg_TeamB,"Kill, Assist, Trade, Survive %_RollAvg_TeamB",Round Win %_RollAvg_TeamB,Attacker Win %_RollAvg_TeamB,Defender Win %_RollAvg_TeamB,Overtime Win %_RollAvg_TeamB,First Blood %_RollAvg_TeamB,Headshot %_RollAvg_TeamB,Clutches_RollAvg_TeamB,Econ_RollAvg_TeamB,Recent Win %_TeamB,Team A Tournament Win %,Team A Map Win %,Team A H2H Win %,Team B Tournament Win %,Team B Map Win %,Team B H2H Win %,Map_Abyss,Map_Ascent,Map_Bind,Map_Breeze,Map_Fracture,Map_Haven,Map_Icebox,Map_Lotus,Map_Pearl,Map_Split,Map_Sunset,Team A_Encoded,Team B_Encoded,Winner
0,Champions Tour LATAM Stage 1: Challengers 1,Open Qualifier: LAS,Round of 16,Leviatán vs Furious Gaming,Ascent,Leviatán,7,6,1.0,0.0,Furious Gaming,13,7,6.0,0.0,41.883333,0.990122,186.0,60.0,76.0,22.0,-16.0,0.699855,119.6,0.25,10.0,10.0,0.0,0.990134,218.4,76.0,60.0,25.0,16.0,0.699858,132.8,0.19,10.0,10.0,0.0,17455.584114,8510.17329,Full buy: 20k+,17455.711283,8510.381277,Full buy: 20k+,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53.517956,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53.51867,0.0,0.0,2.0,1.0,3.0,1.0,9.0,0.0,3.0,1.0,9.0,0.0,3.0,1.0,2.0,1.0,3.0,1.0,1.078947,0.35,0.5,0.0,0.5,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.683333,0.65,0.5,0.0,0.875,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.559545,0.534095,0
1,Champions Tour LATAM Stage 1: Challengers 1,Open Qualifier: LAS,Round of 16,Leviatán vs Furious Gaming,Ascent,Furious Gaming,13,7,6.0,0.0,Leviatán,7,6,1.0,0.0,41.883333,0.990122,218.4,76.0,60.0,25.0,16.0,0.699855,132.8,0.19,10.0,10.0,0.0,0.990134,186.0,60.0,76.0,22.0,-16.0,0.699858,119.6,0.25,10.0,10.0,0.0,17455.584114,8510.17329,Full buy: 20k+,17455.711283,8510.381277,Full buy: 20k+,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53.517956,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53.51867,0.0,0.0,9.0,0.0,3.0,1.0,2.0,1.0,3.0,1.0,2.0,1.0,3.0,1.0,9.0,0.0,3.0,1.0,1.683333,0.65,0.5,0.0,0.875,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.078947,0.35,0.5,0.0,0.5,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.465905,0.440455,1
2,Champions Tour LATAM Stage 1: Challengers 1,Open Qualifier: LAS,Round of 16,Leviatán vs Furious Gaming,Bind,Leviatán,7,5,2.0,0.0,Furious Gaming,13,6,7.0,0.0,38.933333,0.990122,175.8,56.0,81.0,27.0,-25.0,0.699855,113.6,0.16,10.0,10.0,0.0,0.990134,228.8,81.0,56.0,35.0,25.0,0.699858,164.6,0.202,10.0,10.0,0.0,17455.584114,8510.17329,Full buy: 20k+,17455.711283,8510.381277,Full buy: 20k+,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53.517956,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53.51867,0.0,0.0,4.0,3.0,0.0,0.0,8.0,0.0,5.0,0.0,8.0,0.0,5.0,0.0,4.0,3.0,0.0,0.0,1.024691,0.35,0.5,0.0,0.416667,0.25,0.0,0.990122,186.0,119.6,1.078947,0.699855,0.35,0.5,0.125,0.0,0.5,0.25,0.0,53.517956,0.0,2.071429,0.65,0.5,0.0,0.75,0.583333,0.0,0.990134,218.4,132.8,1.683333,0.699858,0.65,0.875,0.5,0.0,0.5,0.19,0.0,53.51867,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.559545,0.534095,0
3,Champions Tour LATAM Stage 1: Challengers 1,Open Qualifier: LAS,Round of 16,Leviatán vs Furious Gaming,Bind,Furious Gaming,13,6,7.0,0.0,Leviatán,7,5,2.0,0.0,38.933333,0.990122,228.8,81.0,56.0,35.0,25.0,0.699855,164.6,0.202,10.0,10.0,0.0,0.990134,175.8,56.0,81.0,27.0,-25.0,0.699858,113.6,0.16,10.0,10.0,0.0,17455.584114,8510.17329,Full buy: 20k+,17455.711283,8510.381277,Full buy: 20k+,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53.517956,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53.51867,0.0,0.0,8.0,0.0,5.0,0.0,4.0,3.0,0.0,0.0,4.0,3.0,0.0,0.0,8.0,0.0,5.0,0.0,2.071429,0.65,0.5,0.0,0.75,0.583333,0.0,0.990122,218.4,132.8,1.683333,0.699855,0.65,0.875,0.5,0.0,0.5,0.19,0.0,53.517956,1.0,1.024691,0.35,0.5,0.0,0.416667,0.25,0.0,0.990134,186.0,119.6,1.078947,0.699858,0.35,0.5,0.125,0.0,0.5,0.25,0.0,53.51867,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.465905,0.440455,1
4,Champions Tour LATAM Stage 1: Challengers 1,Open Qualifier: LAS,Round of 16,Procyon Team vs KRÜ Esports,Bind,Procyon Team,1,1,0.0,0.0,KRÜ Esports,13,2,11.0,0.0,28.033333,0.990122,187.2,40.0,67.0,21.0,-27.0,0.699855,130.8,0.192,6.0,8.0,-2.0,0.990134,253.0,67.0,40.0,23.0,27.0,0.699858,150.0,0.314,8.0,6.0,2.0,17455.584114,8510.17329,Full buy: 20k+,17455.711283,8510.381277,Full buy: 20k+,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53.517956,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53.51867,0.0,0.0,0.0,0.0,1.0,0.0,9.0,3.0,1.0,0.0,9.0,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.910448,0.071429,0.428571,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.25,0.928571,0.571429,0.0,1.0,0.916667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.313396,0


In [61]:
# Read the preprocessed test split (same columns as the training file) and
# preview its first rows.
test = pd.read_csv("vct_data/test_preprocessed.csv")
test.head()

Unnamed: 0,Tournament,Stage,Match Type,Match Name,Map,Team A,Team A Score,Team A Attacker Score,Team A Defender Score,Team A Overtime Score,Team B,Team B Score,Team B Attacker Score,Team B Defender Score,Team B Overtime Score,Duration,Rating_TeamA,Average Combat Score_TeamA,Kills_TeamA,Deaths_TeamA,Assists_TeamA,Kills - Deaths (KD)_TeamA,"Kill, Assist, Trade, Survive %_TeamA",Average Damage Per Round_TeamA,Headshot %_TeamA,First Kills_TeamA,First Deaths_TeamA,Kills - Deaths (FKD)_TeamA,Rating_TeamB,Average Combat Score_TeamB,Kills_TeamB,Deaths_TeamB,Assists_TeamB,Kills - Deaths (KD)_TeamB,"Kill, Assist, Trade, Survive %_TeamB",Average Damage Per Round_TeamB,Headshot %_TeamB,First Kills_TeamB,First Deaths_TeamB,Kills - Deaths (FKD)_TeamB,Loadout Value_TeamA,Remaining Credits_TeamA,Type_TeamA,Loadout Value_TeamB,Remaining Credits_TeamB,Type_TeamB,2k_TeamA,3k_TeamA,4k_TeamA,5k_TeamA,1v1_TeamA,1v2_TeamA,1v3_TeamA,1v4_TeamA,1v5_TeamA,Econ_TeamA,Spike Plants_TeamA,Spike Defuses_TeamA,2k_TeamB,3k_TeamB,4k_TeamB,5k_TeamB,1v1_TeamB,1v2_TeamB,1v3_TeamB,1v4_TeamB,1v5_TeamB,Econ_TeamB,Spike Plants_TeamB,Spike Defuses_TeamB,Elimination_TeamA,Detonated_TeamA,Defused_TeamA,Time Expiry (No Plant)_TeamA,Eliminated_TeamA,Defused Failed_TeamA,Detonation Denied_TeamA,Time Expiry (Failed to Plant)_TeamA,Elimination_TeamB,Detonated_TeamB,Defused_TeamB,Time Expiry (No Plant)_TeamB,Eliminated_TeamB,Defused Failed_TeamB,Detonation Denied_TeamB,Time Expiry (Failed to Plant)_TeamB,KDA_TeamA,Round Win %_TeamA,First Blood %_TeamA,Clutches_TeamA,Attacker Win %_TeamA,Defender Win %_TeamA,Overtime Win %_TeamA,Rating_RollAvg_TeamA,Average Combat Score_RollAvg_TeamA,Average Damage Per Round_RollAvg_TeamA,KDA_RollAvg_TeamA,"Kill, Assist, Trade, Survive %_RollAvg_TeamA",Round Win %_RollAvg_TeamA,Attacker Win %_RollAvg_TeamA,Defender Win %_RollAvg_TeamA,Overtime Win %_RollAvg_TeamA,First Blood %_RollAvg_TeamA,Headshot %_RollAvg_TeamA,Clutches_RollAvg_TeamA,Econ_RollAvg_TeamA,Recent Win 
%_TeamA,KDA_TeamB,Round Win %_TeamB,First Blood %_TeamB,Clutches_TeamB,Attacker Win %_TeamB,Defender Win %_TeamB,Overtime Win %_TeamB,Rating_RollAvg_TeamB,Average Combat Score_RollAvg_TeamB,Average Damage Per Round_RollAvg_TeamB,KDA_RollAvg_TeamB,"Kill, Assist, Trade, Survive %_RollAvg_TeamB",Round Win %_RollAvg_TeamB,Attacker Win %_RollAvg_TeamB,Defender Win %_RollAvg_TeamB,Overtime Win %_RollAvg_TeamB,First Blood %_RollAvg_TeamB,Headshot %_RollAvg_TeamB,Clutches_RollAvg_TeamB,Econ_RollAvg_TeamB,Recent Win %_TeamB,Team A Tournament Win %,Team A Map Win %,Team A H2H Win %,Team B Tournament Win %,Team B Map Win %,Team B H2H Win %,Map_Abyss,Map_Ascent,Map_Bind,Map_Breeze,Map_Fracture,Map_Haven,Map_Icebox,Map_Lotus,Map_Pearl,Map_Split,Map_Sunset,Team A_Encoded,Team B_Encoded,Winner
0,Champions Tour Malaysia & Singapore Stage 2: C...,Group Stage,Day 7,BLEED vs Galaxy Esports,Icebox,Galaxy Esports,13,4,9.0,0.0,BLEED,7,3,4.0,0.0,43.333333,1.164,222.2,82.0,60.0,18.0,22.0,0.73,145.6,0.226,11.0,9.0,2.0,0.806,179.2,60.0,82.0,24.0,-22.0,0.63,119.2,0.24,9.0,11.0,-2.0,19605.0,10715.0,Full buy: 20k+,15860.0,4590.0,Semi-buy: 10-20k,14.0,7.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,67.2,5.0,2.0,10.0,2.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,46.2,5.0,0.0,10.0,1.0,2.0,0.0,4.0,2.0,0.0,1.0,4.0,2.0,0.0,1.0,10.0,1.0,2.0,0.0,1.666667,0.65,0.55,1.0,0.5,0.75,0.0,0.924,194.4,125.38,1.301095,0.7012,0.440863,0.45,0.463308,0.0,0.513037,0.2098,1.6,50.04,0.4,1.02439,0.35,0.45,3.0,0.25,0.5,0.0,0.971,197.86,129.62,1.379461,0.7036,0.489425,0.5625,0.436111,0.025,0.475159,0.2452,1.8,52.34,0.6,0.5,0.2,0.5,0.571429,0.666667,0.5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.447334,0.414839,1
1,Champions Tour Malaysia & Singapore Stage 2: C...,Group Stage,Day 8,BLEED vs KPMOONIIBLM9,Ascent,BLEED,11,5,6.0,0.0,KPMOONIIBLM9,13,6,7.0,0.0,46.366667,0.92,196.2,80.0,86.0,35.0,-6.0,0.684,131.0,0.23,9.0,15.0,-6.0,1.128,205.8,86.0,81.0,43.0,5.0,0.726,143.2,0.29,15.0,9.0,6.0,17050.0,8370.833333,Full buy: 20k+,18195.833333,11104.166667,Full buy: 20k+,12.0,5.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,54.2,6.0,2.0,17.0,4.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,59.2,7.0,3.0,8.0,1.0,2.0,0.0,10.0,0.0,3.0,0.0,10.0,0.0,3.0,0.0,8.0,1.0,2.0,0.0,1.337209,0.458333,0.375,1.0,0.416667,0.5,0.0,0.9364,194.06,127.78,1.312059,0.6876,0.459425,0.5,0.436111,0.025,0.460159,0.2482,1.8,51.3,0.3,1.592593,0.541667,0.625,1.0,0.5,0.583333,0.0,1.0512,211.96,136.7,1.607471,0.735,0.524396,0.545014,0.516667,0.0,0.550017,0.251,1.4,57.5,0.4,0.533333,0.666667,1.0,0.68,0.666667,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.57295,0.371567,0
2,Champions Tour Malaysia & Singapore Stage 2: C...,Group Stage,Day 8,BLEED vs KPMOONIIBLM9,Ascent,KPMOONIIBLM9,13,6,7.0,0.0,BLEED,11,5,6.0,0.0,46.366667,1.128,205.8,86.0,81.0,43.0,5.0,0.726,143.2,0.29,15.0,9.0,6.0,0.92,196.2,80.0,86.0,35.0,-6.0,0.684,131.0,0.23,9.0,15.0,-6.0,18195.833333,11104.166667,Full buy: 20k+,17050.0,8370.833333,Full buy: 20k+,17.0,4.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,59.2,7.0,3.0,12.0,5.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,54.2,6.0,2.0,10.0,0.0,3.0,0.0,8.0,1.0,2.0,0.0,8.0,1.0,2.0,0.0,10.0,0.0,3.0,0.0,1.592593,0.541667,0.625,1.0,0.5,0.583333,0.0,1.0512,211.96,136.7,1.607471,0.735,0.524396,0.545014,0.516667,0.0,0.550017,0.251,1.4,57.5,0.6,1.337209,0.458333,0.375,1.0,0.416667,0.5,0.0,0.9364,194.06,127.78,1.312059,0.6876,0.459425,0.5,0.436111,0.025,0.460159,0.2482,1.8,51.3,0.7,0.68,0.666667,0.0,0.533333,0.666667,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.628433,0.414839,1
3,Champions Tour Malaysia & Singapore Stage 2: C...,Group Stage,Day 8,BLEED vs KPMOONIIBLM9,Icebox,BLEED,10,5,5.0,0.0,KPMOONIIBLM9,13,6,7.0,0.0,47.7,0.898,199.4,79.0,86.0,27.0,-7.0,0.68,131.4,0.244,16.0,7.0,9.0,1.054,208.4,86.0,79.0,28.0,7.0,0.672,134.6,0.246,7.0,16.0,-9.0,17465.217391,5791.304348,Full buy: 20k+,17930.434783,9091.304348,Full buy: 20k+,12.0,3.0,2.0,1.0,2.0,2.0,0.0,0.0,0.0,52.4,9.0,0.0,10.0,10.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,52.8,7.0,4.0,8.0,1.0,0.0,1.0,8.0,1.0,4.0,0.0,8.0,1.0,4.0,0.0,8.0,1.0,0.0,1.0,1.232558,0.434783,0.695652,4.0,0.416667,0.454545,0.0,0.9648,198.04,130.76,1.367839,0.6984,0.486508,0.491667,0.477778,0.025,0.472659,0.243,1.9,52.7,0.3,1.443038,0.565217,0.304348,2.0,0.545455,0.583333,0.0,1.0178,207.98,134.68,1.434472,0.7212,0.497312,0.520014,0.475,0.0,0.531267,0.2582,1.4,54.7,0.4,0.5,0.6,0.5,0.692308,0.7,0.5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.57295,0.371567,0
4,Champions Tour Malaysia & Singapore Stage 2: C...,Group Stage,Day 8,BLEED vs KPMOONIIBLM9,Icebox,KPMOONIIBLM9,13,6,7.0,0.0,BLEED,10,5,5.0,0.0,47.7,1.054,208.4,86.0,79.0,28.0,7.0,0.672,134.6,0.246,7.0,16.0,-9.0,0.898,199.4,79.0,86.0,27.0,-7.0,0.68,131.4,0.244,16.0,7.0,9.0,17930.434783,9091.304348,Full buy: 20k+,17465.217391,5791.304348,Full buy: 20k+,10.0,10.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,52.8,7.0,4.0,12.0,3.0,2.0,1.0,2.0,2.0,0.0,0.0,0.0,52.4,9.0,0.0,8.0,1.0,4.0,0.0,8.0,1.0,0.0,1.0,8.0,1.0,0.0,1.0,8.0,1.0,4.0,0.0,1.443038,0.565217,0.304348,2.0,0.545455,0.583333,0.0,1.0178,207.98,134.68,1.434472,0.7212,0.497312,0.520014,0.475,0.0,0.531267,0.2582,1.4,54.7,0.6,1.232558,0.434783,0.695652,4.0,0.416667,0.454545,0.0,0.9648,198.04,130.76,1.367839,0.6984,0.486508,0.491667,0.477778,0.025,0.472659,0.243,1.9,52.7,0.7,0.692308,0.7,0.5,0.5,0.6,0.5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.628433,0.414839,1


# Data Preprocessing

## Feature selection / Dimensionality reduction

## Extract features and target

In [4]:
# Model input columns, in a fixed order. Every per-team statistic appears as an
# adjacent (Team A, Team B) pair; the eleven one-hot map indicators sit between
# the win-rate pairs and the per-map win rates.
features = [
    # Encoded team identities, then head-to-head / tournament / recent win rates.
    *(f"Team {side}_Encoded" for side in "AB"),
    *(f"Team {side} H2H Win %" for side in "AB"),
    *(f"Team {side} Tournament Win %" for side in "AB"),
    *(f"Recent Win %_Team{side}" for side in "AB"),
    # One-hot map indicators.
    *(
        f"Map_{map_name}"
        for map_name in (
            "Abyss", "Ascent", "Bind", "Breeze", "Fracture", "Haven",
            "Icebox", "Lotus", "Pearl", "Split", "Sunset",
        )
    ),
    # Win rate of each team on the map being played.
    *(f"Team {side} Map Win %" for side in "AB"),
    # Rolling averages of the per-map statistics, Team A then Team B for each one.
    *(
        f"{stat}_RollAvg_Team{side}"
        for stat in (
            "Round Win %",
            "Attacker Win %",
            "Defender Win %",
            "Overtime Win %",
            "Rating",
            "Average Combat Score",
            "Average Damage Per Round",
            "KDA",
            "Kill, Assist, Trade, Survive %",
            "First Blood %",
            "Headshot %",
            "Clutches",
            "Econ",
        )
        for side in "AB"
    ),
]

# Split each frame into model inputs (the selected feature columns) and the
# "Winner" target column.
X_train, y_train = train[features], train["Winner"]
X_test, y_test = test[features], test["Winner"]

In [5]:
# Preview the first rows of the selected training features.
X_train.head()

Unnamed: 0,Team A_Encoded,Team B_Encoded,Team A H2H Win %,Team B H2H Win %,Recent Win %_TeamA,Recent Win %_TeamB,Map_Abyss,Map_Ascent,Map_Bind,Map_Breeze,Map_Fracture,Map_Haven,Map_Icebox,Map_Lotus,Map_Pearl,Map_Split,Map_Sunset,Team A Map Win %,Team B Map Win %,Attacker Win %_RollAvg_TeamA,Attacker Win %_RollAvg_TeamB,Defender Win %_RollAvg_TeamA,Defender Win %_RollAvg_TeamB,Overtime Win %_RollAvg_TeamA,Overtime Win %_RollAvg_TeamB,Rating_RollAvg_TeamA,Rating_RollAvg_TeamB,Average Combat Score_RollAvg_TeamA,Average Combat Score_RollAvg_TeamB,Average Damage Per Round_RollAvg_TeamA,Average Damage Per Round_RollAvg_TeamB,KDA_RollAvg_TeamA,KDA_RollAvg_TeamB,"Kill, Assist, Trade, Survive %_RollAvg_TeamA","Kill, Assist, Trade, Survive %_RollAvg_TeamB",First Blood %_RollAvg_TeamA,First Blood %_RollAvg_TeamB,Headshot %_RollAvg_TeamA,Headshot %_RollAvg_TeamB,Clutches_RollAvg_TeamA,Clutches_RollAvg_TeamB,Econ_RollAvg_TeamA,Econ_RollAvg_TeamB
0,0.559545,0.534094,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.465905,0.440455,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.559545,0.534094,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.875,0.125,0.5,0.0,0.0,0.990122,0.990134,186.0,218.4,119.6,132.8,1.078947,1.683333,0.699855,0.699858,0.5,0.5,0.25,0.19,0.0,0.0,53.517956,53.51867
3,0.465905,0.440455,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.875,0.5,0.5,0.125,0.0,0.0,0.990122,0.990134,218.4,186.0,132.8,119.6,1.683333,1.078947,0.699855,0.699858,0.5,0.5,0.19,0.25,0.0,0.0,53.517956,53.51867
4,0.0,0.313396,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# (rows, columns) of the training feature matrix.
# NOTE(review): the saved output reports 43 columns while `features` lists 47
# names, so the stored output looks stale — re-run the notebook to refresh it.
X_train.shape

(40497, 43)

In [7]:
# Preview the first rows of the selected test features.
X_test.head()

Unnamed: 0,Team A_Encoded,Team B_Encoded,Team A H2H Win %,Team B H2H Win %,Recent Win %_TeamA,Recent Win %_TeamB,Map_Abyss,Map_Ascent,Map_Bind,Map_Breeze,Map_Fracture,Map_Haven,Map_Icebox,Map_Lotus,Map_Pearl,Map_Split,Map_Sunset,Team A Map Win %,Team B Map Win %,Attacker Win %_RollAvg_TeamA,Attacker Win %_RollAvg_TeamB,Defender Win %_RollAvg_TeamA,Defender Win %_RollAvg_TeamB,Overtime Win %_RollAvg_TeamA,Overtime Win %_RollAvg_TeamB,Rating_RollAvg_TeamA,Rating_RollAvg_TeamB,Average Combat Score_RollAvg_TeamA,Average Combat Score_RollAvg_TeamB,Average Damage Per Round_RollAvg_TeamA,Average Damage Per Round_RollAvg_TeamB,KDA_RollAvg_TeamA,KDA_RollAvg_TeamB,"Kill, Assist, Trade, Survive %_RollAvg_TeamA","Kill, Assist, Trade, Survive %_RollAvg_TeamB",First Blood %_RollAvg_TeamA,First Blood %_RollAvg_TeamB,Headshot %_RollAvg_TeamA,Headshot %_RollAvg_TeamB,Clutches_RollAvg_TeamA,Clutches_RollAvg_TeamB,Econ_RollAvg_TeamA,Econ_RollAvg_TeamB
0,0.447334,0.427049,0.0,0.0,0.4,0.6,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.45,0.5625,0.463308,0.436111,0.0,0.025,0.924,0.971,194.4,197.86,125.38,129.62,1.301095,1.379461,0.7012,0.7036,0.513037,0.475159,0.2098,0.2452,1.6,1.8,50.04,52.34
1,0.561244,0.371567,0.0,0.0,0.3,0.4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.545014,0.436111,0.516667,0.025,0.0,0.9364,1.0512,194.06,211.96,127.78,136.7,1.312059,1.607471,0.6876,0.735,0.460159,0.550017,0.2482,0.251,1.8,1.4,51.3,57.5
2,0.628433,0.427049,0.0,0.0,0.6,0.7,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.545014,0.5,0.516667,0.436111,0.0,0.025,1.0512,0.9364,211.96,194.06,136.7,127.78,1.607471,1.312059,0.735,0.6876,0.550017,0.460159,0.251,0.2482,1.4,1.8,57.5,51.3
3,0.561244,0.371567,0.0,0.0,0.3,0.4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.491667,0.520014,0.477778,0.475,0.025,0.0,0.9648,1.0178,198.04,207.98,130.76,134.68,1.367839,1.434472,0.6984,0.7212,0.472659,0.531267,0.243,0.2582,1.9,1.4,52.7,54.7
4,0.628433,0.427049,0.0,0.0,0.6,0.7,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.520014,0.491667,0.475,0.477778,0.0,0.025,1.0178,0.9648,207.98,198.04,134.68,130.76,1.434472,1.367839,0.7212,0.6984,0.531267,0.472659,0.2582,0.243,1.4,1.9,54.7,52.7


In [8]:
# (rows, columns) of the test feature matrix.
# NOTE(review): the saved output reports 43 columns while `features` lists 47
# names, so the stored output looks stale — re-run the notebook to refresh it.
X_test.shape

(10125, 43)

## Scaling features (for Logistic Regression only)

In [9]:
# Fit the scaler on the training split only, then apply the same transform to
# both splits so the test data never influences the scaling statistics.
# NOTE(review): the scaler still sees all of X_train, including rows that later
# serve as validation folds in the TimeSeriesSplit cross-validation — consider
# scaling inside a Pipeline if that matters.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model Selection

We will be using the following models:
- Logistic Regression
- Random Forest
- XGBoost

# Model Training and Evaluation

Helper function for cross-validating the models on time-ordered (`TimeSeriesSplit`) folds

In [10]:
SCORING = ["accuracy", "precision", "recall", "f1", "roc_auc"]

def evaluate_model(model, X_train, y_train, cv=5, scoring=SCORING):
    """Cross-validate ``model`` on time-ordered folds and print each metric's mean.

    Parameters
    ----------
    model : estimator
        Classifier passed unchanged to ``cross_validate``.
    X_train, y_train : array-like
        Features and target passed to ``cross_validate``.
    cv : int, default 5
        Number of splits given to ``TimeSeriesSplit``.
    scoring : str, iterable of str, or dict, default ``SCORING``
        Scorer name(s) to compute and report. A dict maps report names to scorers.

    Returns
    -------
    None
        One ``"<metric>: <mean test score>"`` line is printed per metric.
    """
    # A lone scorer name would be reported under the generic "test_score" key;
    # wrapping it in a list keeps the "test_<metric>" keys uniform.
    if isinstance(scoring, str):
        scoring = [scoring]
    elif not isinstance(scoring, dict):
        scoring = list(scoring)
    metrics = list(scoring)

    tscv = TimeSeriesSplit(n_splits=cv)
    scores = cross_validate(model, X_train, y_train, cv=tscv, scoring=scoring)

    # Report the metrics that were actually requested, not the module-level default.
    for metric in metrics:
        print(f"{metric}: {scores[f'test_{metric}'].mean()}")

## Training baseline models

In [11]:
# Untuned reference models: each estimator keeps its defaults and only receives
# the shared seed.
logreg_base, rf_base, xgb_base = (
    estimator_cls(random_state=RANDOM_SEED)
    for estimator_cls in (LogisticRegression, RandomForestClassifier, XGBClassifier)
)

In [12]:

# Logistic regression is evaluated on the standardised features.
print("Logistic Regression Baseline")
evaluate_model(logreg_base, X_train_scaled, y_train)

Logistic Regression Baseline
accuracy: 0.6401244628833902
precision: 0.6442510573823335
recall: 0.6255933609958506
f1: 0.6347549979983406
roc_auc: 0.6999770796285318


In [13]:
# The random forest is evaluated on the raw (unscaled) features.
print("Random Forest Baseline")
evaluate_model(rf_base, X_train, y_train)

Random Forest Baseline
accuracy: 0.6546451326122389
precision: 0.6571254593167938
recall: 0.6467517969659048
f1: 0.6518551835545527
roc_auc: 0.724818819293508


In [14]:
# XGBoost is evaluated on the raw (unscaled) features.
print("XGBoost Baseline")
evaluate_model(xgb_base, X_train, y_train)

XGBoost Baseline
accuracy: 0.6421988442732257
precision: 0.6402750492669487
recall: 0.6492420909350369
f1: 0.6445866901474966
roc_auc: 0.7057225844694723


## Hyperparameter tuning

In [15]:
# Shared CV splitter for every search below: each fold validates on rows that
# come after its training rows, so the splits are chronological only if the
# rows are sorted by time — TODO confirm the preprocessed file is time-ordered.
tscv = TimeSeriesSplit(n_splits=5)

# Number of parameter settings sampled by each RandomizedSearchCV (its n_iter).
ITERATIONS = 100

# Number of CPU cores to use (-1 means all cores)
N_JOBS = -1

### Randomized Search

In [None]:
# Logistic Regression
# Candidate values sampled by the randomized search.
logreg_params = {
    "C": [0.001, 0.01, 0.1, 1, 10, 100],
    "penalty": ["l1", "l2"],
    "solver": ["liblinear", "saga"],
    "max_iter": [100, 200, 300, 400, 500, 1000]
}

# Every metric in SCORING is recorded per candidate; the candidate with the best
# mean accuracy (refit="accuracy") is refit and exposed as best_estimator_.
logreg_random = RandomizedSearchCV(LogisticRegression(random_state=RANDOM_SEED),
                                   logreg_params,
                                   n_iter=ITERATIONS,
                                   cv=tscv,
                                   scoring=SCORING,
                                   refit="accuracy",
                                   random_state=RANDOM_SEED,
                                   n_jobs=N_JOBS,
                                   error_score="raise")

logreg_random.fit(X_train_scaled, y_train)

print("Logistic Regression Random Tuned")
print("Best params:", logreg_random.best_params_)
# best_score_ is the mean CV score of the refit metric (accuracy here), so take
# the label from the search object instead of hard-coding "ROC-AUC".
print(f"Best score ({logreg_random.refit}):", logreg_random.best_score_)

print("\nCross-validation scores:")
evaluate_model(logreg_random.best_estimator_, X_train_scaled, y_train)

# Save model
joblib.dump(logreg_random.best_estimator_, "models/logreg_random_v1.pkl")

In [None]:
# Random Forest
# Candidate values sampled by the randomized search.
rf_params = {
    "n_estimators": [50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    "max_depth": [None, 5, 10, 15, 20, 25, 30, 50, 100],
    "min_samples_split": [2, 5, 10, 15, 20],
    "min_samples_leaf": [1, 2, 4, 8, 16],
    "max_features": ["sqrt", "log2", None],
    "bootstrap": [True, False]
}

# Every metric in SCORING is recorded per candidate; the candidate with the best
# mean accuracy (refit="accuracy") is refit and exposed as best_estimator_.
rf_random = RandomizedSearchCV(RandomForestClassifier(random_state=RANDOM_SEED),
                              rf_params,
                              n_iter=ITERATIONS,
                              cv=tscv,
                              scoring=SCORING,
                              refit="accuracy",
                              random_state=RANDOM_SEED,
                              n_jobs=N_JOBS,
                              error_score="raise")

rf_random.fit(X_train, y_train)

print("Random Forest Random Tuned")
print("Best params:", rf_random.best_params_)
# best_score_ is the mean CV score of the refit metric (accuracy here), so take
# the label from the search object instead of hard-coding "ROC-AUC".
print(f"Best score ({rf_random.refit}):", rf_random.best_score_)

print("\nCross-validation scores:")
evaluate_model(rf_random.best_estimator_, X_train, y_train)

# Save model
joblib.dump(rf_random.best_estimator_, "models/rf_random_v1.pkl")

In [None]:
# XGBoost
# Discrete candidate values; the search samples ITERATIONS combinations of them.
xgb_params = dict(
    n_estimators=[50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    learning_rate=[0.0001, 0.001, 0.01, 0.05, 0.1, 0.2, 0.3],
    max_depth=[3, 4, 5, 6, 8, 10, 12, 15],
    subsample=[0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    colsample_bytree=[0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    gamma=[0, 0.01, 0.1, 0.5, 1, 1.5, 2, 5, 10],
    reg_alpha=[0, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10],
    reg_lambda=[0, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10],
)

# All SCORING metrics are recorded; the candidate with the best mean ROC-AUC is
# the one that gets refit and exposed as best_estimator_.
xgb_tuned = RandomizedSearchCV(
    estimator=XGBClassifier(random_state=RANDOM_SEED),
    param_distributions=xgb_params,
    n_iter=ITERATIONS,
    scoring=SCORING,
    refit="roc_auc",
    cv=tscv,
    n_jobs=N_JOBS,
    random_state=RANDOM_SEED,
    error_score="raise",
)
xgb_tuned.fit(X_train, y_train)

# Summary of the winning candidate.
print("XGBoost Random Tuned")
print("Best params:", xgb_tuned.best_params_)
print("Best score (ROC-AUC):", xgb_tuned.best_score_)

# Full metric breakdown for the winning configuration.
print("\nCross-validation scores:")
evaluate_model(xgb_tuned.best_estimator_, X_train, y_train)

# Persist the refit best estimator.
joblib.dump(xgb_tuned.best_estimator_, "models/xgb_random_v1.pkl")

### Optuna

In [35]:
# Random Forest Tuning
def tune_rf(X_train, y_train, tscv, scoring, random_state, n_iter=100):
    """Tune RandomForestClassifier hyperparameters with an Optuna study.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and target, passed to ``cross_val_score``.
    tscv : cross-validation splitter
        Passed as ``cv`` to ``cross_val_score``.
    scoring : str
        Scorer name; its mean over the folds is the value being maximised.
    random_state : int
        Seed for both the forest and the Optuna sampler.
    n_iter : int, default 100
        Number of Optuna trials.

    Returns
    -------
    tuple
        ``(best_params, best_value)`` of the finished study.
    """

    def rf_objective(trial):
        """Return the mean CV score of a forest built from the trial's suggestions."""
        rf_params = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
            "max_depth": trial.suggest_int("max_depth", 5, 50),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 32),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 32),
            "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
        }

        rf = RandomForestClassifier(random_state=random_state, n_jobs=-1, bootstrap=True, **rf_params)
        scores = cross_val_score(rf, X_train, y_train, cv=tscv, scoring=scoring)
        return scores.mean()

    # Without a seeded sampler every run proposes a different trial sequence, so
    # `random_state` alone did not make the tuning reproducible. TPE is Optuna's
    # default sampler, so only the seeding changes.
    sampler = optuna.samplers.TPESampler(seed=random_state)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(rf_objective, n_trials=n_iter)
    return study.best_params, study.best_value


# Run the Optuna study for the random forest, maximising mean CV accuracy.
rf_params_optuna, rf_score_optuna = tune_rf(
    X_train,
    y_train,
    tscv=tscv,
    scoring="accuracy",
    random_state=RANDOM_SEED,
)

# Report the winning configuration and its score.
print("\nRandom Forest Optuna Tuned")
print("Best params:", rf_params_optuna)
print("Best score:", rf_score_optuna)


[I 2024-09-22 01:14:03,049] A new study created in memory with name: no-name-de824323-417d-4b42-8d0f-3defc6b41f7a
[I 2024-09-22 01:14:25,227] Trial 0 finished with value: 0.6697288487183286 and parameters: {'n_estimators': 707, 'max_depth': 48, 'min_samples_split': 7, 'min_samples_leaf': 16, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.6697288487183286.
[I 2024-09-22 01:14:51,718] Trial 1 finished with value: 0.6688990961623944 and parameters: {'n_estimators': 837, 'max_depth': 30, 'min_samples_split': 20, 'min_samples_leaf': 20, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.6697288487183286.
[I 2024-09-22 01:17:04,654] Trial 2 finished with value: 0.6532226996592089 and parameters: {'n_estimators': 535, 'max_depth': 32, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': None}. Best is trial 0 with value: 0.6697288487183286.
[I 2024-09-22 01:17:10,432] Trial 3 finished with value: 0.6670617869313972 and parameters: {'n_estimators': 167, 'max_depth': 48, '


Random Forest Optuna Tuned
Best params: {'n_estimators': 439, 'max_depth': 41, 'min_samples_split': 23, 'min_samples_leaf': 29, 'max_features': 'sqrt'}
Best score: 0.6706771373536821


In [36]:
# Rebuild the forest from the best Optuna parameters and report all metrics.
# NOTE(review): tuning passed bootstrap=True explicitly but it is not part of
# rf_params_optuna; this presumably relies on the estimator default — confirm.
rf_tuned_optuna = RandomForestClassifier(random_state=RANDOM_SEED, n_jobs=-1, **rf_params_optuna)

print("Random Forest Optuna Tuned")
evaluate_model(rf_tuned_optuna, X_train, y_train)

Random Forest Optuna Tuned
accuracy: 0.6706771373536821
precision: 0.6697207573064079
recall: 0.6743712661090254
f1: 0.6718755463153838
roc_auc: 0.7388560714834573


In [42]:
# Save model
# Fit the Optuna-tuned forest on the full training set, then persist it.
# NOTE(review): assumes the models/ directory already exists — confirm.
rf_tuned_optuna.fit(X_train, y_train)
joblib.dump(rf_tuned_optuna, "models/rf_optuna_v2.pkl")
print("Model trained and saved")

Model trained and saved


In [37]:
# XGBoost Tuning
def tune_xgb(X_train, y_train, tscv, scoring, random_state, n_iter=100):
    """Tune XGBClassifier hyperparameters with an Optuna study.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and target, passed to ``cross_val_score``.
    tscv : cross-validation splitter
        Passed as ``cv`` to ``cross_val_score``.
    scoring : str
        Scorer name; its mean over the folds is the value being maximised.
    random_state : int
        Seed for both the classifier and the Optuna sampler.
    n_iter : int, default 100
        Number of Optuna trials.

    Returns
    -------
    tuple
        ``(best_params, best_value)`` of the finished study.
    """

    def xgb_objective(trial):
        """Return the mean CV score of a booster built from the trial's suggestions."""
        xgb_params = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
            # log=True samples these ranges on a logarithmic scale.
            "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True),
            "max_depth": trial.suggest_int("max_depth", 3, 20),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
            "gamma": trial.suggest_float("gamma", 0, 5),
            "reg_alpha": trial.suggest_float("reg_alpha", 1e-5, 10.0, log=True),
            "reg_lambda": trial.suggest_float("reg_lambda", 1e-5, 10.0, log=True),
        }

        xgb = XGBClassifier(random_state=random_state, n_jobs=-1, **xgb_params)
        scores = cross_val_score(xgb, X_train, y_train, cv=tscv, scoring=scoring)
        return scores.mean()

    # Without a seeded sampler every run proposes a different trial sequence, so
    # `random_state` alone did not make the tuning reproducible. TPE is Optuna's
    # default sampler, so only the seeding changes.
    sampler = optuna.samplers.TPESampler(seed=random_state)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(xgb_objective, n_trials=n_iter)
    return study.best_params, study.best_value

# Run the Optuna study for XGBoost, maximising mean CV accuracy.
xgb_params_optuna, xgb_score_optuna = tune_xgb(
    X_train,
    y_train,
    tscv=tscv,
    scoring="accuracy",
    random_state=RANDOM_SEED,
)

# Report the winning configuration and its score.
print("\nXGBoost Optuna Tuned")
print("Best params:", xgb_params_optuna)
print("Best score:", xgb_score_optuna)

[I 2024-09-22 01:51:28,265] A new study created in memory with name: no-name-ccb5db23-1f76-431c-94c1-7ab1531682d9
[I 2024-09-22 01:51:38,452] Trial 0 finished with value: 0.6589124314713292 and parameters: {'n_estimators': 181, 'learning_rate': 0.0025269173189151947, 'max_depth': 9, 'subsample': 0.8172076896954834, 'colsample_bytree': 0.9165033704318796, 'gamma': 2.3283971753848127, 'reg_alpha': 0.0005270653200605051, 'reg_lambda': 7.951326833969146e-05}. Best is trial 0 with value: 0.6589124314713292.
[I 2024-09-22 01:51:47,481] Trial 1 finished with value: 0.6679804415468957 and parameters: {'n_estimators': 920, 'learning_rate': 0.007476610487055094, 'max_depth': 6, 'subsample': 0.9114002113464024, 'colsample_bytree': 0.8405211650749881, 'gamma': 3.048068938978585, 'reg_alpha': 8.638091353493817, 'reg_lambda': 0.0007502825765550572}. Best is trial 1 with value: 0.6679804415468957.
[I 2024-09-22 01:51:53,627] Trial 2 finished with value: 0.6517409986664692 and parameters: {'n_estimato


XGBoost Optuna Tuned
Best params: {'n_estimators': 279, 'learning_rate': 0.00804210815880431, 'max_depth': 9, 'subsample': 0.7016342803624044, 'colsample_bytree': 0.4809894955932661, 'gamma': 3.862525199223897, 'reg_alpha': 0.7653156447707654, 'reg_lambda': 5.940053230183238}
Best score: 0.669521410579345


In [38]:
# Rebuild the classifier from the best Optuna parameters (same seed and n_jobs
# as during tuning) and report all metrics.
xgb_tuned_optuna = XGBClassifier(random_state=RANDOM_SEED, n_jobs=-1, **xgb_params_optuna)

print("\nXGBoost Optuna Tuned")
evaluate_model(xgb_tuned_optuna, X_train, y_train)


XGBoost Optuna Tuned
accuracy: 0.669521410579345
precision: 0.6675019523713092
recall: 0.6766253660892664
f1: 0.671688036354927
roc_auc: 0.7391724077367231


In [43]:
# Save model
# Fit the Optuna-tuned classifier on the full training set, then persist it.
# NOTE(review): assumes the models/ directory already exists — confirm.
xgb_tuned_optuna.fit(X_train, y_train)
joblib.dump(xgb_tuned_optuna, "models/xgb_optuna_v2.pkl")
print("Model trained and saved")

Model trained and saved


In [39]:
# Logistic Regression Tuning
def tune_logreg(X_train, y_train, tscv, scoring, random_state, n_iter=100):
    """Tune a LogisticRegression with an Optuna study and return the best trial.

    Each trial samples ``C`` (log-uniform in [1e-5, 100]), ``penalty``
    ("l1" or "l2") and ``solver`` ("liblinear" or "saga"), then scores the
    resulting model with ``cross_val_score`` and maximises the mean score.

    Parameters
    ----------
    X_train, y_train
        Training features and labels forwarded to ``cross_val_score``.
    tscv
        Value passed as ``cv=`` to ``cross_val_score``.
    scoring
        Value passed as ``scoring=`` to ``cross_val_score``.
    random_state
        Seed used for the estimator and for the Optuna sampler, so that
        repeated runs explore the same sequence of trials.
    n_iter : int, default=100
        Number of Optuna trials to run.

    Returns
    -------
    tuple
        ``(study.best_params, study.best_value)``.
    """

    def logreg_objective(trial):
        # Search space for a single trial
        logreg_params = {
            "C": trial.suggest_float("C", 1e-5, 100, log=True),
            "penalty": trial.suggest_categorical("penalty", ["l1", "l2"]),
            "solver": trial.suggest_categorical("solver", ["liblinear", "saga"]),
        }

        logreg = LogisticRegression(random_state=random_state, max_iter=2000, **logreg_params)
        scores = cross_val_score(logreg, X_train, y_train, cv=tscv, scoring=scoring)
        return scores.mean()

    # Seed the sampler explicitly: without it the study draws different
    # hyperparameters on every run and the reported "best" is not reproducible.
    sampler = optuna.samplers.TPESampler(seed=random_state)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(logreg_objective, n_trials=n_iter)
    return study.best_params, study.best_value


# Tune model: run the Optuna search on the scaled features, optimising accuracy
logreg_params_optuna, logreg_score_optuna = tune_logreg(
    X_train=X_train_scaled,
    y_train=y_train,
    tscv=tscv,
    scoring="accuracy",
    random_state=RANDOM_SEED,
)

# Display results: winning hyperparameters and their mean CV score
print("Logistic Regression Optuna Tuned")
print("Best params:", logreg_params_optuna)
print("Best score:", logreg_score_optuna)

[I 2024-09-22 02:10:46,497] A new study created in memory with name: no-name-e1dc3ce4-8c1c-4282-bde1-2fca67ed530c
[I 2024-09-22 02:10:48,570] Trial 0 finished with value: 0.6401244628833901 and parameters: {'C': 2.24242622950689, 'penalty': 'l2', 'solver': 'liblinear'}. Best is trial 0 with value: 0.6401244628833901.
[I 2024-09-22 02:10:48,949] Trial 1 finished with value: 0.6499925914950363 and parameters: {'C': 0.002681332800100651, 'penalty': 'l1', 'solver': 'liblinear'}. Best is trial 1 with value: 0.6499925914950363.
[I 2024-09-22 02:10:49,914] Trial 2 finished with value: 0.6496666172766336 and parameters: {'C': 0.0042025114025820215, 'penalty': 'l2', 'solver': 'liblinear'}. Best is trial 1 with value: 0.6499925914950363.
[I 2024-09-22 02:11:00,155] Trial 3 finished with value: 0.6418136020151134 and parameters: {'C': 0.6405232467370149, 'penalty': 'l1', 'solver': 'liblinear'}. Best is trial 1 with value: 0.6499925914950363.
[I 2024-09-22 02:11:01,811] Trial 4 finished with value

Logistic Regression Optuna Tuned
Best params: {'C': 0.00023433365823034136, 'penalty': 'l2', 'solver': 'liblinear'}
Best score: 0.6521262409245814


In [40]:
# Rebuild logistic regression from the best Optuna trial (same seed and iteration cap as during tuning)
logreg_tuned_optuna = LogisticRegression(
    **logreg_params_optuna,
    max_iter=2000,
    random_state=RANDOM_SEED,
)

# Evaluated on the scaled features, matching the data used for the search
print("Logistic Regression Optuna Tuned")
evaluate_model(logreg_tuned_optuna, X_train_scaled, y_train)

Logistic Regression Optuna Tuned
accuracy: 0.6521262409245814
precision: 0.6568037436725922
recall: 0.636499488462974
f1: 0.6464430693078328
roc_auc: 0.712426178401282


In [None]:
# TODO: LightGBM tuning (not implemented yet)


In [None]:
# TODO: CatBoost tuning (not implemented yet)


In [None]:
# TODO: SVM tuning (not implemented yet)


## Model Stacking

In [41]:
# Level-1 (base) estimators: trained on the training set
base_models = [("rf", rf_tuned_optuna), ("xgb", xgb_tuned_optuna)]

# Level-2 (meta) estimator: trained on the predictions of the base models.
# Alternatives kept for reference (not active):
#   RandomForestClassifier(random_state=RANDOM_SEED, n_jobs=-1, **rf_params_optuna)
#   XGBClassifier(random_state=RANDOM_SEED, **xgb_params_optuna)
meta_model = LogisticRegression(max_iter=2000, random_state=RANDOM_SEED)

# Combine both levels so the meta-model can build on the base models' predictions.
# NOTE(review): no cv= is passed, so the stacker uses its default internal
# splitter rather than tscv - confirm this is intended for time-ordered data.
stacking_model = StackingClassifier(
    final_estimator=meta_model,
    estimators=base_models,
    n_jobs=-1,
)

print("Stacking Classifier")
evaluate_model(stacking_model, X_train, y_train)

Stacking Classifier
accuracy: 0.6702326270558602
precision: 0.6700408107386963
recall: 0.6717048277678983
f1: 0.670564636958488
roc_auc: 0.7395990427890843


In [44]:
# Stacked Classifier
import os

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists before spending time on the fit.
os.makedirs("models", exist_ok=True)

# Fit the stacker on the full training data, then persist the fitted estimator
stacking_model.fit(X_train, y_train)
joblib.dump(stacking_model, "models/stacked_base_rf_xgb_meta_lr_v2.pkl")
print("Model trained and saved")

Model trained and saved


## Feature selection

In [45]:
from sklearn.feature_selection import SelectFromModel

# Wrap the tuned random forest in a selector; threshold="mean" uses the mean
# feature importance as the cut-off for keeping a column.
sfm = SelectFromModel(estimator=rf_tuned_optuna, threshold="mean").fit(X_train, y_train)

# Translate the selector's boolean support mask into column names
selected_features = X_train.columns[sfm.get_support()]
print(f"Selected Features: {selected_features}")

Selected Features: Index(['Team A_Encoded', 'Team B_Encoded', 'Recent Win %_TeamA',
       'Attacker Win %_RollAvg_TeamA', 'Attacker Win %_RollAvg_TeamB',
       'Defender Win %_RollAvg_TeamA', 'Defender Win %_RollAvg_TeamB',
       'Average Combat Score_RollAvg_TeamA',
       'Average Combat Score_RollAvg_TeamB',
       'Average Damage Per Round_RollAvg_TeamA',
       'Average Damage Per Round_RollAvg_TeamB', 'KDA_RollAvg_TeamA',
       'KDA_RollAvg_TeamB', 'Kill, Assist, Trade, Survive %_RollAvg_TeamA',
       'Kill, Assist, Trade, Survive %_RollAvg_TeamB',
       'First Blood %_RollAvg_TeamA', 'First Blood %_RollAvg_TeamB'],
      dtype='object')


In [49]:
# Number of columns kept by the random-forest-based selector
len(selected_features)

17

In [46]:
# Same selection procedure as for the random forest, driven by the tuned XGBoost model
sfm_xgb = SelectFromModel(estimator=xgb_tuned_optuna, threshold="mean").fit(X_train, y_train)

# Translate the selector's boolean support mask into column names
xgb_selected_features = X_train.columns[sfm_xgb.get_support()]
print(f"Selected Features: {xgb_selected_features}")

Selected Features: Index(['Team A_Encoded', 'Team B_Encoded', 'Recent Win %_TeamA',
       'Recent Win %_TeamB', 'Attacker Win %_RollAvg_TeamA',
       'Attacker Win %_RollAvg_TeamB', 'Defender Win %_RollAvg_TeamB',
       'Average Combat Score_RollAvg_TeamA',
       'Average Combat Score_RollAvg_TeamB',
       'Average Damage Per Round_RollAvg_TeamA',
       'Average Damage Per Round_RollAvg_TeamB', 'KDA_RollAvg_TeamA',
       'KDA_RollAvg_TeamB', 'Kill, Assist, Trade, Survive %_RollAvg_TeamB'],
      dtype='object')


In [50]:
# Number of columns kept by the XGBoost-based selector
len(xgb_selected_features)

14

## Retraining the models and cross-validation

In [51]:
# Restrict train and test frames to the columns picked by the random-forest selector.
# NOTE(review): xgb_selected_features is computed earlier but not used here, so
# both reduced models share the RF-selected columns - confirm this is intended.
X_train_selected = X_train.loc[:, selected_features]
X_test_selected = X_test.loc[:, selected_features]

# Untuned baselines on the reduced feature set (default hyperparameters, fixed seed)
rf_reduced_feats = RandomForestClassifier(n_jobs=-1, random_state=RANDOM_SEED).fit(
    X_train_selected, y_train
)

xgb_reduced_feats = XGBClassifier(n_jobs=-1, random_state=RANDOM_SEED)
xgb_reduced_feats.fit(X_train_selected, y_train)

In [52]:
# Cross-validate both untuned reduced-feature models with the shared splitter and metric set
rf_reduced_feats_scores = cross_validate(
    estimator=rf_reduced_feats, X=X_train_selected, y=y_train, cv=tscv, scoring=SCORING
)
xgb_reduced_feats_scores = cross_validate(
    estimator=xgb_reduced_feats, X=X_train_selected, y=y_train, cv=tscv, scoring=SCORING
)

# Print the per-metric mean across folds, one section per model
for heading, cv_scores in (
    ("Random Forest Reduced Features", rf_reduced_feats_scores),
    ("\nXGBoost Reduced Features", xgb_reduced_feats_scores),
):
    print(heading)
    for metric in SCORING:
        print(f"{metric}: {cv_scores[f'test_{metric}'].mean()}")

Random Forest Reduced Features
accuracy: 0.6535486738776115
precision: 0.6584465049590867
recall: 0.6380979604382093
f1: 0.6479024010097757
roc_auc: 0.7214904827767897

XGBoost Reduced Features
accuracy: 0.6405097051415025
precision: 0.6388758528488012
recall: 0.6465160245010867
f1: 0.6426698795914341
roc_auc: 0.7062244352236053


In [53]:
# Test set evaluation
def evaluate_test_set(model, X_test, y_test):
    """Print classification metrics for an already-fitted model on held-out data.

    Hard predictions from ``model.predict`` feed accuracy, precision, recall
    and F1; the second column of ``model.predict_proba`` feeds ROC-AUC.
    Nothing is returned - the metrics are only printed.
    """
    hard_labels = model.predict(X_test)
    positive_scores = model.predict_proba(X_test)[:, 1]

    # (label, metric function, predictions that metric consumes), in print order
    report = (
        ("Accuracy:", accuracy_score, hard_labels),
        ("Precision:", precision_score, hard_labels),
        ("Recall:", recall_score, hard_labels),
        ("F1:", f1_score, hard_labels),
        ("ROC-AUC:", roc_auc_score, positive_scores),
    )
    for label, metric_fn, predictions in report:
        print(label, metric_fn(y_test, predictions))


# Score both untuned reduced-feature models on the held-out test set
for heading, fitted_model in (
    ("Random Forest Reduced Features Test Set", rf_reduced_feats),
    ("\nXGBoost Reduced Features Test Set", xgb_reduced_feats),
):
    print(heading)
    evaluate_test_set(model=fitted_model, X_test=X_test_selected, y_test=y_test)

Random Forest Reduced Features Test Set
Accuracy: 0.5871604938271605
Precision: 0.5880007972892166
Recall: 0.5826585028639146
F1: 0.5853174603174603
ROC-AUC: 0.6275024965950556

XGBoost Reduced Features Test Set
Accuracy: 0.5820246913580247
Precision: 0.5802588371643809
Recall: 0.5933241161366779
F1: 0.58671875
ROC-AUC: 0.6213230092614956


In [54]:
# Tuning: repeat the random-forest Optuna search on the reduced feature set, optimising accuracy
rf_reduced_params, rf_reduced_best_score = tune_rf(
    X_train_selected,
    y_train,
    tscv,
    "accuracy",
    RANDOM_SEED,
)

# Winning hyperparameters and the score they achieved in the search
print("Random Forest Reduced Features Optuna Tuned")
print("Best params:", rf_reduced_params)
print("Best score:", rf_reduced_best_score)

[I 2024-09-22 19:46:27,765] A new study created in memory with name: no-name-403e3571-2c49-4158-941c-216cb2bda141
[I 2024-09-22 19:47:42,054] Trial 0 finished with value: 0.6632389983701288 and parameters: {'n_estimators': 636, 'max_depth': 49, 'min_samples_split': 24, 'min_samples_leaf': 24, 'max_features': None}. Best is trial 0 with value: 0.6632389983701288.
[I 2024-09-22 19:47:58,767] Trial 1 finished with value: 0.6676841013483479 and parameters: {'n_estimators': 454, 'max_depth': 10, 'min_samples_split': 21, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 1 with value: 0.6676841013483479.
[I 2024-09-22 19:49:07,936] Trial 2 finished with value: 0.6652837457401096 and parameters: {'n_estimators': 801, 'max_depth': 7, 'min_samples_split': 25, 'min_samples_leaf': 13, 'max_features': None}. Best is trial 1 with value: 0.6676841013483479.
[I 2024-09-22 19:49:52,446] Trial 3 finished with value: 0.6663209364350273 and parameters: {'n_estimators': 879, 'max_depth': 12, 'm

Random Forest Reduced Features Optuna Tuned
Best params: {'n_estimators': 631, 'max_depth': 31, 'min_samples_split': 28, 'min_samples_leaf': 27, 'max_features': 'sqrt'}
Best score: 0.6693732404800711


In [55]:
# Build the random forest from the hyperparameters tuned on the reduced feature set
rf_reduced_tuned = RandomForestClassifier(
    **rf_reduced_params,
    n_jobs=-1,
    random_state=RANDOM_SEED,
)

print("Random Forest Reduced Features Optuna Tuned")
# Fit on the full training data; this fitted estimator is what the test-set cell scores
rf_reduced_tuned.fit(X=X_train_selected, y=y_train)

# Cross validation
rf_reduced_tuned_scores = cross_validate(
    estimator=rf_reduced_tuned,
    X=X_train_selected,
    y=y_train,
    cv=tscv,
    scoring=SCORING,
)

# Mean of each metric across the CV folds
for metric in SCORING:
    fold_scores = rf_reduced_tuned_scores[f"test_{metric}"]
    print(f"{metric}: {fold_scores.mean()}")

Random Forest Reduced Features Optuna Tuned
accuracy: 0.6693732404800711
precision: 0.6671145541622379
recall: 0.6765644558607213
f1: 0.671629370190142
roc_auc: 0.7384522075127884


In [56]:
# Test set evaluation: score the tuned reduced-feature random forest on held-out data
print("Random Forest Reduced Features Tuned Test Set")
evaluate_test_set(model=rf_reduced_tuned, X_test=X_test_selected, y_test=y_test)

Random Forest Reduced Features Tuned Test Set
Accuracy: 0.5986172839506173
Precision: 0.6002810680586228
Recall: 0.5905589571400356
F1: 0.5953803265631222
ROC-AUC: 0.640996712852277


In [57]:
# XGBoost Reduced Features: repeat the Optuna search on the reduced feature set, optimising accuracy
xgb_reduced_params, xgb_reduced_best_score = tune_xgb(
    X_train_selected,
    y_train,
    tscv,
    "accuracy",
    RANDOM_SEED,
)

# Winning hyperparameters and the score they achieved in the search
print("XGBoost Reduced Features Optuna Tuned")
print("Best params:", xgb_reduced_params)
print("Best score:", xgb_reduced_best_score)

[I 2024-09-22 20:36:35,027] A new study created in memory with name: no-name-7dd002d7-b2ec-4b0d-8bda-4502802a3ebf
[I 2024-09-22 20:37:13,061] Trial 0 finished with value: 0.650466735812713 and parameters: {'n_estimators': 982, 'learning_rate': 0.019921122083847005, 'max_depth': 16, 'subsample': 0.604644832479114, 'colsample_bytree': 0.7020619949126894, 'gamma': 1.2492983189755962, 'reg_alpha': 0.0009213098010957778, 'reg_lambda': 1.4904755384045978}. Best is trial 0 with value: 0.650466735812713.
[I 2024-09-22 20:37:22,145] Trial 1 finished with value: 0.6630315602311454 and parameters: {'n_estimators': 260, 'learning_rate': 0.013502952210469868, 'max_depth': 16, 'subsample': 0.8033270116723275, 'colsample_bytree': 0.7276119207689231, 'gamma': 3.803772978577647, 'reg_alpha': 0.17730451060219618, 'reg_lambda': 3.1414577964353056}. Best is trial 1 with value: 0.6630315602311454.
[I 2024-09-22 20:37:32,769] Trial 2 finished with value: 0.662231441695066 and parameters: {'n_estimators': 29

XGBoost Reduced Features Optuna Tuned
Best params: {'n_estimators': 518, 'learning_rate': 0.004983315523177665, 'max_depth': 13, 'subsample': 0.6106857957877571, 'colsample_bytree': 0.5808598044555509, 'gamma': 4.30992302896286, 'reg_alpha': 6.062777929424166, 'reg_lambda': 6.003877366430758e-05}
Best score: 0.6697584827381834


In [58]:
# Build XGBoost from the hyperparameters tuned on the reduced feature set
xgb_reduced_tuned = XGBClassifier(
    **xgb_reduced_params,
    n_jobs=-1,
    random_state=RANDOM_SEED,
)

print("XGBoost Reduced Features Optuna Tuned")
# Fit on the full training data; this fitted estimator is what the test-set cell scores
xgb_reduced_tuned.fit(X=X_train_selected, y=y_train)

# Cross validation
xgb_reduced_tuned_scores = cross_validate(
    estimator=xgb_reduced_tuned,
    X=X_train_selected,
    y=y_train,
    cv=tscv,
    scoring=SCORING,
)

# Mean of each metric across the CV folds
for metric in SCORING:
    fold_scores = xgb_reduced_tuned_scores[f"test_{metric}"]
    print(f"{metric}: {fold_scores.mean()}")

XGBoost Reduced Features Optuna Tuned
accuracy: 0.6697584827381834
precision: 0.6671377748462455
recall: 0.6782240136995322
f1: 0.6724907446429867
roc_auc: 0.7379171617379086


In [59]:
# Test set evaluation: score the tuned reduced-feature XGBoost model on held-out data
print("XGBoost Reduced Features Tuned Test Set")
evaluate_test_set(model=xgb_reduced_tuned, X_test=X_test_selected, y_test=y_test)

XGBoost Reduced Features Tuned Test Set
Accuracy: 0.6004938271604938
Precision: 0.5989887203422792
Recall: 0.6083349792613075
F1: 0.6036256736893679
ROC-AUC: 0.6447931878169126
