In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV,TimeSeriesSplit, cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier


# Render DataFrames in full: no limit on displayed columns or rows
pd.set_option(
    "display.max_columns", None,
    "display.max_rows", None,
)

# One seed reused by every estimator and search below, for reproducibility
RANDOM_SEED = 1

# Load the Data

In [2]:
# Read the training split from disk, then preview its first five rows
train = pd.read_csv("vct_data/train_preprocessed.csv")
train.head(n=5)

Unnamed: 0,Tournament,Stage,Match Type,Match Name,Map,Team A,Team A Score,Team A Attacker Score,Team A Defender Score,Team A Overtime Score,Team B,Team B Score,Team B Attacker Score,Team B Defender Score,Team B Overtime Score,Duration,Rating_TeamA,Average Combat Score_TeamA,Kills_TeamA,Deaths_TeamA,Assists_TeamA,Kills - Deaths (KD)_TeamA,"Kill, Assist, Trade, Survive %_TeamA",Average Damage Per Round_TeamA,Headshot %_TeamA,First Kills_TeamA,First Deaths_TeamA,Kills - Deaths (FKD)_TeamA,Rating_TeamB,Average Combat Score_TeamB,Kills_TeamB,Deaths_TeamB,Assists_TeamB,Kills - Deaths (KD)_TeamB,"Kill, Assist, Trade, Survive %_TeamB",Average Damage Per Round_TeamB,Headshot %_TeamB,First Kills_TeamB,First Deaths_TeamB,Kills - Deaths (FKD)_TeamB,Loadout Value_TeamA,Remaining Credits_TeamA,Type_TeamA,Loadout Value_TeamB,Remaining Credits_TeamB,Type_TeamB,2k_TeamA,3k_TeamA,4k_TeamA,5k_TeamA,1v1_TeamA,1v2_TeamA,1v3_TeamA,1v4_TeamA,1v5_TeamA,Econ_TeamA,Spike Plants_TeamA,Spike Defuses_TeamA,2k_TeamB,3k_TeamB,4k_TeamB,5k_TeamB,1v1_TeamB,1v2_TeamB,1v3_TeamB,1v4_TeamB,1v5_TeamB,Econ_TeamB,Spike Plants_TeamB,Spike Defuses_TeamB,Elimination_TeamA,Detonated_TeamA,Defused_TeamA,Time Expiry (No Plant)_TeamA,Eliminated_TeamA,Defused Failed_TeamA,Detonation Denied_TeamA,Time Expiry (Failed to Plant)_TeamA,Elimination_TeamB,Detonated_TeamB,Defused_TeamB,Time Expiry (No Plant)_TeamB,Eliminated_TeamB,Defused Failed_TeamB,Detonation Denied_TeamB,Time Expiry (Failed to Plant)_TeamB,KDA_TeamA,Clutches_TeamA,First Blood %_TeamA,Attacker Win %_TeamA,Defender Win %_TeamA,Overtime Win %_TeamA,Rating_RollAvg_TeamA,Average Combat Score_RollAvg_TeamA,Average Damage Per Round_RollAvg_TeamA,KDA_RollAvg_TeamA,"Kill, Assist, Trade, Survive %_RollAvg_TeamA",Attacker Win %_RollAvg_TeamA,Defender Win %_RollAvg_TeamA,Overtime Win %_RollAvg_TeamA,First Blood %_RollAvg_TeamA,Headshot %_RollAvg_TeamA,Clutches_RollAvg_TeamA,Econ_RollAvg_TeamA,Recent Win %_TeamA,KDA_TeamB,Clutches_TeamB,First Blood 
%_TeamB,Attacker Win %_TeamB,Defender Win %_TeamB,Overtime Win %_TeamB,Rating_RollAvg_TeamB,Average Combat Score_RollAvg_TeamB,Average Damage Per Round_RollAvg_TeamB,KDA_RollAvg_TeamB,"Kill, Assist, Trade, Survive %_RollAvg_TeamB",Attacker Win %_RollAvg_TeamB,Defender Win %_RollAvg_TeamB,Overtime Win %_RollAvg_TeamB,First Blood %_RollAvg_TeamB,Headshot %_RollAvg_TeamB,Clutches_RollAvg_TeamB,Econ_RollAvg_TeamB,Recent Win %_TeamB,Team A Map Win %,Team A H2H Win %,Team B Map Win %,Team B H2H Win %,Map_Abyss,Map_Ascent,Map_Bind,Map_Breeze,Map_Fracture,Map_Haven,Map_Icebox,Map_Lotus,Map_Pearl,Map_Split,Map_Sunset,Team A_Encoded,Team B_Encoded,Winner
0,Champions Tour LATAM Stage 1: Challengers 1,Open Qualifier: LAS,Round of 16,Leviatán vs Furious Gaming,Ascent,Leviatán,7,6,1,0,Furious Gaming,13,7,6,0,41.883333,0.992282,186.0,60,76,22,-16,0.701367,119.6,0.25,10,10,0,0.992282,218.4,76,60,25,16,0.701367,132.8,0.19,10,10,0,17518.95592,8480.648076,Full buy: 20k+,17518.95592,8480.648076,Full buy: 20k+,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53.592817,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53.592817,0.0,0.0,2.0,1.0,3.0,1.0,9.0,0.0,3.0,1.0,9.0,0.0,3.0,1.0,2.0,1.0,3.0,1.0,1.078947,0.0,0.5,0.5,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.683333,0.0,0.5,0.875,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.551856,0.530307,0
1,Champions Tour LATAM Stage 1: Challengers 1,Open Qualifier: LAS,Round of 16,Leviatán vs Furious Gaming,Ascent,Furious Gaming,13,7,6,0,Leviatán,7,6,1,0,41.883333,0.992282,218.4,76,60,25,16,0.701367,132.8,0.19,10,10,0,0.992282,186.0,60,76,22,-16,0.701367,119.6,0.25,10,10,0,17518.95592,8480.648076,Full buy: 20k+,17518.95592,8480.648076,Full buy: 20k+,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53.592817,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53.592817,0.0,0.0,9.0,0.0,3.0,1.0,2.0,1.0,3.0,1.0,2.0,1.0,3.0,1.0,9.0,0.0,3.0,1.0,1.683333,0.0,0.5,0.875,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.078947,0.0,0.5,0.5,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.469694,0.448144,1
2,Champions Tour LATAM Stage 1: Challengers 1,Open Qualifier: LAS,Round of 16,Leviatán vs Furious Gaming,Bind,Leviatán,7,5,2,0,Furious Gaming,13,6,7,0,38.933333,0.992282,175.8,56,81,27,-25,0.701367,113.6,0.16,10,10,0,0.992282,228.8,81,56,35,25,0.701367,164.6,0.202,10,10,0,17518.95592,8480.648076,Full buy: 20k+,17518.95592,8480.648076,Full buy: 20k+,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53.592817,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53.592817,0.0,0.0,4.0,3.0,0.0,0.0,8.0,0.0,5.0,0.0,8.0,0.0,5.0,0.0,4.0,3.0,0.0,0.0,1.024691,0.0,0.5,0.416667,0.25,0.0,0.992282,186.0,119.6,1.078947,0.701367,0.5,0.125,0.0,0.5,0.25,0.0,53.592817,0.0,2.071429,0.0,0.5,0.75,0.583333,0.0,0.992282,218.4,132.8,1.683333,0.701367,0.875,0.5,0.0,0.5,0.19,0.0,53.592817,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.551856,0.530307,0
3,Champions Tour LATAM Stage 1: Challengers 1,Open Qualifier: LAS,Round of 16,Leviatán vs Furious Gaming,Bind,Furious Gaming,13,6,7,0,Leviatán,7,5,2,0,38.933333,0.992282,228.8,81,56,35,25,0.701367,164.6,0.202,10,10,0,0.992282,175.8,56,81,27,-25,0.701367,113.6,0.16,10,10,0,17518.95592,8480.648076,Full buy: 20k+,17518.95592,8480.648076,Full buy: 20k+,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53.592817,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53.592817,0.0,0.0,8.0,0.0,5.0,0.0,4.0,3.0,0.0,0.0,4.0,3.0,0.0,0.0,8.0,0.0,5.0,0.0,2.071429,0.0,0.5,0.75,0.583333,0.0,0.992282,218.4,132.8,1.683333,0.701367,0.875,0.5,0.0,0.5,0.19,0.0,53.592817,1.0,1.024691,0.0,0.5,0.416667,0.25,0.0,0.992282,186.0,119.6,1.078947,0.701367,0.5,0.125,0.0,0.5,0.25,0.0,53.592817,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.469694,0.448144,1
4,Champions Tour LATAM Stage 1: Challengers 1,Open Qualifier: LAS,Round of 16,Procyon Team vs KRÜ Esports,Bind,Procyon Team,1,1,0,0,KRÜ Esports,13,2,11,0,28.033333,0.992282,187.2,40,67,21,-27,0.701367,130.8,0.192,6,8,-2,0.992282,253.0,67,40,23,27,0.701367,150.0,0.314,8,6,2,17518.95592,8480.648076,Full buy: 20k+,17518.95592,8480.648076,Full buy: 20k+,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53.592817,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53.592817,0.0,0.0,0.0,0.0,1.0,0.0,9.0,3.0,1.0,0.0,9.0,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.910448,0.0,0.428571,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.25,0.0,0.571429,1.0,0.916667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.389147,0


In [3]:
# Read the held-out test split from disk, then preview its first five rows
test = pd.read_csv("vct_data/test_preprocessed.csv")
test.head(n=5)

Unnamed: 0,Tournament,Stage,Match Type,Match Name,Map,Team A,Team A Score,Team A Attacker Score,Team A Defender Score,Team A Overtime Score,Team B,Team B Score,Team B Attacker Score,Team B Defender Score,Team B Overtime Score,Duration,Rating_TeamA,Average Combat Score_TeamA,Kills_TeamA,Deaths_TeamA,Assists_TeamA,Kills - Deaths (KD)_TeamA,"Kill, Assist, Trade, Survive %_TeamA",Average Damage Per Round_TeamA,Headshot %_TeamA,First Kills_TeamA,First Deaths_TeamA,Kills - Deaths (FKD)_TeamA,Rating_TeamB,Average Combat Score_TeamB,Kills_TeamB,Deaths_TeamB,Assists_TeamB,Kills - Deaths (KD)_TeamB,"Kill, Assist, Trade, Survive %_TeamB",Average Damage Per Round_TeamB,Headshot %_TeamB,First Kills_TeamB,First Deaths_TeamB,Kills - Deaths (FKD)_TeamB,Loadout Value_TeamA,Remaining Credits_TeamA,Type_TeamA,Loadout Value_TeamB,Remaining Credits_TeamB,Type_TeamB,2k_TeamA,3k_TeamA,4k_TeamA,5k_TeamA,1v1_TeamA,1v2_TeamA,1v3_TeamA,1v4_TeamA,1v5_TeamA,Econ_TeamA,Spike Plants_TeamA,Spike Defuses_TeamA,2k_TeamB,3k_TeamB,4k_TeamB,5k_TeamB,1v1_TeamB,1v2_TeamB,1v3_TeamB,1v4_TeamB,1v5_TeamB,Econ_TeamB,Spike Plants_TeamB,Spike Defuses_TeamB,Elimination_TeamA,Detonated_TeamA,Defused_TeamA,Time Expiry (No Plant)_TeamA,Eliminated_TeamA,Defused Failed_TeamA,Detonation Denied_TeamA,Time Expiry (Failed to Plant)_TeamA,Elimination_TeamB,Detonated_TeamB,Defused_TeamB,Time Expiry (No Plant)_TeamB,Eliminated_TeamB,Defused Failed_TeamB,Detonation Denied_TeamB,Time Expiry (Failed to Plant)_TeamB,KDA_TeamA,Clutches_TeamA,First Blood %_TeamA,Attacker Win %_TeamA,Defender Win %_TeamA,Overtime Win %_TeamA,Rating_RollAvg_TeamA,Average Combat Score_RollAvg_TeamA,Average Damage Per Round_RollAvg_TeamA,KDA_RollAvg_TeamA,"Kill, Assist, Trade, Survive %_RollAvg_TeamA",Attacker Win %_RollAvg_TeamA,Defender Win %_RollAvg_TeamA,Overtime Win %_RollAvg_TeamA,First Blood %_RollAvg_TeamA,Headshot %_RollAvg_TeamA,Clutches_RollAvg_TeamA,Econ_RollAvg_TeamA,Recent Win %_TeamA,KDA_TeamB,Clutches_TeamB,First Blood 
%_TeamB,Attacker Win %_TeamB,Defender Win %_TeamB,Overtime Win %_TeamB,Rating_RollAvg_TeamB,Average Combat Score_RollAvg_TeamB,Average Damage Per Round_RollAvg_TeamB,KDA_RollAvg_TeamB,"Kill, Assist, Trade, Survive %_RollAvg_TeamB",Attacker Win %_RollAvg_TeamB,Defender Win %_RollAvg_TeamB,Overtime Win %_RollAvg_TeamB,First Blood %_RollAvg_TeamB,Headshot %_RollAvg_TeamB,Clutches_RollAvg_TeamB,Econ_RollAvg_TeamB,Recent Win %_TeamB,Team A Map Win %,Team A H2H Win %,Team B Map Win %,Team B H2H Win %,Map_Abyss,Map_Ascent,Map_Bind,Map_Breeze,Map_Fracture,Map_Haven,Map_Icebox,Map_Lotus,Map_Pearl,Map_Split,Map_Sunset,Team A_Encoded,Team B_Encoded,Winner
0,Champions Tour 2024: Pacific Kickoff,Group Stage,Opening (A),T1 vs BLEED,Breeze,T1,18,6,6,6,BLEED,16,6,6,4,202.516667,1.104,214.6,126,116,36,10,0.724,145.2,0.292,15,19,-4,0.85,192.6,116,126,40,-10,0.73,125.6,0.266,19,15,4,19458.823529,4994.117647,Full buy: 20k+,20517.647059,5367.647059,Full buy: 20k+,28.0,4.0,2.0,1.0,2.0,0.0,0.0,0.0,0.0,45.8,14.0,6.0,22.0,5.0,0.0,0.0,2.0,2.0,1.0,0.0,0.0,37.4,13.0,6.0,11.0,1.0,6.0,0.0,7.0,3.0,6.0,0.0,7.0,3.0,6.0,0.0,11.0,1.0,6.0,0.0,1.396552,2.0,0.441176,0.5,0.5,0.6,0.9476,189.04,123.16,1.261334,0.662,0.557576,0.433333,0.0,0.454615,0.274,2.2,47.0,0.4,1.238095,5.0,0.558824,0.5,0.5,0.4,1.0024,203.16,133.12,1.431985,0.7352,0.533333,0.415152,0.0,0.487308,0.248,2.0,53.2,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.625044,0.438879,1
1,Champions Tour 2024: Pacific Kickoff,Group Stage,Opening (A),T1 vs BLEED,Breeze,BLEED,16,6,6,4,T1,18,6,6,6,202.516667,0.85,192.6,116,126,40,-10,0.73,125.6,0.266,19,15,4,1.104,214.6,126,116,36,10,0.724,145.2,0.292,15,19,-4,20517.647059,5367.647059,Full buy: 20k+,19458.823529,4994.117647,Full buy: 20k+,22.0,5.0,0.0,0.0,2.0,2.0,1.0,0.0,0.0,37.4,13.0,6.0,28.0,4.0,2.0,1.0,2.0,0.0,0.0,0.0,0.0,45.8,14.0,6.0,7.0,3.0,6.0,0.0,11.0,1.0,6.0,0.0,11.0,1.0,6.0,0.0,7.0,3.0,6.0,0.0,1.238095,5.0,0.558824,0.5,0.5,0.4,1.0024,203.16,133.12,1.431985,0.7352,0.533333,0.415152,0.0,0.487308,0.248,2.0,53.2,0.4,1.396552,2.0,0.441176,0.5,0.5,0.6,0.9476,189.04,123.16,1.261334,0.662,0.557576,0.433333,0.0,0.454615,0.274,2.2,47.0,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.561121,0.374956,0
2,Champions Tour 2024: Pacific Kickoff,Group Stage,Opening (A),T1 vs BLEED,Lotus,T1,13,9,4,0,BLEED,6,3,3,0,47.75,1.16,221.2,75,59,36,16,0.758,143.4,0.248,11,8,3,0.86,191.8,59,75,24,-16,0.642,128.8,0.244,8,11,-3,19810.526316,9421.052632,Full buy: 20k+,16073.684211,4147.368421,Full buy: 20k+,15.0,4.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,57.6,10.0,4.0,12.0,3.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,51.2,7.0,2.0,6.0,3.0,4.0,0.0,3.0,1.0,2.0,0.0,3.0,1.0,2.0,0.0,6.0,3.0,4.0,0.0,1.881356,2.0,0.578947,0.75,0.571429,0.0,1.0032,198.64,129.88,1.360397,0.6888,0.607576,0.45,0.12,0.462851,0.2736,2.4,47.28,0.6,1.106667,2.0,0.421053,0.428571,0.25,0.0,0.9604,201.44,131.28,1.376972,0.7312,0.483333,0.448485,0.08,0.474072,0.2528,2.6,49.84,0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.625044,0.438879,1
3,Champions Tour 2024: Pacific Kickoff,Group Stage,Opening (A),T1 vs BLEED,Lotus,BLEED,6,3,3,0,T1,13,9,4,0,47.75,0.86,191.8,59,75,24,-16,0.642,128.8,0.244,8,11,-3,1.16,221.2,75,59,36,16,0.758,143.4,0.248,11,8,3,16073.684211,4147.368421,Full buy: 20k+,19810.526316,9421.052632,Full buy: 20k+,12.0,3.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,51.2,7.0,2.0,15.0,4.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,57.6,10.0,4.0,3.0,1.0,2.0,0.0,6.0,3.0,4.0,0.0,6.0,3.0,4.0,0.0,3.0,1.0,2.0,0.0,1.106667,2.0,0.421053,0.428571,0.25,0.0,0.9604,201.44,131.28,1.376972,0.7312,0.483333,0.448485,0.08,0.474072,0.2528,2.6,49.84,0.2,1.881356,2.0,0.578947,0.75,0.571429,0.0,1.0032,198.64,129.88,1.360397,0.6888,0.607576,0.45,0.12,0.462851,0.2736,2.4,47.28,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.561121,0.374956,0
4,Champions Tour 2024: Pacific Kickoff,Group Stage,Opening (C),Gen.G vs Rex Regum Qeon,Icebox,Gen.G,13,6,7,0,Rex Regum Qeon,11,5,6,0,50.833333,1.036,205.0,87,82,28,5,0.742,136.2,0.29,15,9,6,0.944,193.4,82,87,26,-5,0.692,124.4,0.32,9,15,-6,17695.833333,10762.5,Full buy: 20k+,18137.5,9095.833333,Full buy: 20k+,12.0,5.0,2.0,0.0,2.0,1.0,0.0,0.0,0.0,56.6,8.0,6.0,18.0,6.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,48.0,11.0,2.0,6.0,1.0,6.0,0.0,9.0,0.0,2.0,0.0,9.0,0.0,2.0,0.0,6.0,1.0,6.0,0.0,1.402439,3.0,0.625,0.5,0.583333,0.0,0.9428,189.44,122.92,1.445625,0.6992,0.35368,0.483333,0.0,0.458196,0.24,1.2,50.48,0.4,1.241379,2.0,0.375,0.416667,0.5,0.0,0.9816,194.16,128.76,1.450438,0.71,0.483333,0.507143,0.0875,0.402105,0.2608,2.0,48.8,0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.574527,0.456347,1


# Data Preprocessing

## Extract features and target

In [4]:
# Columns fed to the models. Column order is significant and matches the original
# hand-written list exactly; repeated name patterns are generated instead of spelled out.
features = [
    # Team identity columns
    "Team A_Encoded",
    "Team B_Encoded",
    # Head-to-head win percentage
    "Team A H2H Win %",
    "Team B H2H Win %",
    # Recent win percentage
    "Recent Win %_TeamA",
    "Recent Win %_TeamB",
    # One "Map_<name>" column per map
    *(
        f"Map_{map_name}"
        for map_name in (
            "Abyss",
            "Ascent",
            "Bind",
            "Breeze",
            "Fracture",
            "Haven",
            "Icebox",
            "Lotus",
            "Pearl",
            "Split",
            "Sunset",
        )
    ),
    # Per-map win percentage
    "Team A Map Win %",
    "Team B Map Win %",
    # Rolling averages: for each statistic, the Team A column followed by the Team B column
    *(
        f"{stat}_RollAvg_Team{side}"
        for stat in (
            "Attacker Win %",
            "Defender Win %",
            "Overtime Win %",
            "Rating",
            "Average Combat Score",
            "Average Damage Per Round",
            "KDA",
            "Kill, Assist, Trade, Survive %",
            "First Blood %",
            "Headshot %",
            "Clutches",
            "Econ",
        )
        for side in "AB"
    ),
]

# Feature matrix / target vector for each split; "Winner" is the label column
X_train, y_train = train[features], train["Winner"]
X_test, y_test = test[features], test["Winner"]

In [5]:
# Preview the first five rows of the training feature matrix
X_train.head(n=5)

Unnamed: 0,Team A_Encoded,Team B_Encoded,Team A H2H Win %,Team B H2H Win %,Recent Win %_TeamA,Recent Win %_TeamB,Map_Abyss,Map_Ascent,Map_Bind,Map_Breeze,Map_Fracture,Map_Haven,Map_Icebox,Map_Lotus,Map_Pearl,Map_Split,Map_Sunset,Team A Map Win %,Team B Map Win %,Attacker Win %_RollAvg_TeamA,Attacker Win %_RollAvg_TeamB,Defender Win %_RollAvg_TeamA,Defender Win %_RollAvg_TeamB,Overtime Win %_RollAvg_TeamA,Overtime Win %_RollAvg_TeamB,Rating_RollAvg_TeamA,Rating_RollAvg_TeamB,Average Combat Score_RollAvg_TeamA,Average Combat Score_RollAvg_TeamB,Average Damage Per Round_RollAvg_TeamA,Average Damage Per Round_RollAvg_TeamB,KDA_RollAvg_TeamA,KDA_RollAvg_TeamB,"Kill, Assist, Trade, Survive %_RollAvg_TeamA","Kill, Assist, Trade, Survive %_RollAvg_TeamB",First Blood %_RollAvg_TeamA,First Blood %_RollAvg_TeamB,Headshot %_RollAvg_TeamA,Headshot %_RollAvg_TeamB,Clutches_RollAvg_TeamA,Clutches_RollAvg_TeamB,Econ_RollAvg_TeamA,Econ_RollAvg_TeamB
0,0.551856,0.530307,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.469694,0.448144,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.551856,0.530307,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.875,0.125,0.5,0.0,0.0,0.992282,0.992282,186.0,218.4,119.6,132.8,1.078947,1.683333,0.701367,0.701367,0.5,0.5,0.25,0.19,0.0,0.0,53.592817,53.592817
3,0.469694,0.448144,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.875,0.5,0.5,0.125,0.0,0.0,0.992282,0.992282,218.4,186.0,132.8,119.6,1.683333,1.078947,0.701367,0.701367,0.5,0.5,0.19,0.25,0.0,0.0,53.592817,53.592817
4,0.0,0.389147,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# (rows, columns) of the training feature matrix
X_train.shape

(48414, 43)

In [6]:
# Preview the first five rows of the test feature matrix
X_test.head(n=5)

Unnamed: 0,Team A_Encoded,Team B_Encoded,Team A H2H Win %,Team B H2H Win %,Recent Win %_TeamA,Recent Win %_TeamB,Map_Abyss,Map_Ascent,Map_Bind,Map_Breeze,Map_Fracture,Map_Haven,Map_Icebox,Map_Lotus,Map_Pearl,Map_Split,Map_Sunset,Team A Map Win %,Team B Map Win %,Attacker Win %_RollAvg_TeamA,Attacker Win %_RollAvg_TeamB,Defender Win %_RollAvg_TeamA,Defender Win %_RollAvg_TeamB,Overtime Win %_RollAvg_TeamA,Overtime Win %_RollAvg_TeamB,Rating_RollAvg_TeamA,Rating_RollAvg_TeamB,Average Combat Score_RollAvg_TeamA,Average Combat Score_RollAvg_TeamB,Average Damage Per Round_RollAvg_TeamA,Average Damage Per Round_RollAvg_TeamB,KDA_RollAvg_TeamA,KDA_RollAvg_TeamB,"Kill, Assist, Trade, Survive %_RollAvg_TeamA","Kill, Assist, Trade, Survive %_RollAvg_TeamB",First Blood %_RollAvg_TeamA,First Blood %_RollAvg_TeamB,Headshot %_RollAvg_TeamA,Headshot %_RollAvg_TeamB,Clutches_RollAvg_TeamA,Clutches_RollAvg_TeamB,Econ_RollAvg_TeamA,Econ_RollAvg_TeamB
0,0.625044,0.438879,0.0,0.0,0.4,0.6,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.557576,0.533333,0.433333,0.415152,0.0,0.0,0.9476,1.0024,189.04,203.16,123.16,133.12,1.261334,1.431985,0.662,0.7352,0.454615,0.487308,0.274,0.248,2.2,2.0,47.0,53.2
1,0.561121,0.374956,0.0,0.0,0.4,0.6,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.533333,0.557576,0.415152,0.433333,0.0,0.0,1.0024,0.9476,203.16,189.04,133.12,123.16,1.431985,1.261334,0.7352,0.662,0.487308,0.454615,0.248,0.274,2.0,2.2,53.2,47.0
2,0.625044,0.438879,0.0,0.0,0.6,0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.607576,0.483333,0.45,0.448485,0.12,0.08,1.0032,0.9604,198.64,201.44,129.88,131.28,1.360397,1.376972,0.6888,0.7312,0.462851,0.474072,0.2736,0.2528,2.4,2.6,47.28,49.84
3,0.561121,0.374956,0.0,0.0,0.2,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.483333,0.607576,0.448485,0.45,0.08,0.12,0.9604,1.0032,201.44,198.64,131.28,129.88,1.376972,1.360397,0.7312,0.6888,0.474072,0.462851,0.2528,0.2736,2.6,2.4,49.84,47.28
4,0.574527,0.456347,0.0,0.0,0.4,0.8,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.35368,0.483333,0.483333,0.507143,0.0,0.0875,0.9428,0.9816,189.44,194.16,122.92,128.76,1.445625,1.450438,0.6992,0.71,0.458196,0.402105,0.24,0.2608,1.2,2.0,50.48,48.8


In [8]:
# (rows, columns) of the test feature matrix
X_test.shape

(2208, 43)

## Scaling features (for Logistic Regression only)

In [9]:
# Learn per-column mean / standard deviation from the training features only,
# then apply that same transformation to both splits.
# NOTE(review): the statistics come from *all* training rows, so cross-validating an
# estimator directly on X_train_scaled lets validation-fold rows influence the scaling
# unless the estimator standardises again inside each fold (e.g. via a Pipeline).
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model Selection

We will be using the following models:
- Logistic Regression
- Random Forest
- XGBoost

# Model Training, Validation, and Testing

Functions for cross validating and testing the models

In [10]:
# scikit-learn scorer names reported for every model (cross-validation and search)
SCORING = [
    "accuracy",
    "precision",
    "recall",
    "f1",
    "roc_auc",
]

def evaluate_model(model, X_train, y_train, cv=5):
    """Cross-validate ``model`` on ordered folds and print the mean of each metric.

    Folds come from ``TimeSeriesSplit``, which splits by row position; the rows are
    therefore presumed to be in chronological order (verify against preprocessing).

    Parameters
    ----------
    model : estimator
        Classifier passed to ``cross_validate``.
    X_train : array-like
        Feature matrix to cross-validate on.
    y_train : array-like
        Target labels aligned with ``X_train``.
    cv : int, default 5
        Number of splits handed to ``TimeSeriesSplit``.

    Returns
    -------
    None
        One ``"<metric>: <mean>"`` line is printed per entry of ``SCORING``.
    """
    splitter = TimeSeriesSplit(n_splits=cv)
    cv_results = cross_validate(model, X_train, y_train, cv=splitter, scoring=SCORING)

    for name in SCORING:
        # Validation-fold scores of each metric are stored under "test_<metric>"
        fold_scores = cv_results["test_" + name]
        print(f"{name}: {fold_scores.mean()}")
    

def test_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and print its metrics on the test data.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``fit``, ``predict`` and ``predict_proba``.
        It is fitted in place, so the object passed in is trained afterwards.
    X_train, y_train : array-like
        Training features and labels used for fitting.
    X_test, y_test : array-like
        Held-out features and labels used for scoring.

    Returns
    -------
    None
        Accuracy, precision, recall, F1 and ROC AUC are printed, one per line.
    """
    # Train model on full training set
    model.fit(X_train, y_train)

    predicted_labels = model.predict(X_test)
    # Column 1 of predict_proba is used as the positive-class score for ROC AUC
    positive_scores = model.predict_proba(X_test)[:, 1]

    # Metrics computed from hard class predictions, in print order
    label_metrics = (
        ("Accuracy:", accuracy_score),
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1 Score:", f1_score),
    )
    for caption, metric in label_metrics:
        print(caption, metric(y_test, predicted_labels))

    # ROC AUC is the only metric that needs scores rather than labels
    print("AUC-ROC:", roc_auc_score(y_test, positive_scores))

## Training baseline models

In [11]:
# Baseline models with default hyperparameters, all seeded with RANDOM_SEED.
#
# Logistic regression is wrapped in a StandardScaler -> LogisticRegression pipeline so that,
# during cross-validation, the scaling statistics are re-estimated from each training fold
# instead of being taken from the whole training set (which leaks validation-fold rows into
# preprocessing). Passing the globally standardised X_train_scaled to it is still valid:
# re-standardising an already standardised column inside a fold gives the same values as
# standardising the raw column inside that fold.
# The classifier itself (e.g. for coef_) is logreg_base.named_steps["logisticregression"].
logreg_base = make_pipeline(StandardScaler(), LogisticRegression(random_state=RANDOM_SEED))
rf_base = RandomForestClassifier(random_state=RANDOM_SEED)
xgb_base = XGBClassifier(random_state=RANDOM_SEED)

In [12]:

# Logistic regression is cross-validated on the standardised feature matrix
print("Logistic Regression Baseline")
evaluate_model(model=logreg_base, X_train=X_train_scaled, y_train=y_train)

Logistic Regression Baseline
accuracy: 0.6322468707398686
precision: 0.6354724517124217
recall: 0.6207614582123819
f1: 0.6280137097476827
roc_auc: 0.6915287220951528


In [13]:
# Random forest is cross-validated on the unscaled feature matrix
print("Random Forest Baseline")
evaluate_model(model=rf_base, X_train=X_train, y_train=y_train)

Random Forest Baseline
accuracy: 0.6462015119593506
precision: 0.650190517266639
recall: 0.6338980008220092
f1: 0.6418372485195716
roc_auc: 0.7092639454352994


In [14]:
# XGBoost is cross-validated on the unscaled feature matrix
print("XGBoost Baseline")
evaluate_model(model=xgb_base, X_train=X_train, y_train=y_train)

XGBoost Baseline
accuracy: 0.6345024166563391
precision: 0.6327372121482251
recall: 0.6429192139429472
f1: 0.6375994170349947
roc_auc: 0.69416866793347


## Hyperparameter tuning

In [None]:
# Ensures the splits are in chronological order
# (TimeSeriesSplit splits by row position, so rows are presumed to be time-sorted)
tscv = TimeSeriesSplit(n_splits=5)

# Hyperparameter grids for each model
log_reg_params = {
    "C": [0.001, 0.01, 0.1, 1, 10, 100],
    "penalty": ["l1", "l2"],
    "solver": ["liblinear", "saga"],
    "max_iter": [100, 200, 300, 400, 500, 1000]
}

rf_params = {
    "n_estimators": [50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    "max_depth": [None, 5, 10, 15, 20, 25, 30, 50, 100],
    "min_samples_split": [2, 5, 10, 15, 20],
    "min_samples_leaf": [1, 2, 4, 8, 16],
    "max_features": ["sqrt", "log2", None],
    "bootstrap": [True, False]
}

xgb_params = {
    "n_estimators": [50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    "learning_rate": [0.0001, 0.001, 0.01, 0.05, 0.1, 0.2, 0.3],
    "max_depth": [3, 4, 5, 6, 8, 10, 12, 15],
    "subsample": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "colsample_bytree": [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "gamma": [0, 0.01, 0.1, 0.5, 1, 1.5, 2, 5, 10],
    "reg_alpha": [0, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10],
    "reg_lambda": [0, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]
}

# For RandomizedSearchCV
ITERATIONS = 100
N_JOBS = -1

# Settings shared by all three searches. Every metric in SCORING is recorded;
# the best candidate is chosen and refitted on ROC AUC. error_score="raise" makes an
# invalid parameter combination fail loudly instead of being scored as NaN.
search_settings = {
    "n_iter": ITERATIONS,
    "cv": tscv,
    "scoring": SCORING,
    "refit": "roc_auc",
    "random_state": RANDOM_SEED,
    "n_jobs": N_JOBS,
    "error_score": "raise",
}

# Logistic regression is searched inside a StandardScaler -> LogisticRegression pipeline so
# that scaling statistics are re-estimated from each training fold; otherwise the globally
# fitted scaler lets validation-fold rows influence preprocessing and model selection.
# make_pipeline names the classifier step "logisticregression", hence the parameter prefix.
# log_reg_params itself is left untouched for any code that reuses it.
log_reg_pipeline = make_pipeline(StandardScaler(), LogisticRegression(random_state=RANDOM_SEED))
log_reg_pipeline_params = {
    f"logisticregression__{name}": values for name, values in log_reg_params.items()
}

# Initialize models
log_reg_tuned = RandomizedSearchCV(log_reg_pipeline, log_reg_pipeline_params, **search_settings)

rf_tuned = RandomizedSearchCV(RandomForestClassifier(random_state=RANDOM_SEED),
                              rf_params,
                              **search_settings)

xgb_tuned = RandomizedSearchCV(XGBClassifier(random_state=RANDOM_SEED),
                               xgb_params,
                               **search_settings)

# Fit models
# The logistic-regression search still receives X_train_scaled so that the refitted
# best_estimator_ keeps accepting the *_scaled matrices used elsewhere in the notebook;
# the in-pipeline scaler re-standardises per fold, which gives the same values as scaling
# the raw columns within that fold.
log_reg_tuned.fit(X_train_scaled, y_train)
rf_tuned.fit(X_train, y_train)
xgb_tuned.fit(X_train, y_train)

print("Models have been trained")

In [None]:
# Show best parameters: a heading line followed by the winning parameter dict, per search
for heading, fitted_search in (
    ("Logistic Regression Tuned", log_reg_tuned),
    ("Random Forest Tuned", rf_tuned),
    ("XGBoost Tuned", xgb_tuned),
):
    print(heading)
    print(fitted_search.best_params_)

In [None]:
# Cross-validate the best logistic-regression candidate on the standardised matrix
print("Logistic Regression Tuned")
evaluate_model(model=log_reg_tuned.best_estimator_, X_train=X_train_scaled, y_train=y_train)

In [None]:
# Cross-validate the best random-forest candidate on the unscaled matrix
print("Random Forest Tuned")
evaluate_model(model=rf_tuned.best_estimator_, X_train=X_train, y_train=y_train)

In [None]:
# Cross-validate the best XGBoost candidate on the unscaled matrix
print("XGBoost Tuned")
evaluate_model(model=xgb_tuned.best_estimator_, X_train=X_train, y_train=y_train)

## Feature selection

### Tree-based feature importance (Random Forest and XGBoost)

In [13]:
# # Train Random Forest and XGBoost and get feature importances
# rf_base.fit(X_train, y_train)
# xgb_base.fit(X_train, y_train)

# # Get feature importances
# rf_importances = rf_base.feature_importances_
# xgb_importances = xgb_base.feature_importances_

# # Combine feature importances into a DataFrame
# feature_names = X_train.columns
# feature_importances = pd.DataFrame({
#     'Feature': feature_names,
#     'RandomForest_Importance': rf_importances,
#     'XGBoost_Importance': xgb_importances
# })



In [14]:
# # Select top 20 features based on Random Forest
# top_features_rf = feature_importances.nlargest(20, 'RandomForest_Importance')['Feature']
# X_train_rf_top = X_train[top_features_rf]
# X_test_rf_top = X_test[top_features_rf]

In [15]:
# # Sort by importance (Random Forest)
# feature_importances.sort_values(by='RandomForest_Importance', ascending=False, inplace=True)
# feature_importances.head(20)

In [16]:
# Sort by importance (XGBoost)
# feature_importances.sort_values(by='XGBoost_Importance', ascending=False, inplace=True)
# feature_importances.head(20)

### L1 regularization (Logistic Regression)

In [17]:
# logreg_l1 = LogisticRegression(penalty='l1', solver='liblinear', max_iter=1000)
# logreg_l1.fit(X_train, y_train)

# # Select important features (non-zero coefficients)
# model = SelectFromModel(logreg_l1, prefit=True)
# X_train_l1 = model.transform(X_train)
# X_test_l1 = model.transform(X_test)

# # Check which features were selected
# selected_features = X_train.columns[(logreg_l1.coef_ != 0).ravel()]
# for feature in selected_features:
#     print(feature)

### Recursive Feature Elimination

In [18]:
# # Apply RFE with Logistic Regression as the estimator
# rfe = RFE(estimator=logreg_base, n_features_to_select=20, step=1)
# rfe.fit(X_train_scaled, y_train)

# # Transform the dataset with selected features
# X_train_rfe = rfe.transform(X_train)
# X_test_rfe = rfe.transform(X_test)

# print("Selected Features by RFE:", X_train.columns[rfe.support_])

In [19]:
# # Apply RFE with Random Forest as the estimator
# rfe = RFE(estimator=rf_base, n_features_to_select=20, step=1)
# rfe.fit(X_train, y_train)

# # Transform the dataset with selected features
# X_train_rfe = rfe.transform(X_train)
# X_test_rfe = rfe.transform(X_test)

# print("Selected Features by RFE:", X_train.columns[rfe.support_])

In [20]:
# X_train.head()