In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV,TimeSeriesSplit, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


# To avoid truncating columns
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

# Setting random seed for reproducibility
RANDOM_SEED = 1

# Load the Data

In [2]:
train = pd.read_csv("vct_data/train_preprocessed.csv")
train.head()

Unnamed: 0,Tournament,Stage,Match Type,Match Name,Map,Team A,Team A Score,Team A Attacker Score,Team A Defender Score,Team A Overtime Score,Team B,Team B Score,Team B Attacker Score,Team B Defender Score,Team B Overtime Score,Duration,Rating_TeamA,Average Combat Score_TeamA,Kills_TeamA,Deaths_TeamA,Assists_TeamA,Kills - Deaths (KD)_TeamA,"Kill, Assist, Trade, Survive %_TeamA",Average Damage Per Round_TeamA,Headshot %_TeamA,First Kills_TeamA,First Deaths_TeamA,Kills - Deaths (FKD)_TeamA,Rating_TeamB,Average Combat Score_TeamB,Kills_TeamB,Deaths_TeamB,Assists_TeamB,Kills - Deaths (KD)_TeamB,"Kill, Assist, Trade, Survive %_TeamB",Average Damage Per Round_TeamB,Headshot %_TeamB,First Kills_TeamB,First Deaths_TeamB,Kills - Deaths (FKD)_TeamB,Loadout Value_TeamA,Remaining Credits_TeamA,Type_TeamA,Loadout Value_TeamB,Remaining Credits_TeamB,Type_TeamB,2k_TeamA,3k_TeamA,4k_TeamA,5k_TeamA,1v1_TeamA,1v2_TeamA,1v3_TeamA,1v4_TeamA,1v5_TeamA,Econ_TeamA,Spike Plants_TeamA,Spike Defuses_TeamA,2k_TeamB,3k_TeamB,4k_TeamB,5k_TeamB,1v1_TeamB,1v2_TeamB,1v3_TeamB,1v4_TeamB,1v5_TeamB,Econ_TeamB,Spike Plants_TeamB,Spike Defuses_TeamB,Elimination_TeamA,Detonated_TeamA,Defused_TeamA,Time Expiry (No Plant)_TeamA,Eliminated_TeamA,Defused Failed_TeamA,Detonation Denied_TeamA,Time Expiry (Failed to Plant)_TeamA,Elimination_TeamB,Detonated_TeamB,Defused_TeamB,Time Expiry (No Plant)_TeamB,Eliminated_TeamB,Defused Failed_TeamB,Detonation Denied_TeamB,Time Expiry (Failed to Plant)_TeamB,KDA_TeamA,Rating_RollAvg_TeamA,Average Combat Score_RollAvg_TeamA,Average Damage Per Round_RollAvg_TeamA,KDA_RollAvg_TeamA,"Kill, Assist, Trade, Survive %_RollAvg_TeamA",Headshot %_RollAvg_TeamA,Econ_RollAvg_TeamA,KDA_TeamB,Rating_RollAvg_TeamB,Average Combat Score_RollAvg_TeamB,Average Damage Per Round_RollAvg_TeamB,KDA_RollAvg_TeamB,"Kill, Assist, Trade, Survive %_RollAvg_TeamB",Headshot %_RollAvg_TeamB,Econ_RollAvg_TeamB,Team A Map Win %,Team A H2H Win %,Team B Map Win %,Team B H2H Win %,Map_Abyss,Map_Ascent,Map_Bind,Map_Breeze,Map_Fracture,Map_Haven,Map_Icebox,Map_Lotus,Map_Pearl,Map_Split,Map_Sunset,Team A_Encoded,Team B_Encoded,Winner
0,Champions Tour LATAM Stage 1: Challengers 1,Open Qualifier: LAS,Round of 16,Leviatán vs Furious Gaming,Ascent,Leviatán,7,6,1,0,Furious Gaming,13,7,6,0,41.883333,1.042198,186.0,60,76,22,-16,0.722103,119.6,0.25,10,10,0,0.942223,218.4,76,60,25,16,0.680572,132.8,0.19,10,10,0,17800.025414,9073.021218,Full buy: 20k+,17237.886426,7888.274933,Full buy: 20k+,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,55.486853,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,51.697865,0.0,0.0,2.0,1.0,3.0,1.0,9.0,0.0,3.0,1.0,9.0,0.0,3.0,1.0,2.0,1.0,3.0,1.0,1.078947,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.683333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.39737,0.467653,1
1,Champions Tour LATAM Stage 1: Challengers 1,Open Qualifier: LAS,Round of 16,Leviatán vs Furious Gaming,Bind,Leviatán,7,5,2,0,Furious Gaming,13,6,7,0,38.933333,1.042198,175.8,56,81,27,-25,0.722103,113.6,0.16,10,10,0,0.942223,228.8,81,56,35,25,0.680572,164.6,0.202,10,10,0,17800.025414,9073.021218,Full buy: 20k+,17237.886426,7888.274933,Full buy: 20k+,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,55.486853,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,51.697865,0.0,0.0,4.0,3.0,0.0,0.0,8.0,0.0,5.0,0.0,8.0,0.0,5.0,0.0,4.0,3.0,0.0,0.0,1.024691,1.042198,186.0,119.6,1.078947,0.722103,0.25,55.486853,2.071429,0.942223,218.4,132.8,1.683333,0.680572,0.19,51.697865,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.39737,0.467653,1
2,Champions Tour LATAM Stage 1: Challengers 1,Open Qualifier: LAS,Round of 16,Procyon Team vs KRÜ Esports,Bind,Procyon Team,1,1,0,0,KRÜ Esports,13,2,11,0,28.033333,1.042198,187.2,40,67,21,-27,0.722103,130.8,0.192,6,8,-2,0.942223,253.0,67,40,23,27,0.680572,150.0,0.314,8,6,2,17800.025414,9073.021218,Full buy: 20k+,17237.886426,7888.274933,Full buy: 20k+,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,55.486853,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,51.697865,0.0,0.0,0.0,0.0,1.0,0.0,9.0,3.0,1.0,0.0,9.0,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.910448,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4189,0.546825,1
3,Champions Tour LATAM Stage 1: Challengers 1,Open Qualifier: LAS,Round of 16,Procyon Team vs KRÜ Esports,Ascent,Procyon Team,13,7,6,0,KRÜ Esports,11,6,5,0,49.316667,1.042198,213.2,86,84,41,2,0.722103,134.8,0.198,12,12,0,0.942223,205.8,84,86,23,-2,0.680572,133.2,0.202,12,12,0,17800.025414,9073.021218,Full buy: 20k+,17237.886426,7888.274933,Full buy: 20k+,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,55.486853,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,51.697865,0.0,0.0,9.0,0.0,3.0,1.0,8.0,0.0,3.0,0.0,8.0,0.0,3.0,0.0,9.0,0.0,3.0,1.0,1.511905,1.042198,187.2,130.8,0.910448,0.722103,0.192,55.486853,1.244186,0.942223,253.0,150.0,2.25,0.680572,0.314,51.697865,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4189,0.546825,0
4,Champions Tour LATAM Stage 1: Challengers 1,Open Qualifier: LAS,Round of 16,Procyon Team vs KRÜ Esports,Split,Procyon Team,4,3,1,0,KRÜ Esports,13,4,9,0,32.966667,1.042198,143.2,38,78,18,-40,0.722103,86.6,0.192,3,14,-11,0.942223,245.8,78,38,18,40,0.680572,148.6,0.208,14,3,11,17800.025414,9073.021218,Full buy: 20k+,17237.886426,7888.274933,Full buy: 20k+,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,55.486853,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,51.697865,0.0,0.0,3.0,0.0,1.0,0.0,12.0,0.0,1.0,0.0,12.0,0.0,1.0,0.0,3.0,0.0,1.0,0.0,0.717949,1.042198,200.2,132.8,1.211176,0.722103,0.195,55.486853,2.526316,0.942223,229.4,141.6,1.747093,0.680572,0.258,51.697865,0.0,0.5,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.4189,0.546825,1


In [3]:
test = pd.read_csv("vct_data/test_preprocessed.csv")
test.head()

Unnamed: 0,Tournament,Stage,Match Type,Match Name,Map,Team A,Team A Score,Team A Attacker Score,Team A Defender Score,Team A Overtime Score,Team B,Team B Score,Team B Attacker Score,Team B Defender Score,Team B Overtime Score,Duration,Rating_TeamA,Average Combat Score_TeamA,Kills_TeamA,Deaths_TeamA,Assists_TeamA,Kills - Deaths (KD)_TeamA,"Kill, Assist, Trade, Survive %_TeamA",Average Damage Per Round_TeamA,Headshot %_TeamA,First Kills_TeamA,First Deaths_TeamA,Kills - Deaths (FKD)_TeamA,Rating_TeamB,Average Combat Score_TeamB,Kills_TeamB,Deaths_TeamB,Assists_TeamB,Kills - Deaths (KD)_TeamB,"Kill, Assist, Trade, Survive %_TeamB",Average Damage Per Round_TeamB,Headshot %_TeamB,First Kills_TeamB,First Deaths_TeamB,Kills - Deaths (FKD)_TeamB,Loadout Value_TeamA,Remaining Credits_TeamA,Type_TeamA,Loadout Value_TeamB,Remaining Credits_TeamB,Type_TeamB,2k_TeamA,3k_TeamA,4k_TeamA,5k_TeamA,1v1_TeamA,1v2_TeamA,1v3_TeamA,1v4_TeamA,1v5_TeamA,Econ_TeamA,Spike Plants_TeamA,Spike Defuses_TeamA,2k_TeamB,3k_TeamB,4k_TeamB,5k_TeamB,1v1_TeamB,1v2_TeamB,1v3_TeamB,1v4_TeamB,1v5_TeamB,Econ_TeamB,Spike Plants_TeamB,Spike Defuses_TeamB,Elimination_TeamA,Detonated_TeamA,Defused_TeamA,Time Expiry (No Plant)_TeamA,Eliminated_TeamA,Defused Failed_TeamA,Detonation Denied_TeamA,Time Expiry (Failed to Plant)_TeamA,Elimination_TeamB,Detonated_TeamB,Defused_TeamB,Time Expiry (No Plant)_TeamB,Eliminated_TeamB,Defused Failed_TeamB,Detonation Denied_TeamB,Time Expiry (Failed to Plant)_TeamB,KDA_TeamA,Rating_RollAvg_TeamA,Average Combat Score_RollAvg_TeamA,Average Damage Per Round_RollAvg_TeamA,KDA_RollAvg_TeamA,"Kill, Assist, Trade, Survive %_RollAvg_TeamA",Headshot %_RollAvg_TeamA,Econ_RollAvg_TeamA,KDA_TeamB,Rating_RollAvg_TeamB,Average Combat Score_RollAvg_TeamB,Average Damage Per Round_RollAvg_TeamB,KDA_RollAvg_TeamB,"Kill, Assist, Trade, Survive %_RollAvg_TeamB",Headshot %_RollAvg_TeamB,Econ_RollAvg_TeamB,Team A Map Win %,Team A H2H Win %,Team B Map Win %,Team B H2H Win %,Map_Abyss,Map_Ascent,Map_Bind,Map_Breeze,Map_Fracture,Map_Haven,Map_Icebox,Map_Lotus,Map_Pearl,Map_Split,Map_Sunset,Team A_Encoded,Team B_Encoded,Winner
0,Champions Tour 2024: Pacific Kickoff,Group Stage,Opening (A),T1 vs BLEED,Breeze,T1,18,6,6,6,BLEED,16,6,6,4,202.516667,1.104,214.6,126,116,36,10,0.724,145.2,0.292,15,19,-4,0.85,192.6,116,126,40,-10,0.73,125.6,0.266,19,15,4,19458.823529,4994.117647,Full buy: 20k+,20517.647059,5367.647059,Full buy: 20k+,28.0,4.0,2.0,1.0,2.0,0.0,0.0,0.0,0.0,45.8,14.0,6.0,22.0,5.0,0.0,0.0,2.0,2.0,1.0,0.0,0.0,37.4,13.0,6.0,11.0,1.0,6.0,0.0,7.0,3.0,6.0,0.0,7.0,3.0,6.0,0.0,11.0,1.0,6.0,0.0,1.396552,0.9776,196.16,126.6,1.430208,0.7268,0.2312,50.52,1.238095,0.9036,190.32,126.0,1.20912,0.6716,0.2392,51.48,0.8,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.279631,0.541665,0
1,Champions Tour 2024: Pacific Kickoff,Group Stage,Opening (A),T1 vs BLEED,Lotus,T1,13,9,4,0,BLEED,6,3,3,0,47.75,1.16,221.2,75,59,36,16,0.758,143.4,0.248,11,8,3,0.86,191.8,59,75,24,-16,0.642,128.8,0.244,8,11,-3,19810.526316,9421.052632,Full buy: 20k+,16073.684211,4147.368421,Full buy: 20k+,15.0,4.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,57.6,10.0,4.0,12.0,3.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,51.2,7.0,2.0,6.0,3.0,4.0,0.0,3.0,1.0,2.0,0.0,3.0,1.0,2.0,0.0,6.0,3.0,4.0,0.0,1.881356,0.9828,196.92,129.12,1.358869,0.7148,0.2564,49.56,1.106667,0.9392,196.68,129.84,1.290542,0.7044,0.2464,50.36,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.279631,0.541665,0
2,Champions Tour 2024: Pacific Kickoff,Group Stage,Opening (C),Gen.G vs Rex Regum Qeon,Icebox,Gen.G,13,6,7,0,Rex Regum Qeon,11,5,6,0,50.833333,1.036,205.0,87,82,28,5,0.742,136.2,0.29,15,9,6,0.944,193.4,82,87,26,-5,0.692,124.4,0.32,9,15,-6,17695.833333,10762.5,Full buy: 20k+,18137.5,9095.833333,Full buy: 20k+,12.0,5.0,2.0,0.0,2.0,1.0,0.0,0.0,0.0,56.6,8.0,6.0,18.0,6.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,48.0,11.0,2.0,6.0,1.0,6.0,0.0,9.0,0.0,2.0,0.0,9.0,0.0,2.0,0.0,6.0,1.0,6.0,0.0,1.402439,1.04,198.4,129.12,1.56191,0.7328,0.2388,51.0,1.241379,1.1832,218.92,143.68,1.915628,0.784,0.2752,54.6,0.533333,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.310576,0.434553,0
3,Champions Tour 2024: Pacific Kickoff,Group Stage,Opening (C),Gen.G vs Rex Regum Qeon,Split,Gen.G,11,2,9,0,Rex Regum Qeon,13,3,10,0,48.55,1.026,213.4,89,86,43,3,0.758,136.8,0.258,15,9,6,1.006,202.8,86,89,32,-3,0.692,134.8,0.312,9,15,-6,18575.0,10908.333333,Full buy: 20k+,17095.833333,8683.333333,Full buy: 20k+,16.0,5.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,51.8,6.0,0.0,13.0,5.0,2.0,0.0,1.0,1.0,0.0,0.0,0.0,51.0,3.0,4.0,11.0,0.0,0.0,0.0,9.0,0.0,4.0,0.0,9.0,0.0,4.0,0.0,11.0,0.0,0.0,0.0,1.534884,1.0196,198.8,129.92,1.464933,0.7232,0.2448,51.48,1.325843,1.1096,209.8,137.36,1.708348,0.7536,0.2904,52.0,0.545455,1.0,0.642857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.310576,0.434553,1
4,Champions Tour 2024: Pacific Kickoff,Group Stage,Opening (C),Gen.G vs Rex Regum Qeon,Sunset,Gen.G,13,8,5,0,Rex Regum Qeon,8,4,4,0,45.433333,1.042,217.0,82,76,38,6,0.752,144.2,0.248,13,8,5,0.946,202.0,76,82,31,-6,0.712,123.6,0.306,8,13,-5,17647.619048,5690.47619,Full buy: 20k+,16347.619048,6752.380952,Semi-buy: 10-20k,16.0,4.0,2.0,0.0,3.0,1.0,1.0,0.0,0.0,55.4,6.0,6.0,9.0,8.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,49.0,10.0,1.0,5.0,2.0,6.0,0.0,6.0,0.0,1.0,1.0,6.0,0.0,1.0,1.0,5.0,2.0,6.0,0.0,1.578947,1.0456,205.24,133.32,1.50316,0.734,0.2564,52.0,1.304878,1.062,205.72,135.08,1.629254,0.736,0.2832,50.04,0.0,0.5,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.310576,0.434553,0


# Data Preprocessing

## Extract features and target

In [4]:
features = [
    "Team A_Encoded", 
    "Team B_Encoded",

    "Map_Abyss",
    "Map_Ascent",
    "Map_Bind",
    "Map_Breeze",
    "Map_Fracture",
    "Map_Haven",
    "Map_Icebox",
    "Map_Lotus",
    "Map_Pearl",
    "Map_Split",
    "Map_Sunset",

    "Rating_RollAvg_TeamA",
    "Rating_RollAvg_TeamB",

    "Average Combat Score_RollAvg_TeamA",
    "Average Combat Score_RollAvg_TeamB", 

    "Average Damage Per Round_RollAvg_TeamA",
    "Average Damage Per Round_RollAvg_TeamB",

    "KDA_RollAvg_TeamA",
    "KDA_RollAvg_TeamB",

    "Kill, Assist, Trade, Survive %_RollAvg_TeamA", 
    "Kill, Assist, Trade, Survive %_RollAvg_TeamB",

    "Headshot %_RollAvg_TeamA",
    "Headshot %_RollAvg_TeamB",

    "Econ_RollAvg_TeamA",
    "Econ_RollAvg_TeamB",

    "Team A Map Win %",
    "Team B Map Win %",

    "Team A H2H Win %",
    "Team B H2H Win %",
]

X_train = train[features]
y_train = train["Winner"]

X_test = test[features]
y_test = test["Winner"]

## Dropping non-numeric columns

In [5]:
X_train = X_train.select_dtypes(include=['number'])
X_train.head()

Unnamed: 0,Team A_Encoded,Team B_Encoded,Map_Abyss,Map_Ascent,Map_Bind,Map_Breeze,Map_Fracture,Map_Haven,Map_Icebox,Map_Lotus,Map_Pearl,Map_Split,Map_Sunset,Rating_RollAvg_TeamA,Rating_RollAvg_TeamB,Average Combat Score_RollAvg_TeamA,Average Combat Score_RollAvg_TeamB,Average Damage Per Round_RollAvg_TeamA,Average Damage Per Round_RollAvg_TeamB,KDA_RollAvg_TeamA,KDA_RollAvg_TeamB,"Kill, Assist, Trade, Survive %_RollAvg_TeamA","Kill, Assist, Trade, Survive %_RollAvg_TeamB",Headshot %_RollAvg_TeamA,Headshot %_RollAvg_TeamB,Econ_RollAvg_TeamA,Econ_RollAvg_TeamB,Team A Map Win %,Team B Map Win %,Team A H2H Win %,Team B H2H Win %
0,0.39737,0.467653,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.39737,0.467653,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.042198,0.942223,186.0,218.4,119.6,132.8,1.078947,1.683333,0.722103,0.680572,0.25,0.19,55.486853,51.697865,0.0,0.0,0.0,1.0
2,0.4189,0.546825,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.4189,0.546825,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.042198,0.942223,187.2,253.0,130.8,150.0,0.910448,2.25,0.722103,0.680572,0.192,0.314,55.486853,51.697865,0.0,0.0,0.0,1.0
4,0.4189,0.546825,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.042198,0.942223,200.2,229.4,132.8,141.6,1.211176,1.747093,0.722103,0.680572,0.195,0.258,55.486853,51.697865,0.0,0.0,0.5,0.5


In [6]:
X_test = X_test.select_dtypes(include=['number'])
X_test.head()

Unnamed: 0,Team A_Encoded,Team B_Encoded,Map_Abyss,Map_Ascent,Map_Bind,Map_Breeze,Map_Fracture,Map_Haven,Map_Icebox,Map_Lotus,Map_Pearl,Map_Split,Map_Sunset,Rating_RollAvg_TeamA,Rating_RollAvg_TeamB,Average Combat Score_RollAvg_TeamA,Average Combat Score_RollAvg_TeamB,Average Damage Per Round_RollAvg_TeamA,Average Damage Per Round_RollAvg_TeamB,KDA_RollAvg_TeamA,KDA_RollAvg_TeamB,"Kill, Assist, Trade, Survive %_RollAvg_TeamA","Kill, Assist, Trade, Survive %_RollAvg_TeamB",Headshot %_RollAvg_TeamA,Headshot %_RollAvg_TeamB,Econ_RollAvg_TeamA,Econ_RollAvg_TeamB,Team A Map Win %,Team B Map Win %,Team A H2H Win %,Team B H2H Win %
0,0.279631,0.541665,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9776,0.9036,196.16,190.32,126.6,126.0,1.430208,1.20912,0.7268,0.6716,0.2312,0.2392,50.52,51.48,0.8,0.0,0.0,0.0
1,0.279631,0.541665,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.9828,0.9392,196.92,196.68,129.12,129.84,1.358869,1.290542,0.7148,0.7044,0.2564,0.2464,49.56,50.36,1.0,0.0,1.0,0.0
2,0.310576,0.434553,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.04,1.1832,198.4,218.92,129.12,143.68,1.56191,1.915628,0.7328,0.784,0.2388,0.2752,51.0,54.6,0.533333,0.5,0.0,0.0
3,0.310576,0.434553,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0196,1.1096,198.8,209.8,129.92,137.36,1.464933,1.708348,0.7232,0.7536,0.2448,0.2904,51.48,52.0,0.545455,0.642857,1.0,0.0
4,0.310576,0.434553,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0456,1.062,205.24,205.72,133.32,135.08,1.50316,1.629254,0.734,0.736,0.2564,0.2832,52.0,50.04,0.0,0.0,0.5,0.5


## Scaling features (for Logistic Regression only)

In [7]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model Selection

We will be using the following models:
- Logistic Regression
- Random Forest
- XGBoost

# Model Training, Validation, and Testing

Functions for cross validating and testing the models

In [8]:
scoring = ["accuracy", "precision", "recall", "f1", "roc_auc"]

def evaluate_model(model, X_train, y_train, cv=5):
    tscv = TimeSeriesSplit(n_splits=cv)
    scores = cross_validate(model, X_train, y_train, cv=tscv, scoring=scoring)

    for metric in scoring:
        print(f"{metric}: {scores[f'test_{metric}'].mean()}")
    

def test_model(model, X_train, y_train, X_test, y_test):
    # Train model on full training set
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("F1 Score:", f1_score(y_test, y_pred))
    print("AUC-ROC:", roc_auc_score(y_test, y_pred_proba))

## Training baseline models

In [9]:
logreg_base = LogisticRegression(random_state=RANDOM_SEED)
rf_base = RandomForestClassifier(random_state=RANDOM_SEED)
xgb_base = XGBClassifier(random_state=RANDOM_SEED)

In [10]:

print("Logistic Regression Baseline")
evaluate_model(logreg_base, X_train_scaled, y_train)

Logistic Regression Baseline
accuracy: 0.6446703024293505
precision: 0.5983711794963225
recall: 0.44161235906859764
f1: 0.5075199538859516
roc_auc: 0.6799361516482787


In [11]:
print("Random Forest Baseline")
evaluate_model(rf_base, X_train, y_train)

Random Forest Baseline
accuracy: 0.6435299950421418
precision: 0.5904263489459425
recall: 0.46114556182314576
f1: 0.5168579067140591
roc_auc: 0.6869382394824185


In [12]:
print("XGBoost Baseline")
evaluate_model(xgb_base, X_train, y_train)

XGBoost Baseline
accuracy: 0.6231531978185424
precision: 0.5501980986945212
recall: 0.5142303692659784
f1: 0.5292375627628502
roc_auc: 0.6614022270354385


## Hyperparameter tuning

In [13]:
# Ensures the splits are in chronological order
tscv = TimeSeriesSplit(n_splits=5)

# Hyperparameter grids for each model
log_reg_params = {
    "C": [0.001, 0.01, 0.1, 1, 10, 100],
    "penalty": ["l1", "l2"],
    "solver": ["liblinear", "saga"],
    "max_iter": [100, 200, 300, 400, 500, 1000]
}

rf_params = {
    "n_estimators": [50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    "max_depth": [None, 5, 10, 15, 20, 25, 30, 50, 100],
    "min_samples_split": [2, 5, 10, 15, 20],
    "min_samples_leaf": [1, 2, 4, 8, 16],
    "max_features": ["sqrt", "log2", None],
    "bootstrap": [True, False]
}

xgb_params = {
    "n_estimators": [50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    "learning_rate": [0.0001, 0.001, 0.01, 0.05, 0.1, 0.2, 0.3],
    "max_depth": [3, 4, 5, 6, 8, 10, 12, 15],
    "subsample": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "colsample_bytree": [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "gamma": [0, 0.01, 0.1, 0.5, 1, 1.5, 2, 5, 10],
    "reg_alpha": [0, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10],
    "reg_lambda": [0, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]
}

# For RandomizedSearchCV
ITERATIONS = 100

# Initialize models
log_reg_tuned = RandomizedSearchCV(LogisticRegression(random_state=RANDOM_SEED), 
                                   log_reg_params, 
                                   n_iter=ITERATIONS, 
                                   cv=tscv, 
                                   scoring=scoring,
                                   refit="accuracy", 
                                   random_state=RANDOM_SEED, 
                                   n_jobs=-1,
                                   error_score="raise")

rf_tuned = RandomizedSearchCV(RandomForestClassifier(random_state=RANDOM_SEED),
                              rf_params,
                              n_iter=ITERATIONS,
                              cv=tscv,
                              scoring=scoring,
                              refit="accuracy",
                              random_state=RANDOM_SEED,
                              n_jobs=-1,
                              error_score="raise")

xgb_tuned = RandomizedSearchCV(XGBClassifier(random_state=RANDOM_SEED),
                               xgb_params,
                               n_iter=ITERATIONS,
                               cv=tscv,
                               scoring=scoring,
                               refit="accuracy",
                               random_state=RANDOM_SEED,
                               n_jobs=-1,
                               error_score="raise")

# Fit models
log_reg_tuned.fit(X_train_scaled, y_train)
rf_tuned.fit(X_train, y_train)
xgb_tuned.fit(X_train, y_train)

print("Models have been trained")

Models have been trained


In [14]:
# Show best parameters
print("Logistic Regression Tuned")
print(log_reg_tuned.best_params_)

print("Random Forest Tuned")
print(rf_tuned.best_params_)

print("XGBoost Tuned")
print(xgb_tuned.best_params_)

Logistic Regression Tuned
{'solver': 'liblinear', 'penalty': 'l1', 'max_iter': 100, 'C': 0.1}
Random Forest Tuned
{'n_estimators': 400, 'min_samples_split': 2, 'min_samples_leaf': 16, 'max_features': None, 'max_depth': 20, 'bootstrap': True}
XGBoost Tuned
{'subsample': 0.6, 'reg_lambda': 1, 'reg_alpha': 1, 'n_estimators': 400, 'max_depth': 5, 'learning_rate': 0.01, 'gamma': 0.01, 'colsample_bytree': 0.4}


In [15]:
print("Logistic Regression Tuned")
evaluate_model(log_reg_tuned.best_estimator_, X_train_scaled, y_train)

Logistic Regression Tuned
accuracy: 0.646950917203768
precision: 0.6029891843789422
recall: 0.43717954668547165
f1: 0.5057844634938661
roc_auc: 0.6832287086467509


In [16]:
print("Random Forest Tuned")
evaluate_model(rf_tuned.best_estimator_, X_train, y_train)

Random Forest Tuned
accuracy: 0.6567674764501735
precision: 0.6076998132622422
recall: 0.48720049672934984
f1: 0.5398008957854129
roc_auc: 0.7002143885551275


In [17]:
print("XGBoost Tuned")
evaluate_model(xgb_tuned.best_estimator_, X_train, y_train)

XGBoost Tuned
accuracy: 0.6609816559246405
precision: 0.6190445713418369
recall: 0.4764937219007262
f1: 0.5372113355450571
roc_auc: 0.707541688673998


## Feature selection

### Tree-based feature importance (Random Forest and XGBoost)

In [13]:
# # Train Random Forest and XGBoost and get feature importances
# rf_base.fit(X_train, y_train)
# xgb_base.fit(X_train, y_train)

# # Get feature importances
# rf_importances = rf_base.feature_importances_
# xgb_importances = xgb_base.feature_importances_

# # Combine feature importances into a DataFrame
# feature_names = X_train.columns
# feature_importances = pd.DataFrame({
#     'Feature': feature_names,
#     'RandomForest_Importance': rf_importances,
#     'XGBoost_Importance': xgb_importances
# })



In [14]:
# # Select top 20 features based on Random Forest
# top_features_rf = feature_importances.nlargest(20, 'RandomForest_Importance')['Feature']
# X_train_rf_top = X_train[top_features_rf]
# X_test_rf_top = X_test[top_features_rf]

In [15]:
# # Sort by importance (Random Forest)
# feature_importances.sort_values(by='RandomForest_Importance', ascending=False, inplace=True)
# feature_importances.head(20)

In [16]:
# Sort by importance (XGBoost)
# feature_importances.sort_values(by='XGBoost_Importance', ascending=False, inplace=True)
# feature_importances.head(20)

### L1 regularization (Logistic Regression)

In [17]:
# logreg_l1 = LogisticRegression(penalty='l1', solver='liblinear', max_iter=1000)
# logreg_l1.fit(X_train, y_train)

# # Select important features (non-zero coefficients)
# model = SelectFromModel(logreg_l1, prefit=True)
# X_train_l1 = model.transform(X_train)
# X_test_l1 = model.transform(X_test)

# # Check which features were selected
# selected_features = X_train.columns[(logreg_l1.coef_ != 0).ravel()]
# for feature in selected_features:
#     print(feature)

### Recusive Feature Elimination

In [18]:
# # Apply RFE with Logistic Regression as the estimator
# rfe = RFE(estimator=logreg_base, n_features_to_select=20, step=1)
# rfe.fit(X_train_scaled, y_train)

# # Transform the dataset with selected features
# X_train_rfe = rfe.transform(X_train)
# X_test_rfe = rfe.transform(X_test)

# print("Selected Features by RFE:", X_train.columns[rfe.support_])

In [19]:
# # Apply RFE with Logistic Regression as the estimator
# rfe = RFE(estimator=rf_base, n_features_to_select=20, step=1)
# rfe.fit(X_train, y_train)

# # Transform the dataset with selected features
# X_train_rfe = rfe.transform(X_train)
# X_test_rfe = rfe.transform(X_test)

# print("Selected Features by RFE:", X_train.columns[rfe.support_])

In [20]:
# X_train.head()