# Train Model

**INPUT**: "./data/1finalDatasetWithQualifiersWith2025.csv"

**OUTPUT**: The best XGBoost model, saved to "./models/best_xgb_model.json"

In this notebook, we take the final dataset (which contains all the tennis statistics), and we train several models with it (Random Forest, XGBoost, Neural Net). Then, we will save the best models to the models folder.

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
from sklearn import tree
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from tensorflow import keras
from tensorflow.keras import layers
pd.set_option('display.max_columns', None)

In [2]:
# Read the final feature dataset from disk; the bare last expression renders it as the cell output
DATASET_PATH = "./data/1finalDatasetWithQualifiersWith2025.csv"
final_dataset = pd.read_csv(filepath_or_buffer=DATASET_PATH)
final_dataset

Unnamed: 0,AGE_DIFF,ATP_RANK_DIFF,BEST_OF,DOMINANCE_RATIO_LAST_100_DIFF,DOMINANCE_RATIO_LAST_10_DIFF,DOMINANCE_RATIO_LAST_25_DIFF,DOMINANCE_RATIO_LAST_3_DIFF,DOMINANCE_RATIO_LAST_50_DIFF,DRAW_SIZE,ELO_DIFF,ELO_GRAD_LAST_100_DIFF,ELO_GRAD_LAST_10_DIFF,ELO_GRAD_LAST_25_DIFF,ELO_GRAD_LAST_3_DIFF,ELO_GRAD_LAST_50_DIFF,ELO_SURFACE_DIFF,H2H_DIFF,H2H_SURFACE_DIFF,HEIGHT_DIFF,N_GAMES_DIFF,P_1ST_IN_LAST_100_DIFF,P_1ST_IN_LAST_10_DIFF,P_1ST_IN_LAST_25_DIFF,P_1ST_IN_LAST_3_DIFF,P_1ST_IN_LAST_50_DIFF,P_1ST_WON_LAST_100_DIFF,P_1ST_WON_LAST_10_DIFF,P_1ST_WON_LAST_25_DIFF,P_1ST_WON_LAST_3_DIFF,P_1ST_WON_LAST_50_DIFF,P_2ND_WON_LAST_100_DIFF,P_2ND_WON_LAST_10_DIFF,P_2ND_WON_LAST_25_DIFF,P_2ND_WON_LAST_3_DIFF,P_2ND_WON_LAST_50_DIFF,P_ACE_LAST_100_DIFF,P_ACE_LAST_10_DIFF,P_ACE_LAST_25_DIFF,P_ACE_LAST_3_DIFF,P_ACE_LAST_50_DIFF,P_BP_CONV_LAST_100_DIFF,P_BP_CONV_LAST_10_DIFF,P_BP_CONV_LAST_25_DIFF,P_BP_CONV_LAST_3_DIFF,P_BP_CONV_LAST_50_DIFF,P_BP_SAVED_LAST_100_DIFF,P_BP_SAVED_LAST_10_DIFF,P_BP_SAVED_LAST_25_DIFF,P_BP_SAVED_LAST_3_DIFF,P_BP_SAVED_LAST_50_DIFF,P_DF_LAST_100_DIFF,P_DF_LAST_10_DIFF,P_DF_LAST_25_DIFF,P_DF_LAST_3_DIFF,P_DF_LAST_50_DIFF,P_RET_1ST_WON_LAST_100_DIFF,P_RET_1ST_WON_LAST_10_DIFF,P_RET_1ST_WON_LAST_25_DIFF,P_RET_1ST_WON_LAST_3_DIFF,P_RET_1ST_WON_LAST_50_DIFF,P_RET_2ND_WON_LAST_100_DIFF,P_RET_2ND_WON_LAST_10_DIFF,P_RET_2ND_WON_LAST_25_DIFF,P_RET_2ND_WON_LAST_3_DIFF,P_RET_2ND_WON_LAST_50_DIFF,P_RET_ACE_AGAINST_LAST_100_DIFF,P_RET_ACE_AGAINST_LAST_10_DIFF,P_RET_ACE_AGAINST_LAST_25_DIFF,P_RET_ACE_AGAINST_LAST_3_DIFF,P_RET_ACE_AGAINST_LAST_50_DIFF,P_RPW_LAST_100_DIFF,P_RPW_LAST_10_DIFF,P_RPW_LAST_25_DIFF,P_RPW_LAST_3_DIFF,P_RPW_LAST_50_DIFF,P_TOTAL_PTS_WON_LAST_100_DIFF,P_TOTAL_PTS_WON_LAST_10_DIFF,P_TOTAL_PTS_WON_LAST_25_DIFF,P_TOTAL_PTS_WON_LAST_3_DIFF,P_TOTAL_PTS_WON_LAST_50_DIFF,ROUND,WIN_LAST_100_DIFF,WIN_LAST_10_DIFF,WIN_LAST_25_DIFF,WIN_LAST_3_DIFF,WIN_LAST_50_DIFF,RESULT
0,-7.2,75.0,3,-15.845138,-39.330676,-9.178904,-59.159033,-18.125769,32.0,-211.593366,0.000000,-17.874093,-0.207210,14.307285,-3.013195,76.824621,-1,0,5.0,-116,-15.902727,-13.894775,-12.940863,-9.893268,-12.628535,4.048368,3.808742,7.514767,-1.862074,5.260191,-3.204322,-10.083781,-7.151226,-5.074184,-7.270503,4.529549,1.325511,3.535900,4.406969,4.891978,-2.091575,11.507950,-4.356110,-10.182595,-7.268648,-7.824691,5.168651,1.793890,-9.027778,-8.167362,3.617359,1.847687,2.442587,-0.251001,3.755507,-6.469978,-10.517683,-5.792540,-15.211593,-8.100870,3.480228,-0.987870,4.658938,-5.842850,4.643186,5.154151,1.249256,2.365538,1.338080,5.177047,-2.008793,-6.301502,-0.895563,-10.969291,-2.577068,-1.982562,-4.890390,-0.766501,-7.574704,-2.448479,3,0,-2,-1,0,-10,0
1,-1.1,-3.0,3,2.610722,20.161982,6.409340,25.528841,-3.955079,32.0,-43.750750,0.000000,4.091382,5.629173,17.392028,0.214517,-105.339020,-1,0,-3.0,-71,-8.283156,-18.915271,-16.326448,-22.229408,-12.644965,8.510611,9.353544,11.754722,15.662302,9.318523,-3.273901,-0.454363,-3.757552,2.475417,-5.629407,4.659487,9.553599,6.496802,12.203951,5.574035,-5.166865,16.471306,4.059951,8.333333,-7.730040,0.357237,3.726190,-1.308624,2.478632,-1.810867,-0.842764,1.837609,1.346516,0.130992,-0.131648,0.080127,4.846223,-1.622923,3.945585,-1.663467,-4.091451,1.887372,-1.269796,0.903879,-4.129689,-5.994425,-10.233784,-6.521785,-8.247345,-5.715742,-1.610351,2.979260,-1.365986,2.961861,-2.970097,0.099158,1.988274,0.219124,3.593832,-0.858094,3,0,1,0,0,-6,0
2,-0.5,-53.0,3,19.892313,38.265443,17.937792,37.445987,18.587063,32.0,146.556093,0.825629,6.287036,3.373340,-7.768026,-1.201430,-89.118876,0,0,2.0,57,7.654518,-0.515931,-2.421481,-3.150725,-0.070691,3.221366,4.076490,4.792238,-0.815358,5.491332,-1.129509,1.661565,2.298281,1.311339,0.804643,-0.148871,4.112933,2.523208,-0.700341,1.471952,-0.556107,4.716117,4.900655,-3.253968,0.701335,5.220766,7.015415,-2.201770,18.093712,5.401018,-0.793922,0.943919,1.275572,2.490744,-0.076410,6.185945,10.485622,4.670276,19.595667,5.524567,-3.053498,-0.190430,-4.335791,6.009995,-4.155377,-5.264861,-7.986093,-3.740706,-10.576598,-4.215122,2.166648,6.706104,0.792649,11.724203,0.917437,2.725511,4.196189,1.814689,5.751556,2.576061,1,14,1,2,0,8,1
3,4.1,568.0,3,-126.102706,-117.380271,-124.864504,-110.051901,-121.221671,32.0,-429.558032,0.000000,0.000000,0.000000,0.000000,0.000000,-492.983350,0,0,-2.0,-212,-57.230455,-57.399152,-56.806167,-51.586210,-58.609057,-69.380087,-70.400409,-69.233925,-65.709150,-68.248031,-55.238969,-57.502958,-57.659297,-55.639069,-56.187392,-3.688686,-4.876313,-3.904640,-3.549701,-3.550905,-46.253449,-47.211760,-49.004002,-36.537037,-45.949021,-61.382165,-73.726190,-67.566789,-75.888889,-65.944092,-1.928639,-1.817193,-2.063858,-4.493097,-1.833947,-33.781526,-27.192802,-32.336302,-29.244651,-33.104611,-55.312741,-53.373304,-54.426979,-55.392937,-53.456835,-4.354745,-8.267690,-4.825305,-6.742063,-4.184634,-43.005054,-38.603891,-42.009823,-39.745021,-41.769081,-53.024602,-52.015178,-53.121312,-51.181079,-52.523613,6,0,0,0,0,0,0
4,-0.5,-6.0,3,-12.695194,15.186559,3.144573,-24.724239,-13.978052,32.0,120.145899,0.000000,6.330134,8.239575,-15.113667,0.769305,7.476133,0,0,-13.0,48,-13.566963,-18.914507,-16.303570,-26.784951,-16.744633,2.746073,9.034646,4.680013,-3.328500,3.979961,1.872120,3.353560,-1.132395,1.487637,0.767482,0.965089,0.739669,1.304545,-3.870913,1.852696,-2.085760,0.554855,-0.225162,20.714286,-6.926434,-1.545360,-3.542208,-2.797993,14.074074,-2.574047,1.137708,3.103543,1.279099,2.661542,1.809401,-1.366067,-1.217351,3.776386,-7.506176,-1.898725,-3.730739,8.053369,-0.088248,1.828561,-4.088854,-1.029915,-1.262022,-3.103303,4.407753,-0.641369,-2.199220,2.826825,2.525718,-3.037689,-2.224405,-1.329042,2.064333,0.152787,-3.967319,-1.869612,6,0,2,3,-1,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102668,-5.6,-57.0,3,2.631556,-8.997661,3.861531,-9.131115,2.578763,32.0,-71.095982,-1.226678,1.402601,2.341619,-8.893055,1.435790,-104.721214,-2,-1,10.0,-247,0.325293,3.769977,1.484380,6.135855,-1.285177,4.259634,-0.935308,4.355236,-8.619475,5.601987,-1.546385,-2.815746,-2.602498,-2.093623,-2.963007,3.444942,3.960528,3.629576,5.814251,3.678330,0.469549,1.799603,9.229933,-16.666667,2.007484,6.459367,-3.388889,-1.086652,1.865079,2.861558,0.188342,0.124784,0.618027,1.128409,0.478893,0.337526,1.167678,0.447043,6.231660,-0.265516,-2.899892,-2.313433,-2.153682,5.948327,-3.216152,0.443006,-0.617236,-0.500179,-4.546666,1.394903,-1.312681,-1.333043,-1.099647,4.934149,-1.959760,0.581703,-0.622692,0.556285,-0.135928,0.394334,4,-1,0,0,1,-4,0
102669,-5.3,-8.0,3,-0.143653,7.664092,22.903036,44.284865,10.530331,32.0,-26.551416,0.000000,-22.176767,-1.914242,-16.209339,2.989653,-69.667766,1,0,-13.0,13,-2.186695,-3.859333,-3.161251,-2.631031,-2.989829,-0.419049,7.677312,3.783141,8.091122,2.259046,3.245652,-3.895296,3.667194,7.564945,5.109455,-0.406990,3.224531,0.457490,4.180141,0.575910,1.492672,-12.968254,-3.816149,-20.171958,-1.404148,9.485665,2.507506,8.743413,26.579254,9.965840,0.185332,1.038387,0.183797,0.718414,0.114645,0.225139,-2.299736,1.774803,-3.333128,0.661188,-1.746772,-2.323944,3.833072,9.460526,1.448889,1.739188,3.930572,1.853255,10.523895,1.886322,-0.364291,-2.559565,2.465604,-0.584231,1.031693,0.140275,-0.763495,2.158491,2.819065,1.554990,4,0,-1,4,-1,8,1
102670,0.2,5.0,3,4.117980,3.754730,-1.907896,-15.299200,-4.513966,32.0,18.263165,1.983066,-4.154282,-1.164002,1.317687,-0.032567,89.476990,0,0,0.0,-157,-0.524543,3.030797,2.800298,7.053430,0.015655,0.487414,-1.121219,-0.707674,-6.228711,0.336289,3.085963,-0.432816,-0.306020,-9.612649,1.758743,0.545020,0.975959,-0.390308,0.666744,-0.098185,1.389236,3.898990,4.380580,-13.359788,3.112315,0.893947,-3.564796,4.240184,29.215229,-1.941369,-1.718355,-3.144466,-2.961215,-1.378765,-2.360997,-2.760694,-1.850002,-4.942789,-2.276993,-5.438104,2.679601,1.267758,3.771304,0.184805,0.914017,2.180130,0.988488,2.985715,0.752376,3.396630,-0.514156,-0.084194,-0.832424,2.363894,-2.654563,0.507982,0.056507,0.095648,-1.343491,-0.624542,3,7,1,2,0,3,1
102671,15.1,56.0,3,-0.479055,3.656698,-17.001540,-11.902286,-9.975041,32.0,118.378588,-5.666326,28.609292,1.554072,1.338212,-3.254790,158.250375,0,0,-2.0,554,1.944150,4.277369,5.486980,2.729990,3.935973,-3.076011,-3.170245,-5.679511,-4.892657,-5.278965,0.554783,11.019339,3.643445,7.692308,1.251667,-2.803015,-4.106785,-4.174216,-6.896732,-3.870809,1.833985,12.688492,-1.078140,21.190476,1.156955,-9.295639,1.401376,-5.183828,-2.904040,-5.687066,-0.528036,-1.505603,-1.265585,-2.716327,-0.792725,1.528331,-0.068385,-0.277014,7.158614,1.244755,1.954450,1.001831,-3.151109,-15.562310,-1.262148,-2.121000,-2.155551,-1.439483,-2.871337,-2.430899,2.085710,1.779685,-1.479232,0.882604,0.470704,0.054426,2.027271,-1.122836,0.445009,-0.969434,3,-1,1,-3,0,-1,1


## Split Training vs Testing Data

We'll shuffle the data, and do a 95% / 5% split between training and testing data.

In [3]:
# Seed the shuffle so the train/test partition (and every score reported below)
# is reproducible after a kernel restart.
SEED = 42
rng = np.random.default_rng(SEED)

# Convert data to numpy (object dtype, as before) and reorder the rows with a seeded permutation
data = final_dataset.to_numpy(dtype=object)
data = data[rng.permutation(len(data))]

# Split the shuffled rows: the first 95% are used for training, the remaining 5% for testing
split = 0.95
value = round(split*len(data))

data_train = data[:value,:]
data_test = data[value:,:]

print("Training Data: "+str(data_train.shape))
print("Testing Data: "+str(data_test.shape))

Training Data: (97539, 87)
Testing Data: (5134, 87)


We map the result column to string labels. (scikit-learn classifiers accept numeric labels too; the strings simply make outputs such as the exported decision tree easier to read.)

In [4]:
# Converters between the numeric RESULT value and a readable class label
def _to_label(result):
    """Return "Player 2 Wins" for a result equal to 0, otherwise "Player 1 Wins"."""
    if result == 0:
        return "Player 2 Wins"
    return "Player 1 Wins"


def _to_result(label):
    """Return 0 for the label "Player 2 Wins", otherwise 1."""
    if label == "Player 2 Wins":
        return 0
    return 1


mapper = np.vectorize(_to_label)
reverse_mapper = np.vectorize(_to_result)

# Features: every column except the last one
x_train, x_test = data_train[:, :-1], data_test[:, :-1]

# Targets: the last column, converted to string labels.
# Despite the "pred" in the names, these hold the true outcomes, not model predictions.
y_pred_train = np.squeeze(mapper(data_train[:, -1:]))
y_pred_test = np.squeeze(mapper(data_test[:, -1:]))

## Train Models

### Train Simple Decision Tree

We can start by training a really simple decision tree (max_depth=4) to see how good it is.

In [5]:
# Build and fit a shallow decision tree (depth capped at 4) as a quick baseline
decision_sklearn = DecisionTreeClassifier(max_depth=4).fit(x_train, y_pred_train)

# Predict on both partitions, then report the accuracy of each
predictions_train = decision_sklearn.predict(x_train)
predictions_test = decision_sklearn.predict(x_test)
train_accuracy = accuracy_score(y_pred_train, predictions_train)
test_accuracy = accuracy_score(y_pred_test, predictions_test)
print(f"Train Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")

Train Accuracy: 0.6539743077128123
Test Accuracy: 0.6581612777561355


In [6]:
# Render the fitted tree's decision rules as indented text.
# Feature names are every column except the trailing target column. They are passed
# as a plain list because some older scikit-learn releases truth-test `feature_names`,
# which is ambiguous (raises) for a pandas Index.
feature_names = list(final_dataset.columns[:-1])
text_representation = tree.export_text(decision_sklearn, feature_names=feature_names)
print(text_representation)

|--- ELO_DIFF <= 3.96
|   |--- ELO_SURFACE_DIFF <= -163.22
|   |   |--- ELO_DIFF <= -311.32
|   |   |   |--- ELO_SURFACE_DIFF <= -397.42
|   |   |   |   |--- class: Player 2 Wins
|   |   |   |--- ELO_SURFACE_DIFF >  -397.42
|   |   |   |   |--- class: Player 2 Wins
|   |   |--- ELO_DIFF >  -311.32
|   |   |   |--- P_TOTAL_PTS_WON_LAST_50_DIFF <= -2.59
|   |   |   |   |--- class: Player 2 Wins
|   |   |   |--- P_TOTAL_PTS_WON_LAST_50_DIFF >  -2.59
|   |   |   |   |--- class: Player 2 Wins
|   |--- ELO_SURFACE_DIFF >  -163.22
|   |   |--- ATP_RANK_DIFF <= 12.50
|   |   |   |--- ELO_SURFACE_DIFF <= -97.18
|   |   |   |   |--- class: Player 2 Wins
|   |   |   |--- ELO_SURFACE_DIFF >  -97.18
|   |   |   |   |--- class: Player 1 Wins
|   |   |--- ATP_RANK_DIFF >  12.50
|   |   |   |--- ELO_SURFACE_DIFF <= 27.53
|   |   |   |   |--- class: Player 2 Wins
|   |   |   |--- ELO_SURFACE_DIFF >  27.53
|   |   |   |   |--- class: Player 2 Wins
|--- ELO_DIFF >  3.96
|   |--- ELO_DIFF <= 172.48
|   | 

As we can see in the output, the tree relies almost entirely on ELO (with only a little ATP rank and recent points won), which we don't really want (since otherwise we could just predict using ELO alone).

Let's see if a Random Forest works better :)

### Train Random Forest

We start by training a pretty big random forest (n_estimators=500)

In [7]:
# Instantiate a Random Forest.
# random_state pins the bootstrap and feature sampling so the scores below are reproducible;
# n_jobs=-1 builds the 500 trees in parallel on all available cores.
forest_sklearn = RandomForestClassifier(
    n_estimators=500,
    max_depth=10,
    max_features="sqrt",
    bootstrap=True,
    random_state=42,
    n_jobs=-1,
)
forest_sklearn = forest_sklearn.fit(x_train, y_pred_train)

# Make predictions and test accuracy
predictions_train = forest_sklearn.predict(x_train)
predictions_test = forest_sklearn.predict(x_test)
print("Train Accuracy: "+str(accuracy_score(y_pred_train, predictions_train)))
print("Test Accuracy: "+str(accuracy_score(y_pred_test, predictions_test)))

Train Accuracy: 0.7084038179599955
Test Accuracy: 0.6733541098558629


That's a slight improvement :). Let's try a simpler less overfitted model.

In [8]:
# Instantiate a smaller, more heavily regularised Random Forest
# (fewer and shallower trees, with large minimum split / leaf sizes).
# random_state makes the run reproducible; n_jobs=-1 trains the trees in parallel.
forest_sklearn2 = RandomForestClassifier(
    n_estimators=100,
    max_depth=7,
    min_samples_split=400,
    min_samples_leaf=250,
    max_features="sqrt",
    bootstrap=True,
    random_state=42,
    n_jobs=-1,
)
forest_sklearn2 = forest_sklearn2.fit(x_train, y_pred_train)

# Make predictions and test accuracy
predictions_train = forest_sklearn2.predict(x_train)
predictions_test = forest_sklearn2.predict(x_test)
print("Train Accuracy: "+str(accuracy_score(y_pred_train, predictions_train)))
print("Test Accuracy: "+str(accuracy_score(y_pred_test, predictions_test)))

Train Accuracy: 0.6674150852479521
Test Accuracy: 0.6725749902610051


Seems like accuracy kinda decreased. I'm going to run a quick GridSearch to see if we could improve this. Let's see if we can find the best hyperparameters :)

In [None]:
# NOTE: this grid search is slow (48 parameter combinations x 5 folds = 240 fits), so it is
# left commented out. Uncomment the lines below to re-run it. The author reports having
# already run it, with the results stored in the models folder.
# param_grid = {
#     'n_estimators': [100, 300],
#     'max_depth': [5, 10, 15],
#     'min_samples_split': [10, 20],
#     'min_samples_leaf': [5, 10],
#     'max_features': ['sqrt', 'log2']
# }

# grid_search = GridSearchCV(
#     estimator=RandomForestClassifier(),
#     param_grid=param_grid,
#     cv=5, 
#     n_jobs=-1,
#     verbose=4
# )
# grid_search.fit(x_train, y_pred_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV 1/5] END max_depth=5, max_features=sqrt, min_samples_leaf=5, min_samples_split=10, n_estimators=100;, score=0.667 total time=  20.3s
[CV 2/5] END max_depth=5, max_features=sqrt, min_samples_leaf=5, min_samples_split=10, n_estimators=100;, score=0.667 total time=  20.6s
[CV 3/5] END max_depth=5, max_features=sqrt, min_samples_leaf=5, min_samples_split=10, n_estimators=100;, score=0.663 total time=  20.6s
[CV 4/5] END max_depth=5, max_features=sqrt, min_samples_leaf=5, min_samples_split=10, n_estimators=100;, score=0.661 total time=  21.1s
[CV 5/5] END max_depth=5, max_features=sqrt, min_samples_leaf=5, min_samples_split=10, n_estimators=100;, score=0.669 total time=  21.0s
[CV 1/5] END max_depth=5, max_features=sqrt, min_samples_leaf=5, min_samples_split=20, n_estimators=100;, score=0.666 total time=  21.7s
[CV 2/5] END max_depth=5, max_features=sqrt, min_samples_leaf=5, min_samples_split=20, n_estimators=100;, score=0.66

In [9]:
# Best parameters (only available when the grid search has been re-run in this session)
# print("Best Parameters:", grid_search.best_params_)
# print("Best Score:", grid_search.best_score_)

# Instantiate a Random Forest with hard-coded hyperparameters
# (presumably the best ones from the earlier grid-search run, since that search is commented out).
# random_state makes the run reproducible; n_jobs=-1 trains the trees in parallel.
best_forest_model = RandomForestClassifier(
    max_depth=15,
    max_features='log2',
    min_samples_leaf=5,
    min_samples_split=20,
    n_estimators=300,
    random_state=42,
    n_jobs=-1,
)
best_forest_model = best_forest_model.fit(x_train, y_pred_train)

# Make predictions and test accuracy
predictions_train = best_forest_model.predict(x_train)
predictions_test = best_forest_model.predict(x_test)
print("Train Accuracy: "+str(accuracy_score(y_pred_train, predictions_train)))
print("Test Accuracy: "+str(accuracy_score(y_pred_test, predictions_test)))

Train Accuracy: 0.8020176544766708
Test Accuracy: 0.6729645500584339


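The GridSearchCV wasn't that successful: with its best parameters the test accuracy (about 0.673) is no better than the first random forest's, while the train accuracy jumped to about 0.80, which points to overfitting. Let's train an XGBoost model and see if it does better.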

### Train XGBoost Algorithm

Let's try with XGBoost and see if we can get better results.

In [10]:
# Hand-picked hyperparameters for the gradient-boosted model
params = dict(
    n_estimators=250,
    learning_rate=0.04,
    max_depth=5,
    subsample=0.9,
    colsample_bytree=0.95,
    gamma=0.2,
    reg_alpha=0.5,
    reg_lambda=5,
)

xgb_model = XGBClassifier(
    objective="binary:logistic",
    eval_metric="auc",
    tree_method="hist",
    random_state=42,
    **params,
)

# The model is trained and scored on 0/1 targets, so convert the string labels back once
y_train_binary = reverse_mapper(y_pred_train)
y_test_binary = reverse_mapper(y_pred_test)

# Fit on the whole training set (no eval_set is passed, so no early stopping takes place)
xgb_model.fit(x_train, y_train_binary)

# Predict on both partitions
predictions_train = xgb_model.predict(x_train)
predictions_test = xgb_model.predict(x_test)

# Report accuracy against the 0/1 targets
train_accuracy = accuracy_score(y_train_binary, predictions_train)
test_accuracy = accuracy_score(y_test_binary, predictions_test)
print(f"Train Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")

Train Accuracy: 0.687427593065338
Test Accuracy: 0.6791975068172964


In [11]:
# Pair each feature name (every column but the last) with the importance reported by the model
feature_names = final_dataset.columns[:-1]
importance_pairs = list(zip(feature_names, list(xgb_model.feature_importances_)))

# Order the pairs from most to least important
sorted_mapped_results = sorted(importance_pairs, key=lambda pair: pair[1], reverse=True)

# Split the ordered pairs back into parallel lists
sorted_labels = [pair[0] for pair in sorted_mapped_results]
sorted_importances = [pair[1] for pair in sorted_mapped_results]

# Print one "name: importance" line per feature, to four decimal places
for name, score in sorted_mapped_results:
    print(f"{name}: {score:.4f}")


ELO_DIFF: 0.2596
ELO_SURFACE_DIFF: 0.1060
ATP_RANK_DIFF: 0.0321
P_TOTAL_PTS_WON_LAST_100_DIFF: 0.0256
DOMINANCE_RATIO_LAST_100_DIFF: 0.0255
DOMINANCE_RATIO_LAST_50_DIFF: 0.0170
DOMINANCE_RATIO_LAST_25_DIFF: 0.0154
P_TOTAL_PTS_WON_LAST_50_DIFF: 0.0147
DOMINANCE_RATIO_LAST_10_DIFF: 0.0126
AGE_DIFF: 0.0120
DOMINANCE_RATIO_LAST_3_DIFF: 0.0118
BEST_OF: 0.0117
N_GAMES_DIFF: 0.0095
P_TOTAL_PTS_WON_LAST_10_DIFF: 0.0092
P_ACE_LAST_3_DIFF: 0.0085
ELO_GRAD_LAST_50_DIFF: 0.0085
P_1ST_WON_LAST_100_DIFF: 0.0084
WIN_LAST_10_DIFF: 0.0082
ELO_GRAD_LAST_100_DIFF: 0.0081
H2H_SURFACE_DIFF: 0.0080
P_1ST_WON_LAST_3_DIFF: 0.0080
WIN_LAST_3_DIFF: 0.0079
WIN_LAST_100_DIFF: 0.0077
H2H_DIFF: 0.0075
P_TOTAL_PTS_WON_LAST_25_DIFF: 0.0074
P_2ND_WON_LAST_10_DIFF: 0.0074
P_TOTAL_PTS_WON_LAST_3_DIFF: 0.0071
P_DF_LAST_3_DIFF: 0.0069
P_2ND_WON_LAST_25_DIFF: 0.0068
P_ACE_LAST_10_DIFF: 0.0068
P_1ST_WON_LAST_10_DIFF: 0.0067
P_2ND_WON_LAST_50_DIFF: 0.0067
P_2ND_WON_LAST_3_DIFF: 0.0066
P_1ST_WON_LAST_50_DIFF: 0.0064
P_RET_ACE

That's slightly better. Let's run a gridsearch to really make sure.

In [None]:
# Grid search over XGBoost hyperparameters (32 combinations x 5 folds = 160 fits).
# Left commented out; uncomment to re-run.
# NOTE(review): un-commenting this rebinds `xgb_model` to a fresh default XGBClassifier,
# replacing the hand-tuned model fitted in the earlier cell.
# # Define parameter grid with all specified parameters
# param_grid = {
#     'n_estimators': [100, 300],
#     'max_depth': [5, 10],
#     'learning_rate': [0.01, 0.05],
#     'subsample': [0.7],
#     'colsample_bytree': [0.6],
#     'reg_alpha': [0.1, 0.5],
#     'reg_lambda': [0.5, 1.0]
# }

# # Instantiate an XGBoost Classifier
# xgb_model = XGBClassifier()

# # Perform GridSearchCV
# grid_search = GridSearchCV(
#     estimator=xgb_model, 
#     param_grid=param_grid, 
#     scoring='accuracy', 
#     cv=5, 
#     verbose=3, 
#     n_jobs=-1
# )
# grid_search.fit(x_train, reverse_mapper(y_pred_train))

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV 3/5] END colsample_bytree=0.6, learning_rate=0.01, max_depth=5, n_estimators=100, reg_alpha=0.1, reg_lambda=1.0, subsample=0.7;, score=0.665 total time=   2.9s
[CV 5/5] END colsample_bytree=0.6, learning_rate=0.01, max_depth=5, n_estimators=100, reg_alpha=0.1, reg_lambda=0.5, subsample=0.7;, score=0.670 total time=   3.0s
[CV 4/5] END colsample_bytree=0.6, learning_rate=0.01, max_depth=5, n_estimators=100, reg_alpha=0.1, reg_lambda=0.5, subsample=0.7;, score=0.665 total time=   3.0s
[CV 1/5] END colsample_bytree=0.6, learning_rate=0.01, max_depth=5, n_estimators=100, reg_alpha=0.1, reg_lambda=0.5, subsample=0.7;, score=0.668 total time=   3.0s
[CV 2/5] END colsample_bytree=0.6, learning_rate=0.01, max_depth=5, n_estimators=100, reg_alpha=0.1, reg_lambda=0.5, subsample=0.7;, score=0.672 total time=   3.1s
[CV 2/5] END colsample_bytree=0.6, learning_rate=0.01, max_depth=5, n_estimators=100, reg_alpha=0.1, reg_lambda=1.0, s

In [None]:
# Evaluate the best estimator found by the XGBoost grid search.
# Left commented out because it needs a fitted `grid_search`, and the search itself is commented out.
# # Best parameters
# print("Best Parameters:", grid_search.best_params_)

# # Train the best model
# best_xgb_model = grid_search.best_estimator_

# # Make predictions
# predictions_train = best_xgb_model.predict(x_train)
# predictions_test = best_xgb_model.predict(x_test)

# # Calculate accuracy
# print("Train Accuracy:", accuracy_score(reverse_mapper(y_pred_train), predictions_train))
# print("Test Accuracy:", accuracy_score(reverse_mapper(y_pred_test), predictions_test))

Best Parameters: {'colsample_bytree': 0.6, 'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 100, 'reg_alpha': 0.5, 'reg_lambda': 0.5, 'subsample': 0.7}
Train Accuracy: 0.6818365248180789
Test Accuracy: 0.6672322218943642


In [None]:
# Persist the grid-searched XGBoost model to the models folder.
# Left commented out because it needs a fitted `grid_search`, and the search itself is commented out.
# NOTE(review): in the saved outputs the grid-searched model scores lower on the test set (~0.667)
# than the hand-tuned `xgb_model` (~0.679); the runs may have used different shuffles, so confirm
# which model should actually be saved.
# best_xgb_model = grid_search.best_estimator_
# best_xgb_model.save_model("./models/best_xgb_model.json")

See the next notebook (3.Predict.ipynb) for this.