In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
from sklearn import tree
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, mean_absolute_error
from utils.updateStats import getStats, updateStats, createStats
pd.set_option('display.max_columns', None)

In [None]:
DATASET = "./data/0cleanDatasetWithQualifiersWith2025.csv"
FILTER_NUM = 10000
CHALLENGERS = False

update_stats_param = {
    "k_factor": None,
    "base_k_factor": 43,
    "max_k_factor": 62,
    "div_number": 800,
    "bonus_after_layoff": True,
}

In [3]:
allData = pd.read_csv(DATASET)

if CHALLENGERS:
    allDataNoChallengers = allData
else:
    allDataNoChallengers = allData[allData["tourney_level"].astype(str).isin(["A", "G", "M", "F", "D", "O"])]

###### Create Dataset ######
clean_data = allDataNoChallengers[~allDataNoChallengers["tourney_date"].astype(str).str.contains("2025")]
clean_data = clean_data.reset_index(drop=True)

model = XGBClassifier()
model.load_model("./models/best_final_xgb_model.json") # 67

  allData = pd.read_csv(DATASET)


In [4]:
############################################################################################################################################
###### EVALUTAION #######
############################################################################################################################################
print("\nStarting evaluation...")

def predict_twice_average(player1: dict, player2: dict, match: dict, xgb_model, prev_stats):
    """
    Returns the probability of player 1 winning
    """
    p1_prob = []
    p2_prob = []

    # Call getStatsPlayers function
    output = getStats(player1, player2, match, prev_stats)
    match_data = pd.DataFrame([dict(sorted(output.items()))])
    probs = xgb_model.predict_proba(np.array(match_data, dtype=object))[:, ::-1]

    p1_prob.append(probs[0][0])
    p2_prob.append(probs[0][1])

    output = getStats(player2, player1, match, prev_stats)
    match_data = pd.DataFrame([dict(sorted(output.items()))])
    probs = xgb_model.predict_proba(np.array(match_data, dtype=object))[:, ::-1]

    p1_prob.append(probs[0][1])
    p2_prob.append(probs[0][0])

    return round(float(np.mean(p1_prob)), 4)

def run_evaluation(xgb_model, evaulation_data, prev_stats):
    predictions = []
    elo_predictions = []
    probabilities = []
    results = []
    counter = 0

    evaulation_data = evaulation_data[evaulation_data["tourney_date"].astype(str).str.contains("2025")]
    for index, row in tqdm(evaulation_data.iterrows(), total=len(evaulation_data)):
        player1 = {
            "ID": row["p1_id"],
            "ATP_RANK": row["p1_rank"],
            "AGE": row["p1_age"],
            "HEIGHT": row["p1_ht"],
        }

        player2 = {
            "ID": row["p2_id"],
            "ATP_RANK": row["p2_rank"],
            "AGE": row["p2_age"],
            "HEIGHT": row["p2_ht"],
        }

        match = {
            "BEST_OF": row["best_of"],
            "DRAW_SIZE": row["draw_size"],
            "SURFACE": row["surface"],
            "ROUND": row["round"],
        }

        ########## PREDICT ##########
        if row["tourney_level"] in ["A", "G", "M", "F", "D", "O"] and row["round"] not in ["Q1", "Q2", "Q3"]:
            # Baseline accuracy
            if prev_stats["elo_players"][row["p1_id"]] >= prev_stats["elo_players"][row["p2_id"]]:
                elo_predictions.append(1)
            else:
                elo_predictions.append(0)
            
            prob_prediction = predict_twice_average(player1, player2, match, xgb_model, prev_stats)
            predictions.append(1) if prob_prediction >= 0.5 else predictions.append(0)
            probabilities.append(prob_prediction)
            counter += 1
            
            # Save result to compare
            results.append(row["RESULT"])

        # Update the stats of the match after it has been predicted!
        prev_stats = updateStats(row, prev_stats, **update_stats_param)

    print (
        f"EVALUATION RESULTS:\n"
        f"Evaluated {counter} matches...\n"
        f"Baseline ELO Accuracy: {accuracy_score(elo_predictions, results)}\n"
        f"Accuracy Score: {accuracy_score(predictions, results)}\n"
        f"MAE: {mean_absolute_error(predictions, results)}"
    )
    
    return accuracy_score(predictions, results), mean_absolute_error(predictions, results)

# Run your custom sequential evaluation on 2025 validation set
allData_upto_2025 = allData[~allData["tourney_date"].astype(str).str.contains("2025")]

prev_stats_eval = createStats()

# Update up until end of 2024
for index, row in tqdm(allData_upto_2025.iterrows(), total=len(allData_upto_2025)):
    ########## UPDATE STATS ##########
    prev_stats_eval = updateStats(row, prev_stats_eval, **update_stats_param)

score, mae = run_evaluation(xgb_model=model, evaulation_data=allData, prev_stats=prev_stats_eval)
print(f"score2025={score:.4f} "
    f"mae={mae:.4f}")



Starting evaluation...


100%|██████████| 191250/191250 [00:10<00:00, 17955.41it/s]
100%|██████████| 6013/6013 [00:03<00:00, 1859.25it/s]

EVALUATION RESULTS:
Evaluated 1386 matches...
Baseline ELO Accuracy: 0.652958152958153
Accuracy Score: 0.6702741702741702
MAE: 0.3297258297258297
score2025=0.6703 mae=0.3297





## Test Wimbledon Accuracy

In [5]:
wimbledonData = pd.read_csv("data/allWimbledon2025.csv")
wimbledonData

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,p1_id,p1_seed,p1_entry,p1_name,p1_hand,p1_ht,p1_ioc,p1_age,p2_id,p2_seed,p2_entry,p2_name,p2_hand,p2_ht,p2_ioc,p2_age,score,best_of,round,minutes,p1_ace,p1_df,p1_svpt,p1_1stIn,p1_1stWon,p1_2ndWon,p1_SvGms,p1_bpSaved,p1_bpFaced,p2_ace,p2_df,p2_svpt,p2_1stIn,p2_1stWon,p2_2ndWon,p2_SvGms,p2_bpSaved,p2_bpFaced,p1_rank,p2_rank,RESULT
0,2025-06-30_Wimbledon_2025-540-700_105916_200514,Wimbledon,Grass,128,G,2025-06-30,2025-540-700,200514,,,Jurij Rodionov,L,191.0,AUT,26.1,105916,,,Marton Fucsovics,R,188.0,HUN,33.4,6-4 6-3,3,Q1,72.0,3.0,2.0,60.0,34.0,24.0,13.0,9.0,5.0,7.0,6.0,1.0,56.0,35.0,29.0,13.0,10.0,0.0,0.0,195.0,105.0,0
1,2025-06-30_Wimbledon_2025-540-701_208260_209903,Wimbledon,Grass,128,G,2025-06-30,2025-540-701,209903,,,Lukas Neumayer,R,183.0,AUT,22.8,208260,,,Zachary Svajda,R,175.0,USA,22.6,6-2 6-3,3,Q1,62.0,2.0,1.0,64.0,48.0,32.0,4.0,8.0,9.0,12.0,12.0,2.0,51.0,35.0,28.0,12.0,9.0,3.0,3.0,169.0,229.0,0
2,2025-06-30_Wimbledon_2025-540-702_123828_210136,Wimbledon,Grass,128,G,2025-06-30,2025-540-702,123828,,,Jan Choinski,R,196.0,GBR,29.1,210136,,,Mark Lajal,R,191.0,EST,22.1,3-6 6-4 6-3,3,Q1,117.0,7.0,0.0,85.0,63.0,42.0,12.0,14.0,5.0,8.0,9.0,3.0,93.0,63.0,44.0,16.0,14.0,3.0,5.0,200.0,168.0,0
3,2025-06-30_Wimbledon_2025-540-703_208210_209263,Wimbledon,Grass,128,G,2025-06-30,2025-540-703,208210,,,Chris Rodesch,R,198.0,LUX,24.0,209263,,,Matteo Gigante,L,180.0,ITA,23.5,6-2 6-4,3,Q1,74.0,7.0,1.0,60.0,42.0,30.0,11.0,9.0,6.0,6.0,1.0,8.0,64.0,38.0,23.0,12.0,9.0,4.0,7.0,163.0,133.0,1
4,2025-06-30_Wimbledon_2025-540-704_207680_209262,Wimbledon,Grass,128,G,2025-06-30,2025-540-704,209262,,,Tristan Schoolkate,R,183.0,AUS,24.3,207680,,,Facundo Diaz Acosta,L,183.0,ARG,24.5,6-7(4) 6-1 6-3,3,Q1,115.0,5.0,1.0,85.0,56.0,44.0,15.0,14.0,2.0,4.0,7.0,3.0,83.0,56.0,38.0,11.0,14.0,1.0,7.0,104.0,199.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
229,2025-06-30_Wimbledon_2025-540-222_111575_126203,Wimbledon,Grass,128,G,2025-06-30,2025-540-222,126203,,,Taylor Fritz,R,196.0,USA,27.7,111575,,,Karen Khachanov,R,198.0,RUS,29.1,6-3 6-4 1-6 7-6(4),5,QF,156.0,16.0,2.0,105.0,67.0,54.0,22.0,19.0,1.0,4.0,6.0,3.0,119.0,66.0,48.0,28.0,19.0,5.0,8.0,5.0,20.0,1
230,2025-06-30_Wimbledon_2025-540-223_111815_207989,Wimbledon,Grass,128,G,2025-06-30,2025-540-223,207989,,,Carlos Alcaraz,R,183.0,ESP,22.2,111815,,,Cameron Norrie,L,188.0,GBR,29.9,6-2 6-3 6-3,5,QF,99.0,13.0,4.0,76.0,55.0,49.0,7.0,13.0,5.0,5.0,3.0,3.0,83.0,57.0,36.0,9.0,13.0,6.0,11.0,2.0,61.0,1
231,2025-06-30_Wimbledon_2025-540-224_104925_206173,Wimbledon,Grass,128,G,2025-06-30,2025-540-224,104925,,,Novak Djokovic,R,188.0,SRB,38.1,206173,,,Jannik Sinner,R,191.0,ITA,23.9,6-3 6-3 6-4,5,SF,115.0,12.0,0.0,92.0,62.0,48.0,5.0,14.0,5.0,10.0,12.0,2.0,73.0,54.0,44.0,12.0,14.0,1.0,2.0,6.0,1.0,0
232,2025-06-30_Wimbledon_2025-540-225_126203_207989,Wimbledon,Grass,128,G,2025-06-30,2025-540-225,126203,,,Taylor Fritz,R,196.0,USA,27.7,207989,,,Carlos Alcaraz,R,183.0,ESP,22.2,6-4 5-7 6-3 7-6(6),5,SF,169.0,19.0,6.0,121.0,75.0,60.0,26.0,22.0,4.0,7.0,13.0,3.0,113.0,73.0,64.0,23.0,21.0,1.0,2.0,5.0,2.0,0


In [6]:
# Run your custom sequential evaluation on 2025 validation set
prev_stats_eval_wimbledon = createStats()

# Update up until end of 2024
for index, row in tqdm(allData.iterrows(), total=len(allData)):
    ########## UPDATE STATS ##########
    prev_stats_eval_wimbledon = updateStats(row, prev_stats_eval_wimbledon, **update_stats_param)

score, mae = run_evaluation(xgb_model=model, evaulation_data=wimbledonData, prev_stats=prev_stats_eval_wimbledon)
print(f"score2025={score:.4f} "
    f"mae={mae:.4f}")

100%|██████████| 197263/197263 [00:11<00:00, 17669.73it/s]
100%|██████████| 234/234 [00:00<00:00, 810.55it/s] 

EVALUATION RESULTS:
Evaluated 127 matches...
Baseline ELO Accuracy: 0.6692913385826772
Accuracy Score: 0.6771653543307087
MAE: 0.3228346456692913
score2025=0.6772 mae=0.3228





In [10]:
# Run your custom sequential evaluation on 2025 validation set
prev_stats_eval_final_wimbledon = createStats()

# Update up until end of 2024
for index, row in tqdm(allData.iterrows(), total=len(allData)):
    ########## UPDATE STATS ##########
    prev_stats_eval_final_wimbledon = updateStats(row, prev_stats_eval_final_wimbledon, **update_stats_param)
    
for index, row in tqdm(wimbledonData[:-1].iterrows(), total=len(wimbledonData[:-1])):
    ########## UPDATE STATS ##########
    prev_stats_eval_final_wimbledon = updateStats(row, prev_stats_eval_final_wimbledon, **update_stats_param)

100%|██████████| 197263/197263 [00:11<00:00, 17815.32it/s]
100%|██████████| 233/233 [00:00<00:00, 6022.44it/s]


In [11]:
player1 = {
    "Name": "Jannik Sinner",
    "ID": 206173,
    "ATP_POINTS": 11000,
    "ATP_RANK": 1,
    "AGE": 24, 
    "HEIGHT": 191,
}

player2 = {
    "Name": "Carlos Alcaraz",
    "ID": 207989,
    "ATP_POINTS": 5000,
    "ATP_RANK": 2,
    "AGE": 22,
    "HEIGHT": 183,
}

match = {
    "BEST_OF": 5,
    "DRAW_SIZE": 128,
    "SURFACE": "Grass",
    "ROUND": "F"
}

prob_prediction = predict_twice_average(player1, player2, match, model, prev_stats_eval_final_wimbledon)

This is the probability of Jannik Sinner winning in the Wimbledon final 2025. It predicts Sinner has a 46.71% chance of winning

In [12]:
prob_prediction

0.4671

## Explore bookies accuracy on Wimbledon

In [13]:
bets2025 = pd.read_csv("data/bets2025.csv")
wimbledon2025bets = bets2025[bets2025["Tournament"].astype(str).str.contains("Wimbledon")]

In [14]:
wimbledon2025bets

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,Loser,WRank,LRank,WPts,LPts,W1,L1,W2,L2,W3,L3,W4,L4,W5,L5,Wsets,Lsets,Comment,B365W,B365L,PSW,PSL,MaxW,MaxL,AvgW,AvgL
1493,36,London,Wimbledon,2025-06-30,Grand Slam,Outdoor,Grass,1st Round,5,Tarvet O.,Riedi L.,733,503.0,36,77.0,6.0,4.0,6.0,4.0,6.0,4.0,,,,,3.0,0.0,Completed,2.10,1.75,2.12,1.79,2.18,1.84,2.06,1.75
1494,36,London,Wimbledon,2025-06-30,Grand Slam,Outdoor,Grass,1st Round,5,Lehecka J.,Dellien H.,25,79.0,1965,751.0,4.0,6.0,6.0,2.0,6.0,2.0,7.0,6.0,,,3.0,1.0,Completed,1.02,15.00,,,1.02,55.00,1.01,22.38
1495,36,London,Wimbledon,2025-06-30,Grand Slam,Outdoor,Grass,1st Round,5,Thompson J.,Kopriva V.,44,78.0,1200,757.0,3.0,6.0,4.0,6.0,6.0,3.0,7.0,6.0,6.0,1.0,3.0,2.0,Completed,1.40,3.00,1.43,3.03,1.43,3.35,1.38,3.02
1496,36,London,Wimbledon,2025-06-30,Grand Slam,Outdoor,Grass,1st Round,5,Mannarino A.,O Connell C.,123,77.0,477,775.0,6.0,2.0,6.0,4.0,6.0,3.0,,,,,3.0,0.0,Completed,1.36,2.90,1.43,3.03,1.45,3.15,1.40,2.93
1497,36,London,Wimbledon,2025-06-30,Grand Slam,Outdoor,Grass,1st Round,5,Bellucci M.,Crawford O.,73,248.0,864,229.0,6.0,7.0,6.0,3.0,6.0,4.0,6.0,4.0,,,3.0,1.0,Completed,1.30,3.60,1.30,3.81,1.34,3.85,1.30,3.54
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1615,36,London,Wimbledon,2025-07-09,Grand Slam,Outdoor,Grass,Quarterfinals,5,Sinner J.,Shelton B.,1,10.0,10430,3130.0,7.0,6.0,6.0,4.0,6.0,4.0,,,,,3.0,0.0,Completed,1.25,3.60,1.32,3.60,1.34,3.90,1.29,3.58
1616,36,London,Wimbledon,2025-07-09,Grand Slam,Outdoor,Grass,Quarterfinals,5,Djokovic N.,Cobolli F.,6,24.0,4630,2035.0,6.0,7.0,6.0,2.0,7.0,5.0,6.0,4.0,,,3.0,1.0,Completed,1.10,7.00,1.11,8.33,1.12,8.60,1.09,7.53
1617,36,London,Wimbledon,2025-07-11,Grand Slam,Outdoor,Grass,Semifinals,5,Alcaraz C.,Fritz T.,2,5.0,9300,4635.0,6.0,4.0,5.0,7.0,6.0,3.0,7.0,6.0,,,3.0,1.0,Completed,1.17,4.75,1.20,5.13,1.24,5.25,1.17,4.98
1618,36,London,Wimbledon,2025-07-11,Grand Slam,Outdoor,Grass,Semifinals,5,Sinner J.,Djokovic N.,1,6.0,10430,4630.0,6.0,3.0,6.0,3.0,6.0,4.0,,,,,3.0,0.0,Completed,1.40,2.80,1.47,2.93,1.47,3.05,1.42,2.85


In [15]:
# Make a working copy and ensure odds are numeric
df = wimbledon2025bets.copy()
df[["B365W","B365L"]] = df[["B365W","B365L"]].apply(pd.to_numeric, errors="coerce")

# Implied probability of the pre-match favorite (1 / min odds)
df["fav_prob"] = 1 / df[["B365W","B365L"]].min(axis=1)

# Valid rows = both odds present and not equal (ties rare)
m = df["B365W"].notna() & df["B365L"].notna() & (df["B365W"] != df["B365L"])

# Accuracy: how often the winner was the favorite
accuracy = (df.loc[m, "B365W"] < df.loc[m, "B365L"]).mean()

y = (df.loc[m, "B365W"] < df.loc[m, "B365L"]).astype(int)
mae = (df.loc[m, "fav_prob"] - y).abs().mean()

print(f"Accuracy: {accuracy:.2%}  (n={m.sum()})")
print(f"MAE: {mae:.4f}")

Accuracy: 72.44%  (n=127)
MAE: 0.3612


The bookies had a 72.44% accuracy on Wimbledon 2025 :(

In [16]:
# Make a working copy and ensure odds are numeric
df = bets2025.copy()
df = df[~df["Tournament"].astype(str).str.contains("Wimbledon")]
df[["B365W","B365L"]] = df[["B365W","B365L"]].apply(pd.to_numeric, errors="coerce")

# Implied probability of the pre-match favorite (1 / min odds)
df["fav_prob"] = 1 / df[["B365W","B365L"]].min(axis=1)

# Valid rows = both odds present and not equal (ties rare)
m = df["B365W"].notna() & df["B365L"].notna() & (df["B365W"] != df["B365L"])

# Accuracy: how often the winner was the favorite
accuracy = (df.loc[m, "B365W"] < df.loc[m, "B365L"]).mean()

y = (df.loc[m, "B365W"] < df.loc[m, "B365L"]).astype(int)
mae = (df.loc[m, "fav_prob"] - y).abs().mean()

print(f"Accuracy: {accuracy:.2%}  (n={m.sum()})")
print(f"MAE: {mae:.4f}")

Accuracy: 68.00%  (n=1450)
MAE: 0.3973


However, they had a 68% accuracy on the rest of 2025, which is honestly pretty similar to us. Of course, I specifically optimized the hyperparameters of my model on the 2025 ATP data, so it's not the fairest of comparisons.