# Predict Wimbledon

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
from sklearn import tree
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from utils.updateStats import getStats, updateStats, createStats
pd.set_option('display.max_columns', None)

In [2]:
from datetime import datetime
def calculate_age(dob_str: str,
                  ref_str: str,
                  *,
                  as_float: bool = True,
                  precision: int = 1) -> float | int:
    """
    Parameters
    ----------
    dob_str   : str  YYYYMMDD   (e.g. '20030505')
    ref_str   : str  YYYYMMDD   (e.g. '20250630')
    as_float  : bool return decimal-years instead of whole years
    precision : int  number of decimal places when as_float=True

    Returns
    -------
    int   (whole years)            when as_float=False  (default)
    float (years, <precision> dp)  when as_float=True
    """
    dob = datetime.strptime(dob_str, "%Y%m%d").date()
    ref = datetime.strptime(ref_str, "%Y-%m-%d").date()

    # — whole‑year age —
    yrs = ref.year - dob.year - ((ref.month, ref.day) < (dob.month, dob.day))
    if not as_float:
        return yrs

    # — decimal‑year age (uses the mean tropical year: 365.2425 days) —
    age = (ref - dob).days / 365.2425
    return round(age, precision)

In [3]:
player_info = pd.read_csv("./data/atp_players.csv")
player_info.set_index(["name_first", "name_last"], inplace=True)

  player_info = pd.read_csv("./data/atp_players.csv")


## Get info about each player

In [4]:
import json

# Load the JSON file
with open('data/rankings/atp_rankings_wimbledon.json', 'r') as file:
    player_rank_dict = json.load(file)

print(player_rank_dict)


{'Benjamin Bonzi': '64', 'Andrea Fiorentini': '949', 'Melvin Kumar': '1634', 'Tristan Berard': '1770', 'James Story': '562', 'Mason Naumovski': '1770', 'Jake Delaney': '508', 'Alessandro Spadola': '1216', 'Zhan Zheng': '2153', 'Ko Suzuki': '1689', 'Michael Gloeckler': '1442', 'Holden Koons': '1868', 'Alexis Gautier': '750', 'Hady Habib': '170', 'Philip Hjorth': '951', 'Leonardo Aboian': '815', 'Benito Massacri': '1463', 'Andy Nguyen': '1868', 'Joao Eduardo Schiessl': '594', 'Juan Sebastian Gomez': '936', 'Colton Smith': '134', 'Marcus Walters': '1164', 'Benjamin Ignacio Torres Fernandez': '2164', 'Daniel Siniakov': '1178', 'Ilija Palavestra': '1868', 'Hunter Reese': '1590', 'Jonas Schaer': '1966', 'Radovan Michalik': '1274', 'Renzo Olivo': '312', 'Matyas Fuele': '1295', 'Thanasi Kokkinakis': '146', 'Shaheed Alam': '1770', 'Filip Jeff Planinsek': '691', 'Dayne Kelly': '1624', 'Saveliy Ivanov': '841', 'Sergey Fomin': '442', 'Herman Hoeyeraal': '1309', 'Alibek Kachmazov': '182', 'Roberto 

In [5]:
wimbledon_players = {}
with open("data/wimbledon_players.txt", 'r') as f:
    for line in tqdm(f.readlines()):
        player_name = line.strip()

        wimbledon_players[str(player_name)] = {}
        wimbledon_players[str(player_name)]["Name"] = str(player_name)

        if player_name == "Tomas Martin Etcheverry":
            first_name = "Tomas Martin"
            last_name = "Etcheverry"
        elif player_name == "Chun-Hsin Tseng":
            first_name = "Chun Hsin"
            last_name = "Tseng"
        elif player_name == "Jan-Lennard Struff":
            first_name = "Jan Lennard"
            last_name = "Struff"
        else:
            parts = player_name.split()
            first_name = parts[0].title()
            last_name = " ".join(parts[1:]) if len(parts) > 1 else ""
            last_name = last_name.title()

        wimbledon_players[str(player_name)]["ID"] = int(player_info.loc[(first_name, last_name)]["player_id"])
        player_age = calculate_age(
            str(int(player_info.loc[(first_name, last_name)]["dob"])),
            "2025-06-30"
        )
        wimbledon_players[str(player_name)]["AGE"] = player_age
        wimbledon_players[str(player_name)]["HEIGHT"] = int(player_info.loc[(first_name, last_name)]["height"])
        wimbledon_players[str(player_name)]["ATP_RANK"] = int(player_rank_dict[player_name.title()])

  0%|          | 0/130 [00:00<?, ?it/s]

  wimbledon_players[str(player_name)]["ID"] = int(player_info.loc[(first_name, last_name)]["player_id"])
  wimbledon_players[str(player_name)]["ID"] = int(player_info.loc[(first_name, last_name)]["player_id"])
  str(int(player_info.loc[(first_name, last_name)]["dob"])),
  str(int(player_info.loc[(first_name, last_name)]["dob"])),
  wimbledon_players[str(player_name)]["HEIGHT"] = int(player_info.loc[(first_name, last_name)]["height"])
  wimbledon_players[str(player_name)]["HEIGHT"] = int(player_info.loc[(first_name, last_name)]["height"])
100%|██████████| 130/130 [00:00<00:00, 1995.53it/s]


## Start predicting

In [6]:
update_stats_param = {
    "k_factor": None,
    "base_k_factor": 43,
    "max_k_factor": 62,
    "div_number": 800,
    "bonus_after_layoff": True
}

In [7]:
clean_data = pd.read_csv("./data/0cleanDatasetWithQualifiersWith2025.csv")
prev_stats = createStats()

# Iterate through each row in clean_data
for index, row in tqdm(clean_data.iterrows(), total=len(clean_data)):
    ########## UPDATE STATS ##########
    # We only need to update the stats, since we don't need to create a dataset
    prev_stats = updateStats(row, prev_stats, **update_stats_param)

  clean_data = pd.read_csv("./data/0cleanDatasetWithQualifiersWith2025.csv")
100%|██████████| 197263/197263 [01:04<00:00, 3050.37it/s]


In [8]:
wimbledon_data = pd.read_csv("./data/allWimbledon2025.csv")

for index, row in tqdm(wimbledon_data[:-1].iterrows(), total=len(wimbledon_data[:-1])):
    ########## UPDATE STATS ##########
    prev_stats = updateStats(row, prev_stats, **update_stats_param)

100%|██████████| 233/233 [00:00<00:00, 1991.35it/s]


In [9]:
# Load the model from models
xgb_model = XGBClassifier()
# xgb_model.load_model("./models/xgb_modelWithQualifiersWith2025_3rdRound.json")
xgb_model.load_model("./models/best_final_xgb_model.json")

# I define this here to make the results more easy to interpret
mapper = np.vectorize(lambda x: "Player 2 Wins" if x == 0 else "Player 1 Wins")

In [10]:
def predict_twice(p1_name: str, p2_name: str):
    p1_prob = []
    p2_prob = []

    match = {
        "BEST_OF": 5,
        "DRAW_SIZE": 128,
        "SURFACE": "Grass",
        "ROUND": "R128"
    }

    player1 = wimbledon_players[p1_name]
    player2 = wimbledon_players[p2_name]

    # Call getStatsPlayers function
    output = getStats(player1, player2, match, prev_stats)
    match_data = pd.DataFrame([dict(sorted(output.items()))])
    probs = xgb_model.predict_proba(np.array(match_data, dtype=object))[:, ::-1]

    p1_prob.append(probs[0][0])
    p2_prob.append(probs[0][1])

    output = getStats(player2, player1, match, prev_stats)
    match_data = pd.DataFrame([dict(sorted(output.items()))])
    probs = xgb_model.predict_proba(np.array(match_data, dtype=object))[:, ::-1]

    p1_prob.append(probs[0][1])
    p2_prob.append(probs[0][0])

    return {
        p1_name: f"{round(float(np.mean(p1_prob)) * 100)}%",
        p2_name: f"{round(float(np.mean(p2_prob)) * 100)}%",
    }

In [11]:
predict_twice("Carlos Alcaraz", "Jannik Sinner")

{'Carlos Alcaraz': '53%', 'Jannik Sinner': '47%'}

In [13]:
# My Model
# Round 1 - Top Half
# 32 matches
# 21 correct
# 12 incorrect

# Round 1 - Bottom Half
# 32 matches
# 18 correct
# 12 incorrect

In [14]:
# IBM's Model
# Round 1 - Top Half
# 32 matches
# 19 correct
# 13 incorrect

# Round 1 - Bottom Half
# 32 matches
# 17 correct
# 15 incorrect

In [15]:
## No Qualifiers          # Qual        # IBM

# Thompson - Yes          - Yes         Yes
# Nuno Borges - Yes       - Yes         Yes
# Karen - Yes             - Yes         Yes
# Ethan Quinn - No        - No          No
# Learner Tien - No       - No          No
# Joao Fonseca - Yes      - Yes         Yes
# Jiri Lehecka - No       - No          No
# Frances Tiafoe - No     - No          No
# Andrey Rublev - Yes     - Yes         Yes
# Adrian Mannarino - No   - No          No
# Alcaraz - Yes           - Yes         
