In [6]:
import pandas as pd

data = "https://raw.githubusercontent.com/JeffSackmann/tennis_atp/refs/heads/master/atp_matches_2024.csv"

df = (
    pd.read_csv(data)          # full-season file from the repo
      .assign(date=lambda d: pd.to_datetime(d["tourney_date"], format="%Y%m%d"))
)

# # Keep only columns we need for a quick demo
cols = ["date", "tourney_name","tourney_date", "surface", "winner_name",
        "loser_name", "winner_rank", "loser_rank"]
df = df[cols]
df

Unnamed: 0,date,tourney_name,tourney_date,surface,winner_name,loser_name,winner_rank,loser_rank
0,2024-01-01,Brisbane,20240101,Hard,Grigor Dimitrov,Holger Rune,14.0,8.0
1,2024-01-01,Brisbane,20240101,Hard,Holger Rune,Roman Safiullin,8.0,39.0
2,2024-01-01,Brisbane,20240101,Hard,Grigor Dimitrov,Jordan Thompson,14.0,55.0
3,2024-01-01,Brisbane,20240101,Hard,Holger Rune,James Duckworth,8.0,116.0
4,2024-01-01,Brisbane,20240101,Hard,Roman Safiullin,Matteo Arnaldi,39.0,44.0
...,...,...,...,...,...,...,...,...
3071,2024-02-03,Davis Cup WG2 PO: URU vs MDA,20240203,Clay,Joaquin Aguilar Cardozo,Ilya Snitari,1109.0,740.0
3072,2024-02-02,Davis Cup WG2 PO: VIE vs RSA,20240202,Hard,Nam Hoang Ly,Philip Henning,554.0,748.0
3073,2024-02-02,Davis Cup WG2 PO: VIE vs RSA,20240202,Hard,Kris Van Wyk,Linh Giang Trinh,416.0,
3074,2024-02-02,Davis Cup WG2 PO: VIE vs RSA,20240202,Hard,Nam Hoang Ly,Kris Van Wyk,554.0,416.0


In [22]:
# overall wins
wins = df['winner_name'].value_counts()

# wins broken down by surface
surface_stats = (
    df.groupby(['winner_name', 'surface'])
      .size()
      .unstack(fill_value=0)
)

# a simple rivalry drill-down
djoko_vs_alcaraz = df[
    (df['winner_name'].isin(['Novak Djokovic', 'Carlos Alcaraz'])) &
    (df['loser_name'].isin(['Novak Djokovic', 'Carlos Alcaraz']))
]

In [21]:
wins

winner_name
Jannik Sinner            74
Alexander Zverev         69
Carlos Alcaraz           54
Taylor Fritz             53
Casper Ruud              52
                         ..
Benjamin Balleret         1
Hady Habib                1
Hugo Nys                  1
Pierre Hugues Herbert     1
Nishesh Basavareddy       1
Name: count, Length: 307, dtype: int64

In [24]:
surface_stats

surface,Clay,Grass,Hard
winner_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adam Walton,0,2,1
Adria Soriano Barrera,1,0,0
Adrian Mannarino,0,1,15
Alan Fernando Rubio Fierros,0,0,1
Albert Ramos,3,0,0
...,...,...,...
Yshai Oliel,0,0,2
Zachary Svajda,0,0,4
Zhizhen Zhang,9,4,12
Zizou Bergs,3,1,13


In [23]:
djoko_vs_alcaraz

Unnamed: 0,date,tourney_name,tourney_date,surface,winner_name,loser_name,winner_rank,loser_rank
1725,2024-07-01,Wimbledon,20240701,Grass,Carlos Alcaraz,Novak Djokovic,3.0,2.0
2028,2024-07-29,Paris Olympics,20240729,Clay,Novak Djokovic,Carlos Alcaraz,2.0,3.0


In [9]:
from collections import defaultdict

# Ensure chronological
df = df.sort_values('tourney_date')

elo = defaultdict(lambda: 1500.0)
surf_elo = defaultdict(lambda: 1500.0)
K = 32
surface_factor = {'Clay':1.0, 'Hard':1.0, 'Grass':1.0}  # equal

rows = []
for _, row in df.iterrows():
    w = row['winner_name']
    l = row['loser_name']
    surface = row['surface']
    if surface not in surface_factor:
        surface = surface  # unknown surfaces keep as string
    ew = 1 / (1 + 10 ** ((elo[l] - elo[w])/400))
    el = 1 - ew
    # update global
    elo[w] += K * (1 - ew)
    elo[l] += K * (0 - el)
    # surface-specific update
    ew_s = 1 / (1 + 10 ** ((surf_elo[l + '_' + surface] - surf_elo[w + '_' + surface])/400))
    surf_elo[w + '_' + surface] += K * (1 - ew_s)
    surf_elo[l + '_' + surface] += K * (0 - (1 - ew_s))
    
    # features before update
    feat = {
        'elo_diff': elo[w] - elo[l],  # after update though. Should capture before update; adjust
    }

feat

{'elo_diff': 50.19731936466064}

In [16]:
def build_dataset_elo(df):
    df = df.sort_values('tourney_date')
    elo = defaultdict(lambda: 1500.0)
    surf_elo = defaultdict(lambda: 1500.0)
    K = 32
    data = []
    for _, row in df.iterrows():
        w = row['winner_name']
        l = row['loser_name']
        surface = row['surface']
        # Pre-match ratings
        elo_w = elo[w]
        elo_l = elo[l]
        surf_w = surf_elo[w + '_' + surface]
        surf_l = surf_elo[l + '_' + surface]
        # Add rows: winner perspective
        data.append({'elo_diff': elo_w - elo_l,
                     'surf_elo_diff': surf_w - surf_l,
                     'label': 1})
        # Add reverse perspective
        data.append({'elo_diff': elo_l - elo_w,
                     'surf_elo_diff': surf_l - surf_w,
                     'label': 0})
        # Update ratings
        expected_w = 1 / (1 + 10 ** ((elo_l - elo_w)/400))
        elo[w] += K * (1 - expected_w)
        elo[l] += K * (0 - (1 - expected_w))
        expected_w_s = 1 / (1 + 10 ** ((surf_l - surf_w)/400))
        surf_elo[w + '_' + surface] += K * (1 - expected_w_s)
        surf_elo[l + '_' + surface] += K * (0 - (1 - expected_w_s))
    return pd.DataFrame(data), elo, surf_elo

dataset, final_elo, final_surf_elo = build_dataset_elo(df)
dataset.tail()

Unnamed: 0,elo_diff,surf_elo_diff,label
6147,110.316638,57.054854,0
6148,126.298496,83.362258,1
6149,-126.298496,-83.362258,0
6150,21.224336,25.579748,1
6151,-21.224336,-25.579748,0


In [17]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
X = dataset[['elo_diff', 'surf_elo_diff']]
y = dataset['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
models = {
    'Decision Tree': DecisionTreeClassifier(max_depth=4, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=200, max_depth=5, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Neural Net': MLPClassifier(hidden_layer_sizes=(16,8), max_iter=500, random_state=42)
}
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    probas = model.predict_proba(X_test)[:,1]
    results[name] = {
        'Accuracy': accuracy_score(y_test, preds),
        'AUC': roc_auc_score(y_test, probas)
    }
results

{'Decision Tree': {'Accuracy': 0.6027308192457738,
  'AUC': np.float64(0.6356937640459889)},
 'Random Forest': {'Accuracy': 0.6059817945383615,
  'AUC': np.float64(0.6441581369079125)},
 'Gradient Boosting': {'Accuracy': 0.599479843953186,
  'AUC': np.float64(0.6307754484993092)},
 'Neural Net': {'Accuracy': 0.5838751625487646,
  'AUC': np.float64(0.638080630951314)}}


I’ll start by computing the probability for each match based on features like Elo difference and surface Elo difference. To do this for Roland Garros, I need a list of players still in the tournament — probably using the top 1-32 seeds. To speed things up, I could rely on a pre-defined list of players. The simplest approach is to predict the winner by identifying the player with the highest clay Elo rating. I'll compute the top 20 players by this rating and simulate the tournament.

In [18]:
player_clay_elo = []
for key, rating in final_surf_elo.items():
    if key.endswith('_Clay'):
        player = key[:-5]  # remove suffix '_Clay'
        player_clay_elo.append((player, rating))
top20 = sorted(player_clay_elo, key=lambda x: x[1], reverse=True)[:20]
top20[:10]

[('Alexander Zverev', 1682.4264358968703),
 ('Casper Ruud', 1680.693214683674),
 ('Matteo Berrettini', 1677.083912593009),
 ('Carlos Alcaraz', 1676.5072910393078),
 ('Novak Djokovic', 1668.6597664753915),
 ('Stefanos Tsitsipas', 1665.0141412928526),
 ('Jan Lennard Struff', 1628.0645770493663),
 ('Sebastian Baez', 1625.0218728908624),
 ('Felix Auger Aliassime', 1619.7190489225532),
 ('Hubert Hurkacz', 1619.5971035918733)]

To predict the winner, I'll compute the probability for each match based on Elo differences, with a focus on surface Elo (especially clay). I'll gather a list of top players, possibly using the seed rankings or current Elo ratings to approximate. The simplest approach is to simulate the tournament and pick the player with the highest clay surface Elo. From there, I’ll gather the top 20 ranked players and compute their tournament probabilities.

Let's simulate the tournament champion prediction based on model outputs. We can compute each player's probability of winning by evaluating their Elo and predicted match outcomes. Random Forest could help with this, as it focuses on differences in ratings. We can create a Monte Carlo simulation, but tracking a bracket structure would be more accurate. For simplicity, I’ll use the Random Forest model to calculate each player's chance of winning by averaging their win probability against competitors, then raise it to the 7th power for a championship.

In [19]:
import itertools, numpy as np
player_list = [p for p,_ in top20]  # top 20
def prob_p1_beats_p2(p1,p2,model):
    elo1, elo2 = final_elo[p1], final_elo[p2]
    surf1, surf2 = final_surf_elo[p1+'_Clay'], final_surf_elo[p2+'_Clay']
    X_pair = np.array([[elo1-elo2, surf1 - surf2]])
    return model.predict_proba(X_pair)[0,1]
forest = models['Random Forest']
prob_dict = {}
for p in player_list:
    probs = []
    for q in player_list:
        if p==q: continue
        probs.append(prob_p1_beats_p2(p,q,forest))
    avg = np.mean(probs)
    prob_dict[p] = avg**7  # approx champion probability
sorted(prob_dict.items(), key=lambda x: x[1], reverse=True)[:10]



[('Jannik Sinner', np.float64(0.09867397066249572)),
 ('Alexander Zverev', np.float64(0.09338509267515172)),
 ('Novak Djokovic', np.float64(0.07591007025176984)),
 ('Carlos Alcaraz', np.float64(0.07219725178254631)),
 ('Taylor Fritz', np.float64(0.03281457756123151)),
 ('Matteo Berrettini', np.float64(0.01333345346386925)),
 ('Stefanos Tsitsipas', np.float64(0.01255880536811315)),
 ('Daniil Medvedev', np.float64(0.010744435889429327)),
 ('Alex De Minaur', np.float64(0.008517844423982233)),
 ('Tommy Paul', np.float64(0.008410314974668143))]

To predict the champion, we could simulate different models considering each player's probability of winning. Zverev tops the predictions based on his high Elo, but we could use other methods to refine the model. A simpler approach might be calculating each player's average win probability, then applying that to simulate the tournament without a bracket. We could also consider the probability rankings from various models, comparing those to predict the champion by the highest win probability across the field.