# Compiling the data (matches from 2000 to 2024)

In [None]:
import pandas as pd
import glob
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [20]:
# Collect every match CSV. glob returns paths in arbitrary, filesystem-dependent
# order, so sort them: this makes both the file removed by pop() and the row
# order of the concatenated frame identical on every machine and every run.
all_csv_files = sorted(glob.glob("tennis_data/*.csv"))
# Exclude the last file in sorted order.
# NOTE(review): why one file is excluded is not visible in this notebook —
# confirm the lexicographically last file is the one meant to be skipped.
all_csv_files.pop()
# One DataFrame holding the rows of all remaining files (per-file indices are kept).
matches = pd.concat(pd.read_csv(f) for f in all_csv_files)

In [21]:
# Preview the first rows of the concatenated raw data.
matches.head()

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced
0,1968-9346,Cape Town-1,Hard,64.0,A,19680101,1.0,BB88,,,...,,,,,,,,,,
1,1968-9346,Cape Town-1,Hard,64.0,A,19680101,2.0,C090,,,...,,,,,,,,,,
2,1968-9346,Cape Town-1,Hard,64.0,A,19680101,3.0,F068,,,...,,,,,,,,,,
3,1968-9346,Cape Town-1,Hard,64.0,A,19680101,4.0,G079,,,...,,,,,,,,,,
4,1968-9346,Cape Town-1,Hard,64.0,A,19680101,5.0,GF84,,,...,,,,,,,,,,


In [22]:
# List every column available in the raw data before selecting features.
matches.columns

Index(['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
       'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
       'winner_rank', 'winner_rank_points', 'loser_id', 'loser_seed',
       'loser_entry', 'loser_name', 'loser_hand', 'loser_ht', 'loser_ioc',
       'loser_age', 'loser_rank', 'loser_rank_points', 'score', 'best_of',
       'round', 'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon',
       'w_2ndWon', 'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df',
       'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved',
       'l_bpFaced'],
      dtype='object')

# Cleaning the dataset

In [None]:
# Columns kept for the rest of the notebook, assembled group by group:
#   1. per-player attributes, as winner_/loser_ pairs
#   2. match context
#   3. serve statistics, as w_/l_ pairs
#   4. match duration and tournament date
player_attributes = ['id', 'rank', 'age', 'ht', 'hand']
match_context = ['surface', 'tourney_level', 'round']
serve_stats = ['ace', 'df', 'svpt', '1stWon', 'SvGms']

features_temp = (
    [f'{side}_{attr}' for attr in player_attributes for side in ('winner', 'loser')]
    + match_context
    + [f'{side}_{name}' for name in serve_stats for side in ('w', 'l')]
    + ['minutes', 'tourney_date']
)

# Restrict the frame to the selected columns and preview it.
matches = matches[features_temp]
matches.head()

Unnamed: 0,winner_id,loser_id,winner_rank,loser_rank,winner_age,loser_age,winner_ht,loser_ht,surface,tourney_level,...,winner_hand,loser_hand,w_svpt,l_svpt,w_1stWon,l_1stWon,w_SvGms,l_SvGms,minutes,tourney_date
0,BB88,M0M6,,,24.884,,,,Hard,A,...,,,,,,,,,,19680101
1,C090,Y008,,,24.493,19.053,,,Hard,A,...,L,R,,,,,,,,19680101
2,F068,RD88,,,19.595,,,,Hard,A,...,R,,,,,,,,,19680101
3,G079,P576,,,24.159,23.513,188.0,,Hard,A,...,R,,,,,,,,,19680101
4,GF84,LE16,,,19.228,18.13,,,Hard,A,...,,,,,,,,,,19680101


In [31]:
# Keep only rows in which every selected column has a value (dropna defaults to
# how="any"). The early rows previewed above lack rank and serve statistics, so
# rows like those are removed here.
matches = matches.dropna()

In [33]:
# Expand the abbreviated w_/l_ prefixes of the serve statistics to winner_/loser_
# so they follow the same naming scheme as the other per-player columns.
prefix_expansion = {'w': 'winner', 'l': 'loser'}
serve_column_names = {
    f'{short}_{name}': f'{full}_{name}'
    for name in ('ace', 'df', 'svpt', '1stWon', 'SvGms')
    for short, full in prefix_expansion.items()
}
matches = matches.rename(columns=serve_column_names)

In [34]:
# matches['surface'] = matches['surface'].str.capitalize()

# Strip the leading "R" from the numbered rounds; the named rounds map to
# themselves. Round codes not listed here are left unchanged by replace().
round_mapping = {f'R{size}': str(size) for size in (128, 64, 32, 16)}
round_mapping.update({name: name for name in ('QF', 'SF', 'F')})

normalized_round = matches['round'].replace(round_mapping)
matches['round'] = normalized_round

In [38]:
# Preview the cleaned frame.
# NOTE(review): the saved output of this cell already contains the player1 and
# player2 columns, which are only created in the Feature Engineering section
# below, so the notebook was executed out of order. Restart the kernel and run
# all cells top to bottom to refresh the outputs.
matches.head()

Unnamed: 0,winner_id,loser_id,winner_rank,loser_rank,winner_age,loser_age,winner_ht,loser_ht,surface,tourney_level,...,winner_svpt,loser_svpt,winner_1stWon,loser_1stWon,winner_SvGms,loser_SvGms,minutes,tourney_date,player1,player2
0,L206,B028,56.0,2.0,20.769,23.107,193.0,191.0,Hard,A,...,96.0,95.0,39.0,44.0,15.0,16.0,130.0,19901231,B028,L206
1,Z006,K021,304.0,75.0,27.441,24.624,198.0,175.0,Hard,A,...,101.0,84.0,45.0,35.0,15.0,15.0,119.0,19901231,Z006,K021
2,K030,P004,82.0,69.0,24.884,23.05,191.0,183.0,Hard,A,...,54.0,60.0,24.0,22.0,8.0,8.0,71.0,19901231,K030,P004
3,W136,R186,50.0,84.0,19.748,20.876,178.0,180.0,Hard,A,...,60.0,74.0,30.0,30.0,9.0,10.0,85.0,19901231,R186,W136
4,R023,B350,88.0,28.0,24.427,19.956,185.0,188.0,Hard,A,...,72.0,77.0,33.0,28.0,10.0,11.0,90.0,19901231,R023,B350


# Feature Engineering

In [37]:
# Decide at random, per match, which side the winner is presented on: where the
# mask is True the winner becomes player1, otherwise the winner becomes player2.
# Without this shuffle the target would be the same value for every row.
# The generator is seeded (same seed value as the random_state used for the
# splits and the model) so that the assignment, and everything derived from it,
# is reproducible across kernel restarts.
mask = np.random.default_rng(42).random(len(matches)) > 0.5

matches["player1"] = np.where(mask, matches["winner_id"], matches["loser_id"])
matches["player2"] = np.where(mask, matches["loser_id"], matches["winner_id"])

In [39]:
# Per-player statistics that must be re-oriented from winner/loser to
# player1/player2 using the same mask that assigned the players.
stats = ["rank", "age", "ht", "ace", "df", "svpt", "1stWon", "SvGms"]

for stat in stats:
    winner_values = matches[f"winner_{stat}"]
    loser_values = matches[f"loser_{stat}"]
    # Where mask is True player1 is the winner, so player1 takes the winner's value.
    matches[f"player1_{stat}"] = np.where(mask, winner_values, loser_values)
    matches[f"player2_{stat}"] = np.where(mask, loser_values, winner_values)

In [40]:
# Target label: 1 when player1 is the match winner, 0 when player2 is.
player1_won = matches["winner_id"] == matches["player1"]
matches["winner_binary"] = player1_won.astype(int)

In [41]:
# For each statistic, add "<stat>_diff" = player1's value minus player2's value.
for diff_stat in ("rank", "age", "ht", "ace", "df", "svpt", "1stWon", "SvGms"):
    player1_value = matches[f"player1_{diff_stat}"]
    player2_value = matches[f"player2_{diff_stat}"]
    matches[f"{diff_stat}_diff"] = player1_value - player2_value

In [42]:
# Columns that are no longer needed once the *_diff features exist:
#   - the raw winner_/loser_ columns for these attributes
#   - every oriented player1_/player2_ statistic
raw_attributes = ['id', 'rank', 'age', 'ht', 'ace', 'df']
oriented_stats = ['rank', 'age', 'ht', 'ace', 'df', 'svpt', '1stWon', 'SvGms']

columns_to_drop = [
    f'{side}_{attr}' for attr in raw_attributes for side in ('winner', 'loser')
]
columns_to_drop += [
    f'{side}_{name}' for name in oriented_stats for side in ('player1', 'player2')
]

matches = matches.drop(columns=columns_to_drop)

In [43]:
# Preview the frame after adding the *_diff features and dropping their inputs.
matches.head()

Unnamed: 0,surface,tourney_level,round,winner_hand,loser_hand,winner_svpt,loser_svpt,winner_1stWon,loser_1stWon,winner_SvGms,...,player2,winner_binary,rank_diff,age_diff,ht_diff,ace_diff,df_diff,svpt_diff,1stWon_diff,SvGms_diff
0,Hard,A,32,R,R,96.0,95.0,39.0,44.0,15.0,...,L206,0,-54.0,2.338,-2.0,2.0,1.0,-1.0,5.0,1.0
1,Hard,A,32,R,L,101.0,84.0,45.0,35.0,15.0,...,K021,1,229.0,2.817,23.0,11.0,2.0,17.0,10.0,0.0
2,Hard,A,32,R,R,54.0,60.0,24.0,22.0,8.0,...,P004,1,13.0,1.834,8.0,4.0,-1.0,-6.0,2.0,0.0
3,Hard,A,32,R,R,60.0,74.0,30.0,30.0,9.0,...,W136,0,34.0,1.128,2.0,1.0,3.0,14.0,0.0,1.0
4,Hard,A,32,R,R,72.0,77.0,33.0,28.0,10.0,...,B350,1,60.0,4.471,-3.0,2.0,0.0,-5.0,5.0,-1.0


In [47]:
def insertH2H_winrate_past(matches):
    """
    Add 'h2h_winrate': player1's win rate against player2 in their earlier meetings.

    Rows are processed in ascending 'tourney_date' order. Each row's value is
    recorded before that row's own result is counted, so a match never sees its
    own outcome. A pair with no earlier meeting gets the neutral value 0.5.

    Parameters
    ----------
    matches : pandas.DataFrame
        Must contain 'tourney_date', 'player1', 'player2' and 'winner_binary'.
        A 'winner_binary' of 1 counts as a player1 win; any other value counts
        as a player2 win.

    Returns
    -------
    pandas.DataFrame
        A date-sorted copy with a fresh 0..n-1 index and the added
        'h2h_winrate' column. The frame passed in is not modified.
    """
    # Sort chronologically with a stable algorithm: rows sharing a tourney_date
    # keep their incoming order, whereas the default quicksort may reorder such
    # ties arbitrarily and change which meetings count as "past".
    matches = matches.sort_values(by='tourney_date', kind='mergesort').reset_index(drop=True)

    h2h_winrate = []
    # (a, b) -> {"wins": wins of a over b, "total": meetings between a and b}
    history = {}

    # Iterating the three columns directly avoids building a Series per row,
    # which is what makes iterrows() slow on large frames.
    pairings = zip(matches['player1'], matches['player2'], matches['winner_binary'])
    for p1, p2, winner in pairings:
        # --- Look up the record as it stood before this match ---
        past = history.get((p1, p2))
        if past is None or past["total"] == 0:
            h2h_winrate.append(0.5)  # no history yet
        else:
            h2h_winrate.append(past["wins"] / past["total"])

        # --- Update both orientations AFTER recording the win rate ---
        forward = history.setdefault((p1, p2), {"wins": 0, "total": 0})
        reverse = history.setdefault((p2, p1), {"wins": 0, "total": 0})

        if winner == 1:  # player1 wins
            forward["wins"] += 1
        else:  # player2 wins
            reverse["wins"] += 1

        forward["total"] += 1
        reverse["total"] += 1

    matches['h2h_winrate'] = h2h_winrate
    return matches

# Replace `matches` with the date-sorted frame that carries the new h2h_winrate column.
matches = insertH2H_winrate_past(matches)

In [45]:
def add_recent_form_diff(matches, window=5):
    """
    Add 'recent_form_diff': player1's recent form minus player2's recent form.

    A player's recent form is the share of wins among their last `window`
    matches seen so far, excluding the current match. Rows are processed in
    ascending 'tourney_date' order. When either player has no earlier match,
    the difference is NaN.

    Parameters
    ----------
    matches : pandas.DataFrame
        Must contain 'tourney_date', 'player1', 'player2' and 'winner_binary'.
        A 'winner_binary' of 1 is a player1 win and 0 is a player2 win.
    window : int, default 5
        Number of most recent matches per player used for the win ratio.

    Returns
    -------
    pandas.DataFrame
        A date-sorted copy with a fresh 0..n-1 index and the added float
        'recent_form_diff' column. The frame passed in is not modified.
    """
    # Sort chronologically with a stable algorithm: rows sharing a tourney_date
    # keep their incoming order, whereas the default quicksort may reorder such
    # ties arbitrarily and change which results count as "recent".
    matches = matches.sort_values(by='tourney_date', kind='mergesort').reset_index(drop=True)

    def _recent_form(results):
        # Win ratio over the last `window` results, or None without any history.
        if not results:
            return None
        last_matches = results[-window:]
        return sum(last_matches) / len(last_matches)

    # NaN (not None) marks a missing value so the column is always float,
    # even when every row lacks history.
    missing = float('nan')

    recent_form_diff = []
    history = {}  # player id -> list of past results (1 = win, 0 = loss), oldest first

    # Iterating the three columns directly avoids building a Series per row,
    # which is what makes iterrows() slow on large frames.
    pairings = zip(matches['player1'], matches['player2'], matches['winner_binary'])
    for p1, p2, winner in pairings:
        form_p1 = _recent_form(history.get(p1, []))
        form_p2 = _recent_form(history.get(p2, []))

        # --- Difference ---
        if form_p1 is None or form_p2 is None:
            recent_form_diff.append(missing)
        else:
            recent_form_diff.append(form_p1 - form_p2)

        # --- Update history AFTER computing form ---
        history.setdefault(p1, []).append(1 if winner == 1 else 0)
        history.setdefault(p2, []).append(1 if winner == 0 else 0)

    matches['recent_form_diff'] = recent_form_diff

    return matches

# Replace `matches` with the date-sorted frame that carries the new
# recent_form_diff column (default window of 5 matches).
matches = add_recent_form_diff(matches)

In [50]:
# List the columns available after feature engineering.
matches.columns

Index(['surface', 'tourney_level', 'round', 'winner_hand', 'loser_hand',
       'winner_svpt', 'loser_svpt', 'winner_1stWon', 'loser_1stWon',
       'winner_SvGms', 'loser_SvGms', 'minutes', 'tourney_date', 'player1',
       'player2', 'winner_binary', 'rank_diff', 'age_diff', 'ht_diff',
       'ace_diff', 'df_diff', 'svpt_diff', '1stWon_diff', 'SvGms_diff',
       'h2h_winrate', 'recent_form_diff'],
      dtype='object')

# Select the model features and the target from the matches dataframe

In [51]:
# Model inputs: the three categorical match descriptors first, then the numeric
# features. The column order here is the feature order the model is trained on.
# NOTE(review): minutes and the ace/df/svpt/1stWon/SvGms differences are taken
# from the match whose result is being predicted. If the model is meant to
# predict a match before it is played, these are unavailable at prediction time
# and leak the outcome — confirm the intended use.
categorical_features = ['surface', 'tourney_level', 'round']
numeric_features = [
    'minutes',
    'rank_diff', 'age_diff', 'ht_diff',
    'ace_diff', 'df_diff', 'svpt_diff', '1stWon_diff', 'SvGms_diff',
    'h2h_winrate', 'recent_form_diff',
]
X_features = categorical_features + numeric_features

X = matches.loc[:, X_features]
# Target: 1 when player1 won the match, 0 otherwise.
y = matches.loc[:, 'winner_binary']

# One-hot encode the categorical features

In [52]:
# Replace each categorical column with indicator columns. drop_first=True omits
# the first level of every category, which then acts as the implicit baseline.
columns_to_encode = ['surface', 'tourney_level', 'round']
X = pd.get_dummies(X, columns=columns_to_encode, drop_first=True)

# Split the dataset

In [53]:
# Hold out 20% of the rows as the final test set, stratified on the target so
# both parts keep the same class balance.
# NOTE(review): the split is random rather than chronological, so the training
# rows can be later in time than the test rows — confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
    stratify=y,
)

In [54]:
# Split the training part again: 80% is used to fit the model and 20% becomes
# the evaluation set that is passed to the model's fit call.
X_train_fit, X_eval, y_train_fit, y_eval = train_test_split(
    X_train,
    y_train,
    train_size=0.8,
    random_state=42,
    stratify=y_train,
)

In [56]:
# Preview the encoded test features.
X_test.head()

Unnamed: 0,minutes,rank_diff,age_diff,ht_diff,ace_diff,df_diff,svpt_diff,1stWon_diff,SvGms_diff,h2h_winrate,...,tourney_level_M,tourney_level_O,round_16,round_32,round_64,round_BR,round_F,round_QF,round_RR,round_SF
17390,122.0,214.0,0.789,-6.0,1.0,2.0,25.0,-1.0,-1.0,1.0,...,True,False,False,False,True,False,False,False,False,False
20153,54.0,570.0,3.239,-5.0,-5.0,-2.0,-4.0,-9.0,0.0,0.5,...,False,False,False,True,False,False,False,False,False,False
80985,119.0,13.0,-6.582,-10.0,-1.0,-2.0,-25.0,2.0,1.0,0.0,...,False,False,False,True,False,False,False,False,False,False
22740,54.0,-25.0,-5.804,-2.0,-4.0,0.0,13.0,-2.0,1.0,0.5,...,False,False,False,False,False,False,False,True,False,False
79905,76.0,-42.0,-4.266,-3.0,1.0,-2.0,-9.0,10.0,0.0,1.0,...,True,False,True,False,False,False,False,False,False,False


# Create xgboost model

In [57]:
# Hyper-parameters gathered in one place so they are easy to read and tune.
# Training stops early once the evaluation-set metric has not improved for
# `early_stopping_rounds` consecutive boosting rounds.
xgb_params = dict(
    n_estimators=500,
    learning_rate=0.1,
    verbosity=1,
    random_state=42,
    early_stopping_rounds=50,
)
model = XGBClassifier(**xgb_params)
# Fit on the fitting split; the evaluation split is only monitored.
model.fit(X_train_fit, y_train_fit, eval_set=[(X_eval, y_eval)])

[0]	validation_0-logloss:0.65229
[1]	validation_0-logloss:0.61873
[2]	validation_0-logloss:0.59030
[3]	validation_0-logloss:0.56570
[4]	validation_0-logloss:0.54518
[5]	validation_0-logloss:0.52731
[6]	validation_0-logloss:0.51211
[7]	validation_0-logloss:0.49857
[8]	validation_0-logloss:0.48675
[9]	validation_0-logloss:0.47629
[10]	validation_0-logloss:0.46719
[11]	validation_0-logloss:0.45897
[12]	validation_0-logloss:0.45191
[13]	validation_0-logloss:0.44563
[14]	validation_0-logloss:0.44015
[15]	validation_0-logloss:0.43510
[16]	validation_0-logloss:0.43052
[17]	validation_0-logloss:0.42644
[18]	validation_0-logloss:0.42279
[19]	validation_0-logloss:0.41944
[20]	validation_0-logloss:0.41619
[21]	validation_0-logloss:0.41364
[22]	validation_0-logloss:0.41108
[23]	validation_0-logloss:0.40833
[24]	validation_0-logloss:0.40628
[25]	validation_0-logloss:0.40436
[26]	validation_0-logloss:0.40252
[27]	validation_0-logloss:0.40084
[28]	validation_0-logloss:0.39951
[29]	validation_0-loglos

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,50
,enable_categorical,False


In [58]:
# Accuracy on the held-out test set. accuracy_score expects (y_true, y_pred);
# the score is the same with the arguments swapped, but the documented order
# keeps this call consistent with other metrics, where the order matters.
print(accuracy_score(y_test, model.predict(X_test)))

0.8283750459100687


# Save the model

In [59]:
# NOTE(review): `xgb` is not used by this cell; it is only referenced by the
# commented-out loading example below. Consider moving this import to the
# import cell at the top of the notebook.
import xgboost as xgb
# Persist the trained model to predictor.json in the current working directory.
model.save_model("predictor.json")

# Load the model

In [None]:
# import xgboost as xgb
# model = xgb.XGBClassifier()
# model.load_model("predictor.json")