# Compiling the data (matches from 2000 to 2024)

In [1]:
import pandas as pd
import glob
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# glob returns paths in arbitrary, OS-dependent order; sorting makes the row
# order of the concatenated frame reproducible from run to run.
match_files = sorted(glob.glob("tennis_data/20*.csv"))
if not match_files:
    # Fail with a clear message instead of pd.concat's generic
    # "No objects to concatenate".
    raise FileNotFoundError("No match files found for pattern 'tennis_data/20*.csv'")

# One frame per CSV, stacked with a fresh 0..n-1 index.
matches_raw = pd.concat((pd.read_csv(f) for f in match_files), ignore_index=True)

In [3]:
# Preview the first rows of the concatenated raw match data.
matches_raw.head()

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced
0,2000-7308,Adelaide,Hard,32.0,A,20000103,1,E113,1.0,,...,2.0,1.0,3.0,59.0,37.0,25.0,13.0,9.0,2.0,4.0
1,2000-7308,Adelaide,Hard,32.0,A,20000103,2,F324,,,...,0.0,3.0,5.0,42.0,15.0,13.0,12.0,8.0,0.0,3.0
2,2000-7308,Adelaide,Hard,32.0,A,20000103,3,G352,,,...,1.0,26.0,2.0,103.0,59.0,49.0,22.0,16.0,4.0,5.0
3,2000-7308,Adelaide,Hard,32.0,A,20000103,4,G379,7.0,,...,4.0,0.0,3.0,49.0,22.0,12.0,8.0,8.0,1.0,6.0
4,2000-7308,Adelaide,Hard,32.0,A,20000103,5,N250,3.0,,...,1.0,4.0,2.0,73.0,40.0,25.0,16.0,10.0,7.0,10.0


# Dropping irrelevant columns

In [4]:
# Columns kept from the raw data, grouped by what they describe.
player_columns = [
    'winner_id', 'loser_id',
    'winner_rank', 'loser_rank',
    'winner_age', 'loser_age',
    'winner_ht', 'loser_ht',
]
context_columns = ['surface', 'tourney_level', 'round']
serve_columns = ['w_ace', 'l_ace', 'w_df', 'l_df']
other_columns = ['minutes', 'tourney_date']

features_temp = player_columns + context_columns + serve_columns + other_columns

# Keep only the selected columns, in the order listed above.
matches = matches_raw.loc[:, features_temp]
matches.head()

Unnamed: 0,winner_id,loser_id,winner_rank,loser_rank,winner_age,loser_age,winner_ht,loser_ht,surface,tourney_level,round,w_ace,l_ace,w_df,l_df,minutes,tourney_date
0,E113,C487,4.0,56.0,25.81,22.045,191.0,173.0,Hard,A,R32,6.0,1.0,0.0,3.0,76.0,20000103
1,F324,K260,64.0,91.0,18.404,24.882,185.0,191.0,Hard,A,R32,6.0,3.0,3.0,5.0,45.0,20000103
2,G352,A202,58.0,105.0,22.585,28.797,191.0,185.0,Hard,A,R32,8.0,26.0,3.0,2.0,115.0,20000103
3,G379,I052,27.0,54.0,21.599,23.71,175.0,180.0,Hard,A,R32,4.0,0.0,2.0,3.0,65.0,20000103
4,N250,D270,15.0,154.0,23.595,25.58,188.0,175.0,Hard,A,R32,6.0,4.0,2.0,2.0,68.0,20000103


In [5]:
# Give the serve statistics the same winner_/loser_ prefix as the other
# per-player columns.
serve_stat_renames = {
    'w_ace': 'winner_ace',
    'l_ace': 'loser_ace',
    'w_df': 'winner_df',
    'l_df': 'loser_df',
}
matches = matches.rename(columns=serve_stat_renames)

# Cleaning the dataset

In [6]:
# Discard every row that has a missing value in any of the kept columns.
matches = matches.dropna(axis=0, how='any')

In [7]:
# Round labels: strip the leading 'R' from the numbered rounds; QF/SF/F map to
# themselves and any label not listed is left untouched by replace().
round_mapping = {
    'R128': '128',
    'R64': '64',
    'R32': '32',
    'R16': '16',
    'QF': 'QF',
    'SF': 'SF',
    'F': 'F',
}

matches = matches.assign(
    surface=matches['surface'].str.capitalize(),
    round=matches['round'].replace(round_mapping),
)

# Feature Engineering

In [8]:
# Randomly decide, per match, whether the winner is labelled player1 (True) or
# player2 (False) so the target is not always the same class.
# A seeded generator keeps this assignment -- and therefore every derived
# feature, the train/test split contents and the reported accuracy --
# reproducible across kernel restarts. The seed matches the random_state used
# for the splits and the model.
rng = np.random.default_rng(42)
mask = rng.random(len(matches)) > 0.5

matches["player1"] = np.where(mask, matches["winner_id"], matches["loser_id"])
matches["player2"] = np.where(mask, matches["loser_id"], matches["winner_id"])

In [9]:
stats = ["rank", "age", "ht", "ace", "df"]

# Re-express each winner_/loser_ statistic as player1_/player2_ using the same
# mask that assigned the player ids.
for stat in stats:
    winner_values = matches[f"winner_{stat}"]
    loser_values = matches[f"loser_{stat}"]
    matches[f"player1_{stat}"] = np.where(mask, winner_values, loser_values)
    matches[f"player2_{stat}"] = np.where(mask, loser_values, winner_values)

In [10]:
# Target: 1 when player1 is the match winner, 0 when player2 is.
player1_won = matches["winner_id"] == matches["player1"]
matches["winner_binary"] = np.where(player1_won, 1, 0)

In [11]:
# One signed difference per statistic, always player1 minus player2.
for stat in ("rank", "age", "ht", "ace", "df"):
    matches[f"{stat}_diff"] = matches[f"player1_{stat}"] - matches[f"player2_{stat}"]

In [12]:
# The winner_/loser_ and player1_/player2_ columns have been folded into the
# *_diff features, and the winner/loser ids into player1/player2.
dropped_stats = ["rank", "age", "ht", "ace", "df"]
columns_to_drop = (
    ['winner_id', 'loser_id']
    + [f"{side}_{stat}" for stat in dropped_stats for side in ("winner", "loser")]
    + [f"{side}_{stat}" for stat in dropped_stats for side in ("player1", "player2")]
)

matches = matches.drop(columns=columns_to_drop)

In [13]:
# Preview the frame after the per-player columns were replaced by differences.
matches.head()

Unnamed: 0,surface,tourney_level,round,minutes,tourney_date,player1,player2,winner_binary,rank_diff,age_diff,ht_diff,ace_diff,df_diff
0,Hard,A,32,76.0,20000103,C487,E113,0,52.0,-3.765,-18.0,-5.0,3.0
1,Hard,A,32,45.0,20000103,K260,F324,0,27.0,6.478,6.0,-3.0,2.0
2,Hard,A,32,115.0,20000103,G352,A202,1,-47.0,-6.212,6.0,-18.0,1.0
3,Hard,A,32,65.0,20000103,G379,I052,1,-27.0,-2.111,-5.0,4.0,-1.0
4,Hard,A,32,68.0,20000103,D270,N250,0,139.0,1.985,-13.0,-2.0,0.0


In [None]:
def insertH2H_winrate_past(matches):
    """
    Add 'h2h_winrate': player1's win rate against player2 in earlier matches.

    Rows are processed in chronological order and each row's value is computed
    before that row's own result is recorded, so a match never contributes to
    its own feature.

    Parameters
    ----------
    matches : pandas.DataFrame
        Must contain 'tourney_date', 'player1', 'player2' and 'winner_binary'
        (1 when player1 won, anything else when player2 won).

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by 'tourney_date' with a fresh 0..n-1 index and an
        added float column 'h2h_winrate'; NaN where the pair has no earlier
        match. The input frame is not modified.
    """

    # Sort matches by date (very important!). A stable sort keeps rows that
    # share a tourney_date in their incoming relative order instead of the
    # arbitrary order the default quicksort may produce.
    matches = matches.sort_values(by='tourney_date', kind='mergesort').reset_index(drop=True)

    # NaN rather than None so the column is always float, even when no pair
    # has any history.
    no_history = float("nan")

    h2h_winrate = []
    # (a, b) -> number of earlier matches in which a beat b
    wins = {}

    # Iterating the three columns directly avoids building a Series per row,
    # which is what iterrows() does.
    for p1, p2, winner in zip(matches['player1'], matches['player2'], matches['winner_binary']):
        # --- Lookup history ---
        p1_wins = wins.get((p1, p2), 0)
        p2_wins = wins.get((p2, p1), 0)
        total = p1_wins + p2_wins
        h2h_winrate.append(p1_wins / total if total else no_history)

        # --- Update history AFTER recording winrate ---
        if winner == 1:  # player1 wins
            victor = (p1, p2)
        else:  # player2 wins
            victor = (p2, p1)
        wins[victor] = wins.get(victor, 0) + 1

    matches['h2h_winrate'] = h2h_winrate
    return matches

# Rebind `matches` to the date-sorted frame that now carries 'h2h_winrate'.
matches = insertH2H_winrate_past(matches)

In [15]:
def add_recent_form_diff(matches, window=5):
    """
    Add 'recent_form_diff': player1's recent form minus player2's.

    A player's recent form is the share of wins among their last `window`
    earlier matches (fewer if they have played fewer). The current match is
    never included because results are recorded only after the feature for
    that row has been computed.

    Parameters
    ----------
    matches : pandas.DataFrame
        Must contain 'tourney_date', 'player1', 'player2' and 'winner_binary'
        (1 when player1 won, 0 when player2 won).
    window : int, default 5
        Maximum number of previous matches per player to consider; must be
        at least 1.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by 'tourney_date' with a fresh 0..n-1 index and an
        added float column 'recent_form_diff'; NaN where either player has no
        earlier match. The input frame is not modified.

    Raises
    ------
    ValueError
        If `window` is smaller than 1. (A window of 0 would otherwise slice
        with [-0:] and silently use the player's entire history.)
    """
    if window < 1:
        raise ValueError("window must be a positive integer")

    # Sort chronologically (important!). A stable sort keeps rows that share
    # a tourney_date in their incoming relative order instead of the arbitrary
    # order the default quicksort may produce.
    matches = matches.sort_values(by='tourney_date', kind='mergesort').reset_index(drop=True)

    # NaN rather than None so the column is always float and missing form
    # propagates through the subtraction below.
    missing = float("nan")

    recent_form_diff = []
    # player -> list of 1 (win) / 0 (loss) outcomes, oldest first
    results = {}

    def _recent_form(player):
        """Win ratio over the player's last `window` recorded matches, or NaN."""
        past = results.get(player)
        if not past:
            return missing
        recent = past[-window:]
        return sum(recent) / len(recent)

    # Iterating the three columns directly avoids building a Series per row,
    # which is what iterrows() does.
    for p1, p2, winner in zip(matches['player1'], matches['player2'], matches['winner_binary']):
        # --- Difference (NaN if either side has no history) ---
        recent_form_diff.append(_recent_form(p1) - _recent_form(p2))

        # --- Update history AFTER computing form ---
        results.setdefault(p1, []).append(1 if winner == 1 else 0)
        results.setdefault(p2, []).append(1 if winner == 0 else 0)

    matches['recent_form_diff'] = recent_form_diff

    return matches

# Rebind `matches` to the date-sorted frame that now carries 'recent_form_diff'
# (default window of 5 previous matches per player).
matches = add_recent_form_diff(matches)

In [16]:
# Discard rows with any missing value, which includes matches whose
# 'h2h_winrate' or 'recent_form_diff' could not be computed for lack of history.
matches = matches.dropna(axis=0, how='any')

# Select the model features and target from the matches dataframe

In [17]:
# NOTE(review): 'minutes', 'ace_diff' and 'df_diff' are taken from the same
# match row whose winner is being predicted, so they would not be known before
# the match is played -- confirm that using them as predictors is intended.
categorical_features = ['surface', 'tourney_level', 'round']
numeric_features = [
    'minutes',
    'rank_diff', 'age_diff', 'ht_diff',
    'ace_diff', 'df_diff',
    'h2h_winrate', 'recent_form_diff',
]
x_features = categorical_features + numeric_features

# Target: 1 when player1 won the match.
y = matches['winner_binary']
X = matches[x_features]

# One-hot encode the categorical features

In [18]:
# Expand the categorical columns into indicator columns; drop_first removes
# one level per column.
categorical_columns = ['surface', 'tourney_level', 'round']
X = pd.get_dummies(X, columns=categorical_columns, drop_first=True)

# Split the dataset

In [19]:
# 80/20 split into training and held-out test data, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
    stratify=y,
)

In [20]:
# Split the training data again 80/20: one part to fit on, one part passed to
# the model as its evaluation set.
X_train_fit, X_eval, y_train_fit, y_eval = train_test_split(
    X_train,
    y_train,
    train_size=0.8,
    random_state=42,
    stratify=y_train,
)

# Create xgboost model

In [21]:
# Up to 500 boosting rounds; training stops early once the evaluation-set
# metric has not improved for 50 consecutive rounds.
xgb_params = dict(
    n_estimators=500,
    learning_rate=0.1,
    verbosity=1,
    random_state=42,
    early_stopping_rounds=50,
)
model = XGBClassifier(**xgb_params)
model.fit(X_train_fit, y_train_fit, eval_set=[(X_eval, y_eval)])

[0]	validation_0-logloss:0.67130
[1]	validation_0-logloss:0.65370
[2]	validation_0-logloss:0.63926
[3]	validation_0-logloss:0.62707
[4]	validation_0-logloss:0.61676
[5]	validation_0-logloss:0.60812
[6]	validation_0-logloss:0.60102
[7]	validation_0-logloss:0.59458
[8]	validation_0-logloss:0.58944
[9]	validation_0-logloss:0.58474
[10]	validation_0-logloss:0.58065
[11]	validation_0-logloss:0.57664
[12]	validation_0-logloss:0.57383
[13]	validation_0-logloss:0.57099
[14]	validation_0-logloss:0.56870
[15]	validation_0-logloss:0.56659
[16]	validation_0-logloss:0.56498
[17]	validation_0-logloss:0.56381
[18]	validation_0-logloss:0.56237
[19]	validation_0-logloss:0.56131
[20]	validation_0-logloss:0.56023
[21]	validation_0-logloss:0.55912
[22]	validation_0-logloss:0.55813
[23]	validation_0-logloss:0.55769
[24]	validation_0-logloss:0.55729
[25]	validation_0-logloss:0.55709
[26]	validation_0-logloss:0.55642
[27]	validation_0-logloss:0.55607
[28]	validation_0-logloss:0.55568
[29]	validation_0-loglos

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,50
,enable_categorical,False


In [23]:
# accuracy_score takes (y_true, y_pred). Accuracy itself is symmetric, so the
# number is unchanged, but the correct order matters as soon as this call is
# swapped for an asymmetric metric such as precision or recall.
print(accuracy_score(y_test, model.predict(X_test)))

0.7019494119542452


# Save the model

In [None]:
# import xgboost as xgb
# model.save_model("xgb_model.json")

# Load the model

In [3]:
# import xgboost as xgb
# model = xgb.Booster()
# model.load_model("xgb_model.json")