In [13]:
# ===== Load a small sample from the 2000 ATP file and inspect column names =====

# import necessary libraries
import pandas as pd
from pathlib import Path
import glob 
import numpy as np

# scikit-learn: logistic regression + metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss


file_2000 = Path("data/tennis_atp/atp_matches_2000.csv")
# read the first 5 rows  
df_sample = pd.read_csv(file_2000, nrows=5, low_memory=False)
print("Sample shape:", df_sample.shape)
print("Column names:")
for col in df_sample.columns:
    print("  -", col)
df_sample.head(3)


Sample shape: (5, 49)
Column names:
  - tourney_id
  - tourney_name
  - surface
  - draw_size
  - tourney_level
  - tourney_date
  - match_num
  - winner_id
  - winner_seed
  - winner_entry
  - winner_name
  - winner_hand
  - winner_ht
  - winner_ioc
  - winner_age
  - loser_id
  - loser_seed
  - loser_entry
  - loser_name
  - loser_hand
  - loser_ht
  - loser_ioc
  - loser_age
  - score
  - best_of
  - round
  - minutes
  - w_ace
  - w_df
  - w_svpt
  - w_1stIn
  - w_1stWon
  - w_2ndWon
  - w_SvGms
  - w_bpSaved
  - w_bpFaced
  - l_ace
  - l_df
  - l_svpt
  - l_1stIn
  - l_1stWon
  - l_2ndWon
  - l_SvGms
  - l_bpSaved
  - l_bpFaced
  - winner_rank
  - winner_rank_points
  - loser_rank
  - loser_rank_points


Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
0,2000-301,Auckland,Hard,32,A,20000110,1,103163,1.0,,...,55,39,29,17,4,7,11,1612,63,595
1,2000-301,Auckland,Hard,32,A,20000110,2,102607,,Q,...,32,25,18,12,3,6,211,157,49,723
2,2000-301,Auckland,Hard,32,A,20000110,3,103252,,,...,33,20,7,8,7,11,48,726,59,649


In [14]:
# ===== load 2000-19 data, build x/y, split train (≤2018) vs test (2019) =====

# 1) gather yearly CSVs
paths = sorted(glob.glob("data/tennis_atp/atp_matches_20*.csv"))

# 2) read 2000-2019 into one DataFrame
matches = pd.concat(
    [pd.read_csv(p, low_memory=False) for p in paths if 2000 <= int(p[-8:-4]) <= 2019],
    ignore_index=True,
)

# 3) convert date
matches["tourney_date"] = pd.to_datetime(matches["tourney_date"].astype(str),
                                         format="%Y%m%d")

# 4) train ≤2018, test =2019
train = matches[matches["tourney_date"].dt.year <= 2018].copy()
test  = matches[matches["tourney_date"].dt.year == 2019].copy()

# 5) build predictor x (points gap) and target y (high-rank winner?)
def add_xy(df):
    hi_win = df["winner_rank"] < df["loser_rank"]
    hi_pts = np.where(hi_win, df["winner_rank_points"], df["loser_rank_points"])
    lo_pts = np.where(hi_win, df["loser_rank_points"], df["winner_rank_points"])
    df["x"] = hi_pts - lo_pts
    df["y"] = hi_win.astype(int)
    return df

train = add_xy(train).dropna(subset=["x", "y"])
test  = add_xy(test ).dropna(subset=["x", "y"])

# 6) quick check
print(f"train rows: {len(train):,}   test rows: {len(test):,}")
print("train y=1:", round(train["y"].mean(), 3),
      " test y=1:", round(test["y"].mean(), 3))
print(train[["x", "y"]].head(3))


train rows: 57,474   test rows: 2,680
train y=1: 0.66  test y=1: 0.613
        x  y
0  1017.0  1
1   566.0  0
2    77.0  1


In [15]:
# ===== single-feature logistic regression (no intercept) =====

# 1) fit on 2000-2018 data
lr = LogisticRegression(fit_intercept=False)     # β0 = 0 by design
lr.fit(train[["x"]], train["y"])

# 2) predict probabilities for 2019
p_lr = lr.predict_proba(test[["x"]])[:, 1]       # prob that high-rank player wins

# 3) evaluation metrics
acc   = accuracy_score(test["y"], p_lr > 0.5)
ll    = log_loss(test["y"], p_lr)
calib = p_lr.mean() / test["y"].mean()

print(f"LR accuracy : {acc:.4f}")
print(f"LR log-loss : {ll:.4f}")
print(f"LR calib.   : {calib:.3f}")  

LR accuracy : 0.6138
LR log-loss : 0.6553
LR calib.   : 1.018


# Elo 

In [16]:
# ===== Prepare match order and Elo dictionary =====

# Merge train + test, then sort by date
all_matches = pd.concat([train, test]).sort_values("tourney_date").reset_index(drop=True)



# Check total rows and first 2 rows, make sure correct
print("Total matches:", len(all_matches))
all_matches.head(2)

Total matches: 60154


Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,x,y
0,2000-451,Doha,Hard,32,A,2000-01-03,18,102854,8.0,,...,15.0,12.0,10.0,15.0,44.0,950.0,52.0,849.0,101.0,1
1,2000-339,Adelaide,Hard,32,A,2000-01-03,26,102796,3.0,,...,14.0,9.0,9.0,13.0,15.0,1748.0,57.0,805.0,943.0,1


In [23]:
# ===== Basic Elo update loop =====
K = 32
elo = {}          # player_id -> current Elo
preds   = []      # predictions for 2019
actuals = []      # actual results for 2019

# Convert required columns into plain Python lists
winner_ids = all_matches["winner_id"].astype(int).tolist()
loser_ids  = all_matches["loser_id"].astype(int).tolist()
dates      = all_matches["tourney_date"].tolist()   # already datetime

# Process row by row
for i in range(len(winner_ids)):
    winner = winner_ids[i]
    loser  = loser_ids[i]
    date   = dates[i]

    # Initialize Elo if player not seen before
    if winner not in elo:
        elo[winner] = 1500
    if loser not in elo:
        elo[loser] = 1500

    # Win probability for the winner
    rating_diff = elo[loser] - elo[winner]
    p_win = 1 / (1 + 10 ** (rating_diff / 400))

    # If match is in 2019, save prediction and actual outcome
    if date.year == 2019:
        preds.append(p_win)
        actuals.append(1)        # winner indeed won

        # preds.append(1 - p_win)
        # actuals.append(0)        # loser indeed won

    # Update Elo ratings
    elo[winner] += K * (1 - p_win)
    elo[loser]  -= K * (1 - p_win)

In [None]:
# ===== Calculate 2019 Elo metrics (explicit version) =====

# Accuracy ─ treat probability >0.5 as predicting "winner will win"
accuracy = accuracy_score(
    y_true   = actuals,                  # true labels (all 1s)
    y_pred   = [p > 0.5 for p in preds]  # convert prob to 0/1
)

# Log-loss ─ must explicitly specify both class labels [0,1]
logloss = log_loss(
    y_true   = actuals,
    y_pred   = preds,
    labels   = [0, 1]                    # tell sklearn both classes exist
)

# Calibration ─ average predicted / average actual
calib = sum(preds) / sum(actuals)

print(f"Elo accuracy : {accuracy:.4f}")
print(f"Elo log-loss : {logloss:.4f}")
print(f"Elo calib.   : {calib:.3f}")

Elo accuracy : 0.6392
Elo log-loss : 0.6363
Elo calib.   : 0.573
