In [1]:
import pandas as pd
import numpy as np
import re
import json

In [10]:
# ------------------------------------------------------------------
# Read the full ATP match pickle, then retain only Grand-Slam matches
# (tourney_level "G") whose tournament date falls in 2000 or later
# ------------------------------------------------------------------
pickle_path = "/Users/saurabhkumar/Desktop/Work/Tennis"
# NOTE: unpickling can execute arbitrary code -- only load files you trust
atp_data = pd.read_pickle(f"{pickle_path}/ATPdata1968_2024.pkl")

is_grand_slam = atp_data["tourney_level"].eq("G")
tourney_year = pd.to_datetime(atp_data["tourney_date"], format="%Y%m%d").dt.year

# .copy() detaches the subset so columns added later never touch atp_data
gs_data = atp_data.loc[is_grand_slam & tourney_year.ge(2000)].copy()

print(f"Matches kept : {len(gs_data):,}")


Matches kept : 12,192


In [3]:
# List the columns available on the filtered Grand-Slam frame
gs_data.columns

Index(['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
       'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
       'loser_id', 'loser_seed', 'loser_entry', 'loser_name', 'loser_hand',
       'loser_ht', 'loser_ioc', 'loser_age', 'score', 'best_of', 'round',
       'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon',
       'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df', 'l_svpt',
       'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced',
       'winner_rank', 'winner_rank_points', 'loser_rank', 'loser_rank_points'],
      dtype='object')

In [11]:
# ---------- 1A. score-string parser ----------
def parse_score(score_str: str) -> tuple[int, int, int, int]:
    """
    Tally a match score string from the perspective of the first-listed side.

    Every whitespace-separated token shaped like 'A-B' is read as one set.
    Parenthesised tiebreak detail (the '(3)' in '7-6(3)') is removed before
    the token is read.  Tokens without a '-' or whose two sides are not
    integers (e.g. 'RET', 'W/O') are ignored.

    Returns: sets_won, sets_lost, games_won, games_lost
    (all zeros when score_str is not a string or is blank).
    """
    if not isinstance(score_str, str):
        return 0, 0, 0, 0

    set_wins = set_losses = game_wins = game_losses = 0
    # str.split() with no argument already ignores leading/trailing blanks,
    # so a blank string simply yields no tokens
    for token in score_str.split():
        # drop tiebreak details
        set_score = re.sub(r"\([^)]*\)", "", token)
        if "-" not in set_score:
            continue
        first, second = set_score.split("-")[:2]
        try:
            won, lost = int(first), int(second)
        except ValueError:
            # not a numeric set score: leave the tallies untouched
            continue
        game_wins += won
        game_losses += lost
        # any set with more games than the opponent (7-6 included) is a set won
        if won > lost:
            set_wins += 1
        else:
            set_losses += 1
    return set_wins, set_losses, game_wins, game_losses


# ---------- 1B. add_calculated_columns ----------
def _pct(numerator, denominator):
    """Return numerator / denominator * 100, with 0.0 wherever denominator is not > 0 (zero or missing)."""
    return np.where(denominator > 0, numerator / denominator * 100, 0.0)


def add_calculated_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of df enriched with 28 winner-side match metrics.

    The derived columns describe the match winner: the w_* columns are used
    as the winner's serve statistics and the l_* columns as the loser's.
    Percentages are on a 0-100 scale and fall back to 0.0 (not NaN) when
    their denominator is zero or missing.

    speed_score is min-max scaled against the `minutes` range of the frame
    passed in (100 = shortest match in df, 0 = longest), so its values depend
    on which matches df contains.

    Parameters
    ----------
    df : match-level frame with at least the columns `score`, `minutes`,
         `loser_rank` and the w_*/l_* serve statistics read below.
         An empty frame is accepted.

    Returns
    -------
    A new DataFrame; df itself is not modified.
    """
    out = df.copy()

    # --- basic outcome from score ---
    # reshape(-1, 4) keeps the four-column layout even when there are no rows,
    # so an empty frame no longer fails while unpacking the parsed tuples
    parsed = np.array(out["score"].apply(parse_score).tolist(), dtype="int64").reshape(-1, 4)
    out["sets_won"],  out["sets_lost"]  = parsed[:, 0], parsed[:, 1]
    out["games_won"], out["games_lost"] = parsed[:, 2], parsed[:, 3]
    out["straight_sets_win"] = (out["sets_lost"] == 0).astype(int)

    # --- service stats ---
    out["service_points_won"]     = out["w_1stWon"] + out["w_2ndWon"]
    out["service_points_won_pct"] = _pct(out["service_points_won"], out["w_svpt"])
    out["first_serve_pct"]        = _pct(out["w_1stIn"], out["w_svpt"])
    out["first_serve_won_pct"]    = _pct(out["w_1stWon"], out["w_1stIn"])
    second_serves                 = out["w_svpt"] - out["w_1stIn"]
    out["second_serve_won_pct"]   = _pct(out["w_2ndWon"], second_serves)

    # Service games the winner lost = break points faced minus break points
    # saved on the winner's OWN serve (w_bp*).  The l_bp* difference is the
    # number of times the loser was broken, i.e. return_games_won below.
    out["service_games_won"]      = out["w_SvGms"] - (out["w_bpFaced"] - out["w_bpSaved"])
    out["service_games_won_pct"]  = _pct(out["service_games_won"], out["w_SvGms"])
    out["break_points_saved_pct"] = _pct(out["w_bpSaved"], out["w_bpFaced"])

    # --- return stats ---
    out["return_points_won"]      = out["l_svpt"] - (out["l_1stWon"] + out["l_2ndWon"])
    out["return_points_won_pct"]  = _pct(out["return_points_won"], out["l_svpt"])
    out["return_games_won"]       = out["l_bpFaced"] - out["l_bpSaved"]
    out["return_games_won_pct"]   = _pct(out["return_games_won"], out["l_SvGms"])
    out["break_points_conv_pct"]  = _pct(out["return_games_won"], out["l_bpFaced"])

    # --- overall dominance ---
    out["total_points_won"]      = out["service_points_won"] + out["return_points_won"]
    out["total_points_played"]   = out["w_svpt"] + out["l_svpt"]
    out["total_points_won_pct"]  = _pct(out["total_points_won"], out["total_points_played"])
    out["games_won_pct"]         = _pct(out["games_won"], out["games_won"] + out["games_lost"])
    out["sets_won_pct"]          = _pct(out["sets_won"], out["sets_won"] + out["sets_lost"])

    # --- opponent quality flags ---
    # a missing loser_rank compares False, so it yields 0 for every flag
    out["beat_top10"]  = (out["loser_rank"] <= 10).astype(int)
    out["beat_top20"]  = (out["loser_rank"] <= 20).astype(int)
    out["beat_top30"]  = (out["loser_rank"] <= 30).astype(int)
    out["beat_top5"]   = (out["loser_rank"] <= 5).astype(int)

    # --- speed score: shorter matches score higher ---
    min_minutes = out["minutes"].min()
    max_minutes = out["minutes"].max()
    out["speed_score"] = 100 * (max_minutes - out["minutes"]) / (max_minutes - min_minutes)
    out["speed_score"] = out["speed_score"].clip(lower=0, upper=100)

    return out

In [12]:
# Enrich the Grand-Slam matches with the derived metrics and report the resulting column count
gs_enh = add_calculated_columns(gs_data)
print("Enhanced columns:", gs_enh.shape[1])

Enhanced columns: 77


In [39]:
# ---------- 3A. config for scoring ----------
# Multipliers dominance_score applies to the campaign-level percentages
WEIGHTS = {
    "sets":     0.32,   # raised from 0.20
    "games":    0.25,
    "points":   0.23,
    "opponent": 0.10,   # lowered from 0.20
    "speed":    0.10,   # lowered from 0.15
}

# Flat points dominance_score adds on top of the weighted base
BONUS = {
    "top5":    3,       # per top-5 win; lowered from 10
    "perfect": 10,      # campaign without a lost set; raised a little from 5
}


# ---------- 3B. campaign summary ----------
def campaign_summary(df: pd.DataFrame) -> pd.Series:
    """
    Collapse one player's winning matches at one tournament into one record.

    df holds every match the *winner* played in a single tournament; the
    player name, tournament name and date are taken from its first row.
    Returns a Series of match counts, summed totals, per-match means and
    the win percentages derived from those totals.
    """
    n_matches = len(df)

    def first(column):
        return df[column].iat[0]

    def total(column):
        return df[column].sum()

    record = {
        "player"              : first("winner_name"),
        "tourney_name"        : first("tourney_name"),
        "year"                : pd.to_datetime(first("tourney_date"), format="%Y%m%d").year,
        "matches"             : n_matches,
        "sets_won"            : total("sets_won"),
        "sets_lost"           : total("sets_lost"),
        "games_won"           : total("games_won"),
        "games_lost"          : total("games_lost"),
        "points_won"          : total("total_points_won"),
        "points_played"       : total("total_points_played"),
        "pct_top20_opponents" : 100 * total("beat_top20") / n_matches,
        "pct_top30_opponents" : 100 * total("beat_top30") / n_matches,
        "top10_wins"          : total("beat_top10"),
        "top20_wins"          : total("beat_top20"),
        "top5_wins"           : total("beat_top5"),
        "straight_sets_wins"  : total("straight_sets_win"),
        "avg_match_minutes"   : df["minutes"].mean(),
        "speed_score"         : df["speed_score"].mean(),
    }

    # percentages are computed from the campaign totals, not averaged per match
    sets_played  = record["sets_won"]  + record["sets_lost"]
    games_played = record["games_won"] + record["games_lost"]
    record["sets_won_pct"]   = 100 * record["sets_won"]   / sets_played
    record["games_won_pct"]  = 100 * record["games_won"]  / games_played
    record["points_won_pct"] = 100 * record["points_won"] / record["points_played"]
    return pd.Series(record)


# ---------- 3C. dominance score ----------
def dominance_score(row: pd.Series) -> float:
    """
    Score one campaign row: weighted base plus bonus, rounded to 2 decimals.

    The base is the WEIGHTS-weighted sum of the row's sets/games/points win
    percentages, its share of top-30 opponents and its speed score.  The
    bonus adds BONUS["top5"] per top-5 win and BONUS["perfect"] once when
    the campaign lost no set.
    """
    weighted_columns = (
        ("sets",     "sets_won_pct"),
        ("games",    "games_won_pct"),
        ("points",   "points_won_pct"),
        ("opponent", "pct_top30_opponents"),
        ("speed",    "speed_score"),
    )

    # accumulate left to right so the float result matches a plain a+b+c+d+e
    base = 0.0
    for weight_key, column in weighted_columns:
        base += WEIGHTS[weight_key] * row[column]

    bonus = BONUS["top5"] * row["top5_wins"]
    if row["sets_lost"] == 0:
        bonus += BONUS["perfect"]

    return round(base + bonus, 2)

In [40]:
# 1. every Grand-Slam final identifies one champion (tournament id + winner)
finals = gs_enh.loc[gs_enh["round"] == "F", ["tourney_id", "winner_name"]]

# 2. summarise and score each champion's matches within that tournament
campaign_records = []
for tid, champ in zip(finals["tourney_id"], finals["winner_name"]):
    in_campaign = gs_enh["tourney_id"].eq(tid) & gs_enh["winner_name"].eq(champ)
    campaign_matches = gs_enh.loc[in_campaign]
    summary = campaign_summary(campaign_matches)
    summary["dominance_score"] = dominance_score(summary)
    campaign_records.append(summary)

# 3. one row per campaign, most dominant first
campaign_df = (
    pd.DataFrame(campaign_records)
    .sort_values("dominance_score", ascending=False)
    .reset_index(drop=True)
)

print(f"Campaigns parsed: {len(campaign_df)}")
campaign_df.head()

Campaigns parsed: 96


Unnamed: 0,player,tourney_name,year,matches,sets_won,sets_lost,games_won,games_lost,points_won,points_played,...,top10_wins,top20_wins,top5_wins,straight_sets_wins,avg_match_minutes,speed_score,sets_won_pct,games_won_pct,points_won_pct,dominance_score
0,Rafael Nadal,Roland Garros,2008,7,21,0,128,41,660.0,1087.0,...,2,3,2,7,125.857143,81.074114,100.0,75.739645,60.717571,96.15
1,Rafael Nadal,Roland Garros,2017,7,20,0,116,35,569.0,916.0,...,2,3,1,7,103.428571,84.446831,100.0,76.821192,62.117904,92.65
2,Rafael Nadal,Roland Garros,2020,7,21,0,129,53,670.0,1121.0,...,1,2,1,7,136.285714,79.505908,100.0,70.879121,59.768064,87.27
3,Roger Federer,Australian Open,2007,7,21,0,132,72,740.0,1274.0,...,3,4,0,7,114.142857,82.835661,100.0,64.705882,58.084772,86.96
4,Roger Federer,Wimbledon,2017,7,20,0,122,74,676.0,1185.0,...,2,4,0,7,99.142857,85.0913,100.0,62.244898,57.046414,86.33


In [42]:
# Quick glance at the 10 most dominant campaigns
# (campaign_df is already sorted by dominance_score, highest first)
TOP_N_DISPLAY = 10   # was head(50), which contradicted the "top-10" intent of this cell

display(
    campaign_df[[
        "player",
        "tourney_name",
        "year",
        "dominance_score",
        "sets_won",
        "sets_lost",
        "games_won_pct",
        "points_won_pct",
        "pct_top30_opponents",
        "top5_wins",
        "avg_match_minutes",
        "speed_score"
    ]].head(TOP_N_DISPLAY)
)

# OPTIONAL: write to CSV for later use
# campaign_df.to_csv("gs_winner_campaigns_2000_2024.csv", index=False)

Unnamed: 0,player,tourney_name,year,dominance_score,sets_won,sets_lost,games_won_pct,points_won_pct,pct_top30_opponents,top5_wins,avg_match_minutes,speed_score
0,Rafael Nadal,Roland Garros,2008,96.15,21,0,75.739645,60.717571,71.428571,2,125.857143,81.074114
1,Rafael Nadal,Roland Garros,2017,92.65,20,0,76.821192,62.117904,57.142857,1,103.428571,84.446831
2,Rafael Nadal,Roland Garros,2020,87.27,21,0,70.879121,59.768064,28.571429,1,136.285714,79.505908
3,Roger Federer,Australian Open,2007,86.96,21,0,64.705882,58.084772,71.428571,0,114.142857,82.835661
4,Roger Federer,Wimbledon,2017,86.33,20,0,62.244898,57.046414,71.428571,0,99.142857,85.0913
5,Rafael Nadal,Roland Garros,2010,84.87,21,0,64.676617,56.855151,57.142857,0,138.714286,79.140709
6,Novak Djokovic,Australian Open,2011,81.49,19,1,67.391304,56.930693,71.428571,2,132.428571,80.085929
7,Jannik Sinner,Australian Open,2024,80.22,21,3,62.946429,55.316092,71.428571,3,158.285714,76.197637
8,Rafael Nadal,Roland Garros,2019,79.72,21,2,69.230769,59.117897,57.142857,2,140.714286,78.839957
9,Rafael Nadal,Roland Garros,2014,79.04,21,2,70.3125,60.592851,42.857143,2,131.714286,80.19334


In [44]:
import json
import pandas as pd
import math

def safe(val, round_to=None):
    """
    Make a scalar safe to write as JSON.

    A missing value (anything pd.isna reports as missing, NaN included)
    becomes None so it is written as JSON null rather than the non-standard
    token NaN.  Any other value is returned as-is, rounded to `round_to`
    decimals when `round_to` is given.
    """
    if pd.isna(val):
        return None
    return round(val, round_to) if round_to is not None else val


# campaign_df column that each WEIGHTS entry multiplies in dominance_score
BREAKDOWN_COLUMNS = {
    "sets": "sets_won_pct",
    "games": "games_won_pct",
    "points": "points_won_pct",
    "opponent": "pct_top30_opponents",
    "speed": "speed_score",
}

records = []
# campaign_df is sorted by dominance_score, so position in the top 25 is the rank
for rank, (_, row) in enumerate(campaign_df.head(25).iterrows(), start=1):
    record = {
        # Core identifiers & ranking
        "rank": rank,
        "player": row["player"],
        "tournament": row["tourney_name"],
        "year": int(row["year"]),
        "dominance_score": safe(row["dominance_score"], 2),

        # Key performance metrics (for table & radar chart)
        "sets_won": int(row["sets_won"]),
        "sets_won_pct": safe(row["sets_won_pct"], 1),
        "games_won_pct": safe(row["games_won_pct"], 1),
        "points_won_pct": safe(row["points_won_pct"], 1),
        "pct_top30_opponents": safe(row["pct_top30_opponents"], 1),
        "speed_score": safe(row["speed_score"], 1),
        "avg_match_minutes": safe(row["avg_match_minutes"], 1),

        # Highlight metrics (for table display)
        "top5_wins": int(row["top5_wins"]),
        # bool(): a numpy.bool_ comparison result is not JSON serialisable
        "perfect_campaign": bool(row["sets_lost"] == 0),
        "sets_lost": int(row["sets_lost"]),

        # Optional: Score breakdown (for bar chart).
        # Uses the same WEIGHTS as dominance_score so the two cannot drift
        # apart; the bonus is not part of the breakdown, so the components
        # do not add up to dominance_score for campaigns that earned one.
        "score_breakdown": {
            part: safe(WEIGHTS[part] * row[column], 1)
            for part, column in BREAKDOWN_COLUMNS.items()
        },
    }
    records.append(record)

# Save to JSON
json_path = "/Users/saurabhkumar/Desktop/Work/Tennis/gsdi/gs_dominance_rankings.json"
with open(json_path, "w") as f:
    json.dump(records, f, indent=2)

print(f"JSON successfully saved to: {json_path}")
print(f"Total campaigns: {len(records)}")

JSON successfully saved to: /Users/saurabhkumar/Desktop/Work/Tennis/gsdi/gs_dominance_rankings.json
Total campaigns: 25
