In [1]:
import pandas as pd
import numpy as np
import re
import json

In [7]:
# Load the full ATP match table from a local pickle file.
pickle_path = "/Users/saurabhkumar/Desktop/Work/Tennis"
atp_data = pd.read_pickle(f"{pickle_path}/ATPdata1968_2024.pkl")

# Keep rows with tourney_level "G", round in {F, SF}, and a tourney_date year >= 1980.
gs_data = atp_data.loc[
    atp_data["tourney_level"].eq("G")
    & atp_data["round"].isin(["F", "SF"])
    & pd.to_datetime(atp_data["tourney_date"], format="%Y%m%d").dt.year.ge(1980)
].copy()

# Flag scores without a "RET" / "W/O" marker, then keep only those rows.
gs_data["is_complete"] = ~gs_data["score"].str.contains("RET|W/O", na=False)
gs_data = gs_data.loc[gs_data["is_complete"]].copy()


In [3]:
# Show the column labels of the filtered frame.
gs_data.columns

Index(['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
       'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
       'loser_id', 'loser_seed', 'loser_entry', 'loser_name', 'loser_hand',
       'loser_ht', 'loser_ioc', 'loser_age', 'score', 'best_of', 'round',
       'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon',
       'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df', 'l_svpt',
       'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced',
       'winner_rank', 'winner_rank_points', 'loser_rank', 'loser_rank_points',
       'is_complete'],
      dtype='object')

In [8]:
# Write the filtered match table to CSV (row index omitted).
# The former leading `gs_data.shape[0]` was dropped: it was not the cell's last
# expression, so its value was never displayed.
output_path = "/Users/saurabhkumar/Desktop/Work/Tennis/nbi/allmatches.csv"
gs_data.to_csv(output_path, index=False)
print(f"DataFrame successfully saved to: {output_path}")

DataFrame successfully saved to: /Users/saurabhkumar/Desktop/Work/Tennis/nbi/allmatches.csv


In [9]:
# Row count of gs_data.
gs_data.shape[0]

516

In [58]:
# Function to parse set scores from the 'score' string
# Functions
def parse_sets(score_str):
    """Break a score string into per-set statistics.

    Returns a tuple ``(set_margins, tiebreaks, lead_changes)``:
    the absolute game difference of every "A-B" set found, the number of sets
    carrying a "(n)" tiebreak suffix, and the number of times the set winner
    flips between consecutive sets. Missing scores give ``([], 0, 0)``.
    """
    if pd.isna(score_str):
        return [], 0, 0

    cleaned = score_str.replace("RET", "").replace("W/O", "")
    parsed = re.findall(r"(\d+)-(\d+)(?:\((\d+)\))?", cleaned)

    set_margins = []
    first_side_won = []
    tiebreaks = 0
    for left, right, tb_points in parsed:
        left_games, right_games = int(left), int(right)
        set_margins.append(abs(left_games - right_games))
        first_side_won.append(left_games > right_games)
        if tb_points not in (None, ""):
            tiebreaks += 1

    # Compare each set's winner with the previous set's winner.
    lead_changes = sum(
        1 for before, after in zip(first_side_won, first_side_won[1:]) if before != after
    )
    return set_margins, tiebreaks, lead_changes

def advanced_comeback_score(score_str):
    """Grade how much of a comeback the match winner needed (0-3).

    3: the eventual winner lost the first two sets.
    2: the winner's lost sets are spread out (first and last lost set more than
       one set apart), or in a five-setter the winner took only one of the
       first three sets.
    1: any other five-setter where the winner took the last set.
    0: fewer than three sets, an even set split, a missing score, or none of
       the above.
    """
    if pd.isna(score_str):
        return 0

    cleaned = score_str.replace("RET", "").replace("W/O", "")
    outcomes = [
        1 if int(left) > int(right) else 2
        for left, right in re.findall(r"(\d+)-(\d+)", cleaned)
    ]
    if len(outcomes) < 3:
        return 0

    first_total = outcomes.count(1)
    second_total = outcomes.count(2)
    if first_total == second_total:
        return 0
    winner, loser = (1, 2) if first_total > second_total else (2, 1)

    # Loser took both opening sets: winner came back from two sets down.
    if outcomes[:2] == [loser, loser]:
        return 3

    dropped = [idx for idx, side in enumerate(outcomes) if side == loser]
    if len(dropped) >= 2 and dropped[-1] - dropped[0] > 1:
        return 2

    if len(outcomes) == 5:
        if outcomes[:3].count(winner) == 1:
            return 2
        if outcomes[-1] == winner:
            return 1
    return 0

def final_set_tiebreak(score_str):
    """Return 1 when the last parsed set carries a "(n)" tiebreak suffix, else 0.

    Missing scores and scores with no parsable set return 0.
    """
    if pd.isna(score_str):
        return 0

    cleaned = score_str.replace("RET", "").replace("W/O", "")
    parsed = re.findall(r"(\d+)-(\d+)(?:\((\d+)\))?", cleaned)
    if len(parsed) == 0:
        return 0

    # The third capture group is '' when the set had no tiebreak suffix.
    *_, tiebreak_points = parsed[-1]
    if tiebreak_points in (None, ""):
        return 0
    return 1

In [59]:
# Feature engineering
# parse_sets returns (set_margins, tiebreaks, lead_changes); unpack it into columns.
gs_data["parsed"] = gs_data["score"].apply(parse_sets)
gs_data["set_margins"] = gs_data["parsed"].apply(lambda x: x[0])
gs_data["tiebreak_count"] = gs_data["parsed"].apply(lambda x: x[1])
gs_data["lead_changes"] = gs_data["parsed"].apply(lambda x: x[2])
gs_data["avg_set_margin"] = gs_data["set_margins"].apply(lambda x: np.mean(x) if x else np.nan)
gs_data["comeback"] = gs_data["score"].apply(advanced_comeback_score)
gs_data["num_sets"] = gs_data["set_margins"].apply(len)
gs_data["final_set_tiebreak"] = gs_data["score"].apply(final_set_tiebreak)

# Break point drama
# Missing break-point counts are treated as 0; a 0 total gives a ratio of 0.
gs_data["bp_total"] = gs_data["w_bpFaced"].fillna(0) + gs_data["l_bpFaced"].fillna(0)
gs_data["bp_saved_total"] = gs_data["w_bpSaved"].fillna(0) + gs_data["l_bpSaved"].fillna(0)
gs_data["bp_saved_ratio"] = gs_data["bp_saved_total"] / gs_data["bp_total"].replace(0, np.nan)
gs_data["bp_saved_ratio"] = gs_data["bp_saved_ratio"].fillna(0)

# Duration: minutes per set (NaN when minutes is missing or no set was parsed).
gs_data["duration_score"] = gs_data["minutes"] / gs_data["num_sets"].replace(0, np.nan)

# Normalize features to [0, 1] with min-max scaling.
features_to_normalize = ["avg_set_margin", "tiebreak_count", "lead_changes", "comeback", "bp_saved_ratio", "duration_score"]

for col in features_to_normalize:
    min_val = gs_data[col].min()
    max_val = gs_data[col].max()
    if max_val == min_val:
        # Degenerate range: every row gets the same (zero) contribution.
        normalized = pd.Series(0.0, index=gs_data.index)
    elif col == "avg_set_margin":
        # Inverted so that a smaller average margin (closer sets) scores higher.
        normalized = (max_val - gs_data[col]) / (max_val - min_val)
    else:
        normalized = (gs_data[col] - min_val) / (max_val - min_val)
    # A missing raw feature (e.g. no `minutes`) would otherwise stay NaN and turn
    # the whole weighted NBI sum into NaN. Treat it as "no contribution", the
    # same way missing break-point data is handled above.
    gs_data[col + "_norm"] = normalized.fillna(0)

In [64]:
# ---------- NBI Calculation ----------
# Weighted sum of the drama components; weights are listed in summation order.
nbi_weights = {
    "avg_set_margin_norm": 0.25,   # Set closeness (inverted, normalized)
    "tiebreak_count_norm": 0.12,   # Tiebreaks (less weight)
    "lead_changes_norm": 0.18,     # Lead changes (more weight)
    "comeback_norm": 0.22,         # Comeback (more weight)
    "bp_saved_ratio_norm": 0.07,   # Break point drama (less weight)
    "final_set_tiebreak": 0.06,    # Final set tiebreak
    "duration_score_norm": 0.10,   # Duration
}

nbi_total = 0.0
for component, weight in nbi_weights.items():
    nbi_total = nbi_total + weight * gs_data[component]
gs_data["NBI"] = nbi_total

In [65]:
# Sort and prepare output
ranked = gs_data.sort_values("NBI", ascending=False)


def tag_drama(row):
    """Build a comma-separated list of drama labels for one match row.

    Reads "comeback", "tiebreak_count", "lead_changes" and "bp_saved_ratio"
    directly. "duration_score_norm" and "final_set_tiebreak" are read with
    ``row.get`` and default to 0 when the row does not carry them.
    Returns "standard" when no label applies.
    """
    tags = []
    if row["comeback"] >= 2:
        tags.append("comeback")
    if row["tiebreak_count"] >= 2:
        tags.append("tiebreaks")
    if row["lead_changes"] >= 2:
        tags.append("momentum")
    if row["bp_saved_ratio"] > 0.6:
        tags.append("bp drama")
    if row.get("duration_score_norm", 0) > 0.7:  # Use normalized duration if available
        tags.append("epic length")
    if row.get("final_set_tiebreak", 0) > 0:
        tags.append("final set tiebreak")
    return ", ".join(tags) if tags else "standard"


# Tag on the full ranked frame. The trimmed `nailbiters` view below drops
# "duration_score_norm" and "final_set_tiebreak", so tagging it would silently
# fall back to the 0 defaults and never emit "epic length" / "final set tiebreak".
drama_tags = ranked.apply(tag_drama, axis=1)

nailbiters = ranked[
    ["tourney_name", "tourney_date", "round", "minutes", "score",
     "winner_name", "loser_name", "NBI", "comeback", "avg_set_margin",
     "tiebreak_count", "lead_changes", "bp_saved_ratio", "bp_total"]
].reset_index(drop=True)

# Rescale so the top-ranked match scores 100.
max_nbi = nailbiters["NBI"].max()
nailbiters["NBI_100"] = (nailbiters["NBI"] / max_nbi) * 100

# Both frames are in the same sorted order; reset the index so the labels align.
nailbiters["drama_tags"] = drama_tags.reset_index(drop=True)



In [66]:
import json
import math

# Build JSON-ready dicts
def safe(val, round_to=None):
    """Return None for a missing value; otherwise val, rounded when round_to is given."""
    missing = pd.isna(val) or (isinstance(val, float) and math.isnan(val))
    if missing:
        return None
    if round_to is None:
        return val
    return round(val, round_to)


records = []
for _, row in nailbiters.iterrows():
    minutes = row["minutes"]
    raw_stats = {
        "avg_set_margin": safe(row["avg_set_margin"], 2),
        "tiebreak_count": safe(row["tiebreak_count"], 0),
        "lead_changes": safe(row["lead_changes"], 0),
        "comeback": safe(row["comeback"], 0),
        "bp_saved_ratio": safe(row["bp_saved_ratio"], 3),
        "bp_total": safe(row["bp_total"], 0)
    }
    record = {
        "match": f"{row['winner_name']} def. {row['loser_name']}",
        "tourney": row["tourney_name"],
        "round": row["round"],
        "date": str(row["tourney_date"])[:10],
        "score": row["score"],
        "duration": None if pd.isna(minutes) else int(minutes),
        "NBI": safe(row["NBI"], 3),
        "NBI_100": safe(row["NBI_100"], 0),
        "drama_tags": row["drama_tags"],
        "raw_stats": raw_stats,
    }
    records.append(record)

# Save to JSON
json_path = "/Users/saurabhkumar/Desktop/Work/Tennis/nbi/gs_nailbiters.json"
with open(json_path, "w") as f:
    json.dump(records, f, indent=2)

print(f"JSON successfully saved to: {json_path}")

# Save as CSV
output_path = "/Users/saurabhkumar/Desktop/Work/Tennis/nbi/gs_nailbiters.csv"
nailbiters.to_csv(output_path, index=False)
print(f"DataFrame successfully saved to: {output_path}")


JSON successfully saved to: /Users/saurabhkumar/Desktop/Work/Tennis/nbi/gs_nailbiters.json
DataFrame successfully saved to: /Users/saurabhkumar/Desktop/Work/Tennis/nbi/gs_nailbiters.csv


In [67]:
# Create top 50 JSON file
# `records` is already ordered by NBI, so the first 50 entries are the top 50.
json_path_50 = "/Users/saurabhkumar/Desktop/Work/Tennis/nbi/gs_nailbiters50.json"
top_50_records = records[:50]

with open(json_path_50, "w") as f:
    f.write(json.dumps(top_50_records, indent=2))

print(f"Top 50 JSON successfully saved to: {json_path_50}")

Top 50 JSON successfully saved to: /Users/saurabhkumar/Desktop/Work/Tennis/nbi/gs_nailbiters50.json


# Old Work v1

In [None]:
# Function to parse set scores from the 'score' string
def parse_sets(score_str):
    """Return ``(set_margins, tiebreaks, lead_changes)`` for a score string.

    set_margins: absolute game difference per "A-B" set.
    tiebreaks: number of sets with a "(n)" suffix.
    lead_changes: number of consecutive set pairs won by different sides.
    A missing score yields ``([], 0, 0)``.
    """
    if pd.isna(score_str):
        return [], 0, 0

    text = score_str.replace("RET", "").replace("W/O", "")
    found = re.findall(r"(\d+)-(\d+)(?:\((\d+)\))?", text)

    games = [(int(a), int(b)) for a, b, _tb in found]
    set_margins = [abs(a - b) for a, b in games]
    tiebreaks = len([tb for _a, _b, tb in found if tb not in (None, "")])

    lead_changes = 0
    for idx in range(1, len(games)):
        prev_a, prev_b = games[idx - 1]
        cur_a, cur_b = games[idx]
        if (prev_a > prev_b) != (cur_a > cur_b):
            lead_changes += 1
    return set_margins, tiebreaks, lead_changes

def advanced_comeback_score(score_str):
    """Score the winner's comeback on a 0-3 scale from the set-by-set result.

    3 when the eventual winner lost the first two sets; 2 when the winner's
    lost sets are more than one set apart, or a five-setter in which the winner
    took just one of the first three sets; 1 for any other five-setter the
    winner closed out; 0 otherwise (including missing scores, fewer than three
    sets, or an even set split).
    """
    if pd.isna(score_str):
        return 0

    text = score_str.replace("RET", "").replace("W/O", "")
    pairs = re.findall(r"(\d+)-(\d+)", text)
    if len(pairs) < 3:
        return 0

    set_winners = []
    for left, right in pairs:
        set_winners.append(1 if int(left) > int(right) else 2)

    tally = {1: set_winners.count(1), 2: set_winners.count(2)}
    if tally[1] == tally[2]:
        return 0
    winner = 1 if tally[1] > tally[2] else 2
    loser = 3 - winner

    # Down 0-2: both opening sets went to the eventual loser.
    if set_winners[0] == loser and set_winners[1] == loser:
        return 3

    lost_at = [pos for pos, side in enumerate(set_winners) if side == loser]
    spread_out = len(lost_at) >= 2 and (lost_at[-1] - lost_at[0]) > 1
    if spread_out:
        return 2

    if len(set_winners) != 5:
        return 0
    if set_winners[:3].count(winner) == 1:
        return 2
    return 1 if set_winners[-1] == winner else 0
def final_set_tiebreak(score_str):
    """Return 1 if the last set in the score has a "(n)" tiebreak suffix, else 0."""
    if pd.isna(score_str):
        return 0

    text = score_str.replace("RET", "").replace("W/O", "")
    found = re.findall(r"(\d+)-(\d+)(?:\((\d+)\))?", text)
    if not found:
        return 0

    # Each match is (games_a, games_b, tiebreak); tiebreak is '' when absent.
    closing_set = found[-1]
    has_tiebreak = len(closing_set) > 2 and closing_set[2] not in (None, "")
    if has_tiebreak:
        return 1
    return 0

def clutch_drama(comeback_score, final_set_thriller):
    """Return 1 when comeback_score is at least 2 and final_set_thriller equals 1, else 0."""
    if comeback_score >= 2 and final_set_thriller == 1:
        return 1
    return 0

In [None]:
# ---------- Apply Feature Engineering ----------
# parse_sets returns (set_margins, tiebreaks, lead_changes); unpack it into columns.
gs_data["parsed"] = gs_data["score"].apply(parse_sets)
gs_data["set_margins"] = gs_data["parsed"].apply(lambda x: x[0])
gs_data["tiebreak_count"] = gs_data["parsed"].apply(lambda x: x[1])
gs_data["lead_changes"] = gs_data["parsed"].apply(lambda x: x[2])
gs_data["avg_set_margin"] = gs_data["set_margins"].apply(lambda x: np.mean(x) if x else np.nan)
gs_data["comeback"] = gs_data["score"].apply(advanced_comeback_score)
# 1 when the last set was decided by at most two games.
gs_data["final_set_thriller"] = gs_data["set_margins"].apply(lambda x: 1 if x and x[-1] <= 2 else 0)
gs_data["num_sets"] = gs_data["set_margins"].apply(len)
gs_data["momentum_volatility"] = gs_data["lead_changes"] / gs_data["num_sets"].replace(0, np.nan)

gs_data["final_set_tiebreak"] = gs_data["score"].apply(final_set_tiebreak)
gs_data["clutch_drama"] = gs_data.apply(lambda row: clutch_drama(row["comeback"], row["final_set_thriller"]), axis=1)

# ---------- Break Point Drama ----------
gs_data["bp_total"] = gs_data["w_bpFaced"].fillna(0) + gs_data["l_bpFaced"].fillna(0)
gs_data["bp_saved_total"] = gs_data["w_bpSaved"].fillna(0) + gs_data["l_bpSaved"].fillna(0)
gs_data["bp_saved_ratio"] = gs_data["bp_saved_total"] / gs_data["bp_total"].replace(0, np.nan)

# ---------- Duration ----------
gs_data["duration_score"] = gs_data["minutes"] / gs_data["num_sets"].replace(0, np.nan)

# ---------- Normalization ----------
# Source column -> normalized column. Duration keeps its historical
# "duration_norm" name; the rest use the "<col>_norm" convention.
norm_targets = {
    "duration_score": "duration_norm",
    "avg_set_margin": "avg_set_margin_norm",
    "tiebreak_count": "tiebreak_count_norm",
    "lead_changes": "lead_changes_norm",
    "momentum_volatility": "momentum_volatility_norm",
    "comeback": "comeback_norm",
    "bp_saved_ratio": "bp_saved_ratio_norm",
}

for col, norm_col in norm_targets.items():
    min_val = gs_data[col].min()
    max_val = gs_data[col].max()
    if max_val == min_val:
        # Guard the zero-width range: dividing by (max - min) == 0 would fill
        # the column with NaN/inf and poison the NBI sum.
        gs_data[norm_col] = 0
    elif col == "avg_set_margin":
        # Invert after normalization so closer sets score higher.
        gs_data[norm_col] = (max_val - gs_data[col]) / (max_val - min_val)
    else:
        gs_data[norm_col] = (gs_data[col] - min_val) / (max_val - min_val)

In [None]:
# ---------- NBI Calculation ----------
# Component column -> (weight, source column), listed in summation order.
# Each component is exactly the term that enters the NBI, so the NBI_* columns
# add up to NBI. Previously they were built from raw, un-normalized values and
# 1/avg_set_margin, which did not match the formula they were meant to break down.
nbi_components = {
    "NBI_set_closeness": (0.18, "avg_set_margin_norm"),
    "NBI_tiebreaks": (0.15, "tiebreak_count_norm"),
    "NBI_lead_changes": (0.10, "lead_changes_norm"),
    "NBI_momentum_volatility": (0.06, "momentum_volatility_norm"),
    "NBI_comeback": (0.28, "comeback_norm"),
    "NBI_final_set_thriller": (0.05, "final_set_thriller"),
    "NBI_bp_drama": (0.08, "bp_saved_ratio_norm"),
    "NBI_duration": (0.07, "duration_norm"),
    "NBI_final_set_tiebreak": (0.02, "final_set_tiebreak"),
    "NBI_clutch_drama": (0.01, "clutch_drama"),
}

# Define individual component columns
for component, (weight, source) in nbi_components.items():
    gs_data[component] = weight * gs_data[source]

# Accumulate left to right; a NaN component makes the row's NBI NaN.
nbi_total = 0.0
for component in nbi_components:
    nbi_total = nbi_total + gs_data[component]
gs_data["NBI"] = nbi_total

nailbiters = gs_data.sort_values("NBI", ascending=False)[
    [
        "tourney_name", "tourney_date", "round", "minutes", "score",
        "winner_name", "loser_name", "NBI", "comeback", "avg_set_margin",
        "tiebreak_count", "lead_changes", "bp_saved_ratio", "bp_total",
        "NBI_set_closeness", "NBI_tiebreaks", "NBI_lead_changes", "NBI_comeback",
        "NBI_bp_drama", "NBI_duration", "NBI_final_set_thriller", "NBI_momentum_volatility", "NBI_final_set_tiebreak", "NBI_clutch_drama"
    ]
].reset_index(drop=True)

# Old Work v0

In [4]:
# Apply parser
# parse_sets returns a 3-tuple; fan each position out into its own column.
gs_data["parsed"] = gs_data["score"].apply(parse_sets)
for position, column in enumerate(["set_margins", "tiebreak_count", "lead_changes"]):
    gs_data[column] = gs_data["parsed"].apply(lambda parsed, position=position: parsed[position])

# Mean game margin per set; NaN when no set could be parsed.
gs_data["avg_set_margin"] = gs_data["set_margins"].apply(
    lambda margins: np.mean(margins) if margins else np.nan
)

In [None]:
# Break point drama
# Missing counts are treated as 0 before adding winner and loser figures.
faced_w, faced_l = (gs_data[name].fillna(0) for name in ("w_bpFaced", "l_bpFaced"))
saved_w, saved_l = (gs_data[name].fillna(0) for name in ("w_bpSaved", "l_bpSaved"))
gs_data["bp_total"] = faced_w + faced_l
gs_data["bp_saved_total"] = saved_w + saved_l

# A total of 0 becomes NaN so the ratio is NaN instead of a division by zero.
bp_denominator = gs_data["bp_total"].replace(0, np.nan)
gs_data["bp_saved_ratio"] = gs_data["bp_saved_total"] / bp_denominator

# Advanced comeback scoring
def advanced_comeback_score(score_str):
    """Rate the winner's comeback from 0 (none) to 3 (won after losing the first two sets).

    The side that won more sets is the winner; scores with fewer than three
    sets, an even set split, or a missing value rate 0.
    """
    if pd.isna(score_str):
        return 0

    stripped = score_str.replace("RET", "").replace("W/O", "")
    set_winners = [
        1 if int(first) > int(second) else 2
        for first, second in re.findall(r"(\d+)-(\d+)", stripped)
    ]
    if len(set_winners) < 3:
        return 0  # Not long enough for a real comeback

    wins_first = set_winners.count(1)
    wins_second = set_winners.count(2)
    if wins_first == wins_second:
        return 0
    if wins_first > wins_second:
        winner, loser = 1, 2
    else:
        winner, loser = 2, 1

    # Down 0–2
    if set_winners[:2] == [loser] * 2:
        return 3

    # Lost two non-consecutive sets
    lost_positions = [i for i, side in enumerate(set_winners) if side == loser]
    if len(lost_positions) >= 2 and lost_positions[-1] - lost_positions[0] > 1:
        return 2

    five_setter = len(set_winners) == 5

    # Down 1–2 and wins
    if five_setter and set_winners[:3].count(winner) == 1:
        return 2

    # 2–2, wins final set
    if five_setter and set_winners[-1] == winner:
        return 1

    return 0

gs_data["comeback"] = gs_data["score"].map(advanced_comeback_score)

# Number of sets
gs_data["num_sets"] = gs_data["set_margins"].map(len)

# Duration score: minutes per set (NaN when no set was parsed)
sets_played = gs_data["num_sets"].replace(0, np.nan)
gs_data["duration_score"] = gs_data["minutes"] / sets_played

In [None]:
# Updated NBI formula (with 30% weight to comeback)
# Set closeness is the reciprocal of the average margin (a 0 margin becomes NaN).
set_closeness = 1 / gs_data["avg_set_margin"].replace(0, np.nan)
bp_drama = gs_data["bp_saved_ratio"].fillna(0)

nbi_terms = [
    0.25 * set_closeness,
    0.2 * gs_data["tiebreak_count"],
    0.15 * gs_data["lead_changes"],
    0.3 * gs_data["comeback"],
    0.1 * bp_drama,
]
nbi_total = nbi_terms[0]
for term in nbi_terms[1:]:
    nbi_total = nbi_total + term
gs_data["NBI"] = nbi_total

In [10]:

# Rank and preview top nail-biters
ranking_columns = [
    "tourney_name", "tourney_date", "round", "minutes", "score",
    "winner_name", "loser_name", "NBI", "comeback",
    "avg_set_margin", "tiebreak_count", "lead_changes", "bp_saved_ratio", "bp_total",
]
# reset_index() without drop keeps the original row labels as an "index" column.
nailbiters = gs_data.sort_values("NBI", ascending=False)[ranking_columns].reset_index()



In [11]:
# Keep finals/semifinals whose tourney_date year is after 2000, highest NBI first.
is_late_round = nailbiters["round"].isin(["F", "SF"])
match_year = pd.to_datetime(nailbiters["tourney_date"], format="%Y%m%d").dt.year
gs_nailbiters = nailbiters.loc[is_late_round & (match_year > 2000)].sort_values(
    "NBI", ascending=False
)

In [14]:
# NOTE(review): this path is the same file the current pipeline's CSV export
# writes to; confirm that overwriting it from this older cell is intended.
output_path = "/Users/saurabhkumar/Desktop/Work/Tennis/nbi/gs_nailbiters.csv"
gs_nailbiters.to_csv(output_path, index=False)

print(f"DataFrame successfully saved to: {output_path}")

# Preview the top rows. Placed last because a notebook only displays the value
# of a cell's final expression; as the first statement it was discarded.
gs_nailbiters.head(5)

DataFrame successfully saved to: /Users/saurabhkumar/Desktop/Work/Tennis/nbi/gs_nailbiters.csv
