
# Tennis Tagging Analytics Notebook

This notebook ingests your **Tennis Match Tagging Schema (CSV)** and produces:

- Validation of enum values & required fields
- Core KPIs (win rate, serve/return effectiveness)
- Pattern & tactic win rates
- Error/pressure hotspots
- Overhead conversion, rally-length profiles
- Saved charts in `figures/`

> **How to use**
> 1. Place your match CSV (following `tennis_tag_schema_dictionary.md`) in the same folder.
> 2. Set `CSV_PATH` in the next cell.
> 3. Run all cells.



In [None]:

# --- Config ---
# Path of the tagged-match CSV that the loading cell reads with pandas.
CSV_PATH = 'tennis_tag_schema_template.csv'  # <-- change to your file path
# Directory that receives the chart PNG files written by the visualization cell.
FIG_DIR = 'figures'
# Directory that receives the generated Markdown report (match_kpis.md).
REPORT_DIR = 'reports'

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from textwrap import dedent

# Create both output folders up front; exist_ok makes re-running the cell harmless
for out_dir in (FIG_DIR, REPORT_DIR):
    os.makedirs(out_dir, exist_ok=True)

# Apply one consistent matplotlib style to every chart produced below
plt.style.use('seaborn-v0_8-darkgrid')

# Echo the configured input so the run log shows which file was analysed
print('Using CSV:', CSV_PATH)


In [None]:

# --- Allowed enums & schema ---
# Every column listed here must be present in the CSV header.
REQUIRED_COLUMNS = [
    'point_id',
    'set_no',
    'game_no',
    'score_before_point',
    'server',
    'serve_number',
    'serve_code',
    'return_code',
    'return_aggr',
    'rally_len_shots',
    'stroke_seq',
    'pattern',
    'tactic_code',
    'pressure_flags',
    'final_shot_type',
    'final_outcome',
    'court_pos_final',
    'notes',
]

# Column name -> set of accepted values. An empty string in a set means the
# column may be left blank; columns without it must always carry a value.
ALLOWED = dict(
    server={'n', 'o'},
    serve_number={'1', '2', 1, 2},
    serve_code={'A', 'W', 'IN', 'SF', 'DF', 'WB'},
    return_code={'IN', 'NET', 'LONG', 'WIDE', 'UE', 'FE', ''},
    return_aggr={'BLK', 'NEU', 'AGR', ''},
    pattern={'FIRST', 'RALLY', 'APPROACH', 'NET', 'LOB_DEF', 'MOON_BALL', ''},
    tactic_code={'MOVE_OP', 'DEPTH', 'CHANGE_DIR', 'TO_WEAK_WING', 'BODY', 'PACE', ''},
    pressure_flags={'MOVED_BY_OP', 'CROSSED_BY_OP', 'PASSED_AT_NET', 'QUESTIONABLE_CALL', ''},
    final_shot_type={'F', 'B', 'SLICE', 'V', 'O', 'D', 'L', ''},
    court_pos_final={'BASELINE', 'INSIDE', 'NET', ''},
)

# final_outcome is a compound "<main>|<cause>" value; these are the legal halves
OUTCOME_MAIN = {'PtWon', 'PtLost'}
OUTCOME_CAUSE = {'W', 'UE', 'FE', 'DF'}

# --- Load CSV ---
# Reads the tagged-match file into `df`, checks the header against
# REQUIRED_COLUMNS, and coerces column types for the cells that follow.
df = pd.read_csv(CSV_PATH)

# Normalize column names by stripping surrounding whitespace
df.columns = [c.strip() for c in df.columns]

# Check required columns; raise ValueError listing every missing one
missing = [c for c in REQUIRED_COLUMNS if c not in df.columns]
if missing:
    raise ValueError(f"Missing required columns: {missing}")

# Fill NaN with empty string for text columns so blanks compare equal to ''
for col in ['score_before_point','stroke_seq','pattern','tactic_code','pressure_flags','final_shot_type','notes','return_code','return_aggr','court_pos_final']:
    df[col] = df[col].fillna('')

# Coerce types; unparseable numbers become NaN rather than raising
for numeric_col in ('point_id', 'set_no', 'game_no'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

# serve_number is compared and grouped as text. When any cell in the column is
# blank, pandas parses the whole column as float, so a plain astype(str) yields
# '1.0'/'2.0'; dropping a trailing '.0' restores the '1'/'2' labels the
# validation step expects. Blank cells still surface as 'nan' and get flagged.
df['serve_number'] = (
    df['serve_number']
    .astype(str)
    .str.strip()
    .str.replace(r'\.0$', '', regex=True)
)

# Missing or unparseable rally lengths are treated as 0 shots
df['rally_len_shots'] = pd.to_numeric(df['rally_len_shots'], errors='coerce').fillna(0).astype(int)

# --- Validation ---
# Collects one (row, column, value, reason) tuple per problem found.
issues = []

# Stringified copy of every allowed set, built once instead of per cell
allowed_as_text = {name: {str(v) for v in values} for name, values in ALLOWED.items()}

for row_idx, record in df.iterrows():
    # Enum checks
    for field, permitted in ALLOWED.items():
        cell = str(record[field]) if field in df.columns else ''

        # pressure_flags holds a ';'-separated list, so each flag is checked alone
        if field == 'pressure_flags' and cell:
            tokens = (piece.strip() for piece in cell.split(';'))
            issues.extend(
                (row_idx, field, cell, f"invalid flag '{token}'")
                for token in tokens
                if token and token not in permitted
            )
            continue

        if cell not in allowed_as_text[field]:
            issues.append((row_idx, field, cell, 'invalid value'))

    # final_outcome must be exactly "<main>|<cause>" with both halves recognised
    outcome = str(record['final_outcome']) if 'final_outcome' in df.columns else ''
    pieces = outcome.split('|') if outcome else []
    well_formed = (
        len(pieces) == 2
        and pieces[0] in OUTCOME_MAIN
        and pieces[1] in OUTCOME_CAUSE
    )
    if not well_formed:
        issues.append((row_idx, 'final_outcome', outcome, 'expected format PtWon|W or PtLost|UE etc.'))

# Tabulate the findings for display
issues_df = pd.DataFrame(issues, columns=['row','column','value','reason'])

print(f"Rows: {len(df)} | Validation issues: {len(issues_df)}")
issues_df.head(20)


In [None]:

# --- Preprocessing & derived fields ---

# Break "<main>|<cause>" into two columns; a value without '|' leaves the cause empty
outcome_parts = [text.split('|') for text in df['final_outcome'].astype(str)]
df['outcome_main'] = [pieces[0] for pieces in outcome_parts]
df['outcome_cause'] = [pieces[1] if len(pieces) > 1 else '' for pieces in outcome_parts]

# 0/1 indicators for the point result
df['pt_won'] = df['outcome_main'].eq('PtWon').astype(int)
df['pt_lost'] = df['outcome_main'].eq('PtLost').astype(int)

# First-strike indicator: pattern FIRST or rally_len_shots <= 3
tagged_first = df['pattern'] == 'FIRST'
short_rally = df['rally_len_shots'] <= 3
df['first_strike'] = (tagged_first | short_rally).astype(int)

# One 0/1 column per pressure flag, set when the flag name occurs in pressure_flags
pressure_text = df['pressure_flags'].fillna('')
for flag in ('MOVED_BY_OP', 'CROSSED_BY_OP', 'PASSED_AT_NET', 'QUESTIONABLE_CALL'):
    df[f'flag_{flag}'] = pressure_text.str.contains(flag).astype(int)

# Final shot class simplified
def shot_class(s):
    """Return the shot code unchanged when it is a recognised final-shot type.

    The argument is converted with ``str`` first. Recognised codes are
    F, B, SLICE, V, O, D and L; anything else (including blanks) maps to ''.
    """
    code = str(s)
    recognised = ('F', 'B', 'SLICE', 'V', 'O', 'D', 'L')
    return code if code in recognised else ''

# Map every final_shot_type through shot_class; unrecognised codes become ''
final_shot_types = df['final_shot_type']
df['final_shot_class'] = final_shot_types.apply(shot_class)

# Report which derived columns now exist on df
derived_flag_columns = [name for name in df.columns if name.startswith('flag_')]
print('Derived columns added:', derived_flag_columns + ['first_strike','final_shot_class'])


In [None]:

# --- Core KPIs ---


def _win_rate_by(frame, key, **groupby_kwargs):
    """Return one row per `key` group with the mean of `pt_won`.

    The result has the group column, `win_rate` (fraction) and `win_rate_%`
    (percentage rounded to one decimal). Extra keyword arguments are passed
    straight to `DataFrame.groupby`.
    """
    table = (
        frame.groupby(key, **groupby_kwargs)['pt_won']
        .mean()
        .rename('win_rate')
        .reset_index()
    )
    table['win_rate_%'] = (table['win_rate'] * 100).round(1)
    return table


summary = {
    'points': len(df),
    'won': int(df['pt_won'].sum()),
    'lost': int(df['pt_lost'].sum()),
}
# max(1, ...) keeps an empty file from dividing by zero
summary['win_rate_%'] = round(100 * summary['won'] / max(1, summary['points']), 2)
print(summary)

# By server
by_server = _win_rate_by(df, 'server')
print('\nWin rate by server:')
print(by_server)

# Serve number & serve code
by_serve_no = _win_rate_by(df, 'serve_number')
by_serve_code = _win_rate_by(df, 'serve_code')

# Return outcomes
by_return = _win_rate_by(df, 'return_code')

# Patterns & tactics
by_pattern = _win_rate_by(df, 'pattern')
by_tactic = _win_rate_by(df, 'tactic_code')

# Pressure flags impact: win rate over the points where each flag is set
flag_cols = [c for c in df.columns if c.startswith('flag_')]
flag_impact = []
for c in flag_cols:
    flagged = df[df[c] == 1]
    flag_impact.append({
        'flag': c.replace('flag_', ''),
        'count': len(flagged),
        # NaN (not 0) when the flag never occurs, so "no data" stays visible
        'win_rate_%': round(100 * flagged['pt_won'].mean(), 1) if len(flagged) > 0 else np.nan
    })
flag_impact_df = pd.DataFrame(flag_impact)

# Overhead conversion: points whose final shot was an overhead ('O')
ov = df[df['final_shot_class'] == 'O']
ov_summary = {
    'count': len(ov),
    'wins': int((ov['outcome_main'] == 'PtWon').sum()),
    'losses': int((ov['outcome_main'] == 'PtLost').sum()),
    'win_rate_%': round(100 * ((ov['outcome_main'] == 'PtWon').mean() if len(ov) > 0 else 0), 1)
}

# Rally length profile. The last edge is open-ended so that very long rallies
# land in '>12' instead of falling outside every bin.
bins = [0, 1, 3, 5, 8, 12, np.inf]
labels = ['0-1', '2-3', '4-5', '6-8', '9-12', '>12']
df['rally_bin'] = pd.cut(df['rally_len_shots'], bins=bins, labels=labels, include_lowest=True)
# observed=False keeps empty bins in the table (as NaN) so the chart axis is stable
by_rally = _win_rate_by(df, 'rally_bin', observed=False)

# Final shot outcomes: counts per (shot class, outcome), outcomes as columns
by_final_shot = df.groupby(['final_shot_class', 'outcome_main']).size().unstack(fill_value=0)

print('\nOverhead summary:', ov_summary)
by_rally


In [None]:

# --- Visualizations ---

import matplotlib.pyplot as plt


def _save_win_rate_bar(x, heights, *, title, filename, color, figsize,
                       xlabel=None, rotate_ticks=False):
    """Draw one win-rate bar chart, save it under FIG_DIR, and show it.

    x / heights   : bar positions (category labels) and win-rate percentages.
    title         : chart title.
    filename      : PNG file name written inside FIG_DIR.
    color         : bar color (single color or a list of colors).
    figsize       : (width, height) passed to plt.figure.
    xlabel        : optional x-axis label; omitted when None.
    rotate_ticks  : rotate x tick labels by 30 degrees for long category names.
    """
    plt.figure(figsize=figsize)
    plt.bar(x, heights, color=color)
    plt.title(title)
    if xlabel is not None:
        plt.xlabel(xlabel)
    plt.ylabel('Win rate (%)')
    plt.ylim(0, 100)  # fixed percentage axis keeps charts comparable
    if rotate_ticks:
        plt.xticks(rotation=30, ha='right')
    plt.tight_layout()
    plt.savefig(f"{FIG_DIR}/{filename}")
    plt.show()


# 1) Win rate by server
_save_win_rate_bar(
    by_server['server'], by_server['win_rate_%'],
    title='Win Rate by Server', filename='win_rate_by_server.png',
    color=['steelblue', 'darkorange'], figsize=(5, 4),
)

# 2) Serve number
_save_win_rate_bar(
    by_serve_no['serve_number'], by_serve_no['win_rate_%'],
    title='Win Rate by Serve Number', filename='win_rate_by_serve_number.png',
    color='mediumseagreen', figsize=(5, 4), xlabel='Serve number (1/2)',
)

# 3) Serve code
_save_win_rate_bar(
    by_serve_code['serve_code'], by_serve_code['win_rate_%'],
    title='Win Rate by Serve Code', filename='win_rate_by_serve_code.png',
    color='mediumpurple', figsize=(7, 4),
)

# 4) Return code
_save_win_rate_bar(
    by_return['return_code'], by_return['win_rate_%'],
    title='Win Rate by Return Outcome', filename='win_rate_by_return.png',
    color='firebrick', figsize=(7, 4),
)

# 5) Pattern
_save_win_rate_bar(
    by_pattern['pattern'], by_pattern['win_rate_%'],
    title='Win Rate by Pattern', filename='win_rate_by_pattern.png',
    color='cornflowerblue', figsize=(8, 4), rotate_ticks=True,
)

# 6) Tactic code
_save_win_rate_bar(
    by_tactic['tactic_code'], by_tactic['win_rate_%'],
    title='Win Rate by Tactic', filename='win_rate_by_tactic.png',
    color='goldenrod', figsize=(8, 4), rotate_ticks=True,
)

# 7) Pressure flags impact
_save_win_rate_bar(
    flag_impact_df['flag'], flag_impact_df['win_rate_%'],
    title='Win Rate When Pressure Flag is Present', filename='win_rate_by_pressure_flag.png',
    color='tomato', figsize=(8, 4), rotate_ticks=True,
)

# 8) Rally length bin (categorical labels converted to plain strings for the axis)
_save_win_rate_bar(
    by_rally['rally_bin'].astype(str), by_rally['win_rate_%'],
    title='Win Rate by Rally Length (shots after return)', filename='win_rate_by_rally_bin.png',
    color='slategray', figsize=(7, 4),
)

# 9) Final shot outcomes stacked bar (counts, so it does not use the helper)
plt.figure(figsize=(8,5))
# Fall back to all-zero series when an outcome never occurs in the data
wins = by_final_shot.get('PtWon', pd.Series([0]*len(by_final_shot), index=by_final_shot.index))
losses = by_final_shot.get('PtLost', pd.Series([0]*len(by_final_shot), index=by_final_shot.index))
shots = by_final_shot.index.astype(str)
plt.bar(shots, wins, label='Won', color='seagreen')
plt.bar(shots, losses, bottom=wins, label='Lost', color='indianred')
plt.title('Final Shot Type: Wins vs Losses')
plt.ylabel('Count')
plt.legend()
plt.xticks(rotation=30, ha='right')
plt.tight_layout()
plt.savefig(f"{FIG_DIR}/final_shot_wins_losses.png")
plt.show()

print('Figures saved to', FIG_DIR)


In [None]:

# --- Quick Markdown report output ---
# Assembles the KPI tables computed above into a Markdown file under REPORT_DIR.

# Imported here because the report timestamp is the only place datetime is used
from datetime import datetime

lines = []
lines.append('# Match KPIs\n')
lines.append(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M')}\n")
lines.append(f"Points: {len(df)} | Won: {int(df['pt_won'].sum())} | Lost: {int(df['pt_lost'].sum())} | Win rate: {round(100*df['pt_won'].mean(),1)}%\n")

# Server
lines.append('\n## By Server\n')
for _, r in by_server.iterrows():
    lines.append(f"- Server `{r['server']}`: {r['win_rate_%']}% win rate\n")

# Patterns
lines.append('\n## Patterns\n')
for _, r in by_pattern.iterrows():
    lines.append(f"- {r['pattern']}: {r['win_rate_%']}%\n")

# Tactics
lines.append('\n## Tactics\n')
for _, r in by_tactic.iterrows():
    lines.append(f"- {r['tactic_code']}: {r['win_rate_%']}%\n")

# Pressure flags
lines.append('\n## Pressure Flags\n')
for _, r in flag_impact_df.iterrows():
    lines.append(f"- {r['flag']} (n={int(r['count'])}): {r['win_rate_%']}%\n")

# Overheads
lines.append('\n## Overhead\n')
lines.append(f"- Count: {ov_summary['count']} | Wins: {ov_summary['wins']} | Losses: {ov_summary['losses']} | Win rate: {ov_summary['win_rate_%']}%\n")

# Each entry already ends with a newline; joining with another one leaves a
# blank line between entries in the written Markdown.
md = '\n'.join(lines)
md_path = os.path.join(REPORT_DIR, 'match_kpis.md')
with open(md_path, 'w', encoding='utf-8') as f:
    f.write(md)
print('Report written to', md_path)
