In [91]:
import pandas as pd
import numpy as np
import csv
import os

setting up the data 

In [92]:
def pair_match_point_files(file_names):
    file_set = set(file_names)
    pairs = []
    for name in file_names:
        if "matches" in name:
            points_name = name.replace("matches", "points")
            if points_name in file_set:
                pairs.append([name, points_name])
    return pairs

In [93]:
files = pair_match_point_files(os.listdir('data/raw data'))
exp = files[0][1]

In [None]:
def _safe_div(a: pd.Series, b: pd.Series) -> pd.Series:
    """Safely divide two Series avoiding division by zero.

    Parameters
    ----------
    a, b : pd.Series
        Numerator and denominator.
    Returns
    -------
    pd.Series
        Series of a / b where b is non-zero, otherwise 0.
    """
    a = a.astype(float)
    b = b.astype(float)
    return np.divide(a, b, out=np.zeros_like(a, dtype=float), where=b != 0)


def _is_game_point_for(pts_for: pd.Series, pts_against: pd.Series, is_tiebreak: pd.Series) -> pd.Series:
    """Return True if the upcoming point is a game point for the perspective player."""
    # Regular games: win when reaching at least 4 points and leading by 2.
    is_tb = is_tiebreak.astype(bool)
    cond1 = (pts_for == 3) & (pts_against <= 2) & (~is_tb)
    cond2 = (pts_for >= 3) & (pts_against >= 3) & (pts_for == pts_against + 1) & (~is_tb)
    return cond1 | cond2


def _is_game_point_against(pts_for: pd.Series, pts_against: pd.Series, is_tiebreak: pd.Series) -> pd.Series:
    """Game point for the opponent (simply flip arguments)."""
    return _is_game_point_for(pts_against, pts_for, is_tiebreak)



In [None]:
def build_match_state_panel(input_data: pd.DataFrame, best_of_default: int = 5) -> pd.DataFrame:
    """Construct a panel with match state before each point for both perspectives.

    Parameters
    ----------
    input_data : pd.DataFrame
        Point-level data for a single or multiple matches.
    best_of_default : int, optional
        Number of sets in the match format, by default 5.
    """
    df = input_data.copy()

    # ------------------------------------------------------------------
    # Required columns and point winners
    needed = ["match_id", "SetNo", "GameNo", "PointNumber", "PointServer"]
    for c in needed:
        if c not in df.columns:
            raise ValueError(f"Missing required column: {c}")

    if "PointWinner" in df.columns:
        p1_won_point = (df["PointWinner"] == 1).astype(int)
        p2_won_point = (df["PointWinner"] == 2).astype(int)
    elif {"P1PointsWon", "P2PointsWon"}.issubset(df.columns):
        df = df.sort_values(["match_id", "SetNo", "GameNo", "PointNumber"]).copy()
        df["p1_cum_prev"] = df.groupby("match_id")["P1PointsWon"].shift(1).fillna(0)
        df["p2_cum_prev"] = df.groupby("match_id")["P2PointsWon"].shift(1).fillna(0)
        p1_won_point = (df["P1PointsWon"] > df["p1_cum_prev"]).astype(int)
        p2_won_point = (df["P2PointsWon"] > df["p2_cum_prev"]).astype(int)
    else:
        raise ValueError(
            "Need either PointWinner or (P1PointsWon, P2PointsWon) to derive point winners."
        )

    # Sort in canonical order and create overall point index
    df = df.sort_values(["match_id", "SetNo", "GameNo", "PointNumber"]).copy()
    df["point_idx"] = df.groupby("match_id").cumcount() + 1

    # ------------------------------------------------------------------
    # Game level point counts before current point
    game_change = (
        (df["SetNo"].astype(str) + "-" + df["GameNo"].astype(str))
        .ne((df["SetNo"].astype(str) + "-" + df["GameNo"].astype(str)).shift(1))
    )
    df["game_key"] = game_change.groupby(df["match_id"]).cumsum()

    df["p1_pts_in_game"] = (
        p1_won_point.groupby([df["match_id"], df["game_key"]]).cumsum().shift(1).fillna(0).astype(int)
    )
    df["p2_pts_in_game"] = (
        p2_won_point.groupby([df["match_id"], df["game_key"]]).cumsum().shift(1).fillna(0).astype(int)
    )

    # ------------------------------------------------------------------
    # Match level cumulative points before current point
    df["ttl_p1"] = p1_won_point.groupby(df["match_id"]).cumsum().shift(1).fillna(0).astype(int)
    df["ttl_p2"] = p2_won_point.groupby(df["match_id"]).cumsum().shift(1).fillna(0).astype(int)

    # ------------------------------------------------------------------
    # Optional serve/return counters and rates
    srv_cols = {
        "P1FirstSrvIn",
        "P1FirstSrvWon",
        "P1SecondSrvIn",
        "P1SecondSrvWon",
        "P1DoubleFault",
        "P2FirstSrvIn",
        "P2FirstSrvWon",
        "P2SecondSrvIn",
        "P2SecondSrvWon",
        "P2DoubleFault",
    }
    have_srv = srv_cols.issubset(df.columns)
    if have_srv:
        for side in (1, 2):
            df[f"p{side}_fs_in_cum"] = (
                df[f"P{side}FirstSrvIn"].groupby(df["match_id"]).cumsum().shift(1).fillna(0)
            )
            df[f"p{side}_fs_won_cum"] = (
                df[f"P{side}FirstSrvWon"].groupby(df["match_id"]).cumsum().shift(1).fillna(0)
            )
            df[f"p{side}_ss_in_cum"] = (
                df[f"P{side}SecondSrvIn"].groupby(df["match_id"]).cumsum().shift(1).fillna(0)
            )
            df[f"p{side}_ss_won_cum"] = (
                df[f"P{side}SecondSrvWon"].groupby(df["match_id"]).cumsum().shift(1).fillna(0)
            )
            df[f"p{side}_df_cum"] = (
                df[f"P{side}DoubleFault"].groupby(df["match_id"]).cumsum().shift(1).fillna(0)
            )
        df["p1_sv_pts"] = df["p1_fs_in_cum"] + df["p1_ss_in_cum"] + df["p1_df_cum"]
        df["p2_sv_pts"] = df["p2_fs_in_cum"] + df["p2_ss_in_cum"] + df["p2_df_cum"]
        df["p1_fsp"] = _safe_div(df["p1_fs_in_cum"], df["p1_sv_pts"])
        df["p2_fsp"] = _safe_div(df["p2_fs_in_cum"], df["p2_sv_pts"])
        df["p1_w1sp"] = _safe_div(df["p1_fs_won_cum"], df["p1_fs_in_cum"])
        df["p2_w1sp"] = _safe_div(df["p2_fs_won_cum"], df["p2_fs_in_cum"])
        df["p1_w2sp"] = _safe_div(df["p1_ss_won_cum"], df["p1_ss_in_cum"])
        df["p2_w2sp"] = _safe_div(df["p2_ss_won_cum"], df["p2_ss_in_cum"])
    else:
        df["p1_fsp"] = df["p2_fsp"] = 0.0
        df["p1_w1sp"] = df["p2_w1sp"] = 0.0
        df["p1_w2sp"] = df["p2_w2sp"] = 0.0
        df["p1_df_cum"] = df["p2_df_cum"] = 0.0

    # Optional ace counts
    if {"P1Ace", "P2Ace"}.issubset(df.columns):
        df["p1_aces_cum"] = df["P1Ace"].groupby(df["match_id"]).cumsum().shift(1).fillna(0)
        df["p2_aces_cum"] = df["P2Ace"].groupby(df["match_id"]).cumsum().shift(1).fillna(0)
    else:
        df["p1_aces_cum"] = df["p2_aces_cum"] = 0.0

    # ------------------------------------------------------------------
    # Games won in current set prior to this game
    last_point_idx = df.groupby(["match_id", "SetNo", "GameNo"]).tail(1).index
    game_winner = pd.Series(
        index=last_point_idx,
        data=np.where(
            (p1_won_point + p2_won_point).loc[last_point_idx] == 1,
            np.where(p1_won_point.loc[last_point_idx] == 1, 1, 2),
            np.nan,
        ),
        dtype="float",
    )
    games_tbl = df.loc[last_point_idx, ["match_id", "SetNo", "GameNo"]].copy()
    games_tbl["game_winner"] = game_winner.values
    games_tbl["p1_game_win"] = (games_tbl["game_winner"] == 1).astype(int)
    games_tbl["p2_game_win"] = (games_tbl["game_winner"] == 2).astype(int)
    games_tbl["p1_games_in_set_cum"] = games_tbl.groupby(["match_id", "SetNo"])["p1_game_win"].cumsum()
    games_tbl["p2_games_in_set_cum"] = games_tbl.groupby(["match_id", "SetNo"])["p2_game_win"].cumsum()
    games_tbl["p1_games_before"] = (
        games_tbl.groupby(["match_id", "SetNo"])["p1_games_in_set_cum"].shift(1).fillna(0).astype(int)
    )
    games_tbl["p2_games_before"] = (
        games_tbl.groupby(["match_id", "SetNo"])["p2_games_in_set_cum"].shift(1).fillna(0).astype(int)
    )

    df = df.merge(
        games_tbl[["match_id", "SetNo", "GameNo", "p1_games_before", "p2_games_before"]],
        on=["match_id", "SetNo", "GameNo"],
        how="left",
    )

    # Sets won before current set
    set_last_games = games_tbl.groupby(["match_id", "SetNo"]).tail(1).copy()
    set_last_games["set_winner"] = np.where(
        set_last_games["p1_games_in_set_cum"] > set_last_games["p2_games_in_set_cum"], 1, 2
    )
    set_last_games["p1_set_win"] = (set_last_games["set_winner"] == 1).astype(int)
    set_last_games["p2_set_win"] = (set_last_games["set_winner"] == 2).astype(int)
    set_last_games["p1_sets_before"] = (
        set_last_games.groupby("match_id")["p1_set_win"].cumsum().shift(1).fillna(0).astype(int)
    )
    set_last_games["p2_sets_before"] = (
        set_last_games.groupby("match_id")["p2_set_win"].cumsum().shift(1).fillna(0).astype(int)
    )

    df = (
        df.merge(
            set_last_games[["match_id", "SetNo", "p1_sets_before", "p2_sets_before"]],
            on=["match_id", "SetNo"],
            how="left",
        ).fillna({"p1_sets_before": 0, "p2_sets_before": 0})
    )

    # Basic flags
    df["is_tiebreak_game"] = (
        (df["p1_games_before"] == 6) & (df["p2_games_before"] == 6)
    ).astype(int)
    df["server_is_p1"] = (df["PointServer"] == 1).astype(int)

    # ------------------------------------------------------------------
    # Build two perspectives (P1 and P2)
    base_cols = ["match_id", "SetNo", "GameNo", "PointNumber", "point_idx"]
    p1 = df[base_cols].copy()
    p1["perspective"] = "P1"
    p1["server_is_persp"] = df["server_is_p1"]
    p1["pts_in_game_for"] = df["p1_pts_in_game"]
    p1["pts_in_game_against"] = df["p2_pts_in_game"]
    p1["games_in_set_for"] = df["p1_games_before"]
    p1["games_in_set_against"] = df["p2_games_before"]
    p1["sets_for"] = df["p1_sets_before"]
    p1["sets_against"] = df["p2_sets_before"]
    p1["is_tiebreak"] = df["is_tiebreak_game"]
    p1["ttl_diff"] = df["ttl_p1"] - df["ttl_p2"]
    p1["aces_diff"] = df["p1_aces_cum"] - df["p2_aces_cum"]
    p1["df_diff"] = df["p1_df_cum"] - df["p2_df_cum"]
    p1["fsp_diff"] = df["p1_fsp"] - df["p2_fsp"]
    p1["w1sp_diff"] = df["p1_w1sp"] - df["p2_w1sp"]
    p1["w2sp_diff"] = df["p1_w2sp"] - df["p2_w2sp"]

    p2 = df[base_cols].copy()
    p2["perspective"] = "P2"
    p2["server_is_persp"] = 1 - df["server_is_p1"]
    p2["pts_in_game_for"] = df["p2_pts_in_game"]
    p2["pts_in_game_against"] = df["p1_pts_in_game"]
    p2["games_in_set_for"] = df["p2_games_before"]
    p2["games_in_set_against"] = df["p1_games_before"]
    p2["sets_for"] = df["p2_sets_before"]
    p2["sets_against"] = df["p1_sets_before"]
    p2["is_tiebreak"] = df["is_tiebreak_game"]
    p2["ttl_diff"] = -(df["ttl_p1"] - df["ttl_p2"])
    p2["aces_diff"] = -(df["p1_aces_cum"] - df["p2_aces_cum"])
    p2["df_diff"] = -(df["p1_df_cum"] - df["p2_df_cum"])
    p2["fsp_diff"] = -(df["p1_fsp"] - df["p2_fsp"])
    p2["w1sp_diff"] = -(df["p1_w1sp"] - df["p2_w1sp"])
    p2["w2sp_diff"] = -(df["p1_w2sp"] - df["p2_w2sp"])

    panel = pd.concat([p1, p2], ignore_index=True)

    # Pressure flags
    panel["is_game_point_for"] = _is_game_point_for(
        panel["pts_in_game_for"],
        panel["pts_in_game_against"],
        panel["is_tiebreak"].astype(bool),
    ).astype(int)
    panel["is_game_point_against"] = _is_game_point_against(
        panel["pts_in_game_for"],
        panel["pts_in_game_against"],
        panel["is_tiebreak"].astype(bool),
    ).astype(int)
    panel["is_break_point"] = (
        (1 - panel["server_is_persp"]).astype(bool)
        & panel["is_game_point_for"].astype(bool)
    ).astype(int)

    panel["best_of"] = best_of_default
    panel["sets_needed_to_win"] = (panel["best_of"] // 2) + 1

    ordered = [
        "match_id",
        "SetNo",
        "GameNo",
        "PointNumber",
        "point_idx",
        "perspective",
        "server_is_persp",
        "pts_in_game_for",
        "pts_in_game_against",
        "games_in_set_for",
        "games_in_set_against",
        "sets_for",
        "sets_against",
        "best_of",
        "sets_needed_to_win",
        "is_tiebreak",
        "is_game_point_for",
        "is_game_point_against",
        "is_break_point",
        "ttl_diff",
        "aces_diff",
        "df_diff",
        "fsp_diff",
        "w1sp_diff",
        "w2sp_diff",
    ]
    ordered = [c for c in ordered if c in panel.columns]
    return panel[ordered].copy()


In [101]:
path ='2012-ausopen-points.csv'
ex = pd.read_csv(f"data/raw data/{path}")
games = ex.sort_values(['match_id','SetNo','GameNo','PointNumber']).copy()
#print(games['match_id'].unique())
for i in (games['match_id'].unique().tolist()):
    a = (games[games['match_id'] == i])
    break

In [102]:
build_match_state_panel(a)

Unnamed: 0,match_id,SetNo,GameNo,PointNumber,point_idx,perspective,server_is_persp,pts_in_game_for,pts_in_game_against,games_in_set_for,...,is_tiebreak,is_game_point_for,is_game_point_against,is_break_point,ttl_diff,aces_diff,df_diff,fsp_diff,w1sp_diff,w2sp_diff
0,2012-ausopen-1101,1,1,1,1,P1,1,0,0,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0
1,2012-ausopen-1101,1,1,2,2,P1,1,1,0,0,...,0,0,0,0,1,0.0,0.0,0.0,0.0,0.0
2,2012-ausopen-1101,1,1,3,3,P1,1,2,0,0,...,0,0,0,0,2,0.0,0.0,0.0,0.0,0.0
3,2012-ausopen-1101,1,1,4,4,P1,1,3,0,0,...,0,1,0,0,3,0.0,0.0,0.0,0.0,0.0
4,2012-ausopen-1101,1,1,5,5,P1,1,3,1,0,...,0,1,0,0,2,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241,2012-ausopen-1101,3,6,119,119,P2,1,1,4,0,...,0,0,0,0,-38,-4.0,4.0,-0.0,-0.0,-0.0
242,2012-ausopen-1101,3,6,120,120,P2,1,1,0,0,...,0,0,0,0,-37,-4.0,4.0,-0.0,-0.0,-0.0
243,2012-ausopen-1101,3,6,121,121,P2,1,1,1,0,...,0,0,0,0,-38,-4.0,4.0,-0.0,-0.0,-0.0
244,2012-ausopen-1101,3,6,122,122,P2,1,1,2,0,...,0,0,0,0,-39,-4.0,4.0,-0.0,-0.0,-0.0
