In [None]:
import pandas as pd
import numpy as np
from src.io_helpers import skillcorner_game
from src.viz import make_pitch, add_title

# Load one SkillCorner match: `df` is the tracking table, `meta` the match metadata.
# NOTE(review): the path is absolute and machine-specific; adjust it when running elsewhere.
tracking_file = "/Users/jacksimonson/Documents/repos/football-viz/data/raw/skillcorner/2068/structured_data.json"
df, meta = skillcorner_game(tracking_file)

# Filtering below happens on a copy, so `df` keeps every row for later cells.
raw = df.copy()

# --- 1) (optional) drop the ball by speed if available ---
# Heuristic: the entity with the highest 95th-percentile speed is taken to be
# the ball, and all of its rows are removed from `raw`.
# NOTE(review): nothing verifies that a ball entity is actually present; if the
# table holds players only, the fastest player is dropped instead.
if "speed" in raw.columns and raw["speed"].notna().any():
    # GroupBy.quantile skips NaN and interpolates linearly, i.e. it yields the
    # same value as np.nanpercentile(s, 95), but without a Python callback per
    # entity and without a RuntimeWarning for entities whose speeds are all NaN.
    p95_speed = raw.groupby("entity_id")["speed"].quantile(0.95)
    # idxmax ignores NaN entries and returns the index label, i.e. the entity_id
    ball_id = p95_speed.idxmax()
    # if you're unsure, print to verify:
    # print("Assuming ball entity_id:", ball_id)
    raw = raw[raw["entity_id"] != ball_id]

# --- 2) infer teams by average x in the first 5 minutes ---
# Entities are split at the median of their mean x: at or below the median is
# labelled "home", above it "away".
# NOTE(review): this presumes the "home" side starts at low x and that only
# players remain in `raw` (no referees etc.) — confirm against the data feed.
in_window = raw["timestamp"].between(0, 300)  # 0–300s; timestamps carrying a period offset are fine
# fallback when nothing lies inside the window: just take the first ~10k rows
early = raw[in_window] if in_window.any() else raw.head(10000)

means = early.groupby("entity_id", as_index=False).agg(x_mean=("x", "mean"))
med = means["x_mean"].median()

# side per entity (left vs right of the median)
early_sides = means["x_mean"].le(med).map({True: "home", False: "away"})
side_map = dict(zip(means["entity_id"], early_sides))

players_df = raw.assign(team=raw["entity_id"].map(side_map))

# Entities that never appeared in the early window get a side from their mean x
# over all rows instead.
# NOTE(review): if teams swap ends at half-time, an all-rows mean may place
# late entrants on the wrong side — verify before relying on these labels.
missing = players_df["team"].isna()
if missing.any():
    gmeans = raw.groupby("entity_id", as_index=False).agg(x_mean=("x", "mean"))
    gmed = gmeans["x_mean"].median()
    global_sides = gmeans["x_mean"].le(gmed).map({True: "home", False: "away"})
    gmap = dict(zip(gmeans["entity_id"], global_sides))
    players_df.loc[missing, "team"] = players_df.loc[missing, "entity_id"].map(gmap)

# --- 3) rename clusters to real team names if available ---
# Names come from meta["home_team"]["name"] / meta["away_team"]["name"]; a
# non-dict `meta`, a missing entry or an empty name falls back to "Home"/"Away".
if isinstance(meta, dict):
    home_name = (meta.get("home_team") or {}).get("name")
    away_name = (meta.get("away_team") or {}).get("name")
else:
    home_name = away_name = None

label_map = {
    "home": home_name if home_name else "Home",
    "away": away_name if away_name else "Away",
}
players_df["team_label"] = players_df["team"].map(label_map)

# --- 4) first-half slice and average locations ---
# Rows with 0 <= timestamp <= 2700 are kept, then x and y are averaged per
# (team_label, entity_id) pair.
in_first_half = players_df["timestamp"].between(0, 2700)
half = players_df.loc[in_first_half].copy()
avg = half.groupby(["team_label", "entity_id"], as_index=False)[["x", "y"]].mean()

detected_teams = avg["team_label"].unique().tolist()
print("Teams detected:", detected_teams)
print("Rows:", len(players_df), "| Half rows:", len(half), "| Avg points:", len(avg))


# Pitch dimensions: taken from the metadata when usable, otherwise 105 x 68.
# A non-dict `meta` and missing/None/zero entries all fall back to the defaults
# instead of raising (float(None) -> TypeError, or .get on a non-dict).
meta_dims = meta if isinstance(meta, dict) else {}
L = float(meta_dims.get("pitch_length") or 105)
W = float(meta_dims.get("pitch_width") or 68)

# sanity: if your data exceeds these, bump dimensions from data.
# Series.max() yields NaN when `avg` is empty or all-NaN (np.nanmax raises a
# ValueError on an empty input), so the bump is simply skipped in that case.
x_reach = avg["x"].abs().max()
y_reach = avg["y"].abs().max()
if pd.notna(x_reach):
    L = max(L, 2 * float(x_reach))   # e.g., ~105
if pd.notna(y_reach):
    W = max(W, 2 * float(y_reach))   # e.g., ~68

# x/y are treated as centre-origin; shift by half the pitch so that the origin
# sits at a corner, as the plot title states.
avg["x_plot"] = avg["x"] + L / 2
avg["y_plot"] = avg["y"] + W / 2

pitch, fig, ax = make_pitch(pitch_type="custom", pitch_length=L, pitch_width=W)
# one scatter call per team so that each team gets its own legend entry
for name, grp in avg.groupby("team_label"):
    ax.scatter(grp["x_plot"], grp["y_plot"], s=150, alpha=0.85, label=str(name))
ax.legend(loc="upper center", ncol=2, frameon=False)
add_title(fig, f"SkillCorner — avg locations (0–45′) | {int(L)}×{int(W)} (center→corner shifted)")

In [None]:
# Per-timestamp spread of y (max - min) over every row of `df`, plotted as a
# quick sanity chart.
# Column-wise GroupBy max/min replaces DataFrameGroupBy.apply with a lambda:
# same values, no Python call per timestamp, and no pandas >= 2.2 deprecation
# warning about apply operating on the grouping column.
# NOTE(review): `df` is the unfiltered table (all entities, both teams), and the
# spread is taken along y although the column is named "team_length" — confirm
# this is the intended metric.
y_by_frame = df.groupby("timestamp")["y"]
roll = (y_by_frame.max() - y_by_frame.min()).reset_index(name="team_length")
roll.plot(x="timestamp", y="team_length")  # quick sanity chart
