# xG plotting

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from supabase import create_client
from mplsoccer import Pitch
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, brier_score_loss

# Let pandas show up to 100 columns when a DataFrame is displayed, so the wide
# merged tables built below are not truncated in notebook output.
pd.set_option("display.max_columns", 100)

In [None]:
# Pitch geometry in metres. The notebook normalises every shot onto a
# 105 x 68 pitch and always measures towards the goal at the right-hand end.
_PITCH_LENGTH_M, _PITCH_WIDTH_M = 105.0, 68.0

GOAL_X = _PITCH_LENGTH_M        # x of the goal line being attacked
GOAL_Y = _PITCH_WIDTH_M / 2     # y of the goal centre (mid-width)
GOAL_WIDTH = 7.32               # goal-mouth width used by the angle formula

In [None]:
# Edit MATCH_ID_INPUT (one match) and SHOT_INPUT (one shot).
#
# Credentials are read from the environment so that no secret is stored in the
# notebook: set SUPABASE_KEY (and optionally SUPABASE_URL) before starting the
# kernel. NOTE(review): a secret key was previously committed in this cell and
# should be rotated in the Supabase dashboard.
import os

SUPABASE_URL = os.environ.get("SUPABASE_URL", "https://cctasilkmirgtrvapggo.supabase.co")
SUPABASE_KEY = os.environ.get("SUPABASE_KEY", "").strip()
if not SUPABASE_KEY:
    raise RuntimeError(
        "SUPABASE_KEY is not set. Export it in the environment before running this notebook."
    )

MATCH_ID_INPUT = "3812"
SHOT_INPUT = ""
# Page size used when downloading table rows from Supabase.
FETCH_BATCH_SIZE = 1000

client = create_client(SUPABASE_URL, SUPABASE_KEY)
print("Supabase client ready.")

In [None]:
print("Fetching match data...")

# Normalise the user-supplied id to text so ints, floats and strings all work.
match_text = str(MATCH_ID_INPUT).strip()
if match_text == "":
    raise ValueError('Set MATCH_ID_INPUT to one match id, e.g. "3812".')

try:
    # Plain integer text is parsed exactly; there is no float round-trip, so
    # very large ids are not silently rounded.
    match_id = int(match_text)
except ValueError:
    # Fall back to float parsing so inputs such as "3812.0" keep working.
    try:
        match_float = float(match_text)
    except ValueError:
        # Non-numeric text: report it with the same message as other bad ids
        # instead of the raw float() conversion error.
        raise ValueError("MATCH_ID_INPUT must be an integer match id.") from None
    # Rejects fractional values as well as "nan" / "inf".
    if not match_float.is_integer():
        raise ValueError("MATCH_ID_INPUT must be an integer match id.")
    match_id = int(match_float)

def _fetch_all_rows(table_name, id_column, id_value, batch_size):
    """Download every row of ``table_name`` whose ``id_column`` equals ``id_value``.

    Rows are requested page by page through the module-level Supabase
    ``client`` until a page shorter than ``batch_size`` is returned.

    Args:
        table_name: Name of the Supabase table to read.
        id_column: Column used in the equality filter.
        id_value: Value the filter column must equal.
        batch_size: Number of rows requested per page; must be at least 1.

    Returns:
        A list with one entry per row, in the order the pages were returned.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (the paging loop
            could never detect a short final page and would not terminate).
    """
    if batch_size < 1:
        raise ValueError("FETCH_BATCH_SIZE must be a positive integer.")

    rows = []
    offset = 0
    while True:
        # NOTE(review): no explicit ordering is requested, so page boundaries
        # rely on the server returning rows in a stable order.
        response = (
            client.table(table_name)
            .select("*")
            .eq(id_column, id_value)
            .range(offset, offset + batch_size - 1)  # range() bounds are inclusive
            .execute()
        )
        page = response.data or []
        rows.extend(page)
        # A short (or empty) page means the table has been exhausted.
        if len(page) < batch_size:
            return rows
        offset += batch_size


# The event tables are keyed by match_id, the tracking tables by game_id.
shots_rows = _fetch_all_rows("shots", "match_id", match_id, FETCH_BATCH_SIZE)
xt_rows = _fetch_all_rows("xt_events", "match_id", match_id, FETCH_BATCH_SIZE)
frames_rows = _fetch_all_rows("tracking_frames", "game_id", match_id, FETCH_BATCH_SIZE)
ball_rows = _fetch_all_rows("tracking_ball_positions", "game_id", match_id, FETCH_BATCH_SIZE)

# Turn each downloaded row list into a DataFrame.
shots_df, xt_df, frames_df, ball_df = (
    pd.DataFrame(table_rows)
    for table_rows in (shots_rows, xt_rows, frames_rows, ball_rows)
)

# Every table must contribute at least one row for this match.
for table_label, table_frame in (
    ("shots", shots_df),
    ("xt_events", xt_df),
    ("tracking_frames", frames_df),
    ("tracking_ball_positions", ball_df),
):
    if table_frame.empty:
        raise ValueError(f"No {table_label} found for match {match_id}.")

# Work on a copy of the shots so the raw download stays untouched.
shots_sel = shots_df.copy()
if "possession_event_id" not in shots_sel.columns:
    raise ValueError("shots table is missing possession_event_id.")

for shot_key in ("match_id", "possession_event_id"):
    shots_sel[shot_key] = pd.to_numeric(shots_sel.get(shot_key), errors="coerce")
# The tracking tables call the match identifier game_id.
shots_sel["game_id"] = shots_sel["match_id"]

# For each lookup table: (label, frame, required columns, columns coerced to numbers).
for table_label, table_frame, needed_cols, numeric_cols in (
    (
        "xt_events",
        xt_df,
        ("match_id", "possession_event_id", "game_event_id"),
        ("match_id", "possession_event_id", "game_event_id"),
    ),
    (
        "tracking_frames",
        frames_df,
        ("game_id", "game_event_id", "frame_num"),
        ("game_id", "game_event_id", "frame_num"),
    ),
    (
        "tracking_ball_positions",
        ball_df,
        ("game_id", "frame_num", "x", "y"),
        ("game_id", "frame_num"),
    ),
):
    # Report the first missing column, in declaration order.
    absent = next((name for name in needed_cols if name not in table_frame.columns), None)
    if absent is not None:
        raise ValueError(f"{table_label} table is missing {absent}.")
    for name in numeric_cols:
        table_frame[name] = pd.to_numeric(table_frame[name], errors="coerce")

# Ball height is optional; provide an all-NaN column when it is absent.
if "z" not in ball_df.columns:
    ball_df["z"] = np.nan

# Link each shot to a ball position: shot -> event -> tracking frame -> ball.
#
# pandas matches NaN join keys against each other, so rows with null keys are
# removed from every lookup table first. Otherwise a shot without a matched
# event would be joined to tracking frames that have no game_event_id and
# would pick up unrelated coordinates.
xt_lookup = xt_df[["match_id", "possession_event_id", "game_event_id"]].dropna(
    subset=["match_id", "possession_event_id"]
)
df = shots_sel.merge(
    xt_lookup,
    on=["match_id", "possession_event_id"],
    how="left",
)

frame_lookup = frames_df[["game_id", "game_event_id", "frame_num"]].dropna(
    subset=["game_id", "game_event_id"]
)
df = df.merge(
    frame_lookup,
    on=["game_id", "game_event_id"],
    how="left",
)

# Several events/frames can match one shot; keep a single row per shot.
df = df.drop_duplicates(subset=["match_id", "possession_event_id"], keep="first")

# One ball position per frame, so this last join cannot multiply shot rows
# again after the de-duplication above.
ball_lookup = (
    ball_df[["game_id", "frame_num", "x", "y", "z"]]
    .dropna(subset=["game_id", "frame_num"])
    .drop_duplicates(subset=["game_id", "frame_num"], keep="first")
)
df = df.merge(
    ball_lookup,
    on=["game_id", "frame_num"],
    how="left",
)

# Only shots that found a ball position can be modelled.
df_model = df[df["x"].notna() & df["y"].notna()].copy()
if df_model.empty:
    raise ValueError("No matched tracking coordinates found for this match.")

# Raw ball coordinates as numbers (unparseable values become NaN).
x_raw = pd.to_numeric(df_model["x"], errors="coerce")
y_raw = pd.to_numeric(df_model["y"], errors="coerce")
valid_xy = pd.DataFrame({"x": x_raw, "y": y_raw}).dropna()

if valid_xy.empty:
    raise ValueError("No valid x/y coordinates for modeling.")

# The 1st-99th percentile box describes the data extent while ignoring outliers.
x_q01, x_q99 = np.nanpercentile(valid_xy["x"], [1, 99])
y_q01, y_q99 = np.nanpercentile(valid_xy["y"], [1, 99])


def _bounds_within(x_low, x_high, y_low, y_high):
    """Return whether the percentile box lies inside the given limits."""
    return x_low <= x_q01 and x_q99 <= x_high and y_low <= y_q01 and y_q99 <= y_high


# Guess the coordinate system from the data extent and map it onto 105 x 68.
if _bounds_within(-60, 60, -40, 40):
    # Origin at the centre spot: shift to a corner origin.
    coord_mode = "centered_meters"
    x_pitch = x_raw + 52.5
    y_pitch = y_raw + 34.0
elif _bounds_within(-5, 110, -5, 73):
    # Already pitch-shaped: use as is.
    coord_mode = "pitch_like_meters"
    x_pitch = x_raw
    y_pitch = y_raw
else:
    # Unknown units: stretch the percentile box onto the pitch.
    coord_mode = "scaled_from_raw"
    x_min, x_max = x_q01, x_q99
    y_min, y_max = y_q01, y_q99
    # Guard against a zero-width extent before dividing.
    x_span = max(x_max - x_min, 1e-6)
    y_span = max(y_max - y_min, 1e-6)
    x_pitch = (x_raw - x_min) * (105.0 / x_span)
    y_pitch = (y_raw - y_min) * (68.0 / y_span)

# Keep every point on the pitch.
df_model["x_pitch"] = x_pitch.clip(lower=0, upper=105)
df_model["y_pitch"] = y_pitch.clip(lower=0, upper=68)

# Decide which goal each shot is aimed at: the nearer of the two goal centres.
y_from_goal_centre = df_model["y_pitch"] - GOAL_Y
left_goal_distance = np.hypot(df_model["x_pitch"], y_from_goal_centre)
right_goal_distance = np.hypot(GOAL_X - df_model["x_pitch"], y_from_goal_centre)
attacking_left = left_goal_distance < right_goal_distance

# Rotate left-attacking shots by 180 degrees so every shot attacks the right goal.
mirrored_x = GOAL_X - df_model["x_pitch"]
mirrored_y = 68.0 - df_model["y_pitch"]
df_model["x_plot"] = np.where(attacking_left, mirrored_x, df_model["x_pitch"])
df_model["y_plot"] = np.where(attacking_left, mirrored_y, df_model["y_pitch"])

# Offsets from the shot to the centre of the right-hand goal.
dx = GOAL_X - df_model["x_plot"]
dy = np.abs(df_model["y_plot"] - GOAL_Y)
df_model["distance"] = np.hypot(dx, dy)

# Angle in degrees computed from the goal width and the shot offsets,
# floored at zero.
half_goal_width = GOAL_WIDTH / 2
angle_radians = np.arctan2(GOAL_WIDTH * dx, dx**2 + dy**2 - half_goal_width ** 2)
df_model["angle"] = np.degrees(angle_radians).clip(lower=0)

# Ball height at the matched frame (NaN when unavailable).
df_model["shot_height"] = pd.to_numeric(df_model.get("z"), errors="coerce")

def _column_or_constant(column_name, fallback):
    """Return df_model[column_name], or a constant Series when the column is absent."""
    if column_name in df_model.columns:
        return df_model[column_name]
    return pd.Series(fallback, index=df_model.index)


shot_outcome = _column_or_constant("shot_outcome_type", "")
shot_type = _column_or_constant("shot_type", "")
ball_raw = _column_or_constant("ball_moving", 0)

# Target: 1 when the outcome code is "G".
df_model["goal"] = (shot_outcome.astype(str) == "G").astype(int)

# One indicator column per shot-type code.
shot_type_text = shot_type.astype(str)
for flag_column, type_code in (("is_foot", "F"), ("is_header", "H"), ("is_volley", "V")):
    df_model[flag_column] = (shot_type_text == type_code).astype(int)

# ball_moving may arrive as numbers or as text; treat it as numeric when at
# least 80% of the values parse as numbers.
ball_num = pd.to_numeric(ball_raw, errors="coerce")
mostly_numeric = ball_num.notna().mean() >= 0.8
if not mostly_numeric:
    ball_text = ball_raw.astype(str).str.strip().str.lower()
    df_model["is_ball_moving"] = ball_text.isin(["1", "true", "t", "yes", "y"]).astype(int)
else:
    df_model["is_ball_moving"] = ball_num.fillna(0).clip(0, 1).astype(int)

if "shooter_player_name" not in df_model.columns:
    df_model["shooter_player_name"] = "Unknown"

df_model["possession_event_id"] = pd.to_numeric(df_model.get("possession_event_id"), errors="coerce")
# Positional id, unique per modelled shot within this run.
df_model["shot_uid"] = np.arange(len(df_model))

# Resolve SHOT_INPUT to exactly one row of df_model.
shot_text = str(SHOT_INPUT).strip()
if not shot_text:
    # No selector given: fall back to the first modelled shot.
    selected_shot_df = df_model.iloc[[0]].copy()
    shot_source = "first_row"
else:
    selected_shot_df = pd.DataFrame()
    shot_source = ""
    # Numeric selectors are compared as numbers, anything else as text.
    try:
        shot_value = float(shot_text)
    except ValueError:
        shot_value = None

    # Identifier columns are tried in priority order; the first hit wins.
    candidate_cols = ["possession_event_id", "id", "shot_id", "game_event_id", "frame_num", "shot_uid"]
    for col in (name for name in candidate_cols if name in df_model.columns):
        if shot_value is None:
            hits = df_model[col].astype(str).str.strip() == shot_text
        else:
            hits = pd.to_numeric(df_model[col], errors="coerce") == shot_value

        match = df_model[hits]
        if not match.empty:
            selected_shot_df = match.iloc[[0]].copy()
            shot_source = col
            break
    else:
        # The loop finished without a break: no column contained the selector.
        raise ValueError(
            "SHOT_INPUT not found. Try possession_event_id, id, shot_id, game_event_id, frame_num, or leave blank."
        )

selected_shot_uid = int(selected_shot_df["shot_uid"].iloc[0])

# Short run report: what was loaded and how the shot was picked.
for report_label, report_value in (
    ("Match", match_id),
    ("Coordinate mode", coord_mode),
    ("Shots in match", len(shots_sel)),
    ("Shots with tracking coordinates", len(df_model)),
    ("Selected shot source", shot_source),
):
    print(f"{report_label}: {report_value}")

# Columns shown for the selected shot; the last expression is the cell output.
preview_cols = ["shot_uid", "match_id", "possession_event_id", "shooter_player_name"]
preview_cols += ["distance", "angle", "goal"]
selected_shot_df.loc[:, preview_cols]

In [None]:
# Model inputs; all of them are columns of df_model.
features = [
    "distance",
    "angle",
    "shot_height",
    "is_foot",
    "is_header",
    "is_volley",
    "is_ball_moving",
]

# shot_height comes from the ball's z coordinate, which is all-NaN when the
# tracking table has no z column. Keeping such a feature would make the
# dropna() below discard every shot, so the feature is dropped instead.
shot_height_values = df_model["shot_height"].replace([np.inf, -np.inf], np.nan)
if not shot_height_values.notna().any():
    features.remove("shot_height")
    print("shot_height has no usable values; fitting the model without it.")

required_cols = features + [
    "goal",
    "shooter_player_name",
    "x_plot",
    "y_plot",
    "match_id",
    "shot_uid",
    "possession_event_id",
]

# Keep only shots where every required value is present and finite.
df_clean = df_model[required_cols].replace([np.inf, -np.inf], np.nan).dropna().copy()
if df_clean.empty:
    raise ValueError("No rows left after cleaning model features.")

selected_df = df_clean[df_clean["shot_uid"] == selected_shot_uid].copy()
if selected_df.empty:
    raise ValueError("Selected shot does not have complete model features.")

# Prefer to train without the selected shot so its xG is out-of-sample; fall
# back to all shots when that leaves too little data or only one class.
train_df = df_clean[df_clean["shot_uid"] != selected_shot_uid].copy()
if len(train_df) < 8 or train_df["goal"].nunique() < 2:
    train_df = df_clean.copy()
    print("Using all shots in this match for training (limited data).")

if train_df["goal"].nunique() < 2:
    raise ValueError("This match does not have enough goal/non-goal variation to fit the model.")

X_train_all = train_df[features]
y_train_all = train_df["goal"].astype(int)

# A stratified split needs at least two shots of each class.
can_split = len(train_df) >= 10 and y_train_all.value_counts().min() >= 2
if can_split:
    X_train, X_test, y_train, y_test = train_test_split(
        X_train_all,
        y_train_all,
        test_size=0.2,
        random_state=42,
        stratify=y_train_all,
    )
else:
    # Too little data for a hold-out set: fit and evaluate on the same rows.
    X_train, y_train = X_train_all, y_train_all
    X_test, y_test = X_train_all, y_train_all
    print("Too few shots for a hold-out split; AUC is computed on the training data.")

model = LogisticRegression(max_iter=2000)
model.fit(X_train, y_train)

# Score every clean shot, then read back the selected one.
df_clean["xg"] = model.predict_proba(df_clean[features])[:, 1]
selected_shot = df_clean[df_clean["shot_uid"] == selected_shot_uid].iloc[0]
selected_shot_xg = float(selected_shot["xg"])

# AUC is only defined when both classes are present in the evaluation set.
if y_test.nunique() == 2 and len(y_test) > 1:
    y_pred_test = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_pred_test)
else:
    auc = np.nan

brier = brier_score_loss(train_df["goal"], model.predict_proba(train_df[features])[:, 1])

print(f"Model shots in match: {len(df_clean)}")
print(f"Selected shot xG: {selected_shot_xg:.3f}")
print(f"Training Brier score: {brier:.3f}")
if np.isnan(auc):
    print("AUC: not available")
else:
    print(f"AUC: {auc:.3f}")

In [None]:
# Draw a 105 x 68 pitch and mark the selected shot on it.
fig, ax = plt.subplots(figsize=(12, 8))
pitch_style = {
    "pitch_type": "custom",
    "pitch_length": 105,
    "pitch_width": 68,
    "pitch_color": "#22312b",
    "line_color": "white",
    "linewidth": 2,
}
pitch = Pitch(**pitch_style)
pitch.draw(ax=ax)

x_val, y_val = float(selected_shot["x_plot"]), float(selected_shot["y_plot"])

# Star marker coloured by xG on a fixed 0-1 scale.
sc = pitch.scatter(
    [x_val],
    [y_val],
    ax=ax,
    marker="*",
    s=1200,
    c=[selected_shot_xg],
    cmap="Reds",
    vmin=0,
    vmax=1,
    alpha=0.95,
    edgecolors="black",
    linewidth=2.5,
    zorder=10,
)

# Line from the shot location to the goal centre.
ax.plot([x_val, GOAL_X], [y_val, GOAL_Y], color="white", linewidth=2, alpha=0.85)

cbar = plt.colorbar(sc, ax=ax, fraction=0.046, pad=0.04)
cbar.set_label("xG", rotation=270, labelpad=20)

if int(selected_shot["goal"]) == 1:
    outcome_text = "Goal"
else:
    outcome_text = "No Goal"
player_name = selected_shot["shooter_player_name"]

title_parts = [
    f"Match {match_id}",
    f"{player_name}",
    f"xG {selected_shot_xg:.3f}",
    f"{outcome_text}",
]
ax.set_title(" | ".join(title_parts), fontsize=15, fontweight="bold")

# Text box with the shot's key numbers.
info = "\n".join(
    [
        f"distance: {selected_shot['distance']:.2f} m",
        f"angle: {selected_shot['angle']:.2f} deg",
        f"possession_event_id: {selected_shot['possession_event_id']}",
    ]
)
info_box = {"boxstyle": "round,pad=0.4", "facecolor": "black", "alpha": 0.45}
ax.text(3, 65, info, color="white", fontsize=11, verticalalignment="top", bbox=info_box)

plt.tight_layout()
plt.show()

In [None]:
# Text summary of the selected shot and the model diagnostics.
banner = "=" * 60
selector_label = SHOT_INPUT if str(SHOT_INPUT).strip() else "first_row"
scored = int(selected_shot["goal"]) == 1

print("\n" + banner)
print("SELECTED SHOT SUMMARY")
print(banner)
for summary_line in (
    f"Match ID: {match_id}",
    f"Shot selector input: {selector_label}",
    f"Player: {selected_shot['shooter_player_name']}",
    f"Outcome: {'Goal' if scored else 'No Goal'}",
    f"Distance: {selected_shot['distance']:.2f} m",
    f"Angle: {selected_shot['angle']:.2f} deg",
    f"xG: {selected_shot_xg:.3f}",
    f"Training Brier score: {brier:.3f}",
):
    print(summary_line)
print("AUC: not available" if np.isnan(auc) else f"AUC: {auc:.3f}")

# One-row table with the same facts; the last expression is the cell output.
summary_row = {
    "match_id": int(selected_shot["match_id"]),
    "possession_event_id": selected_shot["possession_event_id"],
    "shooter_player_name": selected_shot["shooter_player_name"],
    "distance": float(selected_shot["distance"]),
    "angle": float(selected_shot["angle"]),
    "goal": int(selected_shot["goal"]),
    "xg": float(selected_shot_xg),
}
summary_df = pd.DataFrame([summary_row])
summary_df