In [None]:
# ================================================================================
# NFL BIG DATA BOWL 2026 - FULLY WORKING COMPREHENSIVE SOLUTION
# Complete Implementation with All Features & Visualizations
# ================================================================================

import os
import glob
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from tqdm.auto import tqdm

# Models
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor

# Preprocessing & Analysis
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error

import matplotlib.gridspec as gridspec
from matplotlib.patches import Rectangle

warnings.filterwarnings('ignore')

# Global plot appearance. The palette is set after the style sheet so the
# style's colour cycle does not override it.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
plt.rcParams.update({'figure.dpi': 100})

# Start-up banner, emitted in one call (same four lines on stdout).
print("\n".join([
    "=" * 90,
    " " * 10 + "NFL BIG DATA BOWL 2026 - FULLY WORKING SOLUTION",
    " " * 15 + "Complete Implementation with All Features",
    "=" * 90,
]))

# ================================================================================
# PART 1: DATA LOADING
# ================================================================================
print("\nðŸ“Š PART 1: DATA LOADING")
print("="*90)

# Root folder of the competition data. Set the NFL_BDB_DATA_DIR environment
# variable to run on another machine; the default keeps the original location.
DATA_DIR = os.environ.get(
    "NFL_BDB_DATA_DIR",
    "/Users/ryanparks/Downloads/compsci_ai/big_data_bowl/nfl-big-data-bowl-2026-prediction/",
)

# Load data: one input/output CSV per week, stacked into single frames.
print("Loading training data...")
input_files = sorted(glob.glob(os.path.join(DATA_DIR, "train/input_2023_w*.csv")))
output_files = sorted(glob.glob(os.path.join(DATA_DIR, "train/output_2023_w*.csv")))
if not input_files or not output_files:
    # Fail with the path in the message instead of pd.concat's
    # "No objects to concatenate".
    raise FileNotFoundError(
        f"No weekly train CSVs found under {os.path.join(DATA_DIR, 'train')!r}; "
        "check DATA_DIR / NFL_BDB_DATA_DIR."
    )

df_in = pd.concat((pd.read_csv(p) for p in tqdm(input_files, desc="Input files")), ignore_index=True)
df_out = pd.concat((pd.read_csv(p) for p in tqdm(output_files, desc="Output files")), ignore_index=True)

print("Loading test data...")
test_in = pd.read_csv(os.path.join(DATA_DIR, "test_input.csv"))
test_template = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
# os.path.join (not string concatenation) so DATA_DIR works with or without a trailing slash.
supp_path = os.path.join(DATA_DIR, "supplementary_data.csv")
supplementary = pd.read_csv(supp_path)

# Heuristic: contains 'zone' (case-insensitive) -> 'zone'; else 'man'; NaN -> 'unknown'
cov_raw = supplementary["team_coverage_type"]
cov_str = cov_raw.astype("string").str.lower()
has_zone = cov_str.str.contains("zone", na=False)
coverage_man_zone = pd.Series("unknown", index=supplementary.index, dtype="string")
coverage_man_zone = coverage_man_zone.mask(cov_str.notna() & has_zone, "zone")
coverage_man_zone = coverage_man_zone.mask(cov_str.notna() & ~has_zone, "man")

# One-hot style indicator columns derived from the three-way label.
supplementary["coverage_man_zone"] = coverage_man_zone
supplementary["coverage_is_man"] = (supplementary["coverage_man_zone"] == "man").astype("int8")
supplementary["coverage_is_zone"] = (supplementary["coverage_man_zone"] == "zone").astype("int8")
supplementary["coverage_is_unknown"] = (supplementary["coverage_man_zone"] == "unknown").astype("int8")

# Keep one row per (game_id, play_id) for a clean many:one merge
key_cols = ["game_id", "play_id"]
for k in key_cols:
    if k not in supplementary.columns:
        raise KeyError(f"'{k}' column is required in supplementary_data.csv for merging.")
supp_play = (supplementary
                .drop_duplicates(subset=key_cols, keep="first")
                [key_cols + ["coverage_man_zone", "coverage_is_man", "coverage_is_zone", "coverage_is_unknown"]]
                .reset_index(drop=True))

# Merge into train_input / train_output (many frames to one play row).
# validate="m:1" makes pandas raise if supp_play ever has duplicate keys.
df_in = df_in.merge(
    supp_play, on=key_cols, how="left", validate="m:1"
)
df_out = df_out.merge(
    supp_play, on=key_cols, how="left", validate="m:1"
)

test_in = test_in.merge(
    supp_play, on=key_cols, how="left", validate="m:1"
)
test_template = test_template.merge(
    supp_play, on=key_cols, how="left", validate="m:1"
)
dist = df_in["coverage_man_zone"].value_counts(dropna=False)
print("[coverage] distribution in train_input:\n", dist.to_string())
print(f"\nâœ“ Data Shapes:")
print(f"  â€¢ Training inputs: {df_in.shape}")
print(f"  â€¢ Training outputs: {df_out.shape}")
print(f"  â€¢ Test inputs: {test_in.shape}")
print(f"  â€¢ Test template: {test_template.shape}")

In [None]:
# Example slice used by the scratch cells below: play 55 of game 2023112300,
# then the rows of player 55910 from the input and output tables.
data2 = df_in[(df_in["play_id"] == 55) & (df_in["game_id"] == 2023112300)]
player_in = data2[data2["nfl_id"] == 55910]
player_out = df_out[
    (df_out["play_id"] == 55)
    & (df_out["game_id"] == 2023112300)
    & (df_out["nfl_id"] == 55910)
]

In [None]:
def get_initial_conds(data, goal_list, nfl_id):
    """Build boundary conditions for one player's path from tracking data.

    Parameters
    ----------
    data : DataFrame
        Input tracking rows with at least ``nfl_id, x, y, s, a, dir, o``.
        ``dir`` is treated as degrees (it is passed through ``np.radians``).
    goal_list : DataFrame
        The player's output positions in frame order; must contain ``x`` and
        ``y``. The last row is used as the final position.
    nfl_id : int
        Player whose rows are selected from ``data``.

    Returns
    -------
    (df_hist, conditions)
        ``df_hist`` is an independent copy of the player's history with derived
        columns ``dir_cos, dir_sin, sin_cos, sx, sy, s_check``.
        ``conditions`` is a dict with ``xi`` (last observed position), ``vi``
        (last observed velocity components), ``xf`` (final goal position) and
        ``vf`` (finite-difference velocity into the final goal over 0.1 s).

    Raises
    ------
    IndexError
        If the player has no rows in ``data`` or ``goal_list`` is empty.
    """
    hist_cols = ["x", "y", "s", "a", "dir", "o"]
    # .copy() so the derived columns land on an independent frame rather than
    # on a slice of `data` (avoids SettingWithCopy ambiguity).
    df_hist = data.loc[data["nfl_id"] == nfl_id, hist_cols].copy()
    if df_hist.empty:
        raise IndexError(f"no input tracking rows for nfl_id={nfl_id}")
    if len(goal_list) == 0:
        raise IndexError(f"goal_list is empty for nfl_id={nfl_id}")

    heading = np.radians(df_hist["dir"])
    df_hist["dir_cos"] = np.cos(heading)
    df_hist["dir_sin"] = np.sin(heading)
    # Sanity columns: sin_cos should be 1 and s_check should reproduce s.
    df_hist["sin_cos"] = df_hist["dir_cos"] ** 2 + df_hist["dir_sin"] ** 2
    df_hist["sx"] = df_hist["s"] * np.cos(heading)
    df_hist["sy"] = df_hist["s"] * np.sin(heading)
    df_hist["s_check"] = np.sqrt(df_hist["sx"] ** 2 + df_hist["sy"] ** 2)

    initial = df_hist.iloc[-1]
    goal = goal_list.iloc[-1]
    # Final velocity is a backward difference over one 0.1 s frame. With a
    # single output frame there is no previous goal point, so fall back to the
    # last observed input position instead of failing on iloc[-2].
    previous = goal_list.iloc[-2] if len(goal_list) >= 2 else initial

    # Label-based access: integer keys on a label-indexed Series are a
    # deprecated positional fallback in pandas.
    vx = (goal["x"] - previous["x"]) / 0.1
    vy = (goal["y"] - previous["y"]) / 0.1

    conditions = {
        'xi': [initial["x"], initial["y"]],
        'vi': [initial["sx"], initial["sy"]],
        'xf': [goal["x"], goal["y"]],
        'vf': [vx, vy]
    }
    return df_hist, conditions

In [None]:
# Scratch walk-through of the same derived columns get_initial_conds builds,
# applied to the example player selected above.
goal_list = player_out[["x", "y"]]
df_hist = player_in[["x", "y", "s", "a", "dir", "o"]].assign(
    dir_cos=lambda frame: np.cos(np.radians(frame["dir"])),
    dir_sin=lambda frame: np.sin(np.radians(frame["dir"])),
    # Sanity column: cos^2 + sin^2.
    sin_cos=lambda frame: frame["dir_cos"] ** 2 + frame["dir_sin"] ** 2,
    # Speed split into components along the heading `dir` (degrees).
    sx=lambda frame: frame["s"] * np.cos(np.radians(frame["dir"])),
    sy=lambda frame: frame["s"] * np.sin(np.radians(frame["dir"])),
    # Sanity column: magnitude of (sx, sy).
    s_check=lambda frame: np.sqrt(frame["sx"] ** 2 + frame["sy"] ** 2),
)
# Final output position of the example player.
goal = goal_list.iloc[-1]

In [None]:
# Start a MATLAB session and bind it to `eng`; the wrappers
# matlab_player_path / matlab_player_path_2 call MATLAB functions through it.
# NOTE(review): this import sits outside the top-of-notebook import cell, and
# the addpath target is an absolute, machine-specific folder.
import matlab.engine
eng = matlab.engine.start_matlab()
# Put the NLALIB folder on the MATLAB search path.
eng.addpath("/Users/ryanparks/Downloads/matlab_testing/NLALIB")

In [None]:
# Also put the OptimTraj subfolder on the MATLAB search path (requires `eng`
# from the engine-start cell). NOTE(review): absolute, machine-specific path.
eng.addpath("/Users/ryanparks/Downloads/matlab_testing/NLALIB/OptimTraj")

In [None]:
def matlab_player_path(xi, vi2, xf, vf, r):
    """Call the MATLAB function ``solveOptimalRunner`` through the global engine.

    ``xi``, ``vi2``, ``xf`` and ``vf`` are 2-element sequences; each is wrapped
    as a 1x2 ``matlab.double`` and passed in that order. ``r`` is accepted for
    signature compatibility but is not forwarded to MATLAB.

    Returns whatever ``eng.solveOptimalRunner`` returns (callers in this
    notebook read ``out['X']`` and ``out['Y']`` from it).
    """
    # (if the MATLAB code expects column vectors, use size=(2, 1) instead)
    row_vectors = [matlab.double([vec], size=(1, 2)) for vec in (xi, vi2, xf, vf)]
    return eng.solveOptimalRunner(*row_vectors)

In [None]:
def matlab_player_path_2(xi, vi2, tfin, m, h, E, F, target):
    """Call the MATLAB function ``energyConstrainedPath`` through the global engine.

    ``xi``, ``vi2`` and ``target`` are 2-element sequences wrapped as 1x2
    ``matlab.double`` values; ``tfin``, ``m``, ``h``, ``E`` and ``F`` are
    scalars wrapped as 1x1 values. MATLAB receives them in the order
    ``(h, m, E, F, tfin, xi, vi2, target)``.

    Returns whatever ``eng.energyConstrainedPath`` returns (callers in this
    notebook read ``out['X']`` and ``out['Y']`` from it).
    """
    def as_scalar(value):
        return matlab.double([value], size=(1, 1))

    def as_row(pair):
        # (if the MATLAB code expects column vectors, use size=(2, 1) instead)
        return matlab.double([pair], size=(1, 2))

    return eng.energyConstrainedPath(
        as_scalar(h),
        as_scalar(m),
        as_scalar(E),
        as_scalar(F),
        as_scalar(tfin),
        as_row(xi),
        as_row(vi2),
        as_row(target),
    )

In [None]:
def resample_path(X, Y, tf, dt, x_true, y_true):
    """Linearly resample a path onto uniform tracking times.

    The samples of ``X``/``Y`` are assumed to be evenly spread over
    ``[0, tf]``. The output has one row per element of ``x_true`` at times
    ``0, dt, 2*dt, ...``; only the length of ``x_true`` is used, never its
    values. ``y_true`` is flattened but otherwise unused.

    Returns a DataFrame with columns ``t``, ``x``, ``y``.
    """
    path_x = np.array(X).flatten()
    path_y = np.array(Y).flatten()
    truth_x = np.asarray(x_true).ravel()
    truth_y = np.asarray(y_true).ravel()  # flattened like x_true; not used below

    # Times attached to the path samples and to the requested output frames.
    source_times = np.linspace(0, tf, len(path_x))
    frame_times = dt * np.arange(len(truth_x))

    return pd.DataFrame({
        't': frame_times,
        'x': np.interp(frame_times, source_times, path_x),
        'y': np.interp(frame_times, source_times, path_y),
    })

In [None]:


def resample_path_nearest(X, Y, tf, dt, x_true, y_true, enforce_monotone=True):
    """
    Resample a dense predicted path (X,Y) onto uniform times by
    choosing, at each tracking time, the path point closest to the *actual* (x_true,y_true).

    Parameters
    ----------
    X, Y : array-like
        Dense predicted path samples, both of length n_pred (n_pred >= 1).
    tf : float
        Final time (seconds). Accepted for signature compatibility; the output
        times are derived from ``dt`` and the length of ``x_true`` only.
    dt : float
        Tracking interval (seconds).
    x_true, y_true : array-like
        Actual tracking coordinates at uniform times [0, dt, 2*dt, ...];
        must have equal length n_true.
    enforce_monotone : bool
        If True, ensure chosen indices along (X,Y) are non-decreasing in time.

    Returns
    -------
    DataFrame with columns ['t', 'x', 'y'] aligned to tracking times.

    Raises
    ------
    ValueError
        If the path is empty, or X/Y or x_true/y_true differ in length.
    """
    X = np.asarray(X).ravel()
    Y = np.asarray(Y).ravel()
    x_true = np.asarray(x_true).ravel()
    y_true = np.asarray(y_true).ravel()

    # Explicit shape checks: without them a length-1 array would silently
    # broadcast and give a plausible-looking but wrong match.
    if X.size == 0:
        raise ValueError("predicted path (X, Y) is empty")
    if X.size != Y.size:
        raise ValueError(f"len(X)={X.size} must equal len(Y)={Y.size}")
    if x_true.size != y_true.size:
        raise ValueError(f"len(x_true)={x_true.size} must equal len(y_true)={y_true.size}")

    # One output time per tracking sample (avoids the +1 endpoint issue).
    t_new = dt * np.arange(x_true.size)

    # Pairwise squared distances: shape (n_true, n_pred)
    # (Fine for typical n_pred ~ few hundred; avoids extra deps.)
    dx = x_true[:, None] - X[None, :]
    dy = y_true[:, None] - Y[None, :]
    D2 = dx*dx + dy*dy

    # Best match index at each uniform time
    idx = np.argmin(D2, axis=1)

    if enforce_monotone:
        # Non-decreasing pass to avoid going "backwards" on the predicted path
        idx = np.maximum.accumulate(idx)

    return pd.DataFrame({'t': t_new, 'x': X[idx], 'y': Y[idx]})

In [None]:
def get_play_paths_2(play_in, play_out, tf, F, arb):
    """Solve an energy-constrained MATLAB path for each coverage defender in a play.

    Parameters
    ----------
    play_in : DataFrame
        Input tracking rows of one play (needs ``nfl_id, player_role, x, y,
        ball_land_x, ball_land_y`` plus the columns ``get_initial_conds`` reads).
    play_out : DataFrame
        Output tracking rows of the same play (needs ``nfl_id, x, y``).
    tf : float
        Time horizon passed to the solver.
    F : float
        Forwarded unchanged to ``matlab_player_path_2``.
    arb : float
        Scale factor; the energy budget passed to the solver is ``tf * arb``.

    Returns
    -------
    (merged_play, errs)
        ``merged_play`` stacks the solver paths (columns ``x, y, nfl_id``) of
        all players with role 'Defensive Coverage' that also appear in
        ``play_out``. ``errs`` maps ``str(nfl_id)`` to the *squared* distance
        between the path's last point and the player's final true position.
        When no such player exists, an empty frame with columns ``x, y`` and
        an empty dict are returned.
    """
    coverage_ids = set(play_in.loc[play_in["player_role"] == 'Defensive Coverage', "nfl_id"])
    targetPos = [play_in["ball_land_x"].iloc[0], play_in["ball_land_y"].iloc[0]]
    # Sorted so the row order of the result does not depend on set iteration order.
    nfl_ids = sorted(coverage_ids & set(play_out["nfl_id"]))

    Emax = tf * arb
    h = 1.8    # fixed value forwarded to the solver's `h` argument
    m = 80.0   # fixed value forwarded to the solver's `m` argument
    errs = {}

    if len(nfl_ids) == 0:
        print("NO DEFENSIVE COVERAGE")
        return pd.DataFrame(columns=['x', 'y']), errs

    player_paths = []
    for nfl_id in nfl_ids:
        player_in = play_in.loc[play_in["nfl_id"] == nfl_id, ["x", "y"]]
        goal_list = play_out.loc[play_out["nfl_id"] == nfl_id, ["x", "y", "nfl_id"]]
        _, conds = get_initial_conds(play_in, goal_list, nfl_id)

        # Initial velocity: last input position -> first output position over
        # one 0.1 s frame. NOTE(review): this reads the first ground-truth
        # output frame. Label-based access replaces the deprecated positional
        # goal[0] / goal[1] lookups.
        last_seen = player_in.iloc[-1]
        first_goal = goal_list.iloc[0]
        velo = [
            (first_goal["x"] - last_seen["x"]) / 0.1,
            (first_goal["y"] - last_seen["y"]) / 0.1,
        ]

        out = matlab_player_path_2(conds['xi'], velo, tf, m, h, Emax, F, targetPos)
        X = np.array(out['X']).flatten()
        Y = np.array(out['Y']).flatten()
        df = pd.DataFrame({'x': X, 'y': Y})
        df['nfl_id'] = nfl_id

        # Squared endpoint error against the final true position.
        errs[f'{nfl_id}'] = (conds['xf'][0] - X[-1])**2 + (conds['xf'][1] - Y[-1])**2
        player_paths.append(df)

    # Single concat instead of growing a frame inside the loop.
    return pd.concat(player_paths, ignore_index=True), errs

In [None]:
def NLP_from_sample(plays_sample, rmse_tracker):
    """Run ``get_play_paths_2`` over a set of plays and collect the paths.

    Parameters
    ----------
    plays_sample : DataFrame
        One row per play with ``game_id`` and ``play_id`` columns.
    rmse_tracker : dict
        Updated in place. For every solved player, key
        ``"<game_id>_<play_id>_<nfl_id>"`` is set to that player's *squared*
        endpoint error (take the square root downstream).

    Returns
    -------
    DataFrame
        All solver paths stacked, tagged with ``game_id`` and ``play_id``.
        Empty (columns ``x, y, nfl_id, game_id, play_id``) when no play
        produced output, instead of failing on an unbound local.

    Notes
    -----
    Reads the notebook-level frames ``df_in`` and ``df_out``.
    """
    F = 200
    arb = 150
    per_play = []

    for _, play in plays_sample.iterrows():
        gid = play["game_id"]
        pid = play["play_id"]

        play_in = df_in[(df_in["play_id"] == pid) & (df_in["game_id"] == gid)]
        play_out = df_out[(df_out["play_id"] == pid) & (df_out["game_id"] == gid)]

        if play_out.empty:
            continue

        # Horizon in seconds: last output frame at 10 frames per second.
        last_frame = int(play_out["frame_id"].max())
        tf = last_frame / 10.0
        merged_play_2, errs = get_play_paths_2(play_in, play_out, tf, F, arb)

        # Record an entry for every solved nfl_id in this play.
        for nfl_id, se in errs.items():  # se = squared endpoint error
            rmse_tracker[f'{gid}_{pid}_{nfl_id}'] = se

        # Keep merged trajectories for visualization.
        merged_play_2["game_id"] = gid
        merged_play_2["play_id"] = pid
        per_play.append(merged_play_2)

    if not per_play:
        return pd.DataFrame(columns=['x', 'y', 'nfl_id', 'game_id', 'play_id'])
    # Single concat instead of growing a frame inside the loop.
    return pd.concat(per_play, ignore_index=True)

In [None]:
# Random sample of up to 1500 plays. The draw is intentionally unseeded, so
# each run evaluates a different subset; pass random_state=... to .sample()
# for a reproducible run.
unique_plays = df_out[['game_id','play_id']].drop_duplicates()
# Cap at the population size: sample(n=1500) raises if fewer plays exist.
plays_sample = unique_plays.sample(n=min(1500, len(unique_plays)))

# Squared endpoint error per "<game_id>_<play_id>_<nfl_id>", filled in place
# by NLP_from_sample (same per-play loop that used to be copy-pasted here).
rmse_tracker4 = {}
merged_play_3 = NLP_from_sample(plays_sample, rmse_tracker4)

# The 0.5 factor averages over the two coordinates before the square root.
all_rmse = np.array(list(rmse_tracker4.values()))
overall_rmse = np.sqrt(0.5*all_rmse.mean())
print("Overall mean endpoint RMSE:", overall_rmse)

In [None]:
# Squared endpoint error per "<game_id>_<play_id>_<nfl_id>".
rmse_tracker7_new = {}
i = 0
play_frames_13 = []

# NOTE(review): `remaining_df` is not defined in any visible cell; it must
# exist in the kernel before this cell runs.
for _, row in remaining_df.iterrows():
    gid = int(row["game_id"])
    pid = int(row["play_id"])

    play_in = df_in.query('play_id == ' + str(pid) + ' and game_id == ' + str(gid))
    play_out = df_out.query('play_id == ' + str(pid) + ' and game_id == ' + str(gid))

    if play_out.empty:
        continue

    # Horizon in seconds: last output frame at 10 frames per second.
    last_frame = int(play_out["frame_id"].max())
    tf = last_frame / 10.0

    F = 200
    arb = 150
    merged_play_2, errs = get_play_paths_2(play_in, play_out, tf, F, arb)

    # Record an entry for every solved nfl_id in this play.
    for nfl_id, se in errs.items():  # se = squared endpoint error
        key = f'{gid}_{pid}_{nfl_id}'
        rmse_tracker7_new[key] = se

    # Keep merged trajectories for visualization.
    merged_play_2["game_id"] = gid
    merged_play_2["play_id"] = pid
    play_frames_13.append(merged_play_2)

    i += 1
    print("Play i: ", i)

# Build the accumulator once from this cell's own plays. The earlier version
# concatenated onto `merged_play_3` (a different cell's result), so
# `merged_play_13` never accumulated its own rows.
if play_frames_13:
    merged_play_13 = pd.concat(play_frames_13, ignore_index=True)
else:
    merged_play_13 = pd.DataFrame(columns=['x', 'y', 'nfl_id', 'game_id', 'play_id'])

# The 0.5 factor averages over the two coordinates before the square root.
all_rmse_new = np.array(list(rmse_tracker7_new.values()))
overall_rmse_new = np.sqrt(0.5*all_rmse_new.mean())
print("Overall mean endpoint RMSE:", overall_rmse_new)

In [None]:
# Draw every play listed in `df_trials`, overlaying that play's rows from
# `trial_df_plays`.
# NOTE(review): `df_trials` and `trial_df_plays` are not defined in any visible
# cell, and `plot_play` is defined in a later cell.
for _, play in df_trials.iterrows():
    game_id, play_id = play["game_id"], play["play_id"]
    merged_top = trial_df_plays.query(f'play_id =={play_id} and game_id == {game_id}')
    plot_play(game_id, play_id, True, merged_top)

In [None]:

# Hand-picked (nfl_id, game_id, play_id) cases to inspect:
# 49410 2023111904 299
# 53589 2023101509 2672
# 56074 2023102204 723
# For each case, print the solver inputs and write the player's true output
# positions to goal_list_<nfl_id>_<play_id>.csv in the working directory.
test_list = [(49410, 2023111904, 299), (53589, 2023101509, 2672), (56074, 2023102204, 723)]
for nfl_id, gid, pid in test_list:
    play_in = df_in.query('play_id == ' + str(pid) + ' and game_id == ' + str(gid))
    play_out = df_out.query('play_id == ' + str(pid) + ' and game_id == ' + str(gid))
    targetPos = [play_in["ball_land_x"].iloc[0], play_in["ball_land_y"].iloc[0]]
    player_in = play_in.query('nfl_id == ' + str(nfl_id))[["x","y"]]
    player_out = play_out.query('nfl_id == ' + str(nfl_id))
    goal_list = player_out[["x","y", "nfl_id"]]
    _, conds = get_initial_conds(play_in, goal_list, nfl_id)
    last_frame = int(play_out["frame_id"].max())
    tf = last_frame / 10.0
    # Initial velocity: last input position -> first output position over one
    # 0.1 s frame. Label-based access replaces the deprecated positional
    # goal[0] / goal[1] lookups on a label-indexed Series.
    goal_2 = player_in.iloc[-1]
    goal = goal_list.iloc[0]
    vx = (goal["x"] - goal_2["x"]) / 0.1
    vy = (goal["y"] - goal_2["y"]) / 0.1
    velo = [vx, vy]
    print("COMBO 1 conds: ", conds, " velo: ", velo, " target: ", targetPos, " tf: ", tf)
    print(f"goal_list_{nfl_id}_{pid}.csv")
    goal_list.to_csv(f"goal_list_{nfl_id}_{pid}.csv", index=False)

In [None]:
# Merge the tracker dicts; on duplicate keys the later tracker wins.
# NOTE(review): `rmse_tracker2` / `rmse_tracker3` are not defined in any
# visible cell. `rmse_tracker4` holds squared endpoint errors per player while
# `rmse_tracker` holds per-play RMSE values, so this histogram may be mixing
# units -- confirm before interpreting it.
merged_rmse = {}
for d in (rmse_tracker, rmse_tracker2, rmse_tracker3, rmse_tracker4):
    merged_rmse.update(d)

values = list(merged_rmse.values())

# Histogram of the merged values, using the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(values, bins=30, edgecolor='black')
ax.set_xlabel("RMSE")
ax.set_ylabel("Count")
ax.set_title("Distribution of RMSE across all trackers")
fig.tight_layout()
plt.show()

In [None]:
def _score_candidate_path(out, tf, dt, goal_list, nfl_id):
    """Resample one solver result onto the tracking frames and add per-frame squared error."""
    X = np.array(out['X']).flatten()
    Y = np.array(out['Y']).flatten()
    df_path = resample_path_nearest(
        X, Y, tf, dt, goal_list["x"].to_numpy(), goal_list["y"].to_numpy()
    )
    pred = df_path[["x", "y"]].to_numpy()
    true = goal_list[["x", "y"]].to_numpy()
    df_path["se"] = ((pred - true) ** 2).sum(axis=1)
    df_path["nfl_id"] = nfl_id
    return df_path


def get_play_paths(play_in, play_out, tf):
    """Solve and score a MATLAB path for every player present in ``play_out``.

    For each player two candidates are solved with ``matlab_player_path``: one
    starting from the tracked velocity (``conds['vi']``) and one starting from
    a finite-difference velocity (last input position to first output position
    over 0.1 s). Both are resampled with ``resample_path_nearest`` and the one
    with the lower root-mean-squared per-frame error is kept; ties keep the
    first.

    Parameters
    ----------
    play_in, play_out : DataFrame
        Input and output tracking rows of one play.
    tf : float
        Time horizon in seconds (``tf*10`` is forwarded to ``rmse``).

    Returns
    -------
    DataFrame
        Stacked chosen paths with columns ``t, x, y, se, nfl_id``; empty with
        those columns when ``play_out`` has no players.

    Notes
    -----
    Calls ``rmse``, which is not defined in any visible cell.
    """
    dt = 0.1   # tracking interval (seconds)
    r = 0.2    # passed to matlab_player_path, which does not forward it
    chosen_paths = []

    # Sorted so the row order of the result does not depend on set iteration order.
    for nfl_id in sorted(set(play_out["nfl_id"])):
        player_in = play_in.loc[play_in["nfl_id"] == nfl_id, ["x", "y"]]
        goal_list = play_out.loc[play_out["nfl_id"] == nfl_id, ["x", "y", "nfl_id"]]
        _, conds = get_initial_conds(play_in, goal_list, nfl_id)

        # Candidate 1: start from the tracked velocity components.
        out = matlab_player_path(conds['xi'], conds['vi'], conds['xf'], conds['vf'], r)
        df_path = _score_candidate_path(out, tf, dt, goal_list, nfl_id)
        rmse_e = rmse(df_path, goal_list, tf*10)
        overall_rmse = np.sqrt(df_path["se"].mean())
        print(nfl_id, " Error: ", rmse_e, " new_error: ", overall_rmse)

        # Candidate 2: start from a finite-difference velocity. Label-based
        # access replaces the deprecated positional goal[0] / goal[1] lookups.
        last_seen = player_in.iloc[-1]
        first_goal = goal_list.iloc[0]
        velo = [
            (first_goal["x"] - last_seen["x"]) / 0.1,
            (first_goal["y"] - last_seen["y"]) / 0.1,
        ]
        out2 = matlab_player_path(conds['xi'], velo, conds['xf'], conds['vf'], r)
        df_path2 = _score_candidate_path(out2, tf, dt, goal_list, nfl_id)
        rmse_e2 = rmse(df_path2, goal_list, tf*10)
        overall_rmse2 = np.sqrt(df_path2["se"].mean())
        print(nfl_id, " Error2: ", rmse_e2, " new_error2: ", overall_rmse2)

        chosen_paths.append(df_path2 if overall_rmse > overall_rmse2 else df_path)

    if not chosen_paths:
        # Previously this fell through to an unbound `merged_play`.
        return pd.DataFrame(columns=['t', 'x', 'y', 'se', 'nfl_id'])
    # Single concat instead of growing a frame inside the loop.
    return pd.concat(chosen_paths, ignore_index=True)




In [None]:
# Evaluate every unique play in df_out (switch to .sample(n=...) for a quick run).
unique_plays = df_out[['game_id','play_id']].drop_duplicates()
plays_sample = unique_plays

# Per-play RMSE keyed by "<game_id>_<play_id>".
rmse_tracker = {}
play_frames = []
for _, play in plays_sample.iterrows():
    gid = play["game_id"]
    pid = play["play_id"]
    play_in = df_in.query('play_id ==' + str(pid) +' and game_id == ' + str(gid))
    play_out = df_out.query('play_id ==' + str(pid) +' and game_id == ' + str(gid))

    if play_out.empty:
        continue

    # Use this play's own output frames; the earlier code read `play_out2`,
    # which is not defined in this cell.
    last_frame = int(play_out["frame_id"].max())
    tf = last_frame/10
    merged_play_2 = get_play_paths(play_in, play_out, tf)
    # The 0.5 factor averages over the two coordinates before the square root.
    overall_rmse = np.sqrt(0.5*merged_play_2["se"].mean())
    rmse_tracker[f'{gid}_{pid}'] = overall_rmse
    merged_play_2["game_id"] = gid
    merged_play_2["play_id"] = pid
    play_frames.append(merged_play_2)

# Single concat instead of growing a frame inside the loop.
if play_frames:
    merged_play_3 = pd.concat(play_frames, ignore_index=True)
else:
    merged_play_3 = pd.DataFrame(columns=['t', 'x', 'y', 'se', 'nfl_id', 'game_id', 'play_id'])

overall_rmse = np.sqrt(0.5*merged_play_3["se"].mean())
print(overall_rmse)

In [None]:
# Reload previously saved MATLAB predictions rather than recomputing them.
# NOTE(review): this import sits outside the top-of-notebook import cell.
import joblib
# One-off save step, kept for reference (disabled):
# joblib.dump({
#     'preds_matlab': merged_play_3
# }, 'preds_matlab.joblib')

# print("Saved preds to preds_matlab.joblib")
#merged_play_3
# Overwrites `merged_play_3` with the contents of the file in the working
# directory. NOTE(review): the disabled dump above stores a dict
# {'preds_matlab': <frame>}; if the file was written that way, this yields the
# dict rather than the DataFrame -- confirm. joblib files are pickle-based, so
# only load files you trust.
merged_play_3 = joblib.load('preds_matlab.joblib')

In [None]:
def plot_play(game_id, play_id, oof, merged_play_2):
    """Scatter-plot one play: tracked positions, ball landing spot and predicted paths.

    Parameters
    ----------
    game_id, play_id
        Identify the play; rows are selected from the notebook-level frames
        ``df_in`` and ``df_out``.
    oof
        Unused; kept so existing calls keep working.
    merged_play_2 : DataFrame
        Predicted positions for this play; its ``x``/``y`` columns are drawn
        as red stars.

    Raises
    ------
    ValueError
        If the play has no rows in ``df_in``.

    Notes
    -----
    The function no longer reads the global ``preds_lstm``: that frame was
    standardised but never drawn, so it only made the plot fail when absent.
    """
    play_filter = f'game_id == {game_id} and play_id == {play_id}'
    data_in = df_in.query(play_filter).copy()
    data_out = df_out.query(play_filter).copy()
    if data_in.empty:
        raise ValueError(f"no input rows for game_id={game_id}, play_id={play_id}")

    # Minimal, consistent schema shared by both sources.
    MIN_COLS = ['game_id','play_id','frame_id','x','y','nfl_id','player_role']

    def standardize(df, source_label):
        """Return a copy of ``df`` reduced to MIN_COLS plus a ``source`` tag."""
        df = df.copy()
        # Ensure all expected columns exist (create safe defaults if missing).
        for c in MIN_COLS:
            if c in df.columns:
                continue
            if c == 'player_role':
                df[c] = 'def_air' if source_label == 'out' else 'pred_def_air'
            elif c in ('x','y'):
                df[c] = np.nan
            else:
                df[c] = pd.NA

        # Coerce numeric columns.
        for c in ['x','y','frame_id','nfl_id']:
            df[c] = pd.to_numeric(df[c], errors='coerce')

        if source_label == 'out':
            # Fill before casting: astype(str) first would turn missing values
            # into the literal 'nan' and leave fillna with nothing to fill.
            df['player_role'] = df['player_role'].fillna('def_air').astype(str)
            # Relabel the targeted receiver's output rows using the role
            # recorded in the input data.
            if 'player_role' in data_in.columns:
                targeted_ids = data_in.loc[
                    data_in['player_role'] == 'Targeted Receiver', 'nfl_id'
                ].unique()
                df.loc[df['nfl_id'].isin(targeted_ids), 'player_role'] = 'targeted_air'

        # Tag source; keep only the minimal columns to avoid dtype merge issues.
        df['source'] = source_label
        return df[MIN_COLS + ['source']]

    data_in_std = standardize(data_in, 'in')
    data_out_std = standardize(data_out, 'out')

    merged_play = pd.concat([data_in_std, data_out_std], ignore_index=True)
    g = sns.pairplot(
        merged_play,
        x_vars=["x"],
        y_vars=["y"],
        height=3.5,
        hue="player_role"  # colour-code points by role
    )

    # `g.axes[0, 0]` is the single Axes of this 1×1 pairplot.
    ax = g.axes[0, 0]
    ax.set_title('game_id = ' + str(game_id) + ' and play_id = ' + str(play_id))

    # Ball landing spot as a single black 'x'.
    ball_x = data_in["ball_land_x"].iloc[0]
    ball_y = data_in["ball_land_y"].iloc[0]
    ax.scatter(ball_x, ball_y, marker="x", s=120, color="black", label="Ball")

    # Predicted path points.
    ax.scatter(
        merged_play_2["x"],
        merged_play_2["y"],
        marker="*",
        s=5,
        c="red",
        alpha=0.8,
        label="Predicted (df_out)"
    )

    plt.show()