In [1]:
import pickle

from tqdm import tqdm

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import h5py

import processor
import xgb_model
import utils

In [2]:
data_dir = "/media/data/files/nfl_bdb_2025/data"
test_weeks = [1]
proc = processor.SeparationDataProcessor(data_dir)

In [4]:
with open("xgb_model_all_all_weeks.pkl", "rb") as f:
    bst = pickle.load(f)

In [5]:
cache_fname = proc.cache_file_fname
with h5py.File(cache_fname, "r") as all_weeks_f:
    X = []
    y = []
    play_ids = []
    for week_num in test_weeks:
        f = all_weeks_f[f"week_{week_num}"]
        n = f["seq_arr"].shape[0]
        for idx in tqdm(range(n)):
            y.append(f["separation_arr"][idx])
            play_players_df = pd.DataFrame(
                f["play_players_arr"][idx, :, :],
                columns=xgb_model.decode(f["play_players_cols"]),
            )
            play_players_df["nflId"] = play_players_df["nflId"].astype(int)
            play_overall_df = pd.DataFrame(
                f["play_overall_arr"][idx, :, :],
                columns=xgb_model.decode(f["play_overall_cols"]),
            )
            play_overall_df["nflId"] = play_overall_df["nflId"].astype(int)
            meta_df = pd.DataFrame(
                f["meta_arr"][idx, :].reshape(1, -1),
                columns=xgb_model.decode(f["meta_cols"]),
            )
            play_ids.append(play_overall_df[["gameId", "playId"]].iloc[0].astype(int))
            
            pos_cols = xgb_model.get_position_cols()
        
            seq_mask = f["seq_mask"][idx, :, 0, 0].astype(int)
            idxs = np.where(seq_mask)[0]
            seq_len = idxs.max() + 1
            pos_arr = f["seq_arr"][idx, :seq_len, :, :]
            pos_df = []
            for seq_idx in range(pos_arr.shape[0]):
                seq_df = pd.DataFrame(pos_arr[seq_idx, :, :], columns=xgb_model.decode(f["seq_cols"]))
                seq_df = seq_df.drop(seq_df[seq_df["club_football"] == 1.0].index)
                seq_df["nflId"] = seq_df["nflId"].astype(int)
                
                seq_df = seq_df.merge(play_players_df, how="outer", on="nflId")
                seq_df = seq_df.merge(play_overall_df, how="outer", on=["gameId", "playId", "nflId"])
                
                seq_df["position_ord"] = np.argmax(seq_df[pos_cols].to_numpy(), axis=1)
                seq_df = seq_df.sort_values(by="position_ord")
                seq_df = seq_df[seq_df["wasTargettedReceiver"] == 1.0]
                seq_df = seq_df.merge(meta_df, how="outer", on=["gameId", "playId"])

                pos_df.append(seq_df)
            X.append(pd.concat(pos_df, axis=0))

            if idx > 25:
                break
        break
play_ids = pd.concat(play_ids, axis=1).T

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  4.74it/s]


In [7]:
in_motion_idxs = np.where([x["inMotionAtBallSnap"].sum() > 0 for x in X])
shift_idxs = np.where([x["shiftSinceLineset"].sum() > 0 for x in X])
motion_idxs = np.where([x["motionSinceLineset"].sum() > 0 for x in X])

In [8]:
pred_arr = []
for idx in tqdm(range(len(X))):
    y_true = y[idx]
    seq_df = X[idx][utils.get_target_feature_cols()]
    pred_sep = bst.predict(seq_df)
    pred_arr.append(pred_sep)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 28.21it/s]


In [9]:
in_motion_idxs

(array([], dtype=int64),)

In [10]:
idx = 24

print(f"True Final Separation: {y[idx]}")
pred_lp = np.convolve(pred_arr[idx], np.ones(3)/3, mode='valid')

plt.figure()
plt.plot(pred_arr[idx])
plt.plot(pred_lp)
plt.show()

IndexError: list index out of range

In [11]:
def ste(sep_arr):
    if len(pred_sep) < 5:
        return -1        
        
    n_steps = 3
    pred_lp = np.convolve(pred_sep, np.ones(n_steps)/n_steps, mode='valid')

    t_max = np.argmax(pred_lp)
    s_max = pred_lp[t_max]

    t_snap = len(pred_lp) - 1
    s_snap = pred_lp[-1]

    ste_val = (s_snap / s_max) * ((t_max + 1) / (t_snap + 1))

    return ste_val

In [12]:
ste_arr = []

for idx, pred_sep in enumerate(pred_arr):
    if len(pred_sep) < 5:
        ste_val = -1
        continue
        
    n_steps = 3
    pred_lp = np.convolve(pred_sep, np.ones(n_steps)/n_steps, mode='valid')

    t_max = np.argmax(pred_lp)
    s_max = pred_lp[t_max]

    t_snap = len(pred_lp) - 1
    s_snap = pred_lp[-1]

    ste_val = (s_snap / s_max) * ((t_max + 1) / (t_snap + 1))

    ste_arr.append(ste_val)

In [13]:
min_idx = np.argmin(ste_arr)
print(play_ids[min_idx])
plt.figure()
plt.plot(pred_arr[min_idx])
plt.show()

KeyError: np.int64(3)

In [14]:
with h5py.File(cache_fname, "r") as all_weeks_f:
    for week_num in test_weeks:
        f = all_weeks_f[f"week_{week_num}"]
        meta_df = pd.DataFrame(
            f["meta_arr"],
            columns=xgb_model.decode(f["meta_cols"]),
        )
        break

In [15]:
play_ids

Unnamed: 0,gameId,playId
0,2022091136,923
0,2022091136,3544
0,2022091136,2502
0,2022091136,1041
0,2022091136,1772


In [19]:
data = utils.load_data(data_dir)
play_data = data["play"]

Loading data from disk.... Data Loaded! Load time: 0.670 seconds


In [None]:
play_ids.iloc[0]["gameId"].astype(int).item()

In [None]:
play_ids.iloc[0].astype(int)["gameId"].item()

In [None]:
play_ids.iloc[0]["gameId"]

In [22]:
play_data["gameId"]

0        2022102302
1        2022091809
2        2022103004
3        2022110610
4        2022102700
            ...    
16119    2022110604
16120    2022103005
16121    2022092502
16122    2022091809
16123    2022101602
Name: gameId, Length: 16124, dtype: int64

In [None]:
pd.concat([x.iloc[0].astype(int) for x in play_ids], axis=1).T

In [None]:
(play_data["gameId"] == 2022110592).sum()

In [None]:
max_ste = np.argmax(ste_arr)
plt.figure()
plt.plot(pred_arr[max_ste])
plt.show()