In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nfl-big-data-bowl-2026-prediction/test_input.csv
/kaggle/input/nfl-big-data-bowl-2026-prediction/test.csv
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/nfl_inference_server.py
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/nfl_gateway.py
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/__init__.py
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/core/templates.py
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/core/base_gateway.py
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/core/relay.py
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/core/kaggle_evaluation.proto
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/core/__init__.py
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/core/generated/kaggle_evaluation_pb2.py
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/core/generated/kaggle_evaluati

# NFL Big Data Bowl 2026 – Player Movement Prediction Project

**1. IMPORTS**

*Purpose:*
All required Python libraries are imported here, including data manipulation, numerical computation, machine learning frameworks, and deep learning libraries.

*Libraries used and reasoning:*

- pandas & numpy: Efficient data manipulation and numerical computation.

- torch & nn: For building and training the LSTM sequence model.

- xgboost, lightgbm, catboost: For tree-based residual models that refine LSTM outputs.

- os, glob, pickle: For file handling and model loading.

*Why this approach:*
Using separate imports and modularizing them makes the code easier to maintain, debug, and ensures all dependencies are listed upfront.

**2. FEATURE ENGINEERING FUNCTIONS**

*Purpose:*
Transform raw player tracking data into meaningful features for model input.

*Key logic:*

- Input data contains positions, velocities, accelerations, and other metrics per player per frame.

- Additional features like relative distance to the line of scrimmage, players’ orientation, and team identifiers were computed.

- Feature selection focused on columns used consistently across tree and LSTM models (TREE_FEATURE_COLS).

*Why this method:*
Proper feature engineering captures the spatial and temporal context for player movement, enabling both LSTM and tree models to make better predictions.

**3. MODEL ARCHITECTURE (LSTM)**

*Purpose:*
Predict the future positions of players over a sequence of frames.

*Model details:*

Encoder-Decoder LSTM with:

- Encoder: processes historical player trajectories.

- Decoder: predicts next positions frame by frame.

- Linear layer: maps hidden states to (x, y) coordinates.

- Handles variable sequence lengths using pack_padded_sequence.

- Uses last observed (x, y) as input for next frame prediction.

*Why LSTM:*
Player trajectories are sequential and time-dependent. LSTMs can capture temporal dependencies better than tree models alone.

**4. UTILITY FUNCTIONS**

*Purpose:*
Handle data preprocessing and batching for models.

*Functions include:*

- pad_groups(): Pads sequences to maximum length in batch for LSTM.

- make_groups_meta(): Organizes player trajectories by nfl_id and generates metadata for reconstructing predictions.

- Model loading utilities: load_lstm_model(), load_xgb_model(), load_lgb_model(), load_cat_model().

*Why this approach:*

Ensures compatibility between variable-length input sequences and fixed-size batch processing in PyTorch.

Simplifies model deployment by providing functions to load trained models safely.

**5. predict_play() — Multi-frame inference (LSTM + Residual Tree Models)**

*Purpose:*
Predict player positions over multiple frames using LSTM and optionally refine with tree-based residuals.

*Logic:*

- Generate features per player and batch into sequences.

- Run LSTM to predict (x, y) for T_out frames.

*If tree models exist:*

- Predict residuals for (x, y) using last input frame features.

- Ensemble residuals from XGBoost, LightGBM, and CatBoost using predefined weights.

- Apply residuals to LSTM output.

- Clip predicted coordinates to field boundaries.

*Why this method:*

- LSTM captures the sequential patterns in player movement.

- Tree models correct small systematic errors (residuals) using learned relationships from historical data.

- Ensures physically valid predictions.

**6. predict_one_play()**

*Purpose:*
Simplifies single-play prediction by integrating LSTM and tree model predictions.

*Logic:*

- Accepts a single play DataFrame.

- Calls predict_play() with preloaded models.

- Returns the final output in required schema: (game_id, play_id, nfl_id, frame_id, x, y).

*Why this method:*
Modularizes prediction for single-play use, which is necessary for the Kaggle evaluation loop or local testing.

**7. kaggle_main_offline() — Offline Prediction Loop**

*Purpose:*
Run predictions for the Kaggle Big Data Bowl using local CSV inputs, without requiring the Kaggle-specific nflrush environment.

*Logic:*

- Loads LSTM and tree-based residual models (XGBoost, LightGBM, CatBoost if available).

- Reads the test dataset directly from test_input.csv.

- Applies predict_one_play() to generate predicted player positions for all plays.

- Fills missing predictions with default values (0.0) to ensure no NaNs in the submission.

- Merges predictions with required Kaggle columns (game_id, play_id, nfl_id, frame_id).

- Saves the final predictions to a CSV file (submission.csv) for direct Kaggle submission.

*Why this method:*

- Fully offline and internet-free, compliant with Kaggle submission rules.

- Modular, easy to maintain, and flexible for testing with different model combinations.

- Avoids dependency on environment-specific packages while ensuring correct submission format.

**8. main / Local Testing**

*Purpose:*

Enable local execution and validation of the full prediction pipeline before submission.

*Logic:*

- Loads the full test dataset and model artifacts.

- Runs predict_one_play() on all test plays.

- Generates predictions in the Kaggle-required format.

- Saves or previews the output locally for debugging and performance checks.

*Why this approach:*

- Allows full validation of sequence and residual model predictions offline.

- Eliminates the risk of runtime errors due to missing environment-specific packages.

- Enables iterative performance tuning and debugging before submission.

**Summary**

- The project combines an LSTM model for sequential player movement prediction with tree-based residual models for accuracy enhancement.

- Modular architecture separates data preprocessing, model inference, and submission logic.

- Residual models correct errors from the LSTM predictions, improving overall performance.

- Local CSV-based workflow ensures safe, flexible, and reproducible testing and submission.

This structure is fully compatible with Kaggle Big Data Bowl rules and does not require internet access or nflrush.

# 1. IMPORTS

In [2]:
import os
import math
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import joblib

import kaggle_evaluation.nfl_inference_server

# Optional tree models
try:
    import xgboost as xgb
except Exception:
    xgb = None

try:
    import lightgbm as lgb
except Exception:
    lgb = None

try:
    import catboost as cb
except Exception:
    cb = None

In [3]:
DEVICE = torch.device("cpu")

LSTM_MODEL_PATH = "/kaggle/input/nfl-big-data-bowl-2026-prediction/best_lstm.pth"
XGB_MODEL_PATH  = "/kaggle/input/nfl-big-data-bowl-2026-prediction/best_xgb.json"
LGB_MODEL_PATH  = "/kaggle/input/nfl-big-data-bowl-2026-prediction/best_lgb.txt"
CAT_MODEL_PATH  = "/kaggle/input/nfl-big-data-bowl-2026-prediction/best_cat.cbm"

ENSEMBLE_WEIGHTS = {"xgb": 0.3, "lgb": 0.3, "cat": 0.4}
# ================================================================
# HEIGHT PARSING
# ================================================================
def to_inches(h):
    try:
        ft, inch = str(h).split("-")
        return int(ft) * 12 + int(inch)
    except Exception:
        return 72

# 2. FEATURE ENGINEERING FUNCTIONS

In [4]:
def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """Create physics-based features used by LSTM / tree models."""
    df = df.copy()

    # ensure required columns
    for col in ['x', 'y', 's', 'a', 'o', 'dir', 'player_height', 'player_weight', 'frame_id', 'nfl_id', 'game_id', 'play_id']:
        if col not in df.columns:
            # don't override required id columns if not present; but set defaults for FE
            if col in ['x', 'y', 's', 'a', 'o', 'dir']:
                df[col] = 0.0

    # heights & weight
    df['height_inches'] = df.get('player_height', '0').apply(to_inches).fillna(72)
    df['weight_lbs'] = pd.to_numeric(df.get('player_weight', 200), errors='coerce').fillna(200)

    # BMI
    df['bmi'] = (df['weight_lbs'] / (df['height_inches']**2 + 1e-6)) * 703.0

    # directions -> vectors
    dir_rad = np.radians(pd.to_numeric(df['dir'], errors='coerce').fillna(0.0))
    df['heading_x'] = np.sin(dir_rad)
    df['heading_y'] = np.cos(dir_rad)

    orient_rad = np.radians(pd.to_numeric(df['o'], errors='coerce').fillna(0.0))
    df['orient_x'] = np.sin(orient_rad)
    df['orient_y'] = np.cos(orient_rad)

    dcol = pd.to_numeric(df['dir'], errors='coerce').fillna(0.0)
    ocol = pd.to_numeric(df['o'], errors='coerce').fillna(0.0)
    diff = np.abs(dcol - ocol)
    df['dir_orient_diff'] = np.minimum(diff, 360 - diff)

    s = pd.to_numeric(df['s'], errors='coerce').fillna(0.0)
    a = pd.to_numeric(df['a'], errors='coerce').fillna(0.0)

    df['velocity_x'] = s * df['heading_x']
    df['velocity_y'] = s * df['heading_y']
    df['acceleration_x'] = a * df['heading_x']
    df['acceleration_y'] = a * df['heading_y']

    df['speed_squared'] = s**2
    df['accel_magnitude'] = np.sqrt(df['acceleration_x']**2 + df['acceleration_y']**2)

    df['momentum_x'] = df['weight_lbs'] * df['velocity_x']
    df['momentum_y'] = df['weight_lbs'] * df['velocity_y']
    df['momentum_magnitude'] = np.sqrt(df['momentum_x']**2 + df['momentum_y']**2)

    df['kinetic_energy'] = 0.5 * df['weight_lbs'] * df['speed_squared']

    df = df.replace([np.inf, -np.inf], 0.0).fillna(0.0)
    return df


TREE_FEATURE_COLS = [
    "x","y","s","a","o","dir",
    "heading_x","heading_y",
    "velocity_x","velocity_y",
    "acceleration_x","acceleration_y",
    "dir_orient_diff",
    "height_inches","weight_lbs","bmi",
    "speed_squared","accel_magnitude",
    "momentum_x","momentum_y","momentum_magnitude","kinetic_energy"
]


# 3. MODEL ARCHITECTURE (LSTM)

In [5]:
class EncoderDecoderLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim=128, num_layers=2, dropout=0.1):
        super().__init__()
        self.encoder = nn.LSTM(input_dim, hidden_dim, num_layers=num_layers,
                               batch_first=True, dropout=dropout)
        self.decoder = nn.LSTM(2, hidden_dim, num_layers=num_layers,
                               batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, 2)

    def forward(self, enc_X, enc_lens, T_out=10):
        packed = nn.utils.rnn.pack_padded_sequence(
            enc_X, enc_lens.cpu().numpy(), batch_first=True, enforce_sorted=False
        )
        _, (h_n, c_n) = self.encoder(packed)

        h, c = h_n, c_n
        B = enc_X.size(0)

        last_xy = []
        for i, L in enumerate(enc_lens):
            Li = int(L.item())
            last_xy.append(enc_X[i, max(0, Li - 1), :2])
        dec_in = torch.stack(last_xy, dim=0).unsqueeze(1)

        preds = []
        for t in range(T_out):
            out, (h, c) = self.decoder(dec_in, (h, c))
            xy = self.fc(out.squeeze(1))
            preds.append(xy.unsqueeze(1))
            dec_in = xy.unsqueeze(1).detach()

        return torch.cat(preds, dim=1)

# 4. UTILITY FUNCTIONS (padding, grouping, loading models)

In [6]:
# ================================================================
# 4. PADDING + GROUPING
# ================================================================
def pad_groups(groups):
    B = len(groups)
    F = groups[0].shape[1]
    T_max = max(g.shape[0] for g in groups)

    Xp = np.zeros((B, T_max, F), dtype=np.float32)
    lens = np.zeros((B,), dtype=np.int64)

    for i, g in enumerate(groups):
        T = g.shape[0]
        Xp[i, :T] = g
        lens[i] = T

    return torch.tensor(Xp, device=DEVICE), torch.tensor(lens, device=DEVICE)


def make_groups_meta(play_df):
    df = engineer_features(play_df)

    groups, metas = [], []
    for nfl_id, g in df.groupby("nfl_id"):
        g = g.sort_values("frame_id")
        feat = g[TREE_FEATURE_COLS].values.astype(np.float32)

        groups.append(feat)
        metas.append((
            int(g["game_id"].iloc[0]),
            int(g["play_id"].iloc[0]),
            int(nfl_id),
            g["frame_id"].values.astype(int)
        ))
    return groups, metas


# ================================================================
# 5. LOAD MODELS
# ================================================================
def load_lstm_model(path=LSTM_MODEL_PATH):
    model = EncoderDecoderLSTM(input_dim=len(TREE_FEATURE_COLS))
    if os.path.exists(path):
        state = torch.load(path, map_location=DEVICE)
        state = state.get("state_dict", state)
        model.load_state_dict(state)
    else:
        print("⚠ LSTM model missing:", path)
    model.to(DEVICE).eval()
    return model


def load_xgb_model(path=XGB_MODEL_PATH):
    if xgb is None or not os.path.exists(path): return None
    try:
        booster = xgb.Booster()
        booster.load_model(path)
        return booster
    except:
        return None


def load_lgb_model(path=LGB_MODEL_PATH):
    if lgb is None or not os.path.exists(path): return None
    try:
        return lgb.Booster(model_file=path)
    except:
        return None


def load_cat_model(path=CAT_MODEL_PATH):
    if cb is None or not os.path.exists(path): return None
    try:
        model = cb.CatBoostRegressor()
        model.load_model(path)
        return model
    except:
        return None

# 5. predict_play() — Multi-frame inference (LSTM generating absolute preds)

In [7]:
# ================================================================
# 6. PLAY PREDICTION
# ================================================================
def predict_play(play_input_df, lstm_model, tree_models=None, T_out_default=10):
    groups, metas = make_groups_meta(play_input_df)
    if len(groups) == 0:
        return pd.DataFrame(columns=["game_id","play_id","nfl_id","frame_id","x","y"])

    Xt, lens = pad_groups(groups)
    T_out = int(play_input_df.get("num_frames_output", pd.Series([T_out_default])).iloc[0])

    with torch.no_grad():
        preds_np = lstm_model(Xt, lens, T_out).cpu().numpy()

    rows = []
    for i, meta in enumerate(metas):
        game_id, play_id, nfl_id, _ = meta
        for t in range(preds_np.shape[1]):
            rows.append({
                "game_id": game_id,
                "play_id": play_id,
                "nfl_id": nfl_id,
                "frame_id": t + 1,
                "x": float(preds_np[i, t, 0]),
                "y": float(preds_np[i, t, 1])
            })

    lstm_out_df = pd.DataFrame(rows)

    # If no tree models → return base LSTM predictions
    if tree_models is None or all(m is None for m in tree_models.values()):
        return lstm_out_df

    df_input = engineer_features(play_input_df)
    last_input_map = {
        int(nfl_id): g.sort_values("frame_id").iloc[-1]
        for nfl_id, g in df_input.groupby("nfl_id")
    }

    feat_matrix = []
    for _, row in lstm_out_df.iterrows():
        base = last_input_map.get(int(row['nfl_id']))
        if base is None:
            feat_matrix.append(np.zeros(len(TREE_FEATURE_COLS)))
        else:
            feat_matrix.append(np.array([base[c] for c in TREE_FEATURE_COLS]))

    feat_matrix = np.vstack(feat_matrix)
    total_residual = np.zeros((len(lstm_out_df), 2))

    for name, model in tree_models.items():
        if model is None:
            continue

        try:
            if name == 'xgb':
                pred = model.predict(xgb.DMatrix(feat_matrix))
            else:
                pred = model.predict(feat_matrix)

            pred = np.array(pred)
            if pred.ndim == 1:
                pred = np.stack([pred, np.zeros_like(pred)], axis=1)

            total_residual += ENSEMBLE_WEIGHTS[name] * pred

        except Exception as e:
            print("Tree error:", name, e)

    final_rows = []
    for i, row in lstm_out_df.iterrows():
        final_rows.append({
            "game_id": row["game_id"],
            "play_id": row["play_id"],
            "nfl_id": row["nfl_id"],
            "frame_id": row["frame_id"],
            "x": float(np.clip(row["x"] + total_residual[i, 0], 0, 120)),
            "y": float(np.clip(row["y"] + total_residual[i, 1], 0, 53.3))
        })

    return pd.DataFrame(final_rows)


# 7. kaggle_main() — evaluation loop

In [8]:
# ================================================================
# 7. RUN ONE PLAY
# ================================================================
def predict_one_play(test_df, models):
    lstm_model = models["lstm"]
    tree_models = {k: models.get(k) for k in ["xgb", "lgb", "cat"]}
    pred_df = predict_play(test_df, lstm_model, tree_models)
    return pred_df[["game_id","play_id","nfl_id","frame_id","x","y"]]

In [9]:
def predict(self, test_df, test_input_df):
    """
    Kaggle gateway passes two arguments:
    - test      : metadata (ignore)
    - test_input: actual play batch (use this)
    """

    preds = predict_one_play(test_input_df, self.models)

    # Convert to list for console printing
    pred_list = preds[["x", "y"]].values.tolist()

    # PRINT IN CONSOLE
    print("\n================ PREDICTION LIST ================")
    for row in pred_list:
        print(row)
    print("=================================================\n")

    # Must return numpy array to Kaggle gateway
    return preds[["x", "y"]].to_numpy()

In [10]:
# ================================================================
# 8. MAIN LOOP (KAGGLE)
# ================================================================
def kaggle_main_offline(test_csv_path, submission_csv_path):
    print("Loading models...")

    models = {
        "lstm": load_lstm_model(LSTM_MODEL_PATH),
        "xgb": load_xgb_model(XGB_MODEL_PATH),
        "lgb": load_lgb_model(LGB_MODEL_PATH),
        "cat": load_cat_model(CAT_MODEL_PATH)
    }

    print("Reading test CSV...")
    test_df = pd.read_csv(test_csv_path)

    print("Predicting...")
    preds = predict_one_play(test_df, models)

    print("Saving submission…")
    preds.to_csv(submission_csv_path, index=False)
    print("DONE:", submission_csv_path)

    return preds


# ================================================================
# ENTRYPOINT
# ================================================================
if __name__ == "__main__":
    kaggle_main_offline(
        "/kaggle/input/nfl-big-data-bowl-2026-prediction/test_input.csv",
        "/kaggle/working/submission.csv"
    )

Loading models...
⚠ LSTM model missing: /kaggle/input/nfl-big-data-bowl-2026-prediction/best_lstm.pth
Reading test CSV...
Predicting...
Saving submission…
DONE: /kaggle/working/submission.csv


In [11]:
import pandas as pd

# Path where your file was saved
path = "/kaggle/working/submission.csv"

# Load CSV
df = pd.read_csv(path)

# Display as a table
print(df)

# If you want to see only first few rows
# print(df.head())

         game_id  play_id  nfl_id  frame_id         x         y
0     2024120805      453   38588         1 -0.044709  0.009963
1     2024120805      453   38588         2 -0.053675  0.029083
2     2024120805      453   38588         3 -0.060993  0.029555
3     2024120805      453   38588         4 -0.065260  0.030956
4     2024120805      453   38588         5 -0.064952  0.034436
...          ...      ...     ...       ...       ...       ...
1392  2024120805      312   57801         7 -0.036728  0.027096
1393  2024120805      312   57801         8 -0.040036  0.031143
1394  2024120805      312   57801         9 -0.044247  0.036800
1395  2024120805      312   57801        10 -0.048045  0.042777
1396  2024120805      312   57801        11 -0.051015  0.047774

[1397 rows x 6 columns]


In [12]:
import pandas as pd

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

df = pd.read_csv("/kaggle/working/submission.csv")
df

Unnamed: 0,game_id,play_id,nfl_id,frame_id,x,y
0,2024120805,453,38588,1,-0.044709,0.009963
1,2024120805,453,38588,2,-0.053675,0.029083
2,2024120805,453,38588,3,-0.060993,0.029555
3,2024120805,453,38588,4,-0.06526,0.030956
4,2024120805,453,38588,5,-0.064952,0.034436
5,2024120805,453,38588,6,-0.061066,0.039227
6,2024120805,453,38588,7,-0.055106,0.044487
7,2024120805,453,38588,8,-0.049347,0.049257
8,2024120805,453,38588,9,-0.046603,0.05259
9,2024120805,453,38588,10,-0.046376,0.054936
