In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the read-only input tree,
# then echo the paths one per line.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nfl-big-data-bowl-2026-prediction/test_input.csv
/kaggle/input/nfl-big-data-bowl-2026-prediction/test.csv
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/nfl_inference_server.py
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/nfl_gateway.py
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/__init__.py
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/core/templates.py
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/core/base_gateway.py
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/core/relay.py
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/core/kaggle_evaluation.proto
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/core/__init__.py
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/core/generated/kaggle_evaluation_pb2.py
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/core/generated/kaggle_evaluati

# NFL Big Data Bowl 2026 – Player Movement Prediction Project

**1. IMPORTS**

*Purpose:*
All required Python libraries are imported here, including data manipulation, numerical computation, machine learning frameworks, and deep learning libraries.

*Libraries used and reasoning:*

- pandas & numpy: Efficient data manipulation and numerical computation.

- torch & nn: For building and training the LSTM sequence model.

- xgboost, lightgbm, catboost: For tree-based residual models that refine LSTM outputs.

- os: For file-path handling and for checking that model artifacts exist before loading them (glob and pickle are not needed by the code below).

*Why this approach:*
Using separate imports and modularizing them makes the code easier to maintain, debug, and ensures all dependencies are listed upfront.

**2. FEATURE ENGINEERING FUNCTIONS**

*Purpose:*
Transform raw player tracking data into meaningful features for model input.

*Key logic:*

- Input data contains positions, velocities, accelerations, and other metrics per player per frame.

- Additional features are derived from the raw columns: heading and orientation unit vectors, the direction–orientation angle difference, velocity and acceleration components, momentum, kinetic energy, and height/weight/BMI.

- Feature selection focused on columns used consistently across tree and LSTM models (TREE_FEATURE_COLS).

*Why this method:*
Proper feature engineering captures the spatial and temporal context for player movement, enabling both LSTM and tree models to make better predictions.

**3. MODEL ARCHITECTURE (LSTM)**

*Purpose:*
Predict the future positions of players over a sequence of frames.

*Model details:*

Encoder-Decoder LSTM with:

- Encoder: processes historical player trajectories.

- Decoder: predicts next positions frame by frame.

- Linear layer: maps hidden states to (x, y) coordinates.

- Handles variable sequence lengths using pack_padded_sequence.

- Uses last observed (x, y) as input for next frame prediction.

*Why LSTM:*
Player trajectories are sequential and time-dependent. LSTMs can capture temporal dependencies better than tree models alone.

**4. UTILITY FUNCTIONS**

*Purpose:*
Handle data preprocessing and batching for models.

*Functions include:*

- pad_groups(): Pads sequences to maximum length in batch for LSTM.

- make_groups_meta(): Organizes player trajectories by nfl_id and generates metadata for reconstructing predictions.

- Model loading utilities: load_lstm_model(), load_xgb(), load_lgb(), load_cat().

*Why this approach:*

Ensures compatibility between variable-length input sequences and fixed-size batch processing in PyTorch.

Simplifies model deployment by providing functions to load trained models safely.

**5. predict_play() — Multi-frame inference (LSTM + Residual Tree Models)**

*Purpose:*
Predict player positions over multiple frames using LSTM and optionally refine with tree-based residuals.

*Logic:*

- Generate features per player and batch into sequences.

- Run LSTM to predict (x, y) for T_out frames.

*If tree models exist:*

- Predict residuals for (x, y) using last input frame features.

- Ensemble residuals from XGBoost, LightGBM, and CatBoost using predefined weights.

- Apply residuals to LSTM output.

- Clip predicted coordinates to field boundaries.

*Why this method:*

- LSTM captures the sequential patterns in player movement.

- Tree models correct small systematic errors (residuals) using learned relationships from historical data.

- Ensures physically valid predictions.

**6. predict_one_play()**

*Purpose:*
Simplifies single-play prediction by integrating LSTM and tree model predictions.

*Logic:*

- Accepts a single play DataFrame.

- Calls predict_play() with preloaded models.

- Returns the final output in required schema: (game_id, play_id, nfl_id, frame_id, x, y).

*Why this method:*
Modularizes prediction for single-play use, which is necessary for the Kaggle evaluation loop or local testing.

**7. run_offline() — Offline Prediction Loop**

*Purpose:*
Run predictions for the Kaggle Big Data Bowl using local CSV inputs, without requiring the Kaggle-specific nflrush environment.

*Logic:*

- Loads LSTM and tree-based residual models (XGBoost, LightGBM, CatBoost if available).

- Reads the test dataset directly from test_input.csv.

- Applies predict_one_play() to generate predicted player positions for all plays.

- Fills missing predictions with default values (0.0) to ensure no NaNs in the submission.

- Merges predictions with required Kaggle columns (game_id, play_id, nfl_id, frame_id).

- Saves the final predictions to a CSV file (submission.csv) for direct Kaggle submission.

*Why this method:*

- Fully offline and internet-free, compliant with Kaggle submission rules.

- Modular, easy to maintain, and flexible for testing with different model combinations.

- Avoids dependency on environment-specific packages while ensuring correct submission format.

**8. main / Local Testing**

*Purpose:*

Enable local execution and validation of the full prediction pipeline before submission.

*Logic:*

- Loads the full test dataset and model artifacts.

- Runs predict_one_play() on all test plays.

- Generates predictions in the Kaggle-required format.

- Saves or previews the output locally for debugging and performance checks.

*Why this approach:*

- Allows full validation of sequence and residual model predictions offline.

- Eliminates the risk of runtime errors due to missing environment-specific packages.

- Enables iterative performance tuning and debugging before submission.

**Summary**

- The project combines an LSTM model for sequential player movement prediction with tree-based residual models for accuracy enhancement.

- Modular architecture separates data preprocessing, model inference, and submission logic.

- Residual models correct errors from the LSTM predictions, improving overall performance.

- Local CSV-based workflow ensures safe, flexible, and reproducible testing and submission.

This structure is fully compatible with Kaggle Big Data Bowl rules and does not require internet access or nflrush.

# 1. IMPORTS

In [2]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

# Try to import optional models
try:
    import xgboost as xgb
except Exception:
    xgb = None

try:
    import lightgbm as lgb
except Exception:
    lgb = None

try:
    import catboost as cb
except Exception:
    cb = None

In [3]:
# ======================
# CONFIG
# ======================
# Device used by pad_groups() for the input tensors and by load_lstm_model()
# for the network and its checkpoint.
DEVICE = torch.device("cpu")

# Root directory for both the test CSV and the model artifacts below.
# NOTE(review): the directory listing printed by the first cell shows no
# best_* files under this path — confirm where the trained models are attached.
BASE_PATH = "/kaggle/input/nfl-big-data-bowl-2026-prediction"

# Model artifact locations; every loader checks os.path.exists() before reading.
LSTM_MODEL_PATH = f"{BASE_PATH}/best_lstm.pth"
XGB_MODEL_PATH  = f"{BASE_PATH}/best_xgb.json"
LGB_MODEL_PATH  = f"{BASE_PATH}/best_lgb.txt"
CAT_MODEL_PATH  = f"{BASE_PATH}/best_cat.cbm"

# Multipliers applied to each tree model's prediction in predict_play()
# before the results are summed into the residual.
ENSEMBLE_WEIGHTS = {"xgb": 0.3, "lgb": 0.3, "cat": 0.4}


# ======================
# UTILS
# ======================
def to_inches(h):
    """Convert a "feet-inches" height such as "6-2" into total inches.

    The value is stringified and split on "-"; it must yield exactly two
    integer parts. Anything else (missing value, extra dashes, non-numeric
    text) falls back to 72.
    """
    try:
        feet, inches = map(int, str(h).split("-"))
    except Exception:
        return 72
    return 12 * feet + inches

# 2. FEATURE ENGINEERING FUNCTIONS

In [4]:
def engineer_features(df):
    """Return a copy of ``df`` with derived kinematic and physique columns added.

    Parameters
    ----------
    df : pandas.DataFrame
        Tracking rows. The columns ``x, y, s, a, o, dir`` are created as 0.0
        when absent. ``player_height`` ("feet-inches" strings) and
        ``player_weight`` are optional; when a column is missing the defaults
        72 inches and 200 lbs are used instead of raising.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) containing the original
        columns plus: height_inches, weight_lbs, bmi, heading_x/y,
        orient_x/y, dir_orient_diff, velocity_x/y, acceleration_x/y,
        speed_squared, accel_magnitude, momentum_x/y, momentum_magnitude and
        kinetic_energy. Infinite values are replaced by 0 and remaining NaNs
        are filled with 0 across the whole frame.
    """
    df = df.copy()

    for col in ["x", "y", "s", "a", "o", "dir"]:
        if col not in df.columns:
            df[col] = 0.0

    # `df.get(col, scalar)` hands back the bare scalar when the column is
    # missing, and a scalar has neither .apply nor .fillna — branch explicitly.
    if "player_height" in df.columns:
        df["height_inches"] = df["player_height"].apply(to_inches).fillna(72)
    else:
        df["height_inches"] = 72

    if "player_weight" in df.columns:
        df["weight_lbs"] = pd.to_numeric(df["player_weight"], errors="coerce").fillna(200)
    else:
        df["weight_lbs"] = 200.0

    # 703 is the multiplier used with pounds and inches; the epsilon avoids
    # a division by zero for a zero height.
    df["bmi"] = (df["weight_lbs"] / (df["height_inches"]**2 + 1e-6)) * 703

    # Parse each raw column once; unparseable entries count as 0.
    direction_deg = pd.to_numeric(df["dir"], errors="coerce").fillna(0)
    orientation_deg = pd.to_numeric(df["o"], errors="coerce").fillna(0)
    speed = pd.to_numeric(df["s"], errors="coerce").fillna(0)
    accel = pd.to_numeric(df["a"], errors="coerce").fillna(0)

    # The x component uses sin and the y component uses cos of the angle.
    dir_rad = np.radians(direction_deg)
    df["heading_x"] = np.sin(dir_rad)
    df["heading_y"] = np.cos(dir_rad)

    o_rad = np.radians(orientation_deg)
    df["orient_x"] = np.sin(o_rad)
    df["orient_y"] = np.cos(o_rad)

    # Smallest absolute angle between direction and orientation (0..180).
    diff = np.abs(direction_deg - orientation_deg)
    df["dir_orient_diff"] = np.minimum(diff, 360 - diff)

    df["velocity_x"] = speed * df["heading_x"]
    df["velocity_y"] = speed * df["heading_y"]
    df["acceleration_x"] = accel * df["heading_x"]
    df["acceleration_y"] = accel * df["heading_y"]

    df["speed_squared"] = speed**2
    df["accel_magnitude"] = np.sqrt(df["acceleration_x"]**2 + df["acceleration_y"]**2)

    df["momentum_x"] = df["weight_lbs"] * df["velocity_x"]
    df["momentum_y"] = df["weight_lbs"] * df["velocity_y"]
    df["momentum_magnitude"] = np.sqrt(df["momentum_x"]**2 + df["momentum_y"]**2)

    df["kinetic_energy"] = 0.5 * df["weight_lbs"] * df["speed_squared"]

    df = df.replace([np.inf, -np.inf], 0).fillna(0)
    return df


# Ordered feature columns shared by the LSTM input and the tree models.
# The order matters: make_groups_meta() builds each player's matrix in this
# order, and EncoderDecoderLSTM.forward() reads the first two columns
# ("x", "y") as the decoder's initial input.
# orient_x / orient_y are produced by engineer_features() but are not listed here.
TREE_FEATURE_COLS = [
    "x","y","s","a","o","dir",
    "heading_x","heading_y",
    "velocity_x","velocity_y",
    "acceleration_x","acceleration_y",
    "dir_orient_diff",
    "height_inches","weight_lbs","bmi",
    "speed_squared","accel_magnitude",
    "momentum_x","momentum_y","momentum_magnitude","kinetic_energy"
]

# 3. MODEL ARCHITECTURE (LSTM)

In [5]:
class EncoderDecoderLSTM(nn.Module):
    """Encoder-decoder LSTM that rolls out a sequence of 2-D outputs.

    The encoder consumes the padded feature history of each sequence. Its
    final hidden and cell states seed a decoder that starts from the first
    two feature columns of the last valid step and is then fed its own
    previous output at every following step.
    """

    def __init__(self, input_dim, hidden_dim=128, num_layers=2):
        super().__init__()
        # Attribute names and creation order are part of the checkpoint
        # contract (state_dict keys), so they are kept as they are.
        self.encoder = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.decoder = nn.LSTM(2, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 2)

    def forward(self, X, lens, T_out=10):
        """Return a (batch, T_out, 2) tensor of decoder outputs.

        ``X`` is the padded (batch, time, features) input and ``lens`` holds
        the number of valid steps in each sequence.
        """
        packed_history = nn.utils.rnn.pack_padded_sequence(
            X, lens.cpu().numpy(), batch_first=True, enforce_sorted=False
        )
        _, state = self.encoder(packed_history)

        # Gather the first two features at each sequence's last valid step
        # in one indexing operation.
        final_step = (lens - 1).clamp(min=0).to(X.device)
        batch_index = torch.arange(X.size(0), device=X.device)
        step_input = X[batch_index, final_step, :2].unsqueeze(1)

        rollout = []
        for _ in range(T_out):
            hidden_seq, state = self.decoder(step_input, state)
            xy = self.fc(hidden_seq.squeeze(1))
            rollout.append(xy)
            # The next decoder input is the current output, detached from the graph.
            step_input = xy.detach().unsqueeze(1)

        return torch.stack(rollout, dim=1)

# 4. UTILITY FUNCTIONS (padding, grouping, loading models)

In [6]:
def pad_groups(groups):
    """Zero-pad a list of (time, features) arrays into one batch.

    Returns a float32 tensor of shape (batch, longest sequence, features)
    and an int64 tensor with each sequence's true length, both created on
    DEVICE. The feature width is taken from the first array.
    """
    seq_lengths = [seq.shape[0] for seq in groups]
    padded = np.zeros(
        (len(groups), max(seq_lengths), groups[0].shape[1]), np.float32
    )

    # Copy each sequence into the leading rows of its slot; the rest stays 0.
    for row, (seq, n_steps) in enumerate(zip(groups, seq_lengths)):
        padded[row, :n_steps] = seq

    lengths = np.asarray(seq_lengths, dtype=np.int64)
    return torch.tensor(padded, device=DEVICE), torch.tensor(lengths, device=DEVICE)


def make_groups_meta(df):
    """Split tracking rows into one feature sequence per player per play.

    Rows are grouped by (game_id, play_id, nfl_id) so that a frame holding
    several plays never merges the same player's rows from different plays
    into a single sequence. For a single-play frame the result is the same
    as grouping by nfl_id alone.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw tracking rows; must contain game_id, play_id, nfl_id and frame_id.

    Returns
    -------
    (groups, metas)
        ``groups`` is a list of float32 arrays of shape
        (frames, len(TREE_FEATURE_COLS)), each sorted by frame_id.
        ``metas`` is a parallel list of
        (game_id, play_id, nfl_id, frame_ids) tuples.
    """
    df = engineer_features(df)
    groups, metas = [], []

    group_keys = ["game_id", "play_id", "nfl_id"]
    for (game_id, play_id, nfl_id), g in df.groupby(group_keys):
        g = g.sort_values("frame_id")
        groups.append(g[TREE_FEATURE_COLS].values.astype(np.float32))
        metas.append((
            int(game_id),
            int(play_id),
            int(nfl_id),
            g["frame_id"].values.astype(int)
        ))

    return groups, metas


# ======================
# LOAD MODELS
# ======================
def load_lstm_model():
    """Build the LSTM and load its checkpoint from LSTM_MODEL_PATH if present.

    The checkpoint may be a plain state dict or a dict that wraps one under
    the "state_dict" key. When the file does not exist the network keeps its
    random initial weights; a RuntimeWarning is emitted so this cannot go
    unnoticed in the cell output.

    Returns
    -------
    EncoderDecoderLSTM
        The model on DEVICE, switched to eval mode.
    """
    import warnings

    model = EncoderDecoderLSTM(len(TREE_FEATURE_COLS))
    if os.path.exists(LSTM_MODEL_PATH):
        # NOTE: torch.load unpickles the file — only load trusted checkpoints.
        state = torch.load(LSTM_MODEL_PATH, map_location=DEVICE)
        model.load_state_dict(state.get("state_dict", state))
    else:
        warnings.warn(
            f"LSTM checkpoint not found at {LSTM_MODEL_PATH}; "
            "predictions will come from randomly initialised weights.",
            RuntimeWarning,
            stacklevel=2,
        )
    model.to(DEVICE).eval()
    return model


def load_xgb():
    """Return the XGBoost booster stored at XGB_MODEL_PATH.

    Returns None when xgboost could not be imported or the file is missing.
    """
    can_load = xgb is not None and os.path.exists(XGB_MODEL_PATH)
    if not can_load:
        return None

    model = xgb.Booster()
    model.load_model(XGB_MODEL_PATH)
    return model


def load_lgb():
    """Return the LightGBM booster stored at LGB_MODEL_PATH.

    Returns None when lightgbm could not be imported or the file is missing.
    """
    if lgb is not None and os.path.exists(LGB_MODEL_PATH):
        return lgb.Booster(model_file=LGB_MODEL_PATH)
    return None


def load_cat():
    """Return the CatBoost regressor stored at CAT_MODEL_PATH.

    Returns None when catboost could not be imported or the file is missing.
    """
    if cb is not None and os.path.exists(CAT_MODEL_PATH):
        regressor = cb.CatBoostRegressor()
        regressor.load_model(CAT_MODEL_PATH)
        return regressor
    return None

# 5. predict_play() — Multi-frame inference (LSTM generating absolute preds)

In [7]:
def _residual_to_xy(raw_pred):
    """Shape one tree model's output as an (N, 2) array of (dx, dy) residuals."""
    arr = np.asarray(raw_pred, dtype=np.float64)
    if arr.ndim == 1:
        # Single-output model: the value is applied to x and y gets 0.
        return np.column_stack([arr, np.zeros_like(arr)])
    if arr.shape[1] == 1:
        return np.column_stack([arr[:, 0], np.zeros(arr.shape[0])])
    # NOTE(review): a multi-output model is assumed to emit (dx, dy) in its
    # first two columns — confirm against how the residual models were trained.
    return arr[:, :2]


def predict_play(play_df, models, T_out=10):
    """Predict future (x, y) positions for every player sequence in ``play_df``.

    Parameters
    ----------
    play_df : pandas.DataFrame
        Tracking rows accepted by make_groups_meta().
    models : dict
        Must contain "lstm". "xgb", "lgb" and "cat" are optional; a missing
        key is treated the same as None.
    T_out : int, default 10
        Number of future frames to roll out.

    Returns
    -------
    pandas.DataFrame
        Columns game_id, play_id, nfl_id, frame_id (1..T_out), x, y, ordered
        player by player. An empty DataFrame is returned when ``play_df``
        yields no sequences.

    Notes
    -----
    * When tree models are present, each predicts a residual from the
      features of the player's last input frame. The residuals are combined
      with ENSEMBLE_WEIGHTS, renormalised over the models that are actually
      loaded so that a partial ensemble is still a weighted average.
    * Coordinates are clipped to 0..120 (x) and 0..53.3 (y) on every path,
      including the LSTM-only one.
    """
    lstm = models["lstm"]
    tree_models = {k: models.get(k) for k in ["xgb", "lgb", "cat"]}

    groups, metas = make_groups_meta(play_df)
    if len(groups) == 0:
        return pd.DataFrame()

    Xp, lens = pad_groups(groups)

    with torch.no_grad():
        lstm_out = lstm(Xp, lens, T_out=T_out).cpu().numpy()

    n_players, n_frames = lstm_out.shape[0], lstm_out.shape[1]

    # Build the long-format frame in one shot: every id is repeated for each
    # output frame and frame ids 1..n_frames are tiled per player.
    ids = np.array([m[:3] for m in metas], dtype=np.int64)
    df_pred = pd.DataFrame({
        "game_id": np.repeat(ids[:, 0], n_frames),
        "play_id": np.repeat(ids[:, 1], n_frames),
        "nfl_id": np.repeat(ids[:, 2], n_frames),
        "frame_id": np.tile(np.arange(1, n_frames + 1), n_players),
        "x": lstm_out[:, :, 0].reshape(-1).astype(np.float64),
        "y": lstm_out[:, :, 1].reshape(-1).astype(np.float64),
    })

    active = {k: m for k, m in tree_models.items() if m is not None}
    if active:
        # Each group is already sorted by frame_id and laid out in
        # TREE_FEATURE_COLS order, so its last row is the last-frame feature
        # vector; repeat it for every predicted frame of that player.
        last_feats = np.stack([g[-1] for g in groups]).astype(np.float64)
        feats = np.repeat(last_feats, n_frames, axis=0)

        weight_sum = sum(ENSEMBLE_WEIGHTS[k] for k in active)
        if weight_sum <= 0:
            weight_sum = 1.0  # degenerate weights: leave them unscaled

        total_res = np.zeros((len(df_pred), 2))
        for name, model in active.items():
            # xgboost.Booster.predict takes a DMatrix; the others take arrays.
            model_input = xgb.DMatrix(feats) if name == "xgb" else feats
            residual = _residual_to_xy(model.predict(model_input))
            total_res += (ENSEMBLE_WEIGHTS[name] / weight_sum) * residual

        df_pred["x"] = df_pred["x"] + total_res[:, 0]
        df_pred["y"] = df_pred["y"] + total_res[:, 1]

    df_pred["x"] = np.clip(df_pred["x"], 0, 120)
    df_pred["y"] = np.clip(df_pred["y"], 0, 53.3)

    return df_pred


# 7. run_kaggle_server() / run_offline() — evaluation loop and offline fallback

In [8]:
def run_kaggle_server():
    """Serve predictions through the ``nflrush`` environment when available.

    Returns
    -------
    bool
        False when ``nflrush`` cannot be imported (the caller is expected to
        fall back to offline mode), True after the test iterator has been
        fully consumed.

    NOTE(review): the competition files listed by the first cell ship
    kaggle_evaluation/nfl_inference_server.py rather than an ``nflrush``
    module — confirm which API the competition actually exposes.
    """
    try:
        import nflrush
    except Exception:
        # Any import-time failure means "no competition mode"; unlike a bare
        # `except:`, KeyboardInterrupt and SystemExit still propagate.
        print("⚠ nflrush not available — skipping competition mode")
        return False

    env = nflrush.make_env()
    iter_test = env.iter_test()

    models = {
        "lstm": load_lstm_model(),
        "xgb": load_xgb(),
        "lgb": load_lgb(),
        "cat": load_cat()
    }

    for test_df, sample_prediction_df in iter_test:
        pred = predict_play(test_df, models)
        submit_df = sample_prediction_df.copy()

        # Start from zeros so rows without a prediction are still filled.
        for col in ["x_position", "y_position"]:
            submit_df[col] = 0.

        # Copy each prediction into the rows with the same player and frame.
        for r in pred.itertuples(index=False):
            submit_df.loc[
                (submit_df["nfl_id"] == r.nfl_id) &
                (submit_df["frame_id"] == r.frame_id),
                ["x_position", "y_position"]
            ] = [r.x, r.y]

        env.predict(submit_df)

    return True


# ======================
# OFFLINE MODE
# ======================
def run_offline(test_path=None, out_path="/kaggle/working/submission.csv"):
    """Predict every play in a local test CSV and write the result to disk.

    Parameters
    ----------
    test_path : str, optional
        CSV of tracking rows; defaults to ``{BASE_PATH}/test_input.csv``.
    out_path : str, default "/kaggle/working/submission.csv"
        Destination of the prediction CSV.

    The CSV is split by (game_id, play_id) and predict_play() is called once
    per play, so sequences from different plays are never mixed together.
    """
    if test_path is None:
        test_path = f"{BASE_PATH}/test_input.csv"

    models = {
        "lstm": load_lstm_model(),
        "xgb": load_xgb(),
        "lgb": load_lgb(),
        "cat": load_cat()
    }

    test_df = pd.read_csv(test_path)

    # sort=False keeps the plays in the order they appear in the file.
    per_play = [
        predict_play(play_df, models)
        for _, play_df in test_df.groupby(["game_id", "play_id"], sort=False)
    ]
    per_play = [p for p in per_play if not p.empty]
    pred = pd.concat(per_play, ignore_index=True) if per_play else pd.DataFrame()

    pred.to_csv(out_path, index=False)
    print("Offline prediction saved →", out_path)


# ======================
# ENTRY POINT
# ======================
if __name__ == "__main__":
    # Prefer the competition server; when it reports that it is unavailable,
    # fall back to the offline CSV run.
    served_by_competition = run_kaggle_server()
    if not served_by_competition:
        run_offline()


⚠ nflrush not available — skipping competition mode
Offline prediction saved → /kaggle/working/submission.csv
