In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nfl-big-data-bowl-2026-prediction/test_input.csv
/kaggle/input/nfl-big-data-bowl-2026-prediction/test.csv
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/nfl_inference_server.py
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/nfl_gateway.py
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/__init__.py
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/core/templates.py
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/core/base_gateway.py
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/core/relay.py
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/core/kaggle_evaluation.proto
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/core/__init__.py
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/core/generated/kaggle_evaluation_pb2.py
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/core/generated/kaggle_evaluati

# 1. IMPORTS

In [2]:
import os
import math
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

# Try to import tree libraries; if missing, we'll skip those models
try:
    import xgboost as xgb
except Exception:
    xgb = None

try:
    import lightgbm as lgb
except Exception:
    lgb = None

try:
    import catboost as cb
except Exception:
    cb = None


In [3]:
# CONFIG / PATHS / WEIGHTS
DEVICE = torch.device("cpu")  # Kaggle evaluation is CPU
# Model file names - change these to your actual saved files before zipping
LSTM_MODEL_PATH = "/kaggle/input/nfl-big-data-bowl-2026-prediction/best_lstm.pth"
XGB_MODEL_PATH = "/kaggle/input/nfl-big-data-bowl-2026-prediction/best_xgb.json"
LGB_MODEL_PATH = "/kaggle/input/nfl-big-data-bowl-2026-prediction/best_lgb.txt"
CAT_MODEL_PATH = "/kaggle/input/nfl-big-data-bowl-2026-prediction/best_cat.cbm"

# Ensemble blending weights for residuals (must sum <= 1)
ENSEMBLE_WEIGHTS = {
    "xgb": 0.3,
    "lgb": 0.3,
    "cat": 0.4
}

# Fallback if none of tree models available -> zero contribution
# LSTM has implicit weight = 1.0 (we add residuals on top)

# 2. FEATURE ENGINEERING FUNCTIONS

In [4]:
def to_inches(h):
    try:
        ft, inch = str(h).split("-")
        return int(ft) * 12 + int(inch)
    except Exception:
        return 72

def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """Competition-safe feature engineering used by LSTM and tree models.
       Must match training used for all models."""
    df = df.copy()
    for col in ['x','y','s','a','o','dir']:
        if col not in df.columns:
            df[col] = 0.0

    # Physical attributes
    if 'player_height' in df.columns:
        df['height_inches'] = df['player_height'].apply(to_inches).fillna(72)
    else:
        df['height_inches'] = 72
    if 'player_weight' in df.columns:
        df['weight_lbs'] = pd.to_numeric(df['player_weight'], errors='coerce').fillna(200)
    else:
        df['weight_lbs'] = 200
    df['bmi'] = (df['weight_lbs'] / (df['height_inches']**2 + 1e-6)) * 703.0

    # Directional components
    dir_rad = np.radians(pd.to_numeric(df['dir'], errors='coerce').fillna(0.0))
    df['heading_x'] = np.sin(dir_rad)
    df['heading_y'] = np.cos(dir_rad)
    orient_rad = np.radians(pd.to_numeric(df['o'], errors='coerce').fillna(0.0))
    df['orient_x'] = np.sin(orient_rad)
    df['orient_y'] = np.cos(orient_rad)

    # diff dir/orient
    dcol = pd.to_numeric(df['dir'], errors='coerce').fillna(0.0)
    ocol = pd.to_numeric(df['o'], errors='coerce').fillna(0.0)
    diff = np.abs(dcol - ocol)
    df['dir_orient_diff'] = np.minimum(diff, 360 - diff)

    # Velocity/acceleration components
    s = pd.to_numeric(df['s'], errors='coerce').fillna(0.0)
    a = pd.to_numeric(df['a'], errors='coerce').fillna(0.0)
    df['velocity_x'] = s * df['heading_x']
    df['velocity_y'] = s * df['heading_y']
    df['acceleration_x'] = a * df['heading_x']
    df['acceleration_y'] = a * df['heading_y']

    # Higher-level physics
    df['speed_squared'] = s**2
    df['accel_magnitude'] = np.sqrt(df['acceleration_x']**2 + df['acceleration_y']**2)
    df['momentum_x'] = df['weight_lbs'] * df['velocity_x']
    df['momentum_y'] = df['weight_lbs'] * df['velocity_y']
    df['momentum_magnitude'] = np.sqrt(df['momentum_x']**2 + df['momentum_y']**2)
    df['kinetic_energy'] = 0.5 * df['weight_lbs'] * df['speed_squared']

    # safe fill
    df = df.replace([np.inf, -np.inf], 0.0)
    df = df.fillna(0.0)
    return df

# Note: ensure these features exactly match what the tree models expect.
# Example feature list used by tree models (per-row)
TREE_FEATURE_COLS = [
    "x","y","s","a","o","dir",
    "heading_x","heading_y",
    "velocity_x","velocity_y",
    "acceleration_x","acceleration_y",
    "dir_orient_diff",
    "height_inches","weight_lbs","bmi",
    "speed_squared","accel_magnitude",
    "momentum_x","momentum_y","momentum_magnitude","kinetic_energy"
]

# 3. MODEL ARCHITECTURE (LSTM)

In [5]:
class EncoderDecoderLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim=128, num_layers=2, dropout=0.1):
        super().__init__()
        self.encoder = nn.LSTM(input_dim, hidden_dim, num_layers=num_layers, batch_first=True, dropout=dropout)
        self.decoder = nn.LSTM(2, hidden_dim, num_layers=num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, 2)

    def forward(self, enc_X, enc_lens, T_out=10):
        packed = nn.utils.rnn.pack_padded_sequence(enc_X, enc_lens.cpu().numpy(), batch_first=True, enforce_sorted=False)
        _, (h_n, c_n) = self.encoder(packed)
        h, c = h_n, c_n
        B = enc_X.size(0)
        last_xy = []
        for i, L in enumerate(enc_lens):
            Li = int(L.item())
            last = enc_X[i, max(0, Li-1), :2]  # assume x,y are first two features
            last_xy.append(last)
        dec_in = torch.stack(last_xy, dim=0).unsqueeze(1)  # (B,1,2)
        preds = []
        for t in range(T_out):
            out, (h, c) = self.decoder(dec_in, (h, c))
            out = out.squeeze(1)
            xy = self.fc(out)
            preds.append(xy.unsqueeze(1))
            dec_in = xy.unsqueeze(1).detach()
        preds = torch.cat(preds, dim=1)
        return preds

# 4. UTILITY FUNCTIONS (padding, grouping, loading models)

In [6]:
def pad_groups(groups):
    B = len(groups)
    F = groups[0].shape[1]
    T_max = max(g.shape[0] for g in groups)
    Xp = np.zeros((B, T_max, F), dtype=np.float32)
    lens = np.zeros((B,), dtype=np.int64)
    for i,g in enumerate(groups):
        T = g.shape[0]
        Xp[i, :T, :] = g
        lens[i] = T
    Xt = torch.tensor(Xp, dtype=torch.float32, device=DEVICE)
    lens_t = torch.tensor(lens, dtype=torch.int64, device=DEVICE)
    return Xt, lens_t

def make_groups_meta(play_df):
    df = engineer_features(play_df.copy())
    groups = []
    metas = []
    for nfl_id, g in df.groupby("nfl_id"):
        g = g.sort_values("frame_id")
        feat = g[[c for c in TREE_FEATURE_COLS if c in g.columns]].values.astype(np.float32)
        groups.append(feat)
        metas.append((int(g['game_id'].iloc[0]), int(g['play_id'].iloc[0]), int(nfl_id), g['frame_id'].values.astype(int)))
    return groups, metas

def load_lstm_model(path=LSTM_MODEL_PATH):
    model = EncoderDecoderLSTM(input_dim=len(TREE_FEATURE_COLS), hidden_dim=128, num_layers=2, dropout=0.1)
    if os.path.exists(path):
        state = torch.load(path, map_location=DEVICE)
        # support if you saved dict with 'state_dict'
        if isinstance(state, dict) and 'state_dict' in state:
            state = state['state_dict']
        model.load_state_dict(state)
    else:
        print("Warning: LSTM model file not found at", path)
    model.to(DEVICE)
    model.eval()
    return model

def load_xgb_model(path=XGB_MODEL_PATH):
    if xgb is None:
        return None
    if not os.path.exists(path):
        return None
    try:
        booster = xgb.Booster()
        booster.load_model(path)
        return booster
    except Exception:
        # try to load as sklearn wrapper
        try:
            bst = xgb.BRegressor()
            bst.load_model(path)
            return bst
        except Exception:
            return None

def load_lgb_model(path=LGB_MODEL_PATH):
    if lgb is None:
        return None
    if not os.path.exists(path):
        return None
    try:
        booster = lgb.Booster(model_file=path)
        return booster
    except Exception:
        try:
            booster = lgb.LGBMRegressor()
            booster.booster_ = lgb.Booster(model_file=path)
            return booster
        except Exception:
            return None

def load_cat_model(path=CAT_MODEL_PATH):
    if cb is None:
        return None
    if not os.path.exists(path):
        return None
    try:
        model = cb.CatBoost()
        model.load_model(path)
        return model
    except Exception:
        try:
            model = cb.CatBoostRegressor()
            model.load_model(path)
            return model
        except Exception:
            return None

# 5. predict_play() — Multi-frame inference (LSTM generating absolute preds)

In [7]:
def predict_play(play_input_df, lstm_model, tree_models=None, T_out_default=10):
    """
    Predict positions for a single play using LSTM + optional tree model residuals.
    
    Args:
        play_input_df: DataFrame for a single play with input frames
        lstm_model: loaded LSTM model
        tree_models: dict with keys {'xgb','lgb','cat'} pointing to model objects or None
        T_out_default: default number of output frames if not specified
    
    Returns:
        DataFrame with columns game_id, play_id, nfl_id, frame_id, x, y
    """
    groups, metas = make_groups_meta(play_input_df)
    if len(groups) == 0:
        return pd.DataFrame(columns=["game_id","play_id","nfl_id","frame_id","x","y"])

    Xt, lens = pad_groups(groups)

    # Determine T_out
    if 'num_frames_output' in play_input_df.columns:
        try:
            T_out = int(play_input_df['num_frames_output'].dropna().iloc[0])
            if T_out <= 0:
                T_out = T_out_default
        except:
            T_out = T_out_default
    else:
        T_out = T_out_default

    # LSTM prediction -> absolute positions
    with torch.no_grad():
        preds = lstm_model(Xt, lens, T_out=T_out)  # (B, T_out, 2)
        preds_np = preds.cpu().numpy()

    # Build DataFrame of LSTM results
    rows = []
    for i, meta in enumerate(metas):
        game_id, play_id, nfl_id, in_frame_ids = meta
        for t in range(preds_np.shape[1]):
            rows.append({
                "game_id": int(game_id),
                "play_id": int(play_id),
                "nfl_id": int(nfl_id),
                "frame_id": int(t+1),
                "x": float(preds_np[i,t,0]),
                "y": float(preds_np[i,t,1])
            })
    lstm_out_df = pd.DataFrame(rows)

    # If no tree models, return LSTM output
    if tree_models is None or all(m is None for m in tree_models.values()):
        return lstm_out_df

    # Build last-input feature mapping
    df_input = engineer_features(play_input_df.copy())
    last_input_map = {int(nfl_id): g.sort_values("frame_id").iloc[-1]
                      for nfl_id, g in df_input.groupby("nfl_id")}

    # Prepare feature matrix for tree residuals
    feat_rows = []
    for _, row in lstm_out_df.iterrows():
        base = last_input_map.get(int(row['nfl_id']))
        if base is None:
            feat_rows.append(np.zeros(len(TREE_FEATURE_COLS), dtype=np.float32))
        else:
            feat = np.array([base[c] if c in base.index else 0.0 for c in TREE_FEATURE_COLS], dtype=np.float32)
            feat_rows.append(feat)
    feat_matrix = np.vstack(feat_rows)

    # Predict residuals from tree models
    total_residual = np.zeros((len(lstm_out_df), 2), dtype=np.float32)

    for model_name in ['xgb','lgb','cat']:
        m = tree_models.get(model_name)
        if m is None:
            continue
        try:
            # XGBoost
            if model_name == 'xgb':
                if isinstance(m, xgb.Booster):
                    dmat = xgb.DMatrix(feat_matrix, feature_names=TREE_FEATURE_COLS)
                    pred_flat = m.predict(dmat)
                else:
                    pred_flat = m.predict(feat_matrix)
            # LightGBM
            elif model_name == 'lgb':
                pred_flat = m.predict(feat_matrix)
            # CatBoost
            elif model_name == 'cat':
                pred_flat = m.predict(feat_matrix)

            pred_flat = np.asarray(pred_flat)
            if pred_flat.ndim == 1:
                dx = pred_flat
                dy = np.zeros_like(dx)
                preds_model = np.vstack([dx, dy]).T
            else:
                preds_model = pred_flat.reshape(-1, 2)

            total_residual += ENSEMBLE_WEIGHTS.get(model_name, 0.0) * preds_model

        except Exception as e:
            print(f"Tree model {model_name} prediction failed: {e}")

    # Apply residuals and clip to field boundaries
    final_rows = []
    for i, row in lstm_out_df.iterrows():
        x_final = np.clip(row['x'] + total_residual[i,0], 0.0, 120.0)
        y_final = np.clip(row['y'] + total_residual[i,1], 0.0, 53.3)
        final_rows.append({
            "game_id": int(row['game_id']),
            "play_id": int(row['play_id']),
            "nfl_id": int(row['nfl_id']),
            "frame_id": int(row['frame_id']),
            "x": float(x_final),
            "y": float(y_final)
        })

    return pd.DataFrame(final_rows)

# 6. predict_one_play() 

In [8]:
def predict_one_play(test_play_df, models):
    """
    Predict positions for a single play using LSTM + optional tree model residuals.

    Args:
        test_play_df: DataFrame containing all frames of a single play
        models: dictionary containing loaded models with keys:
                'lstm', 'xgb', 'lgb', 'cat'

    Returns:
        DataFrame with columns: game_id, play_id, nfl_id, frame_id, x, y
    """
    lstm_model = models.get('lstm')
    if lstm_model is None:
        raise ValueError("LSTM model must be provided in models dictionary.")

    tree_models = {
        'xgb': models.get('xgb'),
        'lgb': models.get('lgb'),
        'cat': models.get('cat')
    }

    # Use the improved predict_play function
    pred_df = predict_play(test_play_df, lstm_model, tree_models=tree_models)

    # Return only required columns
    return pred_df[["game_id", "play_id", "nfl_id", "frame_id", "x", "y"]]


# 7. kaggle_main() — evaluation loop

In [9]:
def kaggle_main():
    """
    Runs only inside the Kaggle Big Data Bowl evaluation environment.
    When running locally, nflrush will not exist — this function is skipped.
    """
    # ------------------------------------------------
    # Detect Kaggle Environment
    # ------------------------------------------------
    try:
        import nflrush
    except ImportError:
        print("⚠ nflrush not available — running locally, kaggle_main() skipped.")
        return

    print("Kaggle evaluation environment detected. Starting pipeline...")

    # ------------------------------------------------
    # Load All Models
    # ------------------------------------------------
    print("Loading models...")
    models = {}

    # LSTM Sequence Model
    models["lstm"] = load_lstm_model(LSTM_MODEL_PATH)

    # Residual Tree Models (load only if libraries exist)
    models["xgb"] = load_xgb_model(XGB_MODEL_PATH) if xgb is not None else None
    models["lgb"] = load_lgb_model(LGB_MODEL_PATH) if lgb is not None else None
    models["cat"] = load_cat_model(CAT_MODEL_PATH) if cb is not None else None

    # Soft warnings if libs missing
    if xgb is None:
        print("⚠ xgboost not installed — XGB residual model disabled.")
    if lgb is None:
        print("⚠ lightgbm not installed — LGB residual model disabled.")
    if cb is None:
        print("⚠ catboost not installed — CAT residual model disabled.")

    # ------------------------------------------------
    # Kaggle NFL Environment
    # ------------------------------------------------
    env = nflrush.make_env()
    iter_test = env.iter_test()

    # ------------------------------------------------
    # Prediction Loop
    # ------------------------------------------------
    for (test_df, sample_pred_df) in iter_test:

        # 1. Predict movement outputs for this play
        preds_full = predict_one_play(test_df, models)

        # 2. Merge predictions with required Kaggle schema
        merged = sample_pred_df.merge(
            preds_full,
            on=["game_id", "play_id", "nfl_id", "frame_id"],
            how="left"
        )

        # 3. Fill missing predictions (failsafe)
        merged["x"] = merged["x"].fillna(0.0)
        merged["y"] = merged["y"].fillna(0.0)

        # 4. Maintain the Kaggle-required submission format
        out_df = merged[sample_pred_df.columns.tolist() + ["x", "y"]]

        # 5. Push to Kaggle evaluator
        env.predict(out_df)

    print("Kaggle evaluation completed.")

### Sample local environment simulation

In [10]:
import pandas as pd

class LocalNFLRushEnv:
    """
    Minimal local simulator for Kaggle's nflrush environment.
    Uses test_input.csv and sample submission structure.
    """
    def __init__(self, test_input_path="/kaggle/input/nfl-big-data-bowl-2026-prediction/test_input.csv", default_T_out=10):
        # Load test input
        self.test_df = pd.read_csv(test_input_path)
        
        # Determine output frames (T_out)
        T_out = default_T_out
        if 'num_frames_output' in self.test_df.columns:
            try:
                T_out_val = int(self.test_df['num_frames_output'].dropna().iloc[0])
                if T_out_val > 0:
                    T_out = T_out_val
            except:
                pass

        # Build fake sample submission frame
        cols = ["game_id", "play_id", "nfl_id"]
        base = self.test_df[cols].drop_duplicates()

        # Frame IDs for prediction
        frame_ids = list(range(1, T_out + 1))

        # Cross join base rows with frame_ids
        self.sample_pred_df = (
            base.assign(key=1)
                .merge(pd.DataFrame({"frame_id": frame_ids, "key": [1]*len(frame_ids)}), on="key")
                .drop("key", axis=1)
        )

        # Store predictions
        self.predictions = []

    def iter_test(self):
        """
        Yields (test_df, sample_pred_df) exactly like Kaggle environment,
        but only once for local testing.
        """
        yield self.test_df, self.sample_pred_df

    def predict(self, df):
        """
        Collect predictions, prints a preview.
        """
        self.predictions.append(df)
        print("\n=== LOCAL PREDICTIONS RECEIVED ===")
        print(df.head())
        print("==================================\n")


def make_local_env(test_input_path="/kaggle/input/nfl-big-data-bowl-2026-prediction/test_input.csv"):
    """
    Factory to create local simulator.
    """
    return LocalNFLRushEnv(test_input_path)

### Predict One play

In [11]:
if __name__ == "__main__":
    # Local testing
    local_env = make_local_env()
    models = {
        "lstm": load_lstm_model(LSTM_MODEL_PATH),
        "xgb": load_xgb_model(XGB_MODEL_PATH),
        "lgb": load_lgb_model(LGB_MODEL_PATH),
        "cat": load_cat_model(CAT_MODEL_PATH)
    }

    for test_df, sample_pred_df in local_env.iter_test():
        preds = predict_one_play(test_df, models)
        local_env.predict(preds)


=== LOCAL PREDICTIONS RECEIVED ===
      game_id  play_id  nfl_id  frame_id         x         y
0  2024120805      453   38588         1  0.016531  0.061495
1  2024120805      453   38588         2 -0.009225  0.020007
2  2024120805      453   38588         3 -0.016248  0.005007
3  2024120805      453   38588         4 -0.014225 -0.001004
4  2024120805      453   38588         5 -0.008161 -0.000556



# 8. __main__

In [12]:
if __name__ == "__main__":
    kaggle_main()

⚠ nflrush not available — running locally, kaggle_main() skipped.
