In [1]:
# 1. IMPORTS & SETUP
# We are using standard libraries. No complex Deep Learning frameworks.
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error
import os
import gc

# Config
DATA_DIR = "/kaggle/input/nfl-big-data-bowl-2026-prediction/"
TRAIN_PATH = DATA_DIR + 'train/'

# We will load Weeks 1-9
# but small enough to run quickly in the notebook.
WEEKS = list(range(1, 10)) 

print("Setup Complete. Ready to load data.")

Setup Complete. Ready to load data.


In [2]:
# DIAGNOSTIC STEP: Check Column Names
import pandas as pd

# Load just ONE week to peek at the structure
w = 1
path_in = f"{TRAIN_PATH}input_2023_w{w:02d}.csv"
path_out = f"{TRAIN_PATH}output_2023_w{w:02d}.csv"

try:
    print(f"--- Checking Week {w} ---")
    
    # Read the first 5 rows only (to save time)
    check_in = pd.read_csv(path_in, nrows=5)
    check_out = pd.read_csv(path_out, nrows=5)
    
    print("\nINPUT Columns (Tracking Data):")
    print(list(check_in.columns))
    
    print("\nOUTPUT Columns (Target Data):")
    print(list(check_out.columns))
    
except FileNotFoundError:
    print(f"Could not find files for Week {w}. Please check the path: {TRAIN_PATH}")

--- Checking Week 1 ---

INPUT Columns (Tracking Data):
['game_id', 'play_id', 'player_to_predict', 'nfl_id', 'frame_id', 'play_direction', 'absolute_yardline_number', 'player_name', 'player_height', 'player_weight', 'player_birth_date', 'player_position', 'player_side', 'player_role', 'x', 'y', 's', 'a', 'dir', 'o', 'num_frames_output', 'ball_land_x', 'ball_land_y']

OUTPUT Columns (Target Data):
['game_id', 'play_id', 'nfl_id', 'frame_id', 'x', 'y']


In [3]:
# 2. DATA LOADING (TRAJECTORY MODE)
def load_data_trajectory(weeks):
    df_list = []
    for w in weeks:
        try:
            # Load Past (Input) and Future (Output)
            df_in = pd.read_csv(f"{TRAIN_PATH}input_2023_w{w:02d}.csv")
            df_out = pd.read_csv(f"{TRAIN_PATH}output_2023_w{w:02d}.csv")
            
            # 1. Create Physics Snapshot from Input (The Context)
            # Group by player and take the last known state
            last_known = df_in.groupby(['game_id', 'play_id', 'nfl_id'], as_index=False).last()
            
            # Rename columns to avoid confusion (e.g., x -> start_x)
            last_known = last_known.rename(columns={
                'x': 'start_x', 'y': 'start_y', 's': 'start_s', 
                'a': 'start_a', 'dir': 'start_dir', 'o': 'start_o'
            })
            
            # 2. Merge Context onto the Future Frames
            # We want to predict 'x' (in df_out) using 'start_x' (in last_known)
            merged = df_out.merge(
                last_known[['game_id', 'play_id', 'nfl_id', 'start_x', 'start_y', 'start_s', 'start_a', 'start_dir', 'player_weight', 'player_height']], 
                on=['game_id', 'play_id', 'nfl_id'], 
                how='left'
            )
            
            df_list.append(merged)
            print(f"Loaded Week {w}")
        except FileNotFoundError:
            print(f"Week {w} not found.")
            
    return pd.concat(df_list, ignore_index=True)

print("Loading Trajectory Data...")
train_df = load_data_trajectory(WEEKS)
print(f"Data Shape: {train_df.shape}")

Loading Trajectory Data...
Loaded Week 1
Loaded Week 2
Loaded Week 3
Loaded Week 4
Loaded Week 5
Loaded Week 6
Loaded Week 7
Loaded Week 8
Loaded Week 9
Data Shape: (279727, 13)


In [4]:
# 3. FEATURE ENGINEERING (TRAJECTORY PHYSICS)
def create_physics_features(df):
    print("Engineering Physics Features...")
    
    # 1. Time Delta: How far into the future are we predicting
    # Assuming frames are 0.1s apart. We calculate time based on frame_id difference or just use frame_id if it resets.
    # A simple proxy is 'frame_id' from the output file.
    df['time_step'] = df['frame_id'] 
    
    # 2. Convert Angles
    df['dir_rad'] = np.deg2rad(df['start_dir'])
    
    # 3. Kinematic Prediction
    # Where would the player be if they just kept running straight?
    # x = x0 + v*t
    # We use this as a "Hint" for the tree model.
    # Note: 'start_s' is speed in yards/sec. Time is roughly frame_id * 0.1
    time_sec = df['time_step'] * 0.1
    df['physics_expected_x'] = df['start_x'] + (df['start_s'] * np.cos(df['dir_rad']) * time_sec)
    df['physics_expected_y'] = df['start_y'] + (df['start_s'] * np.sin(df['dir_rad']) * time_sec)
    
    # 4. Momentum (Mass * Velocity)
    df['momentum'] = df['player_weight'] * df['start_s']
    
    return df

print("Processing...")
train_processed = create_physics_features(train_df)

# Features: Start State + Physics Estimate + Time
FEATURES = [
    'start_x', 'start_y', 'start_s', 'start_a', 'start_dir',  # Initial State
    'time_step',                                              # Time
    'physics_expected_x', 'physics_expected_y',               # Our Calculation
    'momentum', 'player_weight'                               # Body Stats
]

# TARGETS are now the FUTURE positions
TARGET_X = 'x'
TARGET_Y = 'y'

print("Features Ready.")

Processing...
Engineering Physics Features...
Features Ready.


In [5]:
# 4. ENSEMBLE TRAINING (XGBoost + LightGBM)
print("Starting Ensemble Training...")

# Hyperparameters
xgb_params = {
    'objective': 'reg:squarederror', 'n_estimators': 500, 'learning_rate': 0.05,
    'max_depth': 6, 'tree_method': 'hist', 'n_jobs': -1, 'random_state': 42
}
lgb_params = {
    'objective': 'regression', 'n_estimators': 500, 'learning_rate': 0.05,
    'num_leaves': 31, 'n_jobs': -1, 'random_state': 42, 'verbose': -1
}

# Validation Strategy
groups = train_processed['game_id'].astype(str) + '_' + train_processed['play_id'].astype(str)
gkf = GroupKFold(n_splits=5)

# Storage
scores = []
X = train_processed[FEATURES]
y_x = train_processed[TARGET_X]
y_y = train_processed[TARGET_Y]

for fold, (train_idx, val_idx) in enumerate(gkf.split(X, y_x, groups=groups), 1):
    
    # Slice Data
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    
    # Train for X Coordinate
    yx_train, yx_val = y_x.iloc[train_idx], y_x.iloc[val_idx]
    
    # 1. LightGBM (X)
    lgb_x = lgb.LGBMRegressor(**lgb_params).fit(X_train, yx_train)
    pred_x_lgb = lgb_x.predict(X_val)
    
    # 2. XGBoost (X)
    xgb_x_mod = xgb.XGBRegressor(**xgb_params).fit(X_train, yx_train)
    pred_x_xgb = xgb_x_mod.predict(X_val)
    
    # Train for Y Coordinate
    yy_train, yy_val = y_y.iloc[train_idx], y_y.iloc[val_idx]
    
    # 1. LightGBM (Y)
    lgb_y = lgb.LGBMRegressor(**lgb_params).fit(X_train, yy_train)
    pred_y_lgb = lgb_y.predict(X_val)
    
    # 2. XGBoost (Y)
    xgb_y_mod = xgb.XGBRegressor(**xgb_params).fit(X_train, yy_train)
    pred_y_xgb = xgb_y_mod.predict(X_val)
    
    # ENSEMBLE AVERAGE
    ens_pred_x = (0.5 * pred_x_lgb) + (0.5 * pred_x_xgb)
    ens_pred_y = (0.5 * pred_y_lgb) + (0.5 * pred_y_xgb)
    
    # Calculate RMSE
    rmse_x = mean_squared_error(yx_val, ens_pred_x, squared=False)
    rmse_y = mean_squared_error(yy_val, ens_pred_y, squared=False)
    combined_rmse = np.sqrt((rmse_x**2 + rmse_y**2) / 2)
    scores.append(combined_rmse)
    
    print(f"Fold {fold} Ensemble RMSE: {combined_rmse:.4f}")

print("\n" + "="*40)
print(f"Mean CV RMSE: {np.mean(scores):.4f}")
print("="*40)

Starting Ensemble Training...
Fold 1 Ensemble RMSE: 1.6346
Fold 2 Ensemble RMSE: 1.5051
Fold 3 Ensemble RMSE: 1.5351
Fold 4 Ensemble RMSE: 1.4660
Fold 5 Ensemble RMSE: 1.5328

Mean CV RMSE: 1.5347
