In [1]:
# 1. IMPORTS & SETUP
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error, mean_absolute_error
import os
import gc

# Config
DATA_DIR = "/kaggle/input/nfl-big-data-bowl-2026-prediction/"
TRAIN_PATH = DATA_DIR + 'train/'

# We will load Weeks 1-9
# but small enough to run quickly in the notebook.
WEEKS = list(range(1, 10)) 

print("Setup Complete. Ready to load data.")

Setup Complete. Ready to load data.


In [2]:
#Check Column Names
import pandas as pd

# Load one week to peek at the structure
w = 1
path_in = f"{TRAIN_PATH}input_2023_w{w:02d}.csv"
path_out = f"{TRAIN_PATH}output_2023_w{w:02d}.csv"

try:
    print(f"--- Checking Week {w} ---")
    
    # Read the first 5 rows
    check_in = pd.read_csv(path_in, nrows=5)
    check_out = pd.read_csv(path_out, nrows=5)
    
    print("\nINPUT Columns (Tracking Data):")
    print(list(check_in.columns))
    
    print("\nOUTPUT Columns (Target Data):")
    print(list(check_out.columns))
    
except FileNotFoundError:
    print(f"Could not find files for Week {w}. Please check the path: {TRAIN_PATH}")

--- Checking Week 1 ---

INPUT Columns (Tracking Data):
['game_id', 'play_id', 'player_to_predict', 'nfl_id', 'frame_id', 'play_direction', 'absolute_yardline_number', 'player_name', 'player_height', 'player_weight', 'player_birth_date', 'player_position', 'player_side', 'player_role', 'x', 'y', 's', 'a', 'dir', 'o', 'num_frames_output', 'ball_land_x', 'ball_land_y']

OUTPUT Columns (Target Data):
['game_id', 'play_id', 'nfl_id', 'frame_id', 'x', 'y']


In [3]:
# 2. DATA LOADING
def load_data_trajectory(weeks):
    df_list = []
    for w in weeks:
        try:
            # Load Past (Input) and Future (Output)
            df_in = pd.read_csv(f"{TRAIN_PATH}input_2023_w{w:02d}.csv")
            df_out = pd.read_csv(f"{TRAIN_PATH}output_2023_w{w:02d}.csv")
            
            # 1. Create Physics Snapshot from Input
            last_known = df_in.groupby(['game_id', 'play_id', 'nfl_id'], as_index=False).last()
            
            # Rename columns
            last_known = last_known.rename(columns={
                'x': 'start_x', 'y': 'start_y', 's': 'start_s', 
                'a': 'start_a', 'dir': 'start_dir', 'o': 'start_o'
            })
            
            # 2. Merge Context onto the Future Frames
            merged = df_out.merge(
                last_known[[
                    'game_id', 'play_id', 'nfl_id', 
                    'start_x', 'start_y', 'start_s', 'start_a', 'start_dir', 'start_o', 
                    'play_direction', 
                    'player_weight', 'player_height', 
                    'player_birth_date', 'player_role'
                ]], 
                on=['game_id', 'play_id', 'nfl_id'], 
                how='left'
            )
            
            df_list.append(merged)
            print(f"Loaded Week {w}")
        except FileNotFoundError:
            print(f"Week {w} not found.")
            
    return pd.concat(df_list, ignore_index=True)

print("Reloading Trajectory Data...")
train_df = load_data_trajectory(WEEKS)
print(f"Data Shape: {train_df.shape}")

Reloading Trajectory Data...
Loaded Week 1
Loaded Week 2
Loaded Week 3
Loaded Week 4
Loaded Week 5
Loaded Week 6
Loaded Week 7
Loaded Week 8
Loaded Week 9
Data Shape: (279727, 17)


In [4]:
# 3. FEATURE ENGINEERING
def create_slide_features(df):
    print("Engineering Features...")
    
    # --- 0. PRE-PROCESSING ---
    # Convert "6-2" to 74 inches
    def parse_height(h):
        try:
            ft, inch = h.split('-')
            return int(ft)*12 + int(inch)
        except:
            return 72 
            
    df['height_inches'] = df['player_height'].apply(parse_height)
    
    # --- 1. GEOMETRY ---
    # Standardize to Left->Right
    is_left = df['play_direction'] == 'left'
    
    df['std_x'] = np.where(is_left, 120 - df['x'], df['x'])
    df['std_y'] = np.where(is_left, 53.3 - df['y'], df['y'])
    df['std_start_x'] = np.where(is_left, 120 - df['start_x'], df['start_x'])
    df['std_start_y'] = np.where(is_left, 53.3 - df['start_y'], df['start_y'])
    df['std_dir'] = np.where(is_left, (df['start_dir'] + 180) % 360, df['start_dir'])
    
    # Normalize (0-1)
    df['norm_start_x'] = df['std_start_x'] / 120.0
    df['norm_start_y'] = df['std_start_y'] / 53.3
    
    # --- 2. PHYSICS ---
    df['time_step'] = df['frame_id']
    time_sec = df['time_step'] * 0.1
    dir_rad = np.deg2rad(df['std_dir'])
    
    # Components
    v_x = df['start_s'] * np.cos(dir_rad)
    v_y = df['start_s'] * np.sin(dir_rad)
    a_x = df['start_a'] * np.cos(dir_rad)
    a_y = df['start_a'] * np.sin(dir_rad)
    
    # Kinematics (d = vt + 0.5at^2)
    t_sq = time_sec ** 2
    df['physics_dx'] = (v_x * time_sec) + (0.5 * a_x * t_sq)
    df['physics_dy'] = (v_y * time_sec) + (0.5 * a_y * t_sq)
    
    # --- 3. BIOMECHANICS ---
    df['momentum'] = df['player_weight'] * df['start_s']
    df['kinetic_energy'] = 0.5 * df['player_weight'] * (df['start_s']**2)
    
    # BMI
    df['bmi'] = 703 * df['player_weight'] / (df['height_inches']**2)
    
    # Age
    game_year = df['game_id'].astype(str).str[:4].astype(int)
    birth_year = pd.to_datetime(df['player_birth_date']).dt.year
    df['age'] = game_year - birth_year
    
    # --- 4. TARGETS ---
    df['target_dx'] = df['std_x'] - df['std_start_x']
    df['target_dy'] = df['std_y'] - df['std_start_y']
    
    return df

print("Processing Features...")
train_processed = create_slide_features(train_df)

# One-Hot Encoding
print("Generating One-Hot Encodings...")
train_processed = pd.get_dummies(train_processed, columns=['player_role'], prefix='role', dtype=int)
role_features = [c for c in train_processed.columns if c.startswith('role_')]

# Final Feature List
FEATURES = [
    'norm_start_x', 'norm_start_y', 'start_s', 'start_a', 'std_dir', 
    'time_step', 'player_weight', 'height_inches', 'age',
    'physics_dx', 'physics_dy',       
    'momentum', 'kinetic_energy', 'bmi' 
] + role_features

TARGET_X = 'target_dx'
TARGET_Y = 'target_dy'

print("Feature Engineering Complete.")

Processing Features...
Engineering Features...
Generating One-Hot Encodings...
Feature Engineering Complete.


In [5]:
# 4. ENSEMBLE TRAINING (PHYSICS + STANDARDIZED)
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

print("Starting Standardized Ensemble Training...")

# Hyperparameters (Low & Slow)
xgb_params = {'objective': 'reg:squarederror', 'n_estimators': 1500, 'learning_rate': 0.02, 'max_depth': 7, 'tree_method': 'hist', 'n_jobs': -1, 'random_state': 42}
lgb_params = {'objective': 'regression', 'n_estimators': 1500, 'learning_rate': 0.02, 'num_leaves': 40, 'n_jobs': -1, 'random_state': 42, 'verbose': -1}
cat_params = {'iterations': 1500, 'learning_rate': 0.02, 'depth': 7, 'loss_function': 'RMSE', 'verbose': 0, 'allow_writing_files': False, 'random_seed': 42}

groups = train_processed['game_id'].astype(str) + '_' + train_processed['play_id'].astype(str)
gkf = GroupKFold(n_splits=5)
mse_scores = [] 

X = train_processed[FEATURES]
y_x = train_processed[TARGET_X]
y_y = train_processed[TARGET_Y]

# We need the original 'play_direction' to un-flip the predictions for scoring
play_directions = train_processed['play_direction']
start_x_vals = train_processed['start_x'] # Original Start X
start_y_vals = train_processed['start_y'] # Original Start Y
true_x_vals = train_processed['x']        # Original True X
true_y_vals = train_processed['y']        # Original True Y

for fold, (train_idx, val_idx) in enumerate(gkf.split(X, y_x, groups=groups), 1):
    
    # Slice Data
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    yx_train, yx_val = y_x.iloc[train_idx], y_x.iloc[val_idx]
    yy_train, yy_val = y_y.iloc[train_idx], y_y.iloc[val_idx]
    
    # Train Models
    lgb_x = lgb.LGBMRegressor(**lgb_params).fit(X_train, yx_train)
    lgb_y = lgb.LGBMRegressor(**lgb_params).fit(X_train, yy_train)
    xgb_x = xgb.XGBRegressor(**xgb_params).fit(X_train, yx_train)
    xgb_y = xgb.XGBRegressor(**xgb_params).fit(X_train, yy_train)
    cb_x = CatBoostRegressor(**cat_params).fit(X_train, yx_train)
    cb_y = CatBoostRegressor(**cat_params).fit(X_train, yy_train)
    
    # Ensemble Prediction (Standardized Deltas)
    pred_std_dx = (0.34 * lgb_x.predict(X_val)) + (0.33 * xgb_x.predict(X_val)) + (0.33 * cb_x.predict(X_val))
    pred_std_dy = (0.34 * lgb_y.predict(X_val)) + (0.33 * xgb_y.predict(X_val)) + (0.33 * cb_y.predict(X_val))
    
    # --- RECONSTRUCTION ---
    # Un-flip the delta to apply it to the REAL start position
    val_dirs = play_directions.iloc[val_idx]
    
    real_dx = np.where(val_dirs == 'left', -pred_std_dx, pred_std_dx)
    real_dy = np.where(val_dirs == 'left', -pred_std_dy, pred_std_dy)
    
    # Apply to Real Start Position
    final_pred_x = start_x_vals.iloc[val_idx] + real_dx
    final_pred_y = start_y_vals.iloc[val_idx] + real_dy
    
    # Score against Real Truth
    mse_x = mean_squared_error(true_x_vals.iloc[val_idx], final_pred_x)
    mse_y = mean_squared_error(true_y_vals.iloc[val_idx], final_pred_y)
    total_mse = (mse_x + mse_y) / 2
    
    mse_scores.append(total_mse)
    print(f"Fold {fold} MSE: {total_mse:.4f}")

# --- METRIC CALCULATION ---

# 1. MSE (Raw Error)
mse_final = np.mean(mse_scores)

# 2. RMSE (Standard Metric)
rmse_final = np.sqrt(mse_final)

# 3. MAE Using last fold data
mae_x = mean_absolute_error(true_x_vals.iloc[val_idx], final_pred_x)
mae_y = mean_absolute_error(true_y_vals.iloc[val_idx], final_pred_y)
mae_final = (mae_x + mae_y) / 2

# 4. R-Squared (Scientific Metric)
r2_x = r2_score(true_x_vals.iloc[val_idx], final_pred_x)
r2_y = r2_score(true_y_vals.iloc[val_idx], final_pred_y)
r2_final = (r2_x + r2_y) / 2

print("\n" + "="*40)
print("MODEL 2:  catboost+xgb+lgbm ensemble")
print(f"MSE:      {mse_final:.4f}")
print(f"RMSE: {rmse_final:.4f}")
print(f"MAE:     {mae_final:.4f} yards")
print(f"R2:          {r2_final:.4f}")
print("="*40)

Starting Standardized Ensemble Training...
Fold 1 MSE: 2.2613
Fold 2 MSE: 1.8526
Fold 3 MSE: 1.8220
Fold 4 MSE: 1.7767
Fold 5 MSE: 1.9457

MODEL 2:  catboost+xgb+lgbm ensemble
MSE:      1.9317
RMSE: 1.3898
MAE:     0.7116 yards
R2:          0.9925
