# NFL 2026 - Simple LightGBM Baseline

**Strategy**: Very simple LightGBM for X and Y prediction

## Table of Contents

1. [Configuration](#1-configuration)
2. [Data Loading](#2-data-loading)
3. [Feature Engineering](#3-feature-engineering)
4. [Aggregation](#4-aggregation)
5. [Cross-Validation](#6-cross-validation)
6. [Test Prediction](#7-test-prediction)
7. [Submission](#8-submission)

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error
from pathlib import Path
import gc

---
## 1. Configuration

In [2]:
# Config
DATA_PATH = Path('/kaggle/input/nfl-big-data-bowl-2026-prediction')
TRAIN_PATH = DATA_PATH / 'train'
WEEKS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
N_FOLDS = 5
RANDOM_STATE = 42

params = {
    "objective": "regression",
    "metric": "rmse",
    "boosting_type": "gbdt",
    "learning_rate": 0.05,
    "num_leaves": 64,
    "max_depth": -1,
    "feature_fraction": 0.85,
    "bagging_fraction": 0.85,
    "bagging_freq": 5,
    "n_estimators": 1000,
    "random_state": 42,
    "verbose": -1,
}

---
## 2. Data Loading

In [3]:
# Load data
def load_weeks(week_nums):
    inputs, outputs = [], []
    for w in week_nums:
        inp = pd.read_csv(TRAIN_PATH / f'input_2023_w{w:02d}.csv')
        out = pd.read_csv(TRAIN_PATH / f'output_2023_w{w:02d}.csv')
        inputs.append(inp)
        outputs.append(out)
    return pd.concat(inputs, ignore_index=True), pd.concat(outputs, ignore_index=True)

input_df, output_df = load_weeks(WEEKS)
print(f"Data loaded: {len(input_df)} input rows, {input_df['game_id'].nunique()} games")
print(f"Data loaded: {len(output_df)} output rows, {output_df['game_id'].nunique()} games")

Data loaded: 3236116 input rows, 180 games
Data loaded: 368514 output rows, 180 games


In [4]:
input_df.head(7)

Unnamed: 0,game_id,play_id,player_to_predict,nfl_id,frame_id,play_direction,absolute_yardline_number,player_name,player_height,player_weight,...,player_role,x,y,s,a,dir,o,num_frames_output,ball_land_x,ball_land_y
0,2023090700,101,False,54527,1,right,42,Bryan Cook,6-1,210,...,Defensive Coverage,52.33,36.94,0.09,0.39,322.4,238.24,21,63.259998,-0.22
1,2023090700,101,False,54527,2,right,42,Bryan Cook,6-1,210,...,Defensive Coverage,52.33,36.94,0.04,0.61,200.89,236.05,21,63.259998,-0.22
2,2023090700,101,False,54527,3,right,42,Bryan Cook,6-1,210,...,Defensive Coverage,52.33,36.93,0.12,0.73,147.55,240.6,21,63.259998,-0.22
3,2023090700,101,False,54527,4,right,42,Bryan Cook,6-1,210,...,Defensive Coverage,52.35,36.92,0.23,0.81,131.4,244.25,21,63.259998,-0.22
4,2023090700,101,False,54527,5,right,42,Bryan Cook,6-1,210,...,Defensive Coverage,52.37,36.9,0.35,0.82,123.26,244.25,21,63.259998,-0.22
5,2023090700,101,False,54527,6,right,42,Bryan Cook,6-1,210,...,Defensive Coverage,52.44,36.88,0.6,0.87,106.89,247.67,21,63.259998,-0.22
6,2023090700,101,False,54527,7,right,42,Bryan Cook,6-1,210,...,Defensive Coverage,52.51,36.86,0.76,0.64,103.87,247.67,21,63.259998,-0.22


In [5]:
input_df.player_height.isnull().values.any()

False

---
## 3. Feature Engineering

In [6]:
def parse_height(h):
    try:
        if isinstance(h, str) and '-' in h:
            ft, inch = h.split('-')
            return float(ft) + float(inch) / 12.0
        return np.nan
    except:
        return np.nan

In [7]:
def create_features(df):
    df = df.copy()

    # Player height
    df['player_height_feet'] = df['player_height'].map(parse_height)

    # Speed & acceleration components
    df['dir_rad'] = np.deg2rad(df['dir'].fillna(0.0))
    df['velocity_x'] = df['s'] * np.cos(df['dir_rad'])
    df['velocity_y'] = df['s'] * np.sin(df['dir_rad'])
    df['acceleration_x'] = df['a'] * np.cos(df['dir_rad'])
    df['acceleration_y'] = df['a'] * np.sin(df['dir_rad'])

    # Orientation
    df['o_rad'] = np.deg2rad(df['o'].fillna(0.0))
    df['orientation_x'] = np.cos(df['o_rad'])
    df['orientation_y'] = np.sin(df['o_rad'])

    # Kinetic energy
    df['kinetic_energy'] = 0.5 * (df['player_weight'] * 0.453592) * ((df['s'] * 0.9144) ** 2)

    # Roles
    df['is_offense'] = (df['player_side'] == 'Offense').astype(int)
    df['is_defense'] = (df['player_side'] == 'Defense').astype(int)
    df['is_receiver'] = df['player_role'].str.contains('Receiver|Targeted', case=False, na=False).astype(int)
    df['is_coverage'] = df['player_role'].str.contains('Coverage|Defensive', case=False, na=False).astype(int)
    df['is_passer'] = df['player_role'].str.contains('Passer', case=False, na=False).astype(int)
    df['is_rusher'] = df['player_role'].str.contains('Rusher', case=False, na=False).astype(int)

    # Field position
    df['field_x_norm'] = df['x'] / 120.0
    df['field_y_norm'] = df['y'] / 53.3
    df['dist_from_sideline'] = np.minimum(df['y'], 53.3 - df['y'])
    df['dist_from_endzone'] = df['absolute_yardline_number']
    df['distance_to_sideline'] = df['dist_from_sideline']
    df['distance_to_endzone'] = df['dist_from_endzone']

    # Ball interaction
    df['ball_direction_x'] = df['ball_land_x'] - df['x']
    df['ball_direction_y'] = df['ball_land_y'] - df['y']
    df['distance_to_ball'] = np.sqrt(df['ball_direction_x']**2 + df['ball_direction_y']**2)
    df['angle_to_ball'] = np.arctan2(df['ball_direction_y'], df['ball_direction_x'])
    df['closing_speed'] = (
        (df['velocity_x'] * df['ball_direction_x'] + df['velocity_y'] * df['ball_direction_y'])
        / df['distance_to_ball'].replace(0, np.nan)
    ).fillna(0.0)

    # Target Alignment Features
    if 'ball_direction_x' in df.columns:
        df['velocity_alignment'] = (
            df['velocity_x'] * df['ball_direction_x'] +
            df['velocity_y'] * df['ball_direction_y']
        )
        df['velocity_perpendicular'] = (
            df['velocity_x'] * (-df['ball_direction_y']) +
            df['velocity_y'] * df['ball_direction_x']
        )
        if 'acceleration_x' in df.columns:
            df['accel_alignment'] = (
                df['acceleration_x'] * df['ball_direction_x'] +
                df['acceleration_y'] * df['ball_direction_y']
            )
    
    # Sort by time for sequential features
    df = df.sort_values(['game_id', 'play_id', 'nfl_id', 'frame_id'])
    gcols = ['game_id', 'play_id', 'nfl_id']
    group = df.groupby(gcols)

     # Role-Specific Features
    if 'is_receiver' in df.columns and 'velocity_alignment' in df.columns:
        df['receiver_optimality'] = df['is_receiver'] * df['velocity_alignment']
        df['receiver_deviation'] = df['is_receiver'] * np.abs(df.get('velocity_perpendicular', 0))
    if 'is_coverage' in df.columns and 'closing_speed' in df.columns:
        df['defender_closing_speed'] = df['is_coverage'] * df['closing_speed']

    # Lags
    for lag in [1, 2, 3, 5]:
        for c in ['x', 'y', 'velocity_x', 'velocity_y', 's']:
            df[f"{c}_lag{lag}"] = group[c].shift(lag)

    # EMA (fixed + optimized)
    for alpha in [0.1, 0.3, 0.5]:
        df[f'velocity_x_ema_{alpha}'] = (
            df.groupby(gcols)['velocity_x']
            .transform(lambda s: s.ewm(alpha=alpha, adjust=False).mean())
        )
        df[f'velocity_y_ema_{alpha}'] = (
            df.groupby(gcols)['velocity_y']
            .transform(lambda s: s.ewm(alpha=alpha, adjust=False).mean())
        )

    # Rolling features
    for c in ['velocity_x', 'velocity_y', 's', 'a']:
        df[f'{c}_roll5'] = (
            group[c].rolling(5, min_periods=1)
            .mean()
            .reset_index(level=[0, 1, 2], drop=True)
        )
        df[f'{c}_std5'] = (
            group[c].rolling(5, min_periods=1)
            .std()
            .reset_index(level=[0, 1, 2], drop=True)
            .fillna(0)
        )

    return df

input_df = create_features(input_df)
print(f"Features created: {input_df.shape[1]} columns")

Features created: 90 columns


---
### 3.1 Aggregation

In [8]:
agg_features = (
    input_df
    .groupby(['game_id', 'play_id', 'nfl_id'])
    .agg({
        # --- Core tracking ---
        'x': ['first', 'last', 'mean', 'std'],
        'y': ['first', 'last', 'mean', 'std'],
        's': ['mean', 'max', 'std'],
        'a': ['mean', 'max', 'std'],
        'o': ['mean', 'std'],
        'dir': ['mean', 'std'],
        'frame_id': ['min', 'max', 'count'],

        # --- Motion features ---
        'velocity_x': ['mean', 'max', 'std', 'last'],
        'velocity_y': ['mean', 'max', 'std', 'last'],
        'acceleration_x': ['mean', 'max', 'std'],
        'acceleration_y': ['mean', 'max', 'std'],
        'kinetic_energy': ['mean', 'max'],

        # --- Ball interaction ---
        'distance_to_ball': ['first', 'last', 'mean', 'min', 'std'],
        'angle_to_ball': ['mean', 'std'],
        'ball_direction_x': ['mean', 'std', 'last'],
        'ball_direction_y': ['mean', 'std', 'last'],
        'closing_speed': ['mean', 'max', 'std'],

        # --- Temporal/derived dynamics ---
        'x_lag1': ['mean'], 'y_lag1': ['mean'],
        'x_lag3': ['mean'], 'y_lag3': ['mean'],
        'velocity_x_ema_0.3': ['last'], 'velocity_y_ema_0.3': ['last'],
        'velocity_x_roll5': ['last'], 'velocity_y_roll5': ['last'],
        's_roll5': ['last'], 'a_roll5': ['last'],

        # --- Field & positioning ---
        'field_x_norm': ['last', 'mean'],
        'field_y_norm': ['last', 'mean'],
        'dist_from_sideline': ['mean', 'min'],
        'distance_to_endzone': ['mean', 'min'],

        # --- Advanced features ---
        'velocity_alignment': ['mean'],
        'accel_alignment': ['mean'],
        'receiver_optimality': ['mean', 'max'],
        'defender_closing_speed': ['mean', 'max'],
    })
    .reset_index()
)

# Flatten multi-level column names
agg_features.columns = ['_'.join(col).strip('_') for col in agg_features.columns.values]


static = (
    input_df
    .groupby(['game_id', 'play_id', 'nfl_id'])
    .first()[[
        'ball_land_x', 'ball_land_y',
        'player_height_feet', 'player_weight',
        'is_offense', 'is_defense',
        'is_receiver', 'is_coverage', 'is_passer', 'is_rusher'
    ]]
    .reset_index()
)

# Merge 
features_df = agg_features.merge(static, on=['game_id', 'play_id', 'nfl_id'], how='left')

---
### 3.2 imputation

In [9]:
features_df['player_weight'] = features_df['player_weight'].fillna(features_df['player_weight'].median())
features_df['player_height_feet'] = features_df['player_height_feet'].fillna(features_df['player_height_feet'].median())

In [10]:
targets = (
    output_df[['game_id', 'play_id', 'nfl_id', 'frame_id', 'x', 'y']]
    .rename(columns={'x': 'target_x', 'y': 'target_y'})
)

train_df = targets.merge(features_df, on=['game_id', 'play_id', 'nfl_id'], how='left')

train_df["frame_ratio"] = train_df.groupby(['game_id', 'play_id', 'nfl_id'])['frame_id'].transform(
    lambda s: s / s.max()
)

# feature selection
exclude_cols = ['game_id', 'play_id', 'nfl_id', 'frame_id', 'target_x', 'target_y']
feature_cols = [c for c in train_df.columns if c not in exclude_cols]

X = train_df[feature_cols].fillna(0).replace([np.inf, -np.inf], 0)
y_x = train_df['target_x']
y_y = train_df['target_y']
print(f"Number of features: {len(feature_cols)}")

Number of features: 88


---
## 5. Cross-Validation

In [11]:
cv_scores = []
models_x, models_y = [], []

groups = train_df['game_id'].astype(str) + '_' + train_df['play_id'].astype(str)
gkf = GroupKFold(n_splits=N_FOLDS)
for fold, (train_idx, val_idx) in enumerate(gkf.split(X, groups=groups), 1):
    print(f"\n=== Fold {fold}/{N_FOLDS} ===")

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_x_train, y_x_val = y_x.iloc[train_idx], y_x.iloc[val_idx]
    y_y_train, y_y_val = y_y.iloc[train_idx], y_y.iloc[val_idx]

    # --- Train model for X coordinate ---
    model_x = LGBMRegressor(**params)
    model_x.fit(
        X_train, y_x_train,
        eval_set=[(X_val, y_x_val)],
        eval_metric="rmse",
        callbacks=[
            lgb.early_stopping(stopping_rounds=100, verbose=False),
            lgb.log_evaluation(200),
        ],
    )

    # --- Train model for Y coordinate ---
    model_y = LGBMRegressor(**params)
    model_y.fit(
        X_train, y_y_train,
        eval_set=[(X_val, y_y_val)],
        eval_metric="rmse",
        callbacks=[
            lgb.early_stopping(stopping_rounds=100, verbose=False),
            lgb.log_evaluation(200),
        ],
    )

    # --- Save models ---
    models_x.append(model_x)
    models_y.append(model_y)

    # --- Evaluate fold performance ---
    pred_x = model_x.predict(X_val)
    pred_y = model_y.predict(X_val)

    rmse_x = np.sqrt(mean_squared_error(y_x_val, pred_x))
    rmse_y = np.sqrt(mean_squared_error(y_y_val, pred_y))
    rmse_combined = np.sqrt((rmse_x**2 + rmse_y**2) / 2)

    cv_scores.append(rmse_combined)

    print(f"Fold {fold}: RMSE_X={rmse_x:.4f}, RMSE_Y={rmse_y:.4f}, Combined={rmse_combined:.4f}")


print(f"Mean CV RMSE: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")
print(f"Models trained: {len(models_x)} for X, {len(models_y)} for Y")


=== Fold 1/5 ===
[200]	valid_0's rmse: 1.22523
[400]	valid_0's rmse: 1.175
[600]	valid_0's rmse: 1.15966
[800]	valid_0's rmse: 1.1504
[1000]	valid_0's rmse: 1.144
[200]	valid_0's rmse: 1.13101
[400]	valid_0's rmse: 1.09217
[600]	valid_0's rmse: 1.08116
[800]	valid_0's rmse: 1.07687
[1000]	valid_0's rmse: 1.07262
Fold 1: RMSE_X=1.1440, RMSE_Y=1.0726, Combined=1.1089

=== Fold 2/5 ===
[200]	valid_0's rmse: 1.24316
[400]	valid_0's rmse: 1.18513
[600]	valid_0's rmse: 1.164
[800]	valid_0's rmse: 1.15328
[1000]	valid_0's rmse: 1.14804
[200]	valid_0's rmse: 1.06378
[400]	valid_0's rmse: 1.01305
[600]	valid_0's rmse: 0.999728
[800]	valid_0's rmse: 0.992073
[1000]	valid_0's rmse: 0.987491
Fold 2: RMSE_X=1.1480, RMSE_Y=0.9875, Combined=1.0708

=== Fold 3/5 ===
[200]	valid_0's rmse: 1.20654
[400]	valid_0's rmse: 1.15432
[600]	valid_0's rmse: 1.13746
[800]	valid_0's rmse: 1.12516
[1000]	valid_0's rmse: 1.1187
[200]	valid_0's rmse: 1.08434
[400]	valid_0's rmse: 1.03222
[600]	valid_0's rmse: 1.0146

---
## 6. Test Prediction

In [12]:
test_input = pd.read_csv(DATA_PATH / 'test_input.csv')
test_template = pd.read_csv(DATA_PATH / 'test.csv')

# Create features (same as training)
test_input = create_features(test_input)

# Aggregation (same structure as training)
test_agg = (
    test_input
    .groupby(['game_id', 'play_id', 'nfl_id'])
    .agg({
        # --- Core tracking ---
        'x': ['first', 'last', 'mean', 'std'],
        'y': ['first', 'last', 'mean', 'std'],
        's': ['mean', 'max', 'std'],
        'a': ['mean', 'max', 'std'],
        'o': ['mean', 'std'],
        'dir': ['mean', 'std'],
        'frame_id': ['min', 'max', 'count'],

        # --- Motion features ---
        'velocity_x': ['mean', 'max', 'std', 'last'],
        'velocity_y': ['mean', 'max', 'std', 'last'],
        'acceleration_x': ['mean', 'max', 'std'],
        'acceleration_y': ['mean', 'max', 'std'],
        'kinetic_energy': ['mean', 'max'],

        # --- Ball interaction ---
        'distance_to_ball': ['first', 'last', 'mean', 'min', 'std'],
        'angle_to_ball': ['mean', 'std'],
        'ball_direction_x': ['mean', 'std', 'last'],
        'ball_direction_y': ['mean', 'std', 'last'],
        'closing_speed': ['mean', 'max', 'std'],

        # --- Temporal/derived dynamics ---
        'x_lag1': ['mean'], 'y_lag1': ['mean'],
        'x_lag3': ['mean'], 'y_lag3': ['mean'],
        'velocity_x_ema_0.3': ['last'], 'velocity_y_ema_0.3': ['last'],
        'velocity_x_roll5': ['last'], 'velocity_y_roll5': ['last'],
        's_roll5': ['last'], 'a_roll5': ['last'],

        # --- Field & positioning ---
        'field_x_norm': ['last', 'mean'],
        'field_y_norm': ['last', 'mean'],
        'dist_from_sideline': ['mean', 'min'],
        'distance_to_endzone': ['mean', 'min'],

        # --- Advanced features ---
        'velocity_alignment': ['mean'],
        'accel_alignment': ['mean'],
        'receiver_optimality': ['mean', 'max'],
        'defender_closing_speed': ['mean', 'max'],
    })
    .reset_index()
)
test_agg.columns = ['_'.join(col).strip('_') for col in test_agg.columns.values]

# Static features (same as training)
test_static = (
    test_input
    .groupby(['game_id', 'play_id', 'nfl_id'])
    .first()[[
        'ball_land_x', 'ball_land_y',
        'player_height_feet', 'player_weight',
        'is_offense', 'is_defense',
        'is_receiver', 'is_coverage', 'is_passer', 'is_rusher'
    ]]
    .reset_index()
)

# Merge aggregated + static at the player level
test_features = test_agg.merge(test_static, on=['game_id', 'play_id', 'nfl_id'], how='left')

# Fill using training medians
test_features['player_weight'] = test_features['player_weight'].fillna(features_df['player_weight'].median())
test_features['player_height_feet'] = test_features['player_height_feet'].fillna(features_df['player_height_feet'].median())

---
## 7. Submission

In [13]:
# Expand to frame-level by merging with test_template
test_df = test_template.merge(test_features, on=['game_id', 'play_id', 'nfl_id'], how='left')

# Add frame_ratio at frame level (same definition as training)
test_df['frame_ratio'] = test_df.groupby(['game_id', 'play_id', 'nfl_id'])['frame_id'].transform(
    lambda s: s / (s.max() + 1e-6)
)


extra_in_test = [c for c in test_df.columns if c not in feature_cols]

# Final aligned matrix
X_test = test_df[feature_cols].fillna(0).replace([np.inf, -np.inf], 0)

# Predict with ensemble (once, at frame level)
preds_x = np.column_stack([m.predict(X_test) for m in models_x])
preds_y = np.column_stack([m.predict(X_test) for m in models_y])
test_df['x'] = preds_x.mean(axis=1)
test_df['y'] = preds_y.mean(axis=1)

test_df['id'] = (
    test_df['game_id'].astype(str) + '_' +
    test_df['play_id'].astype(str) + '_' +
    test_df['nfl_id'].astype(str)  + "_" +
    test_df["frame_id"].astype(str)
)

# Make submission
submission_df = test_df[['id', 'x', 'y']]
submission_df.to_csv('submission.csv', index=False)
print(f"Frame-level submission created: {len(submission_df)} rows -> submission.csv")

Frame-level submission created: 5837 rows -> submission.csv
