In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the read-only Kaggle input mount and
# print its full path, one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nfl-big-data-bowl-2026-prediction/test_input.csv
/kaggle/input/nfl-big-data-bowl-2026-prediction/test.csv
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/nfl_inference_server.py
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/nfl_gateway.py
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/__init__.py
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/core/templates.py
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/core/base_gateway.py
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/core/relay.py
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/core/kaggle_evaluation.proto
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/core/__init__.py
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/core/generated/kaggle_evaluation_pb2.py
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/core/generated/kaggle_evaluati

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import polars as pl # Used by the API
import glob
import os
from pathlib import Path

# Model and processing imports
from sklearn.model_selection import train_test_split, GroupKFold
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# --- PART 1: IMPORTS & HELPER FUNCTIONS ---

# This helper function will be used by both training and prediction
def height_to_inches(height_str):
    """Convert a ``feet-inches`` height such as ``"6-2"`` to total inches.

    Args:
        height_str: Height value. It is coerced with ``str()`` first, so
            non-string inputs (``None``, ``NaN``, numbers) are accepted.

    Returns:
        The height in inches as an ``int``, or ``np.nan`` when the value is
        not exactly two integer parts separated by a single ``-``.
    """
    try:
        # Convert to string first to handle potential non-string inputs
        feet, inches = str(height_str).split('-')
        return int(feet) * 12 + int(inches)
    except (ValueError, TypeError):
        # Only parsing failures map to NaN. A bare ``except`` would also
        # swallow KeyboardInterrupt/SystemExit raised during a long ``apply``.
        return np.nan


# --- PART 2: MODEL TRAINING (VALIDATION) ---
# Loads the training CSVs, builds features, runs 5-fold grouped
# cross-validation for a performance estimate, then fits the final model.

print("--- Starting Model Validation (5-Fold GroupKFold) ---")

# Define file paths: the Kaggle mount when it exists, otherwise a relative local copy
if os.path.exists("/kaggle/input"):
    DATA_DIR = Path("/kaggle/input/nfl-big-data-bowl-2026-prediction")
else:
    DATA_DIR = Path("nfl-big-data-bowl-2026-prediction")

# --- Data loading ---
train_input_files = sorted(glob.glob(str(DATA_DIR / 'train' / 'input_*.csv')))
train_output_files = sorted(glob.glob(str(DATA_DIR / 'train' / 'output_*.csv')))

# Fail early with an actionable message instead of pd.concat's generic
# "No objects to concatenate" when the dataset is not where we expect it.
if not train_input_files or not train_output_files:
    raise FileNotFoundError(
        f"No training CSVs found under {DATA_DIR / 'train'} "
        "(expected input_*.csv and output_*.csv)"
    )

# The two file lists are concatenated independently and only joined on keys
# below, so each list is read in full. Zipping them would silently drop the
# trailing files of the longer list if the counts ever differed.
input_dfs = [pd.read_csv(csv_path) for csv_path in train_input_files]
output_dfs = [pd.read_csv(csv_path) for csv_path in train_output_files]
train_input = pd.concat(input_dfs, ignore_index=True)
train_output = pd.concat(output_dfs, ignore_index=True)

# Keep only the rows flagged for prediction, then attach the targets.
# merge() defaults to an inner join, so rows without a target are dropped;
# columns present in both frames get the _input/_output suffixes.
train_input_filtered = train_input[train_input['player_to_predict'] == True].copy()
merged_data = train_input_filtered.merge(
    train_output,
    on=['game_id', 'play_id', 'nfl_id', 'frame_id'],
    suffixes=('_input', '_output')
)
print(f'Merged training data shape: {merged_data.shape}')

# --- Feature processing: make every play run in the same direction ---
print("Standardizing play direction...")
left_mask = merged_data['play_direction'] == 'left'

# For 'left' plays, x-like columns become 120.0 - x and y-like columns become
# 53.3 - y (presumably the field length and width in yards -- confirm).
for extent, mirrored_columns in (
    (120.0, ('x_input', 'ball_land_x', 'x_output')),
    (53.3, ('y_input', 'ball_land_y', 'y_output')),
):
    for column in mirrored_columns:
        merged_data.loc[left_mask, column] = extent - merged_data.loc[left_mask, column]

# The same plays have their angle columns turned by 180 degrees, wrapped to [0, 360).
for angle_column in ('o', 'dir'):
    merged_data.loc[left_mask, angle_column] = (
        merged_data.loc[left_mask, angle_column] + 180
    ) % 360

print("Engineering features...")
merged_data['player_height'] = merged_data['player_height'].apply(height_to_inches)
height_m = merged_data['player_height'] * 0.0254  # inches -> metres
weight_kg = merged_data['player_weight'] * 0.453592  # lb -> kg factor; assumes player_weight is in pounds
merged_data['bmi'] = weight_kg / (height_m ** 2)
# Simplified kinetic energy: deliberately built from the raw player_weight
# column (not weight_kg). Any code that recomputes this feature at inference
# time must use exactly the same formula, because the scaler is fitted on it.
merged_data['kinetic_energy'] = 0.5 * merged_data['player_weight'] * (merged_data['s'] ** 2)
try:
    # Age in whole calendar years: game year (first four characters of
    # game_id) minus the player's birth year.
    merged_data['game_year'] = merged_data['game_id'].astype(str).str[:4].astype(int)
    merged_data['birth_year'] = pd.to_datetime(merged_data['player_birth_date']).dt.year
    merged_data['age'] = merged_data['game_year'] - merged_data['birth_year']
    merged_data = merged_data.drop(columns=['game_year', 'birth_year'])
except Exception as e:
    # Best-effort feature: keep going, but report it. A silent failure here
    # turns 'age' into NaN for every row, which is later filled with 0.
    print(f"WARNING: could not derive 'age' ({type(e).__name__}: {e}); using NaN")
    merged_data['age'] = np.nan
    # Remove helper columns that may have been created before the failure.
    merged_data = merged_data.drop(columns=['game_year', 'birth_year'], errors='ignore')

# Vector from the player's position to the ball landing point, and its length
merged_data['vec_x_to_ball'] = merged_data['ball_land_x'] - merged_data['x_input']
merged_data['vec_y_to_ball'] = merged_data['ball_land_y'] - merged_data['y_input']
merged_data['dist_to_ball'] = np.sqrt(
    merged_data['vec_x_to_ball']**2 + merged_data['vec_y_to_ball']**2
)

# arctan2 gives the angle counter-clockwise from +x; (450 - deg) % 360 turns
# it into an angle measured clockwise from +y in [0, 360)
# (presumably the convention of the `dir` column -- confirm).
angle_to_ball_rad = np.arctan2(merged_data['vec_y_to_ball'], merged_data['vec_x_to_ball'])
angle_to_ball_deg = np.degrees(angle_to_ball_rad)
merged_data['angle_to_ball'] = (450 - angle_to_ball_deg) % 360

# Smallest absolute difference between the two angles, i.e. a value in [0, 180]
angle_diff = np.abs(merged_data['dir'] - merged_data['angle_to_ball'])
merged_data['diff_dir_ball_angle'] = np.minimum(angle_diff, 360 - angle_diff)

# --- One-hot encoding and feature matrix ---
# Numeric model inputs. The order is significant: it fixes the column order
# of X and therefore of everything fitted on it.
feature_columns = [
    'absolute_yardline_number',
    'player_height', 'player_weight',
    'x_input', 'y_input',
    's', 'a', 'dir', 'o',
    'num_frames_output',
    'ball_land_x', 'ball_land_y',
    'bmi', 'age',
    'vec_x_to_ball', 'vec_y_to_ball',
    'dist_to_ball', 'angle_to_ball', 'diff_dir_ball_angle',
    'kinetic_energy',
]
position_dummies = pd.get_dummies(merged_data['player_position'], prefix='position')
role_dummies = pd.get_dummies(merged_data['player_role'], prefix='role')

# --- Global artifacts read by predict() in PART 3 ---
# The dummy column names seen in training define the layout predict() must reproduce.
POSITION_DUMMIES_LIST = list(position_dummies.columns)
ROLE_DUMMIES_LIST = list(role_dummies.columns)
FEATURE_COLUMNS_LIST = [*feature_columns, *POSITION_DUMMIES_LIST, *ROLE_DUMMIES_LIST]
print(f"Total features created: {len(FEATURE_COLUMNS_LIST)}")

# Attach the indicator columns to the training frame
merged_data = pd.concat([merged_data, position_dummies, role_dummies], axis=1)

# --- Final data prep ---
# Missing feature values become 0; the targets are the two output coordinates.
X = merged_data[FEATURE_COLUMNS_LIST].fillna(0)
y = merged_data[['x_output', 'y_output']].to_numpy()

# One group label per play ('<game_id>_<play_id>') for grouped splitting
groups = merged_data['game_id'].astype(str).str.cat(
    merged_data['play_id'].astype(str), sep='_'
)
print(f"Total rows: {len(X)}, Total unique groups: {groups.nunique()}")


# --- 5-Fold Cross-Validation Loop ---
# Splits are grouped by the `groups` labels, so rows sharing a label never
# appear on both the training and the validation side of a fold.
N_SPLITS = 5
gkf = GroupKFold(n_splits=N_SPLITS)

# One total RMSE per held-out fold (OOF = out-of-fold)
oof_scores = []

for fold, (train_idx, val_idx) in enumerate(gkf.split(X, y, groups=groups)):
    print(f"--- Starting Fold {fold+1}/{N_SPLITS} ---")

    # 1. Slice features and targets for this fold
    X_train_fold = X.iloc[train_idx]
    X_val_fold = X.iloc[val_idx]
    y_train_fold = y[train_idx]
    y_val_fold = y[val_idx]

    # 2. Scaling statistics come from the training rows only; the validation
    #    rows are merely transformed, so nothing leaks from them.
    scaler = StandardScaler().fit(X_train_fold)
    X_train_fold_scaled = scaler.transform(X_train_fold)
    X_val_fold_scaled = scaler.transform(X_val_fold)

    # 3. The model likewise sees training rows only
    model = ElasticNet(alpha=0.01, l1_ratio=0.1).fit(X_train_fold_scaled, y_train_fold)

    # 4. Predict the held-out rows and score them
    val_preds = model.predict(X_val_fold_scaled)

    # With two target columns, mean_squared_error averages the per-column
    # MSEs by default, so the total is sqrt((MSE_x + MSE_y) / 2).
    mse = mean_squared_error(y_val_fold, val_preds)
    rmse = np.sqrt(mse)

    # Per-coordinate RMSE: column 0 is x, column 1 is y
    rmse_x, rmse_y = (
        np.sqrt(mean_squared_error(y_val_fold[:, coord], val_preds[:, coord]))
        for coord in (0, 1)
    )

    print(f"Fold {fold+1} RMSE (Total): {rmse:.4f}")
    print(f"Fold {fold+1} RMSE (X-coord): {rmse_x:.4f}, (Y-coord): {rmse_y:.4f}")

    oof_scores.append(rmse)

print("--- CV Finished ---")
print(f"Average CV RMSE: {np.mean(oof_scores):.4f} +/- {np.std(oof_scores):.4f}")

# --- IMPORTANT ---
# The cross-validation above only estimates performance. The submission
# needs a scaler and a model fitted on ALL rows; both are kept as
# module-level names because predict() (PART 3) reads them.

print("\n--- Re-training on 100% of data for final submission ---")

# 1. Final scaler: statistics taken from the complete feature matrix
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# 2. Final model: same hyperparameters as the CV folds, trained on everything
elasticnet_model = ElasticNet(alpha=0.01, l1_ratio=0.1).fit(X_scaled, y)

print("--- Final Model and Scaler are trained and saved globally. ---")



# --- PART 3: INFERENCE SERVER & PREDICT FUNCTION ---

# Import the server code
import kaggle_evaluation.nfl_inference_server

def predict(test: pl.DataFrame, test_input: pl.DataFrame) -> pl.DataFrame:
    """Predict (x, y) for every row of one inference batch.

    Passed to the inference server as its prediction callback. Rebuilds the
    training feature set for the rows in ``test`` and applies the
    module-level ``scaler`` and ``elasticnet_model``.

    Args:
        test: Rows to predict; must contain ``game_id``, ``play_id``,
            ``nfl_id`` and ``frame_id``.
        test_input: Feature rows for the same batch, left-joined onto
            ``test`` on those four keys.

    Returns:
        A Polars frame with columns ``x`` and ``y``, one row per row of
        ``test``, in the same order.

    Raises:
        AssertionError: If the number of output rows differs from ``test``
            (for example when duplicate keys in ``test_input`` fan out the
            merge).
    """
    # elasticnet_model, scaler and the *_LIST artifacts are module-level
    # objects created by the training code. They are only read here, so no
    # `global` declaration is needed.

    # 1. Convert to Pandas for processing
    test_pd = test.to_pandas()
    test_input_pd = test_input.to_pandas()

    # 2. Merge target rows with features. The left join keeps every target
    #    row; targets without a matching input row get NaN features, which
    #    are filled with 0 in step 3f.
    submission_df = test_pd.merge(
        test_input_pd,
        on=['game_id', 'play_id', 'nfl_id', 'frame_id'],
        how='left'
    )

    if submission_df.shape[0] != test_pd.shape[0]:
        print(f"--- WARNING: Row count mismatch! Expected {test_pd.shape[0]} rows, got {submission_df.shape[0]} ---")

    # 3. --- Feature Processing (must MATCH training) ---

    # 3a. Rename columns to the names the training features use
    submission_df = submission_df.rename(columns={'x': 'x_input', 'y': 'y_input'})

    # 3b. Standardize play direction.
    # The mask is reused in step 5 to map predictions back to the original
    # orientation. Rows with a missing play_direction compare False and are
    # left unflipped.
    left_mask_test = submission_df['play_direction'] == 'left'

    for x_col in ('x_input', 'ball_land_x'):
        submission_df.loc[left_mask_test, x_col] = 120.0 - submission_df.loc[left_mask_test, x_col]
    for y_col in ('y_input', 'ball_land_y'):
        submission_df.loc[left_mask_test, y_col] = 53.3 - submission_df.loc[left_mask_test, y_col]
    for angle_col in ('o', 'dir'):
        submission_df.loc[left_mask_test, angle_col] = (
            submission_df.loc[left_mask_test, angle_col] + 180
        ) % 360

    # 3c. Feature Engineering
    submission_df['player_height'] = submission_df['player_height'].apply(height_to_inches)
    height_m_test = submission_df['player_height'] * 0.0254
    weight_kg_test = submission_df['player_weight'] * 0.453592
    submission_df['bmi'] = weight_kg_test / (height_m_test ** 2)
    # Training builds kinetic_energy from the raw player_weight column, not
    # from the kg conversion. Using weight_kg_test here would feed the fitted
    # scaler/model a feature that is 0.453592x the scale it was trained on.
    submission_df['kinetic_energy'] = 0.5 * submission_df['player_weight'] * (submission_df['s'] ** 2)

    try:
        submission_df['game_year'] = submission_df['game_id'].astype(str).str[:4].astype(int)
        submission_df['birth_year'] = pd.to_datetime(submission_df['player_birth_date']).dt.year
        submission_df['age'] = submission_df['game_year'] - submission_df['birth_year']
        submission_df = submission_df.drop(columns=['game_year', 'birth_year'])
    except Exception:
        # Best-effort feature: fall back to NaN (filled with 0 below) rather
        # than failing the batch, without trapping KeyboardInterrupt/SystemExit.
        submission_df['age'] = np.nan
    submission_df['vec_x_to_ball'] = submission_df['ball_land_x'] - submission_df['x_input']
    submission_df['vec_y_to_ball'] = submission_df['ball_land_y'] - submission_df['y_input']
    submission_df['dist_to_ball'] = np.sqrt(
        submission_df['vec_x_to_ball']**2 + submission_df['vec_y_to_ball']**2)
    angle_to_ball_rad_test = np.arctan2(
        submission_df['vec_y_to_ball'], submission_df['vec_x_to_ball'])
    angle_to_ball_deg_test = np.degrees(angle_to_ball_rad_test)
    submission_df['angle_to_ball'] = (450 - angle_to_ball_deg_test) % 360
    angle_diff_test = np.abs(submission_df['dir'] - submission_df['angle_to_ball'])
    submission_df['diff_dir_ball_angle'] = np.minimum(angle_diff_test, 360 - angle_diff_test)

    # 3d. One-hot encode and align to the training layout: categories unseen
    #     in this batch become all-zero columns, categories unknown to
    #     training are dropped, and the column order matches training.
    test_position_dummies = pd.get_dummies(
        submission_df['player_position'], prefix='position'
    ).reindex(columns=POSITION_DUMMIES_LIST, fill_value=0)
    test_role_dummies = pd.get_dummies(
        submission_df['player_role'], prefix='role'
    ).reindex(columns=ROLE_DUMMIES_LIST, fill_value=0)

    # 3e. Concatenate final features
    submission_df = pd.concat([submission_df, test_position_dummies, test_role_dummies], axis=1)

    # 3f. Select final feature columns in training order; missing values -> 0
    X_submission = submission_df[FEATURE_COLUMNS_LIST].fillna(0)

    # 4. --- Scaling & Prediction ---
    X_submission_scaled = scaler.transform(X_submission)
    test_predictions = elasticnet_model.predict(X_submission_scaled)

    submission_df['pred_x'] = test_predictions[:, 0]
    submission_df['pred_y'] = test_predictions[:, 1]

    # 5. --- Post-Processing & Formatting ---
    # The model predicts in the standardized orientation, so undo the flip
    # for 'left' plays.
    submission_df.loc[left_mask_test, 'pred_x'] = 120.0 - submission_df.loc[left_mask_test, 'pred_x']
    submission_df.loc[left_mask_test, 'pred_y'] = 53.3 - submission_df.loc[left_mask_test, 'pred_y']

    # Format the final output DataFrame
    final_predictions_pd = submission_df[['pred_x', 'pred_y']].rename(
        columns={'pred_x': 'x', 'pred_y': 'y'}
    )

    # Fill any potential NaNs (e.g., from failed feature engineering)
    final_predictions_pd = final_predictions_pd.fillna(0.0)

    # 6. Convert back to Polars DataFrame for the API
    final_predictions_pl = pl.from_pandas(final_predictions_pd)

    assert isinstance(final_predictions_pl, pl.DataFrame)
    assert len(final_predictions_pl) == len(test_pd)

    return final_predictions_pl


# --- This code starts the server ---
print("Starting inference server...")
inference_server = kaggle_evaluation.nfl_inference_server.NFLInferenceServer(predict)

if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    print("Serving predictions (Competition Rerun mode)")
    inference_server.serve()
else:
    print("Running local gateway (Interactive mode)")
    # Replay the local test set for debugging. The path is derived from
    # DATA_DIR (set in PART 2), so an off-Kaggle run looks where the training
    # data was loaded from. On Kaggle this resolves to
    # '/kaggle/input/nfl-big-data-bowl-2026-prediction/'.
    inference_server.run_local_gateway((f"{DATA_DIR}/",))

--- Starting Model Validation (5-Fold GroupKFold) ---
Merged training data shape: (560426, 25)
Standardizing play direction...
Engineering features...
Total features created: 39
Total rows: 560426, Total unique groups: 14108
--- Starting Fold 1/5 ---
Fold 1 RMSE (Total): 4.5083
Fold 1 RMSE (X-coord): 4.3219, (Y-coord): 4.6873
--- Starting Fold 2/5 ---
Fold 2 RMSE (Total): 4.4667
Fold 2 RMSE (X-coord): 4.2306, (Y-coord): 4.6909
--- Starting Fold 3/5 ---
Fold 3 RMSE (Total): 4.5174
Fold 3 RMSE (X-coord): 4.2154, (Y-coord): 4.8005
--- Starting Fold 4/5 ---
Fold 4 RMSE (Total): 4.4826
Fold 4 RMSE (X-coord): 4.1649, (Y-coord): 4.7792
--- Starting Fold 5/5 ---
Fold 5 RMSE (Total): 4.5415
Fold 5 RMSE (X-coord): 4.3020, (Y-coord): 4.7689
--- CV Finished ---
Average CV RMSE: 4.5033 +/- 0.0263

--- Re-training on 100% of data for final submission ---
--- Final Model and Scaler are trained and saved globally. ---
Starting inference server...
Running local gateway (Interactive mode)
