In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the read-only input directory and echo
# its full path, one per line.
input_paths = (
    os.path.join(folder, entry)
    for folder, _subdirs, entries in os.walk('/kaggle/input')
    for entry in entries
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nfl-big-data-bowl-2026-prediction/test_input.csv
/kaggle/input/nfl-big-data-bowl-2026-prediction/test.csv
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/nfl_inference_server.py
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/nfl_gateway.py
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/__init__.py
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/core/templates.py
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/core/base_gateway.py
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/core/relay.py
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/core/kaggle_evaluation.proto
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/core/__init__.py
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/core/generated/kaggle_evaluation_pb2.py
/kaggle/input/nfl-big-data-bowl-2026-prediction/kaggle_evaluation/core/generated/kaggle_evaluati

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import polars as pl # Used by the API
import glob
import os
from pathlib import Path

# Model and processing imports
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler

# --- PART 1: IMPORTS & HELPER FUNCTIONS ---

# This helper function will be used by both training and prediction
def height_to_inches(height_str):
    """Convert a ``feet-inches`` height string (e.g. ``"6-2"``) to total inches.

    Parameters
    ----------
    height_str : Any
        Value to parse. It is passed through ``str()`` first, so non-string
        inputs such as ``None``, ``NaN`` or plain numbers are tolerated.

    Returns
    -------
    int or float
        ``feet * 12 + inches`` when the value splits into exactly two integer
        parts around ``'-'``; ``np.nan`` for anything that fails to parse.
    """
    try:
        feet, inches = str(height_str).split('-')
        return int(feet) * 12 + int(inches)
    except Exception:
        # Best-effort parsing: any ordinary failure maps to NaN. Unlike a bare
        # ``except``, this lets KeyboardInterrupt / SystemExit propagate so a
        # long ``.apply`` over millions of rows can still be interrupted.
        return np.nan


# --- PART 2: MODEL TRAINING (GLOBAL SCOPE) ---
# This code runs ONCE when the notebook starts.
# All objects created here (model, scaler, column_lists)
# will be available globally for the 'predict' function to use.

print("--- Starting Model Training ---")

# Resolve the competition data directory: the Kaggle mount when it exists,
# otherwise a same-named folder relative to the working directory.
if os.path.exists("/kaggle/input"):
    DATA_DIR = Path("/kaggle/input/nfl-big-data-bowl-2026-prediction")
else:
    DATA_DIR = Path("nfl-big-data-bowl-2026-prediction")

# Discover the training CSVs (sorted so the load order is deterministic).
train_input_files = sorted(glob.glob(str(DATA_DIR / 'train' / 'input_*.csv')))
train_output_files = sorted(glob.glob(str(DATA_DIR / 'train' / 'output_*.csv')))

# Fail early with an actionable message instead of pandas' generic
# "No objects to concatenate" when the data directory is missing or empty.
if not train_input_files or not train_output_files:
    raise FileNotFoundError(
        f"No training files found under {DATA_DIR / 'train'} "
        f"(inputs: {len(train_input_files)}, outputs: {len(train_output_files)})"
    )

# Inputs and outputs are concatenated separately and joined on key columns
# further down, so the two lists do not need to be read pairwise. Reading each
# list in full avoids silently skipping trailing files when the counts differ
# (zip() stops at the shorter list).
input_dfs = [pd.read_csv(input_file) for input_file in train_input_files]
output_dfs = [pd.read_csv(output_file) for output_file in train_output_files]

train_input = pd.concat(input_dfs, ignore_index=True)
train_output = pd.concat(output_dfs, ignore_index=True)
print(f'Train input shape: {train_input.shape}')
print(f'Train output shape: {train_output.shape}')

# Keep only the input rows flagged as prediction targets, then attach the
# ground-truth coordinates with an inner join on the game/play/player/frame key.
# Columns present in both frames get the '_input' / '_output' suffixes.
# NOTE(review): joining on 'frame_id' pairs input frame k with output frame k;
# confirm both files number their frames on the same timeline.
is_target = train_input['player_to_predict'] == True
train_input_filtered = train_input.loc[is_target].copy()
merged_data = pd.merge(
    train_input_filtered,
    train_output,
    how='inner',
    on=['game_id', 'play_id', 'nfl_id', 'frame_id'],
    suffixes=('_input', '_output'),
)
print(f'Merged training data shape: {merged_data.shape}')

# --- Training Data Preprocessing ---
print("Standardizing play direction for training data...")
left_mask = merged_data['play_direction'] == 'left'

# Rows with play_direction == 'left' are rotated by 180 degrees so that every
# play shares one orientation: x -> 120 - x, y -> 53.3 - y, angles -> +180.
# The targets (x_output / y_output) are rotated together with the inputs.
# NOTE(review): 120.0 and 53.3 are presumably the field extents in yards - confirm.
for x_col in ('x_input', 'ball_land_x', 'x_output'):
    merged_data.loc[left_mask, x_col] = 120.0 - merged_data.loc[left_mask, x_col]

for y_col in ('y_input', 'ball_land_y', 'y_output'):
    merged_data.loc[left_mask, y_col] = 53.3 - merged_data.loc[left_mask, y_col]

for angle_col in ('o', 'dir'):
    merged_data.loc[left_mask, angle_col] = (merged_data.loc[left_mask, angle_col] + 180) % 360

# --- Training Feature Engineering ---
print("Engineering features for training data...")

# Height: "feet-inches" strings -> total inches (NaN when unparseable).
merged_data['player_height'] = merged_data['player_height'].apply(height_to_inches)

# BMI = kg / m^2. The factors convert inches -> metres and treat
# player_weight as pounds (0.453592 kg per lb).
height_m = merged_data['player_height'] * 0.0254
weight_kg = merged_data['player_weight'] * 0.453592
merged_data['bmi'] = weight_kg / (height_m ** 2)

# Age: year taken from the first four characters of game_id minus birth year.
# Intermediate values stay in local variables so a failure half-way through
# cannot leave temporary columns behind in merged_data.
try:
    game_year = merged_data['game_id'].astype(str).str[:4].astype(int)
    birth_year = pd.to_datetime(merged_data['player_birth_date']).dt.year
    merged_data['age'] = game_year - birth_year
except Exception as exc:
    # Best effort: keep training going, but make the degradation visible
    # instead of silently replacing the whole feature with NaN.
    print(f"WARNING: could not derive 'age' ({exc!r}); filling with NaN")
    merged_data['age'] = np.nan

# Vector from the player to the ball landing spot, and its length.
merged_data['vec_x_to_ball'] = merged_data['ball_land_x'] - merged_data['x_input']
merged_data['vec_y_to_ball'] = merged_data['ball_land_y'] - merged_data['y_input']
merged_data['dist_to_ball'] = np.sqrt(
    merged_data['vec_x_to_ball']**2 + merged_data['vec_y_to_ball']**2
)

# arctan2 yields a counter-clockwise angle measured from +x; (450 - deg) % 360
# equals (90 - deg) mod 360, i.e. the same bearing measured clockwise from +y.
# NOTE(review): presumably this matches the convention of 'dir' - confirm.
angle_to_ball_rad = np.arctan2(merged_data['vec_y_to_ball'], merged_data['vec_x_to_ball'])
angle_to_ball_deg = np.degrees(angle_to_ball_rad)
merged_data['angle_to_ball'] = (450 - angle_to_ball_deg) % 360

# Smallest absolute difference between heading and bearing to the ball,
# taking the shorter way around the circle (result lies in [0, 180]).
angle_diff = np.abs(merged_data['dir'] - merged_data['angle_to_ball'])
merged_data['diff_dir_ball_angle'] = np.minimum(angle_diff, 360 - angle_diff)

# --- One-hot encoding and final feature list ---
# Numeric base features, in the order they are fed to the scaler/model.
feature_columns = [
    'absolute_yardline_number',
    'player_height',
    'player_weight',
    'x_input',
    'y_input',
    's',
    'a',
    'dir',
    'o',
    'num_frames_output',
    'ball_land_x',
    'ball_land_y',
    'bmi',
    'age',
    'vec_x_to_ball',
    'vec_y_to_ball',
    'dist_to_ball',
    'angle_to_ball',
    'diff_dir_ball_angle',
]

# Indicator columns for the two categorical fields.
position_dummies = pd.get_dummies(merged_data['player_position'], prefix='position')
role_dummies = pd.get_dummies(merged_data['player_role'], prefix='role')

# --- *** CREATE GLOBAL ARTIFACTS *** ---
# The exact indicator column names (and their order) are kept at module level
# so that 'predict' can align its own dummies with the training layout.
POSITION_DUMMIES_LIST = list(position_dummies.columns)
ROLE_DUMMIES_LIST = list(role_dummies.columns)
FEATURE_COLUMNS_LIST = [*feature_columns, *POSITION_DUMMIES_LIST, *ROLE_DUMMIES_LIST]

print(f"Total features created: {len(FEATURE_COLUMNS_LIST)}")

# Append the indicator columns to the training frame.
merged_data = pd.concat(
    [merged_data, position_dummies, role_dummies],
    axis=1,
)

# --- Final Model Training ---
# Design matrix: the selected feature columns with missing values set to 0.
X = merged_data.loc[:, FEATURE_COLUMNS_LIST].fillna(0)
# Two-column target array: x_output and y_output, in that order.
y = merged_data[['x_output', 'y_output']].to_numpy()

# Learn the standardization from the training features, then apply it.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Ridge regression fitted jointly on both target columns.
ridge_model = Ridge(alpha=1.0).fit(X_scaled, y)

print("--- Model and Scaler are trained and saved globally. ---")


# --- PART 3: INFERENCE SERVER & PREDICT FUNCTION ---

# Import the server code
import kaggle_evaluation.nfl_inference_server

def predict(test: pl.DataFrame, test_input: pl.DataFrame) -> pl.DataFrame:
    """Predict (x, y) positions for one batch handed over by the inference API.

    The batch is converted to pandas, run through the same direction
    standardization and feature engineering as the training data, scaled with
    the module-level ``scaler`` and scored with the module-level
    ``ridge_model``. Predictions for plays whose ``play_direction`` is
    ``'left'`` are mirrored back into the original frame before returning.

    Parameters
    ----------
    test : pl.DataFrame
        Rows to predict; joined to ``test_input`` on ``game_id``, ``play_id``,
        ``nfl_id`` and ``frame_id``.
    test_input : pl.DataFrame
        Tracking and player columns used to build the features.

    Returns
    -------
    pl.DataFrame
        Columns ``x`` and ``y``, one row per row of ``test``.

    Raises
    ------
    AssertionError
        If the number of predictions differs from the number of ``test`` rows.
    """
    # The trained artifacts (ridge_model, scaler, FEATURE_COLUMNS_LIST,
    # POSITION_DUMMIES_LIST, ROLE_DUMMIES_LIST) are only read here, so no
    # ``global`` declaration is required.

    # 1. Convert to pandas for processing
    test_pd = test.to_pandas()
    test_input_pd = test_input.to_pandas()

    # 2. Merge target rows with features
    # NOTE(review): with how='left', a target row that has no input row for
    # the same frame_id keeps NaN in every feature column. Those NaNs become 0
    # in step 3f, and because play_direction is NaN the row is never mirrored
    # back in step 5. Confirm this is the intended handling.
    submission_df = test_pd.merge(
        test_input_pd,
        on=['game_id', 'play_id', 'nfl_id', 'frame_id'],
        how='left'
    )

    if submission_df.shape[0] != test_pd.shape[0]:
        print(f"--- WARNING: Row count mismatch! Expected {test_pd.shape[0]} rows, got {submission_df.shape[0]} ---")

    # 3. --- Feature Processing (must MATCH training) ---

    # 3a. Rename columns to the names used during training
    submission_df = submission_df.rename(columns={'x': 'x_input', 'y': 'y_input'})

    # 3b. Standardize play direction
    # This mask is reused in step 5 to mirror the predictions back.
    left_mask_test = submission_df['play_direction'] == 'left'

    for x_col in ('x_input', 'ball_land_x'):
        submission_df.loc[left_mask_test, x_col] = 120.0 - submission_df.loc[left_mask_test, x_col]
    for y_col in ('y_input', 'ball_land_y'):
        submission_df.loc[left_mask_test, y_col] = 53.3 - submission_df.loc[left_mask_test, y_col]
    for angle_col in ('o', 'dir'):
        submission_df.loc[left_mask_test, angle_col] = (submission_df.loc[left_mask_test, angle_col] + 180) % 360

    # 3c. Feature Engineering
    submission_df['player_height'] = submission_df['player_height'].apply(height_to_inches)
    height_m_test = submission_df['player_height'] * 0.0254
    weight_kg_test = submission_df['player_weight'] * 0.453592
    submission_df['bmi'] = weight_kg_test / (height_m_test ** 2)

    # Age: intermediates stay local so a failure leaves no temporary columns.
    try:
        game_year = submission_df['game_id'].astype(str).str[:4].astype(int)
        birth_year = pd.to_datetime(submission_df['player_birth_date']).dt.year
        submission_df['age'] = game_year - birth_year
    except Exception as exc:
        # Best effort, but never swallow KeyboardInterrupt/SystemExit and
        # never hide the reason the feature was dropped.
        print(f"WARNING: could not derive 'age' ({exc!r}); filling with NaN")
        submission_df['age'] = np.nan

    submission_df['vec_x_to_ball'] = submission_df['ball_land_x'] - submission_df['x_input']
    submission_df['vec_y_to_ball'] = submission_df['ball_land_y'] - submission_df['y_input']
    submission_df['dist_to_ball'] = np.sqrt(
        submission_df['vec_x_to_ball']**2 + submission_df['vec_y_to_ball']**2)

    # (450 - deg) % 360 turns arctan2's counter-clockwise-from-+x angle into a
    # clockwise-from-+y bearing, exactly as in the training code.
    angle_to_ball_rad_test = np.arctan2(
        submission_df['vec_y_to_ball'], submission_df['vec_x_to_ball'])
    angle_to_ball_deg_test = np.degrees(angle_to_ball_rad_test)
    submission_df['angle_to_ball'] = (450 - angle_to_ball_deg_test) % 360

    # Smallest absolute angular difference (shorter way around the circle).
    angle_diff_test = np.abs(submission_df['dir'] - submission_df['angle_to_ball'])
    submission_df['diff_dir_ball_angle'] = np.minimum(angle_diff_test, 360 - angle_diff_test)

    # 3d. One-hot encode and align with the training layout: categories
    # missing from this batch are added as all-zero columns, categories not
    # seen in training are dropped, and the training column order is enforced.
    test_position_dummies = pd.get_dummies(
        submission_df['player_position'], prefix='position'
    ).reindex(columns=POSITION_DUMMIES_LIST, fill_value=0)
    test_role_dummies = pd.get_dummies(
        submission_df['player_role'], prefix='role'
    ).reindex(columns=ROLE_DUMMIES_LIST, fill_value=0)

    # 3e. Concatenate final features
    submission_df = pd.concat([submission_df, test_position_dummies, test_role_dummies], axis=1)

    # 3f. Select final feature columns (missing values become 0, as in training)
    X_submission = submission_df[FEATURE_COLUMNS_LIST].fillna(0)

    # 4. --- Scaling & Prediction ---
    X_submission_scaled = scaler.transform(X_submission)
    test_predictions = ridge_model.predict(X_submission_scaled)

    submission_df['pred_x'] = test_predictions[:, 0]
    submission_df['pred_y'] = test_predictions[:, 1]

    # 5. --- Post-Processing & Formatting ---
    # Mirror predictions for 'left' plays back into the original frame
    submission_df.loc[left_mask_test, 'pred_x'] = 120.0 - submission_df.loc[left_mask_test, 'pred_x']
    submission_df.loc[left_mask_test, 'pred_y'] = 53.3 - submission_df.loc[left_mask_test, 'pred_y']

    # Format the final output DataFrame
    final_predictions_pd = submission_df[['pred_x', 'pred_y']].rename(
        columns={'pred_x': 'x', 'pred_y': 'y'}
    )

    # Fill any remaining NaNs so the API always receives numeric values
    final_predictions_pd = final_predictions_pd.fillna(0.0)

    # 6. Convert back to Polars DataFrame for the API
    final_predictions_pl = pl.from_pandas(final_predictions_pd)

    assert isinstance(final_predictions_pl, pl.DataFrame)
    assert len(final_predictions_pl) == len(test_pd)

    return final_predictions_pl


# --- This code starts the server ---
# It must be at the END of your notebook.
print("Starting inference server...")
inference_server = kaggle_evaluation.nfl_inference_server.NFLInferenceServer(predict)

if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    # Competition rerun: hand control to the evaluation gateway.
    print("Serving predictions (Competition Rerun mode)")
    inference_server.serve()
else:
    print("Running local gateway (Interactive mode)")
    # This runs the local test set for debugging. The path is derived from
    # DATA_DIR (instead of a hard-coded Kaggle mount) so it follows the same
    # dataset location the training code loaded from; joining with '' keeps
    # the trailing separator, so on Kaggle the string is unchanged.
    inference_server.run_local_gateway((os.path.join(str(DATA_DIR), ''),))

--- Starting Model Training ---
Train input shape: (4880579, 23)
Train output shape: (562936, 6)
Merged training data shape: (560426, 25)
Standardizing play direction for training data...
Engineering features for training data...
Total features created: 38
--- Model and Scaler are trained and saved globally. ---
Starting inference server...
Running local gateway (Interactive mode)
