In [1]:
import pandas as pd
import numpy as np
import torch
import pickle
import joblib
from pathlib import Path
import warnings
import os
import glob
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_error, mean_squared_log_error, mean_absolute_percentage_error, mean_tweedie_deviance
warnings.filterwarnings('ignore')

In [20]:
class Config:
    """Static settings shared by the notebook: data/model paths, blend weights, LSTM shape and field bounds."""

    # Competition data root. Both names point at the same directory.
    BASE_DIR = Path("/kaggle/input/nfl-big-data-bowl-2026-prediction")
    DATA_DIR = BASE_DIR

    # Locations of previously saved model artefacts.
    CATBOOST_MODEL_PATH = "Fall2025/AML/Project/models/catboost_5fold_models.pkl"
    LSTM_MODEL_DIR = "Fall2025/AML/Project/models/output"

    # Global blend weights per model family.
    ENSEMBLE_WEIGHTS = dict(catboost=0.5, lstm=0.5)

    # Per-role blend weights; 'default' is the fallback entry.
    ROLE_SPECIFIC_WEIGHTS = {
        'Passer': dict(catboost=0.6, lstm=0.4),
        'Targeted Receiver': dict(catboost=0.4, lstm=0.6),
        'Defensive Coverage': dict(catboost=0.45, lstm=0.55),
        'default': dict(catboost=0.5, lstm=0.5),
    }

    # Switch between the global and the per-role weights.
    USE_ROLE_SPECIFIC_WEIGHTS = False

    # LSTM ensemble size and the number of input frames per sequence.
    LSTM_N_FOLDS = 5
    LSTM_WINDOW_SIZE = 8

    # Field bounding box for the x and y coordinates.
    FIELD_X_MIN = 0.0
    FIELD_X_MAX = 120.0
    FIELD_Y_MIN = 0.0
    FIELD_Y_MAX = 53.3

In [3]:
# Load every weekly training CSV into one input frame and one output frame.
# The file lists are sorted because glob returns matches in arbitrary order and
# later cells depend on row order (e.g. "last row per player" aggregations).
print("Loading input files...")
input_files = sorted(glob.glob(os.path.join(Config.BASE_DIR, 'train/input_*.csv')))
if not input_files:
    raise FileNotFoundError(f"No train/input_*.csv files found under {Config.BASE_DIR}")
input_df = pd.concat([pd.read_csv(f) for f in input_files], ignore_index=True)
print(f"Loaded {len(input_files)} input files.")

print("\nLoading output files...")
output_files = sorted(glob.glob(os.path.join(Config.BASE_DIR, 'train/output_*.csv')))
if not output_files:
    raise FileNotFoundError(f"No train/output_*.csv files found under {Config.BASE_DIR}")
output_df = pd.concat([pd.read_csv(f) for f in output_files], ignore_index=True)
print(f"Loaded {len(output_files)} output files.")

Loading input files...
Loaded 18 input files.

Loading output files...
Loaded 18 output files.


In [4]:
def load_catboost_models(model_path):
    """Unpickle a saved model bundle and return its three parts.

    The pickle is expected to hold a mapping with the keys ``'models_x'``,
    ``'models_y'`` and ``'features'``; they are returned in that order as a
    tuple.

    NOTE: unpickling executes arbitrary code, so only load trusted files.
    """
    with open(model_path, mode='rb') as handle:
        bundle = pickle.load(handle)
    models_x, models_y, features = (
        bundle[key] for key in ('models_x', 'models_y', 'features')
    )
    return models_x, models_y, features

In [5]:
def engineer_catboost_features(df):
    """Create physics-based features for the tree models.

    Works on a copy of ``df`` and appends kinematic, ball-relative, role and
    body-size columns derived from the raw tracking columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``x``, ``y``, ``s``, ``a``, ``o``, ``dir``,
        ``ball_land_x``, ``ball_land_y``, ``num_frames_output``,
        ``player_role``, ``player_side``, ``player_height`` (a
        ``"<feet>-<inches>"`` string) and ``player_weight``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the engineered columns added; the input frame is
        left untouched.
    """
    df = df.copy()

    dir_rad = np.radians(df['dir'])

    # Velocity components: cos -> x, sin -> y.
    # NOTE(review): prepare_lstm_sequences uses sin -> x / cos -> y for the same
    # `dir` column; confirm which convention matches the tracking data.
    df['velocity_x'] = df['s'] * np.cos(dir_rad)
    df['velocity_y'] = df['s'] * np.sin(dir_rad)

    df['dist_to_ball'] = np.sqrt(
        (df['x'] - df['ball_land_x'])**2 +
        (df['y'] - df['ball_land_y'])**2
    )

    # Bearing from the player to the landing spot, in radians within [-pi, pi].
    df['angle_to_ball'] = np.arctan2(
        df['ball_land_y'] - df['y'],
        df['ball_land_x'] - df['x']
    )

    # Projection of the velocity vector onto the player -> ball direction.
    df['velocity_toward_ball'] = (
        df['velocity_x'] * np.cos(df['angle_to_ball']) +
        df['velocity_y'] * np.sin(df['angle_to_ball'])
    )

    # assumes 10 frames per second -- TODO confirm
    df['time_to_ball'] = df['num_frames_output'] / 10.0

    # Smallest absolute angle between orientation and direction, in [0, 180].
    orientation_gap = np.abs(df['o'] - df['dir']) % 360
    df['orientation_diff'] = np.minimum(orientation_gap, 360 - orientation_gap)

    df['role_targeted_receiver'] = (df['player_role'] == 'Targeted Receiver').astype(int)
    df['role_defensive_coverage'] = (df['player_role'] == 'Defensive Coverage').astype(int)
    df['role_passer'] = (df['player_role'] == 'Passer').astype(int)
    df['side_offense'] = (df['player_side'] == 'Offense').astype(int)

    # "6-2" -> 74 inches; 703 is the imperial BMI constant (assumes weight in pounds).
    height_parts = df['player_height'].str.split('-', expand=True)
    df['height_inches'] = height_parts[0].astype(float) * 12 + height_parts[1].astype(float)
    df['bmi'] = (df['player_weight'] / (df['height_inches']**2)) * 703

    df['acceleration_x'] = df['a'] * np.cos(dir_rad)
    df['acceleration_y'] = df['a'] * np.sin(dir_rad)
    df['distance_to_target_x'] = df['ball_land_x'] - df['x']
    df['distance_to_target_y'] = df['ball_land_y'] - df['y']
    df['speed_squared'] = df['s'] ** 2
    df['accel_magnitude'] = np.sqrt(df['acceleration_x']**2 + df['acceleration_y']**2)
    df['velocity_alignment'] = np.cos(df['angle_to_ball'] - dir_rad)

    # Constant-velocity extrapolation to the moment the ball lands.
    df['expected_x_at_ball'] = df['x'] + df['velocity_x'] * df['time_to_ball']
    df['expected_y_at_ball'] = df['y'] + df['velocity_y'] * df['time_to_ball']
    df['error_from_ball_x'] = df['expected_x_at_ball'] - df['ball_land_x']
    df['error_from_ball_y'] = df['expected_y_at_ball'] - df['ball_land_y']
    df['error_from_ball'] = np.sqrt(df['error_from_ball_x']**2 + df['error_from_ball_y']**2)

    df['momentum_x'] = df['player_weight'] * df['velocity_x']
    df['momentum_y'] = df['player_weight'] * df['velocity_y']
    df['kinetic_energy'] = 0.5 * df['player_weight'] * df['speed_squared']

    # `o` and the arctan2 bearing live on different ranges, so the raw
    # difference can exceed 360 degrees. Wrapping it first keeps the folded
    # result inside [0, 180]; without the wrap it could become negative.
    bearing_gap = np.abs(df['o'] - np.degrees(df['angle_to_ball'])) % 360
    df['angle_diff'] = np.minimum(bearing_gap, 360 - bearing_gap)

    df['time_squared'] = df['time_to_ball'] ** 2
    df['dist_squared'] = df['dist_to_ball'] ** 2
    # The +0.1 keeps the ratio finite when time_to_ball is zero.
    df['weighted_dist_by_time'] = df['dist_to_ball'] / (df['time_to_ball'] + 0.1)

    return df

In [6]:
def add_sequence_features_catboost(df):
    """Append per-player lag, rolling-window and first-difference columns.

    Rows are ordered by (game_id, play_id, nfl_id, frame_id) and every
    statistic is computed inside one (game_id, play_id, nfl_id) group. Source
    columns missing from ``df`` are skipped silently. A new, sorted frame is
    returned.
    """
    keys = ['game_id', 'play_id', 'nfl_id']
    df = df.sort_values(keys + ['frame_id'])

    # Values from 1..5 frames earlier within the same player track.
    lag_sources = [
        name for name in ('x', 'y', 'velocity_x', 'velocity_y', 's', 'a')
        if name in df.columns
    ]
    for lag in range(1, 6):
        for name in lag_sources:
            df[f'{name}_lag{lag}'] = df.groupby(keys)[name].shift(lag)

    # Trailing mean / std over 3 and 5 frames (a single frame is enough to emit a value).
    rolling_sources = [
        name for name in ('x', 'y', 'velocity_x', 'velocity_y', 's')
        if name in df.columns
    ]
    for window in (3, 5):
        for name in rolling_sources:
            for stat in ('mean', 'std'):
                rolled = df.groupby(keys)[name].rolling(window, min_periods=1)
                values = getattr(rolled, stat)()
                # Strip the three group levels so the result aligns on the row index again.
                df[f'{name}_rolling_{stat}_{window}'] = values.reset_index(level=[0, 1, 2], drop=True)

    # Frame-to-frame change of each velocity component.
    for name in ('velocity_x', 'velocity_y'):
        if name in df.columns:
            df[f'{name}_delta'] = df.groupby(keys)[name].diff()

    return df

In [7]:
def create_training_dataset(input_df, output_df):
    """Join every target frame with the player's final input-frame features.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Per-frame tracking rows keyed by ``game_id``, ``play_id``, ``nfl_id``
        (and normally ``frame_id``).
    output_df : pandas.DataFrame
        Target rows with ``game_id``, ``play_id``, ``nfl_id``, ``frame_id``,
        ``x`` and ``y``.

    Returns
    -------
    pandas.DataFrame
        One row per target frame with an ``id`` column
        (``game_play_nfl_frame``), ``target_x`` / ``target_y``, the target's
        own ``frame_id``, and the input columns of that player's last frame.
        Input columns that clash with target columns get an ``_input`` suffix.
    """
    key_cols = ['game_id', 'play_id', 'nfl_id']

    output_df = output_df.copy()
    output_df['id'] = (output_df['game_id'].astype(str) + '_' +
                    output_df['play_id'].astype(str) + '_' +
                    output_df['nfl_id'].astype(str) + '_' +
                    output_df['frame_id'].astype(str))

    output_df = output_df.rename(columns={'x': 'target_x', 'y': 'target_y'})

    # "Last" must mean "highest frame_id", not "whatever row was read last":
    # the frame is assembled from several files, so order it explicitly.
    if 'frame_id' in input_df.columns:
        input_df = input_df.sort_values(key_cols + ['frame_id'])

    # GroupBy.last() returns the last non-null value of each column per group.
    input_agg = input_df.groupby(key_cols).last().reset_index()

    # The input frame number would clash with the target frame_id, so drop it.
    if 'frame_id' in input_agg.columns:
        input_agg = input_agg.drop('frame_id', axis=1)

    merged = output_df.merge(
        input_agg,
        on=key_cols,
        how='left',
        suffixes=('', '_input')
    )

    return merged

In [8]:
def train_model(input_df, features, train_template):
    """Build the training design matrix for the tree models.

    Despite its name this function does not fit anything: it engineers the
    features, reduces every player track to its final frame, joins that onto
    ``train_template`` and returns the resulting feature matrix. The original
    version computed this matrix and then discarded it.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Raw per-frame tracking rows.
    features : list of str
        Column names, in order, that make up the design matrix. Names missing
        after the join are created and filled with 0.
    train_template : pandas.DataFrame
        Rows to build features for; must contain ``game_id``, ``play_id`` and
        ``nfl_id``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(train_template), len(features))`` with NaN
        replaced by 0.
    """
    train_features = engineer_catboost_features(input_df)
    train_features = add_sequence_features_catboost(train_features)

    # add_sequence_features_catboost sorts by frame_id, so last() is the final frame.
    train_agg = train_features.groupby(['game_id', 'play_id', 'nfl_id']).last().reset_index()
    if 'frame_id' in train_agg.columns:
        # Avoid clashing with the template's own frame_id.
        train_agg = train_agg.drop('frame_id', axis=1)

    train_merged = train_template.merge(
        train_agg,
        on=['game_id', 'play_id', 'nfl_id'],
        how='left'
    )

    for col in features:
        if col not in train_merged.columns:
            train_merged[col] = 0

    X_train = train_merged[features].fillna(0).values
    return X_train

In [9]:
def predict_catboost(models_x, models_y, features, test_input, test_template):
    """Generate x/y predictions by averaging one or more fitted regressors.

    Parameters
    ----------
    models_x, models_y : estimator or iterable of estimators
        Fitted models for the x and y coordinate. A single estimator (anything
        exposing ``predict``) is accepted as well as a list of fold models;
        predictions of several models are averaged.
    features : list of str
        Feature column names, in the order the models were trained with.
        Names missing after the join are created and filled with 0.
    test_input : pandas.DataFrame
        Raw per-frame tracking rows.
    test_template : pandas.DataFrame
        Rows to predict; must contain ``game_id``, ``play_id``, ``nfl_id`` and
        ``frame_id``.

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` (``game_play_nfl_frame``), ``x`` and ``y``, one row per
        template row.
    """
    def _as_model_list(models):
        # A bare estimator must not be iterated: a RandomForest would yield its
        # individual trees and a CatBoost model is not iterable at all.
        return [models] if hasattr(models, 'predict') else list(models)

    models_x = _as_model_list(models_x)
    models_y = _as_model_list(models_y)

    test_features = engineer_catboost_features(test_input)
    test_features = add_sequence_features_catboost(test_features)

    # add_sequence_features_catboost sorts by frame_id, so last() is the final frame.
    test_agg = test_features.groupby(['game_id', 'play_id', 'nfl_id']).last().reset_index()
    if 'frame_id' in test_agg.columns:
        # Keep the template's frame_id (the frame being predicted).
        test_agg = test_agg.drop('frame_id', axis=1)

    test_merged = test_template.merge(
        test_agg,
        on=['game_id', 'play_id', 'nfl_id'],
        how='left'
    )

    for col in features:
        if col not in test_merged.columns:
            test_merged[col] = 0

    X_test = test_merged[features].fillna(0).values

    pred_x = np.mean([model.predict(X_test) for model in models_x], axis=0)
    pred_y = np.mean([model.predict(X_test) for model in models_y], axis=0)

    predictions = pd.DataFrame({
        'id': (test_merged['game_id'].astype(str) + '_' +
            test_merged['play_id'].astype(str) + '_' +
            test_merged['nfl_id'].astype(str) + '_' +
            test_merged['frame_id'].astype(str)),
        'x': pred_x,
        'y': pred_y
    })

    return predictions

In [10]:
def height_to_feet(height_str):
    """Convert a ``"<feet>-<inches>"`` string such as ``"6-2"`` to decimal feet.

    Returns ``None`` when the value is not a string in that format (missing
    value, wrong number of parts, non-integer parts). Only those parsing
    failures are absorbed; unrelated exceptions such as ``KeyboardInterrupt``
    propagate to the caller.
    """
    try:
        ft, inches = map(int, height_str.split('-'))
    except (AttributeError, TypeError, ValueError):
        # AttributeError: not a string (None / NaN); ValueError: bad part count or digits.
        return None
    return ft + inches / 12

In [11]:
def prepare_lstm_sequences(input_df, test_template, window_size):
    """Build fixed-length per-player feature windows for LSTM inference.

    Works on a copy of ``input_df``. It derives per-row kinematic, role,
    ball-relative and pairwise player-interaction features, then, for every
    distinct (game_id, play_id, nfl_id) in ``test_template``, takes the last
    ``window_size`` frames of that player as one sequence.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Per-frame tracking rows. The columns read here include ``game_id``,
        ``play_id``, ``nfl_id``, ``frame_id``, ``x``, ``y``, ``s``, ``a``,
        ``o``, ``dir``, ``player_height``, ``player_weight``,
        ``player_birth_date``, ``player_side``, ``player_role``,
        ``absolute_yardline_number``, ``ball_land_x`` and ``ball_land_y``.
    test_template : pandas.DataFrame
        Must contain ``game_id``, ``play_id`` and ``nfl_id``.
    window_size : int
        Number of frames per sequence; shorter tracks are front-padded.

    Returns
    -------
    (sequences, sequence_ids)
        ``sequences`` is a list of 2-D arrays with ``window_size`` rows and
        one column per entry of ``feature_cols``; ``sequence_ids`` is a
        parallel list of dicts holding the keys and the last ``frame_id`` of
        each window. Template players with no input rows are skipped.
    """
    input_df = input_df.copy()
    input_df['player_height_feet'] = input_df['player_height'].map(height_to_feet)
    
    dir_rad = np.deg2rad(input_df['dir'].fillna(0))
    delta_t = 0.1
    # Speed advanced by half a time step of acceleration, split with sin -> x and cos -> y.
    # NOTE(review): engineer_catboost_features uses cos -> x / sin -> y for the same
    # `dir` column; confirm which convention matches the tracking data.
    input_df['velocity_x'] = (input_df['s'] + 0.5 * input_df['a'] * delta_t) * np.sin(dir_rad)
    input_df['velocity_y'] = (input_df['s'] + 0.5 * input_df['a'] * delta_t) * np.cos(dir_rad)
    
    input_df['is_offense'] = (input_df['player_side'] == 'Offense').astype(int)
    input_df['is_defense'] = (input_df['player_side'] == 'Defense').astype(int)
    # NOTE(review): the rest of the notebook uses the role label 'Targeted Receiver';
    # if 'Receiver' never occurs in the data this flag is always 0. Presumably the
    # saved LSTM models were trained with it as-is -- confirm before changing.
    input_df['is_receiver'] = (input_df['player_role'] == 'Receiver').astype(int)
    input_df['is_coverage'] = (input_df['player_role'] == 'Defensive Coverage').astype(int)
    input_df['is_passer'] = (input_df['player_role'] == 'Passer').astype(int)
    
    # Weight divided by 2.20462 (pounds -> kilograms, assuming weight is in pounds);
    # missing weights default to 200.
    mass_kg = input_df['player_weight'].fillna(200.0) / 2.20462
    input_df['momentum_x'] = input_df['velocity_x'] * mass_kg
    input_df['momentum_y'] = input_df['velocity_y'] * mass_kg
    
    from datetime import datetime
    # Age is measured against the wall clock, so this feature drifts with the run date.
    current_date = datetime.now()
    input_df['age'] = input_df['player_birth_date'].apply(
        lambda x: (current_date - datetime.strptime(x, '%Y-%m-%d')).days // 365 if pd.notnull(x) else None
    )
    
    input_df['kinetic_energy'] = 0.5 * mass_kg * (input_df['s'] ** 2)
    input_df['force'] = mass_kg * input_df['a']
    
    # Trailing statistics per player track. Rows are used in their current order
    # (the frame is only sorted by frame_id further down).
    input_df['rolling_mean_velocity_x'] = input_df.groupby(['game_id', 'play_id', 'nfl_id'])['velocity_x'].transform(
        lambda x: x.rolling(window=window_size, min_periods=1).mean()
    )
    input_df['rolling_std_acceleration'] = input_df.groupby(['game_id', 'play_id', 'nfl_id'])['a'].transform(
        lambda x: x.rolling(window=window_size, min_periods=1).std()
    )
    
    # Ball-relative features. They are listed unconditionally in feature_cols below,
    # so the later column selection raises KeyError when the ball columns are absent.
    if all(col in input_df.columns for col in ['ball_land_x', 'ball_land_y']):
        ball_dx = input_df['ball_land_x'] - input_df['x']
        ball_dy = input_df['ball_land_y'] - input_df['y']
        input_df['distance_to_ball'] = np.sqrt(ball_dx**2 + ball_dy**2)
        input_df['angle_to_ball'] = np.arctan2(ball_dy, ball_dx)
        # The 1e-6 guards against division by zero when the player is on the landing spot.
        input_df['ball_direction_x'] = ball_dx / (input_df['distance_to_ball'] + 1e-6)
        input_df['ball_direction_y'] = ball_dy / (input_df['distance_to_ball'] + 1e-6)
        # Velocity component along the unit vector toward the ball.
        input_df['closing_speed'] = (input_df['velocity_x'] * input_df['ball_direction_x'] +
                                     input_df['velocity_y'] * input_df['ball_direction_y'])
        # Time estimates: distance over a fixed speed of 20.0, and over |closing speed| (+0.1 to stay finite).
        input_df['estimated_time_to_ball'] = input_df['distance_to_ball'] / 20.0
        input_df['projected_time_to_ball'] = input_df['distance_to_ball'] / (np.abs(input_df['closing_speed']) + 0.1)
    
    input_df['heading_x'] = np.sin(dir_rad)
    input_df['heading_y'] = np.cos(dir_rad)
    input_df['acceleration_x'] = input_df['a'] * input_df['heading_x']
    input_df['acceleration_y'] = input_df['a'] * input_df['heading_y']
    input_df['accel_magnitude'] = np.sqrt(input_df['acceleration_x']**2 + input_df['acceleration_y']**2)
    
    # Pairwise player-interaction statistics, computed per (game, play, frame) snapshot.
    agg_rows = []
    for (g, p, f), grp in input_df.groupby(['game_id', 'play_id', 'frame_id'], sort=False):
        n = len(grp)
        nfl_ids = grp['nfl_id'].to_numpy()
        # A lone player has nobody to compare with: emit NaN for every statistic.
        if n < 2:
            for nid in nfl_ids:
                agg_rows.append({
                    'game_id': g, 'play_id': p, 'frame_id': f, 'nfl_id': nid,
                    'distance_to_player_mean_offense': np.nan, 'distance_to_player_min_offense': np.nan, 'distance_to_player_max_offense': np.nan,
                    'relative_velocity_magnitude_mean_offense': np.nan, 'relative_velocity_magnitude_min_offense': np.nan, 'relative_velocity_magnitude_max_offense': np.nan,
                    'angle_to_player_mean_offense': np.nan, 'angle_to_player_min_offense': np.nan, 'angle_to_player_max_offense': np.nan,
                    'distance_to_player_mean_defense': np.nan, 'distance_to_player_min_defense': np.nan, 'distance_to_player_max_defense': np.nan,
                    'relative_velocity_magnitude_mean_defense': np.nan, 'relative_velocity_magnitude_min_defense': np.nan, 'relative_velocity_magnitude_max_defense': np.nan,
                    'angle_to_player_mean_defense': np.nan, 'angle_to_player_min_defense': np.nan, 'angle_to_player_max_defense': np.nan,
                })
            continue
        x = grp['x'].to_numpy(dtype=np.float32)
        y = grp['y'].to_numpy(dtype=np.float32)
        vx = grp['velocity_x'].to_numpy(dtype=np.float32)
        vy = grp['velocity_y'].to_numpy(dtype=np.float32)
        is_offense = grp['is_offense'].to_numpy()
        is_defense = grp['is_defense'].to_numpy()
        # n x n matrices: dx[i, j] = x[j] - x[i], so arctan2(-dy, -dx) is the
        # direction of the vector pointing from player j to player i.
        dx = x[None, :] - x[:, None]
        dy = y[None, :] - y[:, None]
        angle_mat = np.arctan2(-dy, -dx)
        dist = np.sqrt(dx ** 2 + dy ** 2)
        # dvx[i, j] = vx[i] - vx[j]; rel_speed is the magnitude of the velocity difference.
        dvx = vx[:, None] - vx[None, :]
        dvy = vy[:, None] - vy[None, :]
        rel_speed = np.sqrt(dvx ** 2 + dvy ** 2)
        # Each mask selects pairs whose flag values are EQUAL (self-pairs removed).
        # NOTE(review): when is_offense and is_defense are complementary, both masks
        # are the same "same side" mask, making the *_offense and *_defense
        # statistics identical -- confirm whether that is intended.
        offense_mask = (is_offense[:, None] == is_offense[None, :])
        np.fill_diagonal(offense_mask, False)
        defense_mask = (is_defense[:, None] == is_defense[None, :])
        np.fill_diagonal(defense_mask, False)
        dist_diag_nan = dist.copy()
        np.fill_diagonal(dist_diag_nan, np.nan)
        rel_diag_nan = rel_speed.copy()
        np.fill_diagonal(rel_diag_nan, np.nan)
        angle_diag_nan = angle_mat.copy()
        np.fill_diagonal(angle_diag_nan, np.nan)
        # Row-wise mean / min / max over the entries selected by `mask`;
        # rows with no selected entry are set to NaN explicitly.
        def masked_stats(mat, mask):
            masked = np.where(mask, mat, np.nan)
            cnt = mask.sum(axis=1)
            mean = np.nanmean(masked, axis=1)
            amin = np.nanmin(masked, axis=1)
            amax = np.nanmax(masked, axis=1)
            zero = cnt == 0
            mean[zero] = np.nan; amin[zero] = np.nan; amax[zero] = np.nan
            return mean, amin, amax
        d_mean_o, d_min_o, d_max_o = masked_stats(dist_diag_nan, offense_mask)
        v_mean_o, v_min_o, v_max_o = masked_stats(rel_diag_nan, offense_mask)
        a_mean_o, a_min_o, a_max_o = masked_stats(angle_diag_nan, offense_mask)
        d_mean_d, d_min_d, d_max_d = masked_stats(dist_diag_nan, defense_mask)
        v_mean_d, v_min_d, v_max_d = masked_stats(rel_diag_nan, defense_mask)
        a_mean_d, a_min_d, a_max_d = masked_stats(angle_diag_nan, defense_mask)
        for idx, nid in enumerate(nfl_ids):
            agg_rows.append({
                'game_id': g, 'play_id': p, 'frame_id': f, 'nfl_id': nid,
                'distance_to_player_mean_offense': d_mean_o[idx], 'distance_to_player_min_offense': d_min_o[idx], 'distance_to_player_max_offense': d_max_o[idx],
                'relative_velocity_magnitude_mean_offense': v_mean_o[idx], 'relative_velocity_magnitude_min_offense': v_min_o[idx], 'relative_velocity_magnitude_max_offense': v_max_o[idx],
                'angle_to_player_mean_offense': a_mean_o[idx], 'angle_to_player_min_offense': a_min_o[idx], 'angle_to_player_max_offense': a_max_o[idx],
                'distance_to_player_mean_defense': d_mean_d[idx], 'distance_to_player_min_defense': d_min_d[idx], 'distance_to_player_max_defense': d_max_d[idx],
                'relative_velocity_magnitude_mean_defense': v_mean_d[idx], 'relative_velocity_magnitude_min_defense': v_min_d[idx], 'relative_velocity_magnitude_max_defense': v_max_d[idx],
                'angle_to_player_mean_defense': a_mean_d[idx], 'angle_to_player_min_defense': a_min_d[idx], 'angle_to_player_max_defense': a_max_d[idx],
            })
    interaction_agg = pd.DataFrame(agg_rows)
    input_df = input_df.merge(interaction_agg, on=['game_id', 'play_id', 'frame_id', 'nfl_id'], how='left')
    
    # Order frames chronologically within each player, then index by the player key
    # (frame_id stays a regular column and is itself a feature).
    input_df = input_df.sort_values(['game_id', 'play_id', 'nfl_id', 'frame_id'])
    input_df.set_index(['game_id', 'play_id', 'nfl_id'], inplace=True)
    
    # Column order defines the feature order of every sequence.
    # NOTE(review): the seven ball-relative names appear twice, so each sequence
    # carries those columns in duplicate. Presumably the fitted scalers / models
    # expect exactly this width -- confirm before de-duplicating.
    feature_cols = [
        'x','y','s','a','o','dir','frame_id','ball_land_x','ball_land_y',
        'absolute_yardline_number',
        'player_height_feet','player_weight',
        'velocity_x','velocity_y',
        'momentum_x','momentum_y',
        'is_offense','is_defense','is_receiver','is_coverage','is_passer',
        'age','kinetic_energy','force',
        'rolling_mean_velocity_x','rolling_std_acceleration',
        'heading_x','heading_y','acceleration_x','acceleration_y','accel_magnitude',
        'distance_to_ball','angle_to_ball','ball_direction_x','ball_direction_y',
        'closing_speed','estimated_time_to_ball','projected_time_to_ball',
        'distance_to_ball','angle_to_ball','ball_direction_x','ball_direction_y',
        'closing_speed','estimated_time_to_ball','projected_time_to_ball',
        'distance_to_player_mean_offense','distance_to_player_min_offense','distance_to_player_max_offense',
        'relative_velocity_magnitude_mean_offense','relative_velocity_magnitude_min_offense','relative_velocity_magnitude_max_offense',
        'angle_to_player_mean_offense','angle_to_player_min_offense','angle_to_player_max_offense',
        'distance_to_player_mean_defense','distance_to_player_min_defense','distance_to_player_max_defense',
        'relative_velocity_magnitude_mean_defense','relative_velocity_magnitude_min_defense','relative_velocity_magnitude_max_defense',
        'angle_to_player_mean_defense','angle_to_player_min_defense','angle_to_player_max_defense'
    ]
    
    grouped_input = input_df.groupby(level=['game_id', 'play_id', 'nfl_id'])
    target_groups = test_template[['game_id', 'play_id', 'nfl_id']].drop_duplicates()
    
    sequences, sequence_ids = [], []
    
    for _, row in target_groups.iterrows():
        key = (row['game_id'], row['play_id'], row['nfl_id'])
        try:
            group_df = grouped_input.get_group(key)
        except KeyError:
            # No tracking rows for this player: no sequence is produced for it.
            continue
        
        input_window = group_df.tail(window_size)
        
        # Short tracks are front-padded with NaN rows so every window has window_size rows.
        if len(input_window) < window_size:
            pad_length = window_size - len(input_window)
            pad_df = pd.DataFrame(np.nan, index=range(pad_length), columns=input_window.columns)
            input_window = pd.concat([pad_df, input_window], ignore_index=True).reset_index(drop=True)
        
        seq = input_window[feature_cols].values
        
        # Padding rows and any missing feature values become 0.
        if np.isnan(seq.astype(np.float32)).any():
            seq = np.nan_to_num(seq, nan=0.0)
        
        sequences.append(seq)
        
        # Record the frame_id of the final row of the window alongside the keys.
        last_frame_id = input_window['frame_id'].iloc[-1]
        sequence_ids.append({
            'game_id': key[0],
            'play_id': key[1],
            'nfl_id': key[2],
            'frame_id': last_frame_id
        })
    
    return sequences, sequence_ids

In [12]:
def predict_lstm(models_x, models_y, scalers, test_input, test_template):
    """Generate x/y predictions from an ensemble of sequence models.

    Each fold supplies a model for the x displacement, a model for the y
    displacement and the scaler applied to its input windows. Every model is
    called on a batch of shape ``(batch, window, features)`` and must return
    an array of shape ``(batch, horizon)``; column ``t`` is added to the
    player's last observed position to give the prediction for the ``t``-th
    requested frame. Frames beyond the horizon reuse the last column.

    Parameters
    ----------
    models_x, models_y : sequence of callables (typically ``torch.nn.Module``)
        Per-fold displacement models. Modules are moved to the inference
        device and switched to eval mode.
    scalers : sequence
        Per-fold objects exposing ``transform`` for a 2-D window.
    test_input : pandas.DataFrame
        Raw per-frame tracking rows.
    test_template : pandas.DataFrame
        Rows to predict; must contain ``game_id``, ``play_id``, ``nfl_id`` and
        ``frame_id``.

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` (``game_play_nfl_frame``), ``x`` and ``y``. Players
        without input rows get no predictions; the frame is empty when no
        sequence could be built at all.
    """
    sequences, seq_ids = prepare_lstm_sequences(test_input, test_template, Config.LSTM_WINDOW_SIZE)
    if not sequences:
        return pd.DataFrame(columns=['id', 'x', 'y'])

    # All windows share one shape, so they stack into (n_sequences, window, features).
    X_test_unscaled = np.stack([np.asarray(seq, dtype=np.float64) for seq in sequences])

    # Feature columns 0 and 1 are x and y; the last row is the latest observed frame.
    x_last = X_test_unscaled[:, -1, 0].astype(np.float32)
    y_last = X_test_unscaled[:, -1, 1].astype(np.float32)

    per_model_dx, per_model_dy = [], []
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    for model_x, model_y, scaler in zip(models_x, models_y, scalers):
        # The batch is moved to `device`, so the weights must live there too,
        # and dropout / batch-norm must run in inference mode.
        for net in (model_x, model_y):
            if isinstance(net, torch.nn.Module):
                net.to(device)
                net.eval()

        stacked = np.stack([scaler.transform(seq) for seq in X_test_unscaled]).astype(np.float32)
        test_dataset = torch.utils.data.TensorDataset(torch.from_numpy(stacked))
        loader = torch.utils.data.DataLoader(test_dataset, batch_size=1024, shuffle=False)

        dx_list, dy_list = [], []
        with torch.no_grad():
            for (batch,) in loader:
                batch = batch.to(device)
                dx_list.append(model_x(batch).cpu().numpy())
                dy_list.append(model_y(batch).cpu().numpy())

        per_model_dx.append(np.vstack(dx_list))
        per_model_dy.append(np.vstack(dy_list))

    # Average the folds.
    ens_dx = np.mean(np.stack(per_model_dx, axis=0), axis=0)
    ens_dy = np.mean(np.stack(per_model_dy, axis=0), axis=0)
    horizon = ens_dx.shape[1]

    # Group the requested frames once instead of re-filtering the whole template per player.
    frames_by_player = {
        key: frames.sort_values().tolist()
        for key, frames in test_template.groupby(['game_id', 'play_id', 'nfl_id'])['frame_id']
    }

    out_rows = []
    for i, seq_info in enumerate(seq_ids):
        game_id = int(seq_info['game_id'])
        play_id = int(seq_info['play_id'])
        nfl_id = int(seq_info['nfl_id'])

        for t, frame_id in enumerate(frames_by_player.get((game_id, play_id, nfl_id), [])):
            # Past the model horizon, hold the last predicted displacement.
            step = t if t < horizon else -1
            out_rows.append({
                'id': f"{game_id}_{play_id}_{nfl_id}_{frame_id}",
                'x': x_last[i] + ens_dx[i, step],
                'y': y_last[i] + ens_dy[i, step]
            })

    predictions = pd.DataFrame(out_rows)
    return predictions

In [13]:
def create_models(model_type):
    """Return a fresh, unfitted pair of regressors for the x and y targets.

    ``model_type`` is one of ``'catboost'``, ``'random_forest'`` or
    ``'gradient_boosting'``. The result is a dict with the keys ``'model_x'``
    and ``'model_y'``. Any other value raises ``ValueError``.
    """
    if model_type == 'catboost':
        return {
            'model_x': CatBoostRegressor(),
            'model_y': CatBoostRegressor(),
        }

    if model_type == 'random_forest':
        return {
            'model_x': RandomForestRegressor(n_estimators=100, random_state=42),
            'model_y': RandomForestRegressor(n_estimators=100, random_state=42),
        }

    if model_type == 'gradient_boosting':
        return {
            'model_x': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
            'model_y': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
        }

    raise ValueError(f"Unsupported model type: {model_type}")

In [14]:
# Add the engineered per-frame features, then pair every target frame with the
# features of that player's last input frame.
input_df = engineer_catboost_features(df=input_df)
train_df = create_training_dataset(input_df=input_df, output_df=output_df)

In [15]:
# Candidate features: raw tracking values and engineered columns first ...
feature_cols = [
    'x', 'y', 's', 'a', 'o', 'dir',
    'velocity_x', 'velocity_y', 'dist_to_ball', 'angle_to_ball',
    'velocity_toward_ball', 'time_to_ball', 'orientation_diff',
    'role_targeted_receiver', 'role_defensive_coverage', 'role_passer',
    'side_offense', 'height_inches', 'player_weight', 'bmi',
    'ball_land_x', 'ball_land_y', 'num_frames_output', 'frame_id',
    'acceleration_x', 'acceleration_y', 'distance_to_target_x', 'distance_to_target_y',
    'speed_squared', 'accel_magnitude', 'velocity_alignment',
    'expected_x_at_ball', 'expected_y_at_ball',
    'error_from_ball_x', 'error_from_ball_y', 'error_from_ball',
    'momentum_x', 'momentum_y', 'kinetic_energy',
    'angle_diff', 'time_squared', 'dist_squared', 'weighted_dist_by_time',
]

# ... followed by the lag, rolling-window and delta names, in the order
# add_sequence_features_catboost creates them.
feature_cols += [
    f'{name}_lag{lag}'
    for lag in (1, 2, 3, 4, 5)
    for name in ('x', 'y', 'velocity_x', 'velocity_y', 's', 'a')
]
feature_cols += [
    f'{name}_rolling_{stat}_{window}'
    for window in (3, 5)
    for name in ('x', 'y', 'velocity_x', 'velocity_y', 's')
    for stat in ('mean', 'std')
]
feature_cols += ['velocity_x_delta', 'velocity_y_delta']

# Keep only the candidates that actually exist in the training table.
available_features = [name for name in feature_cols if name in train_df.columns]
print(f"Available features: {len(available_features)}")

# Drop rows with a missing feature or target.
train_df = train_df.dropna(subset=[*available_features, 'target_x', 'target_y'])

Available features: 43


In [25]:
from sklearn.model_selection import train_test_split

In [33]:
# Feature matrix and the two-column target frame.
X = train_df[available_features].to_numpy()
y = train_df[['target_x', 'target_y']]

# Hold out 10% of the rows for validation (row-level random split, fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)

# One 1-D target array per coordinate, for the two separate regressors.
y_x_train, y_y_train = (y_train[name].to_numpy() for name in ('target_x', 'target_y'))

(506642, 43) (506642, 2)
(56294, 43) (56294, 2)


In [34]:
# Random forest for the x coordinate: 100 trees, fixed seed, all CPU cores.
model_x = RandomForestRegressor(
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
)
model_x.fit(X_train, y_x_train)

In [None]:
# Random forest for the y coordinate: 100 trees, fixed seed, all CPU cores.
model_y = RandomForestRegressor(
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
)
model_y.fit(X_train, y_y_train)

In [None]:
def run_evaluation(y_x_pred, y_y_pred, y_actual):
    """Score x/y predictions against the true coordinates.

    Parameters
    ----------
    y_x_pred, y_y_pred : array-like of shape (n_samples,)
        Predicted x and y coordinates.
    y_actual : array-like of shape (n_samples, 2)
        True coordinates, x in the first column and y in the second.

    Returns
    -------
    dict
        ``mse``, ``mae``, ``msle``, ``mape``, ``mpd`` (mean Poisson deviance)
        and ``mgd`` (mean Gamma deviance). The two deviances are computed per
        coordinate and averaged, because ``mean_tweedie_deviance`` does not
        accept multi-output targets.

    Raises
    ------
    ValueError
        If the stacked predictions and ``y_actual`` differ in shape. The
        scikit-learn metrics also reject values outside their domain (e.g.
        negative values for ``msle``, non-positive predictions for the
        deviances).
    """
    y_true = np.asarray(y_actual, dtype=float)
    # Predictions must be laid out as (n_samples, 2) to line up with the targets;
    # np.stack would have produced (2, n_samples).
    y_pred = np.column_stack((y_x_pred, y_y_pred)).astype(float)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"Shape mismatch: y_actual is {y_true.shape}, predictions are {y_pred.shape}"
        )

    def _mean_deviance(power):
        # Average the single-output deviance over the coordinate columns.
        return float(np.mean([
            mean_tweedie_deviance(y_true[:, k], y_pred[:, k], power=power)
            for k in range(y_true.shape[1])
        ]))

    result = {}
    result['mse'] = mean_squared_error(y_true, y_pred)
    result['mae'] = mean_absolute_error(y_true, y_pred)
    result['msle'] = mean_squared_log_error(y_true, y_pred)
    result['mape'] = mean_absolute_percentage_error(y_true, y_pred)
    result['mpd'] = _mean_deviance(1)
    result['mgd'] = _mean_deviance(2)
    return result

In [21]:
# Read the test tracking data and the list of rows to predict.
test_input = pd.read_csv(Config.DATA_DIR / "test_input.csv")
test_template = pd.read_csv(Config.DATA_DIR / "test.csv")

# predict_catboost averages over lists of models, so the single fitted
# regressors are wrapped in one-element lists. Passing a bare RandomForest
# would make it iterate over the individual trees instead.
catboost_pred = predict_catboost([model_x], [model_y], available_features, test_input, test_template)

In [24]:
# Write only the id / x / y columns; the default would add the row index as an extra unnamed column.
catboost_pred.to_csv('submission.csv', index=False)

In [None]:
# def main():
#     print("Loading test data...")
#     test_input = pd.read_csv(Config.DATA_DIR / "test_input.csv")
#     test_template = pd.read_csv(Config.DATA_DIR / "test.csv")
#     print(f"Test input: {test_input.shape[0]:,} rows, {test_input.shape[1]} columns")
#     print(f"Test template: {test_template.shape[0]:,} predictions required")
    
#     print("\nCreate models...")
#     models_x_cat, models_y_cat, features_cat = load_catboost_models(Config.CATBOOST_MODEL_PATH)
#     print(f"CatBoost: {len(models_x_cat)} folds, {len(features_cat)} features")
    
#     print("\nGenerating CatBoost predictions...")
#     catboost_pred = predict_catboost(models_x_cat, models_y_cat, features_cat, test_input, test_template)
#     print(f"CatBoost predictions: {len(catboost_pred):,}")
#     print(f"  X range: [{catboost_pred['x'].min():.2f}, {catboost_pred['x'].max():.2f}]")
#     print(f"  Y range: [{catboost_pred['y'].min():.2f}, {catboost_pred['y'].max():.2f}]")
    
#     print("\nGenerating LSTM predictions...")
#     lstm_pred = predict_lstm(models_x_lstm, models_y_lstm, scalers_lstm, test_input, test_template)
#     print(f"LSTM predictions: {len(lstm_pred):,}")
#     print(f"  X range: [{lstm_pred['x'].min():.2f}, {lstm_pred['x'].max():.2f}]")
#     print(f"  Y range: [{lstm_pred['y'].min():.2f}, {lstm_pred['y'].max():.2f}]")
    
#     print("\nCreating ensemble...")
#     print(f"\nEnsemble predictions: {len(catboost_pred):,}")
#     print(f"  X range: [{ensemble_pred['x'].min():.2f}, {ensemble_pred['x'].max():.2f}]")
#     print(f"  Y range: [{ensemble_pred['y'].min():.2f}, {ensemble_pred['y'].max():.2f}]")
#     print(f"  Mean X: {ensemble_pred['x'].mean():.2f}, Std X: {ensemble_pred['x'].std():.2f}")
#     print(f"  Mean Y: {ensemble_pred['y'].mean():.2f}, Std Y: {ensemble_pred['y'].std():.2f}")
#     print(f"  NaN values: {ensemble_pred.isnull().sum().sum()}")
    
#     ensemble_pred.to_csv('submission.csv', index=False)
#     print("\nSubmission file created successfully")

# if __name__ == "__main__":
#     main()

In [None]:
# model_x = CatBoostRegressor(
#             iterations=1000,
#             learning_rate=0.05,
#             depth=10,
#             l2_leaf_reg=3.0,
#             random_seed=42, 
#             task_type='GPU',
#             devices='0',
#             early_stopping_rounds=500,
#             verbose=200,
#             loss_function='RMSE'
#         )
        
# model_x.fit(
#             X,
#             eval_set=y_x,
#             verbose=200
#         )