In [None]:
import os
# Diagnostic: dump the directory tree under /kaggle/input so that missing or
# misnamed datasets are easy to spot in the notebook output.
print("=== SYSTEM DIAGNOSTIC ===")
print("Listing ALL files in /kaggle/input:")
if not os.path.exists('/kaggle/input'):
    print("/kaggle/input does not exist (Are you running locally?)")
else:
    found_any = False
    for root, dirs, files in os.walk('/kaggle/input'):
        # Depth below /kaggle/input drives the indentation of this directory.
        level = root.replace('/kaggle/input', '').count(os.sep)
        indent = ' ' * 4 * (level)
        subindent = ' ' * 4 * (level + 1)
        print(f"{indent}{os.path.basename(root)}/")
        for f in files:
            print(f"{subindent}{f}")
        # Remember whether at least one regular file was seen anywhere.
        found_any = found_any or bool(files)
    if not found_any:
        print("  (Directory is empty)")
print("=========================")

import numpy as np
import pandas as pd
import xgboost as xgb
import joblib
import json
import sys
import torch
from pathlib import Path
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

# Configuration
CONFIG = {
    # 1. COMPETITION DATA: the directory that contains test.csv.
    # Leave unchanged when the 'NFL Big Data Bowl 2026' dataset is attached.
    'data_dir': '/kaggle/input/nfl-big-data-bowl-2026-prediction',

    # 2. YOUR MODELS: the uploaded 'models' folder.
    # Adjust to the path of your own dataset, typically
    # '/kaggle/input/YOUR-DATASET-NAME/models/xgboost'.
    'models_dir': '/kaggle/input/my-data-set-2/models/xgboost',

    # Use the GPU when torch reports one, otherwise the CPU.
    'device': 'cuda' if torch.cuda.is_available() else 'cpu'
}

# Fall back to searching /kaggle/input when the configured models_dir is absent.
if not Path(CONFIG['models_dir']).exists():
    print(f"⚠️ Default models_dir '{CONFIG['models_dir']}' not found. Searching...")
    # A known params file marks the models directory
    # (expected layout: .../models/xgboost/xgboost_params.json).
    possible_models = list(Path('/kaggle/input').glob('**/xgboost_params.json'))
    if not possible_models:
        print("❌ Could not auto-detect models directory. Please check 'models_dir' path.")
    else:
        found_dir = possible_models[0].parent
        CONFIG['models_dir'] = str(found_dir)
        print(f"✅ Auto-detected models_dir at: {CONFIG['models_dir']}")

# Same fallback for the competition data: take the first direct child of
# /kaggle/input that contains a test.csv.
if not Path(CONFIG['data_dir']).exists():
    print(f"⚠️ Default data_dir '{CONFIG['data_dir']}' not found. Searching...")
    input_root = Path('/kaggle/input')
    candidates = input_root.iterdir() if input_root.exists() else ()
    detected = next(
        (entry for entry in candidates
         if entry.is_dir() and (entry / 'test.csv').exists()),
        None,
    )
    found_data = detected is not None
    if found_data:
        CONFIG['data_dir'] = str(detected)
        print(f"✅ Auto-detected competition data at: {CONFIG['data_dir']}")
    else:
        print("❌ Could not auto-detect competition data. Please check 'Add Data'.")

# Feature columns (must match the columns used at training time)
FEATURE_COLS = [
    # raw tracking state
    'x', 'y', 's', 'a', 'dir', 'o',
    # ball landing point and distance to it
    'ball_land_x', 'ball_land_y', 'dist_to_ball_land',
    # velocity components
    'v_x', 'v_y',
    # label-encoded categoricals
    'player_position_encoded', 'player_side_encoded', 'player_role_encoded',
    # player attributes
    'player_height_inches', 'player_weight', 'player_age',
    # play context
    'play_direction_binary', 'absolute_yardline_number'
]


# Report straight away whether every configured directory exists.
print('Checking configuration paths...')
for key, path in CONFIG.items():
    if not key.endswith('_dir'):
        continue
    if Path(path).exists():
        print(f'✅ {key} found: {path}')
    else:
        print(f'❌ {key} NOT found: {path}')

print('Cell 1 (Imports and Config) executed successfully')

"""
Data Loading and Preprocessing Module for NFL Big Data Bowl 2026
This module handles loading, cleaning, and preprocessing of NFL tracking data.
"""

import pandas as pd
import numpy as np
from pathlib import Path
from typing import Tuple, List, Dict
import warnings
warnings.filterwarnings('ignore')


class NFLDataLoader:
    """
    Handles loading and preprocessing of NFL tracking data.

    The data consists of:
    - Input files: Player tracking data BEFORE the pass is thrown
    - Output files: Player positions AFTER the pass (targets to predict)
    """

    # Columns that together identify one player's track within one play.
    _SEQUENCE_KEYS = ['game_id', 'play_id', 'nfl_id']

    def __init__(self, data_dir: str = '/kaggle/input/nfl-big-data-bowl-2026-prediction'):
        """
        Initialize the data loader.

        Args:
            data_dir: Path to the directory containing the competition data
        """
        self.data_dir = Path(data_dir)
        self.train_dir = self.data_dir / 'train'

    def load_week_data(self, week: int) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Load input and output data for a specific week.

        Args:
            week: Week number (1-18)

        Returns:
            Tuple of (input_df, output_df)

        Raises:
            FileNotFoundError: If either CSV file of that week is missing.
        """
        input_file = self.train_dir / f'input_2023_w{week:02d}.csv'
        output_file = self.train_dir / f'output_2023_w{week:02d}.csv'

        input_df = pd.read_csv(input_file)
        output_df = pd.read_csv(output_file)

        return input_df, output_df

    def load_all_training_data(self, weeks: List[int] = None) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Load and concatenate data from multiple weeks.

        Weeks whose files are missing are reported and skipped.

        Args:
            weeks: List of week numbers to load. If None, loads all weeks 1-18.

        Returns:
            Tuple of (combined_input_df, combined_output_df)

        Raises:
            ValueError: If none of the requested weeks could be loaded.
        """
        # Materialise so that any iterable (range, generator, ...) has a length.
        weeks = list(range(1, 19)) if weeks is None else list(weeks)

        input_dfs = []
        output_dfs = []

        print(f"Loading data from {len(weeks)} weeks...")
        for week in weeks:
            try:
                input_df, output_df = self.load_week_data(week)
            except FileNotFoundError:
                print(f"  Week {week}: Files not found, skipping...")
                continue
            input_dfs.append(input_df)
            output_dfs.append(output_df)
            print(f"  Week {week}: {len(input_df)} input rows, {len(output_df)} output rows")

        if not input_dfs:
            # pd.concat([]) would raise an opaque "No objects to concatenate";
            # keep the exception type but say what is actually wrong.
            raise ValueError(
                f"No training data found in {self.train_dir} for weeks {weeks}"
            )

        combined_input = pd.concat(input_dfs, ignore_index=True)
        combined_output = pd.concat(output_dfs, ignore_index=True)

        print(f"\nTotal: {len(combined_input)} input rows, {len(combined_output)} output rows")
        return combined_input, combined_output

    def preprocess_input_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Preprocess input tracking data.

        Works on a copy of ``df``. ``play_direction`` is required; every other
        derived column is only added when its source columns are present.

        Args:
            df: Raw input dataframe

        Returns:
            Preprocessed dataframe
        """
        df = df.copy()

        # Convert play_direction to binary (left=0, right=1)
        df['play_direction_binary'] = (df['play_direction'] == 'right').astype(int)

        # Parse player height to inches
        if 'player_height' in df.columns:
            df['player_height_inches'] = df['player_height'].apply(self._height_to_inches)

        # Calculate age (in years) relative to the fixed reference date 2023-01-01
        if 'player_birth_date' in df.columns:
            df['player_birth_date'] = pd.to_datetime(df['player_birth_date'], errors='coerce')
            df['player_age'] = (pd.Timestamp('2023-01-01') - df['player_birth_date']).dt.days / 365.25

        # Encode categorical variables
        df = self._encode_categorical_features(df)

        # Calculate distance to ball landing location
        if 'ball_land_x' in df.columns and 'ball_land_y' in df.columns:
            df['dist_to_ball_land'] = np.sqrt(
                (df['x'] - df['ball_land_x'])**2 +
                (df['y'] - df['ball_land_y'])**2
            )

        # Calculate velocity components.
        # NOTE(review): this uses cos for x and sin for y, while the submission
        # `predict` function in this file uses sin for x and cos for y. Left
        # unchanged because trained models depend on these exact features;
        # confirm the `dir` convention before retraining.
        if 's' in df.columns and 'dir' in df.columns:
            heading = np.radians(df['dir'])
            df['v_x'] = df['s'] * np.cos(heading)
            df['v_y'] = df['s'] * np.sin(heading)

        return df

    def _height_to_inches(self, height_str: str) -> float:
        """Convert height string (e.g., '6-2') to inches; NaN if it cannot be parsed."""
        try:
            if pd.isna(height_str):
                return np.nan
            feet, inches = height_str.split('-')
            return int(feet) * 12 + int(inches)
        except (AttributeError, TypeError, ValueError):
            # Non-string input, wrong number of parts or non-numeric parts.
            # Deliberately narrower than a bare except so that KeyboardInterrupt
            # and SystemExit are not swallowed.
            return np.nan

    def _encode_categorical_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Encode categorical features using label encoding (modifies ``df`` in place).

        NOTE(review): the codes are derived from the values present in ``df``,
        so two dataframes containing different category sets can map the same
        label to different codes -- verify train/inference consistency.
        """
        categorical_cols = ['player_position', 'player_side', 'player_role']

        for col in categorical_cols:
            if col in df.columns:
                df[f'{col}_encoded'] = pd.Categorical(df[col]).codes

        return df

    def create_play_sequences(self, input_df: pd.DataFrame, output_df: pd.DataFrame) -> Dict:
        """
        Organize data into sequences for each play and player.

        Only players that appear in both dataframes are kept.

        Args:
            input_df: Preprocessed input dataframe
            output_df: Output dataframe with targets

        Returns:
            Dictionary mapping (game_id, play_id, nfl_id) to a dict with the
            frame-sorted 'input' and 'output' frames plus 'player_to_predict'
            and 'num_frames_output'.
        """
        # Group the targets once instead of masking the whole output frame
        # for every single player.
        output_groups = {
            key: frames for key, frames in output_df.groupby(self._SEQUENCE_KEYS)
        }
        has_predict_flag = 'player_to_predict' in input_df.columns
        has_frame_count = 'num_frames_output' in input_df.columns

        sequences = {}
        for key, group in input_df.groupby(self._SEQUENCE_KEYS):
            output_group = output_groups.get(key)
            if output_group is None:
                # No targets for this player in this play.
                continue

            # Sort by frame_id to ensure temporal order
            group = group.sort_values('frame_id')
            output_group = output_group.sort_values('frame_id')

            sequences[key] = {
                'input': group,
                'output': output_group,
                'player_to_predict': group['player_to_predict'].iloc[0] if has_predict_flag else True,
                'num_frames_output': group['num_frames_output'].iloc[0] if has_frame_count else len(output_group)
            }

        return sequences


class FeatureEngineering:
    """
    Advanced feature engineering for NFL tracking data.
    """

    @staticmethod
    def add_temporal_features(df: pd.DataFrame) -> pd.DataFrame:
        """
        Add temporal features based on frame sequences.

        For every (game_id, play_id, nfl_id) track the row-to-row differences
        of x, y, s (and v_x, v_y when present) are stored as dx, dy, ds, dv_x
        and dv_y. The first row of each track, and therefore every
        single-frame track, gets 0. Differences follow the row order of
        ``df``; rows are assumed to already be in frame order within a track.

        Args:
            df: Input dataframe with frame_id

        Returns:
            Dataframe with additional temporal features
        """
        df = df.copy()

        velocity_cols = [col for col in ('v_x', 'v_y') if col in df.columns]

        # One grouped diff for all columns replaces a Python loop over tracks.
        deltas = (
            df.groupby(['game_id', 'play_id', 'nfl_id'], sort=False)[['x', 'y', 's'] + velocity_cols]
            .diff()
            .fillna(0)
        )

        # Calculate changes over time (velocity approximation)
        df['dx'] = deltas['x']
        df['dy'] = deltas['y']
        df['ds'] = deltas['s']

        # Acceleration approximation; 0 when no velocity component is available
        for source, target in (('v_x', 'dv_x'), ('v_y', 'dv_y')):
            df[target] = deltas[source] if source in deltas.columns else 0.0

        return df

    @staticmethod
    def add_interaction_features(df: pd.DataFrame) -> pd.DataFrame:
        """
        Add features representing player interactions.

        For each frame of each play, every player gets the distance to the
        nearest player with a different ``player_side`` and the mean distance
        to all such players. Players without any opponent in the frame are
        left untouched (NaN). Nothing is added when ``player_side`` is absent.

        Args:
            df: Input dataframe

        Returns:
            Dataframe with interaction features
        """
        df = df.copy()

        if 'player_side' not in df.columns:
            return df

        for _, group in df.groupby(['game_id', 'play_id', 'frame_id']):
            xs = group['x'].to_numpy(dtype=float)
            ys = group['y'].to_numpy(dtype=float)
            sides = group['player_side'].to_numpy(dtype=object)

            # opponent[i, j] is True when player j is on another side than player i
            opponent = sides[:, None] != sides[None, :]
            has_opponent = opponent.any(axis=1)
            if not has_opponent.any():
                continue

            # Full pairwise distance matrix of the frame; non-opponents masked out
            pairwise = np.sqrt(
                (xs[:, None] - xs[None, :])**2 +
                (ys[:, None] - ys[None, :])**2
            )
            pairwise = np.where(opponent, pairwise, np.nan)[has_opponent]

            rows = group.index[has_opponent]
            df.loc[rows, 'dist_to_nearest_opponent'] = np.nanmin(pairwise, axis=1)
            df.loc[rows, 'avg_dist_to_opponents'] = np.nanmean(pairwise, axis=1)

        return df

    @staticmethod
    def add_physics_features(df: pd.DataFrame) -> pd.DataFrame:
        """
        Add physics-based features for trajectory prediction.

        Args:
            df: Input dataframe

        Returns:
            Dataframe with physics features (time_to_ball, angle_to_ball and
            angle_diff_to_ball, each only when its source columns exist)
        """
        df = df.copy()

        # Time to reach ball landing location (assuming constant velocity)
        if 'dist_to_ball_land' in df.columns and 's' in df.columns:
            df['time_to_ball'] = df['dist_to_ball_land'] / (df['s'] + 1e-6)  # Add small epsilon to avoid division by zero

        # Direction towards ball landing location (radians, from arctan2)
        if 'ball_land_x' in df.columns and 'ball_land_y' in df.columns:
            df['angle_to_ball'] = np.arctan2(
                df['ball_land_y'] - df['y'],
                df['ball_land_x'] - df['x']
            )

            # Angle difference between current direction and ball direction,
            # wrapped into [0, pi] so that e.g. 350 deg vs -10 deg counts as
            # "same direction" instead of almost a full turn.
            # NOTE(review): `dir` is converted with np.radians and compared
            # directly with the arctan2 angle; confirm both use the same zero
            # direction and rotation sense.
            if 'dir' in df.columns:
                raw_diff = np.radians(df['dir']) - df['angle_to_ball']
                df['angle_diff_to_ball'] = np.abs((raw_diff + np.pi) % (2 * np.pi) - np.pi)

        return df


def prepare_model_data(sequences: Dict, feature_cols: List[str]) -> Tuple[List[np.ndarray], List[np.ndarray], List]:
    """
    Prepare data for model training.

    Sequences whose 'player_to_predict' entry is falsy are dropped. Sequences
    lacking one of the requested columns are reported and dropped as well.

    Args:
        sequences: Dictionary of play sequences; each value provides the keys
            'input', 'output' and 'player_to_predict'
        feature_cols: List of feature column names to use

    Returns:
        Tuple of (X, y, sequence_keys) where:
        - X: list with one (n_frames, n_features) array per kept sequence
        - y: list with one (n_output_frames, 2) array of x/y targets per kept sequence
        - sequence_keys: list of the dictionary keys of the kept sequences

        The per-sequence arrays are NOT stacked into a single array; they are
        appended as-is, so their frame counts need not agree.
    """
    X_list = []
    y_list = []
    keys_list = []

    for key, seq_data in sequences.items():
        # Only include sequences where player should be predicted
        if not seq_data['player_to_predict']:
            continue

        # Extract both arrays before appending anything so that a missing
        # column can never leave the three lists out of step.
        try:
            features = seq_data['input'][feature_cols].values
            targets = seq_data['output'][['x', 'y']].values
        except KeyError as e:
            print(f"Warning: Missing feature {e} for sequence {key}")
            continue

        X_list.append(features)
        y_list.append(targets)
        keys_list.append(key)

    return X_list, y_list, keys_list

print('Cell 2 (Data Loader) executed successfully')

"""
XGBoost-based Model for NFL Player Trajectory Prediction
This model uses gradient boosting to predict future positions frame by frame.
"""

import numpy as np
import pandas as pd
from typing import List, Tuple, Dict
import warnings
warnings.filterwarnings('ignore')

# Optional-dependency guard: record in XGBOOST_AVAILABLE whether xgboost and the
# scikit-learn helpers could be imported, and print a warning instead of raising
# when they cannot.
# NOTE(review): the configuration cell earlier in this file already runs an
# unguarded `import xgboost as xgb`, so a missing xgboost would fail there first.
try:
    import xgboost as xgb
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import mean_squared_error
    XGBOOST_AVAILABLE = True
except ImportError:
    XGBOOST_AVAILABLE = False
    print("Warning: XGBoost not available.")


class XGBoostTrajectoryModel:
    """
    XGBoost model for trajectory prediction.

    Strategy: Train separate models for x and y coordinates at each future time step.
    Frame offsets without a trained model pair are predicted with a
    constant-velocity extrapolation instead.
    """

    def __init__(self, max_future_frames=30, **xgb_params):
        """
        Args:
            max_future_frames: Maximum number of future frames to predict
            xgb_params: Parameters for XGBoost model; they override the defaults below
        """
        self.max_future_frames = max_future_frames
        self.models_x = {}  # frame_offset -> model for the x coordinate
        self.models_y = {}  # frame_offset -> model for the y coordinate

        # Default XGBoost parameters
        self.xgb_params = {
            'objective': 'reg:squarederror',
            'max_depth': 6,
            'learning_rate': 0.1,
            'n_estimators': 100,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'random_state': 42,
            'n_jobs': -1
        }

        # Tuned parameters are not auto-loaded here; they arrive either through
        # load_models() or through the explicit keyword arguments.
        self.xgb_params.update(xgb_params)

    def create_features_for_frame(self, input_df: pd.DataFrame,
                                  frame_offset: int) -> pd.DataFrame:
        """
        Create features for predicting a specific future frame.

        Works for a single row as well as for a batch of rows.

        Args:
            input_df: Input tracking data
            frame_offset: Which future frame to predict (1, 2, 3, ...)

        Returns:
            Copy of ``input_df`` with 'frame_offset' and 'time_to_frame' added,
            plus the constant-velocity columns when their inputs are present
        """
        features = input_df.copy()

        # Add frame offset as a feature
        features['frame_offset'] = frame_offset

        # Calculate time to target frame (in seconds)
        features['time_to_frame'] = frame_offset / 10.0  # 10 fps

        # Predicted position based on constant velocity
        if 'v_x' in features.columns and 'v_y' in features.columns:
            features['predicted_x_cv'] = features['x'] + features['v_x'] * features['time_to_frame']
            features['predicted_y_cv'] = features['y'] + features['v_y'] * features['time_to_frame']

        # Distance to the ball landing point at the predicted time. Needs both
        # landing coordinates AND the extrapolated position; without the
        # velocity columns the latter does not exist and the feature is skipped
        # instead of raising KeyError.
        required = {'predicted_x_cv', 'predicted_y_cv', 'ball_land_x', 'ball_land_y'}
        if required.issubset(features.columns):
            features['predicted_dist_to_ball'] = np.sqrt(
                (features['predicted_x_cv'] - features['ball_land_x'])**2 +
                (features['predicted_y_cv'] - features['ball_land_y'])**2
            )

        return features

    def prepare_training_data(self, input_df: pd.DataFrame,
                            output_df: pd.DataFrame,
                            feature_cols: List[str]) -> Dict[int, Tuple]:
        """
        Prepare training data for each future frame.

        For every player flagged with 'player_to_predict' the last input frame
        is paired with each of its output frames (at most max_future_frames).
        Players for which a required column is missing are skipped; a summary
        warning is printed when that happens.

        Args:
            input_df: Input tracking data
            output_df: Output tracking data with targets
            feature_cols: List of feature columns

        Returns:
            Dictionary mapping frame_offset to (X, y_x, y_y) tuples
        """
        group_keys = ['game_id', 'play_id', 'nfl_id']
        data_by_frame = {}
        skipped_players = 0
        first_missing = None

        # Pre-group output_df to avoid repeated filtering
        output_groups = {key: frames for key, frames in output_df.groupby(group_keys)}
        frame_limit = max(self.max_future_frames, 0)

        for key, input_group in input_df.groupby(group_keys):
            # Only include players to predict
            if not input_group['player_to_predict'].iloc[0]:
                continue

            output_group = output_groups.get(key)
            if output_group is None or len(output_group) == 0:
                continue

            # The last input frame is the only state the models see; build its
            # one-row frame once per player rather than once per output frame.
            last_input = input_group.sort_values('frame_id').iloc[-1]
            base_df = pd.DataFrame([last_input])

            targets = output_group.sort_values('frame_id').head(frame_limit)

            try:
                # frame_offset is 1-indexed: the first output frame is +1
                for frame_offset, (target_x, target_y) in enumerate(
                        zip(targets['x'], targets['y']), start=1):
                    features_df = self.create_features_for_frame(base_df, frame_offset)
                    X_row = features_df[feature_cols].values[0]

                    bucket = data_by_frame.setdefault(
                        frame_offset, {'X': [], 'y_x': [], 'y_y': []}
                    )
                    bucket['X'].append(X_row)
                    bucket['y_x'].append(target_x)
                    bucket['y_y'].append(target_y)
            except KeyError as e:
                # Missing feature/target column: skip the player but remember
                # it so that the problem does not go unnoticed.
                skipped_players += 1
                if first_missing is None:
                    first_missing = e
                continue

        if skipped_players:
            print(f"Warning: skipped {skipped_players} players with missing columns "
                  f"(first error: {first_missing})")

        # Convert to arrays
        return {
            frame_offset: (
                np.array(data['X']),
                np.array(data['y_x']),
                np.array(data['y_y'])
            )
            for frame_offset, data in data_by_frame.items()
        }

    def train(self, training_data: Dict[int, Tuple], verbose=True):
        """
        Train XGBoost models for each future frame.

        Frame offsets with fewer than two samples cannot be split into a
        training and a validation part and are skipped; predict_batch falls
        back to constant velocity for them.

        Args:
            training_data: Dictionary from prepare_training_data
            verbose: Whether to print progress
        """
        if verbose:
            print(f"Training XGBoost models for {len(training_data)} future frames...")

        for frame_offset, (X, y_x, y_y) in training_data.items():
            if verbose:
                print(f"  Frame +{frame_offset}: {len(X)} samples")

            if len(X) < 2:
                if verbose:
                    print("    Skipped: at least 2 samples are needed for a train/validation split")
                continue

            # One call splits all three arrays with the same row permutation.
            (X_train, X_val,
             y_x_train, y_x_val,
             y_y_train, y_y_val) = train_test_split(
                X, y_x, y_y, test_size=0.2, random_state=42
            )

            # Train model for x coordinate
            model_x = xgb.XGBRegressor(**self.xgb_params)
            model_x.fit(
                X_train, y_x_train,
                eval_set=[(X_val, y_x_val)],
                verbose=False
            )
            self.models_x[frame_offset] = model_x

            # Train model for y coordinate
            model_y = xgb.XGBRegressor(**self.xgb_params)
            model_y.fit(
                X_train, y_y_train,
                eval_set=[(X_val, y_y_val)],
                verbose=False
            )
            self.models_y[frame_offset] = model_y

            if verbose:
                # Validation RMSE over both coordinates
                pred_x = model_x.predict(X_val)
                pred_y = model_y.predict(X_val)
                rmse = np.sqrt(((pred_x - y_x_val)**2 + (pred_y - y_y_val)**2).mean() / 2)
                print(f"    Validation RMSE: {rmse:.4f}")

        if verbose:
            print("Training complete!")

    def predict(self, input_df: pd.DataFrame, feature_cols: List[str],
               num_frames: int) -> np.ndarray:
        """
        Predict future positions (Single Player).
        Wrapper around predict_batch for compatibility.

        Args:
            input_df: Tracking rows of one player; only the row with the
                highest frame_id is used
            feature_cols: List of feature columns
            num_frames: Number of frames to predict

        Returns:
            Array of shape (num_frames, 2); all zeros when ``input_df`` is empty
        """
        if len(input_df) == 0:
            return np.zeros((num_frames, 2))

        # Reduce the input to a one-row DataFrame holding the last frame
        if len(input_df) > 1:
            last_input = input_df.sort_values('frame_id').iloc[[-1]]
        else:
            last_input = input_df

        batch_preds = self.predict_batch(last_input, feature_cols, num_frames)

        key = (last_input['game_id'].iloc[0], last_input['play_id'].iloc[0], last_input['nfl_id'].iloc[0])
        return batch_preds.get(key, np.zeros((num_frames, 2)))

    def predict_batch(self, last_input_df: pd.DataFrame, feature_cols: List[str],
                     num_frames: int) -> Dict[Tuple, np.ndarray]:
        """
        Predict future positions for multiple players simultaneously.

        Each frame offset uses its trained model pair when available and a
        constant-velocity extrapolation otherwise (zero velocity when the
        v_x / v_y columns are absent). Predictions are clipped to
        [0, 120] x [0, 53.3] and each trajectory is smoothed afterwards.

        Args:
            last_input_df: DataFrame containing the LAST frame for each player.
            feature_cols: List of feature columns
            num_frames: Number of frames to predict

        Returns:
            Dictionary mapping (game_id, play_id, nfl_id) -> Array of shape (num_frames, 2).
            If a key occurs in several rows, the last row wins.
        """
        keys = list(zip(last_input_df['game_id'],
                        last_input_df['play_id'],
                        last_input_df['nfl_id']))
        n_players = len(keys)
        if n_players == 0:
            return {}

        # Constant velocity components if available
        if 'v_x' in last_input_df.columns and 'v_y' in last_input_df.columns:
            v_x = last_input_df['v_x'].values
            v_y = last_input_df['v_y'].values
        else:
            v_x = np.zeros(n_players)
            v_y = np.zeros(n_players)

        start_x = last_input_df['x'].values
        start_y = last_input_df['y'].values

        # trajectories[player, frame, 0/1] holds the x/y prediction
        trajectories = np.zeros((n_players, num_frames, 2))

        for frame_offset in range(1, num_frames + 1):
            model_x = self.models_x.get(frame_offset)
            model_y = self.models_y.get(frame_offset)
            use_fallback = model_x is None or model_y is None

            if not use_fallback:
                # Features are only needed (and only required to exist) when a
                # model is actually going to consume them.
                features_df = self.create_features_for_frame(last_input_df, frame_offset)
                X = features_df[feature_cols].values
                pred_x = np.asarray(model_x.predict(X), dtype=float).reshape(-1)
                pred_y = np.asarray(model_y.predict(X), dtype=float).reshape(-1)

                if pred_x.shape[0] != n_players or pred_y.shape[0] != n_players:
                    # Keep running, but with a sensible estimate rather than
                    # leaving (0, 0) in the output.
                    print(f"WARNING: frame +{frame_offset}: got {pred_x.shape[0]}/{pred_y.shape[0]} "
                          f"predictions for {n_players} players; using constant velocity instead")
                    use_fallback = True

            if use_fallback:
                # Fallback: Constant Velocity
                time_delta = frame_offset / 10.0
                pred_x = start_x + v_x * time_delta
                pred_y = start_y + v_y * time_delta

            # Clip to the bounds used for the playing field
            trajectories[:, frame_offset - 1, 0] = np.clip(pred_x, 0, 120)
            trajectories[:, frame_offset - 1, 1] = np.clip(pred_y, 0, 53.3)

        # Apply smoothing to all trajectories
        return {
            key: self.smooth_trajectory(trajectories[i])
            for i, key in enumerate(keys)
        }

    def smooth_trajectory(self, trajectory: np.ndarray, window_size=5) -> np.ndarray:
        """
        Apply smoothing to the predicted trajectory.
        Uses a centered simple moving average over the x and y columns.

        Trajectories shorter than ``window_size`` are returned unchanged.
        """
        if len(trajectory) < window_size:
            return trajectory

        smoothed = trajectory.copy()

        # min_periods=1 lets the window shrink at both ends instead of yielding NaN
        rolled = (
            pd.DataFrame(trajectory[:, :2])
            .rolling(window=window_size, min_periods=1, center=True)
            .mean()
        )
        smoothed[:, :2] = rolled.to_numpy()

        return smoothed

    def save_models(self, save_dir: str):
        """
        Save trained models.

        Writes one JSON file per coordinate and frame offset plus a pickled
        'metadata.pkl' into ``save_dir`` (created if necessary).
        """
        import pickle
        from pathlib import Path

        save_dir = Path(save_dir)
        save_dir.mkdir(exist_ok=True, parents=True)

        # Save models
        for frame_offset, model in self.models_x.items():
            model.save_model(str(save_dir / f'xgb_x_frame_{frame_offset}.json'))

        for frame_offset, model in self.models_y.items():
            model.save_model(str(save_dir / f'xgb_y_frame_{frame_offset}.json'))

        # Save metadata
        metadata = {
            'max_future_frames': self.max_future_frames,
            'xgb_params': self.xgb_params,
            'trained_frames': list(self.models_x.keys())
        }
        with open(save_dir / 'metadata.pkl', 'wb') as f:
            pickle.dump(metadata, f)

    def load_models(self, save_dir: str):
        """
        Load trained models.

        Restores max_future_frames and xgb_params from 'metadata.pkl' and loads
        the x/y model pair of every frame offset listed there.

        NOTE(review): the metadata is read with pickle, which can execute
        arbitrary code; only load directories from a trusted source.
        """
        import pickle
        from pathlib import Path

        save_dir = Path(save_dir)

        # Load metadata
        with open(save_dir / 'metadata.pkl', 'rb') as f:
            metadata = pickle.load(f)

        self.max_future_frames = metadata['max_future_frames']
        self.xgb_params = metadata['xgb_params']

        # Load models
        for frame_offset in metadata['trained_frames']:
            model_x = xgb.XGBRegressor()
            model_x.load_model(str(save_dir / f'xgb_x_frame_{frame_offset}.json'))
            self.models_x[frame_offset] = model_x

            model_y = xgb.XGBRegressor()
            model_y.load_model(str(save_dir / f'xgb_y_frame_{frame_offset}.json'))
            self.models_y[frame_offset] = model_y


# When run as the main module (as opposed to being imported), report whether
# the guarded XGBoost import above succeeded.
if __name__ == '__main__':
    if XGBOOST_AVAILABLE:
        print("XGBoost model module loaded successfully")
    else:
        print("XGBoost is required. Please install: pip install xgboost")

print('Cell 3 (XGBoost Model) executed successfully')

import polars as pl
import kaggle_evaluation.nfl_inference_server
import os
import glob

# --- Global Setup ---
# Locates and loads the scaler and the trained models once, at import time.
print('Initializing Global Resources...')

# 1. Robust Scaler Search
# glob returns its matches in arbitrary order, so the list is sorted to make
# the choice reproducible when several datasets contain a scaler.pkl.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only attach datasets you trust.
scaler = None
print('Searching for scaler.pkl...')
possible_scalers = sorted(glob.glob('/kaggle/input/**/scaler.pkl', recursive=True))
if possible_scalers:
    scaler_path = possible_scalers[0]
    print(f'✅ Found scaler at: {scaler_path}')
    scaler = joblib.load(scaler_path)
else:
    print('❌ Scaler NOT found. Model predictions will be garbage.')

# 2. Load Models
# Prefer the directory resolved in CONFIG['models_dir'] when it holds the
# metadata file; only otherwise fall back to a (sorted, hence deterministic)
# search of everything under /kaggle/input.
model = None
print('Searching for metadata.pkl...')
configured_metadata = os.path.join(CONFIG['models_dir'], 'metadata.pkl')
if os.path.isfile(configured_metadata):
    possible_models = [configured_metadata]
else:
    possible_models = sorted(glob.glob('/kaggle/input/**/metadata.pkl', recursive=True))
if possible_models:
    model_dir = os.path.dirname(possible_models[0])
    print(f'✅ Found model directory at: {model_dir}')
    loader = NFLDataLoader(CONFIG['data_dir'])
    fe = FeatureEngineering()
    model = XGBoostTrajectoryModel()
    model.load_models(model_dir)
    print('✅ Models loaded successfully')
else:
    print('❌ Model metadata NOT found in /kaggle/input. Model predictions will be garbage.')

# Fallback position (labelled "Center Field" in the original code) used when
# no better estimate is available.
_FIELD_CENTER_X = 60.0
_FIELD_CENTER_Y = 26.65
# Columns that together identify one player within one play.
_PLAYER_KEYS = ['game_id', 'play_id', 'nfl_id']


def _physics_baseline(test_pd, test_input_pd):
    """Return ``test_pd`` extended with constant-velocity predictions.

    For every (game, play, player) the last input frame supplies a start
    position and a velocity derived from speed ``s`` and heading ``dir``
    (interpreted as degrees). Each requested row is extrapolated linearly
    from that state. Rows without a matching input state get the field
    centre. The returned frame is sorted by player and frame and carries
    ``original_index``, ``frame_offset``, ``x_cv``/``y_cv`` and
    ``x_final``/``y_final`` (initialised to the physics values).
    """
    # Last known state for each player.
    last_input = (
        test_input_pd.sort_values('frame_id')
        .groupby(_PLAYER_KEYS)
        .last()
        .reset_index()
    )

    # Velocity components; missing speed/heading are treated as 0.
    s_filled = last_input['s'].fillna(0.0)
    heading_rad = np.radians(last_input['dir'].fillna(0.0))
    last_input['v_x_calc'] = s_filled * np.sin(heading_rad)
    last_input['v_y_calc'] = s_filled * np.cos(heading_rad)

    last_input_merge = last_input[
        _PLAYER_KEYS + ['x', 'y', 'frame_id', 'v_x_calc', 'v_y_calc']
    ].rename(columns={'x': 'x_last', 'y': 'y_last', 'frame_id': 'frame_id_last'})

    # Remember the incoming row order so the output can be restored to it.
    test_pd['original_index'] = test_pd.index
    test_merged = test_pd.merge(last_input_merge, on=_PLAYER_KEYS, how='left')

    # frame_offset counts requested frames per player starting at 1.
    sort_cols = list(_PLAYER_KEYS)
    if 'frame_id' in test_merged.columns:
        sort_cols.append('frame_id')
    test_merged = test_merged.sort_values(sort_cols)
    test_merged['frame_offset'] = test_merged.groupby(_PLAYER_KEYS).cumcount() + 1
    # 0.1 time units per frame step (presumably seconds at 10 frames/s -- confirm).
    test_merged['dt'] = test_merged['frame_offset'] * 0.1

    # Constant-velocity extrapolation, with field centre where state is missing.
    test_merged['x_cv'] = (
        test_merged['x_last'] + test_merged['v_x_calc'] * test_merged['dt']
    ).fillna(_FIELD_CENTER_X)
    test_merged['y_cv'] = (
        test_merged['y_last'] + test_merged['v_y_calc'] * test_merged['dt']
    ).fillna(_FIELD_CENTER_Y)

    # Physics is the default final answer.
    test_merged['x_final'] = test_merged['x_cv']
    test_merged['y_final'] = test_merged['y_cv']
    return test_merged


def _blend_model_predictions(test_merged, test_input_pd):
    """Return a copy of ``test_merged`` with XGBoost predictions blended in.

    Uses the module-level ``loader``, ``fe``, ``scaler``, ``model`` and
    ``FEATURE_COLS``. ``test_merged`` itself is not modified: either it is
    returned unchanged (nothing to predict) or a new merged frame is
    returned, so the caller can keep the physics baseline if this raises.

    Raises:
        ValueError: if merging the model output changes the row count.
        Exception: anything raised by preprocessing, scaling or the model.
    """
    # Preprocessing
    test_input_pd = loader.preprocess_input_data(test_input_pd)
    test_input_pd = fe.add_physics_features(test_input_pd)
    test_input_pd = fe.add_temporal_features(test_input_pd)

    # Scaling; features the pipeline did not produce are zero-filled first.
    for col in FEATURE_COLS:
        if col not in test_input_pd.columns:
            test_input_pd[col] = 0.0
    test_input_pd[FEATURE_COLS] = scaler.transform(test_input_pd[FEATURE_COLS])

    last_input_preprocessed = (
        test_input_pd.sort_values('frame_id')
        .groupby(_PLAYER_KEYS)
        .last()
        .reset_index()
    )
    players_to_predict = last_input_preprocessed[
        last_input_preprocessed['player_to_predict'] == True
    ]
    if len(players_to_predict) == 0:
        return test_merged

    max_frame = int(test_merged['frame_offset'].max())
    # NOTE(review): the full frame is passed, not just players_to_predict --
    # confirm predict_batch applies the player_to_predict filter itself.
    batch_preds = model.predict_batch(last_input_preprocessed, FEATURE_COLS, max_frame)

    # Flatten {key: [(x, y), ...]} into one row per (player, frame_offset).
    pred_data = [
        list(key) + [i + 1, px, py]
        for key, preds in batch_preds.items()
        for i, (px, py) in enumerate(preds)
    ]
    if not pred_data:
        return test_merged

    pred_df = pd.DataFrame(
        pred_data,
        columns=_PLAYER_KEYS + ['frame_offset', 'x_model', 'y_model'],
    )
    blended = test_merged.merge(
        pred_df,
        on=_PLAYER_KEYS + ['frame_offset'],
        how='left',
    )
    if len(blended) != len(test_merged):
        raise ValueError(
            f'model merge changed row count: {len(test_merged)} -> {len(blended)}'
        )

    # --- SANITY CHECK & ENSEMBLE ---
    # Distance between model and physics predictions.
    blended['dist_diff'] = np.sqrt(
        (blended['x_model'] - blended['x_cv']) ** 2
        + (blended['y_model'] - blended['y_cv']) ** 2
    )

    # Trust the model only when it stays within 15.0 of physics; NaN
    # distances compare False, so those rows keep the physics value.
    mask_use_ensemble = blended['x_model'].notna() & (blended['dist_diff'] < 15.0)

    # Ensemble: 0.6 * Model + 0.4 * Physics
    blended.loc[mask_use_ensemble, 'x_final'] = (
        0.6 * blended.loc[mask_use_ensemble, 'x_model']
        + 0.4 * blended.loc[mask_use_ensemble, 'x_cv']
    )
    blended.loc[mask_use_ensemble, 'y_final'] = (
        0.6 * blended.loc[mask_use_ensemble, 'y_model']
        + 0.4 * blended.loc[mask_use_ensemble, 'y_cv']
    )

    # --- TEMPORAL SMOOTHING ---
    # Trailing 3-frame mean per player to remove jitter.
    for coord in ('x_final', 'y_final'):
        blended[coord] = blended.groupby(_PLAYER_KEYS)[coord].transform(
            lambda series: series.rolling(3, min_periods=1).mean()
        )
    return blended


def predict(test: pl.DataFrame, test_input: pl.DataFrame):
    """Predict an (x, y) position for every row of ``test``.

    Degrades in three tiers so a row is always returned for each input row:

    1. XGBoost/physics ensemble, when ``model`` and ``scaler`` are loaded
       and the model stage succeeds.
    2. Constant-velocity physics baseline, when the model is unavailable or
       the model stage raises.
    3. Field centre for every row, when even the physics stage fails.

    Args:
        test: rows to predict, identified by game_id / play_id / nfl_id
            (and frame_id when present).
        test_input: tracked input frames for the same plays.

    Returns:
        A polars DataFrame with columns ``x`` and ``y``, one row per row of
        ``test`` in the original order.
    """
    # Initialize with Center Field (Better than 0,0)
    n_rows = len(test)
    x_preds = [_FIELD_CENTER_X] * n_rows
    y_preds = [_FIELD_CENTER_Y] * n_rows

    try:
        # Convert to Pandas
        test_pd = test.to_pandas()
        test_input_pd = test_input.to_pandas()

        # DEBUG: Print columns once to diagnose 'frame_id' errors
        if not hasattr(predict, 'debug_printed'):
            print(f'DEBUG: Test Columns: {test_pd.columns.tolist()}')
            print(f'DEBUG: Input Columns: {test_input_pd.columns.tolist()}')
            predict.debug_printed = True

        # --- 1. Physics Calculation (Vectorized) ---
        test_merged = _physics_baseline(test_pd, test_input_pd)

        # --- 2. XGBoost Prediction ---
        # Isolated so that a model-stage failure keeps the physics baseline
        # instead of collapsing every row to field centre.
        if model is not None and scaler is not None:
            try:
                test_merged = _blend_model_predictions(test_merged, test_input_pd)
            except Exception as e:
                print(f'WARNING: model stage failed, using physics baseline: {e}')

        test_merged = test_merged.sort_values('original_index')
        x_final = test_merged['x_final'].values
        y_final = test_merged['y_final'].values
        if len(x_final) != n_rows or len(y_final) != n_rows:
            raise ValueError(
                f'prediction count {len(x_final)} does not match {n_rows} test rows'
            )
        # Assign both together so x and y never come from different tiers.
        x_preds, y_preds = x_final, y_final

    except Exception as e:
        print(f'CRITICAL ERROR in predict function: {e}')
        # Fallback is already initialized to Center Field
    return pl.DataFrame({'x': x_preds, 'y': y_preds})

# --- Server Startup ---
# Wrap predict() in the competition's inference server. When the
# KAGGLE_IS_COMPETITION_RERUN environment variable is set, serve requests;
# otherwise run the local gateway against the configured data directory.
inference_server = kaggle_evaluation.nfl_inference_server.NFLInferenceServer(predict)

if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    print('Running Local Gateway...')
    inference_server.run_local_gateway((CONFIG['data_dir'],))
else:
    print('Starting Inference Server...')
    inference_server.serve()
