# In [None]:  (notebook cell prompt left over from the Jupyter export)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

# ============================================================================
# 1. DATA LOADING
# ============================================================================

def load_data(data_dir='.'):
    """Load the weekly training files and the test files.

    Weekly files are named ``input_2023_wNN.csv`` / ``output_2023_wNN.csv``
    for NN in 01..18. A week is used only if BOTH of its files can be read;
    otherwise the week is skipped with a message.

    Args:
        data_dir: Directory containing the CSV files. Defaults to the
            current working directory.

    Returns:
        Tuple ``(train_input, train_output, test_input, test_structure,
        sample_submission)`` of DataFrames.

    Raises:
        FileNotFoundError: If no weekly training pair could be loaded, or if
            one of ``test_input.csv``, ``test.csv`` or
            ``sample_submission.csv`` is missing.
    """
    # Local import so the file-level import block does not need to change.
    from pathlib import Path

    print("Loading data...")
    base = Path(data_dir)

    input_files = []
    output_files = []

    # Load all weeks of 2023 data
    for week in range(1, 19):
        try:
            input_df = pd.read_csv(base / f'input_2023_w{week:02d}.csv')
            output_df = pd.read_csv(base / f'output_2023_w{week:02d}.csv')
        except FileNotFoundError:
            print(f"Week {week} not found, skipping...")
            continue
        input_files.append(input_df)
        output_files.append(output_df)
        print(f"Loaded week {week}")

    # pd.concat([]) fails with an unhelpful "No objects to concatenate";
    # report the real problem instead.
    if not input_files:
        raise FileNotFoundError(
            f"No weekly training files (input_2023_wNN.csv / output_2023_wNN.csv) "
            f"found in '{base}'"
        )

    # Combine all weeks
    train_input = pd.concat(input_files, ignore_index=True)
    train_output = pd.concat(output_files, ignore_index=True)

    # Load test data
    test_input = pd.read_csv(base / 'test_input.csv')
    test_structure = pd.read_csv(base / 'test.csv')
    sample_submission = pd.read_csv(base / 'sample_submission.csv')

    print(f"Train input shape: {train_input.shape}")
    print(f"Train output shape: {train_output.shape}")
    print(f"Test input shape: {test_input.shape}")

    return train_input, train_output, test_input, test_structure, sample_submission


# ============================================================================
# 2. EXPLORATORY DATA ANALYSIS
# ============================================================================

def basic_eda(train_input, train_output):
    """Print summary statistics of the training input and return last frames.

    Args:
        train_input: Tracking rows; must contain ``game_id``, ``play_id``,
            ``nfl_id``, ``player_role``, ``player_position``,
            ``num_frames_output``, ``x``, ``y``, ``ball_land_x`` and
            ``ball_land_y``.
        train_output: Accepted for a uniform call signature; not read here.

    Returns:
        DataFrame with one row per (game_id, play_id, nfl_id) holding the
        last non-null value of every column plus a ``dist_to_ball`` column
        (Euclidean distance from the player to the ball landing spot).
    """
    print("\n" + "="*80)
    print("EXPLORATORY DATA ANALYSIS")
    print("="*80)

    # Basic statistics
    print("\nInput Data Info:")
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would emit a stray "None" line.
    train_input.info()

    print("\nPlayer Roles Distribution:")
    print(train_input['player_role'].value_counts())

    print("\nPlayer Positions Distribution:")
    print(train_input['player_position'].value_counts())

    print("\nFrames to Predict Statistics:")
    print(train_input['num_frames_output'].describe())

    # Analyze last frame positions vs ball landing.
    # groupby().last() relies on the rows already being in frame order.
    last_frames = train_input.groupby(['game_id', 'play_id', 'nfl_id']).last().reset_index()
    last_frames['dist_to_ball'] = np.sqrt(
        (last_frames['x'] - last_frames['ball_land_x'])**2 +
        (last_frames['y'] - last_frames['ball_land_y'])**2
    )

    print("\nDistance to Ball Landing (at throw time):")
    print(last_frames.groupby('player_role')['dist_to_ball'].describe())

    return last_frames


def visualize_sample_play(train_input, train_output, game_id=None, play_id=None):
    """Plot the observed and future trajectories of every player in one play.

    When either ``game_id`` or ``play_id`` is omitted, a play is drawn at
    random from ``train_input``. The figure is written to
    ``sample_play_visualization.png`` and then shown.
    """
    if game_id is None or play_id is None:
        # No play requested: sample one (game, play) pair
        candidates = train_input[['game_id', 'play_id']].drop_duplicates()
        chosen = candidates.sample(1).iloc[0]
        game_id = chosen['game_id']
        play_id = chosen['play_id']

    # Restrict both tables to the selected play
    in_mask = (train_input['game_id'] == game_id) & (train_input['play_id'] == play_id)
    out_mask = (train_output['game_id'] == game_id) & (train_output['play_id'] == play_id)
    input_play = train_input[in_mask]
    output_play = train_output[out_mask]

    plt.figure(figsize=(15, 8))

    # One summary row per player taken from the end of the input window
    last_input = input_play.groupby('nfl_id').last().reset_index()

    for nfl_id in input_play['nfl_id'].unique():
        tracked = input_play[input_play['nfl_id'] == nfl_id]
        future = output_play[output_play['nfl_id'] == nfl_id]
        snapshot = last_input[last_input['nfl_id'] == nfl_id].iloc[0]

        # Colour encodes side, marker shape encodes the targeted receiver
        if snapshot['player_side'] == 'Offense':
            color = 'red'
        else:
            color = 'blue'
        if snapshot['player_role'] == 'Targeted Receiver':
            marker = 'o'
        else:
            marker = 's'

        # Faint solid line: observed path before the throw
        plt.plot(tracked['x'], tracked['y'],
                 color=color, alpha=0.3, linewidth=1)

        # Bold dashed line: path to be predicted
        plt.plot(future['x'], future['y'],
                 color=color, alpha=0.8, linewidth=2, linestyle='--')

        # Marker at the last observed position
        plt.scatter(snapshot['x'], snapshot['y'],
                    c=color, marker=marker, s=100, edgecolors='black', linewidth=2)

    # Ball landing spot is read from the first summary row
    landing_x = last_input['ball_land_x'].iloc[0]
    landing_y = last_input['ball_land_y'].iloc[0]
    plt.scatter(landing_x, landing_y, c='gold', marker='*', s=500,
                edgecolors='black', linewidth=2, label='Ball Landing')

    plt.xlim(0, 120)
    plt.ylim(0, 53.3)
    plt.xlabel('X Position (yards)', fontsize=12)
    plt.ylabel('Y Position (yards)', fontsize=12)
    title = (f'Play Visualization - Game {game_id}, Play {play_id}\n'
             f'Red=Offense, Blue=Defense, Circle=Targeted Receiver')
    plt.title(title, fontsize=14)
    plt.legend()
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.savefig('sample_play_visualization.png', dpi=150, bbox_inches='tight')
    plt.show()

    print(f"\nVisualized play: Game {game_id}, Play {play_id}")


# ============================================================================
# 3. FEATURE ENGINEERING
# ============================================================================

def _height_to_inches(height, default=72):
    """Convert a 'feet-inches' string such as '6-2' to inches; ``default`` if unusable."""
    if not pd.notna(height):
        return default
    parts = str(height).split('-')
    try:
        return int(parts[0]) * 12 + int(parts[1])
    except (ValueError, IndexError):
        # Malformed value (e.g. 'N/A', '74'): fall back like a missing one
        return default


def engineer_features(input_df):
    """Build one feature row per (game_id, play_id, nfl_id).

    The row is taken from the end of each player's input window
    (``groupby().last()``, i.e. the last non-null value of every column) and
    extended with:

    * ``dist_to_ball`` / ``angle_to_ball`` - distance and bearing (degrees,
      ``arctan2(dy, dx)``) from the player to the ball landing spot
    * ``vx`` / ``vy`` - speed ``s`` resolved along ``dir``
    * ``dir_diff`` - absolute angular difference between ``dir`` and
      ``angle_to_ball``, always within [0, 180]
    * ``is_targeted``, ``is_passer``, ``is_defense``, ``is_coverage``,
      ``play_dir_right`` - 0/1 indicator columns
    * ``height_inches`` (72 when missing or malformed) and ``age_years``
      (relative to 2023-09-01)

    Args:
        input_df: Tracking rows; not modified.

    Returns:
        A new DataFrame with the columns above added.
    """
    df = input_df.copy()

    # "Last frame" is only meaningful if rows are in frame order; enforce it
    # when a frame counter is available (stable sort keeps ties as they were).
    if 'frame_id' in df.columns:
        df = df.sort_values('frame_id', kind='stable')

    # Get last frame for each player in each play
    last_frame = df.groupby(['game_id', 'play_id', 'nfl_id']).last().reset_index()

    # Distance and angle to ball landing
    last_frame['dist_to_ball'] = np.sqrt(
        (last_frame['x'] - last_frame['ball_land_x'])**2 +
        (last_frame['y'] - last_frame['ball_land_y'])**2
    )

    last_frame['angle_to_ball'] = np.arctan2(
        last_frame['ball_land_y'] - last_frame['y'],
        last_frame['ball_land_x'] - last_frame['x']
    ) * 180 / np.pi

    # Velocity components
    # NOTE(review): this treats `dir` as a counter-clockwise angle measured
    # from the +x axis. If the tracking feed reports `dir` clockwise from the
    # +y axis, sin/cos must be swapped here and angle_to_ball built with
    # arctan2(dx, dy) - confirm against the data dictionary.
    last_frame['vx'] = last_frame['s'] * np.cos(np.radians(last_frame['dir']))
    last_frame['vy'] = last_frame['s'] * np.sin(np.radians(last_frame['dir']))

    # Direction difference (direction vs angle to ball).
    # Wrap the signed difference into [-180, 180) before taking the absolute
    # value; min(d, 360 - d) turns negative once |d| exceeds 360, which
    # happens e.g. for dir=350 and angle_to_ball=-170.
    signed_diff = (last_frame['dir'] - last_frame['angle_to_ball'] + 180) % 360 - 180
    last_frame['dir_diff'] = np.abs(signed_diff)

    # Encode categorical variables
    last_frame['is_targeted'] = (last_frame['player_role'] == 'Targeted Receiver').astype(int)
    last_frame['is_passer'] = (last_frame['player_role'] == 'Passer').astype(int)
    last_frame['is_defense'] = (last_frame['player_side'] == 'Defense').astype(int)
    last_frame['is_coverage'] = (last_frame['player_role'] == 'Defensive Coverage').astype(int)

    # Play direction encoding
    last_frame['play_dir_right'] = (last_frame['play_direction'] == 'right').astype(int)

    # Player physical attributes
    last_frame['height_inches'] = last_frame['player_height'].apply(_height_to_inches)

    # Age calculation
    last_frame['player_birth_date'] = pd.to_datetime(last_frame['player_birth_date'])
    last_frame['age_years'] = (pd.Timestamp('2023-09-01') - last_frame['player_birth_date']).dt.days / 365.25

    return last_frame


# ============================================================================
# 4. BASELINE MODELS
# ============================================================================

def physics_based_prediction(last_x, last_y, vx, vy, ball_x, ball_y,
                             num_frames, is_targeted, is_defense, attraction_weight=0.3):
    """Extrapolate a player's position and pull it towards the ball landing spot.

    Each frame ``k`` (1-based, 0.1 s apart) starts from constant-velocity
    motion ``last + v * t``. For a targeted receiver or a defender the point
    is then moved a fraction ``weight * k / num_frames`` of the way to the
    ball landing spot, where ``weight`` is ``attraction_weight`` (doubled for
    the targeted receiver). Results are clipped to x in [0, 120] and
    y in [0, 53.3].

    Args:
        last_x, last_y: Last observed position.
        vx, vy: Velocity components per second.
        ball_x, ball_y: Ball landing position.
        num_frames: Number of future frames to produce.
        is_targeted, is_defense: Truthy flags selecting the attraction term.
        attraction_weight: Base strength of the pull towards the ball.

    Returns:
        Float array of shape ``(num_frames, 2)`` with columns (x, y);
        shape ``(0, 2)`` when ``num_frames`` is not positive.
    """
    num_frames = int(num_frames)
    if num_frames <= 0:
        # Keep the two-column layout even when there is nothing to predict
        return np.empty((0, 2))

    frames = np.arange(1, num_frames + 1)
    t = frames * 0.1  # 10 fps = 0.1 seconds per frame

    # Base physics prediction
    pred_x = last_x + vx * t
    pred_y = last_y + vy * t

    # Add attraction to ball landing location
    if is_targeted or is_defense:
        dx_to_ball = ball_x - pred_x
        dy_to_ball = ball_y - pred_y
        dist = np.sqrt(dx_to_ball**2 + dy_to_ball**2)

        # Stronger attraction for targeted receiver
        weight = attraction_weight * 2 if is_targeted else attraction_weight
        # Attraction increases over time
        time_factor = np.minimum(t / (num_frames * 0.1), 1.0)
        attraction = weight * time_factor

        # Only pull where the distance is a positive number; this leaves the
        # plain extrapolation untouched when the ball position is NaN.
        pull = dist > 0
        pred_x = np.where(pull, pred_x + attraction * dx_to_ball, pred_x)
        pred_y = np.where(pull, pred_y + attraction * dy_to_ball, pred_y)

    # Apply field boundaries
    pred_x = np.clip(pred_x, 0, 120)
    pred_y = np.clip(pred_y, 0, 53.3)

    return np.column_stack([pred_x, pred_y])


def create_baseline_predictions(features_df):
    """Run the physics model for every feature row and stack the results.

    Returns a DataFrame with one row per predicted frame and the columns
    game_id, play_id, nfl_id, frame_id (1-based), x and y.
    """
    records = []

    for _, player in features_df.iterrows():
        physics_args = {
            'last_x': player['x'],
            'last_y': player['y'],
            'vx': player['vx'],
            'vy': player['vy'],
            'ball_x': player['ball_land_x'],
            'ball_y': player['ball_land_y'],
            'num_frames': int(player['num_frames_output']),
            'is_targeted': player['is_targeted'],
            'is_defense': player['is_defense'],
        }
        trajectory = physics_based_prediction(**physics_args)

        # Identifier fields shared by every frame of this player
        identity = {
            'game_id': player['game_id'],
            'play_id': player['play_id'],
            'nfl_id': player['nfl_id'],
        }
        records.extend(
            {**identity, 'frame_id': frame_no, 'x': px, 'y': py}
            for frame_no, (px, py) in enumerate(trajectory, start=1)
        )

    return pd.DataFrame(records)


# ============================================================================
# 5. MACHINE LEARNING MODEL
# ============================================================================

def prepare_ml_data(train_input, train_output):
    """Join ground-truth frames with per-player features and derive targets.

    Args:
        train_input: Tracking rows passed to ``engineer_features``.
        train_output: Ground-truth rows with ``game_id``, ``play_id``,
            ``nfl_id``, ``frame_id``, ``x`` and ``y``.

    Returns:
        One row per ground-truth frame. Ground-truth coordinates appear as
        ``x_actual`` / ``y_actual``, the starting position as ``x_start`` /
        ``y_start``; ``dx`` / ``dy`` are the displacement from the start and
        ``dx_per_frame`` / ``dy_per_frame`` that displacement divided by
        ``frame_id``.
    """
    # Get features
    features = engineer_features(train_input)

    # If the feature table carries its own frame counter (the frame the
    # features were taken from), the merge would suffix BOTH frame_id columns
    # and merged['frame_id'] below would no longer exist. Move it out of the
    # way; rename() is a no-op when the column is absent.
    features = features.rename(columns={'frame_id': 'last_input_frame_id'})

    # Merge with ground truth
    merged = train_output.merge(
        features,
        on=['game_id', 'play_id', 'nfl_id'],
        suffixes=('_actual', '_start')
    )

    # Calculate displacement from start position
    merged['dx'] = merged['x_actual'] - merged['x_start']
    merged['dy'] = merged['y_actual'] - merged['y_start']

    # Average displacement per frame
    # (assumes output frame_id starts at 1 - TODO confirm; 0 would divide by zero)
    merged['dx_per_frame'] = merged['dx'] / merged['frame_id']
    merged['dy_per_frame'] = merged['dy'] / merged['frame_id']

    return merged


def train_ml_model(merged_df):
    """Fit two gradient-boosting regressors for per-frame x / y displacement.

    Rows with a missing feature or target are dropped. 20 % of the data is
    held out for validation; when ``game_id`` / ``play_id`` are available the
    hold-out is made of whole plays, because frames of one play share almost
    all of their features and a row-wise split would leak them into
    validation.

    Args:
        merged_df: Frame-level table containing the feature columns listed
            below plus ``dx_per_frame`` and ``dy_per_frame``.

    Returns:
        Tuple ``(model_x, model_y, feature_cols)``.

    Raises:
        ValueError: If no complete row is left after dropping missing values.
    """
    # Local import: the file-level import block only brings in train_test_split.
    from sklearn.model_selection import GroupShuffleSplit

    feature_cols = [
        'x_start', 'y_start', 's', 'a', 'vx', 'vy',
        'dist_to_ball', 'angle_to_ball', 'dir_diff',
        'is_targeted', 'is_passer', 'is_defense', 'is_coverage',
        'play_dir_right', 'height_inches', 'age_years', 'player_weight',
        'absolute_yardline_number', 'frame_id', 'num_frames_output'
    ]
    target_cols = ['dx_per_frame', 'dy_per_frame']

    # Remove any rows with missing values (keep other columns for grouping)
    complete = merged_df[feature_cols + target_cols].notna().all(axis=1)
    model_df = merged_df.loc[complete]
    if model_df.empty:
        raise ValueError("No complete rows available for training")

    X = model_df[feature_cols]
    y_dx = model_df['dx_per_frame']
    y_dy = model_df['dy_per_frame']

    # Split data: prefer holding out whole plays, fall back to a row split
    train_idx = val_idx = None
    group_keys = [c for c in ('game_id', 'play_id') if c in model_df.columns]
    if group_keys:
        groups = model_df.groupby(group_keys, sort=False).ngroup().to_numpy()
        if np.unique(groups).size > 1:
            splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
            train_idx, val_idx = next(splitter.split(X, groups=groups))
    if train_idx is None:
        train_idx, val_idx = train_test_split(
            np.arange(len(X)), test_size=0.2, random_state=42
        )

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_dx_train, y_dx_val = y_dx.iloc[train_idx], y_dx.iloc[val_idx]
    y_dy_train, y_dy_val = y_dy.iloc[train_idx], y_dy.iloc[val_idx]

    print("\nTraining ML models...")

    # Train separate models for x and y
    model_x = GradientBoostingRegressor(n_estimators=100, max_depth=5, random_state=42)
    model_y = GradientBoostingRegressor(n_estimators=100, max_depth=5, random_state=42)

    model_x.fit(X_train, y_dx_train)
    model_y.fit(X_train, y_dy_train)

    # Evaluate
    pred_dx = model_x.predict(X_val)
    pred_dy = model_y.predict(X_val)

    rmse_x = np.sqrt(mean_squared_error(y_dx_val, pred_dx))
    rmse_y = np.sqrt(mean_squared_error(y_dy_val, pred_dy))
    rmse_combined = np.sqrt(rmse_x**2 + rmse_y**2)

    print(f"Validation RMSE X: {rmse_x:.4f}")
    print(f"Validation RMSE Y: {rmse_y:.4f}")
    print(f"Combined RMSE: {rmse_combined:.4f}")

    # Feature importance
    feature_importance = pd.DataFrame({
        'feature': feature_cols,
        'importance_x': model_x.feature_importances_,
        'importance_y': model_y.feature_importances_
    }).sort_values('importance_x', ascending=False)

    print("\nTop 10 Important Features (X):")
    print(feature_importance.head(10))

    return model_x, model_y, feature_cols


# ============================================================================
# 6. PREDICTION AND SUBMISSION
# ============================================================================

def _expand_to_frames(features_df):
    """Repeat each player row once per output frame and number the frames from 1."""
    features_df = features_df.reset_index(drop=True)
    counts = features_df['num_frames_output'].astype(int).clip(lower=0).to_numpy()
    row_pos = np.repeat(np.arange(len(features_df)), counts)
    frames = features_df.iloc[row_pos].reset_index(drop=True)
    # Position inside each player's run of repeated rows, shifted to 1-based
    run_starts = np.cumsum(counts) - counts
    frames['frame_id'] = np.arange(len(row_pos)) - np.repeat(run_starts, counts) + 1
    return frames


def _ml_frame_predictions(features_df, model_x, model_y, feature_cols):
    """Predict clipped x / y for every output frame with the displacement models."""
    id_cols = ['game_id', 'play_id', 'nfl_id', 'frame_id']
    frames = _expand_to_frames(features_df)

    # The models were trained on the merged table, where the starting
    # position is called x_start / y_start rather than x / y.
    frames['x_start'] = frames['x']
    frames['y_start'] = frames['y']

    missing = [col for col in feature_cols if col not in frames.columns]
    if missing:
        raise KeyError(f"Test features are missing model inputs: {missing}")

    if frames.empty:
        return frames[id_cols + ['x', 'y']]

    # One batched predict() per model instead of one call per frame
    model_input = frames[feature_cols]
    step = frames['frame_id'].to_numpy()
    dx_per_frame = model_x.predict(model_input)
    dy_per_frame = model_y.predict(model_input)

    result = frames[id_cols].copy()
    # Position = start + per-frame displacement * frame index, kept on the field
    result['x'] = np.clip(frames['x_start'].to_numpy() + dx_per_frame * step, 0, 120)
    result['y'] = np.clip(frames['y_start'].to_numpy() + dy_per_frame * step, 0, 53.3)
    return result


def create_submission(test_input, test_structure, model_type='physics',
                     model_x=None, model_y=None, feature_cols=None):
    """Create the submission table (columns ``id``, ``x``, ``y``).

    Args:
        test_input: Tracking rows for the test plays.
        test_structure: Rows to predict, with ``game_id``, ``play_id``,
            ``nfl_id`` and ``frame_id``; the output has one row per row here.
        model_type: ``'physics'`` for the physics baseline or ``'ml'`` for
            the fitted displacement models.
        model_x, model_y: Fitted regressors; required for ``'ml'``.
        feature_cols: Column order the regressors were trained with;
            required for ``'ml'``.

    Returns:
        DataFrame with ``id`` (``game_play_nfl_frame``), ``x`` and ``y``.
        Rows without a matching prediction get 0 for both coordinates.

    Raises:
        ValueError: For an unknown ``model_type`` or when ``'ml'`` is
            requested without models / feature columns.
        KeyError: If a model input column cannot be found in the test features.
    """
    print(f"\nCreating submission with {model_type} model...")

    if model_type not in ('physics', 'ml'):
        raise ValueError(f"Unknown model_type: {model_type!r} (expected 'physics' or 'ml')")
    if model_type == 'ml' and (model_x is None or model_y is None or feature_cols is None):
        raise ValueError("model_type='ml' requires model_x, model_y and feature_cols")

    # Get test features
    test_features = engineer_features(test_input)

    if model_type == 'physics':
        # Same per-row physics rollout as the training-side baseline
        pred_df = create_baseline_predictions(test_features)
    else:
        pred_df = _ml_frame_predictions(test_features, model_x, model_y, list(feature_cols))

    # Merge with test structure to get proper format; drop any placeholder
    # x / y so the merge cannot produce suffixed duplicates.
    submission = test_structure.drop(columns=['x', 'y'], errors='ignore').copy()
    submission['id'] = (submission['game_id'].astype(str) + '_' +
                       submission['play_id'].astype(str) + '_' +
                       submission['nfl_id'].astype(str) + '_' +
                       submission['frame_id'].astype(str))

    # Merge predictions
    if pred_df.empty:
        submission['x'] = np.nan
        submission['y'] = np.nan
    else:
        submission = submission.merge(
            pred_df,
            on=['game_id', 'play_id', 'nfl_id', 'frame_id'],
            how='left'
        )

    # Fill any missing with 0 (shouldn't happen but just in case) - and say
    # so, because a silent 0 is far from any real position.
    n_missing = int(submission['x'].isna().sum())
    if n_missing:
        print(f"Warning: {n_missing} submission rows had no prediction and were filled with 0")
    submission['x'] = submission['x'].fillna(0)
    submission['y'] = submission['y'].fillna(0)

    # Final submission format
    final_submission = submission[['id', 'x', 'y']]

    return final_submission


# ============================================================================
# 7. MAIN EXECUTION
# ============================================================================

def main():
    """Run the whole pipeline: load, explore, physics baseline, ML model.

    Writes ``submission_baseline_physics.csv`` and ``submission_ml_model.csv``
    to the working directory.
    """
    rule = "=" * 80

    def banner(title):
        # Section heading framed by two rules, preceded by a blank line
        print("\n" + rule)
        print(title)
        print(rule)

    print("NFL Big Data Bowl 2026 - Prediction Competition")
    print(rule)

    # Load data (the sample submission is loaded but not needed below)
    train_input, train_output, test_input, test_structure, _ = load_data()

    # EDA
    basic_eda(train_input, train_output)

    # Visualize sample play (optional - comment out if not needed)
    # visualize_sample_play(train_input, train_output)

    # Physics-based baseline submission
    banner("CREATING BASELINE SUBMISSION")
    physics_submission = create_submission(test_input, test_structure, model_type='physics')
    physics_submission.to_csv('submission_baseline_physics.csv', index=False)
    print("✓ Baseline submission saved: submission_baseline_physics.csv")

    # ML model (optional - can be commented out for quick baseline)
    banner("TRAINING ML MODEL")
    training_table = prepare_ml_data(train_input, train_output)
    model_x, model_y, feature_cols = train_ml_model(training_table)

    # ML submission
    learned_submission = create_submission(
        test_input,
        test_structure,
        model_type='ml',
        model_x=model_x,
        model_y=model_y,
        feature_cols=feature_cols,
    )
    learned_submission.to_csv('submission_ml_model.csv', index=False)
    print("✓ ML submission saved: submission_ml_model.csv")

    banner("PIPELINE COMPLETE!")
    closing_lines = (
        "\nSubmission files created:",
        "1. submission_baseline_physics.csv - Physics-based baseline",
        "2. submission_ml_model.csv - ML-based predictions",
        "\nNext steps:",
        "- Upload submissions to Kaggle",
        "- Iterate on feature engineering",
        "- Try ensemble methods",
        "- Experiment with sequence models (LSTM/GRU)",
    )
    for line in closing_lines:
        print(line)


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()