## 1. Setup and Configuration

In [None]:
# ==================================================
# Cell 1: Setup and Configuration
# ==================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import joblib
from pathlib import Path
import warnings
import time
from datetime import datetime
import os
import sys

# Sklearn imports
from sklearn.model_selection import (
    GridSearchCV, RandomizedSearchCV, cross_val_score,
    KFold, train_test_split
)
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.ensemble import (
    RandomForestRegressor, GradientBoostingRegressor
)
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score
)
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

print("=" * 100, "IMPORTING LIBRARIES...", "=" * 100, sep="\n")

# XGBoost and LightGBM are optional extras. Each probe records whether the
# package could be imported in an *_AVAILABLE flag so later cells can decide
# whether to add the corresponding model.
try:
    import xgboost as xgb
except ImportError:
    XGB_AVAILABLE = False
    print("XGBoost not installed")
else:
    XGB_AVAILABLE = True
    print("XGBoost available")

try:
    import lightgbm as lgb
except ImportError:
    LGB_AVAILABLE = False
    print("LightGBM not installed")
else:
    LGB_AVAILABLE = True
    print("LightGBM available")

# Global Configuration
# Switches to the project root, declares the directory layout used by the rest
# of the notebook, creates the output folders, and sets display defaults.
print("\n" + "=" * 100)
print("CONFIGURING PATHS...")
print("=" * 100)

# Every path below is relative, so it resolves against this working directory.
# NOTE(review): hard-coded Windows path; os.chdir raises if it does not exist
# on the machine running the notebook.
os.chdir('d:\\ScoreSight')
print(f"Working directory: {os.getcwd()}")

# Create directory structure
MODELS_DIR = Path('models')
VIZ_DIR = Path('visualizations/ps2_top_scorer')
DATA_DIR = Path('data')
DATASETS_DIR = Path('datasets')
FINAL_DATA_PATH = DATA_DIR / 'top_scorer' / 'top_scorer_data.csv'

for dir_path in [MODELS_DIR, VIZ_DIR, FINAL_DATA_PATH.parent]:
    # parents=True is required: VIZ_DIR and FINAL_DATA_PATH.parent are nested
    # paths, and mkdir without it raises FileNotFoundError when the
    # intermediate folder ('visualizations' / 'data') does not exist yet.
    dir_path.mkdir(parents=True, exist_ok=True)
    print(f"Created/verified: {dir_path}")

# Raw data path
RAW_PLAYER_DATA_PATH = DATASETS_DIR / 'Goals & Assist.xlsx'

# Display options
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings from pandas and scikit-learn.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 8)

print("\n" + "=" * 100)
print("PS2 TOP SCORER PIPELINE")
print("=" * 100)

## 2. Data Loading, Cleaning, and Feature Engineering

In [None]:
# ==================================================
# Cell 2: Data Loading and Engineering
# ==================================================
# Loads the raw player workbook, cleans it, derives per-90 and xG features,
# and writes the modelling table (numeric features + 'goals') to
# FINAL_DATA_PATH. On a missing input file df_ps2 is set to None.
print(f"Loading raw player data from: {RAW_PLAYER_DATA_PATH}")
try:
    # Only the read is guarded, so an unrelated error raised during cleaning
    # or saving is never reported as a missing input file.
    df_player_raw = pd.read_excel(RAW_PLAYER_DATA_PATH)
except FileNotFoundError:
    print(f"ERROR: Raw data file not found at {RAW_PLAYER_DATA_PATH}")
    df_ps2 = None
else:
    print(f"Loaded raw player data: {df_player_raw.shape}")

    # --- Data Cleaning ---
    # Lower-case the headers and turn spaces/hyphens into underscores.
    df_player_raw.columns = (
        df_player_raw.columns.str.lower().str.replace(' ', '_').str.replace('-', '_')
    )
    df_player_raw = df_player_raw.drop_duplicates()

    # Fill by assignment rather than `df[col].fillna(..., inplace=True)`:
    # the chained in-place form updates a temporary object and leaves the
    # frame untouched when pandas Copy-on-Write is active.
    for col in df_player_raw.select_dtypes(include=[np.number]).columns:
        if df_player_raw[col].isnull().any():
            df_player_raw[col] = df_player_raw[col].fillna(df_player_raw[col].median())

    for col in df_player_raw.select_dtypes(include=['object']).columns:
        if df_player_raw[col].isnull().any():
            modes = df_player_raw[col].mode()
            if modes.empty:
                # Column is entirely missing: there is no mode to fill with.
                continue
            df_player_raw[col] = df_player_raw[col].fillna(modes.iloc[0])

    print(f"Cleaned data shape: {df_player_raw.shape}")

    # --- Feature Engineering ---
    df_player_eng = df_player_raw.copy()
    # NOTE(review): '90s' is copied straight from matches_played, so the
    # "per 90" rates below are really per-match rates -- confirm intent.
    df_player_eng['90s'] = df_player_eng['matches_played']
    # replace(0, 1) avoids division by zero for players with no matches.
    df_player_eng['goals_per_90'] = (df_player_eng['goals'] / df_player_eng['90s'].replace(0, 1)).round(2)
    df_player_eng['assists_per_90'] = (df_player_eng['assists'] / df_player_eng['90s'].replace(0, 1)).round(2)
    df_player_eng['xg_overperformance'] = df_player_eng['goals'] - df_player_eng['xg']

    # Left-closed bins: [0, 24), [24, 29), [29, 34), [34, 100).
    age_bins = [0, 24, 29, 34, 100]
    age_labels = ['Prospect', 'Peak', 'Experienced', 'Veteran']
    df_player_eng['age_band'] = pd.cut(df_player_eng['age'], bins=age_bins, labels=age_labels, right=False)

    # --- Feature Selection ---
    # Keep float64/int64 columns that are not identifiers, encodings, or the
    # target and its direct components.
    # NOTE(review): 'goals_per_90' and 'xg_overperformance' are computed from
    # 'goals' above and are NOT excluded here, so the target leaks into the
    # features. Left unchanged because removing them alters the saved CSV and
    # the model's input schema -- decide deliberately.
    exclude = ['player','team','season','nationality','position','player_encoded','nation_encoded','position_encoded','goals','non_penalty_goals','penalty_goals_made', 'age_band']
    feature_cols = [c for c in df_player_eng.columns if c not in exclude and df_player_eng[c].dtype in ['float64','int64']]

    df_ps2 = df_player_eng[feature_cols + ['goals']].copy()

    # Save the engineered data
    df_ps2.to_csv(FINAL_DATA_PATH, index=False)

    print(f"Engineered dataset created with {len(feature_cols)} features.")
    print(f"Saved final modeling data to: {FINAL_DATA_PATH}")
    print("\nFinal DataFrame head:")
    display(df_ps2.head())

## 3. Model Training

In [None]:
# ==================================================
# Cell 3: Model Training
# ==================================================
# Tunes each candidate regressor inside an impute -> scale -> model pipeline,
# scores it on a held-out tail of the data, keeps the one with the lowest
# held-out MAE, and saves that pipeline plus a JSON metadata file.
if df_ps2 is not None:
    print("\n" + "=" * 100)
    print("PS2: TOP SCORER PREDICTION - TRAINING")
    print("=" * 100)

    # Define target and features
    TARGET_COL_PS2 = 'goals'
    feature_cols_ps2 = [c for c in df_ps2.columns if c != TARGET_COL_PS2]

    X_ps2 = df_ps2[feature_cols_ps2].copy()
    y_ps2 = df_ps2[TARGET_COL_PS2]

    print(f"Features ({len(feature_cols_ps2)}): {feature_cols_ps2}")
    print(f"Target: {TARGET_COL_PS2}")

    # Positional 80/20 split: the first 80% of rows train, the rest test.
    # NOTE(review): the message below calls this "temporal", but nothing in
    # this cell sorts the rows by time; it is only temporal if the rows
    # already arrive in chronological order.
    split_idx = int(len(df_ps2) * 0.8)
    X_train_ps2, y_train_ps2 = X_ps2.iloc[:split_idx], y_ps2.iloc[:split_idx]
    X_test_ps2, y_test_ps2 = X_ps2.iloc[split_idx:], y_ps2.iloc[split_idx:]

    print(f"\nData split (temporal): {X_train_ps2.shape[0]} train / {X_test_ps2.shape[0]} test")

    # Model configurations
    models_ps2 = {
        'Ridge': Ridge(random_state=42),
        'RandomForest': RandomForestRegressor(random_state=42, n_jobs=-1),
        'GradientBoosting': GradientBoostingRegressor(random_state=42)
    }
    if XGB_AVAILABLE:
        models_ps2['XGBoost'] = xgb.XGBRegressor(random_state=42, n_jobs=-1)
    if LGB_AVAILABLE:
        models_ps2['LightGBM'] = lgb.LGBMRegressor(random_state=42, n_jobs=-1, verbose=-1)

    # Parameter grids (keys use the 'model__' prefix of the pipeline step)
    param_grids_ps2 = {
        'Ridge': {'model__alpha': [0.1, 1.0, 10.0]},
        'RandomForest': {'model__n_estimators': [50, 100], 'model__max_depth': [10, 15]},
        'GradientBoosting': {'model__n_estimators': [50, 100], 'model__learning_rate': [0.01, 0.05]},
        'XGBoost': {'model__n_estimators': [50, 100], 'model__learning_rate': [0.01, 0.05]},
        'LightGBM': {'model__n_estimators': [50, 100], 'model__learning_rate': [0.01, 0.05]}
    }

    # --- Execute Training ---
    best_model_data = None
    best_mae = float('inf')

    for model_name, model in models_ps2.items():
        print(f"\n--- Training {model_name} ---")
        pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
            ('model', model)
        ])

        param_grid = param_grids_ps2.get(model_name, {})
        # The grids hold only 3-4 combinations, so asking for 10 iterations
        # over-requests; cap n_iter at the number of combinations available
        # (an empty grid counts as one candidate: the defaults).
        n_candidates = int(np.prod([len(values) for values in param_grid.values()]))

        search = RandomizedSearchCV(
            estimator=pipeline, param_distributions=param_grid,
            n_iter=min(10, n_candidates),
            cv=KFold(n_splits=5, shuffle=True, random_state=42),
            scoring='neg_mean_absolute_error', n_jobs=-1, random_state=42
        )
        search.fit(X_train_ps2, y_train_ps2)
        y_pred = search.predict(X_test_ps2)
        mae = mean_absolute_error(y_test_ps2, y_pred)

        print(f"  Test MAE: {mae:.4f}")
        print(f"  Best CV Score (neg MAE): {search.best_score_:.4f}")

        # NOTE(review): the winner is chosen on held-out MAE, so the reported
        # test MAE is optimistically biased as a generalisation estimate.
        # The CV MAE is stored alongside it for that reason.
        if mae < best_mae:
            best_mae = mae
            best_model_data = {
                'name': model_name,
                'model': search.best_estimator_,
                'mae': float(mae),
                'r2': float(r2_score(y_test_ps2, y_pred)),
                # best_score_ is negated MAE; flip the sign back.
                'cv_mae': float(-search.best_score_)
            }

    print(f"\n--- Best Model: {best_model_data['name']} with MAE: {best_model_data['mae']:.4f} ---")

    # --- Save Artifacts ---
    model_path = MODELS_DIR / 'ps2_top_scorer_best_model.joblib'
    metadata_path = MODELS_DIR / 'ps2_top_scorer_metadata.json'

    joblib.dump(best_model_data['model'], model_path)

    metadata = {
        'problem_name': 'PS2_Top_Scorer',
        'best_model': best_model_data['name'],
        'task_type': 'regression',
        'test_metrics': {'mae': best_model_data['mae'], 'r2': best_model_data['r2']},
        'cv_metrics': {'mae': best_model_data['cv_mae']}
    }
    # Explicit encoding keeps the file identical across platforms.
    with open(metadata_path, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2)

    print(f"Model saved: {model_path}")
    print(f"Metadata saved: {metadata_path}")

else:
    print("Skipping training because data loading failed.")