## 1. Setup and Configuration

In [None]:
# ==================================================
# Cell 1: Setup and Configuration
# ==================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import joblib
from pathlib import Path
import warnings
import time
from datetime import datetime
import os
import sys

# Sklearn imports
from sklearn.model_selection import (
    GridSearchCV, RandomizedSearchCV, cross_val_score,
    TimeSeriesSplit, KFold, train_test_split
)
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.ensemble import (
    RandomForestRegressor, GradientBoostingRegressor
)
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score
)
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

print("=" * 100)
print("IMPORTING LIBRARIES...")
print("=" * 100)

# Optional boosting backends. Each import is attempted independently and the
# XGB_AVAILABLE / LGB_AVAILABLE flags record the outcome so later cells can
# skip a backend that is not installed instead of failing.
try:
    import xgboost as xgb
    XGB_AVAILABLE = True
    print("XGBoost available")
except ImportError:
    XGB_AVAILABLE = False
    print("XGBoost not installed")

try:
    import lightgbm as lgb
    LGB_AVAILABLE = True
    print("LightGBM available")
except ImportError:
    LGB_AVAILABLE = False
    print("LightGBM not installed")

# Global Configuration
print("\n" + "=" * 100)
print("CONFIGURING PATHS...")
print("=" * 100)

# All paths below are relative, so they resolve against this working directory.
os.chdir('d:\\ScoreSight')
print(f"Working directory: {os.getcwd()}")

# Create directory structure
MODELS_DIR = Path('models')
VIZ_DIR = Path('visualizations/ps3_total_points')
DATA_DIR = Path('data')
DATASETS_DIR = Path('datasets')
FINAL_DATA_PATH = DATA_DIR / 'points_tally' / 'points_tally_data.csv'

for dir_path in [MODELS_DIR, VIZ_DIR, FINAL_DATA_PATH.parent]:
    # parents=True is required: VIZ_DIR and FINAL_DATA_PATH.parent are nested
    # two levels deep, and mkdir() without it raises FileNotFoundError when
    # the intermediate folder ('visualizations', 'data') does not exist yet.
    dir_path.mkdir(parents=True, exist_ok=True)
    print(f"Created/verified: {dir_path}")

# Raw data path
RAW_LEAGUE_DATA_PATH = DATASETS_DIR / 'ScoreSight_ML_Season_LeagueWinner_Champion.csv'

# Display options
# NOTE: this silences every warning for the rest of the session, including
# scikit-learn warnings emitted during model search.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 8)

print("\n" + "=" * 100)
print("PS3 TOTAL POINTS PIPELINE")
print("=" * 100)

## 2. Data Loading, Cleaning, and Feature Engineering

In [None]:
# ==================================================
# Cell 2: Data Loading and Engineering
# ==================================================
# Reads the raw league CSV, normalises column names, drops duplicate rows,
# selects numeric feature columns and writes the modelling table
# (features + 'total_points' + 'season') to FINAL_DATA_PATH.
# On a missing input file, df_ps3 is set to None so the training cell can skip.
print(f"Loading raw league data from: {RAW_LEAGUE_DATA_PATH}")
try:
    # Only the read is guarded, so the "file not found" message below cannot
    # be triggered by an unrelated failure further down.
    df_league_raw = pd.read_csv(RAW_LEAGUE_DATA_PATH)
except FileNotFoundError:
    print(f"ERROR: Raw data file not found at {RAW_LEAGUE_DATA_PATH}")
    df_ps3 = None
else:
    print(f"Loaded raw league data: {df_league_raw.shape}")

    # --- Data Cleaning ---
    # Lower-case the column names and turn spaces/hyphens into underscores.
    df_league_raw.columns = df_league_raw.columns.str.lower().str.replace(' ', '_').str.replace('-', '_')
    df_league_raw.drop_duplicates(inplace=True)
    print(f"Cleaned data shape: {df_league_raw.shape}")

    # --- Feature Engineering & Selection ---
    # These columns are either direct calculations of points or other targets
    leak_cols = ['points_per_game', 'target_league_position', 'target_champion', 'wins', 'draws', 'losses', 'points']

    # Identifier columns that are never used as features.
    id_cols = ['team', 'season_encoded', 'team_encoded']

    # The target and the season column are appended explicitly below. They must
    # not also pass the numeric filter, otherwise the target would be counted
    # as a feature and both columns would appear twice in df_ps3.
    appended_cols = ['target_total_points', 'season']

    excluded_cols = set(leak_cols + id_cols + appended_cols)

    # Define feature columns
    feature_cols = [
        c for c in df_league_raw.columns
        if c not in excluded_cols and df_league_raw[c].dtype in ['float64', 'int64']
    ]

    # Create the final DataFrame for modeling
    # The target is 'target_total_points'
    df_ps3 = df_league_raw[feature_cols + appended_cols].copy()
    df_ps3 = df_ps3.rename(columns={'target_total_points': 'total_points'})

    # Save the engineered data
    df_ps3.to_csv(FINAL_DATA_PATH, index=False)

    print(f"Engineered dataset created with {len(feature_cols)} features.")
    print(f"Saved final modeling data to: {FINAL_DATA_PATH}")
    print("\nFinal DataFrame head:")
    display(df_ps3.head())

## 3. Model Training

In [None]:
# ==================================================
# Cell 3: Model Training
# ==================================================
# Tunes each candidate regressor inside an impute -> scale -> model pipeline,
# scores it on a held-out split, and saves the pipeline with the lowest
# held-out MAE plus a JSON metadata file to MODELS_DIR.
# Skipped entirely when the data-loading cell left df_ps3 as None.
if df_ps3 is not None:
    print("\n" + "=" * 100)
    print("PS3: TOTAL POINTS PREDICTION - TRAINING")
    print("=" * 100)

    # Define target and features
    TARGET_COL_PS3 = 'total_points'
    feature_cols_ps3 = [c for c in df_ps3.columns if c not in [TARGET_COL_PS3, 'season']]

    X_ps3 = df_ps3[feature_cols_ps3].copy()
    y_ps3 = df_ps3[TARGET_COL_PS3]

    print(f"Features ({len(feature_cols_ps3)}): {feature_cols_ps3}")
    print(f"Target: {TARGET_COL_PS3}")

    # Temporal split by season: the latest season is held out for testing.
    seasons = sorted(df_ps3['season'].dropna().unique())
    if len(seasons) > 1:
        test_season = seasons[-1]
        train_mask = df_ps3['season'].isin(seasons[:-1])
        test_mask = df_ps3['season'] == test_season

        # TimeSeriesSplit (used for CV below) treats row order as time order.
        # Sort the training rows by season so earlier folds never contain
        # later seasons; mergesort is stable, so rows already in
        # chronological order keep their original positions.
        train_df = df_ps3[train_mask].sort_values('season', kind='mergesort')
        X_train_ps3, y_train_ps3 = train_df[feature_cols_ps3], train_df[TARGET_COL_PS3]
        X_test_ps3, y_test_ps3 = X_ps3[test_mask], y_ps3[test_mask]
        print(f"\nTemporal split: Training on {seasons[:-1]}, Testing on {test_season}")
    else:
        X_train_ps3, X_test_ps3, y_train_ps3, y_test_ps3 = train_test_split(X_ps3, y_ps3, test_size=0.25, random_state=42)
        print("\nRandom split (only one season of data)")

    print(f"Data split: {X_train_ps3.shape[0]} train / {X_test_ps3.shape[0]} test")

    # Model configurations
    models_ps3 = {
        'Ridge': Ridge(random_state=42),
        'RandomForest': RandomForestRegressor(random_state=42, n_jobs=-1),
        'GradientBoosting': GradientBoostingRegressor(random_state=42)
    }
    if XGB_AVAILABLE:
        models_ps3['XGBoost'] = xgb.XGBRegressor(random_state=42, n_jobs=-1)
    if LGB_AVAILABLE:
        models_ps3['LightGBM'] = lgb.LGBMRegressor(random_state=42, n_jobs=-1, verbose=-1)

    # Parameter grids (keys use the 'model__' prefix to reach the pipeline's
    # final step).
    param_grids_ps3 = {
        'Ridge': {'model__alpha': [0.1, 1.0, 10.0]},
        'RandomForest': {'model__n_estimators': [50, 100], 'model__max_depth': [10, 15]},
        'GradientBoosting': {'model__n_estimators': [50, 100], 'model__learning_rate': [0.01, 0.05]},
        'XGBoost': {'model__n_estimators': [50, 100], 'model__learning_rate': [0.01, 0.05]},
        'LightGBM': {'model__n_estimators': [50, 100], 'model__learning_rate': [0.01, 0.05]}
    }

    # --- Execute Training ---
    best_model_data = None
    best_mae = float('inf')

    for model_name, model in models_ps3.items():
        print(f"\n--- Training {model_name} ---")
        pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
            ('model', model)
        ])

        param_grid = param_grids_ps3.get(model_name, {})
        # Every grid is a finite list of candidates, so the search space has
        # a known size. Cap n_iter at that size instead of relying on
        # scikit-learn to truncate it with a warning (np.prod([]) == 1 covers
        # an empty grid).
        grid_size = int(np.prod([len(values) for values in param_grid.values()]))

        search = RandomizedSearchCV(
            estimator=pipeline, param_distributions=param_grid,
            n_iter=min(10, grid_size), cv=TimeSeriesSplit(n_splits=5),
            scoring='neg_mean_absolute_error', n_jobs=-1, random_state=42
        )
        search.fit(X_train_ps3, y_train_ps3)
        y_pred = search.predict(X_test_ps3)
        mae = mean_absolute_error(y_test_ps3, y_pred)

        print(f"  Test MAE: {mae:.4f}")
        print(f"  Best CV Score (neg MAE): {search.best_score_:.4f}")

        # NOTE(review): the winner is chosen on the held-out MAE, so the saved
        # test metrics are optimistic for the selected model. The CV MAE is
        # recorded alongside as a less biased reference.
        if mae < best_mae:
            best_mae = mae
            best_model_data = {
                'name': model_name,
                'model': search.best_estimator_,
                'mae': mae,
                'r2': r2_score(y_test_ps3, y_pred),
                'cv_mae': -search.best_score_
            }

    print(f"\n--- Best Model: {best_model_data['name']} with MAE: {best_model_data['mae']:.4f} ---")

    # --- Save Artifacts ---
    model_path = MODELS_DIR / 'ps3_total_points_best_model.joblib'
    metadata_path = MODELS_DIR / 'ps3_total_points_metadata.json'

    joblib.dump(best_model_data['model'], model_path)

    # Metrics are cast to built-in floats so JSON serialisation does not depend
    # on the numeric type returned by the metric functions.
    metadata = {
        'problem_name': 'PS3_Total_Points',
        'best_model': best_model_data['name'],
        'task_type': 'regression',
        'test_metrics': {'mae': float(best_model_data['mae']), 'r2': float(best_model_data['r2'])},
        'cv_metrics': {'mae': float(best_model_data['cv_mae'])}
    }
    with open(metadata_path, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2)

    print(f"Model saved: {model_path}")
    print(f"Metadata saved: {metadata_path}")

else:
    print("Skipping training because data loading failed.")