## 1. Setup and Configuration

In [None]:
# ==================================================
# Cell 1: Setup and Configuration
# ==================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import joblib
from pathlib import Path
import warnings
import time
from datetime import datetime
import os
import sys

# Sklearn imports
from sklearn.model_selection import (
    GridSearchCV, RandomizedSearchCV, cross_val_score,
    StratifiedKFold, train_test_split
)
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier
)
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    classification_report, confusion_matrix, roc_auc_score, roc_curve
)
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Announce the import phase with a 100-character banner.
print("=" * 100, "IMPORTING LIBRARIES...", "=" * 100, sep="\n")

# XGBoost is an optional dependency: record in XGB_AVAILABLE whether the
# import succeeded so later code can decide whether to use `xgb`.
try:
    import xgboost as xgb
except ImportError:
    XGB_AVAILABLE = False
    print("XGBoost not installed")
else:
    XGB_AVAILABLE = True
    print("XGBoost available")

# Global Configuration
# Every path defined below is relative, so the working directory is pinned to
# the project root before any of them is used.
print("\n" + "=" * 100)
print("CONFIGURING PATHS...")
print("=" * 100)

# NOTE(review): hard-coded, machine-specific project root -- os.chdir raises
# if this folder does not exist on the current machine.
os.chdir('d:\\ScoreSight')
print(f"Working directory: {os.getcwd()}")

# Create directory structure
MODELS_DIR = Path('models')
VIZ_DIR = Path('visualizations/ps1_league_winner')
REPORTS_DIR = Path('reports')
DATA_DIR = Path('data')
DATASETS_DIR = Path('datasets')
FINAL_DATA_PATH = DATA_DIR / 'league_winner' / 'league_winner_data.csv'

for dir_path in [MODELS_DIR, VIZ_DIR, REPORTS_DIR, FINAL_DATA_PATH.parent]:
    # parents=True is required: VIZ_DIR and FINAL_DATA_PATH.parent are nested
    # two levels deep, so a plain mkdir() raises FileNotFoundError when
    # 'visualizations/' or 'data/' does not exist yet.
    dir_path.mkdir(parents=True, exist_ok=True)
    print(f"Created/verified: {dir_path}")

# Raw data path
RAW_LEAGUE_DATA_PATH = DATASETS_DIR / 'ScoreSight_ML_Season_LeagueWinner_Champion.csv'

# Display options
# Ignore all warnings, then set the pandas display limits, the seaborn style
# and the default matplotlib figure size for the rest of the session.
warnings.simplefilter('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (14, 8)})

# Pipeline title banner (leading blank line, then 100-character rules).
print("\n" + "=" * 100, "PS1 LEAGUE WINNER PIPELINE", "=" * 100, sep="\n")

## 2. Data Loading, Cleaning, and Feature Engineering

In [None]:
# ==================================================
# Cell 2: Data Loading and Engineering
# ==================================================
# Load the raw league CSV, normalise its column names, drop duplicate rows,
# keep the int64/float64 non-leakage feature columns plus the target, and save
# the result to FINAL_DATA_PATH. `df_ps1` is set to None when the raw file is
# missing so that the training step can be skipped.
print(f"Loading raw league data from: {RAW_LEAGUE_DATA_PATH}")
try:
    # Only the read is guarded: a FileNotFoundError raised further down (for
    # example while writing the output CSV) must not be reported as a missing
    # raw data file.
    df_league_raw = pd.read_csv(RAW_LEAGUE_DATA_PATH)
except FileNotFoundError:
    print(f"ERROR: Raw data file not found at {RAW_LEAGUE_DATA_PATH}")
    df_ps1 = None
else:
    print(f"Loaded raw league data: {df_league_raw.shape}")

    # --- Data Cleaning ---
    # Lower-case the column names and replace spaces/hyphens with underscores.
    df_league_raw.columns = (
        df_league_raw.columns.str.lower().str.replace(' ', '_').str.replace('-', '_')
    )
    df_league_raw.drop_duplicates(inplace=True)
    print(f"Cleaned data shape: {df_league_raw.shape}")

    # --- Feature Engineering & Selection ---
    # These are columns that could cause data leakage
    leak_cols = ['points_per_game', 'target_total_points', 'target_league_position']

    # Identifiers, leakage columns and the target are never features. The set
    # is built once instead of re-concatenating lists for every column.
    excluded_cols = set(leak_cols) | {'team', 'season_encoded', 'team_encoded', 'target_champion'}

    # Define feature columns (numeric types, not identifiers or leakage columns)
    feature_cols = [
        c for c in df_league_raw.columns
        if c not in excluded_cols and df_league_raw[c].dtype in ['float64', 'int64']
    ]

    # Create the final DataFrame for modeling
    df_ps1 = df_league_raw[feature_cols + ['target_champion']].copy()

    # Save the engineered data for traceability
    FINAL_DATA_PATH.parent.mkdir(parents=True, exist_ok=True)
    df_ps1.to_csv(FINAL_DATA_PATH, index=False)

    print(f"Engineered dataset created with {len(feature_cols)} features.")
    print(f"Saved final modeling data to: {FINAL_DATA_PATH}")
    print("\nFinal DataFrame head:")
    display(df_ps1.head())

## 3. Model Training

In [None]:
# ==================================================
# Cell 3: Model Training
# ==================================================
if df_ps1 is not None:
    # Tune several classifiers for the league-winner target, keep the one with
    # the best cross-validated F1-macro, evaluate it on the held-out test
    # split, and save the fitted pipeline together with a JSON metadata file.
    print("\n" + "=" * 100)
    print("PS1: LEAGUE WINNER PREDICTION - TRAINING")
    print("=" * 100)

    # Define target and features
    TARGET_COL_PS1 = 'target_champion'

    # Exclude the target from the feature list
    feature_cols_ps1 = [c for c in df_ps1.columns if c != TARGET_COL_PS1]

    X_ps1 = df_ps1[feature_cols_ps1].copy()
    y_ps1 = df_ps1[TARGET_COL_PS1].astype(int)

    print(f"Features ({len(feature_cols_ps1)}): {feature_cols_ps1}")
    print(f"Target: {TARGET_COL_PS1}")

    # Check for class imbalance
    print("\nClass distribution:")
    print(y_ps1.value_counts())

    # Stratified split to maintain class balance
    X_train_ps1, X_test_ps1, y_train_ps1, y_test_ps1 = train_test_split(
        X_ps1, y_ps1, test_size=0.25, random_state=42, stratify=y_ps1
    )

    print(f"\nData split: {X_train_ps1.shape[0]} train / {X_test_ps1.shape[0]} test")

    # Model configurations
    models_ps1 = {
        'RandomForest': RandomForestClassifier(random_state=42, class_weight='balanced'),
        'GradientBoosting': GradientBoostingClassifier(random_state=42)
    }

    if XGB_AVAILABLE:
        # Weight the positive class by the negative/positive ratio of the
        # training split, never below 1.0 (and 1.0 when there are no positives).
        pos = y_train_ps1.sum()
        neg = len(y_train_ps1) - pos
        spw = max((neg / pos) if pos > 0 else 1.0, 1.0)
        # NOTE(review): `use_label_encoder` is reported as deprecated by recent
        # XGBoost releases -- confirm against the installed version.
        models_ps1['XGBoost'] = xgb.XGBClassifier(
            random_state=42, use_label_encoder=False, eval_metric='logloss', scale_pos_weight=spw
        )

    # Parameter grids (keys are prefixed with the pipeline step name 'model')
    param_grids_ps1 = {
        'RandomForest': {
            'model__n_estimators': [50, 100], 'model__max_depth': [5, 10],
            'model__min_samples_leaf': [5, 10], 'model__min_samples_split': [10, 20]
        },
        'GradientBoosting': {
            'model__n_estimators': [50, 100], 'model__learning_rate': [0.01, 0.05],
            'model__max_depth': [3, 5], 'model__subsample': [0.7, 0.8]
        },
        'XGBoost': {
            'model__n_estimators': [50, 100], 'model__learning_rate': [0.01, 0.05],
            'model__max_depth': [3, 5], 'model__subsample': [0.7, 0.8]
        }
    }

    # --- Execute Training ---
    best_model_data = None
    best_cv_score = float('-inf')

    for model_name, model in models_ps1.items():
        print(f"\n--- Training {model_name} ---")
        pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
            ('model', model)
        ])

        search = RandomizedSearchCV(
            estimator=pipeline, param_distributions=param_grids_ps1.get(model_name, {}),
            n_iter=10, cv=StratifiedKFold(n_splits=5), scoring='f1_macro', n_jobs=-1, random_state=42
        )
        search.fit(X_train_ps1, y_train_ps1)
        y_pred = search.predict(X_test_ps1)
        f1 = f1_score(y_test_ps1, y_pred, average='macro')
        cv_score = search.best_score_

        print(f"  Test F1-Macro: {f1:.4f}")
        print(f"  Best CV Score: {cv_score:.4f}")

        # Select on the cross-validated score, not on the test score: picking
        # the winner by test F1 would leak the test split into model selection
        # and make the reported test metric optimistically biased.
        if cv_score > best_cv_score:
            best_cv_score = cv_score
            best_model_data = {
                'name': model_name,
                'model': search.best_estimator_,
                'f1_score': float(f1),
                'cv_score': float(cv_score),
                'report': classification_report(y_test_ps1, y_pred, output_dict=True)
            }

    # A NaN CV score never compares greater than the running best, so fail
    # with a clear message instead of a TypeError on the lookups below.
    if best_model_data is None:
        raise RuntimeError("No model produced a valid cross-validation score; nothing to save.")

    # Held-out test F1-macro of the selected model.
    best_f1_score = best_model_data['f1_score']

    print(f"\n--- Best Model: {best_model_data['name']} with F1-Macro: {best_model_data['f1_score']:.4f} ---")

    # --- Save Artifacts ---
    model_path = MODELS_DIR / 'ps1_league_winner_best_model.joblib'
    metadata_path = MODELS_DIR / 'ps1_league_winner_metadata.json'

    joblib.dump(best_model_data['model'], model_path)

    metadata = {
        'problem_name': 'PS1_League_Winner',
        'best_model': best_model_data['name'],
        'task_type': 'classification',
        'test_metrics': {'f1_macro': best_model_data['f1_score']},
        'cv_metrics': {'f1_macro': best_model_data['cv_score']},
        'classification_report': best_model_data['report']
    }
    with open(metadata_path, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2)

    print(f"Model saved: {model_path}")
    print(f"Metadata saved: {metadata_path}")

else:
    print("Skipping training because data loading failed.")