## 1. Setup and Configuration

In [None]:
# ==================================================
# Cell 1: Setup and Configuration
# ==================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import joblib
from pathlib import Path
import warnings
import time
from datetime import datetime
import os
import sys

# Sklearn imports
from sklearn.model_selection import (
    GridSearchCV, RandomizedSearchCV, cross_val_score,
    StratifiedKFold, train_test_split
)
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier
)
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    classification_report, confusion_matrix
)
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

print("=" * 100)
print("IMPORTING LIBRARIES...")
print("=" * 100)

# XGBoost is an optional dependency: record whether the import succeeded in
# XGB_AVAILABLE so the rest of the notebook can branch on it.
try:
    import xgboost as xgb
    XGB_AVAILABLE = True
    print("XGBoost available")
except ImportError:
    XGB_AVAILABLE = False
    print("XGBoost not installed")

# Global Configuration
print("\n" + "=" * 100)
print("CONFIGURING PATHS...")
print("=" * 100)

# Every path below is relative, so it resolves against this working directory.
os.chdir('d:\\ScoreSight')
print(f"Working directory: {os.getcwd()}")

# Directory layout (relative to the working directory)
MODELS_DIR = Path('models')
VIZ_DIR = Path('visualizations/ps4_match_result')
DATA_DIR = Path('data')
DATASETS_DIR = Path('datasets')
FINAL_DATA_PATH = DATA_DIR / 'match_result' / 'match_result_data.csv'

for dir_path in [MODELS_DIR, VIZ_DIR, FINAL_DATA_PATH.parent]:
    # parents=True is required: VIZ_DIR and FINAL_DATA_PATH.parent are nested
    # two levels deep, and mkdir() without it raises FileNotFoundError when
    # the intermediate directory does not exist yet.
    dir_path.mkdir(parents=True, exist_ok=True)
    print(f"Created/verified: {dir_path}")

# Raw data path
RAW_MATCH_DATA_PATH = DATASETS_DIR / 'Match Winner.csv'

# Display options
# NOTE: this silences *all* Python warnings for the rest of the session.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 8)

print("\n" + "=" * 100)
print("PS4 MATCH RESULT PIPELINE")
print("=" * 100)

## 2. Data Loading and Advanced Feature Engineering

In [None]:
# ==================================================
# Cell 2: Data Loading and Advanced Feature Engineering
# ==================================================
print(f"Loading raw match data from: {RAW_MATCH_DATA_PATH}")

# Rolling-window sizes (in matches) and EWMA smoothing factors used to build
# the per-team form features.
WINDOWS = [5, 10]
ALPHAS = [0.1, 0.2]


def _team_features(team, hist, h2h):
    """Build pre-match features for one team from its past results.

    Args:
        team: Name of the team the features are computed for.
        hist: That team's past matches, oldest first, as dicts with keys
            'gs' (goals scored), 'gc' (goals conceded), 'gd' (goal
            difference) and 'pts' (points earned).
        h2h: Past meetings between this team and its current opponent. Each
            record has keys 'team', 'gs', 'gc' and 'pts', expressed from the
            point of view of the side named in 'team'.

    Returns:
        dict mapping feature name to value. A feature is NaN when there is
        not enough history to compute it.
    """
    out = {}

    # Rolling means over the last W matches; NaN until W matches exist.
    for W in WINDOWS:
        recent = pd.DataFrame(hist[-W:]) if len(hist) >= W else None
        for stat in ('gs', 'gc', 'gd', 'pts'):
            out[f'avg_{stat}_{W}'] = np.nan if recent is None else recent[stat].mean()

    # Exponentially weighted means over the full history (needs >= 2 matches).
    hist_df = pd.DataFrame(hist) if len(hist) > 1 else None
    for alpha in ALPHAS:
        for stat in ('gs', 'gc', 'gd'):
            out[f'ewma_{stat}_{alpha}'] = (
                np.nan if hist_df is None
                else hist_df[stat].ewm(alpha=alpha).mean().iloc[-1]
            )

    # Head-to-head stats. Records are stored from the perspective of the side
    # named in 'team', so they are flipped when that side is the opponent.
    # Without this, both teams of a fixture receive identical h2h values.
    if h2h:
        scored = [m['gs'] if m['team'] == team else m['gc'] for m in h2h]
        conceded = [m['gc'] if m['team'] == team else m['gs'] for m in h2h]
        # The recorded side earning 0 points means the other side won.
        won = [m['pts'] == (3 if m['team'] == team else 0) for m in h2h]
        out['h2h_avg_gs'] = np.mean(scored)
        out['h2h_avg_gc'] = np.mean(conceded)
        out['h2h_win_rate'] = np.mean(won)
    else:
        out['h2h_avg_gs'] = np.nan
        out['h2h_avg_gc'] = np.nan
        out['h2h_win_rate'] = np.nan
    return out


# Only the file read is guarded, so a FileNotFoundError raised later (e.g.
# while saving) is not misreported as a missing raw data file.
try:
    df_mw = pd.read_csv(RAW_MATCH_DATA_PATH)
except FileNotFoundError:
    print(f"ERROR: Raw data file not found at {RAW_MATCH_DATA_PATH}")
    df_ps4 = None
else:
    # NOTE(review): format='mixed' infers the format per value; if the CSV
    # holds day-first dates, confirm whether dayfirst=True is needed.
    df_mw['Date'] = pd.to_datetime(df_mw['Date'], format='mixed')
    df_mw = df_mw.sort_values('Date').reset_index(drop=True)
    # Re-index after dropna so the row labels stay contiguous.
    df_mw = (
        df_mw[['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR']]
        .dropna()
        .reset_index(drop=True)
    )
    print(f"Loaded and sorted raw match data: {df_mw.shape}")

    # --- Advanced Feature Engineering ---
    teams = pd.concat([df_mw['HomeTeam'], df_mw['AwayTeam']]).unique()
    team_hist = {t: [] for t in teams}
    h2h_hist = {}
    rows = []
    for r in df_mw.itertuples(index=False):
        h, a = r.HomeTeam, r.AwayTeam
        # Order-independent key so both fixtures of a pairing share history.
        key = tuple(sorted((h, a)))
        meetings = h2h_hist.setdefault(key, [])

        # Features are computed BEFORE the histories are updated, so they
        # only use matches that precede the current one in df_mw.
        hf = _team_features(h, team_hist[h], meetings)
        af = _team_features(a, team_hist[a], meetings)
        row = {f'H_{k}': v for k, v in hf.items()}
        row.update({f'A_{k}': v for k, v in af.items()})
        for W in WINDOWS:
            row[f'diff_avg_gd_{W}'] = hf[f'avg_gd_{W}'] - af[f'avg_gd_{W}']
        for alpha in ALPHAS:
            row[f'diff_ewma_gd_{alpha}'] = hf[f'ewma_gd_{alpha}'] - af[f'ewma_gd_{alpha}']
        rows.append(row)

        # update histories
        hg, ag = int(r.FTHG), int(r.FTAG)
        if r.FTR == 'H':
            hp, ap = 3, 0
        elif r.FTR == 'A':
            hp, ap = 0, 3
        else:
            hp, ap = 1, 1
        team_hist[h].append({'gs': hg, 'gc': ag, 'gd': hg - ag, 'pts': hp})
        team_hist[a].append({'gs': ag, 'gc': hg, 'gd': ag - hg, 'pts': ap})
        meetings.append({'team': h, 'gs': hg, 'gc': ag, 'pts': hp})

    # Drop matches with any missing feature (insufficient history).
    feat_df = pd.DataFrame(rows, index=df_mw.index).dropna()
    df_mw2 = df_mw.loc[feat_df.index].copy()

    # Encode target: H/D/A as 0/1/2
    class_map = {'H': 0, 'D': 1, 'A': 2}
    target = df_mw2['FTR'].map(class_map).rename('target')
    # concat(axis=1) aligns on index labels, so both pieces must be reset to
    # the same 0..n-1 index; otherwise each feature row is paired with a
    # different match's result and NaN rows appear.
    df_ps4 = pd.concat(
        [feat_df.reset_index(drop=True), target.reset_index(drop=True)],
        axis=1,
    )

    # Save the engineered data
    df_ps4.to_csv(FINAL_DATA_PATH, index=False)

    print(f"Engineered dataset created with {df_ps4.shape[1]-1} features.")
    print(f"Saved final modeling data to: {FINAL_DATA_PATH}")
    print("\nFinal DataFrame head:")
    display(df_ps4.head())

## 3. Model Training

In [None]:
# ==================================================
# Cell 3: Model Training
# ==================================================
# Train every candidate classifier on the engineered dataset, pick the best
# one by cross-validated macro-F1, and persist it with its metrics.
if df_ps4 is not None:
    print("\n" + "=" * 100)
    print("PS4: MATCH RESULT PREDICTION - TRAINING")
    print("=" * 100)

    # Define target and features
    TARGET_COL_PS4 = 'target'
    feature_cols_ps4 = [c for c in df_ps4.columns if c != TARGET_COL_PS4]

    X_ps4 = df_ps4[feature_cols_ps4].copy()
    y_ps4 = df_ps4[TARGET_COL_PS4]

    print(f"Features ({len(feature_cols_ps4)}): {feature_cols_ps4[:5]}...")
    print(f"Target: {TARGET_COL_PS4}")

    # Stratified split
    X_train_ps4, X_test_ps4, y_train_ps4, y_test_ps4 = train_test_split(
        X_ps4, y_ps4, test_size=0.30, random_state=42, stratify=y_ps4
    )

    print(f"\nData split: {X_train_ps4.shape[0]} train / {X_test_ps4.shape[0]} test")

    # Model configurations
    models_ps4 = {
        # The deprecated multi_class argument is omitted: with the lbfgs
        # solver and a multiclass target the default is already multinomial.
        'LogisticRegression': LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42),
        'RandomForest': RandomForestClassifier(random_state=42, class_weight='balanced'),
        'GradientBoosting': GradientBoostingClassifier(random_state=42)
    }
    if XGB_AVAILABLE:
        # NOTE(review): use_label_encoder presumably only matters on older
        # XGBoost releases - confirm against the installed version.
        models_ps4['XGBoost'] = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')

    # Parameter grids; the 'model__' prefix targets the 'model' pipeline step.
    param_grids_ps4 = {
        'LogisticRegression': {'model__C': [0.01, 0.1, 1.0]},
        'RandomForest': {'model__n_estimators': [50, 100], 'model__max_depth': [8, 15]},
        'GradientBoosting': {'model__n_estimators': [50, 100], 'model__learning_rate': [0.01, 0.05]},
        'XGBoost': {'model__n_estimators': [50, 100], 'model__learning_rate': [0.01, 0.05]}
    }

    # --- Execute Training ---
    best_model_data = None
    best_cv_score = float('-inf')
    best_f1_score = -1

    for model_name, model in models_ps4.items():
        print(f"\n--- Training {model_name} ---")
        pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
            ('model', model)
        ])

        # Every grid has at most four combinations, so an exhaustive search
        # is used; the former n_iter=10 exceeded the size of each grid.
        search = GridSearchCV(
            estimator=pipeline, param_grid=param_grids_ps4.get(model_name, {}),
            cv=StratifiedKFold(n_splits=5),
            scoring='f1_macro', n_jobs=-1
        )
        search.fit(X_train_ps4, y_train_ps4)
        y_pred = search.predict(X_test_ps4)
        f1 = f1_score(y_test_ps4, y_pred, average='macro')
        cv_score = search.best_score_

        print(f"  Test F1-Macro: {f1:.4f}")
        print(f"  Best CV Score: {cv_score:.4f}")

        # Select on the cross-validated score, not on the test score:
        # choosing the winner by test F1 would make the reported test metric
        # optimistically biased.
        if cv_score > best_cv_score:
            best_cv_score = cv_score
            best_f1_score = f1
            best_model_data = {
                'name': model_name,
                'model': search.best_estimator_,
                'f1_score': f1,
                'cv_f1_macro': float(cv_score),
                'report': classification_report(y_test_ps4, y_pred, output_dict=True)
            }

    if best_model_data is None:
        # Reached only if no candidate produced a comparable CV score.
        raise RuntimeError("No model produced a valid cross-validation score.")

    print(f"\n--- Best Model: {best_model_data['name']} with F1-Macro: {best_model_data['f1_score']:.4f} ---")

    # --- Save Artifacts ---
    model_path = MODELS_DIR / 'ps4_match_result_model.joblib'
    metadata_path = MODELS_DIR / 'ps4_match_result_metrics.json'

    joblib.dump(best_model_data['model'], model_path)

    metadata = {
        'problem_name': 'PS4_Match_Result',
        'best_model': best_model_data['name'],
        'task_type': 'classification',
        'test_metrics': {'f1_macro': best_model_data['f1_score']},
        'cv_f1_macro': best_model_data['cv_f1_macro'],
        'classification_report': best_model_data['report']
    }
    with open(metadata_path, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2)

    print(f"Model saved: {model_path}")
    print(f"Metadata saved: {metadata_path}")

else:
    print("Skipping training because data loading failed.")