# Problem Statement 5: Match Result Prediction
## Predict Match Result (Home/Draw/Away)

**Author:** ScoreSight ML Team  
**Date:** 2025-11-12  
**Problem Type:** Multi-class Classification (H/D/A)

### Dataset
- **File:** `data/corrected/match_prediction_corrected.csv` (loaded from the project's GitHub repository)
- **Task:** Predict match result (Home/Draw/Away)
- **Features:** Numeric columns of the corrected match dataset (the target column and the `unnamed: 0` index column are excluded)
- **Target:** Result (H/D/A), read from the `mw` column

## 1. Setup

In [None]:
import pandas as pd
import numpy as np
import json
import joblib
from pathlib import Path
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (accuracy_score, f1_score, precision_score, recall_score,
                             confusion_matrix, classification_report)
from sklearn.impute import SimpleImputer

# Optional gradient-boosting backends. Each flag records whether the import
# succeeded; the search-space and model-registry cells consult these flags
# before referencing `xgb` or `lgb`.
try:
    import xgboost as xgb
except ImportError:
    XGB_AVAILABLE = False
else:
    XGB_AVAILABLE = True

try:
    import lightgbm as lgb
except ImportError:
    LGB_AVAILABLE = False
else:
    LGB_AVAILABLE = True

# GitHub configuration: base URL of the repository's raw content and of the
# folder that holds the corrected datasets.
GITHUB_REPO = 'https://raw.githubusercontent.com/springboardmentor345a-create/Projects_2/Prathamesh_Fuke'
DATA_URL_BASE_CORRECTED = f'{GITHUB_REPO}/data/corrected'

# Local output folder for the saved models and the training summary.
models_dir = Path('models')
models_dir.mkdir(parents=True, exist_ok=True)

print(
    "[OK] All libraries imported",
    f"XGBoost: {XGB_AVAILABLE} | LightGBM: {LGB_AVAILABLE}",
    sep="\n",
)

## 2. Load Data

In [None]:
# Load data from GitHub (from corrected folder, not engineered)
data_url = f'{DATA_URL_BASE_CORRECTED}/match_prediction_corrected.csv'
print(f"[LOAD] Loading from GitHub: {data_url}")

df = pd.read_csv(data_url)
# Normalise the header so the column lookups below do not depend on the
# capitalisation or stray whitespace of the CSV header.
df.columns = df.columns.str.lower().str.strip()

print(f"[OK] Shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")

# Fail early with a readable message when the target column is absent.
# NOTE(review): 'mw' is used as the target throughout this notebook — confirm
# that this column really holds the H/D/A match result.
if 'mw' not in df.columns:
    raise KeyError(
        f"Target column 'mw' not found in {data_url}; available columns: {list(df.columns)}"
    )

print("\nTarget (mw) distribution:")
print(df['mw'].value_counts())

# Prepare features: every numeric column except the target and the index
# column pandas names 'Unnamed: 0' (lower-cased above). select_dtypes keeps
# all numeric widths rather than only int64/float64.
exclude_cols = ['mw', 'unnamed: 0']
feature_cols = [
    col for col in df.select_dtypes(include='number').columns
    if col not in exclude_cols
]

X = df[feature_cols].copy()
y = df['mw'].copy()

# Missing values are intentionally left in place. Filling them here with
# full-dataset means would leak statistics from the rows that later become the
# test set; the SimpleImputer step inside each training pipeline imputes them
# from the training data only.
n_missing = int(X.isna().sum().sum())

print(f"\n[OK] Features: {len(feature_cols)}, Samples: {len(X)}")
print(f"[OK] Feature columns: {feature_cols[:10]}... (showing first 10)")
print(f"[OK] Missing feature values left for the pipeline imputer: {n_missing}")

## 3. Train Models

In [None]:
# ============================================================================
# TRAIN/TEST SPLIT
# ============================================================================

# Hold out 20% of the rows as the test partition. The split is stratified on
# the target and uses a fixed seed so it is repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print(f"[SPLIT] Train: {len(X_train)}, Test: {len(X_test)}")
for split_name, split_target in (('Train', y_train), ('Test', y_test)):
    print(f"[SPLIT] {split_name} class distribution:\n{split_target.value_counts()}")

# ============================================================================
# HYPERPARAMETER SEARCH SPACE - Match Result (Large Balanced Dataset)
# ============================================================================

# Every key carries the 'model__' prefix because the estimator being tuned is
# the pipeline step named 'model'. Values are the candidates the search samples.

# NOTE(review): recent scikit-learn releases deprecate the 'liblinear' solver
# for targets with more than two classes — confirm against the installed version.
logistic_grid = {
    'model__C': [0.1, 1, 10, 100],
    'model__solver': ['lbfgs', 'liblinear'],
    'model__max_iter': [3000, 5000],
}

forest_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [10, 15, 20, None],
    'model__min_samples_split': [2, 5],
    'model__min_samples_leaf': [1, 2],
}

boosting_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__learning_rate': [0.01, 0.05, 0.1],
    'model__max_depth': [5, 7, 9],
    'model__subsample': [0.8, 0.9],
}

param_grids = {
    'LogisticRegression': logistic_grid,
    'RandomForest': forest_grid,
    'GradientBoosting': boosting_grid,
}

# The optional backends only get a search space when their import succeeded.
if XGB_AVAILABLE:
    param_grids.update(XGBoost={
        'model__n_estimators': [100, 200, 300],
        'model__learning_rate': [0.01, 0.05, 0.1],
        'model__max_depth': [5, 7, 9],
        'model__subsample': [0.8, 0.9, 1.0],
    })

if LGB_AVAILABLE:
    param_grids.update(LightGBM={
        'model__n_estimators': [100, 200, 300],
        'model__learning_rate': [0.01, 0.05, 0.1],
        'model__num_leaves': [30, 50, 70],
        'model__min_data_in_leaf': [10, 20],
    })

# ============================================================================
# TRAINING WITH HYPERPARAMETER TUNING
# ============================================================================

def create_pipeline(model, use_scaling=True):
    """Wrap ``model`` in a preprocessing pipeline.

    The pipeline starts with a mean ``SimpleImputer`` step named ``'imputer'``,
    is optionally followed by a ``StandardScaler`` step named ``'scaler'``, and
    ends with ``model`` as the step named ``'model'``.

    Args:
        model: Estimator placed as the final pipeline step.
        use_scaling: When truthy (the default), include the scaler step.

    Returns:
        The assembled ``Pipeline``.
    """
    scaling_steps = [('scaler', StandardScaler())] if use_scaling else []
    return Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        *scaling_steps,
        ('model', model),
    ])

banner = "=" * 80
print(f"\n{banner}\nHYPERPARAMETER TUNING - RandomizedSearchCV\n{banner}")

# Per-model bookkeeping, filled in by the training loop.
results, trained_models, best_models = {}, {}, {}

# One shuffled, seeded 5-fold stratified splitter shared by every search.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Base estimators keyed by the same names used in `param_grids`.
models_to_train = {
    'LogisticRegression': LogisticRegression(max_iter=5000, random_state=42),
    'RandomForest': RandomForestClassifier(n_jobs=-1, random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
}

# Optional backends are registered only when their import succeeded.
# NOTE(review): newer XGBoost releases expect integer-encoded class labels and
# no longer use `use_label_encoder` — confirm against the installed version
# and the dtype of `y`.
if XGB_AVAILABLE:
    models_to_train['XGBoost'] = xgb.XGBClassifier(
        eval_metric='mlogloss',
        use_label_encoder=False,
        n_jobs=-1,
        random_state=42,
    )

if LGB_AVAILABLE:
    models_to_train['LightGBM'] = lgb.LGBMClassifier(
        verbose=-1,
        n_jobs=-1,
        random_state=42,
    )

# Tune, evaluate and register every candidate model in turn.
for model_name, model in models_to_train.items():
    divider = '-' * 80
    print(f"\n{divider}")
    print(f"[{model_name}] Hyperparameter Tuning...")
    print(divider)

    # Randomised search over this model's grid, scored by weighted F1 on the
    # shared cross-validation splitter.
    search = RandomizedSearchCV(
        create_pipeline(model),
        param_grids[model_name],
        scoring='f1_weighted',
        n_iter=25,
        cv=cv,
        random_state=42,
        n_jobs=-1,
        verbose=1,
    )
    search.fit(X_train, y_train)

    tuned_pipeline = search.best_estimator_
    best_models[model_name] = tuned_pipeline
    print(f"\n[BEST] Params: {search.best_params_}")
    print(f"[CV] F1 Score: {search.best_score_:.4f}")

    # Test evaluation: all averaged metrics use the class-weighted average.
    y_pred = tuned_pipeline.predict(X_test)
    weighted = {'average': 'weighted'}
    test_scores = {
        'test_accuracy': float(accuracy_score(y_test, y_pred)),
        'test_f1': float(f1_score(y_test, y_pred, **weighted)),
        'test_precision': float(precision_score(y_test, y_pred, zero_division=0, **weighted)),
        'test_recall': float(recall_score(y_test, y_pred, zero_division=0, **weighted)),
    }

    # Key order is kept stable because this dict is written to the summary JSON.
    results[model_name] = {
        'best_params': search.best_params_,
        'cv_f1': float(search.best_score_),
        **test_scores,
    }

    print(f"\n[TEST] Accuracy:  {test_scores['test_accuracy']:.4f}")
    print(f"[TEST] F1:        {test_scores['test_f1']:.4f}")
    print(f"[TEST] Precision: {test_scores['test_precision']:.4f}")
    print(f"[TEST] Recall:    {test_scores['test_recall']:.4f}")

    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Registered last, so a model only counts as trained once it was evaluated.
    trained_models[model_name] = tuned_pipeline

closing_rule = "=" * 80
print(f"\n{closing_rule}")
print("TRAINING COMPLETE")
print(closing_rule)

## 4. Save Models

In [None]:
# ============================================================================
# IDENTIFY BEST MODEL
# ============================================================================

# Make an empty run fail with an explicit message (same exception type that
# max() would raise on an empty mapping).
if not results:
    raise ValueError("No models were trained; cannot select a best model")

# Rank candidates by their cross-validated F1, not by test F1. Picking the
# winner on the held-out set would turn the test set into a tuning set and make
# the winner's reported test metrics optimistically biased.
best_model_name = max(results, key=lambda name: results[name]['cv_f1'])
best_metrics = results[best_model_name]

print(f"\n[WINNER] Best Model: {best_model_name}")
print(f"[WINNER] CV F1 Score: {best_metrics['cv_f1']:.4f}")
print(f"[WINNER] Test F1 Score: {best_metrics['test_f1']:.4f}")
print(f"[WINNER] Test Accuracy: {best_metrics['test_accuracy']:.4f}")

# ============================================================================
# SAVE ALL MODELS
# ============================================================================

rule = '=' * 80
print(f"\n{rule}")
print("SAVING MODELS")
print(rule)

# Persist every tuned pipeline under a file name derived from its model key
# (lower-cased, spaces replaced by underscores).
for name in trained_models:
    slug = name.lower().replace(' ', '_')
    path = models_dir / f"ps5_match_result_{slug}.joblib"
    joblib.dump(trained_models[name], path)
    print(f"[SAVE] {name:20} -> {path}")

# ============================================================================
# SAVE TRAINING SUMMARY
# ============================================================================

# Facts about the dataset and the split, nested under the 'data' key.
dataset_info = {
    'url': data_url,
    'shape': df.shape,
    'train_size': len(X_train),
    'test_size': len(X_test),
    'n_features': len(feature_cols),
    'n_classes': len(np.unique(y)),
}

summary = {
    'problem_statement': 'PS5: Match Result Prediction',
    'task_type': 'Multi-class Classification (Home/Draw/Away)',
    'best_model': best_model_name,
    'best_test_f1': best_metrics['test_f1'],
    'best_test_accuracy': best_metrics['test_accuracy'],
    'timestamp': datetime.now().isoformat(),
    'data': dataset_info,
    'cv_strategy': '5-Fold Stratified KFold',
    'tuning_method': 'RandomizedSearchCV (25 iterations)',
    'all_models': results,
}

# Serialise once and reuse the text for both the file and the console echo.
# default=str stringifies any value the JSON encoder cannot handle natively.
summary_json = json.dumps(summary, indent=2, default=str)

summary_path = models_dir / 'ps5_match_result_summary.json'
summary_path.write_text(summary_json)

print(f"\n[SAVE] Summary -> {summary_path}")
print(f"\n{summary_json}")