# Problem Statement 2: Match Winner Prediction
## Predict Match Result (Home/Draw/Away)

**Author:** ScoreSight ML Team  
**Date:** 2025-11-12  
**Problem Type:** Multi-class Classification

### Dataset
- **File:** `../data/corrected/match_prediction_with_ftr.csv`
- **Task:** Predict the match winner (home win, draw, or away win)
- **Features:** Match statistics, team form (56+ engineered features)
- **Target:** `ftr_encoded` (full-time result code: 0 = home win, 1 = draw, 2 = away win)

## 1. Setup

In [None]:
import pandas as pd
import numpy as np
import json
import joblib
from pathlib import Path
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (accuracy_score, f1_score, precision_score, recall_score,
                             confusion_matrix, classification_report)
from sklearn.impute import SimpleImputer

# Optional gradient-boosting backends: record whether each package can be
# imported instead of failing the whole cell when one is not installed.
try:
    import xgboost as xgb
except ImportError:
    XGB_AVAILABLE = False
else:
    XGB_AVAILABLE = True

try:
    import lightgbm as lgb
except ImportError:
    LGB_AVAILABLE = False
else:
    LGB_AVAILABLE = True

# GitHub configuration: repository base URL and the engineered-data folder under it
GITHUB_REPO = 'https://raw.githubusercontent.com/springboardmentor345a-create/Projects_2/Prathamesh_Fuke'
DATA_URL_BASE = GITHUB_REPO + '/data/engineered'

# Make sure ./models exists (relative to the current working directory)
models_dir = Path('models')
models_dir.mkdir(parents=True, exist_ok=True)

print("[OK] All libraries imported")
print(f"XGBoost: {XGB_AVAILABLE} | LightGBM: {LGB_AVAILABLE}")

## 2. Load Data

In [None]:
# Load the match dataset, build the feature matrix / target vector and create
# a stratified 80/20 train/test split. Missing feature values are imputed
# AFTER the split using training-set means only, so no test-set information
# reaches the training features.
from sklearn.model_selection import train_test_split

# Load data with FTR target
data_path = '../data/corrected/match_prediction_with_ftr.csv'
print(f"[LOAD] Loading from: {data_path}")

df = pd.read_csv(data_path)
# Normalise headers so the column lookups below are case/whitespace-insensitive
df.columns = df.columns.str.lower().str.strip()

# Fail early with a clear message instead of a bare KeyError further down
missing_targets = [col for col in ('ftr', 'ftr_encoded') if col not in df.columns]
if missing_targets:
    raise KeyError(f"Required target column(s) missing from {data_path}: {missing_targets}")

print(f"[OK] Shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")

# Show target distribution
print(f"\nüìä Target (FTR) Distribution:")
print(df['ftr'].value_counts())
print(f"\nPercentages:")
print(df['ftr'].value_counts(normalize=True) * 100)

# Prepare features - exclude targets and encodings, keep float64/int64 columns only
# NOTE(review): confirm that none of the remaining numeric columns encode
# post-match information (e.g. final scores), which would leak the target.
exclude_cols = ['ftr', 'ftr_encoded', 'hometeam_encoded', 'awayteam_encoded']
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
feature_cols = [col for col in numeric_cols if col not in exclude_cols]

X = df[feature_cols].copy()
y = df['ftr_encoded'].copy()  # 0=H, 1=D, 2=A

# Report missing values now; the actual fill happens after the split (see below)
n_missing = X.isnull().sum().sum()
if n_missing > 0:
    print(f"\n‚ö†Ô∏è  Found {n_missing} missing values - filling with column means")

print(f"\n[OK] Features ({len(feature_cols)}): {feature_cols}")
print(f"[OK] Samples: {len(X)}")
print(f"[OK] Target encoding: 0=Home Win, 1=Draw, 2=Away Win")

# Train/test split (80/20) with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Impute with means learned from the training rows only; the same values are
# applied to the test rows (and to X, so it stays usable without NaNs).
if n_missing > 0:
    train_means = X_train.mean()
    X_train = X_train.fillna(train_means)
    X_test = X_test.fillna(train_means)
    X = X.fillna(train_means)

print(f"\n[SPLIT] Train: {len(X_train)}, Test: {len(X_test)}")

# Per-class counts and percentages for each split
class_labels = ['Home Win', 'Draw', 'Away Win']
for header, y_part in (("[SPLIT] Train class distribution:", y_train),
                       ("\n[SPLIT] Test class distribution:", y_test)):
    print(header)
    for code, label in enumerate(class_labels):
        in_class = (y_part == code)
        print(f"  {label}: {in_class.sum()} ({in_class.mean() * 100:.1f}%)")

## 3. Train Models

In [None]:
# ========================================
# PS2: Match Winner Prediction (H/D/A)
# 3-Class Classification
# ========================================
# Grid-searches up to five classifiers with 5-fold stratified CV, evaluates
# each tuned model on the held-out test split and collects one summary row
# per model in `results`.

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                              recall_score, confusion_matrix, classification_report)
import time

# Optional boosters: guard the imports so a missing package skips that model
# instead of aborting the whole cell.
try:
    from xgboost import XGBClassifier
except ImportError:
    XGBClassifier = None

try:
    from lightgbm import LGBMClassifier
except ImportError:
    LGBMClassifier = None

# Scale features (used by Logistic Regression only; the tree ensembles below
# are fitted on the unscaled frames).
# NOTE: the scaler is fitted on the whole training split before
# cross-validation, so the Logistic Regression CV folds share scaling statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("="*80)
print("TRAINING MATCH WINNER MODELS (3-Class Classification)")
print("="*80)
print(f"\nDataset: {len(X_train)} training, {len(X_test)} test samples")
print(f"Target: FTR (0=Home Win, 1=Draw, 2=Away Win)")
print(f"Baseline (random): 33.3% accuracy")
print(f"Goal: 50-60% accuracy = EXCELLENT for football!\n")
print(f"‚è∞ This will take 15-20 minutes (large dataset)...\n")

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = []


def _print_model_header(step, title):
    """Print the separator banner that introduces one model's training run."""
    print("\n" + "-"*80)
    print(f"[{step}/5] {title}")
    print("-"*80)


def _search_and_evaluate(model_name, estimator, param_grid, X_fit, X_eval):
    """Grid-search ``estimator``, score it on the test split and record the result.

    Uses the notebook-level ``cv``, ``y_train`` and ``y_test`` and appends one
    summary row to ``results``.

    Args:
        model_name: Label stored in the ``'Model'`` field of the summary row.
        estimator: Unfitted classifier to tune.
        param_grid: Parameter grid passed to ``GridSearchCV``.
        X_fit: Training features the search is fitted on.
        X_eval: Test features used for the held-out evaluation.

    Returns:
        Tuple ``(search, predictions, accuracy, weighted_f1)`` where ``search``
        is the fitted ``GridSearchCV`` object.
    """
    started = time.time()

    search = GridSearchCV(estimator, param_grid, cv=cv, scoring='accuracy',
                          n_jobs=-1, verbose=1)
    search.fit(X_fit, y_train)
    predictions = search.predict(X_eval)

    elapsed = time.time() - started
    accuracy = accuracy_score(y_test, predictions)
    weighted_f1 = f1_score(y_test, predictions, average='weighted')

    print(f"‚úì Completed in {elapsed:.1f}s")
    print(f"  Best params: {search.best_params_}")
    print(f"  Best CV Accuracy: {search.best_score_:.4f}")
    print(f"  Test Accuracy: {accuracy:.4f}")
    print(f"  Test F1 (weighted): {weighted_f1:.4f}")

    results.append({
        'Model': model_name,
        'Best_Params': search.best_params_,
        'CV_Accuracy': search.best_score_,
        'Test_Accuracy': accuracy,
        'Test_F1': weighted_f1,
        'Training_Time_s': elapsed
    })
    return search, predictions, accuracy, weighted_f1


# ========================================
# MODEL 1: Logistic Regression (Multinomial)
# ========================================
_print_model_header(1, "Logistic Regression (Multinomial)")

# `multi_class` is intentionally not set: the lbfgs solver already fits a
# multinomial model for multi-class targets, and the parameter is deprecated
# in recent scikit-learn releases.
param_grid_lr = {
    'C': [0.1, 1, 10],
    'penalty': ['l2'],
    'solver': ['lbfgs'],
    'class_weight': ['balanced'],
    'max_iter': [1000]
}

lr = LogisticRegression(random_state=42)
grid_lr, y_pred_lr, acc_lr, f1_lr = _search_and_evaluate(
    'Logistic Regression', lr, param_grid_lr, X_train_scaled, X_test_scaled
)

# ========================================
# MODEL 2: Random Forest
# ========================================
_print_model_header(2, "Random Forest")

param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [2, 4],
    'class_weight': ['balanced']
}

rf = RandomForestClassifier(random_state=42, n_jobs=-1)
grid_rf, y_pred_rf, acc_rf, f1_rf = _search_and_evaluate(
    'Random Forest', rf, param_grid_rf, X_train, X_test
)

# ========================================
# MODEL 3: Gradient Boosting
# ========================================
_print_model_header(3, "Gradient Boosting")

param_grid_gb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'max_depth': [5, 7],
    'subsample': [0.8, 1.0]
}

gb = GradientBoostingClassifier(random_state=42)
grid_gb, y_pred_gb, acc_gb, f1_gb = _search_and_evaluate(
    'Gradient Boosting', gb, param_grid_gb, X_train, X_test
)

# ========================================
# MODEL 4: XGBoost
# ========================================
_print_model_header(4, "XGBoost")

if XGBClassifier is None:
    # Keep the names defined so later cells that reference them do not fail
    print("[SKIP] xgboost is not installed - XGBoost model skipped")
    grid_xgb = y_pred_xgb = acc_xgb = f1_xgb = None
else:
    param_grid_xgb = {
        'n_estimators': [100, 200],
        'learning_rate': [0.05, 0.1],
        'max_depth': [5, 7],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0],
        'objective': ['multi:softmax'],
        'num_class': [3]
    }

    xgb_model = XGBClassifier(random_state=42, eval_metric='mlogloss', n_jobs=-1)
    grid_xgb, y_pred_xgb, acc_xgb, f1_xgb = _search_and_evaluate(
        'XGBoost', xgb_model, param_grid_xgb, X_train, X_test
    )

# ========================================
# MODEL 5: LightGBM
# ========================================
_print_model_header(5, "LightGBM")

if LGBMClassifier is None:
    # Keep the names defined so later cells that reference them do not fail
    print("[SKIP] lightgbm is not installed - LightGBM model skipped")
    grid_lgbm = y_pred_lgbm = acc_lgbm = f1_lgbm = None
else:
    param_grid_lgbm = {
        'n_estimators': [100, 200],
        'learning_rate': [0.05, 0.1],
        'max_depth': [5, 7, -1],
        'num_leaves': [31, 63],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0],
        'class_weight': ['balanced']
    }

    # subsample_freq=1 enables row bagging; with LightGBM's default of 0 the
    # `subsample` values in the grid would have no effect.
    lgbm = LGBMClassifier(random_state=42, verbose=-1, n_jobs=-1, subsample_freq=1)
    grid_lgbm, y_pred_lgbm, acc_lgbm, f1_lgbm = _search_and_evaluate(
        'LightGBM', lgbm, param_grid_lgbm, X_train, X_test
    )

print("\n" + "="*80)
print("‚úÖ ALL MODELS TRAINED SUCCESSFULLY!")
print("="*80)

## 4. Compare Models and Save the Best One

In [None]:
# Compare the tuned models, report the held-out performance of the selected
# one and persist both the comparison table and the model artifact.
from pathlib import Path
import joblib

# Compare all models
results_df = pd.DataFrame(results)
if results_df.empty:
    raise RuntimeError("No trained models to compare - run the training cell first")

# Rank by cross-validated accuracy: choosing the winner by its test score
# would make the reported test accuracy an optimistic estimate.
results_df = results_df.sort_values('CV_Accuracy', ascending=False)

print("\n" + "="*80)
print("üìä MODEL COMPARISON - PS2: Match Winner Prediction (H/D/A)")
print("="*80)
print(results_df.to_string(index=False))

# Find best model (top row after the CV-based ranking)
best_row = results_df.iloc[0]
best_model_name = best_row['Model']
best_cv_accuracy = best_row['CV_Accuracy']
best_accuracy = best_row['Test_Accuracy']
best_f1 = best_row['Test_F1']

print(f"\nüèÜ Best Model: {best_model_name}")
print(f"   Selected by CV Accuracy: {best_cv_accuracy:.4f}")
print(f"   Test Accuracy: {best_accuracy:.4f}")
print(f"   Test F1 Score: {best_f1:.4f}")

# Get best model object and its test-set predictions by name
model_mapping = {
    'Logistic Regression': grid_lr,
    'Random Forest': grid_rf,
    'Gradient Boosting': grid_gb,
    'XGBoost': grid_xgb,
    'LightGBM': grid_lgbm
}
predictions_by_model = {
    'Logistic Regression': y_pred_lr,
    'Random Forest': y_pred_rf,
    'Gradient Boosting': y_pred_gb,
    'XGBoost': y_pred_xgb,
    'LightGBM': y_pred_lgbm
}
best_model = model_mapping[best_model_name]
y_pred_best = predictions_by_model[best_model_name]

print(f"\nüìà Performance Analysis:")
print(f"   ‚Ä¢ Random baseline (guess): 33.3% accuracy")
print(f"   ‚Ä¢ Your model: {best_accuracy*100:.1f}% accuracy")
print(f"   ‚Ä¢ Improvement: {(best_accuracy - 0.333) / 0.333 * 100:.1f}% better than random!")
print(f"   ‚Ä¢ This dataset: {len(X_train)} training samples")
print(f"   ‚Ä¢ For football match prediction, 50-60% accuracy is EXCELLENT!")
print(f"   ‚Ä¢ Professional betting models achieve 50-55%")

# Confusion Matrix
# Explicit labels keep the matrix 3x3 and aligned with the class names even
# if a class happens to be absent from the test split or the predictions.
class_codes = [0, 1, 2]
print(f"\nüìä Confusion Matrix (Best Model):")
cm = confusion_matrix(y_test, y_pred_best, labels=class_codes)
print(cm)
print(f"\n   Rows: Actual | Columns: Predicted")
print(f"   [0] Home Win, [1] Draw, [2] Away Win")

# Classification Report
print(f"\nüìã Classification Report:")
print(classification_report(y_test, y_pred_best,
                            labels=class_codes,
                            target_names=['Home Win', 'Draw', 'Away Win']))

# Make sure the output folders exist before writing into them
for out_file in ('../visualizations/ps2_model_comparison.csv',
                 '../models/ps2_match_winner_model.joblib'):
    Path(out_file).parent.mkdir(parents=True, exist_ok=True)

# Save results
results_df.to_csv('../visualizations/ps2_model_comparison.csv', index=False)
print(f"\nüíæ Results saved to: ../visualizations/ps2_model_comparison.csv")

# Save best model
# Only Logistic Regression was trained on standardized inputs; the other
# models were fitted on the raw feature frame. `requires_scaling` tells the
# consumer whether `scaler` must be applied before calling `model.predict`.
joblib.dump({
    'model': best_model.best_estimator_,
    'scaler': scaler,
    'requires_scaling': best_model_name == 'Logistic Regression',
    'features': feature_cols,
    'model_name': best_model_name,
    'cv_accuracy': best_cv_accuracy,
    'accuracy': best_accuracy,
    'f1_score': best_f1,
    'class_mapping': {0: 'H', 1: 'D', 2: 'A'},
    'class_names': {0: 'Home Win', 1: 'Draw', 2: 'Away Win'}
}, '../models/ps2_match_winner_model.joblib')

print(f"\nüíæ Best model saved to: ../models/ps2_match_winner_model.joblib")

print("\n" + "="*80)
print("‚úÖ PS2: MATCH WINNER PREDICTION - COMPLETE!")
print("="*80)
print(f"\nüéØ Final Results:")
print(f"   Best Model: {best_model_name}")
print(f"   Test Accuracy: {best_accuracy:.4f}")
print(f"   Test F1 Score: {best_f1:.4f}")
print(f"\nüìù Output Format for Deployment:")
print(f"   Input: Home team stats, Away team stats")
print(f"   Output: Team name (e.g., 'Manchester United') OR 'Draw'")
print(f"   NO scores, NO probabilities, ONLY winner name or 'Draw'")