# Problem Statement 5: Match Result Prediction
## Predict Match Result (Home/Draw/Away)

**Author:** ScoreSight ML Team  
**Date:** 2025-11-12  
**Problem Type:** Multi-class Classification (H/D/A)

### Dataset
- **File:** `data/corrected/match_prediction_corrected.csv`
- **Task:** Predict match result (Home/Draw/Away)
- **Features:** Numeric columns of the corrected dataset (the target and identifier columns are excluded)
- **Target:** Result (H/D/A)

## 1. Setup

In [None]:
import pandas as pd
import numpy as np
import json
import joblib
from pathlib import Path
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
import os

from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import f1_score, accuracy_score
from sklearn.impute import SimpleImputer

# XGBoost and LightGBM are optional extras. BOTH_AVAILABLE is True only when
# both imports succeed; otherwise only the scikit-learn models are trained.
try:
    import xgboost as xgb
    import lightgbm as lgb
    BOTH_AVAILABLE = True
except Exception:
    # Deliberately broader than ImportError: a package that is installed but
    # whose native library fails to load may raise a different exception type.
    # Unlike a bare `except:`, this still lets KeyboardInterrupt and
    # SystemExit propagate.
    BOTH_AVAILABLE = False

# For Colab: the project root is /content/drive/MyDrive/ScoreSight; the data
# and models folders are direct children of it.
SCORESIGHT_DIR = '/content/drive/MyDrive/ScoreSight'
DATA_DIR, MODELS_DIR = (
    os.path.join(SCORESIGHT_DIR, subdir) for subdir in ('data', 'models')
)

# Make sure the models folder (and any missing parent folders) exists.
os.makedirs(MODELS_DIR, exist_ok=True)

print("[OK] Libraries imported")

## 2. Load Data

In [None]:
data_path = os.path.join(DATA_DIR, 'corrected', 'match_prediction_corrected.csv')
print(f"Loading from: {data_path}")

df = pd.read_csv(data_path)
# Lower-case and trim the headers so the name-based lookups below do not
# depend on how the CSV spelled them.
df.columns = df.columns.str.lower().str.strip()

print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\nFirst rows:")
print(df.head())

# Identify target (usually FTR or result): the first column whose name contains
# one of the hints wins; when nothing matches, fall back to the last column.
target_hints = ('result', 'ftr', 'outcome', 'winner')
target = next(
    (col for col in df.columns if any(hint in col for hint in target_hints)),
    df.columns[-1],
)

print(f"\nTarget: {target}")

# Rows without a label cannot be used for supervised training, and
# LabelEncoder cannot sensibly encode a NaN mixed into the labels.
missing_target = df[target].isna()
if missing_target.any():
    print(f"[WARN] Dropping {int(missing_target.sum())} rows with missing target")
    df = df.loc[~missing_target].reset_index(drop=True)

print(f"Classes: {df[target].unique()}")

# Prepare features: every numeric column except the target and the known
# index/metadata column names listed here.
# NOTE(review): all remaining numeric columns become features; if the CSV
# holds post-match statistics they would leak the result -- confirm.
exclude_cols = [target, 'unnamed: 0', 'date', 'team', 'match']
numeric_cols = df.select_dtypes(include='number').columns
feature_cols = [col for col in numeric_cols if col not in exclude_cols]
if not feature_cols:
    raise ValueError(f"No numeric feature columns found in {data_path}")

# Missing feature values are intentionally left as NaN. The SimpleImputer
# inside the training pipeline fills them from the data it is fitted on, so
# cross-validation folds never see statistics computed on held-out rows
# (filling with the full-dataset mean here would leak them).
X = df[feature_cols]
y_raw = df[target]

# Encode target labels as integers; `le` is kept to map predictions back.
le = LabelEncoder()
y = le.fit_transform(y_raw)

print(f"\nFeatures: {len(feature_cols)}, Samples: {len(X)}")
print(f"Classes: {list(le.classes_)}")

## 3. Train Models

In [None]:
def create_pipeline(model):
    """Wrap *model* in a two-step pipeline: imputation, then the estimator.

    Args:
        model: Estimator instance placed in the final ``'model'`` step.

    Returns:
        A ``Pipeline`` whose ``'imputer'`` step is a default-configured
        ``SimpleImputer`` and whose ``'model'`` step is *model* itself.
    """
    steps = [
        ('imputer', SimpleImputer()),
        ('model', model),
    ]
    return Pipeline(steps)

# Imported here so this cell is self-contained; clone() is scikit-learn's
# supported way to obtain an unfitted copy of an estimator with the same
# parameters (it also handles estimators with nested parameters).
from sklearn.base import clone

# Candidate classifiers, all seeded for reproducibility.
models = {
    'LogisticRegression': LogisticRegression(random_state=42, max_iter=2000),
    'RandomForest': RandomForestClassifier(random_state=42, n_jobs=-1),
    'GradientBoosting': GradientBoostingClassifier(random_state=42)
}

# The boosted-tree libraries are only added when both imported successfully.
if BOTH_AVAILABLE:
    models['XGBoost'] = xgb.XGBClassifier(random_state=42, n_jobs=-1, eval_metric='mlogloss')
    models['LightGBM'] = lgb.LGBMClassifier(random_state=42, n_jobs=-1, verbose=-1)

results = {}          # model name -> mean weighted F1 over the CV folds
trained_models = {}   # model name -> pipeline fitted on the full dataset
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for name, model in models.items():
    print(f"[{name}] Training...")

    # Estimate generalisation with stratified 5-fold CV. Every fold gets a
    # fresh, unfitted copy so no state is shared between folds.
    scores = []
    for train_idx, test_idx in cv.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        temp_pipeline = create_pipeline(clone(model))
        temp_pipeline.fit(X_train, y_train)
        scores.append(f1_score(y_test, temp_pipeline.predict(X_test), average='weighted'))

    # The pipeline that is kept (and later saved) is fitted on all samples.
    pipeline = create_pipeline(model)
    pipeline.fit(X, y)

    results[name] = float(np.mean(scores))
    trained_models[name] = pipeline
    print(f"  F1: {results[name]:.4f}")

print("\n[OK] Training complete")

## 4. Save Models

In [None]:
# Persist every fitted pipeline, the label encoder and a JSON summary.
models_dir = Path(MODELS_DIR)
models_dir.mkdir(exist_ok=True, parents=True)

# Best model = highest mean cross-validated weighted F1.
best_name = max(results, key=results.get)
best_f1 = results[best_name]

for name, model in trained_models.items():
    path = models_dir / f"match_result_{name.lower().replace(' ', '_')}_ps5.joblib"
    joblib.dump(model, path)
    print(f"[SAVE] {name} -> {path}")

# Save encoder so integer predictions can be mapped back to the labels.
le_path = models_dir / "match_result_encoder_ps5.joblib"
joblib.dump(le, le_path)
print(f"[SAVE] Encoder -> {le_path}")

summary = {
    'problem_statement': 'Match Result Prediction',
    'best_model': best_name,
    'best_f1': float(best_f1),
    'timestamp': datetime.now().isoformat(),
    # .tolist() converts NumPy scalars to native Python values; numeric class
    # labels left as NumPy integers would make json.dump raise TypeError.
    'classes': le.classes_.tolist(),
    'models': {name: {'f1': float(score)} for name, score in results.items()}
}

summary_path = models_dir / 'match_result_training_summary_ps5.json'
# Explicit encoding keeps the file identical across platforms.
with open(summary_path, 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2)

print(f"\n[COMPLETE] Best: {best_name} (F1: {best_f1:.4f})")