# ScoreSight: Model Training Pipeline v1.0
## Multi-Algorithm ML Optimization for EPL Prediction

**Author:** Prathamesh Fuke  
**Version:** 1.0 - Comprehensive Model Training  
**Focus:** Multi-algorithm training, hyperparameter optimization, temporal CV, performance analysis

### Notebook Overview
This notebook implements the comprehensive model training pipeline:
- **Phase 1:** Data Loading & Feature Selection
- **Phase 2:** Multi-Algorithm Training (Linear, Ridge, Random Forest, XGBoost, LightGBM)
- **Phase 3:** Hyperparameter Optimization (GridSearchCV, RandomizedSearchCV)
- **Phase 4:** Temporal Cross-Validation (Walk-Forward, Season-Aware)
- **Phase 5:** Performance Evaluation & Comparison
- **Phase 6:** Feature Importance Analysis

### Expected Outcomes
- Best Model: XGBoost or LightGBM with MAE < 1.0
- Feature Importance Ranking
- Cross-validation Performance Metrics
- Model Comparison Report

## 1. Import Required Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Sklearn imports
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error

# XGBoost and LightGBM
try:
    import xgboost as xgb
    XGB_AVAILABLE = True
except ImportError:
    XGB_AVAILABLE = False
    print("‚ö†Ô∏è  XGBoost not installed")

try:
    import lightgbm as lgb
    LGB_AVAILABLE = True
except ImportError:
    LGB_AVAILABLE = False
    print("‚ö†Ô∏è  LightGBM not installed")

import os
os.chdir('d:\\ScoreSight')

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 8)

print("‚úì All libraries imported successfully")
print(f"  XGBoost Available: {XGB_AVAILABLE}")
print(f"  LightGBM Available: {LGB_AVAILABLE}")

‚úì All libraries imported successfully
  XGBoost Available: True
  LightGBM Available: True


## 2. Data Loading & Preparation

In [3]:
# Load engineered features
df = pd.read_csv('data/engineered/data_engineered_match_v3.csv')

print("‚úì Data loaded successfully")
print(f"\nüìä DATASET OVERVIEW")
print("="*80)
print(f"Shape: {df.shape}")
print(f"\nColumns: {list(df.columns[:10])}...")

# Standardize column names (lowercase)
df.columns = df.columns.str.lower().str.strip()

print(f"\nTarget Variable (fthg): Home team goals")
print(f"  Mean: {df['fthg'].mean():.2f}")
print(f"  Std: {df['fthg'].std():.2f}")
print(f"  Min: {df['fthg'].min():.0f}, Max: {df['fthg'].max():.0f}")
print(f"\nMissing values: {df.isnull().sum().sum()}")

# Check for date column
if 'date' in df.columns:
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    print(f"\nDate range: {df['date'].min()} to {df['date'].max()}")

‚úì Data loaded successfully

üìä DATASET OVERVIEW
Shape: (6840, 96)

Columns: ['unnamed:_0', 'date', 'hometeam', 'awayteam', 'fthg', 'ftag', 'ftr', 'htgs', 'atgs', 'htgc']...

Target Variable (fthg): Home team goals
  Mean: 1.53
  Std: 1.30
  Min: 0, Max: 9

Missing values: 1056

Date range: 2000-01-10 00:00:00 to 2018-12-03 00:00:00


## 3. Feature Selection & Scaling

In [8]:
# Define target and features
target = 'fthg'  # Full-Time Home Goals

# Remove non-numeric and irrelevant columns
exclude_cols = ['date', 'hometeam', 'awayteam', 'ftr', 'htr', 'fthg', 'ftag', 'hthg', 'htag', 'unnamed: 0']
feature_cols = [col for col in df.columns if col not in exclude_cols and df[col].dtype in ['float64', 'int64']]

X = df[feature_cols].copy()
y = df[target].copy()

# Clean feature names for LightGBM compatibility (remove special JSON characters)
X.columns = [col.replace(':', '_').replace('[', '(').replace(']', ')') for col in X.columns]
feature_cols = list(X.columns)

print(f"‚úì Feature Selection Complete")
print(f"\nFeatures: {len(feature_cols)}")
print(f"Target: {target}")
print(f"\nFeature list (first 20):")
for i, col in enumerate(feature_cols[:20], 1):
    print(f"  {i:2d}. {col}")

# Handle missing values
X = X.fillna(X.mean())
X = X.replace([np.inf, -np.inf], np.nan).fillna(X.mean())

print(f"\n‚úì Missing values handled")
print(f"  Final shape: {X.shape}")

‚úì Feature Selection Complete

Features: 78
Target: fthg

Feature list (first 20):
   1. unnamed__0
   2. htgs
   3. atgs
   4. htgc
   5. atgc
   6. htp
   7. atp
   8. mw
   9. htformpts
  10. atformpts
  11. htwinstreak3
  12. htwinstreak5
  13. htlossstreak3
  14. htlossstreak5
  15. atwinstreak3
  16. atwinstreak5
  17. atlossstreak3
  18. atlossstreak5
  19. htgd
  20. atgd

‚úì Missing values handled
  Final shape: (6840, 78)


## 4. Temporal Cross-Validation Strategy

In [5]:
class TemporalCrossValidator:
    """
    Implement walk-forward cross-validation for time-series data
    Prevents data leakage by only training on past data
    """
    
    def __init__(self, n_splits=5, initial_train_size=0.5):
        self.n_splits = n_splits
        self.initial_train_size = initial_train_size
        
    def split(self, X, y=None):
        """Generate train/test indices for walk-forward validation"""
        n_samples = len(X)
        initial_train = int(n_samples * self.initial_train_size)
        test_size = (n_samples - initial_train) // self.n_splits
        
        for i in range(self.n_splits):
            train_end = initial_train + (i * test_size)
            test_end = train_end + test_size
            
            train_idx = np.arange(0, train_end)
            test_idx = np.arange(train_end, min(test_end, n_samples))
            
            if len(test_idx) > 0:
                yield train_idx, test_idx

# Create temporal CV splitter
tcv = TemporalCrossValidator(n_splits=5, initial_train_size=0.6)

# Verify splits
split_info = []
for i, (train_idx, test_idx) in enumerate(tcv.split(X, y), 1):
    split_info.append({
        'Fold': i,
        'Train Size': len(train_idx),
        'Test Size': len(test_idx),
        'Train %': f"{len(train_idx)/len(X)*100:.1f}%",
        'Test %': f"{len(test_idx)/len(X)*100:.1f}%"
    })

print("‚úì Temporal Cross-Validation Created (Walk-Forward Strategy)")
print("\n" + pd.DataFrame(split_info).to_string(index=False))

‚úì Temporal Cross-Validation Created (Walk-Forward Strategy)

 Fold  Train Size  Test Size Train % Test %
    1        4104        547   60.0%   8.0%
    2        4651        547   68.0%   8.0%
    3        5198        547   76.0%   8.0%
    4        5745        547   84.0%   8.0%
    5        6292        547   92.0%   8.0%


## 5. Multi-Algorithm Model Training

In [6]:
# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

print("‚úì Features scaled")

# Define models with hyperparameters
models = {
    'Linear Regression': {
        'model': LinearRegression(),
        'params': {},
        'use_scaled': True
    },
    'Ridge Regression': {
        'model': Ridge(),
        'params': {'alpha': [0.1, 1, 10, 100]},
        'use_scaled': True
    },
    'Lasso Regression': {
        'model': Lasso(max_iter=5000),
        'params': {'alpha': [0.001, 0.01, 0.1, 1]},
        'use_scaled': True
    },
    'Random Forest': {
        'model': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
        'params': {'max_depth': [10, 20, 30], 'min_samples_split': [5, 10]},
        'use_scaled': False
    },
    'Gradient Boosting': {
        'model': GradientBoostingRegressor(n_estimators=100, random_state=42),
        'params': {'learning_rate': [0.01, 0.1], 'max_depth': [3, 5, 7]},
        'use_scaled': False
    }
}

if XGB_AVAILABLE:
    models['XGBoost'] = {
        'model': xgb.XGBRegressor(n_estimators=100, random_state=42, n_jobs=-1),
        'params': {'learning_rate': [0.01, 0.1], 'max_depth': [3, 5, 7]},
        'use_scaled': False
    }

if LGB_AVAILABLE:
    models['LightGBM'] = {
        'model': lgb.LGBMRegressor(n_estimators=100, random_state=42, n_jobs=-1, verbose=-1),
        'params': {'learning_rate': [0.01, 0.1], 'max_depth': [3, 5, 7]},
        'use_scaled': False
    }

print(f"\n‚úì Models configured: {len(models)} models")
for model_name in models.keys():
    print(f"  - {model_name}")

‚úì Features scaled

‚úì Models configured: 7 models
  - Linear Regression
  - Ridge Regression
  - Lasso Regression
  - Random Forest
  - Gradient Boosting
  - XGBoost
  - LightGBM


## 6. Cross-Validation & Hyperparameter Tuning

In [7]:
import time

results = []
trained_models = {}

print("\n" + "="*80)
print("üöÄ TRAINING MODELS WITH CROSS-VALIDATION")
print("="*80)

for model_name, model_config in models.items():
    print(f"\n{model_name}...")
    start_time = time.time()
    
    X_train_data = X_scaled if model_config['use_scaled'] else X
    
    # Temporal CV
    cv_scores_mae = []
    cv_scores_rmse = []
    cv_scores_r2 = []
    
    for train_idx, test_idx in tcv.split(X_train_data, y):
        X_train, X_test = X_train_data.iloc[train_idx], X_train_data.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        
        # Train
        model = model_config['model'].__class__(**model_config['model'].get_params())
        model.fit(X_train, y_train)
        
        # Predict
        y_pred = model.predict(X_test)
        
        # Score
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)
        
        cv_scores_mae.append(mae)
        cv_scores_rmse.append(rmse)
        cv_scores_r2.append(r2)
    
    elapsed = time.time() - start_time
    
    # Store results
    results.append({
        'Model': model_name,
        'MAE': np.mean(cv_scores_mae),
        'MAE_std': np.std(cv_scores_mae),
        'RMSE': np.mean(cv_scores_rmse),
        'R¬≤': np.mean(cv_scores_r2),
        'Time (s)': elapsed
    })
    
    print(f"  ‚úì MAE: {np.mean(cv_scores_mae):.4f} (¬±{np.std(cv_scores_mae):.4f})")
    print(f"  ‚úì RMSE: {np.mean(cv_scores_rmse):.4f}")
    print(f"  ‚úì R¬≤: {np.mean(cv_scores_r2):.4f}")
    
    # Train final model on full data for feature importance
    final_model = model_config['model'].__class__(**model_config['model'].get_params())
    final_model.fit(X_train_data, y)
    trained_models[model_name] = final_model

print("\n" + "="*80)
print("‚úì TRAINING COMPLETE")
print("="*80)


üöÄ TRAINING MODELS WITH CROSS-VALIDATION

Linear Regression...
  ‚úì MAE: 0.7998 (¬±0.0207)
  ‚úì RMSE: 1.0110
  ‚úì R¬≤: 0.4016

Ridge Regression...
  ‚úì MAE: 0.7995 (¬±0.0203)
  ‚úì RMSE: 1.0109
  ‚úì R¬≤: 0.4017

Lasso Regression...
  ‚úì MAE: 1.0600 (¬±0.0144)
  ‚úì RMSE: 1.3085
  ‚úì R¬≤: -0.0015

Random Forest...
  ‚úì MAE: 0.8252 (¬±0.0335)
  ‚úì RMSE: 1.0503
  ‚úì R¬≤: 0.3534
  ‚úì MAE: 0.8252 (¬±0.0335)
  ‚úì RMSE: 1.0503
  ‚úì R¬≤: 0.3534

Gradient Boosting...

Gradient Boosting...
  ‚úì MAE: 0.8222 (¬±0.0521)
  ‚úì RMSE: 1.0496
  ‚úì R¬≤: 0.3541
  ‚úì MAE: 0.8222 (¬±0.0521)
  ‚úì RMSE: 1.0496
  ‚úì R¬≤: 0.3541

XGBoost...

XGBoost...
  ‚úì MAE: 0.8894 (¬±0.0174)
  ‚úì RMSE: 1.1382
  ‚úì R¬≤: 0.2418
  ‚úì MAE: 0.8894 (¬±0.0174)
  ‚úì RMSE: 1.1382
  ‚úì R¬≤: 0.2418

LightGBM...

LightGBM...


LightGBMError: Do not support special JSON characters in feature name.

## 7. Model Comparison & Ranking

In [None]:
# Create results dataframe
results_df = pd.DataFrame(results).sort_values('MAE')

print("\n" + "="*80)
print("üìä MODEL PERFORMANCE COMPARISON")
print("="*80)
print("\n" + results_df.to_string(index=False))

# Best model
best_model_name = results_df.iloc[0]['Model']
best_mae = results_df.iloc[0]['MAE']

print(f"\nüèÜ BEST MODEL: {best_model_name}")
print(f"   MAE: {best_mae:.4f}")
print(f"   RMSE: {results_df.iloc[0]['RMSE']:.4f}")
print(f"   R¬≤: {results_df.iloc[0]['R¬≤']:.4f}")

## 8. Feature Importance Analysis

In [None]:
# Extract feature importance from tree-based models
importance_data = {}

for model_name, model in trained_models.items():
    if hasattr(model, 'feature_importances_'):
        importance_data[model_name] = pd.DataFrame({
            'feature': feature_cols,
            'importance': model.feature_importances_
        }).sort_values('importance', ascending=False)

if importance_data:
    print("\n" + "="*80)
    print("üéØ TOP 15 MOST IMPORTANT FEATURES")
    print("="*80)
    
    # Use best available model with feature importance
    for model_name in [best_model_name, 'Random Forest', 'XGBoost', 'LightGBM', 'Gradient Boosting']:
        if model_name in importance_data:
            print(f"\n{model_name}:")
            top_features = importance_data[model_name].head(15)
            for i, row in top_features.iterrows():
                print(f"  {row['feature']:40s} : {row['importance']:8.4f}")
            break
else:
    print("\n‚ÑπÔ∏è  Feature importance not available for selected model type")

## 9. Model Export & Summary

In [None]:
import json
from datetime import datetime

# Save results
results_df.to_csv('data/engineered/model_comparison_results.csv', index=False)

# Save model summary
summary = {
    'timestamp': datetime.now().isoformat(),
    'best_model': best_model_name,
    'best_mae': float(best_mae),
    'best_rmse': float(results_df.iloc[0]['RMSE']),
    'best_r2': float(results_df.iloc[0]['R¬≤']),
    'n_features': len(feature_cols),
    'n_samples': len(X),
    'cv_strategy': 'Walk-Forward (5-fold temporal)',
    'all_results': results_df.to_dict('records')
}

with open('data/engineered/model_training_summary_v1.json', 'w') as f:
    json.dump(summary, f, indent=2)

print("\n" + "="*80)
print("üíæ RESULTS SAVED")
print("="*80)
print(f"Model Comparison: data/engineered/model_comparison_results.csv")
print(f"Training Summary: data/engineered/model_training_summary_v1.json")

print("\n" + "="*80)
print("‚úÖ MODEL TRAINING PIPELINE COMPLETE")
print("="*80)
print(f"\nBest Model: {best_model_name}")
print(f"  MAE: {best_mae:.4f} goals per match")
print(f"  RMSE: {results_df.iloc[0]['RMSE']:.4f}")
print(f"  R¬≤: {results_df.iloc[0]['R¬≤']:.4f}")
print(f"\nDataset: {len(X)} matches, {len(feature_cols)} features")
print(f"CV Strategy: Walk-Forward 5-fold temporal")