# Problem Statement 3: Top Scorer Prediction
## Predict Player Goals Scored

**Author:** ScoreSight ML Team  
**Date:** 2025-11-12  
**Problem Type:** Regression (Goals Prediction)

### Dataset
- **File:** `data/engineered/data_engineered_top_scorer.csv`
- **Task:** Predict total goals scored by player
- **Features:** Player statistics (44+ engineered features)
- **Target:** goals

## 1. Setup

In [None]:
import json
import os
import warnings
from datetime import datetime
from pathlib import Path

warnings.filterwarnings('ignore')

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold, RandomizedSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Optional gradient-boosting backends. The flag is only True when BOTH
# packages import successfully; otherwise neither boosted model is used.
try:
    import xgboost as xgb
    import lightgbm as lgb
    BOTH_AVAILABLE = True
except Exception as exc:
    # Deliberately best-effort, but not a bare `except:` -- KeyboardInterrupt
    # and SystemExit must still propagate. Report the reason so a broken
    # install is not mistaken for "package not installed".
    BOTH_AVAILABLE = False
    print(f"[WARN] XGBoost/LightGBM unavailable ({type(exc).__name__}: {exc}); "
          "boosted models will be skipped")

# Project root. Defaults to the Colab Google Drive location this notebook was
# written for; set the SCORESIGHT_DIR environment variable to run elsewhere.
SCORESIGHT_DIR = os.environ.get('SCORESIGHT_DIR', '/content/drive/MyDrive/ScoreSight')
DATA_DIR = os.path.join(SCORESIGHT_DIR, 'data')
MODELS_DIR = os.path.join(SCORESIGHT_DIR, 'models')

# Create the output directory (and any missing parents) up front.
Path(MODELS_DIR).mkdir(exist_ok=True, parents=True)

print("[OK] Libraries imported")

## 2. Load Data

In [None]:
data_path = os.path.join(DATA_DIR, 'engineered', 'data_engineered_top_scorer.csv')
print(f"Loading from: {data_path}")

df = pd.read_csv(data_path)
# Normalise headers so the column lookups below ignore case and stray spaces.
df.columns = df.columns.str.lower().str.strip()

print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

# Regression target; fail early with a clear message if the file lacks it.
target = 'goals'
if target not in df.columns:
    raise KeyError(f"Target column '{target}' not found in {data_path}")

# Rows without a target value cannot be used for supervised training.
missing_target = df[target].isna()
if missing_target.any():
    print(f"[WARN] Dropping {int(missing_target.sum())} rows with missing '{target}'")
    df = df.loc[~missing_target].reset_index(drop=True)

# Features = every numeric column except the target and the listed columns.
# select_dtypes('number') accepts all integer/float widths, not only 64-bit.
exclude_cols = [target, 'unnamed: 0', 'player', 'team', 'nation']
feature_cols = [col for col in df.select_dtypes(include='number').columns
                if col not in exclude_cols]

# Missing feature values are intentionally left as NaN here. Filling them with
# full-dataset means before cross-validation would leak held-out statistics
# into the training folds; the SimpleImputer step in the model pipeline fills
# them using only the rows it is fitted on.
X = df[feature_cols]
y = df[target].values

print(f"Features: {len(feature_cols)}, Samples: {len(X)}")
print(f"Target mean: {y.mean():.2f}, std: {y.std():.2f}")

## 3. Train Models

In [None]:
def create_pipeline(model):
    """Return a Pipeline that preprocesses features and then applies *model*.

    Steps, in order: ``'imputer'`` (a default-configured ``SimpleImputer``),
    ``'scaler'`` (a ``StandardScaler``) and ``'model'`` (the estimator passed
    in, used as-is).
    """
    steps = [
        ('imputer', SimpleImputer()),
        ('scaler', StandardScaler()),
        ('model', model),
    ]
    return Pipeline(steps)

# Candidate regressors keyed by display name. Insertion order is the order in
# which they are trained and reported.
models = {}
models['Ridge'] = Ridge()
models['Lasso'] = Lasso()
models['RandomForest'] = RandomForestRegressor(random_state=42, n_jobs=-1)
models['GradientBoosting'] = GradientBoostingRegressor(random_state=42)

# The boosted models are added only when both optional packages imported.
if BOTH_AVAILABLE:
    models.update({
        'XGBoost': xgb.XGBRegressor(random_state=42, n_jobs=-1),
        'LightGBM': lgb.LGBMRegressor(random_state=42, n_jobs=-1, verbose=-1),
    })

# results: model name -> mean cross-validated MAE (lower is better).
# trained_models: model name -> pipeline fitted on the full dataset.
results = {}
trained_models = {}
cv = KFold(n_splits=5, shuffle=True, random_state=42)

for name, model in models.items():
    print(f"[{name}] Training...")
    pipeline = create_pipeline(model)

    # cross_val_score fits a fresh clone of the whole pipeline on each training
    # fold, replacing the hand-rolled `model.__class__(**model.get_params())`
    # copy. The scorer returns negated MAE, so flip the sign back.
    # error_score='raise' keeps a failing fold loud instead of recording NaN.
    fold_scores = cross_val_score(
        pipeline, X, y,
        cv=cv,
        scoring='neg_mean_absolute_error',
        error_score='raise',
    )
    results[name] = -fold_scores.mean()

    # Final model for persistence: fitted on all available data.
    pipeline.fit(X, y)
    trained_models[name] = pipeline
    print(f"  MAE: {results[name]:.4f}")

print("\n[OK] Training complete")

## 4. Save Models

In [None]:
# Make sure the output directory exists before writing anything into it.
models_dir = Path(MODELS_DIR)
models_dir.mkdir(parents=True, exist_ok=True)

# The winner is the entry with the smallest MAE (first one wins on ties).
best_name, best_mae = min(results.items(), key=lambda entry: entry[1])

# Persist every fitted pipeline, not just the best one.
for name, model in trained_models.items():
    slug = name.lower().replace(' ', '_')
    path = models_dir / f"top_scorer_{slug}_ps3.joblib"
    joblib.dump(model, path)
    print(f"[SAVE] {name} -> {path}")

# JSON-friendly run summary; scores are cast to built-in floats first.
per_model_scores = {
    model_name: {'mae': float(score)} for model_name, score in results.items()
}
summary = {
    'problem_statement': 'Top Scorer Prediction',
    'best_model': best_name,
    'best_mae': float(best_mae),
    'timestamp': datetime.now().isoformat(),
    'models': per_model_scores,
}

summary_path = models_dir / 'top_scorer_training_summary_ps3.json'
with summary_path.open('w') as f:
    f.write(json.dumps(summary, indent=2))

print(f"\n[COMPLETE] Best: {best_name} (MAE: {best_mae:.4f})")