# PS4: Total Points Prediction (Corrected)

**Objective:** Predict the total points a team will accumulate in a season.

**Methodology Correction:**
The original notebook design was flawed, likely using a random split that leads to data leakage. This corrected version implements a strict temporal validation strategy to ensure the model provides realistic and trustworthy predictions.

1.  **Data Source:** We will use `data_final_points_tally.csv`, which contains aggregated team statistics per season.
2.  **Temporal Splitting:** The data will be sorted by season. The final season in the dataset will be held out as the test set, while all prior seasons will be used for training. This mimics a real-world scenario of predicting a future season's outcome based on historical data.
3.  **Cross-Validation:** `TimeSeriesSplit` will be used during hyperparameter tuning to ensure validation folds respect the chronological order of the data.
4.  **Evaluation:** The model's performance will be judged on the unseen test set (the final season) using standard regression metrics (R-squared, MAE, RMSE).
5.  **Model Saving:** The best-performing model and its corresponding metadata will be saved for future use.

## 1. Setup and Data Loading

In [None]:
# --- Corrected Data Loading and Setup ---
# Objective: Load the dataset, define features and target, and prepare for temporal validation.
# Methodology: We will use 'data_final_points_tally.csv', which is pre-processed and contains season-level data.
# The data will be sorted by season to ensure chronological order before splitting.

import pandas as pd
import numpy as np
import joblib
import json
import os
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Optional gradient-boosting backends.
# Each flag records whether the corresponding package could be imported, so
# later cells only register the XGBoost / LightGBM models when they are usable.
try:
    import xgboost as xgb
except ImportError:
    XGB_AVAILABLE = False
else:
    XGB_AVAILABLE = True

try:
    import lightgbm as lgb
except ImportError:
    LGB_AVAILABLE = False
else:
    LGB_AVAILABLE = True

# Input data location and the names of the artifacts this notebook writes
DATA_FILE = '../data/final/data_final_points_tally.csv'
MODEL_DIR = 'models'
MODEL_NAME = 'ps4_total_points_best_model.joblib'
METADATA_NAME = 'ps4_total_points_metadata.json'

# The output directory must exist before anything is saved into it
os.makedirs(MODEL_DIR, exist_ok=True)

# Read the season-level table and put its rows in chronological order,
# renumbering the index 0..n-1 after the sort
df = pd.read_csv(DATA_FILE).sort_values(by='season_encoded', ignore_index=True)

# --- Feature and Target Definition ---
# Target: 'target_total_points' - the total points a team achieved in a season.
# Features: every remaining column except the ones listed below.
#   * 'team'                - identifier, not a predictor
#   * 'target_total_points' - the target itself
#   * 'points_per_game'     - direct derivative of the target (leakage)
non_feature_columns = ['team', 'target_total_points', 'points_per_game']

y = df['target_total_points']
X = df.drop(columns=non_feature_columns)

first_season = df['season_encoded'].min()
last_season = df['season_encoded'].max()

print("Data loaded and sorted successfully.")
print(f"Features (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")
print(f"Data spans from season {first_season} to {last_season}")
print(f"XGBoost available: {XGB_AVAILABLE}")
print(f"LightGBM available: {LGB_AVAILABLE}")

## 2. Temporal Train/Validation/Test Split

In [None]:
# PROPER TEMPORAL TRAIN/TEST SPLIT (NO RANDOM SPLITS)
#
# Splits the season-level table by season so that every validation/test row
# comes from a later season than every training row:
#   * first 60% of seasons -> training
#   * next 20% of seasons  -> validation
#   * last 20% of seasons  -> test
# With fewer than 3 distinct seasons a chronological three-way split is not
# possible, so the cell falls back to holding out whole teams instead.
#
# Defines: valid_features, X_train, y_train, X_val, y_val, X_test, y_test.
print("\n" + "="*60)
print("IMPLEMENTING PROPER TEMPORAL VALIDATION")
print("="*60)

# Feature columns are exactly the columns of X, i.e. everything except the team
# identifier, the target and the leaky 'points_per_game' column. Using X as the
# single source of truth keeps this split consistent with the setup cell.
valid_features = X.columns.tolist()

# Sort by season to ensure temporal ordering
df_sorted = df.sort_values('season_encoded').reset_index(drop=True)

# Use temporal split: earlier seasons for training, later for testing
seasons = sorted(df_sorted['season_encoded'].unique())
print(f"Available seasons: {seasons}")

if len(seasons) >= 3:
    # Season boundaries are truncated towards zero; for any count >= 3 this
    # leaves at least one season in each of the three partitions.
    n_seasons = len(seasons)
    train_end = int(0.6 * n_seasons)
    val_end = int(0.8 * n_seasons)

    train_seasons = seasons[:train_end]
    val_seasons = seasons[train_end:val_end]
    test_seasons = seasons[val_end:]

    print(f"Training seasons: {train_seasons}")
    print(f"Validation seasons: {val_seasons}")
    print(f"Test seasons: {test_seasons}")

    # Row masks for each partition
    train_mask = df_sorted['season_encoded'].isin(train_seasons)
    val_mask = df_sorted['season_encoded'].isin(val_seasons)
    test_mask = df_sorted['season_encoded'].isin(test_seasons)

    X_train = df_sorted.loc[train_mask, valid_features]
    y_train = df_sorted.loc[train_mask, 'target_total_points']

    X_val = df_sorted.loc[val_mask, valid_features]
    y_val = df_sorted.loc[val_mask, 'target_total_points']

    X_test = df_sorted.loc[test_mask, valid_features]
    y_test = df_sorted.loc[test_mask, 'target_total_points']

    print(f"\nTrain set: {X_train.shape[0]} samples")
    print(f"Validation set: {X_val.shape[0]} samples")
    print(f"Test set: {X_test.shape[0]} samples")

else:
    # Fallback: no chronological split is possible, so hold out whole teams.
    # This is a grouped split (all rows of a team land on the same side), not
    # a stratified one.
    print("WARNING: Not enough seasons for proper temporal validation")
    print("Using grouped hold-out split by team instead")

    # NOTE(review): assumes the dataset has a 'team_encoded' column - confirm.
    unique_teams = df['team_encoded'].unique()
    np.random.seed(42)
    train_teams = np.random.choice(unique_teams, size=int(0.7 * len(unique_teams)), replace=False)
    train_team_set = set(train_teams)  # O(1) membership instead of scanning the array
    test_teams = [team for team in unique_teams if team not in train_team_set]

    train_mask = df['team_encoded'].isin(train_teams)
    test_mask = df['team_encoded'].isin(test_teams)

    X_train = df.loc[train_mask, valid_features]
    y_train = df.loc[train_mask, 'target_total_points']
    X_test = df.loc[test_mask, valid_features]
    y_test = df.loc[test_mask, 'target_total_points']

    # No separate validation data exists in this branch: the validation names
    # alias the test set, so scores computed on X_val are NOT independent of
    # the test score.
    X_val = X_test
    y_val = y_test

    print(f"Train set: {X_train.shape[0]} samples ({len(train_teams)} teams)")
    print(f"Test set: {X_test.shape[0]} samples ({len(test_teams)} teams)")

## 3. Train Models

In [None]:
# ============================================================================
# TRAIN/TEST SPLIT (TEMPORAL HOLD-OUT)
# ============================================================================
# The most recent season is held out as the test set and every earlier season
# is used for training. A random row-level split would place rows from the
# same season on both sides and let the models learn from the very season they
# are evaluated on, so it is deliberately not used here.

test_season = df['season_encoded'].max()
is_test_row = df['season_encoded'] == test_season  # aligned with X / y by index

X_train, X_test = X.loc[~is_test_row], X.loc[is_test_row]
y_train, y_test = y.loc[~is_test_row], y.loc[is_test_row]

if X_train.empty:
    # Only one season in the data: nothing earlier to train on.
    raise ValueError(
        "Temporal hold-out needs at least two seasons; "
        f"only season {test_season} is present."
    )

print(f"[SPLIT] Held-out test season: {test_season}")
print(f"[SPLIT] Train: {len(X_train)}, Test: {len(X_test)}")
print(f"[SPLIT] Train target - Mean: {y_train.mean():.2f}, Std: {y_train.std():.2f}")
print(f"[SPLIT] Test target  - Mean: {y_test.mean():.2f}, Std: {y_test.std():.2f}")

# ============================================================================
# HYPERPARAMETER SEARCH SPACE - Total Points Regression (Small Dataset)
# ============================================================================
# Keys carry the 'model__' prefix because each estimator is tuned as the
# 'model' step of a pipeline.

def _boosted_tree_space():
    """Return a fresh copy of the search space used by GradientBoosting and XGBoost."""
    return {
        'model__n_estimators': [50, 100, 150],
        'model__learning_rate': [0.01, 0.05, 0.1],
        'model__max_depth': [3, 5, 7],
        'model__subsample': [0.8, 0.9],
    }


param_grids = {}

# Linear models: only the regularisation strength is tuned
param_grids['Ridge'] = {'model__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}
param_grids['Lasso'] = {'model__alpha': [0.001, 0.01, 0.1, 1, 10]}

# Tree ensembles
param_grids['RandomForest'] = {
    'model__n_estimators': [50, 100, 150],
    'model__max_depth': [5, 10, 15, None],
    'model__min_samples_split': [2, 5],
    'model__min_samples_leaf': [1, 2],
}
param_grids['GradientBoosting'] = _boosted_tree_space()

# Optional backends are only registered when their package imported cleanly
if XGB_AVAILABLE:
    param_grids['XGBoost'] = _boosted_tree_space()

if LGB_AVAILABLE:
    param_grids['LightGBM'] = {
        'model__n_estimators': [50, 100, 150],
        'model__learning_rate': [0.01, 0.05, 0.1],
        'model__num_leaves': [20, 30, 40],
    }

# ============================================================================
# TRAINING WITH HYPERPARAMETER TUNING
# ============================================================================

def create_pipeline(model, use_scaling=True):
    """Wrap ``model`` in a preprocessing pipeline.

    Steps, in order: mean imputation ('imputer'), optional standardisation
    ('scaler', included only when ``use_scaling`` is truthy), then the
    estimator itself under the step name 'model'.

    Args:
        model: The estimator to place at the end of the pipeline.
        use_scaling: Whether to insert a ``StandardScaler`` before the model.

    Returns:
        An unfitted ``Pipeline``.
    """
    scaling_steps = [('scaler', StandardScaler())] if use_scaling else []
    return Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        *scaling_steps,
        ('model', model),
    ])

print("\n" + "="*80)
print("HYPERPARAMETER TUNING - RandomizedSearchCV (Regression)")
print("="*80)

# Tunes every candidate model with RandomizedSearchCV on the training data,
# then scores the refitted best pipeline of each model on the test set.
#
# Populates:
#   results[name]        -> best params, CV MAE and test MAE / RMSE / R²
#   best_models[name]    -> best fitted pipeline for that model
#   trained_models[name] -> same object as best_models[name]
results = {}
trained_models = {}
best_models = {}

# TimeSeriesSplit always validates on rows that come AFTER the rows it trains
# on, so it only respects chronology if the rows themselves are in season
# order. Enforce that here (stable sort keeps the existing order within a
# season) instead of assuming the upstream split preserved it.
season_of_train_row = df.loc[X_train.index, 'season_encoded']
chronological_index = season_of_train_row.sort_values(kind='stable').index
X_fit = X_train.loc[chronological_index]
y_fit = y_train.loc[chronological_index]

# Forward-chaining CV: a shuffled K-fold would validate on seasons that are
# older than some of the training rows, which is exactly the leakage this
# notebook sets out to avoid.
cv = TimeSeriesSplit(n_splits=5)

models_to_train = {
    'Ridge': Ridge(random_state=42),
    'Lasso': Lasso(random_state=42, max_iter=10000),
    'RandomForest': RandomForestRegressor(random_state=42, n_jobs=-1),
    'GradientBoosting': GradientBoostingRegressor(random_state=42)
}

if XGB_AVAILABLE:
    models_to_train['XGBoost'] = xgb.XGBRegressor(random_state=42, n_jobs=-1)

if LGB_AVAILABLE:
    models_to_train['LightGBM'] = lgb.LGBMRegressor(random_state=42, n_jobs=-1, verbose=-1)

for model_name, model in models_to_train.items():
    print(f"\n{'-'*80}")
    print(f"[{model_name}] Hyperparameter Tuning...")
    print(f"{'-'*80}")

    pipeline = create_pipeline(model)
    param_grid = param_grids[model_name]

    # Never ask for more candidates than the grid contains (Ridge and Lasso
    # have fewer than 20 combinations); this avoids the sklearn warning about
    # n_iter exceeding the size of the search space.
    grid_size = int(np.prod([len(values) for values in param_grid.values()]))

    search = RandomizedSearchCV(
        pipeline,
        param_grid,
        n_iter=min(20, grid_size),
        cv=cv,
        scoring='neg_mean_absolute_error',
        n_jobs=-1,
        random_state=42,
        verbose=1
    )

    search.fit(X_fit, y_fit)

    best_pipeline = search.best_estimator_
    best_models[model_name] = best_pipeline
    cv_mae = -search.best_score_  # scorer is negated MAE, flip the sign back
    print(f"\n[BEST] Params: {search.best_params_}")
    print(f"[CV] MAE: {cv_mae:.4f}")

    # Test evaluation
    y_pred = best_pipeline.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    results[model_name] = {
        'best_params': search.best_params_,
        'cv_mae': float(cv_mae),
        'test_mae': float(mae),
        'test_rmse': float(rmse),
        'test_r2': float(r2)
    }

    print(f"\n[TEST] MAE:  {mae:.4f}")
    print(f"[TEST] RMSE: {rmse:.4f}")
    print(f"[TEST] R²:   {r2:.4f}")

    # Residuals analysis
    residuals = y_test - y_pred
    print(f"[TEST] Residuals - Mean: {residuals.mean():.4f}, Std: {residuals.std():.4f}")

    trained_models[model_name] = best_pipeline

print("\n" + "="*80)
print("TRAINING COMPLETE")
print("="*80)

## 4. Save Models

In [None]:
# ============================================================================
# IDENTIFY BEST MODEL
# ============================================================================
# The winner is chosen by cross-validated MAE, NOT by test MAE. Picking the
# model with the lowest test error and then reporting that same test error
# would make the reported number optimistically biased; the test set is only
# used to report how the already-chosen model performs.

best_model_name = min(results, key=lambda name: results[name]['cv_mae'])
best_metrics = results[best_model_name]

print(f"\n[WINNER] Best Model: {best_model_name}")
print(f"[WINNER] CV MAE: {best_metrics['cv_mae']:.4f}")
print(f"[WINNER] Test MAE: {best_metrics['test_mae']:.4f}")
print(f"[WINNER] Test RMSE: {best_metrics['test_rmse']:.4f}")
print(f"[WINNER] Test R²: {best_metrics['test_r2']:.4f}")

# ============================================================================
# SAVE ONLY THE BEST MODEL
# ============================================================================

print(f"\n{'='*80}")
print("SAVING BEST MODEL")
print(f"{'='*80}")

# Persist the winning fitted pipeline under the directory and file name
# configured in the setup cell (MODEL_DIR / MODEL_NAME).
best_model_path = os.path.join(MODEL_DIR, MODEL_NAME)
joblib.dump(trained_models[best_model_name], best_model_path)
print(f"[SAVE] Best Model ({best_model_name}) -> {best_model_path}")

# ============================================================================
# SAVE TRAINING SUMMARY
# ============================================================================
# Writes a JSON file next to the saved model describing which model won, how
# it scored, what data it was trained on and how it was tuned.

from datetime import datetime

# Feature names as seen by the models during fitting
feature_names = list(X_train.columns)

summary = {
    'problem_statement': 'PS4: Total Points Tally Prediction',
    'task_type': 'Regression',
    'best_model': best_model_name,
    'best_cv_mae': best_metrics['cv_mae'],
    'best_test_mae': best_metrics['test_mae'],
    'best_test_rmse': best_metrics['test_rmse'],
    'best_test_r2': best_metrics['test_r2'],
    'timestamp': datetime.now().isoformat(),
    'data': {
        'path': DATA_FILE,
        'shape': list(df.shape),
        'train_size': len(X_train),
        'test_size': len(X_test),
        'n_features': len(feature_names)
    },
    'features': feature_names,
    # Describe the splitter that was actually used rather than hard-coding it
    'cv_strategy': f"{type(cv).__name__} (n_splits={cv.get_n_splits()})",
    'tuning_method': 'RandomizedSearchCV (20 iterations)',
    'scoring_metric': 'neg_mean_absolute_error',
    'best_params': best_metrics['best_params']
}

summary_path = os.path.join(MODEL_DIR, METADATA_NAME)
with open(summary_path, 'w', encoding='utf-8') as f:
    # default=str is a fallback for any value json cannot encode natively
    json.dump(summary, f, indent=2, default=str)

print(f"[SAVE] Metadata -> {summary_path}")

print("\n" + "="*80)
print("✅ PS4: TOTAL POINTS PREDICTION - COMPLETE!")
print("="*80)