# NBA Game Prediction - Advanced Models

**Purpose:** Train advanced ML models for improved prediction accuracy

**Models:** XGBoost, LightGBM

**Goal:** Achieve accuracy > 65% and AUC > 0.70

---

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Advanced models
import xgboost as xgb
import lightgbm as lgb

# Scikit-learn
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, confusion_matrix,
    classification_report
)
from sklearn.model_selection import GridSearchCV

import warnings
# Notebook-wide display configuration.
# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation notices raised by the ML libraries —
# consider narrowing the filter once the notebook is stable.
warnings.filterwarnings(action='ignore')

# Plot defaults: light grid background and a wide default canvas.
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("✓ Libraries imported")

## 1. Load Data

In [None]:
# S3 location of the pre-built feature tables.
S3_BUCKET = 'nba-sim-raw-data-lake'
S3_PREFIX = 'ml-features'

print("Loading data...")
train_df, test_df = (
    pd.read_parquet(f's3://{S3_BUCKET}/{S3_PREFIX}/train.parquet'),
    pd.read_parquet(f's3://{S3_BUCKET}/{S3_PREFIX}/test.parquet'),
)

# Every column that is neither an identifier nor the label is a model input.
# The feature list is derived from the training frame only, so the test frame
# must carry the same columns for the selection below to succeed.
id_cols = ['game_id', 'game_date', 'season', 'home_team_id', 'away_team_id']
target_col = 'home_win'
excluded_cols = set(id_cols) | {target_col}
feature_cols = [col for col in train_df.columns if col not in excluded_cols]

# Split each frame into its feature matrix and label vector.
X_train, y_train = train_df[feature_cols], train_df[target_col]
X_test, y_test = test_df[feature_cols], test_df[target_col]

print(f"✓ Data loaded: {X_train.shape[0]:,} train, {X_test.shape[0]:,} test")

## 2. XGBoost Model

In [None]:
print("Training XGBoost...\n")

# Hyperparameters gathered in one mapping so they are easy to scan and tweak.
xgb_params = dict(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    eval_metric='logloss',
)
xgb_model = xgb.XGBClassifier(**xgb_params)

# The test split is supplied purely so the eval metric is reported while
# boosting (every 50 rounds); no early stopping is configured here, so it
# does not influence the fitted trees.
xgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=50)

print("\n✓ XGBoost trained")

In [None]:
# Hard class labels for both splits, then the class-1 probability
# (column 1 of predict_proba) that the AUC computation needs.
y_train_pred_xgb, y_test_pred_xgb = (
    xgb_model.predict(features) for features in (X_train, X_test)
)
y_test_proba_xgb = xgb_model.predict_proba(X_test)[:, 1]

# Accuracy on both splits (a large train/test gap hints at overfitting)
# and ROC AUC on the test split.
train_acc_xgb = accuracy_score(y_train, y_train_pred_xgb)
test_acc_xgb = accuracy_score(y_test, y_test_pred_xgb)
test_auc_xgb = roc_auc_score(y_test, y_test_proba_xgb)

# Report
rule = "=" * 70
print(rule, "XGBOOST PERFORMANCE", rule, sep="\n")
print(f"Train Accuracy: {train_acc_xgb:.4f}")
print(f"Test Accuracy:  {test_acc_xgb:.4f}")
print(f"Test AUC:       {test_auc_xgb:.4f}")
print("\nClassification Report:")
xgb_report = classification_report(
    y_test,
    y_test_pred_xgb,
    target_names=['Away Win', 'Home Win'],
)
print(xgb_report)

## 3. LightGBM Model

In [None]:
print("Training LightGBM...\n")

# Same overall budget as the XGBoost model (200 trees, depth 6, lr 0.1) so the
# two are comparable; num_leaves caps the leaf count of each tree.
lgb_model = lgb.LGBMClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    num_leaves=31,
    subsample=0.8,
    # LightGBM only performs row subsampling when the bagging frequency is
    # non-zero. The estimator default is 0 (bagging disabled), which would
    # silently ignore subsample=0.8 above; re-sample on every iteration.
    subsample_freq=1,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    verbose=-1
)

# The test split is passed only so the evaluation metric is logged every
# 50 iterations; no early-stopping callback is registered, so it does not
# affect the fitted model.
lgb_model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    callbacks=[lgb.log_evaluation(50)]
)

print("\n✓ LightGBM trained")

In [None]:
# Hard class labels for both splits, then the class-1 probability
# (column 1 of predict_proba) that the AUC computation needs.
y_train_pred_lgb, y_test_pred_lgb = (
    lgb_model.predict(features) for features in (X_train, X_test)
)
y_test_proba_lgb = lgb_model.predict_proba(X_test)[:, 1]

# Accuracy on both splits (a large train/test gap hints at overfitting)
# and ROC AUC on the test split.
train_acc_lgb = accuracy_score(y_train, y_train_pred_lgb)
test_acc_lgb = accuracy_score(y_test, y_test_pred_lgb)
test_auc_lgb = roc_auc_score(y_test, y_test_proba_lgb)

# Report
rule = "=" * 70
print(rule, "LIGHTGBM PERFORMANCE", rule, sep="\n")
print(f"Train Accuracy: {train_acc_lgb:.4f}")
print(f"Test Accuracy:  {test_acc_lgb:.4f}")
print(f"Test AUC:       {test_auc_lgb:.4f}")
print("\nClassification Report:")
lgb_report = classification_report(
    y_test,
    y_test_pred_lgb,
    target_names=['Away Win', 'Home Win'],
)
print(lgb_report)

## 4. Model Comparison

In [None]:
# One row per model, one column per metric.
comparison = pd.DataFrame(
    [
        ('XGBoost', train_acc_xgb, test_acc_xgb, test_auc_xgb),
        ('LightGBM', train_acc_lgb, test_acc_lgb, test_auc_lgb),
    ],
    columns=['Model', 'Train Accuracy', 'Test Accuracy', 'Test AUC'],
)

rule = "=" * 70
print(rule, "ADVANCED MODEL COMPARISON", rule, sep="\n")
print(comparison.to_string(index=False))

# The winner is the row with the highest test AUC. Note that `best_model`
# holds the model's display name (a string), not the fitted estimator.
# NOTE(review): selecting on the test split makes the winning score slightly
# optimistic; a separate validation split would avoid that.
best_idx = comparison['Test AUC'].idxmax()
best_model = comparison.at[best_idx, 'Model']
best_auc = comparison.at[best_idx, 'Test AUC']

print(f"\n✓ Best Model: {best_model} (AUC: {best_auc:.4f})")

## 5. Feature Importance

In [None]:
# Rank every feature by each model's built-in importance score.
# `feature_cols` is the column order both models were trained on, so it lines
# up positionally with `feature_importances_`.
# NOTE(review): the two libraries' default importance measures differ
# (XGBoost: gain-based, LightGBM: split counts — confirm for the installed
# versions), so compare the rankings across panels, not the magnitudes.

# XGBoost feature importance
xgb_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': xgb_model.feature_importances_
}).sort_values('importance', ascending=False)

# LightGBM feature importance
lgb_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': lgb_model.feature_importances_
}).sort_values('importance', ascending=False)

# Plot the top features of each model side by side.
top_n = 10  # maximum number of features shown per panel
panels = [('XGBoost', xgb_importance), ('LightGBM', lgb_importance)]
fig, axes = plt.subplots(1, len(panels), figsize=(16, 6))

for ax, (model_name, ranking) in zip(axes, panels):
    top = ranking.head(top_n)
    # Size the bars from the data actually available so the cell still works
    # when the feature set has fewer than `top_n` columns.
    positions = range(len(top))
    ax.barh(positions, top['importance'])
    ax.set_yticks(positions)
    ax.set_yticklabels(top['feature'])
    # barh draws position 0 at the bottom; flip the axis so the most
    # important feature is the first bar from the top.
    ax.invert_yaxis()
    ax.set_xlabel('Importance')
    ax.set_title(f'{model_name} - Top {len(top)} Features')

plt.tight_layout()
plt.show()

## 6. Save Models

In [None]:
# NOTE(review): these imports would normally live in the notebook's top
# import cell so all dependencies are visible in one place.
import pickle
import boto3
from io import BytesIO

s3 = boto3.client('s3')
model_prefix = 'ml-models/advanced'

print("Saving models to S3...")

# (fitted estimator, object name under the prefix, confirmation message)
# NOTE(review): pickles are generally tied to the library versions that wrote
# them; consider the libraries' native save formats for long-lived artifacts.
artifacts = [
    (xgb_model, 'xgboost.pkl', "  ✓ XGBoost saved"),
    (lgb_model, 'lightgbm.pkl', "  ✓ LightGBM saved"),
]

for estimator, filename, done_message in artifacts:
    # Serialise in memory, then upload the raw bytes in a single request.
    with BytesIO() as buffer:
        pickle.dump(estimator, buffer)
        payload = buffer.getvalue()
    s3.put_object(Bucket=S3_BUCKET, Key=f'{model_prefix}/{filename}', Body=payload)
    print(done_message)

print(f"\n✓ Models saved to s3://{S3_BUCKET}/{model_prefix}/")

## 7. Summary

In [None]:
# Targets stated in the notebook header: accuracy > 65% and AUC > 0.70.
ACCURACY_GOAL = 0.65
AUC_GOAL = 0.70

print("\n" + "=" * 70)
print("ADVANCED MODELS SUMMARY")
print("=" * 70)

# These descriptions are hard-coded; keep them in sync with the
# hyperparameters used in the training cells.
print("\n📊 Models Trained:")
print("  1. XGBoost (200 estimators, max_depth=6)")
print("  2. LightGBM (200 estimators, 31 leaves)")

print("\n📈 Performance:")
print(f"  XGBoost:  {test_acc_xgb:.1%} accuracy, {test_auc_xgb:.3f} AUC")
print(f"  LightGBM: {test_acc_lgb:.1%} accuracy, {test_auc_lgb:.3f} AUC")

print(f"\n🏆 Best Model: {best_model}")

# The goal has two parts, so a model only counts if it clears BOTH thresholds;
# checking accuracy alone could report success while AUC is still short.
clears_both = (
    (comparison['Test Accuracy'] > ACCURACY_GOAL)
    & (comparison['Test AUC'] > AUC_GOAL)
)
goal_met = bool(clears_both.any())
print(
    f"\n{'✓' if goal_met else '⚠️ '} Goal {'achieved' if goal_met else 'not met'}: "
    f"Accuracy > {ACCURACY_GOAL:.0%} and AUC > {AUC_GOAL:.2f}"
)

print("\n🎯 Next Steps:")
print("  1. Hyperparameter tuning for optimal performance")
print("  2. Ensemble methods (combine multiple models)")
print("  3. Review 05_model_evaluation.ipynb for comprehensive comparison")
print("=" * 70)