# NBA Game Prediction - Baseline Model

**Purpose:** Build baseline prediction models for NBA game outcomes

**Data Source:** S3 (features from 02_feature_engineering.ipynb)

**Output:** Trained baseline models and performance metrics

---

## Overview

This notebook trains two simple baseline models and evaluates them:
1. **Logistic Regression** (linear baseline)
2. **Random Forest** (tree-based baseline)
3. **Model evaluation** (accuracy, AUC, feature importance)

**Target:** `home_win`, i.e. whether the home team won (binary classification); the models output a home-win probability

**Goal:** Establish baseline performance (accuracy > 60%)

---

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Scikit-learn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, confusion_matrix,
    classification_report
)

import warnings
# Silence every warning category for the rest of the notebook session
warnings.filterwarnings('ignore')

# Plot defaults: seaborn grid style plus a default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Report the library versions in use
print("✓ Libraries imported")
for lib_label, lib in (("Pandas", pd), ("NumPy", np)):
    print(f"{lib_label} version: {lib.__version__}")

## 1. Load Features from S3

Load the engineered features created in the previous notebook.

In [None]:
# Location of the engineered feature tables in S3
S3_BUCKET = 'nba-sim-raw-data-lake'
S3_PREFIX = 'ml-features'

# Both splits live under the same bucket/prefix
features_root = f's3://{S3_BUCKET}/{S3_PREFIX}'
train_path = f'{features_root}/train.parquet'
test_path = f'{features_root}/test.parquet'

print("Loading training data...")
train_df = pd.read_parquet(train_path)

print("Loading test data...")
test_df = pd.read_parquet(test_path)

# Size summary; the column count covers every column of the training frame
n_train_rows, n_test_rows = len(train_df), len(test_df)
n_columns = len(train_df.columns)

print("\n✓ Data loaded")
print(f"  Train set: {n_train_rows:,} rows")
print(f"  Test set:  {n_test_rows:,} rows")
print(f"  Features:  {n_columns} columns")

In [None]:
# Quick look at the training frame: first rows, column dtypes, class balance
preview = train_df.head()
print("Train data sample:")
print(preview)

column_types = train_df.dtypes
print("\nData types:")
print(column_types)

# Relative frequency of each value of the home_win column
target_share = train_df['home_win'].value_counts(normalize=True)
print("\nTarget distribution (train):")
print(target_share)

## 2. Prepare Features

Select features for modeling and separate the feature matrix from the target for the pre-split train and test sets.

In [None]:
# Columns that are not model inputs: identifiers and the target itself
id_cols = ['game_id', 'game_date', 'season', 'home_team_id', 'away_team_id']
target_col = 'home_win'

# Everything else in the training frame is treated as a feature,
# keeping the original column order
non_feature_cols = set(id_cols) | {target_col}
feature_cols = [name for name in train_df.columns if name not in non_feature_cols]

# Numbered listing of the selected features
print(f"Feature columns ({len(feature_cols)}):")
for position, name in enumerate(feature_cols, start=1):
    print(f"  {position:2d}. {name}")

In [None]:
# Split each frame into a feature matrix (X) and a target vector (y).
# Copies are taken so later steps never touch train_df / test_df.
X_train, y_train = train_df[feature_cols].copy(), train_df[target_col].copy()
X_test, y_test = test_df[feature_cols].copy(), test_df[target_col].copy()

print("✓ Data prepared")
for line_prefix, part in (
    ("  X_train: ", X_train),
    ("  y_train: ", y_train),
    ("  X_test:  ", X_test),
    ("  y_test:  ", y_test),
):
    print(f"{line_prefix}{part.shape}")

In [None]:
# Check for missing values in BOTH splits.
# The per-column null counts are computed once per split and reused below.
train_missing = X_train.isnull().sum()
test_missing = X_test.isnull().sum()

print("\nMissing values:")
print("Train:", train_missing.sum())
print("Test:", test_missing.sum())

# Warn if either split has gaps: a test-only gap would otherwise be reported
# as "no missing values" and surface later as an error when scaling/predicting.
if train_missing.sum() > 0 or test_missing.sum() > 0:
    print("\n⚠️  Warning: Missing values detected")
    for split_name, per_column in (("Train", train_missing), ("Test", test_missing)):
        if per_column.sum() > 0:
            # List only the columns that actually contain nulls
            print(f"{split_name}:")
            print(per_column[per_column > 0])
else:
    print("✓ No missing values")

## 3. Feature Scaling

Standardize features for logistic regression.

In [None]:
# Standardize features: statistics are learned from the training split only
# and then applied unchanged to both splits.
scaler = StandardScaler()
scaler.fit(X_train)

# Wrap the scaled arrays back into DataFrames so column names and row
# indices stay aligned with the unscaled data
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train),
    columns=feature_cols,
    index=X_train.index,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=feature_cols,
    index=X_test.index,
)

# Sanity check: average of the per-column means and standard deviations
avg_column_mean = X_train_scaled.mean().mean()
avg_column_std = X_train_scaled.std().mean()

print("✓ Features scaled")
print(f"  Mean (train): {avg_column_mean:.6f}")
print(f"  Std (train):  {avg_column_std:.6f}")

## 4. Baseline Model #1: Logistic Regression

Simple linear model for binary classification.

In [None]:
# Fit the logistic-regression baseline on the standardized training features
print("Training Logistic Regression...")

lr_settings = {
    'max_iter': 1000,     # iteration cap passed to the solver
    'random_state': 42,   # fixed seed
    'n_jobs': -1,
}
lr_model = LogisticRegression(**lr_settings)
lr_model.fit(X_train_scaled, y_train)

print("✓ Model trained")

In [None]:
# Hard class predictions for both splits
y_train_pred_lr, y_test_pred_lr = (
    lr_model.predict(features) for features in (X_train_scaled, X_test_scaled)
)

# Predicted probabilities: second column of predict_proba
y_train_proba_lr, y_test_proba_lr = (
    lr_model.predict_proba(features)[:, 1]
    for features in (X_train_scaled, X_test_scaled)
)

print("✓ Predictions generated")

In [None]:
# Score the logistic-regression predictions on both splits
banner = "=" * 70
print(banner)
print("LOGISTIC REGRESSION PERFORMANCE")
print(banner)

# Train-set scores
train_acc_lr = accuracy_score(y_train, y_train_pred_lr)
train_auc_lr = roc_auc_score(y_train, y_train_proba_lr)

print("\nTrain Set:")
print(f"  Accuracy: {train_acc_lr:.4f}")
print(f"  AUC-ROC:  {train_auc_lr:.4f}")

# Test-set scores: label-based metrics use the hard predictions,
# AUC uses the predicted probabilities
test_acc_lr = accuracy_score(y_test, y_test_pred_lr)
test_precision_lr = precision_score(y_test, y_test_pred_lr)
test_recall_lr = recall_score(y_test, y_test_pred_lr)
test_f1_lr = f1_score(y_test, y_test_pred_lr)
test_auc_lr = roc_auc_score(y_test, y_test_proba_lr)

print("\nTest Set:")
for line_prefix, score in (
    ("  Accuracy:  ", test_acc_lr),
    ("  Precision: ", test_precision_lr),
    ("  Recall:    ", test_recall_lr),
    ("  F1 Score:  ", test_f1_lr),
    ("  AUC-ROC:   ", test_auc_lr),
):
    print(f"{line_prefix}{score:.4f}")

# Per-class breakdown on the test split
class_labels = ['Away Win', 'Home Win']
print("\nClassification Report (Test):")
print(classification_report(y_test, y_test_pred_lr, target_names=class_labels))

In [None]:
# Feature importance (coefficients), ranked by absolute magnitude so that
# strongly negative coefficients rank alongside strongly positive ones
feature_importance_lr = pd.DataFrame({
    'feature': feature_cols,
    'coefficient': lr_model.coef_[0]
}).sort_values('coefficient', key=abs, ascending=False)

# Show at most 10 features; with fewer than 10 available, a fixed range(10)
# would not match the number of bars and the plot call would fail.
top_n = min(10, len(feature_importance_lr))
top_features_lr = feature_importance_lr.head(top_n)

print(f"\nTop {top_n} Most Important Features (by coefficient magnitude):")
print(top_features_lr.to_string(index=False))

# Plot feature importance (signed coefficient values)
bar_positions = range(top_n)
plt.figure(figsize=(10, 6))
plt.barh(bar_positions, top_features_lr['coefficient'])
plt.yticks(bar_positions, top_features_lr['feature'])
plt.xlabel('Coefficient Value')
plt.title(f'Logistic Regression - Top {top_n} Features')
plt.tight_layout()
plt.show()

## 5. Baseline Model #2: Random Forest

Tree-based ensemble model (no scaling needed).

In [None]:
# Fit the random-forest baseline on the unscaled training features
print("Training Random Forest...")
print("This may take a few minutes...\n")

rf_settings = {
    'n_estimators': 100,
    'max_depth': 10,
    'min_samples_split': 50,
    'min_samples_leaf': 20,
    'random_state': 42,   # fixed seed
    'n_jobs': -1,
    'verbose': 1,
}
rf_model = RandomForestClassifier(**rf_settings)
rf_model.fit(X_train, y_train)

print("\n✓ Model trained")

In [None]:
# Hard class predictions for both splits (unscaled features)
y_train_pred_rf, y_test_pred_rf = (
    rf_model.predict(features) for features in (X_train, X_test)
)

# Predicted probabilities: second column of predict_proba
y_train_proba_rf, y_test_proba_rf = (
    rf_model.predict_proba(features)[:, 1] for features in (X_train, X_test)
)

print("✓ Predictions generated")

In [None]:
# Score the random-forest predictions on both splits
banner = "=" * 70
print(banner)
print("RANDOM FOREST PERFORMANCE")
print(banner)

# Train-set scores
train_acc_rf = accuracy_score(y_train, y_train_pred_rf)
train_auc_rf = roc_auc_score(y_train, y_train_proba_rf)

print("\nTrain Set:")
print(f"  Accuracy: {train_acc_rf:.4f}")
print(f"  AUC-ROC:  {train_auc_rf:.4f}")

# Test-set scores: label-based metrics use the hard predictions,
# AUC uses the predicted probabilities
test_acc_rf = accuracy_score(y_test, y_test_pred_rf)
test_precision_rf = precision_score(y_test, y_test_pred_rf)
test_recall_rf = recall_score(y_test, y_test_pred_rf)
test_f1_rf = f1_score(y_test, y_test_pred_rf)
test_auc_rf = roc_auc_score(y_test, y_test_proba_rf)

print("\nTest Set:")
for line_prefix, score in (
    ("  Accuracy:  ", test_acc_rf),
    ("  Precision: ", test_precision_rf),
    ("  Recall:    ", test_recall_rf),
    ("  F1 Score:  ", test_f1_rf),
    ("  AUC-ROC:   ", test_auc_rf),
):
    print(f"{line_prefix}{score:.4f}")

# Per-class breakdown on the test split
class_labels = ['Away Win', 'Home Win']
print("\nClassification Report (Test):")
print(classification_report(y_test, y_test_pred_rf, target_names=class_labels))

In [None]:
# Feature importance as reported by the fitted forest, highest first
feature_importance_rf = pd.DataFrame({
    'feature': feature_cols,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

# Show at most 10 features; with fewer than 10 available, a fixed range(10)
# would not match the number of bars and the plot call would fail.
top_n = min(10, len(feature_importance_rf))
top_features_rf = feature_importance_rf.head(top_n)

print(f"\nTop {top_n} Most Important Features:")
print(top_features_rf.to_string(index=False))

# Plot feature importance
bar_positions = range(top_n)
plt.figure(figsize=(10, 6))
plt.barh(bar_positions, top_features_rf['importance'])
plt.yticks(bar_positions, top_features_rf['feature'])
plt.xlabel('Feature Importance')
plt.title(f'Random Forest - Top {top_n} Features')
plt.tight_layout()
plt.show()

## 6. Model Comparison

Compare baseline models side-by-side.

In [None]:
# One row per model (row 0 = logistic regression, row 1 = random forest),
# one column per metric
metric_columns = {
    'Train Accuracy': [train_acc_lr, train_acc_rf],
    'Test Accuracy': [test_acc_lr, test_acc_rf],
    'Test Precision': [test_precision_lr, test_precision_rf],
    'Test Recall': [test_recall_lr, test_recall_rf],
    'Test F1': [test_f1_lr, test_f1_rf],
    'Test AUC': [test_auc_lr, test_auc_rf],
}
comparison = pd.DataFrame({
    'Model': ['Logistic Regression', 'Random Forest'],
    **metric_columns,
})

banner = "=" * 70
print(banner)
print("BASELINE MODEL COMPARISON")
print(banner)
print(comparison.to_string(index=False))

# The winner is the row with the highest test AUC
best_idx = comparison['Test AUC'].idxmax()
best_model = comparison.at[best_idx, 'Model']
best_auc = comparison.at[best_idx, 'Test AUC']

print(f"\n✓ Best Model: {best_model} (AUC: {best_auc:.4f})")

In [None]:
# Grouped bar charts: accuracy on the left, remaining test metrics on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
width = 0.35

# Row order of `comparison`: 0 = logistic regression, 1 = random forest
model_labels = ['Logistic Regression', 'Random Forest']
bar_offsets = (-width / 2, width / 2)

metrics = ['Train Accuracy', 'Test Accuracy']
test_metrics = ['Test Precision', 'Test Recall', 'Test F1', 'Test AUC']

# (axis, metric columns, y-axis label, title, extra tick-label options)
panels = [
    (axes[0], metrics, 'Accuracy', 'Model Accuracy Comparison', {}),
    (axes[1], test_metrics, 'Score', 'Test Set Metrics Comparison', {'rotation': 45}),
]

for ax, columns, y_label, title, tick_options in panels:
    positions = np.arange(len(columns))
    # One bar group per metric, one bar per model within each group
    for row, (offset, model_label) in enumerate(zip(bar_offsets, model_labels)):
        ax.bar(positions + offset, comparison.iloc[row][columns], width,
               label=model_label)
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.set_xticks(positions)
    ax.set_xticklabels(columns, **tick_options)
    ax.legend()
    # Fixed y-range shared by both panels
    ax.set_ylim([0.5, 0.8])

plt.tight_layout()
plt.show()

## 7. ROC Curves

Compare model discrimination ability.

In [None]:
# Calculate ROC curves
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_test_proba_lr)
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_test_proba_rf)

# Plot ROC curves
plt.figure(figsize=(10, 6))
plt.plot(fpr_lr, tpr_lr, label=f'Logistic Regression (AUC = {test_auc_lr:.3f})', linewidth=2)
plt.plot(fpr_rf, tpr_rf, label=f'Random Forest (AUC = {test_auc_rf:.3f})', linewidth=2)
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess', linewidth=1)

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves - Baseline Models')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 8. Confusion Matrices

Visualize prediction errors.

In [None]:
# Calculate confusion matrices
cm_lr = confusion_matrix(y_test, y_test_pred_lr)
cm_rf = confusion_matrix(y_test, y_test_pred_rf)

# Plot confusion matrices
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Logistic Regression
sns.heatmap(cm_lr, annot=True, fmt='d', cmap='Blues', 
            xticklabels=['Away Win', 'Home Win'],
            yticklabels=['Away Win', 'Home Win'],
            ax=axes[0])
axes[0].set_title('Logistic Regression\nConfusion Matrix')
axes[0].set_ylabel('True Label')
axes[0].set_xlabel('Predicted Label')

# Random Forest
sns.heatmap(cm_rf, annot=True, fmt='d', cmap='Greens',
            xticklabels=['Away Win', 'Home Win'],
            yticklabels=['Away Win', 'Home Win'],
            ax=axes[1])
axes[1].set_title('Random Forest\nConfusion Matrix')
axes[1].set_ylabel('True Label')
axes[1].set_xlabel('Predicted Label')

plt.tight_layout()
plt.show()

## 9. Save Models

Export trained models for future use.

In [None]:
import pickle
import boto3
from io import BytesIO

# Upload the pickled artifacts to S3 under a common key prefix
s3 = boto3.client('s3')
model_prefix = 'ml-models/baseline'

print("Saving models to S3...")

# (object to pickle, file name under model_prefix, confirmation message)
# The logistic regression is stored together with its fitted scaler.
artifacts = [
    ({'model': lr_model, 'scaler': scaler}, 'logistic_regression.pkl',
     "  ✓ Logistic Regression saved"),
    (rf_model, 'random_forest.pkl', "  ✓ Random Forest saved"),
    (feature_cols, 'feature_names.pkl', "  ✓ Feature names saved"),
]

for payload, file_name, done_message in artifacts:
    # Serialize in memory and upload the bytes directly
    s3.put_object(
        Bucket=S3_BUCKET,
        Key=f'{model_prefix}/{file_name}',
        Body=pickle.dumps(payload),
    )
    print(done_message)

print(f"\n✓ Models saved to s3://{S3_BUCKET}/{model_prefix}/")

## 10. Summary

Baseline model results and recommendations.

In [None]:
# Final recap: models, test performance, goal check, outputs, next steps
rule = "=" * 70
print("\n" + rule)
print("BASELINE MODEL SUMMARY")
print(rule)

print("\n".join([
    "\n📊 Models Trained:",
    "  1. Logistic Regression",
    "  2. Random Forest (100 trees, max_depth=10)",
]))

print("\n📈 Performance (Test Set):")
print(f"  Logistic Regression: {test_acc_lr:.1%} accuracy, {test_auc_lr:.3f} AUC")
print(f"  Random Forest:       {test_acc_rf:.1%} accuracy, {test_auc_rf:.3f} AUC")

best_accuracy = comparison.loc[best_idx, 'Test Accuracy']
print(f"\n🏆 Best Model: {best_model}")
print(f"  Test Accuracy: {best_accuracy:.1%}")
print(f"  Test AUC:      {best_auc:.3f}")

# Goal check: the best test accuracy of either model must exceed 60%
goal_met = comparison['Test Accuracy'].max() > 0.60
if not goal_met:
    print("\n⚠️  Goal not met: Accuracy ≤ 60%")
    print("   Consider: More features, hyperparameter tuning, advanced models")
else:
    print("\n✓ Goal achieved: Accuracy > 60%")

print("\n".join([
    "\n💾 Outputs:",
    f"  - Models saved to S3: s3://{S3_BUCKET}/ml-models/baseline/",
    "  - Logistic Regression: logistic_regression.pkl",
    "  - Random Forest: random_forest.pkl",
    "  - Feature names: feature_names.pkl",
]))

print("\n".join([
    "\n🎯 Next Steps:",
    "  1. Review feature importance (which features matter most?)",
    "  2. Try advanced models (XGBoost, LightGBM) in 04_advanced_models.ipynb",
    "  3. Hyperparameter tuning for better performance",
    "  4. Feature engineering (add more features if needed)",
]))

print(rule)