# College Basketball Prediction Model Training
## GPU-Accelerated Training on Google Colab

This notebook trains deep learning models for college basketball game predictions using historical data.

**Hardware Requirements:**
- Runtime → Change runtime type → GPU (T4 or better)

**Steps:**
1. Upload your training data
2. Configure training parameters
3. Train models on GPU
4. Evaluate performance
5. Download trained models

## 1. Setup and Installation

In [None]:
# Install required packages
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install pandas numpy scikit-learn matplotlib seaborn tqdm

In [None]:
# Check GPU availability
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import json
import os

# Prefer the CUDA device when the runtime exposes one; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# When running on CUDA, report the name and total memory of GPU 0.
if device.type == 'cuda':
    gpu_properties = torch.cuda.get_device_properties(0)
    total_memory_gb = gpu_properties.total_memory / 1e9
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {total_memory_gb:.2f} GB")

## 2. Upload Training Data

Upload your `basketball_training_data.csv` file generated from the data preparation script.

In [None]:
from google.colab import files

print("Upload your basketball_training_data.csv file:")
uploaded = files.upload()

# Fail with an actionable message instead of an IndexError when nothing was
# uploaded (for example because the upload dialog was dismissed).
if not uploaded:
    raise RuntimeError(
        "No file was uploaded. Re-run this cell and select "
        "basketball_training_data.csv."
    )

# Load the data from the first uploaded file; any extra files are ignored
data_file = next(iter(uploaded))
if len(uploaded) > 1:
    print(f"Note: {len(uploaded)} files uploaded; using '{data_file}'")
df = pd.read_csv(data_file)

print(f"\nData loaded: {len(df)} games")
print(f"Features: {df.shape[1]} columns")
print("\nFirst few rows:")
# Keep this as the cell's last expression so the notebook renders the preview
df.head()

## 3. Data Preprocessing

In [None]:
# Define feature columns (customize based on your data)
feature_columns = [
    'home_offensive_efficiency', 'home_defensive_efficiency', 'home_tempo',
    'away_offensive_efficiency', 'away_defensive_efficiency', 'away_tempo',
    'home_kenpom_rating', 'away_kenpom_rating',
    'home_recent_form', 'away_recent_form',
    'neutral_site', 'tournament_game',
    'home_wins', 'home_losses', 'away_wins', 'away_losses',
    'home_avg_points', 'away_avg_points',
    'home_avg_points_allowed', 'away_avg_points_allowed'
]

# Target columns. The order is significant: the model output, the training
# losses and the evaluation metrics all address these by position
# (0 = win, 1 = margin, 2 = total).
target_columns = [
    'home_won',  # Binary: 1 if home won, 0 if away won
    'point_differential',  # Home points - Away points
    'total_points'  # Total combined points
]

# Features are optional: train on whichever are present, but report the rest
available_features = [col for col in feature_columns if col in df.columns]
missing_features = [col for col in feature_columns if col not in df.columns]
if missing_features:
    print(f"Warning: skipping {len(missing_features)} missing feature column(s): {missing_features}")
if not available_features:
    raise ValueError("None of the expected feature columns were found in the uploaded data.")

# Targets are mandatory: silently dropping one would shift the remaining
# columns into the wrong positional slot (e.g. a margin fed to the BCE loss).
missing_targets = [col for col in target_columns if col not in df.columns]
if missing_targets:
    raise ValueError(f"Required target column(s) missing from the uploaded data: {missing_targets}")
available_targets = list(target_columns)

# NaN values in features or targets would turn the training losses into NaN,
# so incomplete rows are removed up front (df itself is left untouched).
model_df = df.dropna(subset=available_features + available_targets)
dropped_rows = len(df) - len(model_df)
if dropped_rows:
    print(f"Warning: dropped {dropped_rows} row(s) with missing values")
if model_df.empty:
    raise ValueError("No complete rows remain after removing rows with missing values.")

print(f"Using {len(available_features)} features")
print(f"Predicting {len(available_targets)} targets")

# Prepare data
X = model_df[available_features].values
y = model_df[available_targets].values

# Split data: 20% held out for testing, then 15% of the remainder for validation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.15, random_state=42
)

print("\nData split:")
print(f"  Training: {len(X_train)} samples")
print(f"  Validation: {len(X_val)} samples")
print(f"  Test: {len(X_test)} samples")

# Standardize features. The scaler is fitted on the training split only so
# that validation/test statistics do not leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Save scaler parameters so the same transformation can be re-applied later
scaler_params = {
    'mean': scaler.mean_.tolist(),
    'scale': scaler.scale_.tolist(),
    'feature_names': available_features
}

with open('scaler_params.json', 'w') as f:
    json.dump(scaler_params, f, indent=2)

## 4. Dataset and DataLoader

In [None]:
class BasketballDataset(Dataset):
    """In-memory dataset pairing a feature matrix with its target matrix."""

    def __init__(self, X, y):
        """Convert ``X`` and ``y`` to float tensors and store them on the instance."""
        self.X, self.y = torch.FloatTensor(X), torch.FloatTensor(y)

    def __len__(self):
        """Return the number of rows in the feature tensor."""
        return self.X.size(0)

    def __getitem__(self, idx):
        """Return the ``(features, targets)`` pair stored at position ``idx``."""
        sample_features = self.X[idx]
        sample_targets = self.y[idx]
        return sample_features, sample_targets

# Create datasets
train_dataset = BasketballDataset(X_train_scaled, y_train)
val_dataset = BasketballDataset(X_val_scaled, y_val)
test_dataset = BasketballDataset(X_test_scaled, y_test)

# Create dataloaders
batch_size = 128

# The model uses BatchNorm1d, which cannot compute batch statistics from a
# single sample in training mode. If the training set size leaves exactly one
# sample in the final batch, drop that batch instead of crashing mid-epoch.
drop_trailing_singleton = len(train_dataset) > 1 and len(train_dataset) % batch_size == 1
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=drop_trailing_singleton,
)
# Evaluation loaders keep every sample and a fixed order
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

print(f"Batch size: {batch_size}")
print(f"Training batches: {len(train_loader)}")
print(f"Validation batches: {len(val_loader)}")

## 5. Model Architecture

In [None]:
class BasketballPredictionModel(nn.Module):
    """Multi-task MLP with a shared trunk and three single-output heads.

    The trunk stacks ``Linear -> BatchNorm1d -> ReLU -> Dropout`` once per
    entry in ``hidden_sizes``. Each head is ``Linear -> ReLU -> Linear``; the
    win head additionally ends in a sigmoid so its output lies in [0, 1].

    Args:
        input_size: Number of input features per sample.
        hidden_sizes: Widths of the trunk layers, in order. An immutable
            default is used so the default cannot be mutated across instances.
        dropout: Dropout probability applied after every trunk layer.
    """

    def __init__(self, input_size, hidden_sizes=(256, 128, 64), dropout=0.3):
        super().__init__()

        layers = []
        prev_size = input_size

        for hidden_size in hidden_sizes:
            layers.extend([
                nn.Linear(prev_size, hidden_size),
                nn.BatchNorm1d(hidden_size),
                nn.ReLU(),
                nn.Dropout(dropout)
            ])
            prev_size = hidden_size

        # Attribute names and layer order are kept stable so that previously
        # saved state_dicts continue to load.
        self.feature_extractor = nn.Sequential(*layers)

        # Output heads for different predictions
        self.win_predictor = nn.Sequential(
            nn.Linear(prev_size, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()  # Win probability (0-1)
        )

        self.margin_predictor = nn.Sequential(
            nn.Linear(prev_size, 32),
            nn.ReLU(),
            nn.Linear(32, 1)  # Point differential
        )

        self.total_predictor = nn.Sequential(
            nn.Linear(prev_size, 32),
            nn.ReLU(),
            nn.Linear(32, 1)  # Total points
        )

    def forward(self, x):
        """Return a ``(batch, 3)`` tensor: win probability, margin, total.

        The three head outputs are concatenated along dim 1 in that order.
        """
        features = self.feature_extractor(x)
        win_prob = self.win_predictor(features)
        margin = self.margin_predictor(features)
        total = self.total_predictor(features)
        return torch.cat([win_prob, margin, total], dim=1)

# Build the network for the scaled feature width, then move it to the selected device
input_size = X_train_scaled.shape[1]
model = BasketballPredictionModel(input_size)
model = model.to(device)

# Summarise the architecture and count every parameter tensor's elements
parameter_count = sum(param.numel() for param in model.parameters())
print("Model architecture:")
print(model)
print(f"\nTotal parameters: {parameter_count:,}")

## 6. Training Configuration

In [None]:
# Loss functions
bce_loss = nn.BCELoss()  # For win probability (the win head already applies a sigmoid)
mse_loss = nn.MSELoss()  # For margin and total

# Optimizer and scheduler
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
# Halve the learning rate after 5 epochs without validation-loss improvement.
# The `verbose` argument is intentionally not passed: it is deprecated in
# PyTorch and the training loop already prints the current LR every epoch.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=5
)

# Training parameters
num_epochs = 100
early_stop_patience = 15  # Epochs without improvement before training stops
best_val_loss = float('inf')
patience_counter = 0

# Loss weights for multi-task learning
loss_weights = {
    'win': 2.0,      # Most important for betting
    'margin': 1.5,   # Important for spread betting
    'total': 1.0     # Important for totals betting
}

print("Training configuration:")
print(f"  Epochs: {num_epochs}")
print(f"  Early stopping patience: {early_stop_patience}")
print(f"  Loss weights: {loss_weights}")

## 7. Training Loop

In [None]:
def train_epoch(model, dataloader, optimizer, device):
    """Run one optimisation pass over ``dataloader`` and return the mean loss.

    Uses the module-level ``bce_loss``, ``mse_loss`` and ``loss_weights``.
    Target/prediction column 0 is scored with BCE, columns 1 and 2 with MSE.

    Args:
        model: Network to train; switched to training mode.
        dataloader: Yields ``(features, targets)`` batches.
        optimizer: Optimizer stepped once per batch.
        device: Device the batches are moved to.

    Returns:
        The weighted multi-task loss averaged over samples. Each batch
        contributes in proportion to its size, so a short final batch does
        not skew the epoch figure.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    model.train()
    running_loss = 0.0
    samples_seen = 0

    for X_batch, y_batch in tqdm(dataloader, desc="Training"):
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()
        predictions = model(X_batch)

        # Calculate individual losses
        win_loss = bce_loss(predictions[:, 0], y_batch[:, 0])
        margin_loss = mse_loss(predictions[:, 1], y_batch[:, 1])
        total_loss_val = mse_loss(predictions[:, 2], y_batch[:, 2])

        # Weighted combination
        loss = (loss_weights['win'] * win_loss +
                loss_weights['margin'] * margin_loss +
                loss_weights['total'] * total_loss_val)

        loss.backward()
        # Cap the global gradient norm before the update step
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        # loss is a per-batch mean; scale by batch size to accumulate a sum
        batch_len = X_batch.size(0)
        running_loss += loss.item() * batch_len
        samples_seen += batch_len

    if samples_seen == 0:
        raise ValueError("train_epoch received a dataloader with no samples")

    return running_loss / samples_seen

def validate(model, dataloader, device):
    """Evaluate ``model`` on ``dataloader`` without gradients; return the mean loss.

    Uses the module-level ``bce_loss``, ``mse_loss`` and ``loss_weights``.
    Target/prediction column 0 is scored with BCE, columns 1 and 2 with MSE.

    Args:
        model: Network to evaluate; switched to eval mode.
        dataloader: Yields ``(features, targets)`` batches.
        device: Device the batches are moved to.

    Returns:
        The weighted multi-task loss averaged over samples. Each batch
        contributes in proportion to its size, so a short final batch does
        not skew the figure.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    model.eval()
    running_loss = 0.0
    samples_seen = 0

    with torch.no_grad():
        for X_batch, y_batch in dataloader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            predictions = model(X_batch)

            win_loss = bce_loss(predictions[:, 0], y_batch[:, 0])
            margin_loss = mse_loss(predictions[:, 1], y_batch[:, 1])
            total_loss_val = mse_loss(predictions[:, 2], y_batch[:, 2])

            loss = (loss_weights['win'] * win_loss +
                    loss_weights['margin'] * margin_loss +
                    loss_weights['total'] * total_loss_val)

            # loss is a per-batch mean; scale by batch size to accumulate a sum
            batch_len = X_batch.size(0)
            running_loss += loss.item() * batch_len
            samples_seen += batch_len

    if samples_seen == 0:
        raise ValueError("validate received a dataloader with no samples")

    return running_loss / samples_seen

# Per-epoch record of losses and learning rate; also stored in each checkpoint
history = {'train_loss': [], 'val_loss': [], 'learning_rates': []}

print("Starting training...\n")

for epoch in range(num_epochs):
    # One optimisation pass followed by a pass over the validation split
    train_loss = train_epoch(model, train_loader, optimizer, device)
    val_loss = validate(model, val_loader, device)

    # Let the scheduler react to the validation loss before the LR is read
    scheduler.step(val_loss)
    current_lr = optimizer.param_groups[0]['lr']

    # Record this epoch
    history['train_loss'] += [train_loss]
    history['val_loss'] += [val_loss]
    history['learning_rates'] += [current_lr]

    # Report progress
    print(f"Epoch {epoch+1}/{num_epochs}:")
    print(f"  Train Loss: {train_loss:.4f}")
    print(f"  Val Loss: {val_loss:.4f}")
    print(f"  LR: {current_lr:.6f}")

    # Checkpoint on improvement; otherwise count towards early stopping
    if not (val_loss < best_val_loss):
        patience_counter += 1
    else:
        best_val_loss = val_loss
        patience_counter = 0
        best_checkpoint = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'val_loss': val_loss,
            'history': history
        }
        torch.save(best_checkpoint, 'best_model.pth')
        print(f"  ✓ New best model saved!")

    # Stop once the validation loss has stalled for the configured patience
    if patience_counter >= early_stop_patience:
        print(f"\nEarly stopping triggered after {epoch+1} epochs")
        break

    print()

print("\n✓ Training complete!")

## 8. Visualize Training

In [None]:
# Two side-by-side panels: loss curves on the left, learning rate on the right
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
loss_ax, lr_ax = axes[0], axes[1]

# Left panel: training vs. validation loss per epoch
train_curve = history['train_loss']
val_curve = history['val_loss']
loss_ax.plot(train_curve, label='Train Loss')
loss_ax.plot(val_curve, label='Validation Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Training and Validation Loss')
loss_ax.legend()
loss_ax.grid(True)

# Right panel: learning rate per epoch on a logarithmic axis
lr_curve = history['learning_rates']
lr_ax.plot(lr_curve)
lr_ax.set_xlabel('Epoch')
lr_ax.set_ylabel('Learning Rate')
lr_ax.set_title('Learning Rate Schedule')
lr_ax.set_yscale('log')
lr_ax.grid(True)

# Write the figure to disk before showing it
plt.tight_layout()
plt.savefig('training_history.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"Best validation loss: {best_val_loss:.4f}")

## 9. Evaluate Model

In [None]:
# Load best model. map_location remaps the stored tensors onto the current
# device, so a checkpoint written on a GPU runtime also loads on a CPU runtime.
checkpoint = torch.load('best_model.pth', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])

# Evaluate on test set
model.eval()
predictions_list = []
actuals_list = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        predictions = model(X_batch).cpu().numpy()
        predictions_list.append(predictions)
        # Targets were never moved off the CPU, so they convert directly
        actuals_list.append(y_batch.numpy())

# Stack the per-batch arrays into (n_samples, 3) matrices
predictions = np.vstack(predictions_list)
actuals = np.vstack(actuals_list)

# Calculate metrics
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error

# Win prediction accuracy: column 0 is a probability, thresholded at 0.5
win_predictions = (predictions[:, 0] > 0.5).astype(int)
win_actuals = actuals[:, 0].astype(int)
win_accuracy = accuracy_score(win_actuals, win_predictions)

# Margin prediction (column 1)
margin_mae = mean_absolute_error(actuals[:, 1], predictions[:, 1])
margin_rmse = np.sqrt(mean_squared_error(actuals[:, 1], predictions[:, 1]))

# Total prediction (column 2)
total_mae = mean_absolute_error(actuals[:, 2], predictions[:, 2])
total_rmse = np.sqrt(mean_squared_error(actuals[:, 2], predictions[:, 2]))

print("\n" + "="*50)
print("TEST SET PERFORMANCE")
print("="*50)
print("\nWin Prediction:")
print(f"  Accuracy: {win_accuracy:.1%}")
print("\nPoint Differential:")
print(f"  MAE: {margin_mae:.2f} points")
print(f"  RMSE: {margin_rmse:.2f} points")
print("\nTotal Points:")
print(f"  MAE: {total_mae:.2f} points")
print(f"  RMSE: {total_rmse:.2f} points")
print("\n" + "="*50)

# Save evaluation results (cast to built-in floats so they serialise to JSON)
evaluation_results = {
    'win_accuracy': float(win_accuracy),
    'margin_mae': float(margin_mae),
    'margin_rmse': float(margin_rmse),
    'total_mae': float(total_mae),
    'total_rmse': float(total_rmse),
    'test_samples': len(predictions)
}

with open('evaluation_results.json', 'w') as f:
    json.dump(evaluation_results, f, indent=2)

## 10. Download Trained Model

In [None]:
# Package model for deployment
import zipfile

# Artifacts written by the earlier cells that belong in the deployment bundle
package_name = 'basketball_model_package.zip'
package_members = (
    'best_model.pth',
    'scaler_params.json',
    'evaluation_results.json',
    'training_history.png',
)

# Add each artifact to the archive under its own file name
with zipfile.ZipFile(package_name, 'w') as zipf:
    for member in package_members:
        zipf.write(member)

print("Model package created!")
print("\nDownloading files...")

# Hand the archive to the browser via the Colab files helper
files.download(package_name)

print("\n✓ Download complete!")
for line in (
    "\nNext steps:",
    "1. Extract the zip file",
    "2. Move files to your project's models/ directory",
    "3. Use the deployment script to integrate the model",
):
    print(line)