This notebook describes the use of an autoencoder to predict future NBA games. The idea is to use the training data as "patterns". Then, when it comes time to predict, we would feed the network a "messy" pattern that consists of the two teams that are playing and all the other inputs are randomized. Ideally, the autoencoder should be able to reconstruct the game stats from just the messy input involving the two teams playing.

In [1]:
import pandas as pd
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets 
from torchvision import transforms
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from IPython.display import clear_output
import seaborn as sns
import itertools


# Building the Autoencoder

### __1.__ Prepare training data
1. We need to make sure all categorical varialbes are one-hot encoded.
2. Then 'mask' values (replace with 0s?) in corrupted training
3. Add normal/poisson noise to continuous variables (depending on scale)
4. Drop unneeded variables

In [2]:
df_games = pd.read_csv("df_games_output.csv")
df_games.drop(columns=['gameId','gameDate'],inplace=True)

Imputation

In [None]:
# Impute missing values with the mean for numerical columns
for col in df_games.columns:
    if df_games[col].isna().sum() > 0:  # Check if the column has missing values
        if df_games[col].dtype in ['float64', 'int64']:  # Only for numerical columns
            df_games[col].fillna(df_games[col].median(), inplace=True)
        if df_games[col].dtype == 'object':
            df_games[col].fillna('Missing', inplace=True)

In [4]:
# should show NO columns after imputation
for col in df_games.columns:
    if df_games[col].isna().sum() > 0:  # Check if the column has missing values
        print(f"Column {col} still has missing values after imputation.")

Turning `winner` into a PyTorch tensor

In [5]:
# creating team to index mapping for use with home team, away team, and winner one-hot encoded feaures
unique_teams = df_games['hometeamName'].unique()
team_to_index = {team: idx for idx, team in enumerate(unique_teams)}
labels_winner = torch.tensor(df_games['winner'].map(team_to_index).values)
one_hot_winner = F.one_hot(labels_winner,num_classes=len(unique_teams)).float()
# 33 features for 33 unique teams

In [6]:
winner_features = ["winner_" + team for team in df_games['hometeamName'].unique()]
# Features named "winner_TeamName"

Turning `hometeamName` into a PyTorch tensor

In [7]:
unique_teams = df_games['hometeamName'].unique()
team_to_index = {team: idx for idx, team in enumerate(unique_teams)}
labels_hometeam = torch.tensor(df_games['hometeamName'].map(team_to_index).values)
one_hot_hometeam = F.one_hot(labels_hometeam, num_classes=len(unique_teams)).float()
# [50581, 33]
print(one_hot_hometeam.shape)
# tensor has a column for each team (index 0 to 32) and the 0th index of the column is 1 if that row is the first team (Jazz)
# 33 home teams

torch.Size([50581, 33])


In [8]:
hometeam_features = ['hometeam_' + key for key in team_to_index.keys()]
# Features are named hometeam_TeamName

Turning `awayteamName` into a PyTorch tensor

In [9]:
labels_awayteam = torch.tensor(df_games['awayteamName'].map(team_to_index).values)
one_hot_awayteam = F.one_hot(labels_awayteam, num_classes=len(unique_teams)).float()
# 33 away teams

In [10]:
awayteam_features = ['awayteam_' + key for key in team_to_index.keys()]
# features are named "awayteam_TeamName"


Turning `gameType` into a PyTorch tensor

In [11]:
unique_gametypes = df_games['gameType'].unique()
type_to_index = {gametype: idx for idx, gametype in enumerate(unique_gametypes)}
labels_gametype= torch.tensor(df_games['gameType'].map(type_to_index).values)
one_hot_gametype = F.one_hot(labels_gametype, num_classes=len(unique_gametypes)).float()

In [12]:
gametype_features = ['gametype_' + key for key in type_to_index.keys()]
# 6 game types

Turning `df_games` into tensor for PyTorch

In [13]:
scaler = StandardScaler()
# scaling the numerical columns
for col in df_games.columns:
    if df_games[col].dtype in ['float64', 'int64']:
        df_games[[col]] = scaler.fit_transform(df_games[[col]])

# creating tensor
tensor_games = torch.tensor(df_games.drop(columns=['hometeamName','awayteamName','winner','gameType']).values)
tensor_numeric = torch.cat([tensor_games,one_hot_hometeam, one_hot_awayteam,one_hot_winner, one_hot_gametype], axis=1)

In [14]:
# All feature names
numeric_features = df_games.drop(columns=['hometeamName','awayteamName','winner','gameType']).columns.tolist()
tensor_features = numeric_features +  hometeam_features + awayteam_features + winner_features + gametype_features
tensor_features = [feat.replace(" ","_") for feat in tensor_features]

### __2.__ Build Model Architecture

In [15]:
# Static network class

class ae(nn.Module):
    def __init__(self):
        super().__init__() # specifies to run parent torch.nn.Module class __init__ method automatically when I initialize the child 'dae' class I'm making.
        # Encoding layers
        self.encoder = nn.Sequential(
            nn.Linear(145,24),
            nn.ReLU(),
            nn.Linear(24, 24), 
            nn.ReLU(),
            nn.Linear(24,8)
        )
        # Decoding layers
        self.decoder = nn.Sequential(
            nn.Linear(8,24),
            nn.ReLU(), 
            nn.Linear(24,24), 
            nn.ReLU(),
            nn.Linear(24, 145)
        )
    def forward(self,x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

In [16]:
# Flexible NN class
class fae(nn.Module):
    def __init__(self, input_dim, hidden_layers, hidden_dim,compression_dim):
        super().__init__() # specifies to run parent torch.nn.Module class __init__ method automatically when I initialize the child 'dae' class I'm making.

        # Encoding layers
        enc_lay = [nn.Linear(input_dim, hidden_dim), nn.ReLU()]
        for i in range(hidden_layers - 1):
            enc_lay.append(nn.Linear(hidden_dim, hidden_dim))
            enc_lay.append(nn.ReLU())
        enc_lay.append(nn.Linear(hidden_dim, compression_dim))
        self.encoder = nn.Sequential(*enc_lay)

        # Decoding layers
        output_dim = input_dim
        dec_lay = [nn.Linear(compression_dim, hidden_dim), nn.ReLU()]
        for i in range(hidden_layers - 1):
            dec_lay.append(nn.Linear(hidden_dim, hidden_dim))
            dec_lay.append(nn.ReLU())
        dec_lay.append(nn.Linear(hidden_dim, output_dim))
        self.decoder = nn.Sequential(*dec_lay)

    def forward(self,x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


### __3.__ Instantiate model
* Instantiate loss function for combo of continuous and one-hot encoded variables
> 1. We need to use the proper loss function for one-hot encoded features (binary cross-entropy; BCE).
> 2. Then, specify a composite loss function that includes both binary feature loss (BCE) and continiuous feature loss (MSE).
* Specify gradient descent algorithm

In [75]:

def loss_function(prediction, pattern):
    # numeric feature loss
    num_loss_fn = nn.MSELoss()
    num_loss = num_loss_fn(prediction[:,:40], pattern[:,:40])
    
    # CrossEntropy loss to auto apply softmax ans cross entropy loss to continuous outputs that should be treated as logits
    oh_loss_fn = nn.CrossEntropyLoss()
    ht_loss = oh_loss_fn(prediction[:, 40:(40+33)], torch.argmax(pattern[:, 40:(40+33)],dim=1))
    at_loss = oh_loss_fn(prediction[:, 73:(73+33)], torch.argmax(pattern[:, 73:(73+33)],dim = 1))
    win_loss = oh_loss_fn(prediction[:, 106:(106+33)], torch.argmax(pattern[:, 106:(106+33)],dim=1))
    gt_loss = oh_loss_fn(prediction[:, 139:145], torch.argmax(pattern[:, 139:145],dim=1))

    # weights
    weights = torch.tensor([40 / 145,33 / 145,33 / 145,33 / 145,6 / 145])
    losses = torch.stack([num_loss,ht_loss,at_loss,win_loss,gt_loss])
    loss = torch.dot(weights, losses)
    return loss

# AdaM (Adaptive Moment estimation) is a pretty fancy off the shelf algorithm. It involves tracking recent gradient values to dynamically control the learning rate, which aids convergence
optimizer = torch.optim.Adam(model.parameters(),
                             lr = 1e-1,
                             weight_decay=1e-8)


### __4.__ Set up training regimen

* Alter batch size
* Alter network structure
* Alter loss function


In [76]:

seed = 42  # You can use any integer
np.random.seed(seed)
torch.manual_seed(seed)
model1 = ae()

torch.manual_seed(seed)
model2 = fae(input_dim=145, hidden_layers=2, hidden_dim=24, compression_dim=8)


In [None]:
# TESTING TO MAKE SURE FELXIBLE AUTOENCODER WORKS

g = torch.Generator()
g.manual_seed(42)
loader = DataLoader(tensor_numeric, batch_size=64, shuffle=True, generator=g)
# number of training epochs
epochs = 5
output = []
losses1 = []

counter = 0
for epoch in range(1,epochs+1):
    for i, batch in enumerate(loader):
        if (i % 50) == 0: 
            print(f"\rEpoch {epoch} of {epochs}, Batch {i}    ",end='')
        reconstructed = model1(batch.float())
        # print(f"\nRECONSTRUCTED: {reconstructed}\nRECONSTRUCTED SHAPE: {reconstructed.shape}")
        loss = loss_function(reconstructed, batch.float())
        # print(f"Batch {i} Loss: {loss.item()}")
        # zero gradients
        optimizer.zero_grad()
        # backprop
        loss.backward()
        # update weights
        optimizer.step()
        # store losses
        losses1.append(loss.item())
        # store output
        counter += 1
        output.append((epoch,loss,counter))

print(losses1[:10])


# testing felxible autoencoder

g = torch.Generator()
g.manual_seed(42)
loader = DataLoader(tensor_numeric, batch_size=64, shuffle=True, generator=g)
# number of training epochs
epochs = 5
output = []
losses2 = []

counter = 0
for epoch in range(1,epochs+1):
    for i, batch in enumerate(loader):
        if (i % 50) == 0: 
            print(f"\rEpoch {epoch} of {epochs}, Batch {i}    ",end='')
        reconstructed = model2(batch.float())
        # print(f"\nRECONSTRUCTED: {reconstructed}\nRECONSTRUCTED SHAPE: {reconstructed.shape}")
        loss = loss_function(reconstructed, batch.float())
        # print(f"Batch {i} Loss: {loss.item()}")
        # zero gradients
        optimizer.zero_grad()
        # backprop
        loss.backward()
        # update weights
        optimizer.step()
        # store losses
        losses2.append(loss.item())
        # store output
        counter += 1
        output.append((epoch,loss,counter))

print(losses2[:10])

Epoch 5 of 5, Batch 750    [2.74603271484375, 2.7496278285980225, 2.7607407569885254, 2.7975330352783203, 2.7420668601989746, 2.754002571105957, 2.7259185314178467, 2.755215883255005, 2.7375264167785645, 2.737128496170044]
Epoch 5 of 5, Batch 750    [2.74603271484375, 2.7496278285980225, 2.7607407569885254, 2.7975330352783203, 2.7420668601989746, 2.754002571105957, 2.7259185314178467, 2.755215883255005, 2.7375264167785645, 2.737128496170044]


In [86]:
# hyperparamater options
hidden_layers_options = [1, 2, 3]
hidden_dim_options = [16, 24, 32]
compression_dim_options = [4, 8, 16]
batch_size_options = [128]

# Create the grid
param_grid = list(itertools.product(
    hidden_layers_options,
    hidden_dim_options,
    compression_dim_options,
    batch_size_options
))


for hidden_layers, hidden_dim, compression_dim, batch_size in param_grid:

    # Set seeds for reproducibility if needed
    torch.manual_seed(42)
    # Instantiate model
    model = fae(input_dim=145, hidden_layers=hidden_layers, hidden_dim=hidden_dim, compression_dim=compression_dim)
    # Set up DataLoader
    g = torch.Generator()
    g.manual_seed(42)
    loader = DataLoader(tensor_numeric, batch_size=batch_size, shuffle=True, generator=g)
    # number of training epochs
    epochs = 5
    losses = []

    for epoch in range(1,epochs+1):
        for i, batch in enumerate(loader):
            if (i % 50) == 0: 
                print(f"\rEpoch {epoch} of {epochs}, Batch {i}    ",end='')
            reconstructed = model(batch.float())
            # print(f"\nRECONSTRUCTED: {reconstructed}\nRECONSTRUCTED SHAPE: {reconstructed.shape}")
            loss = loss_function(reconstructed, batch.float())
            # print(f"Batch {i} Loss: {loss.item()}")
            # zero gradients
            optimizer.zero_grad()
            # backprop
            loss.backward()
            # update weights
            optimizer.step()
            # store losses
            losses.append(loss.item())
    print(f"\nFinal Loss for layers={hidden_layers}, hidden_dim={hidden_dim}, compression_dim={compression_dim}, batch_size={batch_size}: {round(losses[-1],3)}")
   


Epoch 5 of 5, Batch 350    
Final Loss for layers=1, hidden_dim=16, compression_dim=4, batch_size=128: 2.732
Epoch 5 of 5, Batch 350    
Final Loss for layers=1, hidden_dim=16, compression_dim=8, batch_size=128: 2.754
Epoch 5 of 5, Batch 350    
Final Loss for layers=1, hidden_dim=16, compression_dim=16, batch_size=128: 2.731
Epoch 5 of 5, Batch 350    
Final Loss for layers=1, hidden_dim=24, compression_dim=4, batch_size=128: 2.716
Epoch 5 of 5, Batch 350    
Final Loss for layers=1, hidden_dim=24, compression_dim=8, batch_size=128: 2.748
Epoch 5 of 5, Batch 350    
Final Loss for layers=1, hidden_dim=24, compression_dim=16, batch_size=128: 2.745
Epoch 5 of 5, Batch 350    
Final Loss for layers=1, hidden_dim=32, compression_dim=4, batch_size=128: 2.754
Epoch 5 of 5, Batch 350    
Final Loss for layers=1, hidden_dim=32, compression_dim=8, batch_size=128: 2.724
Epoch 5 of 5, Batch 350    
Final Loss for layers=1, hidden_dim=32, compression_dim=16, batch_size=128: 2.734
Epoch 5 of 5, Ba