In [1]:
%env CUDA_LAUNCH_BLOCKING=1
import pandas as pd
import torch
import sklearn as sk
from datetime import date
import torch
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import numpy as np



env: CUDA_LAUNCH_BLOCKING=1


In [2]:
# import the data
path = '/home/projects/baseball-datasets/data/final/atbats.csv'
df = pd.read_csv(path, parse_dates=['date'])
df = df[['date', 'pitcher', 'batter', 'outcome']]
print(df.describe(include='all'))

                                 date   pitcher    batter    outcome
count                         2695558   2695558   2695558    2695558
unique                            NaN      2721      3730         11
top                               NaN  schem001  freef001  strikeout
freq                              NaN     12342      9310     487899
mean    2017-05-09 02:17:18.417723136       NaN       NaN        NaN
min               2011-03-31 00:00:00       NaN       NaN        NaN
25%               2014-04-18 00:00:00       NaN       NaN        NaN
50%               2017-05-09 00:00:00       NaN       NaN        NaN
75%               2020-09-08 00:00:00       NaN       NaN        NaN
max               2023-10-01 00:00:00       NaN       NaN        NaN


In [3]:
# Do some exploratory data analysis
# how many outcomes with unknown values?
print('Number of unknown outcomes: ', len(df[df.outcome == 'unknown']))
# How many at bats are there?
print('Number of at bats: ', len(df))
# How many unique pitchers are there?
print('Number of pitchers: ', len(df.pitcher.unique()))
# How many unique batters are there?
print('Number of batters: ', len(df.batter.unique()))
# spanning how many seasons?
print('Number of seasons: ', min(df.date.dt.year), ' to ', max(df.date.dt.year))
# check to make sure date is in the right format
print('Date format: ', type(df.date[0]))


 


Number of unknown outcomes:  358072
Number of at bats:  2695558
Number of pitchers:  2721
Number of batters:  3730
Number of seasons:  2011  to  2023
Date format:  <class 'pandas._libs.tslibs.timestamps.Timestamp'>


In [4]:
# preprocess the data
# onehot encode the outcome into multiple columns as 0 or 1

mean_df = pd.get_dummies(df, columns=['outcome'], dtype='int64')
# df.head()
# convert datatype to in

In [5]:
# function that splits the data into train, test, and validation sets, the year of 2023 is used as the test set
def split_data(df, train_cuttoff='2022-01-01', test_cuttoff='2023-01-01'):
    train = df[df['date'] < train_cuttoff]
    valid = df[(df['date'] >= train_cuttoff) & (df['date'] < test_cuttoff)]
    test = df[df['date'] >= test_cuttoff]
    return train, valid, test

train_df, valid_df, test_df = split_data(df)
train_df.head()
train_df['outcome'].unique()

array(['popout', 'walk', 'double', 'single', 'flyout', 'groundout',
       'lineout', 'strikeout', 'homerun', 'unknown', 'triple'],
      dtype=object)

In [6]:
# Get the baseline accuracy for the test set based on predicting a distribution of the training set
# Convert the train, test, and valid set to a tensor
train_df_mean, valid_df_mean, test_df_mean = split_data(mean_df)
train_tensor = torch.tensor(train_df_mean.iloc[:, 3:].to_numpy().astype(float), dtype=torch.float64)
test_tensor = torch.tensor(test_df_mean.iloc[:, 3:].to_numpy().astype(float), dtype=torch.float64)
valid_tensor = torch.tensor(valid_df_mean.iloc[:, 3:].to_numpy().astype(float), dtype=torch.float64)

# Get the distribution of the training set
train_dist = torch.mean(train_tensor, dim=0, dtype=torch.float64)
# duplicate for every row in the test set
train_dist_test = train_dist.repeat(len(test_tensor), 1)
# duplicate for every row in the valid set
train_dist_valid = train_dist.repeat(len(valid_tensor), 1)

# Calculate the cross entropy loss of the test set
valid_loss = torch.nn.functional.cross_entropy(train_dist_test, test_tensor)
test_loss = torch.nn.functional.cross_entropy(train_dist_valid, valid_tensor)
print(train_dist)
print('Test loss: ', test_loss.item())
print('Valid loss: ', valid_loss.item())

tensor([0.0775, 0.1076, 0.1496, 0.0369, 0.0415, 0.0365, 0.1568, 0.1789, 0.0068,
        0.1337, 0.0740], dtype=torch.float64)
Test loss:  2.3628871009774945
Valid loss:  2.362509602861311


Now we are going to use a Neural Matrix Factorization Model in order to try and predict the outcomes 

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class NCF(nn.Module):
    def __init__(
        self, num_pitchers, num_batters, num_outcomes, embed_dim=50, num_layers=2
    ):
        super(NCF, self).__init__()

        # Increase the embedding dimension
        self.pitcher_embedding = nn.Embedding(num_pitchers, embed_dim)
        self.batter_embedding = nn.Embedding(num_batters, embed_dim)

        self.fc1 = nn.Linear(embed_dim * 2, 128)  # Increase the hidden layer size
        self.linear_layers = nn.ModuleList(
            [nn.Linear(128, 128) for _ in range(num_layers - 2)]
        )
        self.dropout = nn.ModuleList([nn.Dropout(0.3) for _ in range(num_layers - 1)])
        self.output_layer = nn.Linear(128, num_outcomes)

        # Add Dropout layers
        self.dropout1 = nn.Dropout(0.3)
        self.dropout2 = nn.Dropout(0.3)

    def forward(self, pitcher_ids, batter_ids):
        pitcher_embed = self.pitcher_embedding(pitcher_ids)
        batter_embed = self.batter_embedding(batter_ids)
        x = torch.cat([pitcher_embed, batter_embed], dim=1)
        x = F.relu(self.fc1(x))
        x = self.dropout[0](x)  # Dropout layer after first hidden layer
        for idx in range(len(self.linear_layers)):
            x = F.relu(self.linear_layers[idx](x))
            x = self.dropout[idx + 1](x)
        
        out = self.output_layer(x)
        
        return out

In [8]:
class BaseballDataset(Dataset):
    def __init__(self, df):
        self.pitcher = torch.tensor(df['pitcher'].to_numpy().astype(int), dtype=torch.long)
        self.batter = torch.tensor(df['batter'].to_numpy().astype(int), dtype=torch.long)
        self.data = torch.tensor(df['outcome'].to_numpy().astype(int), dtype=torch.long)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Assuming the first two columns are batter and pitcher indices, and the rest are outcomes
        return self.pitcher[idx], self.batter[idx], self.data[idx]

In [9]:
class DataPreprocessor:
    def __init__(self, train_df, proportion=.15):
        self.train_df = train_df
        self.known_batter_ids = set(train_df['batter'])
        self.known_pitcher_ids = set(train_df['pitcher'])

        # Create mappings for batter and pitcher IDs to indices
        self.batter_id_to_index = {id: idx for idx, id in enumerate(self.known_batter_ids, start=1)}  # Start from 1 to reserve 0 for 'unknown'
        self.pitcher_id_to_index = {id: idx for idx, id in enumerate(self.known_pitcher_ids, start=1)}  # Start from 1 to reserve 0 for 'unknown' 
        self.batter_id_to_index['unknown'] = 0
        self.pitcher_id_to_index['unknown'] = 0
        self.outcomes = {
            'single': 0, 
            'double': 1,
            'triple': 2,
            'homerun': 3,
            'walk': 4,
            'strikeout': 5,
            'groundout': 6,
            'flyout': 7,
            'lineout':8,
            'popout': 9,
            'unknown': 10
            }
    
    def augment(self, train_df, proportion=.15):
        # sample the data according to the proportion and remove either the pitcher or batter
        train_aug_pitcher = train_df.sample(frac=proportion/2)
        train_aug_pitcher['pitcher'] = 'unknown'
        train_aug_batter = train_df.sample(frac=proportion/2)
        train_aug_batter['batter'] = 'unknown'
        train_aug = pd.concat([train_df, train_aug_pitcher, train_aug_batter])
        return train_aug

    def preprocess(self, df, train=False, proportion=.15):
        # Replace unknown IDs and convert to indices
        processed_df = df.copy()
        if train:
            processed_df = self.augment(df, proportion=proportion)

        
        processed_df.loc[:, 'batter'] = processed_df['batter'].apply(lambda x: self.batter_id_to_index.get(x, 0))
        processed_df.loc[:, 'pitcher'] = processed_df['pitcher'].apply(lambda x: self.pitcher_id_to_index.get(x, 0))
        processed_df.loc[:, 'outcome'] = processed_df['outcome'].apply(lambda x: self.outcomes.get(x, 10))
        # Also need to drop the date column
        processed_df = processed_df.drop(columns=['date'])

        return processed_df

    def get_num_batters(self):
        return len(self.batter_id_to_index)
    
    def get_num_pitchers(self):
        return len(self.pitcher_id_to_index)    
    
    def get_num_outcomes(self):
        return len(self.outcomes)


In [10]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import torch.nn.functional as F

def train_model(train_dataloader, test_dataloader, preprocessor, learning_rate, epochs, embed_dim, num_layers, early_stopping_patience=5):
    # Initialize the model, loss, and optimizer with hyperparameters
    num_pitchers = preprocessor.get_num_pitchers()
    num_batters = preprocessor.get_num_batters()
    num_outcomes = preprocessor.get_num_outcomes()
    
    model = NCF(num_pitchers, num_batters, num_outcomes, embed_dim=embed_dim, num_layers=num_layers)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print('using: ', device)
    # device = torch.device("cpu")
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Early stopping variables
    best_val_loss = float('inf')
    epochs_no_improve = 0

    for epoch in range(epochs):
        # Training loop
        model.train()
        train_loss = 0.0
        for i, (pitchers, batters, outcomes) in enumerate(train_dataloader):
            pitchers, batters, outcomes = pitchers.to(device), batters.to(device), outcomes.to(device)
            # check all data types
            outputs = model(pitchers, batters)
            loss = criterion(input=outputs, target=outcomes)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            print(loss.item())
        train_loss /= len(train_dataloader)

        # Validation loop
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for pitchers, batters, outcomes in test_dataloader:
                pitchers, batters, outcomes = pitchers.to(device), batters.to(device), outcomes.to(device)
                outputs = model(pitchers, batters)
                loss = criterion(outputs, outcomes)
                val_loss += loss.item()
        val_loss /= len(test_dataloader)

        print(f"Epoch [{epoch+1}/{epochs}], Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

        # Early stopping check
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1
            if epochs_no_improve == early_stopping_patience:
                print(f"Early stopping triggered after {epoch+1} epochs")
                break

    return val_loss



In [11]:
LEARNING_RATE = 0.001
EPOCHS = 20
EMBED_DIM = 50
NUM_LAYERS = 3

preprocessor = DataPreprocessor(train_df)

train_df_processed = preprocessor.preprocess(train_df, train=True)
valid_df_processed = preprocessor.preprocess(valid_df)
test_df_processed = preprocessor.preprocess(test_df)

# Create Dataset Instances
train_dataset = BaseballDataset(train_df_processed)
valid_dataset = BaseballDataset(valid_df_processed)
test_dataset = BaseballDataset(test_df_processed)

train_dataloader = DataLoader(train_dataset, batch_size=512, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=512, shuffle=False)

val_loss = train_model(train_dataloader, valid_dataloader, preprocessor, LEARNING_RATE, EPOCHS, EMBED_DIM, NUM_LAYERS)
# print(f"Validation Loss: {val_loss}")

using:  cuda:0
Epoch [1/20], Train Loss: 2.1609, Validation Loss: 2.0800
Epoch [2/20], Train Loss: 2.1303, Validation Loss: 2.0618
Epoch [3/20], Train Loss: 2.1238, Validation Loss: 2.0485
Epoch [4/20], Train Loss: 2.1207, Validation Loss: 2.0426
Epoch [5/20], Train Loss: 2.1191, Validation Loss: 2.0455
Epoch [6/20], Train Loss: 2.1180, Validation Loss: 2.0440
Epoch [7/20], Train Loss: 2.1171, Validation Loss: 2.0462
Epoch [8/20], Train Loss: 2.1163, Validation Loss: 2.0447
Epoch [9/20], Train Loss: 2.1154, Validation Loss: 2.0417
Epoch [10/20], Train Loss: 2.1150, Validation Loss: 2.0408
Epoch [11/20], Train Loss: 2.1144, Validation Loss: 2.0436
Epoch [12/20], Train Loss: 2.1138, Validation Loss: 2.0430
Epoch [13/20], Train Loss: 2.1134, Validation Loss: 2.0435
Epoch [14/20], Train Loss: 2.1130, Validation Loss: 2.0468
Epoch [15/20], Train Loss: 2.1124, Validation Loss: 2.0478
Early stopping triggered after 15 epochs


In [12]:
class NCF(nn.Module):
    def __init__(
        self, num_pitchers, num_batters, num_outcomes, embed_dim=50):
        super(NCF, self).__init__()
        # Increase the embedding dimension
        self.pitcher_embedding = nn.Embedding(num_pitchers, embed_dim)
        self.batter_embedding = nn.Embedding(num_batters, embed_dim)
        self.output_layer = nn.Linear(embed_dim*2, num_outcomes)
        # Add Dropout layers

    def forward(self, pitcher_ids, batter_ids):
        pitcher_embed = self.pitcher_embedding(pitcher_ids)
        batter_embed = self.batter_embedding(batter_ids)
        x = torch.cat([pitcher_embed, batter_embed], dim=1)
        out = self.output_layer(x)
        
        return out
    
criterion = nn.CrossEntropyLoss()
device = torch.device("cpu")

pitcher = torch.tensor([1], dtype=torch.long).to(device)
batter = torch.tensor([0], dtype=torch.long).to(device)

model = NCF(2, 2, 3).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=.001)
model.train()
out = model(pitcher, batter)

target = torch.tensor([1], dtype=torch.long)
loss = criterion(out, target)
loss.backward()
optimizer.step()