In [1]:
# import game data
# parse game data: create a series of states
# create a list of all pairs (representing transitions)
# classify each pair as one of the possible outcomes
# create a probability distribution f(next state | outcome and current state)
# implement it into simulator

# Neural Collaborative Filtering

In [2]:
from retrosheet.retrosheet.parser import Retrosheet
rs = Retrosheet().batch_parse(yearFrom=2001, yearTo=2001)



In [3]:
import pandas as pd
plays_path = './data/plays.csv'

plays_data = pd.read_csv(plays_path)
plays_data

Unnamed: 0,game_id,order,pitcher,pitch_count,inning,team,player_id,count_on_batter,pitch_str,play_str,...,on-H,hometeam_score,awayteam_score,trajectory,passes,location,pre_state,post_state,play_runs,play_outs
0,ANA200104100,0,rappp001,2,1,0,greer001,1,CX,43/G,...,[],0,0,G,['43'],,1,9,0,1
1,ANA200104100,1,rappp001,7,1,0,velar001,12,CBSFX,8/F8D,...,[],0,0,F,['8'],8D,9,17,0,1
2,ANA200104100,2,rappp001,12,1,0,rodra001,22,CCBBX,63/G,...,[],0,0,G,['63'],,17,25,0,1
3,ANA200104100,3,olivd001,2,1,1,erstd001,1,CX,S7,...,[],0,0,,['7'],,1,2,0,0
4,ANA200104100,4,olivd001,6,1,1,ecksd001,12,CBSX,S9.1-2,...,[],0,0,,['9'],,2,5,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
381206,SLN200209280,73,vizcl001,9,8,1,rente001,32,BSFBBB,W,...,[],3,1,,[],,9,10,0,0
381207,SLN200209280,74,vizcl001,12,8,1,martt002,11,BFX,46(1)3/GDP,...,[],3,1,,['463'],,10,25,0,2
381208,SLN200209280,77,isrij001,1,9,0,gintk001,0,X,5/FL,...,[],3,1,,['5'],,1,9,0,1
381209,SLN200209280,78,isrij001,5,9,0,sexsr001,21,BFBX,43/G,...,[],3,1,G,['43'],,9,17,0,1


In [4]:
def add_outcome(x): 
    if "G" in x: 
        return "groundout"
    elif "F" in x: 
        return "flyout"
    elif "K" in x: 
        return "strikeout"
    elif "S" in x: 
        return "single"
    elif "D" in x: 
        return "double"
    elif "T" in x: 
        return "triple"
    elif "W" in x or "HP" in x: 
        return "walk"
    elif "H" in x: 
        return "homerun"
    else: 
        return "other"
plays_data["outcome"] = plays_data["play_str"].apply(add_outcome)
plays_data

Unnamed: 0,game_id,order,pitcher,pitch_count,inning,team,player_id,count_on_batter,pitch_str,play_str,...,hometeam_score,awayteam_score,trajectory,passes,location,pre_state,post_state,play_runs,play_outs,outcome
0,ANA200104100,0,rappp001,2,1,0,greer001,1,CX,43/G,...,0,0,G,['43'],,1,9,0,1,groundout
1,ANA200104100,1,rappp001,7,1,0,velar001,12,CBSFX,8/F8D,...,0,0,F,['8'],8D,9,17,0,1,flyout
2,ANA200104100,2,rappp001,12,1,0,rodra001,22,CCBBX,63/G,...,0,0,G,['63'],,17,25,0,1,groundout
3,ANA200104100,3,olivd001,2,1,1,erstd001,1,CX,S7,...,0,0,,['7'],,1,2,0,0,single
4,ANA200104100,4,olivd001,6,1,1,ecksd001,12,CBSX,S9.1-2,...,0,0,,['9'],,2,5,0,0,single
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
381206,SLN200209280,73,vizcl001,9,8,1,rente001,32,BSFBBB,W,...,3,1,,[],,9,10,0,0,walk
381207,SLN200209280,74,vizcl001,12,8,1,martt002,11,BFX,46(1)3/GDP,...,3,1,,['463'],,10,25,0,2,groundout
381208,SLN200209280,77,isrij001,1,9,0,gintk001,0,X,5/FL,...,3,1,,['5'],,1,9,0,1,flyout
381209,SLN200209280,78,isrij001,5,9,0,sexsr001,21,BFBX,43/G,...,3,1,G,['43'],,9,17,0,1,groundout


In [83]:
import numpy as np
from tqdm.notebook import tqdm
import torch
from torch.utils.data import Dataset, DataLoader


class regressor_dataset(Dataset): 
    def __init__(self, df):
        data = df[["pitcher", "player_id", "outcome"]]
        pitchers = data["pitcher"].unique()
        batters = data["player_id"].unique()

        self.pitcher_map = {id: idx for idx, id in enumerate(pitchers)}
        self.batter_map = {id: idx for idx, id in enumerate(batters)}
        self.outcome_map = {
            "strikeout": 0, 
            "groundout": 1, 
            "flyout": 2, 
            "single": 3, 
            "double": 4, 
            "triple": 5, 
            "homerun": 6, 
            "walk": 7, 
            "other": 8,
        }

        self.data = torch.zeros((len(data), 2), dtype=torch.long)
        self.target = torch.zeros(len(data), len(self.outcome_map))

        pbar = tqdm(data.itertuples(index=True))
        for row in pbar: 
            cur = [self.pitcher_map[row[1]]], [self.batter_map[row[2]]]
            outcome_idx = self.outcome_map[row[3]]
            self.data[row[0], 0] = torch.tensor([self.pitcher_map[row[1]]])
            self.data[row[0], 1] = torch.tensor([self.batter_map[row[2]]])
            self.target[row[0], outcome_idx] = 1 

    def __len__(self): 
        return self.data.size()[0]

    def __getitem__(self, idx): 
        return self.data[idx, :], self.target[idx, :]


    

In [106]:
args = {
    "layers": [64, 64],
    "factor_num": 5, 
    "epochs": 10,
    "lr": 0.01,
    "batch_size": 32,
}

mlp_data = regressor_dataset(plays_data)
train_loader = DataLoader(mlp_data, batch_size=args['batch_size'], shuffle=True)

0it [00:00, ?it/s]

In [107]:
import torch.nn as nn
class Multi_Layer_Perceptron(nn.Module):
    def __init__(self, args, num_users, num_items):
        super(Multi_Layer_Perceptron, self).__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.factor_num = args["factor_num"]
        self.layers = [2 * self.factor_num] + args["layers"]
        print(self.factor_num)

        self.embedding_user = nn.Embedding(num_embeddings=self.num_users, embedding_dim=self.factor_num)
        self.embedding_item = nn.Embedding(num_embeddings=self.num_items, embedding_dim=self.factor_num)

        self.fc_layers = nn.ModuleList()
       
        for idx, (in_size, out_size) in enumerate(zip(self.layers[:-1], self.layers[1:])):
            self.fc_layers.append(nn.Linear(in_size, out_size))

        self.affine_output = nn.Linear(in_features=self.layers[-1], out_features=9)
        self.logistic = nn.Sigmoid()

    def forward(self, indices):
        user_indices = indices[:, 0]
        item_indices = indices[:, 1]
        user_embedding = self.embedding_user(user_indices)
        item_embedding = self.embedding_item(item_indices)
        vector = torch.cat([user_embedding, item_embedding], dim=-1)  # the concat latent vector
        for idx, _ in enumerate(range(len(self.fc_layers))):
            vector = self.fc_layers[idx](vector)
            vector = nn.ReLU()(vector)
            # vector = nn.BatchNorm1d()(vector)
            # vector = nn.Dropout(p=0.5)(vector)
        logits = self.affine_output(vector)
        rating = self.logistic(logits)
        return rating

In [108]:
model = Multi_Layer_Perceptron(args, len(mlp_data.pitcher_map), len(mlp_data.batter_map))
optimizer = torch.optim.Adam(model.parameters(), lr=args["lr"])
bce_loss = nn.BCELoss()

5


In [109]:
total_iter = len(mlp_data)/args["batch_size"]
model.train()
for ep in range(args["epochs"]): 
    losses = []
    iter = 0
    for idxs, outcome in train_loader: 
        optimizer.zero_grad()
        output = model(idxs)
        loss = bce_loss(output, outcome)
        loss.backward()
        optimizer.step()
        losses.append(loss)
        iter += 1
        # print(f"percent done: {iter/total_iter*100}%")
    print(f"epoch: {ep}, loss: {round((sum(losses) / len(losses)).item(), 5)}")



epoch: 0, loss: 0.31279
epoch: 1, loss: 0.31121
epoch: 2, loss: 0.31128
epoch: 3, loss: 0.31127
epoch: 4, loss: 0.31119
epoch: 5, loss: 0.31125
epoch: 6, loss: 0.31117
epoch: 7, loss: 0.31116
epoch: 8, loss: 0.31109
epoch: 9, loss: 0.31103
