In [170]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsapi

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils import data
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

%matplotlib inline

In [186]:
import datetime

yesterday = (datetime.datetime.today() - datetime.timedelta(days = 1)).strftime("%m_%d_%Y")
today = datetime.datetime.today().strftime("%m_%d_%Y")

In [130]:
hits = pd.concat([pd.read_csv("player_stats_07_26_2019.csv"), 
                  pd.read_csv("player_stats_07_24_2019.csv"),
                  pd.read_csv("player_stats_07_27_2019.csv")
                 ], sort=False)

In [131]:
hits.set_index(np.arange(len(hits)), inplace=True)

In [132]:
hits['player_got_hit'] = hits['player_got_hit'].apply(float)

In [133]:
data = hits.iloc[:, 3:-1]
data

Unnamed: 0,gamesPlayed,groundOuts,runs,doubles,triples,homeRuns,strikeOuts,baseOnBalls,intentionalWalks,hits,...,walksPer9Inn_p5G,whip_p5G,winPercentage_p5G,atBats_h2h,avg_h2h,hits_h2h,obp_h2h,ops_h2h,slg_h2h,pitcher_hitter_opposite_hand
0,94.0,50.0,40.0,7.0,2.0,19.0,63.0,24.0,2.0,46.0,...,1.26,1.29,0.50,9.0,0.444,4.0,0.500,1.389,0.889,1.0
1,101.0,80.0,54.0,17.0,1.0,27.0,115.0,41.0,2.0,94.0,...,1.26,1.29,0.50,5.0,0.000,0.0,0.167,0.167,0.000,0.0
2,94.0,97.0,42.0,15.0,2.0,14.0,53.0,30.0,2.0,75.0,...,1.26,1.29,0.50,6.0,0.167,1.0,0.167,0.500,0.333,1.0
3,95.0,79.0,51.0,21.0,1.0,9.0,85.0,44.0,2.0,93.0,...,1.26,1.29,0.50,6.0,0.333,2.0,0.333,0.833,0.500,1.0
4,94.0,105.0,37.0,13.0,2.0,5.0,50.0,13.0,3.0,90.0,...,1.26,1.29,0.50,4.0,0.000,0.0,0.200,0.200,0.000,0.0
5,92.0,58.0,28.0,10.0,2.0,6.0,43.0,11.0,0.0,62.0,...,1.26,1.29,0.50,3.0,0.333,1.0,0.333,0.666,0.333,0.0
6,38.0,10.0,8.0,4.0,0.0,3.0,18.0,8.0,0.0,19.0,...,1.26,1.29,0.50,3.0,0.667,2.0,0.667,2.334,1.667,1.0
7,66.0,58.0,40.0,15.0,4.0,8.0,62.0,24.0,0.0,69.0,...,1.26,1.29,0.50,0.0,0.000,0.0,0.000,0.000,0.000,0.0
8,38.0,12.0,14.0,4.0,5.0,2.0,26.0,8.0,0.0,29.0,...,1.26,1.29,0.50,0.0,0.000,0.0,0.000,0.000,0.000,0.0
9,4.0,4.0,4.0,2.0,0.0,2.0,5.0,1.0,0.0,5.0,...,1.26,1.29,0.50,0.0,0.000,0.0,0.000,0.000,0.000,0.0


In [134]:
labels = hits.iloc[:, -1]
labels

0       0.0
1       0.0
2       1.0
3       0.0
4       0.0
5       0.0
6       1.0
7       0.0
8       0.0
9       0.0
10      0.0
11      1.0
12      1.0
13      1.0
14      0.0
15      1.0
16      1.0
17      0.0
18      0.0
19      1.0
20      1.0
21      1.0
22      1.0
23      0.0
24      1.0
25      1.0
26      1.0
27      1.0
28      1.0
29      0.0
       ... 
1084    0.0
1085    1.0
1086    0.0
1087    0.0
1088    0.0
1089    1.0
1090    1.0
1091    0.0
1092    0.0
1093    1.0
1094    0.0
1095    0.0
1096    1.0
1097    0.0
1098    0.0
1099    1.0
1100    1.0
1101    1.0
1102    1.0
1103    0.0
1104    0.0
1105    0.0
1106    1.0
1107    0.0
1108    1.0
1109    1.0
1110    1.0
1111    1.0
1112    1.0
1113    0.0
Name: player_got_hit, Length: 1114, dtype: float64

In [135]:
# data_train, data_test, labels_train, labels_test = train_test_split(data, labels, test_size=0.2)
data_train, data_val, labels_train, labels_val = train_test_split(data, labels, test_size=0.2)

In [136]:
np.mean(labels_train == 0)

0.54545454545454541

In [137]:
np.mean(labels_train == 1)

0.45454545454545453

In [138]:
logreg = LogisticRegression(penalty='l1').fit(data_train, labels_train)

In [139]:
np.mean(logreg.predict(data_train) == labels_train)

0.78787878787878785

In [140]:
np.mean(logreg.predict(data_val) == labels_val)

0.78026905829596416

In [141]:
hits.take(data_val.take(np.argsort(logreg.predict_proba(data_val)[:, 1])[::-1][:20]).index)['Name']

1093       David Fletcher
1006        Carlos Correa
376      Charlie Blackmon
488          Keston Hiura
574           Javier Baez
338          Niko Goodrum
1102    Anthony Santander
828          Mallex Smith
445        Miguel Cabrera
1106       Hanser Alberto
965          Kevin Newman
437         Brandon Dixon
108            Adam Jones
561        Donovan Solano
658       Freddie Freeman
781         Rafael Devers
382         Nolan Arenado
28        Cesar Hernandez
569     Albert Almora Jr.
887       Freddie Freeman
Name: Name, dtype: object

In [187]:
hits_test = pd.read_csv("player_stats_{}.csv".format(today))
hits_test.head()

Unnamed: 0,Name,ID,Team,gamesPlayed,groundOuts,runs,doubles,triples,homeRuns,strikeOuts,...,walksPer9Inn_p5G,whip_p5G,winPercentage_p5G,atBats_h2h,avg_h2h,hits_h2h,obp_h2h,ops_h2h,slg_h2h,pitcher_hitter_opposite_hand
0,Adam Haseley,656514,143,16.0,20.0,8.0,3.0,0.0,3.0,10.0,...,1.69,2.02,0.25,3.0,0.333,1.0,0.333,1.666,1.333,1.0
1,Andrew Knapp,595284,143,49.0,20.0,8.0,3.0,0.0,1.0,31.0,...,1.69,2.02,0.25,2.0,0.0,0.0,0.0,0.0,0.0,1.0
2,Bryce Harper,547180,143,104.0,79.0,59.0,30.0,0.0,18.0,125.0,...,1.69,2.02,0.25,12.0,0.417,5.0,0.556,1.223,0.667,1.0
3,Cesar Hernandez,514917,143,104.0,123.0,47.0,21.0,3.0,7.0,58.0,...,1.69,2.02,0.25,11.0,0.455,5.0,0.538,1.265,0.727,1.0
4,J.T. Realmuto,592663,143,99.0,83.0,61.0,20.0,1.0,14.0,92.0,...,1.69,2.02,0.25,9.0,0.111,1.0,0.111,0.555,0.444,0.0


In [172]:
data_test = hits_test.iloc[:, 3:]

In [173]:
hits_test.take(np.argsort(logreg.predict_proba(data_test)[:, 1])[::-1][:15])['Name']

335      Mike Yastrzemski
330      Brandon Crawford
317    Fernando Tatis Jr.
351       Xander Bogaerts
218          Yuli Gurriel
217        Yordan Alvarez
79          Josh VanMeter
84        Tucker Barnhart
312        Miguel Cabrera
61             Alex Avila
89             David Dahl
222         Cam Gallagher
94           Ryan McMahon
101            J.D. Davis
114            Elias Diaz
Name: Name, dtype: object

## Gradient boosted decision trees??

In [174]:
from sklearn.ensemble import AdaBoostClassifier

In [175]:
boosted_dt = AdaBoostClassifier()
boosted_dt.fit(data_train, labels_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

In [176]:
np.mean(boosted_dt.predict(data_val) == labels_val)

0.7488789237668162

In [177]:
hits_test.take(np.argsort(boosted_dt.predict_proba(data_test)[:, 1])[::-1][:10])['Name']

226         Jorge Soler
79        Josh VanMeter
230     Whit Merrifield
126       Gerardo Parra
334        Kevin Pillar
249       Marcus Semien
253      Ramon Laureano
82        Phillip Ervin
330    Brandon Crawford
209     George Springer
Name: Name, dtype: object

## Random forests...

In [178]:
from sklearn.ensemble import RandomForestClassifier

In [202]:
# Tuning hyperparameters for random forests
# Note that cross-validation is sort of "automatically" done because 
# sklearn's implementation of random forests uses bagging for each 
# estimator -- more details here 
# https://scikit-learn.org/stable/modules/grid_search.html#out-of-bag-estimates

from sklearn.model_selection import GridSearchCV

print(rf_classifier.get_params())
param_grid = [
    {'criterion': ['gini'], 'max_depth': [20], 'min_samples_leaf': [4, 10, 20, 30],
    'n_estimators': [50, 75, 100, 125, 150]}
]

{'bootstrap': True, 'class_weight': None, 'criterion': 'gini', 'max_depth': 20, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_split': 1e-07, 'min_samples_leaf': 10, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 150, 'n_jobs': 1, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [203]:
rf_cv = GridSearchCV(rf_classifier, param_grid, cv=4)
rf_cv.fit(data_train, labels_train)

GridSearchCV(cv=4, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=10,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=150, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'criterion': ['gini'], 'max_depth': [20], 'min_samples_leaf': [4, 10, 20, 30], 'n_estimators': [50, 75, 100, 125, 150]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [204]:
rf_cv.best_params_

{'criterion': 'gini',
 'max_depth': 20,
 'min_samples_leaf': 10,
 'n_estimators': 100}

The best parameters seem to be a Gini score criterion, a max depth per tree of 20, a minimum of 10 points per leaf node, and 100 estimators.

In [205]:
rf_classifier = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=20, min_samples_leaf=10)
rf_classifier.fit(data_train, labels_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=10,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [206]:
np.mean(rf_classifier.predict(data_val) == labels_val)

0.78923766816143492

In [226]:
statsapi.lookup_team(117)[0]['name']

'Houston Astros'

In [234]:
predictions = hits_test.take(np.argsort(rf_classifier.predict_proba(data_test)[:, 1])[::-1][:10])[['Name', 'Team']].reset_index().iloc[:, 1:]
predictions.columns = ["name", "team_id"]
predictions["team_name"] = predictions['team_id'].apply(lambda x: statsapi.lookup_team(x)[0]['name'])
predictions["hit_probability"] = np.sort(rf_classifier.predict_proba(data_test)[:, 1])[::-1][:10]
predictions.to_csv("predictions_{}.csv".format(today))

## K-NN, because why not

In [150]:
from sklearn.neighbors import KNeighborsClassifier

In [155]:
# TODO: tune hyperparameters
knn_clf = KNeighborsClassifier(n_neighbors=11, weights='distance')
knn_clf.fit(data_train, labels_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=11, p=2,
           weights='distance')

In [156]:
np.mean(knn_clf.predict(data_val) == labels_val)

0.5964125560538116

In [157]:
hits_test.take(np.argsort(knn_clf.predict_proba(data_test)[:, 1])[::-1][:10])['Name']

86     Christin Stewart
28        J.D. Martinez
16          Joey Wendle
45       Gleyber Torres
18     Michael Brosseau
43    Edwin Encarnacion
36      Xander Bogaerts
89         JaCoby Jones
22           Tommy Pham
23      Travis d'Arnaud
Name: Name, dtype: object

## Now the fun begins

In [158]:
num_rows = len(data)
num_feats = len(np.transpose(data))
print(num_rows)
print(num_feats)

1114
61


### Convert our arrays to torch tensors

In [159]:
data_train_t = torch.from_numpy(np.array(data_train)).type(torch.FloatTensor)
data_val_t = torch.from_numpy(np.array(data_val)).type(torch.FloatTensor)
data_test_t = torch.from_numpy(np.array(data_test)).type(torch.FloatTensor)

labels_train_t = torch.from_numpy(np.array(labels_train)).type(torch.LongTensor)
labels_val_t = torch.from_numpy(np.array(labels_val)).type(torch.LongTensor)
# labels_test_t = torch.from_numpy(np.array(labels_test)).type(torch.LongTensor)

### Then begin training

In [208]:
class NeuralNet(nn.Module):
    def __init__(self):
        super(NeuralNet, self).__init__()
        self.classify = nn.Sequential(
            nn.Linear(num_feats, 64),
            nn.ReLU(inplace=True),
            nn.Linear(64, 32),
            nn.ReLU(inplace=True),
            nn.Linear(32, 2)
        )
        self.fc1 = nn.Linear(num_feats, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 2)
        
    def forward(self, x):
        x = self.classify(x)
        return x
    
    def predict(self, x):
        predictions = F.softmax(self.forward(x), dim=1)
        return predictions
        

In [209]:
model = NeuralNet()

In [210]:
# Hyperparameters
learning_rate = .001
reg_param = .001
epochs = 250

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=reg_param)

In [211]:
losses = []
for i in range(epochs):
    outputs = model.forward(data_train_t)
    loss = criterion(outputs, labels_train_t)
    losses.append(loss.item())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [212]:
losses

[3.5962157249450684,
 0.6674350500106812,
 2.5586018562316895,
 2.1032216548919678,
 0.8293870091438293,
 1.2489771842956543,
 1.7745410203933716,
 1.5320225954055786,
 0.8739541172981262,
 0.8007000684738159,
 1.2775129079818726,
 1.259511947631836,
 0.844200611114502,
 0.6990857124328613,
 0.9853190779685974,
 1.081223487854004,
 0.8883968591690063,
 0.6712538003921509,
 0.7855293154716492,
 0.9365317821502686,
 0.8383257985115051,
 0.6693366765975952,
 0.7221936583518982,
 0.833164632320404,
 0.7924787402153015,
 0.6749505996704102,
 0.6809592843055725,
 0.7673376202583313,
 0.7452974915504456,
 0.6600557565689087,
 0.6670337915420532,
 0.7115239500999451,
 0.6702172160148621,
 0.6572307348251343,
 0.668310821056366,
 0.6366018056869507,
 0.6609506607055664,
 0.6470149755477905,
 0.6287491321563721,
 0.643936038017273,
 0.6344396471977234,
 0.6283947229385376,
 0.6317319273948669,
 0.6283726096153259,
 0.624365508556366,
 0.625474214553833,
 0.6244587898254395,
 0.6204333305358887,


In [213]:
# Training accuracy

_, predicted = torch.max(outputs, 1)
np.mean(predicted.numpy() == labels_train)

0.79797979797979801

In [214]:
with torch.no_grad():
    val_outputs = model.forward(data_val_t)
    _, predicted_val = torch.max(val_outputs.data, 1)

In [215]:
# Validation accuracy

np.mean(predicted_val.numpy() == labels_val)

0.71300448430493268

In [216]:
highest_probs_indices_val = np.argsort(F.softmax(val_outputs)[:, 1].numpy())[-25:][::-1]

  """Entry point for launching an IPython kernel.


In [217]:
hits.take(np.array(data_val.take(highest_probs_indices_val).index))['Name']

1093       David Fletcher
561        Donovan Solano
488          Keston Hiura
382         Nolan Arenado
887       Freddie Freeman
41        Freddie Freeman
584       Corey Dickerson
1072        Marcus Semien
828          Mallex Smith
1006        Carlos Correa
173          Keston Hiura
1102    Anthony Santander
781         Rafael Devers
338          Niko Goodrum
445        Miguel Cabrera
664      Ronald Acuna Jr.
499            Joey Votto
93             Tommy Pham
1082         Hunter Pence
376      Charlie Blackmon
1042          Eric Hosmer
437         Brandon Dixon
28        Cesar Hernandez
3              Joey Votto
650         Manny Machado
Name: Name, dtype: object

In [218]:
# On test data
with torch.no_grad():
    test_outputs = model.forward(data_test_t)
    _, predicted_test = torch.max(test_outputs.data, 1)

In [219]:
highest_probs_indices_test = np.argsort(F.softmax(test_outputs)[:, 1].numpy())[-25:][::-1]

  """Entry point for launching an IPython kernel.


In [220]:
hits_test.take(np.array(data_test.take(highest_probs_indices_test).index))['Name']

23         Tyler Flowers
41            Matt Duffy
72         Yasmany Tomas
28         Freddy Galvis
142            Max Muncy
145            Ben Gamel
303         Bobby Wilson
12           Adam Duvall
149         Keston Hiura
280         Chance Sisco
81           Nick Senzel
221       Bubba Starling
79         Josh VanMeter
137         Corey Seager
22      Ronald Acuna Jr.
337         Stephen Vogt
2           Bryce Harper
25         Brandon Drury
63      Christian Walker
322          Ian Kinsler
269        Brian Goodwin
351      Xander Bogaerts
33     Teoscar Hernandez
47          Willy Adames
65       Eduardo Escobar
Name: Name, dtype: object