In [None]:
import pandas as pd
import numpy as np
import tabulate
import matplotlib.pyplot as plt
import re
import sklearn.preprocessing
import joblib

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics as skm
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
 
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
    
# Do not cap the number of columns pandas shows when displaying a DataFrame.
pd.options.display.max_columns = None

In [None]:
# load the data
# One CSV per table, read in the same order as the names they are bound to.

csv_paths = (
    "data/pitches.csv",
    "data/atbats.csv",
    "data/ejections.csv",
    "data/games.csv",
    "data/player_names.csv",
)
pitches, atbats, ejections, games, player_names = map(pd.read_csv, csv_paths)

In [None]:
# decode labels for outcomes and pitch types
#
# Each lookup is written as code -> description pairs so a code can never
# drift out of step with its description (the risk with two parallel lists).
# The resulting DataFrames keep the same column names and row order as before.

# outcome codes
OUTCOME_DESCRIPTIONS = {
    'B': 'Ball',
    '*B': 'Ball in Dirt',
    'S': 'Swinging Strike',
    'C': 'Called Strike',
    'F': 'Foul',
    'T': 'Foul Tip',
    'L': 'Foul Bunt',
    'I': 'Intentional Ball',
    'W': 'Swinging Strike (Blocked)',
    'M': 'Missed Bunt',
    'P': 'Pitchout',
    'Q': 'Swinging Pitchout',
    'R': 'Foul Pitchout',
    'X': 'In Play, Out(s)',
    'D': 'In Play, No Outs',
    'E': 'In Play, Runs',
    'H': 'Hit by pitch',
    # No description available for these codes; the code doubles as its label.
    'V': 'V',
    'Z': 'Z',
}
outcome_code = pd.DataFrame({
    'outcome_code': list(OUTCOME_DESCRIPTIONS.keys()),
    'outcome_description': list(OUTCOME_DESCRIPTIONS.values()),
})

# pitch type codes
PITCH_DESCRIPTIONS = {
    'CH': 'Changeup',
    'CU': 'Curveball',
    'EP': 'Eephus',
    'FC': 'Cutter',
    'FF': 'Four-seam Fastball',
    'FO': 'Pitchout',
    'FS': 'Splitter',
    'FT': 'Two-seam Fastball',
    'IN': 'Intentional ball',
    'KC': 'Knuckle curve',
    'KN': 'Knuckleball',
    'PO': 'Pitchout',
    'SC': 'Screwball',
    'SI': 'Sinker',
    'SL': 'Slider',
    'UN': 'Unknown',
    # No description available for these codes; the code doubles as its label.
    'FA': 'FA',
    'AB': 'AB',
}
pitch_code = pd.DataFrame({
    'pitch_code': list(PITCH_DESCRIPTIONS.keys()),
    'pitch_description': list(PITCH_DESCRIPTIONS.values()),
})

In [None]:
# pitch outcomes, probabilities
# Count how often each outcome code occurs, attach its description, and turn
# the counts into relative frequencies.
pitch_outcome = pitches['code'].value_counts().to_frame('outcome_count')

outcome_df = pitch_outcome.merge(outcome_code, how='left',
                                 left_index=True, right_on='outcome_code')
outcome_total = outcome_df['outcome_count'].sum()
outcome_df['outcome_prob'] = outcome_df['outcome_count'] / outcome_total
outcome_df

In [None]:
# bar chart of pitch outcomes
# One horizontal bar per outcome, labelled with its description.
objects = outcome_df['outcome_description']
performance = outcome_df['outcome_count']
y_pos = np.arange(len(objects))

plt.barh(y_pos, performance, align='center', alpha=0.5)
plt.yticks(y_pos, objects)
plt.title('MLB 2015-2018')
plt.xlabel('Pitches')

plt.show()

In [None]:
# pitch types, probabilities
# Count how often each pitch type occurs, attach its description, and turn
# the counts into relative frequencies.
pitch_type = pitches['pitch_type'].value_counts().to_frame('pitch_type_count')

pitch_df = pitch_type.merge(pitch_code, how='left',
                            left_index=True, right_on='pitch_code')
pitch_type_total = pitch_df['pitch_type_count'].sum()
pitch_df['pitch_prob'] = pitch_df['pitch_type_count'] / pitch_type_total
pitch_df


In [None]:
# bar chart of pitch types
# One horizontal bar per pitch type, labelled with its description.
objects = pitch_df['pitch_description']
performance = pitch_df['pitch_type_count']
y_pos = np.arange(len(objects))

plt.barh(y_pos, performance, align='center', alpha=0.5)
plt.yticks(y_pos, objects)
plt.title('MLB 2015-2018')
plt.xlabel('Pitches')

plt.show()

In [None]:
# join all the data into all_df
#
# Every merge is a left join, so no pitch row is dropped when a lookup table
# has no matching key (the added columns are NaN for that row instead).
all_df = pd.merge(pitches, atbats, how='left', on='ab_id')

# player_names is joined twice, once for the pitcher and once for the batter,
# under different column names.  Rename *copies*: assigning `.columns` on a
# plain alias would rename player_names itself and every other alias of it.
pitcher_df = player_names.copy()
pitcher_df.columns = ['pitcher_id', 'pitcher_first_name', 'pitcher_last_name']

all_df = pd.merge(all_df, pitcher_df, how='left', on='pitcher_id')

batter_df = player_names.copy()
batter_df.columns = ['batter_id', 'batter_first_name', 'batter_last_name']

all_df = pd.merge(all_df, batter_df, how='left', on='batter_id')

all_df = pd.merge(all_df, games, how='left', on='g_id')

# Attach the human-readable descriptions for the pitch type and outcome codes.
all_df = pd.merge(all_df, pitch_code, how='left',
                  left_on='pitch_type', right_on='pitch_code')

all_df = pd.merge(all_df, outcome_code, how='left',
                  left_on='code', right_on='outcome_code')

all_df.head(25)


#all_df.shape[0]

In [None]:
# pick out the columns that are known before a pitch takes place
# (plus pitch_code / pitch_description, which identify the pitch thrown and
# are used further down as the prediction target)

# 'batter_id' used to be listed twice, which produced two identically named
# columns; it is listed once here.
pred_columns = ['pitch_code', 'pitch_description', 'b_score', 'b_count', 's_count',
                'pitch_num', 'on_1b', 'on_2b', 'on_3b', 'batter_id', 'inning', 'o',
                'p_score', 'p_throws', 'stand', 'top', 'pitcher_id',
                'attendance', 'away_team', 'home_team', 'umpire_HP',
                'venue_name', 'weather', 'wind', 'delay']

# .copy() makes pred_df an independent frame, so the columns added to it in
# later cells do not trigger pandas' SettingWithCopyWarning.
pred_df = all_df[pred_columns].copy()
pred_df.head(5)

In [None]:
# number of rows in the raw pitches table
len(pitches)

In [None]:
# extract weather conditions
#
# Work on an explicit copy so the assignments below never write into a slice
# of all_df (which raises SettingWithCopyWarning).
pred_df = pred_df.copy()

# expand=False makes str.extract return a Series for the single capture group.
# r'(\d+)' captures the first run of digits in the string.
pred_df['temp'] = pd.to_numeric(
    pred_df['weather'].str.extract(r'(\d+)', expand=False))  # temperature
pred_df['wind_mph'] = pd.to_numeric(
    pred_df['wind'].str.extract(r'(\d+)', expand=False))  # wind speed

# r'([^,]*$)' captures everything after the last comma (the whole string if
# there is no comma).  Any whitespace that follows the comma is kept as-is.
pred_df['weather_cond'] = pred_df['weather'].str.extract(r'([^,]*$)', expand=False)  # weather condition
pred_df['wind_dir'] = pred_df['wind'].str.extract(r'([^,]*$)', expand=False)  # wind direction

pred_df.head(5)

In [None]:
# one-hot encode all the dummy variables
# Each listed column becomes a block of indicator columns prefixed with the
# column's own name; the blocks are appended to pred_df in the listed order.

dummy_columns = ['pitch_num', 'p_throws', 'stand', 'away_team', 'home_team',
                 'umpire_HP', 'venue_name', 'weather_cond', 'wind_dir']

dummy_frames = [pd.get_dummies(pred_df[column], prefix=column)
                for column in dummy_columns]

encoded_df = pd.concat([pred_df, *dummy_frames], axis=1)
encoded_df.head(5)

In [None]:
# factorize the dependent variable pitch code
# pitch_code_num holds the integer code of each row, pitch_code_def the
# distinct pitch codes in the order the integers were assigned.
# NOTE(review): pd.factorize assigns -1 to missing values by default, so rows
# without a pitch_code presumably end up with pitch_code_num == -1 - confirm.

pitch_code_num, pitch_code_def = pd.factorize(encoded_df['pitch_code'])
encoded_df['pitch_code_num'] = pitch_code_num
print(pitch_code_def)

In [None]:
# remove categorical variables
# These columns are either already one-hot encoded above, replaced by a
# derived column (weather / wind), or identifiers.

columns_to_drop = [
    'pitch_code', 'pitch_description', 'pitch_num', 'p_throws', 'stand',
    'away_team', 'home_team', 'umpire_HP', 'venue_name', 'weather_cond',
    'wind_dir', 'batter_id', 'pitcher_id', 'wind', 'weather',
]
encoded_df = encoded_df.drop(columns=columns_to_drop)

In [None]:
# specify subset of records to use
# encoded_df = encoded_df[ :100000]

In [None]:
# Print pandas' summary of encoded_df (DataFrame.info) before rows with
# missing values are dropped in the next cell.
encoded_df.info()

In [None]:
# Keep only the rows that have no missing value in any remaining column.
encoded_df = encoded_df.dropna()

In [None]:
# List the dtype of every column in encoded_df.
encoded_df.dtypes

In [None]:
# Separate the features from the target by name, not by column position, so
# the selection stays right even if the column order changes.
features = encoded_df.drop(columns='pitch_code_num')
y = encoded_df['pitch_code_num'].values

# Split BEFORE scaling.  Standardising the full dataset first would let the
# test rows influence the mean/variance used on the training rows (leakage).
X_train, X_test, y_train, y_test = train_test_split(
    features, y, test_size=0.2, random_state=0)

# Fit the scaler on the training split only and reuse its statistics for the
# test split.  Both results are plain NumPy arrays.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# logistic regression - disabled: it failed to converge even after many hours

# mul_lr = linear_model.LogisticRegression(multi_class='multinomial', solver='saga').fit(X_train, y_train)

# print('Multinomial Logistic regression Train Accuracy')
# skm.accuracy_score(y_train, mul_lr.predict(X_train))

# print('Multinomial Logistic regression Test Accuracy')
# skm.accuracy_score(y_test, mul_lr.predict(X_test))

In [None]:
# random forest model
# https://www.codementor.io/agarrahul01/multiclass-classification-using-random-forest-on-scikit-learn-library-hkk4lwawu

# n_jobs=-1 builds the trees in parallel on all available cores; random_state
# still fixes the seed used for the forest.
classifier = RandomForestClassifier(n_estimators=100, criterion='entropy',
                                    random_state=42, n_jobs=-1)
classifier.fit(X_train, y_train)

In [None]:
# Predicting the Test set results

y_pred = classifier.predict(X_test)

# reverse factorization
# The lookup is built from pitch_code_def itself rather than a hard-coded
# class count.  An extra 'missing' entry is appended for codes that have no
# label: pd.factorize uses -1 for missing values by default.
pitch_labels = np.append(np.asarray(pitch_code_def, dtype=str), 'missing')


def decode_pitch_codes(codes):
    """Map integer pitch codes back to their pitch-code strings.

    Codes outside ``0 .. len(pitch_code_def) - 1`` (including the -1
    sentinel) are decoded as ``'missing'``.
    """
    codes = np.asarray(codes)
    missing_slot = len(pitch_labels) - 1
    known = (codes >= 0) & (codes < missing_slot)
    return pitch_labels[np.where(known, codes, missing_slot)]


# NOTE: y_test is overwritten with string labels because the classification
# report cell further down reads y_test / y_pred.  Re-run the train/test split
# cell before re-running this one, otherwise y_test is no longer numeric.
y_test = decode_pitch_codes(y_test)
y_pred = decode_pitch_codes(y_pred)

# Making the Confusion Matrix
cm = pd.crosstab(y_test, y_pred, rownames=['Actual Pitch'], colnames=['Predicted Pitch'])
cm

In [None]:
# Rank the input features by the importance the fitted forest assigns them.
# The feature names are every encoded_df column except the target, in the
# same order the feature matrix was built from.
feature_names = encoded_df.columns.drop('pitch_code_num')
importances = classifier.feature_importances_

# Guard against pairing names with the wrong importances if encoded_df was
# changed after the model was trained.
assert len(feature_names) == len(importances), (
    "encoded_df no longer matches the features the classifier was trained on")

fi = (
    pd.DataFrame({'feature': feature_names, 'importance': importances})
    .sort_values('importance', ascending=False)
)
fi.iloc[:20]
#fi.to_csv('importance.csv')

In [None]:
# Per-class precision / recall / F1 of the random forest on the test split.
report = skm.classification_report(y_test, y_pred)
print(report)

In [None]:
# Display the encoded frame that the feature matrix and target are built from.
encoded_df

Multi-layer perceptron (MLP)

In [None]:
# https://machinelearningmastery.com/multi-class-classification-tutorial-keras-deep-learning-library/

# Separate the features from the target by name, not by column position.
features = encoded_df.drop(columns='pitch_code_num')
y = encoded_df['pitch_code_num'].values

# encode class values as integers
# LabelEncoder re-indexes the classes to 0..n_classes-1 in sorted order, so a
# -1 "missing" code (if present) becomes class 0; encoder.classes_ keeps the
# mapping back to the original codes.
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(y)

# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)

# Split BEFORE scaling so the test rows do not influence the statistics used
# to standardise the training rows (data leakage).
X_train, X_test, y_train, y_test = train_test_split(
    features, dummy_y, test_size=0.2, random_state=0)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Take the layer sizes from the data instead of hard-coding them, so the
# network still fits when the set of dummy columns or pitch classes changes.
n_features = X_train.shape[1]
n_classes = dummy_y.shape[1]

model = Sequential()
model.add(Dense(96, input_dim=n_features, activation='relu'))
model.add(Dense(96, activation='relu'))  # input size is inferred after the first layer
model.add(Dense(n_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# NOTE(review): the test split doubles as the validation set here, so the
# reported validation accuracy is not an independent estimate.
history = model.fit(X_train,
                    y_train,
                    epochs=50,
                    batch_size=512,
                    validation_data=(X_test, y_test),
                    verbose=True)


In [None]:
# Training history as a table: one column per recorded metric plus the epoch
# index; show the last few epochs.
hist = pd.DataFrame(history.history).assign(epoch=history.epoch)
hist.tail()

In [None]:
# history.history maps each recorded metric name to its values (presumably one
# per epoch); list which metric names are available.
history_dict = history.history
history_dict.keys()

In [None]:
# Training vs. validation accuracy per epoch for the MLP.
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
# The loss curves are extracted alongside the accuracies but not plotted here.
loss = history.history['loss']
val_loss = history.history['val_loss']

# Epochs are numbered from 1 on the x-axis.
epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

# End with plt.show(), like the other chart cells, so the cell renders the
# figure instead of echoing the Legend object's repr.
plt.show()