In [None]:
from automatic_pitch import model, movement, plotting

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

from imblearn.under_sampling import ClusterCentroids
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Model Comparison
Previously, I explored how oversampling affects the classification of baseball pitches.  Now I'm going to take a deeper dive into comparing models to see which one is more effective.

In [None]:
# Date window of the pitches to load; both bounds are handed unchanged to `model.get_pitches`.
start, end = '2020-09-01', '2020-09-10'
data = model.get_pitches(start, end)

In [None]:
# Columns retained for the analysis: the `pitch_type` label plus the pitch
# measurements that are used later on as features.
columns_keep = [
    'pitch_type',
    'release_speed', 'p_throws',
    'release_pos_x', 'release_pos_z',
    'pfx_x', 'pfx_z',
    'plate_x', 'plate_z', 'zone',
    'vx0', 'vy0', 'vz0',
    'ax', 'ay', 'az',
    'release_spin_rate', 'release_extension', 'release_pos_y',
]
columns = list(data.columns)

# Drop columns not in `columns_keep`.
# NOTE(review): presumably `model.remove_columns` returns the names in `columns`
# that are absent from `columns_keep` -- confirm against its implementation.
col_to_drop = model.remove_columns(columns, columns_keep)
data = data.drop(columns=col_to_drop)

In [None]:
# Add movement in the x and z directions, calculated from the release position,
# initial velocity and acceleration components of every pitch.
p_o = np.array([data[name] for name in ('release_pos_x', 'release_pos_y', 'release_pos_z')])
v_o = np.array([data[name] for name in ('vx0', 'vy0', 'vz0')])
a = np.array([data[name] for name in ('ax', 'ay', 'az')])
dx, dz, dzg = movement.calc_movement(p_o, v_o, a)  # `dzg` is not used in this cell
data['movement_x'], data['movement_z'] = dx, dz

# Remove data points that are missing either `pitch_type` or `release_extension`
required_columns = ['pitch_type', "release_extension"]
data = data.dropna(subset=required_columns)
pitch_types = list(data['pitch_type'].values)

# Create the dependent-variable classes: `pitch_type` becomes integer codes and
# `definitions` keeps the original label for each code.
factor = pd.factorize(data['pitch_type'])
data['pitch_type'], definitions = factor
print(set(data['pitch_type']))
print(definitions)

In [None]:
# Distribution of the gap between `pfx_z` and the calculated `movement_z`
# for left-handed pitchers (`p_throws == 'L'`).
left_handed = data.loc[data["p_throws"] == 'L']  # filter once, reuse for both columns
pfx_minus_movement_z = left_handed["pfx_z"] - left_handed["movement_z"]

fig, ax = plt.subplots()
ax.hist(pfx_minus_movement_z, alpha=0.5, color='k', bins=50)
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set_title("pfx_z - movement_z (left-handed pitchers)")
ax.set_xlabel("pfx_z - movement_z")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Compare the calculated `movement_x` with `pfx_x` for right-handed pitchers,
# one panel per pitch type.  The grid is sized from `definitions`, so every
# pitch type gets a panel no matter how many the loaded date range contains
# (a fixed 3x3 grid fails with fewer than nine types and hides any beyond nine).
n_cols = 3
n_rows = max(1, int(np.ceil(len(definitions) / n_cols)))
base_width, base_height = plt.rcParams["figure.figsize"]
# Keep the default figure size for up to three rows and grow the height for extra rows.
fig = plt.figure(figsize=(base_width, base_height * max(n_rows, 3) / 3))
right_handed = data["p_throws"] == 'R'
for code, pitch_name in enumerate(definitions):
    # `code` is the integer stored in `pitch_type`; `pitch_name` is its original label.
    ax = fig.add_subplot(n_rows, n_cols, code + 1)
    ax.set_title(pitch_name)
    in_panel = (data["pitch_type"] == code) & right_handed
    ax.hist(data.loc[in_panel, "movement_x"], alpha=0.5, bins=50, label="movement_x")
    ax.hist(data.loc[in_panel, "pfx_x"], alpha=0.5, bins=50, label="pfx_x")
    ax.set_xlim((-2, 2))
    if code == 0:
        # One legend is enough: every panel draws the two series in the same order.
        ax.legend(fontsize="x-small")
# `tight_layout` sets the subplot spacing itself, so no manual `subplots_adjust` is needed.
plt.tight_layout()

In [None]:
# Compare the calculated `movement_x` with `pfx_x` for left-handed pitchers,
# one panel per pitch type.  The grid is sized from `definitions`, so every
# pitch type gets a panel no matter how many the loaded date range contains
# (a fixed 3x3 grid fails with fewer than nine types and hides any beyond nine).
n_cols = 3
n_rows = max(1, int(np.ceil(len(definitions) / n_cols)))
base_width, base_height = plt.rcParams["figure.figsize"]
# Keep the default figure size for up to three rows and grow the height for extra rows.
fig = plt.figure(figsize=(base_width, base_height * max(n_rows, 3) / 3))
left_handed = data["p_throws"] == 'L'
for code, pitch_name in enumerate(definitions):
    # `code` is the integer stored in `pitch_type`; `pitch_name` is its original label.
    ax = fig.add_subplot(n_rows, n_cols, code + 1)
    ax.set_title(pitch_name)
    in_panel = (data["pitch_type"] == code) & left_handed
    ax.hist(data.loc[in_panel, "movement_x"], alpha=0.5, bins=50, label="movement_x")
    ax.hist(data.loc[in_panel, "pfx_x"], alpha=0.5, bins=50, label="pfx_x")
    ax.set_xlim((-2, 2))
    if code == 0:
        # One legend is enough: every panel draws the two series in the same order.
        ax.legend(fontsize="x-small")
# `tight_layout` sets the subplot spacing itself, so no manual `subplots_adjust` is needed.
plt.tight_layout()

In [None]:
# Compare the calculated `movement_z` with `pfx_z` for all pitchers, one panel
# per pitch type.  The grid is sized from `definitions`, so every pitch type
# gets a panel no matter how many the loaded date range contains (a fixed 3x3
# grid fails with fewer than nine types and hides any beyond nine).
n_cols = 3
n_rows = max(1, int(np.ceil(len(definitions) / n_cols)))
base_width, base_height = plt.rcParams["figure.figsize"]
# Keep the default figure size for up to three rows and grow the height for extra rows.
fig = plt.figure(figsize=(base_width, base_height * max(n_rows, 3) / 3))
for code, pitch_name in enumerate(definitions):
    # `code` is the integer stored in `pitch_type`; `pitch_name` is its original label.
    ax = fig.add_subplot(n_rows, n_cols, code + 1)
    ax.set_title(pitch_name)
    in_panel = data["pitch_type"] == code
    ax.hist(data.loc[in_panel, "movement_z"], alpha=0.5, bins=50, label="movement_z")
    ax.hist(data.loc[in_panel, "pfx_z"], alpha=0.5, bins=50, label="pfx_z")
    ax.set_xlim((-2, 2))
    if code == 0:
        # One legend is enough: every panel draws the two series in the same order.
        ax.legend(fontsize="x-small")
# `tight_layout` sets the subplot spacing itself, so no manual `subplots_adjust` is needed.
plt.tight_layout()

**TODO:** I need to better understand the shifts between the `pfx_x`/`pfx_z` values and the movements I calculated (`movement_x`/`movement_z`).

In [None]:
# Show the `pfx_z` and calculated `movement_z` columns together as the cell output
tuple(data[column] for column in ("pfx_z", "movement_z"))

In [None]:
# Only `pitch_type` and `release_extension` were filtered for missing values
# earlier, and scikit-learn's LogisticRegression rejects NaN inputs.  Work on a
# fully populated frame under a new name so `data` itself is left untouched.
model_data = data.dropna()

# Split the data into the features (X) and the label to predict (y)
y = model_data['pitch_type']
X = model_data.drop(columns='pitch_type')

# `p_throws` holds the strings 'L'/'R' (the plotting cells filter on them), which
# the estimator cannot convert to numbers.  One-hot encode every non-numeric
# column before splitting so train and test share the same feature columns;
# a frame that is already fully numeric passes through unchanged.
X = pd.get_dummies(X, drop_first=True, dtype=float)

# Now use `train_test_split` in scikit-learn to split the data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=0)

# Balance the training data using imblearn; the test split is left as drawn
X_resampled, y_resampled = RandomOverSampler(
    sampling_strategy='not majority', random_state=123
).fit_resample(X_train, y_train)

## Logistic Regression

In [None]:
# Fit on the oversampled training set (`fit` returns the estimator itself),
# then predict labels for the test split.
logreg = LogisticRegression(max_iter=1000).fit(X_resampled, y_resampled)
logreg_predictions = logreg.predict(X_test)

In [None]:
# Score the fitted logistic regression on the test split and print the result
score = logreg.score(X=X_test, y=y_test)
print(score)

In [None]:
# Reverse factorize: map the integer codes back to the original pitch-type labels
reversefactor = dict(zip(range(len(definitions)), definitions))
y_update = np.vectorize(reversefactor.get)(y_test)
logreg_update = np.vectorize(reversefactor.get)(logreg_predictions)

# Pin the row/column order of the matrix explicitly and hand the very same
# labels to the plot.  `data.pitch_type` holds integer codes at this point, so
# using it for the tick labels would not match the string labels (in sorted
# order) that the matrix is built from.
class_labels = np.unique(np.concatenate([y_update, logreg_update]))

# Plot confusion matrix
cnf_matrix = confusion_matrix(y_update, logreg_update, labels=class_labels)
plotting.plot_confusion_matrix(cnf_matrix, classes=class_labels.tolist(), normalize=True,
                      title='Normalized confusion matrix')

In [None]:
# Run Random Forest Model
# NOTE(review): the whole experiment below is wrapped in a triple-quoted string,
# so none of it executes -- no forest is fitted and `rf`, `rf_predictions` and
# `rf_update` are never defined.  The cell only evaluates the string itself
# (and, as the last expression in the cell, displays it).  Remove the
# surrounding quotes to re-enable the comparison against the logistic regression.
"""rf = RandomForestClassifier(n_estimators=100,
                           oob_score=True,
                           random_state=0)

rf.fit(X_resampled, y_resampled)

rf_predictions = rf.predict(X_test)

#Reverse factorize (converting y_pred from 0s,1s and 2s
reversefactor = dict(zip(range(len(definitions)),definitions))
y_update = np.vectorize(reversefactor.get)(y_test)
rf_update = np.vectorize(reversefactor.get)(rf_predictions)
# Making the Confusion Matrix
print(pd.crosstab(y_update, rf_update, rownames=['Actual Pitches'], colnames=['Predicted Pitches']))
"""