In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import ClusterCentroids
from pybaseball import statcast_pitcher
from scipy.stats import pearsonr
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

from automatic_pitch import model, movement, plotting
from automatic_pitch.utils import get_color

In [None]:
# Date window of pitches to analyse, as ISO `YYYY-MM-DD` strings.
start = '2019-08-01'
end = '2019-09-01'
# Fail fast on a reversed window instead of passing it on to the loader.
if pd.Timestamp(end) < pd.Timestamp(start):
    raise ValueError(f"end ({end}) must not be earlier than start ({start})")
data = model.get_pitches(start, end)

In [None]:
# Add movement in x and z directions.
# Taking the absolute value of the x-components is important: it is how this
# notebook puts left- and right-handed pitchers on a common footing.
# NOTE(review): |vx0| and |ax| are taken independently, which is not the same as
# mirroring a pitch (that would flip both signs together), and release_pos_x is
# left signed -- confirm this is what movement.calc_movement expects.
v_o = np.array([np.abs(data.vx0), data.vy0, data.vz0])
p_o = np.array([data.release_pos_x, data.release_pos_y, data.release_pos_z])
a = np.array([np.abs(data.ax), data.ay, data.az])
# The third return value (dzg) is not used anywhere in the visible cells.
dx, dz, dzg = movement.calc_movement(p_o, v_o, a)
data['movement_x'] = dx
data['movement_z'] = dz

# Remove data points that lack a pitch_type, release_extension or release_spin_rate.
data = data.dropna(subset=['pitch_type', "release_extension", "release_spin_rate"])
# Sorted so the per-pitch-type panels appear in the same order on every run;
# plain set iteration order for strings changes between kernel sessions.
pitch_types = sorted(set(data.pitch_type.values))

In [None]:
# Trim the frame down to the fields this notebook works with.
columns = list(data.columns)
columns_keep = [
    'pitch_type', 'pfx_z', 'game_date', 'pitcher',
    'release_speed', 'p_throws',
    'release_pos_x', 'release_pos_z',
    'movement_x', 'movement_z',
    'plate_x', 'plate_z', 'zone',
    'vx0', 'vy0', 'vz0',
    'ax', 'ay', 'az',
    'release_spin_rate', 'release_extension', 'release_pos_y',
]

# Drop columns not in `columns_keep`.
# presumably remove_columns returns the names in `columns` that are absent
# from `columns_keep` -- verify against automatic_pitch.model
col_to_drop = model.remove_columns(columns, columns_keep)
data = data.drop(columns=col_to_drop)

In [None]:
# Positions (row indices, column indices) of any remaining missing values;
# two empty arrays mean the trimmed frame has no nulls left.
np.nonzero(data.isna().to_numpy())

## Univariate Analysis

In [None]:
# Vertical-movement distribution, one panel per pitch type.
var = "movement_z"
n_cols = 3
# Size the grid from the data instead of assuming exactly nine pitch types.
n_rows = max(1, int(np.ceil(len(pitch_types) / n_cols)))
fig = plt.figure(figsize=(7, 7))
for i, pitch_type in enumerate(pitch_types, start=1):
    ax = fig.add_subplot(n_rows, n_cols, i)
    ax.set_title(pitch_type)
    ax.hist(data.loc[data["pitch_type"] == pitch_type, var], alpha=0.5, bins=50, color='k', density=True)
    # Label every panel; a trailing plt.xlabel() would only reach the last one.
    ax.set_xlabel("movement (ft)")
fig.tight_layout()

Fortunately, we have enough data and, with the exception of eephus pitches and forkballs, the distributions are approximately normal.

Here, we should mainly see the vertical movement as a result of the Magnus force.  A four-seam fastball is thrown with backspin, which should result in an upward Magnus force.  It makes sense then that the majority of four-seam fastballs have a positive z movement.  We see this similarly for changeups, though with lower movement due to a different grip.  Curveballs are thrown with topspin, so we largely see negative z movement.  Sliders mainly have a horizontal spin, so we see that the z movement is centered around 0.  We see this slightly with a splitter as well.

One thing I don't quite understand right now is the z movement of the sinker.

In [None]:
# Horizontal-movement distribution, one panel per pitch type.
var = "movement_x"
n_cols = 3
# Size the grid from the data instead of assuming exactly nine pitch types.
n_rows = max(1, int(np.ceil(len(pitch_types) / n_cols)))
fig = plt.figure(figsize=(7, 7))
for i, pitch_type in enumerate(pitch_types, start=1):
    ax = fig.add_subplot(n_rows, n_cols, i)
    ax.set_title(pitch_type)
    x_axis = data.loc[data["pitch_type"] == pitch_type, var]
    ax.hist(x_axis, alpha=0.5, bins=50, color='k', density=True)
    # Label every panel; a trailing plt.xlabel() would only reach the last one.
    ax.set_xlabel("movement (ft)")
fig.tight_layout()

Taking the absolute value of x movement correctly normalizes the data between the left- and right-handed pitchers.  The fastballs, changeups, sinkers, splitters, and two-seam fastballs have normal distributions.  Cutters, knuckle-curves, and sliders have right-skewed distributions.

In [None]:
# Horizontal-movement distribution per pitch type, split by pitcher handedness.
var = "movement_x"
hand_colors = {"R": 'k', "L": 'b'}
n_cols = 3
# Size the grid from the data instead of assuming exactly nine pitch types.
n_rows = max(1, int(np.ceil(len(pitch_types) / n_cols)))
fig = plt.figure(figsize=(7, 7))
for i, pitch_type in enumerate(pitch_types, start=1):
    ax = fig.add_subplot(n_rows, n_cols, i)
    ax.set_title(pitch_type)
    of_type = data["pitch_type"] == pitch_type
    for hand, color in hand_colors.items():
        values = data.loc[of_type & (data["p_throws"] == hand), var]
        # A density histogram of an empty sample is all-NaN; skip it instead.
        if len(values):
            ax.hist(values, alpha=0.5, bins=50, color=color, density=True, label=hand)
    ax.set_xlabel("movement (ft)")
    # One legend is enough to decode the colours shared by every panel.
    if i == 1:
        ax.legend(title="p_throws")
fig.tight_layout()

In [None]:
# Pooled vertical-movement distribution across every pitch type.
var = "movement_z"
fig, ax = plt.subplots()
movement_values = data[var]
ax.hist(movement_values, bins=100, density=True, color='k', alpha=0.5)
fig.tight_layout()
ax.set_xlabel("movement (ft)")

The bimodal distributions are due to the handedness of the pitchers.  Let's look at the absolute value of the x movements.

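For the most part, these distributions seem normal.  The cutters, knuckle-curves, and sliders are the exceptions: as noted above, their distributions are right-skewed.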

In [None]:
# Release-speed distribution, one panel per pitch type.
var = "release_speed"
n_cols = 3
# Size the grid from the data instead of assuming a fixed number of pitch types.
n_rows = max(1, int(np.ceil(len(pitch_types) / n_cols)))
fig = plt.figure(figsize=(7, 7))
for i, pitch_type in enumerate(pitch_types, start=1):
    ax = fig.add_subplot(n_rows, n_cols, i)
    ax.set_title(pitch_type)
    ax.hist(data.loc[data["pitch_type"] == pitch_type, var], alpha=0.5, bins=50, color='k', density=True)
    # Label with the plotted column; this is a speed, not a movement in feet.
    ax.set_xlabel(var)
fig.tight_layout()

In [None]:
# Spin-rate distribution, one panel per pitch type.
var = "release_spin_rate"
n_cols = 3
# Size the grid from the data instead of assuming exactly nine pitch types.
n_rows = max(1, int(np.ceil(len(pitch_types) / n_cols)))
fig = plt.figure(figsize=(7, 7))
for i, pitch_type in enumerate(pitch_types, start=1):
    ax = fig.add_subplot(n_rows, n_cols, i)
    ax.set_title(pitch_type)
    ax.hist(data.loc[data["pitch_type"] == pitch_type, var], alpha=0.5, bins=50, color='k', density=True)
    # Label with the plotted column; this is a spin rate, not a movement in feet.
    ax.set_xlabel(var)
fig.tight_layout()

In [None]:
# Distribution of the initial y-velocity component, one panel per pitch type.
var = "vy0"
n_cols = 3
# Size the grid from the data instead of assuming exactly nine pitch types.
n_rows = max(1, int(np.ceil(len(pitch_types) / n_cols)))
fig = plt.figure(figsize=(7, 7))
for i, pitch_type in enumerate(pitch_types, start=1):
    ax = fig.add_subplot(n_rows, n_cols, i)
    ax.set_title(pitch_type)
    ax.hist(data.loc[data["pitch_type"] == pitch_type, var], alpha=0.5, bins=50, color='k', density=True)
    # Label with the plotted column; this is a velocity, not a movement in feet.
    ax.set_xlabel(var)
fig.tight_layout()

Checking the starting positions as a sanity check to make sure there aren't any faulty measurements.  All of these are normal distributions, so it seems like we don't need to discard any data.

In [None]:
# This might be a good way to detect outliers:
# ratio of vertical movement to the magnitude of horizontal movement.
n_cols = 3
# Size the grid from the data instead of assuming exactly nine pitch types.
n_rows = max(1, int(np.ceil(len(pitch_types) / n_cols)))
fig = plt.figure(figsize=(7, 7))
for i, pitch_type in enumerate(pitch_types, start=1):
    ax = fig.add_subplot(n_rows, n_cols, i)
    ax.set_title(pitch_type)
    of_type = data["pitch_type"] == pitch_type
    ratio = data.loc[of_type, "movement_z"] / np.abs(data.loc[of_type, "movement_x"])
    # A zero x-movement yields +/-inf (or NaN), which hist() cannot bin.
    ratio = ratio[np.isfinite(ratio)]
    if len(ratio):
        ax.hist(ratio, alpha=0.5, bins=50, color='k', density=True)
    # The ratio is dimensionless, so it is not labelled in feet.
    ax.set_xlabel("movement_z / |movement_x|")
fig.tight_layout()


Some of these pitches have a very high ratio, part of which is due to a very low movement in the x-direction.  Might need to investigate this more.

## Bivariate Analysis

In [None]:
# Release speed against spin rate, one panel per pitch type, with Pearson r.
x_var = "release_speed"
y_var = "release_spin_rate"
n_cols = 3
# Size the grid from the data instead of assuming exactly nine pitch types.
n_rows = max(1, int(np.ceil(len(pitch_types) / n_cols)))
fig = plt.figure(figsize=(10, 10))
for i, pitch_type in enumerate(pitch_types, start=1):
    ax = fig.add_subplot(n_rows, n_cols, i)
    # Select the rows once rather than re-evaluating the mask per call.
    subset = data.loc[data["pitch_type"] == pitch_type, [x_var, y_var]]
    if len(subset) >= 2:
        corr, _ = pearsonr(subset[x_var], subset[y_var])
    else:
        # pearsonr rejects samples with fewer than two observations.
        corr = np.nan
    ax.set_title(f"{pitch_type}, pearsonr = {corr}")
    ax.scatter(subset[x_var], subset[y_var])
    ax.set_xlabel(x_var)
    ax.set_ylabel(y_var)
fig.tight_layout()

Overall, the correlation between spin rate and speed is pretty low.  

In [None]:
# Horizontal against vertical movement, one panel per pitch type, with Pearson r.
x_var = "movement_x"
y_var = "movement_z"
n_cols = 3
# Size the grid from the data instead of assuming exactly nine pitch types.
n_rows = max(1, int(np.ceil(len(pitch_types) / n_cols)))
fig = plt.figure(figsize=(10, 10))
for i, pitch_type in enumerate(pitch_types, start=1):
    ax = fig.add_subplot(n_rows, n_cols, i)
    # Select the rows once rather than re-evaluating the mask per call.
    subset = data.loc[data["pitch_type"] == pitch_type, [x_var, y_var]]
    if len(subset) >= 2:
        corr, _ = pearsonr(subset[x_var], subset[y_var])
    else:
        # pearsonr rejects samples with fewer than two observations.
        corr = np.nan
    ax.set_title(f"{pitch_type}, pearsonr = {corr}")
    ax.scatter(subset[x_var], subset[y_var])
    ax.set_xlabel(f"{x_var} (ft)")
    ax.set_ylabel(f"{y_var} (ft)")
fig.tight_layout()

There don't seem to be many anomalous outliers here.  I do notice there is a cluster of fastballs that have quite a negative z-movement.  These might need to be removed, as this isn't expected of fastballs, which should have high backspin and an upward Magnus force.

In [None]:
# Release speed against vertical movement, one panel per pitch type, with Pearson r.
x_var = "release_speed"
y_var = "movement_z"
n_cols = 3
# Size the grid from the data instead of assuming exactly nine pitch types.
n_rows = max(1, int(np.ceil(len(pitch_types) / n_cols)))
fig = plt.figure(figsize=(10, 10))
for i, pitch_type in enumerate(pitch_types, start=1):
    ax = fig.add_subplot(n_rows, n_cols, i)
    # Select the rows once rather than re-evaluating the mask per call.
    subset = data.loc[data["pitch_type"] == pitch_type, [x_var, y_var]]
    if len(subset) >= 2:
        corr, _ = pearsonr(subset[x_var], subset[y_var])
    else:
        # pearsonr rejects samples with fewer than two observations.
        corr = np.nan
    ax.set_title(f"{pitch_type}, pearsonr = {corr}")
    ax.scatter(subset[x_var], subset[y_var])
    # Only the y-axis is a movement in feet; the x-axis is the release speed.
    ax.set_xlabel(x_var)
    ax.set_ylabel(f"{y_var} (ft)")
fig.tight_layout()

In [None]:
# Isolate the cluster of fastballs with strongly negative vertical movement.
is_fastball = data["pitch_type"] == 'FF'
drops_sharply = data["movement_z"] < -0.5
data.loc[drops_sharply & is_fastball]

Interestingly, the fastballs with negative z-movement are all tied to a single player.  The player ID belongs to Tyler Rogers of the SF Giants.  Looking at this YouTube video: https://www.youtube.com/watch?v=l1CGmjOpzRA, Rogers has a unique windup and throwing position.  I wonder if this is messing with the tracker slightly.

What's even weirder is the zone that these pitches are landing in.  The zone of 13.0 indicates a pitch that is slightly inside.  However, the release point is not even a foot off the ground, and the z-movement is negative?

In [None]:
# Pull Tyler Rogers's own pitch history to check whether the negative
# z-movement fastballs are a one-off or typical of him.
# `statcast_pitcher` is imported with the other dependencies in the first cell.
ROGERS_PLAYER_ID = 643511  # pitcher ID shared by the fastballs filtered above

rogers = statcast_pitcher('2019-04-01', '2020-11-01', ROGERS_PLAYER_ID)

In [None]:
# Distribution of pfx_z over Rogers's fastballs only.
var = "pfx_z"
fig, ax = plt.subplots()
rogers_fastballs = rogers.loc[rogers["pitch_type"] == "FF", var]
ax.hist(rogers_fastballs, bins=100, density=True, color='k', alpha=0.5)
fig.tight_layout()
ax.set_xlabel("movement (ft)")

So it seems like these aren't an anomaly, so the tracker is probably working properly.  Upon thinking about this more, it might be that Rogers's release point is so low that his fastball is actually thrown with topspin.  You can see an image of this here: https://www.mccoveychronicles.com/2020/11/24/21690851/san-francisco-giants-tyler-rogers-season-review.

Normally, fastballs are thrown with an overhand motion, which causes the backspin on the ball.  It looks like Rogers's release point is so low that it's almost as if he's throwing underhand and causing topspin.  I don't want to remove this data, but it's something to keep in mind.

In [None]:
# Spin rate against the magnitude of horizontal movement, per pitch type, with Pearson r.
x_var = "release_spin_rate"
y_var = "movement_x"
n_cols = 3
# Size the grid from the data instead of assuming exactly nine pitch types.
n_rows = max(1, int(np.ceil(len(pitch_types) / n_cols)))
fig = plt.figure(figsize=(10, 10))
for i, pitch_type in enumerate(pitch_types, start=1):
    ax = fig.add_subplot(n_rows, n_cols, i)
    # Select the rows once rather than re-evaluating the mask per call.
    of_type = data["pitch_type"] == pitch_type
    x_values = data.loc[of_type, x_var]
    y_values = np.abs(data.loc[of_type, y_var])
    if len(x_values) >= 2:
        corr, _ = pearsonr(x_values, y_values)
    else:
        # pearsonr rejects samples with fewer than two observations.
        corr = np.nan
    ax.set_title(f"{pitch_type}, pearsonr = {corr}")
    ax.scatter(x_values, y_values)
    # Only the y-axis is a movement in feet; the x-axis is the spin rate.
    ax.set_xlabel(x_var)
    ax.set_ylabel(f"|{y_var}| (ft)")
fig.tight_layout()

One last thing to check: I may combine the knuckle-curves with the curveballs, and the two-seam fastballs with the sinkers.

In [None]:
# Fold knuckle-curves (KC) into curveballs (CU) and two-seam fastballs (FT)
# into sinkers (SI) by relabelling the rows in place.
# NOTE: `pitch_types` was built before this merge and still lists 'KC' and 'FT'.
merged_labels = {'KC': 'CU', 'FT': 'SI'}
for old_label, new_label in merged_labels.items():
    data.loc[data["pitch_type"] == old_label, 'pitch_type'] = new_label

In [None]:
# Horizontal-movement distribution per pitch type, after merging labels.
var = "movement_x"
# Re-derive the labels from the frame: the earlier `pitch_types` list predates
# the KC->CU / FT->SI merge and would produce empty panels for 'KC' and 'FT'.
merged_pitch_types = sorted(data["pitch_type"].dropna().unique())
n_cols = 3
n_rows = max(1, int(np.ceil(len(merged_pitch_types) / n_cols)))
fig = plt.figure(figsize=(7, 7))
for i, pitch_type in enumerate(merged_pitch_types, start=1):
    ax = fig.add_subplot(n_rows, n_cols, i)
    ax.set_title(pitch_type)
    x_axis = data.loc[data["pitch_type"] == pitch_type, var]
    ax.hist(x_axis, alpha=0.5, bins=50, color='k', density=True)
    # Label every panel; a trailing plt.xlabel() would only reach the last one.
    ax.set_xlabel("movement (ft)")
fig.tight_layout()

In [None]:
# Vertical-movement distribution per pitch type, after merging labels.
var = "movement_z"
# Re-derive the labels from the frame: the earlier `pitch_types` list predates
# the KC->CU / FT->SI merge and would produce empty panels for 'KC' and 'FT'.
merged_pitch_types = sorted(data["pitch_type"].dropna().unique())
n_cols = 3
n_rows = max(1, int(np.ceil(len(merged_pitch_types) / n_cols)))
fig = plt.figure(figsize=(7, 7))
for i, pitch_type in enumerate(merged_pitch_types, start=1):
    ax = fig.add_subplot(n_rows, n_cols, i)
    ax.set_title(pitch_type)
    x_axis = data.loc[data["pitch_type"] == pitch_type, var]
    ax.hist(x_axis, alpha=0.5, bins=50, color='k', density=True)
    # Label every panel; a trailing plt.xlabel() would only reach the last one.
    ax.set_xlabel("movement (ft)")
fig.tight_layout()

In [None]:
# Release-speed distribution per pitch type, after merging labels.
var = "release_speed"
# Re-derive the labels from the frame: the earlier `pitch_types` list predates
# the KC->CU / FT->SI merge and would produce empty panels for 'KC' and 'FT'.
merged_pitch_types = sorted(data["pitch_type"].dropna().unique())
n_cols = 3
n_rows = max(1, int(np.ceil(len(merged_pitch_types) / n_cols)))
fig = plt.figure(figsize=(7, 7))
for i, pitch_type in enumerate(merged_pitch_types, start=1):
    ax = fig.add_subplot(n_rows, n_cols, i)
    ax.set_title(pitch_type)
    x_axis = data.loc[data["pitch_type"] == pitch_type, var]
    ax.hist(x_axis, alpha=0.5, bins=50, color='k', density=True)
    # Label with the plotted column; this is a speed, not a movement in feet.
    ax.set_xlabel(var)
fig.tight_layout()

In [None]:
# Spin-rate distribution per pitch type, after merging labels.
var = "release_spin_rate"
# Re-derive the labels from the frame: the earlier `pitch_types` list predates
# the KC->CU / FT->SI merge and would produce empty panels for 'KC' and 'FT'.
merged_pitch_types = sorted(data["pitch_type"].dropna().unique())
n_cols = 3
n_rows = max(1, int(np.ceil(len(merged_pitch_types) / n_cols)))
fig = plt.figure(figsize=(7, 7))
for i, pitch_type in enumerate(merged_pitch_types, start=1):
    ax = fig.add_subplot(n_rows, n_cols, i)
    ax.set_title(pitch_type)
    x_axis = data.loc[data["pitch_type"] == pitch_type, var]
    ax.hist(x_axis, alpha=0.5, bins=50, color='k', density=True)
    # Label with the plotted column; this is a spin rate, not a movement in feet.
    ax.set_xlabel(var)
fig.tight_layout()

Based on these distributions, it seems we can safely merge these pitch types together.

## Conclusions
- The majority of this data is normal, there are some skewed distributions
- We take the absolute value of initial velocity and acceleration in x-direction to normalize the pitch data for right and left handers
- For now, I'm not going to consider forkballs and eephus pitches due to lack of data
- Due to similar distributions, the knuckle-curves have been merged into the curveballs, and the two-seam fastballs have been merged into the sinkers.
- A specific outlier was noticed for fastballs with submarine thrower Tyler Rogers.  I'm not going to remove his data but it's something to keep an eye out on as we run our models.