In [None]:
from pybaseball import statcast

Get data from statcast using pybaseball

In [None]:
# The original window started on '2017-05-01'; a much shorter period is used
# here so that the download can be held in memory.
start, end = '2019-10-01', '2019-10-06'
# Download the Statcast data for the chosen window through pybaseball.
data = statcast(start, end)

In [None]:
# Preview the first rows of the downloaded data.
data.head()

In [None]:
# List every column in the downloaded frame so we can decide which ones to keep.
data.columns

Since we're trying to predict pitches, we need to get rid of many of these columns

In [None]:
# Columns that describe the pitch itself; every other column is dropped
# from the frame in the cells that follow.
columns_keep = [
    'pitch_type',
    'release_speed', 'release_pos_x', 'release_pos_z',
    'p_throws',
    'pfx_x', 'pfx_z',
    'plate_x', 'plate_z',
    'zone',
    'vx0', 'vy0', 'vz0',
    'ax', 'ay', 'az',
    'release_spin_rate', 'release_extension',
    'release_pos_y',
]
# Snapshot of all column names currently in the frame, as a plain list.
columns = data.columns.tolist()

In [None]:
# Iterate through the columns we want to keep and use list.remove() to take each one
# out of a copy of the full column list.
# We will then use data.drop() with the remaining names to remove them from the data frame.
def remove_columns(all_columns, remove):
    """
    Return the entries of 'all_columns' that are left after removing 'remove'.

    The caller's list is left untouched, so the cell that calls this
    function can be re-run without raising.

    Parameters
    ----------
    all_columns: list
        all columns in a Pandas dataframe
    remove: list
        columns to remove from 'all_columns'
    Returns
    -------
    remaining: list
        New list holding the columns of 'all_columns' that are not in
        'remove', in their original order
    Raises
    ------
    ValueError
        If a name in 'remove' is not present in 'all_columns'
    """
    # Work on a copy: mutating the argument would make a second call with the
    # same inputs fail because the names are already gone.
    remaining = list(all_columns)
    for col in remove:
        remaining.remove(col)

    return remaining

In [None]:
# Whatever is left of the full column list after taking out `columns_keep`
# is the set of columns to drop from the frame.
col_to_drop = remove_columns(columns, columns_keep)

In [None]:
# Keep only the pitch-describing columns by dropping everything in col_to_drop.
data = data.drop(columns=col_to_drop)

In [None]:
# Confirm that only the columns listed in columns_keep remain.
data.head()

### Defining the columns:
#### `release_speed`: Pitch velocities in mph
#### `release_pos_x`: Horizontal release position of pitch from catcher's perspective 
#### `release_pos_z`: Vertical release position of pitch
#### `zone`: Zone location of the ball when it crosses the plate from the catcher's perspective
#### `p_throws`: Handedness of pitcher
#### `pfx_x`, `pfx_z`: Horizontal (`pfx_x`) and vertical (`pfx_z`) movement of pitch in feet
#### `plate_x`, `plate_z`: Position of ball when it reaches the plate from the catcher's perspective
#### `vx0`, `vy0`, `vz0`: Velocity components of pitch in feet per second
#### `ax,ay,az`: acceleration of pitch, in feet per second^2
#### `release_spin_rate`: Spin rate of pitch
#### `release_extension`: Release extension of pitch in feet
#### `release_pos_y`: Release position of pitch measured in feet from the catcher's perspective.

### Potential Features to add:
#### movement: Difference between the release position (`release_pos_*`) and the position at the plate (`plate_*`)

In [None]:
# Add movement
# `movement` is not part of this notebook; presumably a project-local helper module.
import movement
# Overwrite release_pos_y with the constant 50 for every row.
# NOTE(review): presumably because vx0/vy0/vz0 are reported at y = 50 ft, so the
# trajectory calculation should start there — confirm against movement.py.
data['release_pos_y'] = 50

In [None]:
# Bundle the per-pitch kinematics as [x, y, z] lists of columns:
# initial position, initial velocity and acceleration.
p_o = [data[name] for name in ('release_pos_x', 'release_pos_y', 'release_pos_z')]
v_o = [data[name] for name in ('vx0', 'vy0', 'vz0')]
a = [data[name] for name in ('ax', 'ay', 'az')]

In [None]:
# calc_movement takes the position, velocity and acceleration lists and returns
# three values; only dx and dz are used in the cells visible here.
# NOTE(review): dzg is presumably the vertical movement including gravity —
# confirm in movement.py.
dx, dz, dzg = movement.calc_movement(p_o, v_o, a)

In [None]:
# Store the computed horizontal and vertical movement as new feature columns.
data['movement_x'] = dx
data['movement_z'] = dz

In [None]:
# Check that movement_x and movement_z now appear in the frame.
data.head()

In [None]:
# pitch_type is what we want to predict, so rows without it are discarded.
data = data.dropna(subset=['pitch_type'])

In [None]:
# One pitch-type label per remaining row, as a plain Python list.
pitch_types = data['pitch_type'].tolist()

In [None]:
# Distinct pitch types present in the data.
set(pitch_types)

### Pitch Types
#### `CH`: changeup
#### `CU`:  curveball
#### `EP`: eephus
#### `FC`: cutter
#### `FF`: four-seam
#### `FS`: splitter
#### `FT`: two-seam 
#### `KC`: knuckle-curve
#### `SI`: sinker
#### `SL`: slider

Let's take a look at some distributions of several of these features

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Distribution of release speed over all pitches. plt.hist returns the
# (counts, bin edges, patches) tuple, which is kept in speed_histo.
speed_histo = plt.hist(data.release_speed, bins=300)
# Label the axes so the figure can be read on its own.
plt.xlabel('release_speed (mph)')
plt.ylabel('number of pitches');

There appear to be two clusters of pitches: one around 85 mph (off-speed) and one around 93–94 mph (fastballs).

In [None]:
# One release-speed histogram per pitch type, laid out on a two-row grid.
# Sort the labels so the panel order is the same on every run (set order is arbitrary).
pitches = sorted(set(pitch_types))
num_pitches = len(pitches)
# Ceiling division: round() uses banker's rounding, which gives too few panels
# for e.g. 9 pitch types and runs past the end of `pitches` for e.g. 7.
ncols = max(1, (num_pitches + 1) // 2)
# squeeze=False keeps `axes` two-dimensional even when there is a single column.
fig, axes = plt.subplots(nrows=2, ncols=ncols, figsize=(20,6), squeeze=False)
# axes.flat walks the grid row by row, i.e. the same order as a nested i/j loop.
panels = list(axes.flat)
for panel, pitch in zip(panels, pitches):
    panel.hist(data.release_speed[data.pitch_type == pitch], label=pitch)
    panel.set_xlabel('release_speed (mph)')
    panel.set_ylabel('number of pitches')
    panel.legend()
# Hide any panel left over when the number of pitch types is odd (or zero).
for panel in panels[num_pitches:]:
    panel.set_visible(False)

### A Few notes on pitch types versus velocity:
#### - Sinker, Four-seam, two-seam, and cutter have similar velocity distributions
#### - curve, knuckle-curve, and changeup have similar velocity distributions
#### - eephus pitches are hardly ever thrown, and when they are, their velocities are much lower than those of the other pitches

Pitch movement: http://baseball.physics.illinois.edu/Movement.pdf

In [None]:
# List the columns available now that movement_x and movement_z have been added.
data.columns

#### Movement: deviation of trajectory from a straight line without the effect of gravity

In [None]:
# Snapshot of all column names currently in the frame, as a plain list.
columns = data.columns.tolist()
# Final selection: the target (pitch_type) plus the features used for modelling.
columns_keep = [
    'pitch_type',
    'release_speed',
    'pfx_x', 'pfx_z',
    'release_spin_rate',
    'movement_x', 'movement_z',
]

In [None]:
# Whatever is left of the full column list after taking out `columns_keep`
# is the set of columns to drop from the frame.
col_to_drop = remove_columns(columns, columns_keep)

In [None]:
# Reduce the frame to the target and the selected features.
data = data.drop(columns=col_to_drop)

In [None]:
# Split the frame into the feature matrix (every column except pitch_type)
# and the target labels (pitch_type).
X = data.drop(columns='pitch_type')
y = data['pitch_type']

TODO: Look up appropriate models to use