In [1]:
import pandas as pd
from collections import defaultdict, Counter

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

# Baseball Dataset
Note: This data was downloaded from https://www.kaggle.com/pschale/mlb-pitch-data-20152018

Pitch-level data for every pitch thrown during the 2015-2018 MLB regular seasons. Data scraped from http://gd2.mlb.com/components/game/mlb/. Each row represents a single pitch.

The data doesn't come with clear definitions (that I can find, at least). Here's what I believe the codes mean:

## Pitch Type Definitions
CH - Changeup

CU - Curveball

EP - Eephus*

FC - Cutter

FF - Four-seam Fastball

FO - Pitchout (also PO)*

FS - Splitter

FT - Two-seam Fastball

IN - Intentional ball

KC - Knuckle curve

KN - Knuckeball

PO - Pitchout (also FO)*

SC - Screwball*

SI - Sinker

SL - Slider

UN - Unknown*

*these pitch types occur rarely

## Code Definitions
While these aren't spelled out anywhere, play descriptions allowed confident identification of these codes

B - Ball

*B - Ball in dirt

S - Swinging Strike

C - Called Strike

F - Foul

T - Foul Tip

L - Foul Bunt

I - Intentional Ball

W - Swinging Strike (Blocked)

M - Missed Bunt

P - Pitchout

Q - Swinging pitchout

R - Foul pitchout

Values that only occur on last pitch of at-bat:

X - In play, out(s)

D - In play, no out

E - In play, runs

H - Hit by pitch

Note: all codes, except for H, come directly from the XML files. All at-bats with code H were given no code in the XMLs.

In [2]:
# If you haven't already, uncomment and execute the following line to unzip the data
# ! unzip 90666_1164824_bundle_archive.zip

## Get and clean up pitches data

In [3]:
pitches = pd.read_csv('pitches.csv')
pitches.head(10)

Unnamed: 0,px,pz,start_speed,end_speed,spin_rate,spin_dir,break_angle,break_length,break_y,ax,ay,az,sz_bot,sz_top,type_confidence,vx0,vy0,vz0,x,x0,y,y0,z0,pfx_x,pfx_z,nasty,zone,code,type,pitch_type,event_num,b_score,ab_id,b_count,s_count,outs,pitch_num,on_1b,on_2b,on_3b
0,0.416,2.963,92.9,84.1,2305.052,159.235,-25.0,3.2,23.7,7.665,34.685,-11.96,1.72,3.56,2.0,-6.409,-136.065,-3.995,101.14,2.28,158.78,50.0,5.302,4.16,10.93,55.0,3.0,C,S,FF,3,0.0,2015000000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.191,2.347,92.8,84.1,2689.935,151.402,-40.7,3.4,23.7,12.043,34.225,-10.085,1.72,3.56,2.0,-8.411,-135.69,-5.98,124.28,2.119,175.41,50.0,5.307,6.57,12.0,31.0,5.0,S,S,FF,4,0.0,2015000000.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0
2,-0.518,3.284,94.1,85.2,2647.972,145.125,-43.7,3.7,23.7,14.368,35.276,-11.56,1.72,3.56,2.0,-9.802,-137.668,-3.337,136.74,2.127,150.11,50.0,5.313,7.61,10.88,49.0,1.0,F,S,FF,5,0.0,2015000000.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0
3,-0.641,1.221,91.0,84.0,1289.59,169.751,-1.3,5.0,23.8,2.104,28.354,-20.54,1.74,3.35,2.0,-8.071,-133.005,-6.567,109.685636,2.279,187.463482,50.0,5.21,1.17,6.45,41.0,13.0,B,B,FF,6,0.0,2015000000.0,0.0,2.0,0.0,4.0,0.0,0.0,0.0
4,-1.821,2.083,75.4,69.6,1374.569,280.671,18.4,12.0,23.8,-10.28,21.774,-34.111,1.72,3.56,2.0,-6.309,-110.409,0.325,146.527525,2.179,177.242829,50.0,5.557,-8.43,-1.65,18.0,13.0,B,B,CU,7,0.0,2015000000.0,1.0,2.0,0.0,5.0,0.0,0.0,0.0
5,0.627,2.397,92.9,84.8,2743.856,148.11,-45.7,3.7,23.7,13.59,32.274,-10.333,1.72,3.56,2.0,-6.943,-136.012,-5.738,118.004772,2.273,164.467012,50.0,5.264,7.32,11.72,42.0,6.0,X,X,FF,8,0.0,2015000000.0,2.0,2.0,0.0,6.0,0.0,0.0,0.0
6,-1.088,1.61,93.3,85.3,2848.535,147.044,-46.3,3.6,23.7,14.549,31.469,-9.734,1.59,3.45,2.0,-11.032,-136.208,-7.762,141.43,2.013,205.81,50.0,5.179,7.79,11.97,80.0,13.0,B,B,FF,12,0.0,2015000000.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
7,-0.257,2.047,89.3,82.4,1433.743,185.948,7.3,4.8,23.8,-1.339,27.421,-19.326,1.59,3.45,0.778,-6.335,-130.711,-4.611,186.41,2.298,182.54,50.0,5.284,-0.77,7.38,39.0,7.0,D,X,FC,13,0.0,2015000000.0,1.0,0.0,1.0,2.0,0.0,0.0,0.0
8,1.47,2.35,92.1,85.0,2666.09,146.146,-45.0,4.0,23.8,13.808,28.169,-11.591,1.89,3.46,2.0,-5.075,-134.873,-5.723,93.1,2.402,174.06,50.0,5.31,7.46,11.09,42.0,14.0,B,B,FF,17,0.0,2015000000.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
9,-1.337,1.898,89.3,82.0,1384.143,174.388,2.0,4.9,23.8,1.218,28.828,-19.782,1.81,3.52,0.648,-9.239,-130.512,-4.904,135.831493,2.165,182.991946,50.0,5.302,0.71,7.18,34.0,13.0,B,B,FF,18,0.0,2015000000.0,1.0,0.0,1.0,2.0,0.0,1.0,0.0


In [4]:
pitches.describe()

Unnamed: 0,px,pz,start_speed,end_speed,spin_rate,spin_dir,break_angle,break_length,break_y,ax,ay,az,sz_bot,sz_top,type_confidence,vx0,vy0,vz0,x,x0,y,y0,z0,pfx_x,pfx_z,nasty,zone,event_num,b_score,ab_id,b_count,s_count,outs,pitch_num,on_1b,on_2b,on_3b
count,2852965.0,2852965.0,2853040.0,2853040.0,2852965.0,2852965.0,2852965.0,2852965.0,2852965.0,2852965.0,2852965.0,2852965.0,2865071.0,2865071.0,2852965.0,2852965.0,2852965.0,2852965.0,2867154.0,2852965.0,2867154.0,2852965.0,2852965.0,2853012.0,2853012.0,2852965.0,2852965.0,2867154.0,2867154.0,2867154.0,2867154.0,2867154.0,2867154.0,2867154.0,2867154.0,2867154.0,2867154.0
mean,0.006572502,2.254962,88.38124,81.36274,1731.173,180.2308,5.848084,6.601459,23.81685,-2.308512,26.49175,-22.78275,1.566689,3.435719,1.550896,2.282811,-128.8567,-4.420795,115.9975,-0.7189111,175.8503,50.0,5.814139,-1.139971,5.046326,44.42669,9.81975,312.9711,2.256608,2016606000.0,0.8824259,0.8810873,0.9826033,2.894472,0.3018314,0.1860451,0.09595543
std,0.892749,0.9463968,6.01554,5.364057,682.7521,67.42859,24.43621,133.1708,0.0644654,10.74066,4.142729,8.865066,0.1549407,0.217259,0.5537377,5.990039,8.726126,3.067389,34.09448,1.728592,29.60411,3.120278e-14,0.4578852,6.275731,5.187005,16.83662,4.060956,192.8482,2.550955,1117238.0,0.9665457,0.8251786,0.8174947,1.726595,0.4590526,0.3891431,0.2945302
min,-10.54333,-5.183664,33.9,32.4,1.214,-0.002,-90.0,0.1,23.3,-59.29009,-0.5243748,-77.17142,-7.921409,-1.849129,0.0,-24.869,-153.362,-19.7706,0.0,-9.556,0.0,50.0,-1.941,-28.09165,-70.206,0.0,1.0,3.0,0.0,2015000000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
25%,-0.5908243,1.65,84.3,77.9,1238.745,137.154,-11.9,4.3,23.8,-11.29,23.46,-28.71775,1.485351,3.3,0.906,-2.081861,-135.51,-6.513,93.42,-1.955,160.65,50.0,5.564194,-6.45,2.16,32.0,6.0,149.0,0.0,2016004000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
50%,0.013,2.259,89.7,82.5,1866.322,195.885,6.8,6.0,23.8,-3.142,26.47444,-21.595,1.57,3.44,2.0,3.989,-130.8299,-4.604841,116.14,-1.33353,177.18,50.0,5.842237,-1.88,6.09,44.0,11.0,302.0,1.0,2017004000.0,1.0,1.0,1.0,3.0,0.0,0.0,0.0
75%,0.609,2.863,93.0,85.4,2247.564,225.234,27.0,8.0,23.9,5.995827,29.38409,-15.743,1.658626,3.578467,2.0,6.625942,-122.945,-2.488206,139.24,0.7766212,193.64,50.0,6.104551,3.864644,8.95,56.0,13.0,464.0,3.0,2018002000.0,2.0,2.0,2.0,4.0,1.0,0.0,0.0
max,12.95291,14.88624,105.0,96.9,6539.259,360.001,269.4,224889.3,36.4,40.978,54.057,22.3052,10.69854,10.54815,2.0,25.15,-47.552,27.815,249.95,9.676,250.0,50.0,10.596,22.02,33.16048,100.0,14.0,1336.0,25.0,2018186000.0,4.0,2.0,2.0,21.0,1.0,1.0,1.0


In [5]:
pitches.type_confidence.describe()

count    2.852965e+06
mean     1.550896e+00
std      5.537377e-01
min      0.000000e+00
25%      9.060000e-01
50%      2.000000e+00
75%      2.000000e+00
max      2.000000e+00
Name: type_confidence, dtype: float64

In [6]:
pitches.pitch_type.unique()

array(['FF', 'CU', 'FC', 'SI', 'CH', 'FT', 'IN', 'SL', nan, 'KC', 'EP',
       'FS', 'FO', 'PO', 'KN', 'UN', 'SC', 'FA', 'AB'], dtype=object)

### Observation
(Maybe we can create a model to predict what pitches these lower `type_confidence` values are!)
- It looks like most of the ``confidence_type`` values are ``2.000`` while others are less than this. Let's assume that ``2.000`` is maximum confidence
- It's hard to determine what a lot of these values are, but many seem to have to do with the mechanical characteristics of the pitches (i.e., speed and position of the pitches, spin and break, etc.). These are important! Others seem to have to do with strike zone, *nastiness*, game state (e.g., ``b_score``, ``b_count``, ``s_count``, etc). These are probably not so important!
- We may want join this data to the ``at_bat`` and ``pitcher`` data, so we probably want to hang on to the ``ab_id``

In [7]:
# # drop unwanted columns
# unwanted_columns = [
#     'sz_bot',
#     'sz_top',
#     'nasty'
# ]
# pitches.drop(columns=unwanted_columns, inplace=True)
# pitches.head(10)

In [8]:
# clean up the data a bit
pitches['zone'].fillna(-1, inplace=True)  # for now, assign -1 for all empty zone values
pitches['type_confidence'].fillna(-1, inplace=True) # assuming that type confidence ranges from 0 to 2, assign 0 for all empty type_confidence values
pitches['pitch_type'].fillna('UNK', inplace=True) # based on the pitch type definitions, assign 'UN' for all empty pitch_type values
# pitches = pitches.astype({
#     'zone':'int32',
#     'b_score':'int32',
#     'ab_id':'int32',
#     'b_count':'int32',
#     's_count':'int32',
#     'outs':'int32',
#     'pitch_num':'int32',
#     'on_1b':'int32',
#     'on_2b':'int32',
#     'on_3b':'int32'
# })
pitches.head(10)

Unnamed: 0,px,pz,start_speed,end_speed,spin_rate,spin_dir,break_angle,break_length,break_y,ax,ay,az,sz_bot,sz_top,type_confidence,vx0,vy0,vz0,x,x0,y,y0,z0,pfx_x,pfx_z,nasty,zone,code,type,pitch_type,event_num,b_score,ab_id,b_count,s_count,outs,pitch_num,on_1b,on_2b,on_3b
0,0.416,2.963,92.9,84.1,2305.052,159.235,-25.0,3.2,23.7,7.665,34.685,-11.96,1.72,3.56,2.0,-6.409,-136.065,-3.995,101.14,2.28,158.78,50.0,5.302,4.16,10.93,55.0,3.0,C,S,FF,3,0.0,2015000000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.191,2.347,92.8,84.1,2689.935,151.402,-40.7,3.4,23.7,12.043,34.225,-10.085,1.72,3.56,2.0,-8.411,-135.69,-5.98,124.28,2.119,175.41,50.0,5.307,6.57,12.0,31.0,5.0,S,S,FF,4,0.0,2015000000.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0
2,-0.518,3.284,94.1,85.2,2647.972,145.125,-43.7,3.7,23.7,14.368,35.276,-11.56,1.72,3.56,2.0,-9.802,-137.668,-3.337,136.74,2.127,150.11,50.0,5.313,7.61,10.88,49.0,1.0,F,S,FF,5,0.0,2015000000.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0
3,-0.641,1.221,91.0,84.0,1289.59,169.751,-1.3,5.0,23.8,2.104,28.354,-20.54,1.74,3.35,2.0,-8.071,-133.005,-6.567,109.685636,2.279,187.463482,50.0,5.21,1.17,6.45,41.0,13.0,B,B,FF,6,0.0,2015000000.0,0.0,2.0,0.0,4.0,0.0,0.0,0.0
4,-1.821,2.083,75.4,69.6,1374.569,280.671,18.4,12.0,23.8,-10.28,21.774,-34.111,1.72,3.56,2.0,-6.309,-110.409,0.325,146.527525,2.179,177.242829,50.0,5.557,-8.43,-1.65,18.0,13.0,B,B,CU,7,0.0,2015000000.0,1.0,2.0,0.0,5.0,0.0,0.0,0.0
5,0.627,2.397,92.9,84.8,2743.856,148.11,-45.7,3.7,23.7,13.59,32.274,-10.333,1.72,3.56,2.0,-6.943,-136.012,-5.738,118.004772,2.273,164.467012,50.0,5.264,7.32,11.72,42.0,6.0,X,X,FF,8,0.0,2015000000.0,2.0,2.0,0.0,6.0,0.0,0.0,0.0
6,-1.088,1.61,93.3,85.3,2848.535,147.044,-46.3,3.6,23.7,14.549,31.469,-9.734,1.59,3.45,2.0,-11.032,-136.208,-7.762,141.43,2.013,205.81,50.0,5.179,7.79,11.97,80.0,13.0,B,B,FF,12,0.0,2015000000.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
7,-0.257,2.047,89.3,82.4,1433.743,185.948,7.3,4.8,23.8,-1.339,27.421,-19.326,1.59,3.45,0.778,-6.335,-130.711,-4.611,186.41,2.298,182.54,50.0,5.284,-0.77,7.38,39.0,7.0,D,X,FC,13,0.0,2015000000.0,1.0,0.0,1.0,2.0,0.0,0.0,0.0
8,1.47,2.35,92.1,85.0,2666.09,146.146,-45.0,4.0,23.8,13.808,28.169,-11.591,1.89,3.46,2.0,-5.075,-134.873,-5.723,93.1,2.402,174.06,50.0,5.31,7.46,11.09,42.0,14.0,B,B,FF,17,0.0,2015000000.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
9,-1.337,1.898,89.3,82.0,1384.143,174.388,2.0,4.9,23.8,1.218,28.828,-19.782,1.81,3.52,0.648,-9.239,-130.512,-4.904,135.831493,2.165,182.991946,50.0,5.302,0.71,7.18,34.0,13.0,B,B,FF,18,0.0,2015000000.0,1.0,0.0,1.0,2.0,0.0,1.0,0.0


In [9]:
at_bats = pd.read_csv('atbats.csv') 
at_bats.head(10)

# drop unwanted columns
unwanted_columns = ['event']
at_bats.drop(columns=unwanted_columns, inplace=True)
at_bats.head(10)

Unnamed: 0,ab_id,batter_id,g_id,inning,o,p_score,p_throws,pitcher_id,stand,top
0,2015000001,572761,201500001,1,1,0,L,452657,L,True
1,2015000002,518792,201500001,1,1,0,L,452657,L,True
2,2015000003,407812,201500001,1,1,0,L,452657,R,True
3,2015000004,425509,201500001,1,2,0,L,452657,R,True
4,2015000005,571431,201500001,1,3,0,L,452657,L,True
5,2015000006,451594,201500001,1,0,1,R,425794,L,False
6,2015000007,624585,201500001,1,1,1,R,425794,R,False
7,2015000008,519203,201500001,1,2,1,R,425794,L,False
8,2015000009,516770,201500001,1,3,1,R,425794,R,False
9,2015000010,425877,201500001,2,1,0,L,452657,R,True


## Merge dataframes and do final cleanup

In [10]:
mlb_df = pd.merge(left=pitches, right=at_bats, on=['ab_id'], how='left')
mlb_df.head(10)

Unnamed: 0,px,pz,start_speed,end_speed,spin_rate,spin_dir,break_angle,break_length,break_y,ax,ay,az,sz_bot,sz_top,type_confidence,vx0,vy0,vz0,x,x0,y,y0,z0,pfx_x,pfx_z,nasty,zone,code,type,pitch_type,event_num,b_score,ab_id,b_count,s_count,outs,pitch_num,on_1b,on_2b,on_3b,batter_id,g_id,inning,o,p_score,p_throws,pitcher_id,stand,top
0,0.416,2.963,92.9,84.1,2305.052,159.235,-25.0,3.2,23.7,7.665,34.685,-11.96,1.72,3.56,2.0,-6.409,-136.065,-3.995,101.14,2.28,158.78,50.0,5.302,4.16,10.93,55.0,3.0,C,S,FF,3,0.0,2015000000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,572761,201500001,1,1,0,L,452657,L,True
1,-0.191,2.347,92.8,84.1,2689.935,151.402,-40.7,3.4,23.7,12.043,34.225,-10.085,1.72,3.56,2.0,-8.411,-135.69,-5.98,124.28,2.119,175.41,50.0,5.307,6.57,12.0,31.0,5.0,S,S,FF,4,0.0,2015000000.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,572761,201500001,1,1,0,L,452657,L,True
2,-0.518,3.284,94.1,85.2,2647.972,145.125,-43.7,3.7,23.7,14.368,35.276,-11.56,1.72,3.56,2.0,-9.802,-137.668,-3.337,136.74,2.127,150.11,50.0,5.313,7.61,10.88,49.0,1.0,F,S,FF,5,0.0,2015000000.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,572761,201500001,1,1,0,L,452657,L,True
3,-0.641,1.221,91.0,84.0,1289.59,169.751,-1.3,5.0,23.8,2.104,28.354,-20.54,1.74,3.35,2.0,-8.071,-133.005,-6.567,109.685636,2.279,187.463482,50.0,5.21,1.17,6.45,41.0,13.0,B,B,FF,6,0.0,2015000000.0,0.0,2.0,0.0,4.0,0.0,0.0,0.0,572761,201500001,1,1,0,L,452657,L,True
4,-1.821,2.083,75.4,69.6,1374.569,280.671,18.4,12.0,23.8,-10.28,21.774,-34.111,1.72,3.56,2.0,-6.309,-110.409,0.325,146.527525,2.179,177.242829,50.0,5.557,-8.43,-1.65,18.0,13.0,B,B,CU,7,0.0,2015000000.0,1.0,2.0,0.0,5.0,0.0,0.0,0.0,572761,201500001,1,1,0,L,452657,L,True
5,0.627,2.397,92.9,84.8,2743.856,148.11,-45.7,3.7,23.7,13.59,32.274,-10.333,1.72,3.56,2.0,-6.943,-136.012,-5.738,118.004772,2.273,164.467012,50.0,5.264,7.32,11.72,42.0,6.0,X,X,FF,8,0.0,2015000000.0,2.0,2.0,0.0,6.0,0.0,0.0,0.0,572761,201500001,1,1,0,L,452657,L,True
6,-1.088,1.61,93.3,85.3,2848.535,147.044,-46.3,3.6,23.7,14.549,31.469,-9.734,1.59,3.45,2.0,-11.032,-136.208,-7.762,141.43,2.013,205.81,50.0,5.179,7.79,11.97,80.0,13.0,B,B,FF,12,0.0,2015000000.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,518792,201500001,1,1,0,L,452657,L,True
7,-0.257,2.047,89.3,82.4,1433.743,185.948,7.3,4.8,23.8,-1.339,27.421,-19.326,1.59,3.45,0.778,-6.335,-130.711,-4.611,186.41,2.298,182.54,50.0,5.284,-0.77,7.38,39.0,7.0,D,X,FC,13,0.0,2015000000.0,1.0,0.0,1.0,2.0,0.0,0.0,0.0,518792,201500001,1,1,0,L,452657,L,True
8,1.47,2.35,92.1,85.0,2666.09,146.146,-45.0,4.0,23.8,13.808,28.169,-11.591,1.89,3.46,2.0,-5.075,-134.873,-5.723,93.1,2.402,174.06,50.0,5.31,7.46,11.09,42.0,14.0,B,B,FF,17,0.0,2015000000.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,407812,201500001,1,1,0,L,452657,R,True
9,-1.337,1.898,89.3,82.0,1384.143,174.388,2.0,4.9,23.8,1.218,28.828,-19.782,1.81,3.52,0.648,-9.239,-130.512,-4.904,135.831493,2.165,182.991946,50.0,5.302,0.71,7.18,34.0,13.0,B,B,FF,18,0.0,2015000000.0,1.0,0.0,1.0,2.0,0.0,1.0,0.0,407812,201500001,1,1,0,L,452657,R,True


In [11]:
# reset the index
mlb_df = mlb_df.reset_index()
mlb_df.drop(columns=['index'], inplace=True)
mlb_df.head(10)

Unnamed: 0,px,pz,start_speed,end_speed,spin_rate,spin_dir,break_angle,break_length,break_y,ax,ay,az,sz_bot,sz_top,type_confidence,vx0,vy0,vz0,x,x0,y,y0,z0,pfx_x,pfx_z,nasty,zone,code,type,pitch_type,event_num,b_score,ab_id,b_count,s_count,outs,pitch_num,on_1b,on_2b,on_3b,batter_id,g_id,inning,o,p_score,p_throws,pitcher_id,stand,top
0,0.416,2.963,92.9,84.1,2305.052,159.235,-25.0,3.2,23.7,7.665,34.685,-11.96,1.72,3.56,2.0,-6.409,-136.065,-3.995,101.14,2.28,158.78,50.0,5.302,4.16,10.93,55.0,3.0,C,S,FF,3,0.0,2015000000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,572761,201500001,1,1,0,L,452657,L,True
1,-0.191,2.347,92.8,84.1,2689.935,151.402,-40.7,3.4,23.7,12.043,34.225,-10.085,1.72,3.56,2.0,-8.411,-135.69,-5.98,124.28,2.119,175.41,50.0,5.307,6.57,12.0,31.0,5.0,S,S,FF,4,0.0,2015000000.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,572761,201500001,1,1,0,L,452657,L,True
2,-0.518,3.284,94.1,85.2,2647.972,145.125,-43.7,3.7,23.7,14.368,35.276,-11.56,1.72,3.56,2.0,-9.802,-137.668,-3.337,136.74,2.127,150.11,50.0,5.313,7.61,10.88,49.0,1.0,F,S,FF,5,0.0,2015000000.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,572761,201500001,1,1,0,L,452657,L,True
3,-0.641,1.221,91.0,84.0,1289.59,169.751,-1.3,5.0,23.8,2.104,28.354,-20.54,1.74,3.35,2.0,-8.071,-133.005,-6.567,109.685636,2.279,187.463482,50.0,5.21,1.17,6.45,41.0,13.0,B,B,FF,6,0.0,2015000000.0,0.0,2.0,0.0,4.0,0.0,0.0,0.0,572761,201500001,1,1,0,L,452657,L,True
4,-1.821,2.083,75.4,69.6,1374.569,280.671,18.4,12.0,23.8,-10.28,21.774,-34.111,1.72,3.56,2.0,-6.309,-110.409,0.325,146.527525,2.179,177.242829,50.0,5.557,-8.43,-1.65,18.0,13.0,B,B,CU,7,0.0,2015000000.0,1.0,2.0,0.0,5.0,0.0,0.0,0.0,572761,201500001,1,1,0,L,452657,L,True
5,0.627,2.397,92.9,84.8,2743.856,148.11,-45.7,3.7,23.7,13.59,32.274,-10.333,1.72,3.56,2.0,-6.943,-136.012,-5.738,118.004772,2.273,164.467012,50.0,5.264,7.32,11.72,42.0,6.0,X,X,FF,8,0.0,2015000000.0,2.0,2.0,0.0,6.0,0.0,0.0,0.0,572761,201500001,1,1,0,L,452657,L,True
6,-1.088,1.61,93.3,85.3,2848.535,147.044,-46.3,3.6,23.7,14.549,31.469,-9.734,1.59,3.45,2.0,-11.032,-136.208,-7.762,141.43,2.013,205.81,50.0,5.179,7.79,11.97,80.0,13.0,B,B,FF,12,0.0,2015000000.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,518792,201500001,1,1,0,L,452657,L,True
7,-0.257,2.047,89.3,82.4,1433.743,185.948,7.3,4.8,23.8,-1.339,27.421,-19.326,1.59,3.45,0.778,-6.335,-130.711,-4.611,186.41,2.298,182.54,50.0,5.284,-0.77,7.38,39.0,7.0,D,X,FC,13,0.0,2015000000.0,1.0,0.0,1.0,2.0,0.0,0.0,0.0,518792,201500001,1,1,0,L,452657,L,True
8,1.47,2.35,92.1,85.0,2666.09,146.146,-45.0,4.0,23.8,13.808,28.169,-11.591,1.89,3.46,2.0,-5.075,-134.873,-5.723,93.1,2.402,174.06,50.0,5.31,7.46,11.09,42.0,14.0,B,B,FF,17,0.0,2015000000.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,407812,201500001,1,1,0,L,452657,R,True
9,-1.337,1.898,89.3,82.0,1384.143,174.388,2.0,4.9,23.8,1.218,28.828,-19.782,1.81,3.52,0.648,-9.239,-130.512,-4.904,135.831493,2.165,182.991946,50.0,5.302,0.71,7.18,34.0,13.0,B,B,FF,18,0.0,2015000000.0,1.0,0.0,1.0,2.0,0.0,1.0,0.0,407812,201500001,1,1,0,L,452657,R,True


In [12]:
low_confidence_df = mlb_df[mlb_df.type_confidence < 2.000]
print(f"low confidence count: {len(low_confidence_df)}")
print(f"low confidence/unknown pitch_type count: {len(low_confidence_df[low_confidence_df.pitch_type == 'UN'])}")

low confidence count: 1155992
low confidence/unknown pitch_type count: 57


In [13]:
full_confidence_df = mlb_df[mlb_df.type_confidence >= 2.000]
print(f"full confidence count: {len(full_confidence_df)}")
print(f"full confidence/unknown pitch_type count: {len(full_confidence_df[full_confidence_df.pitch_type == 'UN'])}")

full confidence count: 1711162
full confidence/unknown pitch_type count: 0


### First model attempt
For a first pass, let's use what I would consider the strongest signals/features: ``start_speed``, ``end_speed``, ``spin_rate``, ``spin_dir``, ``break_angle``, and ``break_length``.

In [14]:
y = pd.get_dummies(full_confidence_df.pitch_type)
print(y.columns)
y.head(10)

Index(['CH', 'CU', 'EP', 'FA', 'FC', 'FF', 'FO', 'FS', 'FT', 'KC', 'KN', 'SC',
       'SI', 'SL'],
      dtype='object')


Unnamed: 0,CH,CU,EP,FA,FC,FF,FO,FS,FT,KC,KN,SC,SI,SL
0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,0,0,0,0
6,0,0,0,0,0,1,0,0,0,0,0,0,0,0
8,0,0,0,0,0,1,0,0,0,0,0,0,0,0
10,0,0,0,0,0,1,0,0,0,0,0,0,0,0
11,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [15]:
features = ['start_speed', 'end_speed', 'spin_rate', 'spin_dir', 'break_angle', 'break_length']
X = full_confidence_df[features]

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
from sklearn.tree import DecisionTreeClassifier

# Define model. Specify a number for random_state to ensure same results each run
pitch_predictor_model = DecisionTreeClassifier(
    random_state = 42, 
    max_depth = 5
)

# Fit model
pitch_predictor_model.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=5, random_state=42)

In [18]:
y_pred = pitch_predictor_model.predict(X_test)

In [19]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
print(cm)

ValueError: multilabel-indicator is not supported

In [None]:
print(y.columns)
i = 4
print(y_pred[i])
print(y_test.iloc[i])

In [None]:
from sklearn.metrics import log_loss

log_loss(y_test, y_pred)

In [None]:
# p = pd.DataFrame({"monkey":[0,1,0,0,0],"rabbit":[1,0,0,0,0],"fox":[0,0,1,0,0]})
# ['CH', 'CU', 'EP', 'FA','FC', 'FF', 'FO', 'FS','FT', 'KC', 'KN', 'SC','SI', 'SL']

y.columns

In [None]:
def get_pitch_type(row):
    for i in range(len(y.columns)):
        if row[i] == 1:
            return y.columns[i]

In [None]:
import numpy as np
y_pred_pitch_type = np.apply_along_axis(get_pitch_type, 1, y_pred)

In [None]:
y_pred_pitch_type[:10]

In [None]:
changeups = full_confidence_df[full_confidence_df.pitch_type == 'CH']
changeups.head(10)