In [1]:
import pandas as pd
from collections import defaultdict, Counter

# Raise pandas' display limits so wide/long frames are rendered without truncation.
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 500)

# Baseball Dataset
Note: This data was downloaded from https://www.kaggle.com/pschale/mlb-pitch-data-20152018

Pitch-level data for every pitch thrown during the 2015-2018 MLB regular seasons. Data scraped from http://gd2.mlb.com/components/game/mlb/. Each row represents a single pitch.

The data doesn't come with clear definitions (that I can find, at least). Here's what I believe the codes mean:

## Pitch Type Definitions
CH - Changeup

CU - Curveball

EP - Eephus*

FC - Cutter

FF - Four-seam Fastball

FO - Pitchout (also PO)*

FS - Splitter

FT - Two-seam Fastball

IN - Intentional ball

KC - Knuckle curve

KN - Knuckleball

PO - Pitchout (also FO)*

SC - Screwball*

SI - Sinker

SL - Slider

UN - Unknown*

*these pitch types occur rarely

## Code Definitions
While these aren't spelled out anywhere, play descriptions allowed confident identification of these codes:

B - Ball

*B - Ball in dirt

S - Swinging Strike

C - Called Strike

F - Foul

T - Foul Tip

L - Foul Bunt

I - Intentional Ball

W - Swinging Strike (Blocked)

M - Missed Bunt

P - Pitchout

Q - Swinging pitchout

R - Foul pitchout

Values that only occur on last pitch of at-bat:

X - In play, out(s)

D - In play, no out

E - In play, runs

H - Hit by pitch

Note: all codes, except for H, come directly from the XML files. All at-bats with code H were given no code in the XMLs.

In [2]:
# If you haven't already, uncomment and execute the following line to unzip the data
# ! unzip 90666_1164824_bundle_archive.zip

## Get and clean up at_bats data

In [2]:
# Load the at-bat table (one row per at-bat, keyed by ab_id) from the working directory.
at_bats = pd.read_csv("atbats.csv")
at_bats.head(10)

Unnamed: 0,ab_id,batter_id,event,g_id,inning,o,p_score,p_throws,pitcher_id,stand,top
0,2015000001,572761,Groundout,201500001,1,1,0,L,452657,L,True
1,2015000002,518792,Double,201500001,1,1,0,L,452657,L,True
2,2015000003,407812,Single,201500001,1,1,0,L,452657,R,True
3,2015000004,425509,Strikeout,201500001,1,2,0,L,452657,R,True
4,2015000005,571431,Strikeout,201500001,1,3,0,L,452657,L,True
5,2015000006,451594,Double,201500001,1,0,1,R,425794,L,False
6,2015000007,624585,Groundout,201500001,1,1,1,R,425794,R,False
7,2015000008,519203,Strikeout,201500001,1,2,1,R,425794,L,False
8,2015000009,516770,Groundout,201500001,1,3,1,R,425794,R,False
9,2015000010,425877,Strikeout,201500001,2,1,0,L,452657,R,True


In [3]:
# drop unwanted columns
# Rebind instead of mutating in place so the frame shown by the previous cell
# is not silently altered underneath the reader.
unwanted_columns = ['event']
at_bats = at_bats.drop(columns=unwanted_columns)
at_bats.head(10)

Unnamed: 0,ab_id,batter_id,g_id,inning,o,p_score,p_throws,pitcher_id,stand,top
0,2015000001,572761,201500001,1,1,0,L,452657,L,True
1,2015000002,518792,201500001,1,1,0,L,452657,L,True
2,2015000003,407812,201500001,1,1,0,L,452657,R,True
3,2015000004,425509,201500001,1,2,0,L,452657,R,True
4,2015000005,571431,201500001,1,3,0,L,452657,L,True
5,2015000006,451594,201500001,1,0,1,R,425794,L,False
6,2015000007,624585,201500001,1,1,1,R,425794,R,False
7,2015000008,519203,201500001,1,2,1,R,425794,L,False
8,2015000009,516770,201500001,1,3,1,R,425794,R,False
9,2015000010,425877,201500001,2,1,0,L,452657,R,True


## Get and clean up pitches data

In [4]:
# Load the pitch table (one row per pitch) from the working directory.
pitches = pd.read_csv("pitches.csv")
pitches.head(10)

Unnamed: 0,px,pz,start_speed,end_speed,spin_rate,spin_dir,break_angle,break_length,break_y,ax,ay,az,sz_bot,sz_top,type_confidence,vx0,vy0,vz0,x,x0,y,y0,z0,pfx_x,pfx_z,nasty,zone,code,type,pitch_type,event_num,b_score,ab_id,b_count,s_count,outs,pitch_num,on_1b,on_2b,on_3b
0,0.416,2.963,92.9,84.1,2305.052,159.235,-25.0,3.2,23.7,7.665,34.685,-11.96,1.72,3.56,2.0,-6.409,-136.065,-3.995,101.14,2.28,158.78,50.0,5.302,4.16,10.93,55.0,3.0,C,S,FF,3,0.0,2015000000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.191,2.347,92.8,84.1,2689.935,151.402,-40.7,3.4,23.7,12.043,34.225,-10.085,1.72,3.56,2.0,-8.411,-135.69,-5.98,124.28,2.119,175.41,50.0,5.307,6.57,12.0,31.0,5.0,S,S,FF,4,0.0,2015000000.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0
2,-0.518,3.284,94.1,85.2,2647.972,145.125,-43.7,3.7,23.7,14.368,35.276,-11.56,1.72,3.56,2.0,-9.802,-137.668,-3.337,136.74,2.127,150.11,50.0,5.313,7.61,10.88,49.0,1.0,F,S,FF,5,0.0,2015000000.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0
3,-0.641,1.221,91.0,84.0,1289.59,169.751,-1.3,5.0,23.8,2.104,28.354,-20.54,1.74,3.35,2.0,-8.071,-133.005,-6.567,109.685636,2.279,187.463482,50.0,5.21,1.17,6.45,41.0,13.0,B,B,FF,6,0.0,2015000000.0,0.0,2.0,0.0,4.0,0.0,0.0,0.0
4,-1.821,2.083,75.4,69.6,1374.569,280.671,18.4,12.0,23.8,-10.28,21.774,-34.111,1.72,3.56,2.0,-6.309,-110.409,0.325,146.527525,2.179,177.242829,50.0,5.557,-8.43,-1.65,18.0,13.0,B,B,CU,7,0.0,2015000000.0,1.0,2.0,0.0,5.0,0.0,0.0,0.0
5,0.627,2.397,92.9,84.8,2743.856,148.11,-45.7,3.7,23.7,13.59,32.274,-10.333,1.72,3.56,2.0,-6.943,-136.012,-5.738,118.004772,2.273,164.467012,50.0,5.264,7.32,11.72,42.0,6.0,X,X,FF,8,0.0,2015000000.0,2.0,2.0,0.0,6.0,0.0,0.0,0.0
6,-1.088,1.61,93.3,85.3,2848.535,147.044,-46.3,3.6,23.7,14.549,31.469,-9.734,1.59,3.45,2.0,-11.032,-136.208,-7.762,141.43,2.013,205.81,50.0,5.179,7.79,11.97,80.0,13.0,B,B,FF,12,0.0,2015000000.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
7,-0.257,2.047,89.3,82.4,1433.743,185.948,7.3,4.8,23.8,-1.339,27.421,-19.326,1.59,3.45,0.778,-6.335,-130.711,-4.611,186.41,2.298,182.54,50.0,5.284,-0.77,7.38,39.0,7.0,D,X,FC,13,0.0,2015000000.0,1.0,0.0,1.0,2.0,0.0,0.0,0.0
8,1.47,2.35,92.1,85.0,2666.09,146.146,-45.0,4.0,23.8,13.808,28.169,-11.591,1.89,3.46,2.0,-5.075,-134.873,-5.723,93.1,2.402,174.06,50.0,5.31,7.46,11.09,42.0,14.0,B,B,FF,17,0.0,2015000000.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
9,-1.337,1.898,89.3,82.0,1384.143,174.388,2.0,4.9,23.8,1.218,28.828,-19.782,1.81,3.52,0.648,-9.239,-130.512,-4.904,135.831493,2.165,182.991946,50.0,5.302,0.71,7.18,34.0,13.0,B,B,FF,18,0.0,2015000000.0,1.0,0.0,1.0,2.0,0.0,1.0,0.0


In [5]:
# drop unwanted columns
# None of these columns is referenced by the later cells of this notebook.
# ('az' was previously listed twice; each label now appears exactly once.)
unwanted_columns = [
    'px',
    'pz',
    'start_speed',
    'end_speed',
    'spin_rate',
    'spin_dir',
    'break_angle',
    'break_length',
    'break_y',
    'ax',
    'ay',
    'az',
    'sz_bot',
    'sz_top',
    'vx0',
    'vy0',
    'vz0',
    'x',
    'x0',
    'y',
    'y0',
    'z0',
    'pfx_x',
    'pfx_z',
    'nasty',
]
# Rebind rather than drop in place so the raw frame displayed above is not
# mutated behind the reader's back.
pitches = pitches.drop(columns=unwanted_columns)
pitches.head(10)

Unnamed: 0,type_confidence,zone,code,type,pitch_type,event_num,b_score,ab_id,b_count,s_count,outs,pitch_num,on_1b,on_2b,on_3b
0,2.0,3.0,C,S,FF,3,0.0,2015000000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,2.0,5.0,S,S,FF,4,0.0,2015000000.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0
2,2.0,1.0,F,S,FF,5,0.0,2015000000.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0
3,2.0,13.0,B,B,FF,6,0.0,2015000000.0,0.0,2.0,0.0,4.0,0.0,0.0,0.0
4,2.0,13.0,B,B,CU,7,0.0,2015000000.0,1.0,2.0,0.0,5.0,0.0,0.0,0.0
5,2.0,6.0,X,X,FF,8,0.0,2015000000.0,2.0,2.0,0.0,6.0,0.0,0.0,0.0
6,2.0,13.0,B,B,FF,12,0.0,2015000000.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
7,0.778,7.0,D,X,FC,13,0.0,2015000000.0,1.0,0.0,1.0,2.0,0.0,0.0,0.0
8,2.0,14.0,B,B,FF,17,0.0,2015000000.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
9,0.648,13.0,B,B,FF,18,0.0,2015000000.0,1.0,0.0,1.0,2.0,0.0,1.0,0.0


In [6]:
# clean up the data a bit
# Fill missing values with sentinels in a single DataFrame-level call.
# The previous `pitches[col].fillna(..., inplace=True)` form is chained
# assignment: under pandas Copy-on-Write it operates on a temporary and leaves
# `pitches` unchanged, after which the integer cast below would fail on NaN.
# NOTE(review): 'UNK' marks a *missing* pitch_type and is distinct from the
# 'UN' (Unknown) code listed in the pitch-type table at the top of the notebook.
pitches = pitches.fillna({
    'zone': -1,
    'type_confidence': -1,
    'pitch_type': 'UNK',
})
# The count-like columns were read as floats; cast them to integers.
# NOTE(review): int32 tops out at 2,147,483,647; the ab_id values shown in this
# notebook (e.g. 2015000001) fit, but confirm for the whole dataset.
pitches = pitches.astype({
    'zone':'int32',
    'b_score':'int32',
    'ab_id':'int32',
    'b_count':'int32',
    's_count':'int32',
    'outs':'int32',
    'pitch_num':'int32',
    'on_1b':'int32',
    'on_2b':'int32',
    'on_3b':'int32'
})
pitches.head(10)

Unnamed: 0,type_confidence,zone,code,type,pitch_type,event_num,b_score,ab_id,b_count,s_count,outs,pitch_num,on_1b,on_2b,on_3b
0,2.0,3,C,S,FF,3,0,2015000001,0,0,0,1,0,0,0
1,2.0,5,S,S,FF,4,0,2015000001,0,1,0,2,0,0,0
2,2.0,1,F,S,FF,5,0,2015000001,0,2,0,3,0,0,0
3,2.0,13,B,B,FF,6,0,2015000001,0,2,0,4,0,0,0
4,2.0,13,B,B,CU,7,0,2015000001,1,2,0,5,0,0,0
5,2.0,6,X,X,FF,8,0,2015000001,2,2,0,6,0,0,0
6,2.0,13,B,B,FF,12,0,2015000002,0,0,1,1,0,0,0
7,0.778,7,D,X,FC,13,0,2015000002,1,0,1,2,0,0,0
8,2.0,14,B,B,FF,17,0,2015000003,0,0,1,1,0,1,0
9,0.648,13,B,B,FF,18,0,2015000003,1,0,1,2,0,1,0


## Get and clean up player_names data

In [7]:
# Load the player id -> first/last name lookup table from the working directory.
player_names = pd.read_csv("player_names.csv")
player_names.head(10)

Unnamed: 0,id,first_name,last_name
0,452657,Jon,Lester
1,425794,Adam,Wainwright
2,457435,Phil,Coke
3,435400,Jason,Motte
4,519166,Neil,Ramirez
5,593372,Carlos,Martinez
6,467008,Pedro,Strop
7,477229,Jordan,Walden
8,444468,Hector,Rondon
9,572096,Trevor,Rosenthal


## Merge dataframes and do final cleanup

In [8]:
# Attach the at-bat context to every pitch. A left join keeps each pitch row
# even when its ab_id has no match in at_bats.
mlb_df = pitches.merge(at_bats, how='left', on='ab_id')
mlb_df.head(10)

Unnamed: 0,type_confidence,zone,code,type,pitch_type,event_num,b_score,ab_id,b_count,s_count,outs,pitch_num,on_1b,on_2b,on_3b,batter_id,g_id,inning,o,p_score,p_throws,pitcher_id,stand,top
0,2.0,3,C,S,FF,3,0,2015000001,0,0,0,1,0,0,0,572761,201500001,1,1,0,L,452657,L,True
1,2.0,5,S,S,FF,4,0,2015000001,0,1,0,2,0,0,0,572761,201500001,1,1,0,L,452657,L,True
2,2.0,1,F,S,FF,5,0,2015000001,0,2,0,3,0,0,0,572761,201500001,1,1,0,L,452657,L,True
3,2.0,13,B,B,FF,6,0,2015000001,0,2,0,4,0,0,0,572761,201500001,1,1,0,L,452657,L,True
4,2.0,13,B,B,CU,7,0,2015000001,1,2,0,5,0,0,0,572761,201500001,1,1,0,L,452657,L,True
5,2.0,6,X,X,FF,8,0,2015000001,2,2,0,6,0,0,0,572761,201500001,1,1,0,L,452657,L,True
6,2.0,13,B,B,FF,12,0,2015000002,0,0,1,1,0,0,0,518792,201500001,1,1,0,L,452657,L,True
7,0.778,7,D,X,FC,13,0,2015000002,1,0,1,2,0,0,0,518792,201500001,1,1,0,L,452657,L,True
8,2.0,14,B,B,FF,17,0,2015000003,0,0,1,1,0,1,0,407812,201500001,1,1,0,L,452657,R,True
9,0.648,13,B,B,FF,18,0,2015000003,1,0,1,2,0,1,0,407812,201500001,1,1,0,L,452657,R,True


In [9]:
# Look up the batter's name. This is an inner join (pandas' default `how`), so
# pitches whose batter_id is absent from player_names are dropped.
mlb_df = mlb_df.merge(player_names, how='inner', left_on='batter_id', right_on='id')
mlb_df.head(10)

Unnamed: 0,type_confidence,zone,code,type,pitch_type,event_num,b_score,ab_id,b_count,s_count,outs,pitch_num,on_1b,on_2b,on_3b,batter_id,g_id,inning,o,p_score,p_throws,pitcher_id,stand,top,id,first_name,last_name
0,2.0,3,C,S,FF,3,0,2015000001,0,0,0,1,0,0,0,572761,201500001,1,1,0,L,452657,L,True,572761,Matt,Carpenter
1,2.0,5,S,S,FF,4,0,2015000001,0,1,0,2,0,0,0,572761,201500001,1,1,0,L,452657,L,True,572761,Matt,Carpenter
2,2.0,1,F,S,FF,5,0,2015000001,0,2,0,3,0,0,0,572761,201500001,1,1,0,L,452657,L,True,572761,Matt,Carpenter
3,2.0,13,B,B,FF,6,0,2015000001,0,2,0,4,0,0,0,572761,201500001,1,1,0,L,452657,L,True,572761,Matt,Carpenter
4,2.0,13,B,B,CU,7,0,2015000001,1,2,0,5,0,0,0,572761,201500001,1,1,0,L,452657,L,True,572761,Matt,Carpenter
5,2.0,6,X,X,FF,8,0,2015000001,2,2,0,6,0,0,0,572761,201500001,1,1,0,L,452657,L,True,572761,Matt,Carpenter
6,2.0,4,E,X,FC,106,1,2015000014,0,0,2,1,1,0,1,572761,201500001,2,2,0,L,452657,L,True,572761,Matt,Carpenter
7,2.0,13,B,B,FF,282,2,2015000035,0,0,0,1,0,0,0,572761,201500001,5,0,0,L,452657,L,True,572761,Matt,Carpenter
8,2.0,13,C,S,FC,283,2,2015000035,1,0,0,2,0,0,0,572761,201500001,5,0,0,L,452657,L,True,572761,Matt,Carpenter
9,2.0,14,B,B,SI,284,2,2015000035,1,1,0,3,0,0,0,572761,201500001,5,0,0,L,452657,L,True,572761,Matt,Carpenter


In [10]:
# rename and drop columns
# Tag the name columns as the batter's and discard the redundant join key.
# Written as one non-mutating chain (no inplace=True) so the cell produces a
# fresh frame instead of editing the one displayed by the previous cell.
mlb_df = (
    mlb_df
    .rename(columns={'first_name':'batter_fn', 'last_name':'batter_ln'})
    .drop(columns=['id'])
)
mlb_df.head(10)

Unnamed: 0,type_confidence,zone,code,type,pitch_type,event_num,b_score,ab_id,b_count,s_count,outs,pitch_num,on_1b,on_2b,on_3b,batter_id,g_id,inning,o,p_score,p_throws,pitcher_id,stand,top,batter_fn,batter_ln
0,2.0,3,C,S,FF,3,0,2015000001,0,0,0,1,0,0,0,572761,201500001,1,1,0,L,452657,L,True,Matt,Carpenter
1,2.0,5,S,S,FF,4,0,2015000001,0,1,0,2,0,0,0,572761,201500001,1,1,0,L,452657,L,True,Matt,Carpenter
2,2.0,1,F,S,FF,5,0,2015000001,0,2,0,3,0,0,0,572761,201500001,1,1,0,L,452657,L,True,Matt,Carpenter
3,2.0,13,B,B,FF,6,0,2015000001,0,2,0,4,0,0,0,572761,201500001,1,1,0,L,452657,L,True,Matt,Carpenter
4,2.0,13,B,B,CU,7,0,2015000001,1,2,0,5,0,0,0,572761,201500001,1,1,0,L,452657,L,True,Matt,Carpenter
5,2.0,6,X,X,FF,8,0,2015000001,2,2,0,6,0,0,0,572761,201500001,1,1,0,L,452657,L,True,Matt,Carpenter
6,2.0,4,E,X,FC,106,1,2015000014,0,0,2,1,1,0,1,572761,201500001,2,2,0,L,452657,L,True,Matt,Carpenter
7,2.0,13,B,B,FF,282,2,2015000035,0,0,0,1,0,0,0,572761,201500001,5,0,0,L,452657,L,True,Matt,Carpenter
8,2.0,13,C,S,FC,283,2,2015000035,1,0,0,2,0,0,0,572761,201500001,5,0,0,L,452657,L,True,Matt,Carpenter
9,2.0,14,B,B,SI,284,2,2015000035,1,1,0,3,0,0,0,572761,201500001,5,0,0,L,452657,L,True,Matt,Carpenter


In [11]:
# Look up the pitcher's name. This is an inner join (pandas' default `how`), so
# pitches whose pitcher_id is absent from player_names are dropped.
mlb_df = mlb_df.merge(player_names, how='inner', left_on='pitcher_id', right_on='id')
mlb_df.head(10)

Unnamed: 0,type_confidence,zone,code,type,pitch_type,event_num,b_score,ab_id,b_count,s_count,outs,pitch_num,on_1b,on_2b,on_3b,batter_id,g_id,inning,o,p_score,p_throws,pitcher_id,stand,top,batter_fn,batter_ln,id,first_name,last_name
0,2.0,3,C,S,FF,3,0,2015000001,0,0,0,1,0,0,0,572761,201500001,1,1,0,L,452657,L,True,Matt,Carpenter,452657,Jon,Lester
1,2.0,5,S,S,FF,4,0,2015000001,0,1,0,2,0,0,0,572761,201500001,1,1,0,L,452657,L,True,Matt,Carpenter,452657,Jon,Lester
2,2.0,1,F,S,FF,5,0,2015000001,0,2,0,3,0,0,0,572761,201500001,1,1,0,L,452657,L,True,Matt,Carpenter,452657,Jon,Lester
3,2.0,13,B,B,FF,6,0,2015000001,0,2,0,4,0,0,0,572761,201500001,1,1,0,L,452657,L,True,Matt,Carpenter,452657,Jon,Lester
4,2.0,13,B,B,CU,7,0,2015000001,1,2,0,5,0,0,0,572761,201500001,1,1,0,L,452657,L,True,Matt,Carpenter,452657,Jon,Lester
5,2.0,6,X,X,FF,8,0,2015000001,2,2,0,6,0,0,0,572761,201500001,1,1,0,L,452657,L,True,Matt,Carpenter,452657,Jon,Lester
6,2.0,4,E,X,FC,106,1,2015000014,0,0,2,1,1,0,1,572761,201500001,2,2,0,L,452657,L,True,Matt,Carpenter,452657,Jon,Lester
7,2.0,13,B,B,FF,282,2,2015000035,0,0,0,1,0,0,0,572761,201500001,5,0,0,L,452657,L,True,Matt,Carpenter,452657,Jon,Lester
8,2.0,13,C,S,FC,283,2,2015000035,1,0,0,2,0,0,0,572761,201500001,5,0,0,L,452657,L,True,Matt,Carpenter,452657,Jon,Lester
9,2.0,14,B,B,SI,284,2,2015000035,1,1,0,3,0,0,0,572761,201500001,5,0,0,L,452657,L,True,Matt,Carpenter,452657,Jon,Lester


In [12]:
# rename and drop columns
# Tag the name columns as the pitcher's, discard the redundant join key, then
# order the pitches by at-bat and by event number within each at-bat.
# Written as one non-mutating chain (no inplace=True).
mlb_df = (
    mlb_df
    .rename(columns={'first_name':'pitcher_fn', 'last_name':'pitcher_ln'})
    .drop(columns=['id'])
    .sort_values(by=['ab_id', 'event_num'], ascending=[True, True])
)
mlb_df.head(10)

Unnamed: 0,type_confidence,zone,code,type,pitch_type,event_num,b_score,ab_id,b_count,s_count,outs,pitch_num,on_1b,on_2b,on_3b,batter_id,g_id,inning,o,p_score,p_throws,pitcher_id,stand,top,batter_fn,batter_ln,pitcher_fn,pitcher_ln
0,2.0,3,C,S,FF,3,0,2015000001,0,0,0,1,0,0,0,572761,201500001,1,1,0,L,452657,L,True,Matt,Carpenter,Jon,Lester
1,2.0,5,S,S,FF,4,0,2015000001,0,1,0,2,0,0,0,572761,201500001,1,1,0,L,452657,L,True,Matt,Carpenter,Jon,Lester
2,2.0,1,F,S,FF,5,0,2015000001,0,2,0,3,0,0,0,572761,201500001,1,1,0,L,452657,L,True,Matt,Carpenter,Jon,Lester
3,2.0,13,B,B,FF,6,0,2015000001,0,2,0,4,0,0,0,572761,201500001,1,1,0,L,452657,L,True,Matt,Carpenter,Jon,Lester
4,2.0,13,B,B,CU,7,0,2015000001,1,2,0,5,0,0,0,572761,201500001,1,1,0,L,452657,L,True,Matt,Carpenter,Jon,Lester
5,2.0,6,X,X,FF,8,0,2015000001,2,2,0,6,0,0,0,572761,201500001,1,1,0,L,452657,L,True,Matt,Carpenter,Jon,Lester
203,2.0,13,B,B,FF,12,0,2015000002,0,0,1,1,0,0,0,518792,201500001,1,1,0,L,452657,L,True,Jason,Heyward,Jon,Lester
204,0.778,7,D,X,FC,13,0,2015000002,1,0,1,2,0,0,0,518792,201500001,1,1,0,L,452657,L,True,Jason,Heyward,Jon,Lester
271,2.0,14,B,B,FF,17,0,2015000003,0,0,1,1,0,1,0,407812,201500001,1,1,0,L,452657,R,True,Matt,Holliday,Jon,Lester
272,0.648,13,B,B,FF,18,0,2015000003,1,0,1,2,0,1,0,407812,201500001,1,1,0,L,452657,R,True,Matt,Holliday,Jon,Lester


In [13]:
# reset the index
# drop=True discards the old (post-sort, non-contiguous) index directly instead
# of materialising it as an 'index' column only to delete it again.
mlb_df = mlb_df.reset_index(drop=True)
mlb_df.head(10)

Unnamed: 0,type_confidence,zone,code,type,pitch_type,event_num,b_score,ab_id,b_count,s_count,outs,pitch_num,on_1b,on_2b,on_3b,batter_id,g_id,inning,o,p_score,p_throws,pitcher_id,stand,top,batter_fn,batter_ln,pitcher_fn,pitcher_ln
0,2.0,3,C,S,FF,3,0,2015000001,0,0,0,1,0,0,0,572761,201500001,1,1,0,L,452657,L,True,Matt,Carpenter,Jon,Lester
1,2.0,5,S,S,FF,4,0,2015000001,0,1,0,2,0,0,0,572761,201500001,1,1,0,L,452657,L,True,Matt,Carpenter,Jon,Lester
2,2.0,1,F,S,FF,5,0,2015000001,0,2,0,3,0,0,0,572761,201500001,1,1,0,L,452657,L,True,Matt,Carpenter,Jon,Lester
3,2.0,13,B,B,FF,6,0,2015000001,0,2,0,4,0,0,0,572761,201500001,1,1,0,L,452657,L,True,Matt,Carpenter,Jon,Lester
4,2.0,13,B,B,CU,7,0,2015000001,1,2,0,5,0,0,0,572761,201500001,1,1,0,L,452657,L,True,Matt,Carpenter,Jon,Lester
5,2.0,6,X,X,FF,8,0,2015000001,2,2,0,6,0,0,0,572761,201500001,1,1,0,L,452657,L,True,Matt,Carpenter,Jon,Lester
6,2.0,13,B,B,FF,12,0,2015000002,0,0,1,1,0,0,0,518792,201500001,1,1,0,L,452657,L,True,Jason,Heyward,Jon,Lester
7,0.778,7,D,X,FC,13,0,2015000002,1,0,1,2,0,0,0,518792,201500001,1,1,0,L,452657,L,True,Jason,Heyward,Jon,Lester
8,2.0,14,B,B,FF,17,0,2015000003,0,0,1,1,0,1,0,407812,201500001,1,1,0,L,452657,R,True,Matt,Holliday,Jon,Lester
9,0.648,13,B,B,FF,18,0,2015000003,1,0,1,2,0,1,0,407812,201500001,1,1,0,L,452657,R,True,Matt,Holliday,Jon,Lester


## Let's play with a single game's data, so we have a smaller dataset while we work things out

In [14]:
# Restrict to the game that the first row of mlb_df belongs to.
first_game_id = mlb_df['g_id'].iloc[0]
is_first_game = mlb_df['g_id'] == first_game_id
game_1_df = mlb_df[is_first_game]
game_1_df.head(10)

Unnamed: 0,type_confidence,zone,code,type,pitch_type,event_num,b_score,ab_id,b_count,s_count,outs,pitch_num,on_1b,on_2b,on_3b,batter_id,g_id,inning,o,p_score,p_throws,pitcher_id,stand,top,batter_fn,batter_ln,pitcher_fn,pitcher_ln
0,2.0,3,C,S,FF,3,0,2015000001,0,0,0,1,0,0,0,572761,201500001,1,1,0,L,452657,L,True,Matt,Carpenter,Jon,Lester
1,2.0,5,S,S,FF,4,0,2015000001,0,1,0,2,0,0,0,572761,201500001,1,1,0,L,452657,L,True,Matt,Carpenter,Jon,Lester
2,2.0,1,F,S,FF,5,0,2015000001,0,2,0,3,0,0,0,572761,201500001,1,1,0,L,452657,L,True,Matt,Carpenter,Jon,Lester
3,2.0,13,B,B,FF,6,0,2015000001,0,2,0,4,0,0,0,572761,201500001,1,1,0,L,452657,L,True,Matt,Carpenter,Jon,Lester
4,2.0,13,B,B,CU,7,0,2015000001,1,2,0,5,0,0,0,572761,201500001,1,1,0,L,452657,L,True,Matt,Carpenter,Jon,Lester
5,2.0,6,X,X,FF,8,0,2015000001,2,2,0,6,0,0,0,572761,201500001,1,1,0,L,452657,L,True,Matt,Carpenter,Jon,Lester
6,2.0,13,B,B,FF,12,0,2015000002,0,0,1,1,0,0,0,518792,201500001,1,1,0,L,452657,L,True,Jason,Heyward,Jon,Lester
7,0.778,7,D,X,FC,13,0,2015000002,1,0,1,2,0,0,0,518792,201500001,1,1,0,L,452657,L,True,Jason,Heyward,Jon,Lester
8,2.0,14,B,B,FF,17,0,2015000003,0,0,1,1,0,1,0,407812,201500001,1,1,0,L,452657,R,True,Matt,Holliday,Jon,Lester
9,0.648,13,B,B,FF,18,0,2015000003,1,0,1,2,0,1,0,407812,201500001,1,1,0,L,452657,R,True,Matt,Holliday,Jon,Lester


In [15]:
# Summary counts for the single-game subset: distinct at-bats, pitch rows,
# and their ratio.
game_1_ab_id_keys = game_1_df['ab_id'].unique()
game_1_num_atbats = game_1_ab_id_keys.size
game_1_num_pitches = game_1_df.shape[0]
print(f"Number of at bats in game #1: {game_1_num_atbats}")
print(f"Number of pitches in game #1: {game_1_num_pitches}")
print(f"Average pitches/at bat in game #1: {game_1_num_pitches/game_1_num_atbats}")

Number of at bats in game #1: 75
Number of pitches in game #1: 314
Average pitches/at bat in game #1: 4.1866666666666665


In [16]:
# create ab -> pitch dict
# One groupby pass over the frame replaces a full boolean scan of mlb_df per
# at-bat, which was quadratic in practice on the full dataset.
# sort=False keeps at-bats in order of first appearance (what .unique() gave),
# and rows keep their existing order within each at-bat.
ab_pitch_dict = (
    mlb_df
    .groupby('ab_id', sort=False)['pitch_type']
    .agg(list)
    .to_dict()
)
# peek at the first 10
list(ab_pitch_dict.items())[:10]

[(2015000001, ['FF', 'FF', 'FF', 'FF', 'CU', 'FF']),
 (2015000002, ['FF', 'FC']),
 (2015000003, ['FF', 'FF', 'FF']),
 (2015000004, ['SI', 'FF', 'CU']),
 (2015000005, ['FF', 'FF', 'FF', 'FF', 'FC']),
 (2015000006, ['SI', 'FC']),
 (2015000007, ['FF', 'CU', 'FC', 'FC']),
 (2015000008, ['CU', 'FC', 'FF', 'CU', 'FC']),
 (2015000009, ['FC', 'CU', 'FF', 'FC']),
 (2015000010, ['FC', 'FC', 'CH', 'FF'])]

In [3]:
# import json
# with open("ab_pitch_dict.json","w") as f:
#     f.write(json.dumps(ab_pitch_dict))

In [19]:
# serialize and deserialize dict
# import pickle
# with open("ab_pitch_dict.pkl","wb") as f:
#     pickle.dump(ab_pitch_dict,f)
    
# with open("ab_pitch_dict.pkl","rb") as f:
#     ab_pitch_dict2 = pickle.load(f)
    
# list(ab_pitch_dict2.items())[:10]

[(2015000001, ['FF', 'FF', 'FF', 'FF', 'CU', 'FF']),
 (2015000002, ['FF', 'FC']),
 (2015000003, ['FF', 'FF', 'FF']),
 (2015000004, ['SI', 'FF', 'CU']),
 (2015000005, ['FF', 'FF', 'FF', 'FF', 'FC']),
 (2015000006, ['SI', 'FC']),
 (2015000007, ['FF', 'CU', 'FC', 'FC']),
 (2015000008, ['CU', 'FC', 'FF', 'CU', 'FC']),
 (2015000009, ['FC', 'CU', 'FF', 'FC']),
 (2015000010, ['FC', 'FC', 'CH', 'FF'])]

In [17]:
# create ab -> pitch dict
# One groupby pass replaces a boolean scan of game_1_df per at-bat.
# sort=False keeps at-bats in order of first appearance (what .unique() gave),
# and rows keep their existing order within each at-bat.
game_1_ab_pitch_dict = (
    game_1_df
    .groupby('ab_id', sort=False)['pitch_type']
    .agg(list)
    .to_dict()
)
# peek at the first 10
list(game_1_ab_pitch_dict.items())[:10]

[(2015000001, ['FF', 'FF', 'FF', 'FF', 'CU', 'FF']),
 (2015000002, ['FF', 'FC']),
 (2015000003, ['FF', 'FF', 'FF']),
 (2015000004, ['SI', 'FF', 'CU']),
 (2015000005, ['FF', 'FF', 'FF', 'FF', 'FC']),
 (2015000006, ['SI', 'FC']),
 (2015000007, ['FF', 'CU', 'FC', 'FC']),
 (2015000008, ['CU', 'FC', 'FF', 'CU', 'FC']),
 (2015000009, ['FC', 'CU', 'FF', 'FC']),
 (2015000010, ['FC', 'FC', 'CH', 'FF'])]

In [None]:
# # get max pitch count
# max_pitch_count = max(len(x) for x in game_1_ab_pitch_dict.values())
# max_pitch_count
#
# # pad each at bat with a placeholder value so all the lists are the same length
#
# pad_value = 'PAD'
# game_1_ab_pitch_padded_dict = {k:(v + [pad_value] * (max_pitch_count - len(v))) for k,v in game_1_ab_pitch_dict.items()}
# list(game_1_ab_pitch_padded_dict.items())[:10]

In [None]:
# Show the pitch-type sequence stored for at-bat 2015000001.
game_1_ab_pitch_dict[2015000001]

In [None]:
def generate_pairs(tokens, key_size = 1, pad_value = 'PAD'):
    """Yield ``[context, next_token]`` pairs for every token in a sequence.

    The sequence is left-padded with ``key_size`` copies of ``pad_value`` so
    that the very first token also has a full-length context.

    Args:
        tokens: Iterable of tokens (e.g. the pitch types of one at-bat). Any
            iterable is accepted; it is materialised into a list internally.
        key_size: Number of preceding tokens that make up each context.
        pad_value: Filler used for context positions before the first token.

    Yields:
        A two-element list ``[key, value]`` where ``key`` is a tuple of the
        ``key_size`` tokens preceding ``value``.

    Raises:
        ValueError: If ``key_size`` is negative. Because this is a generator,
            the error surfaces when iteration starts.
    """
    if key_size < 0:
        # A negative size would make the slices below silently produce
        # meaningless contexts instead of failing.
        raise ValueError(f"key_size must be non-negative, got {key_size}")

    # list(...) lets callers pass tuples, generators, etc., not only lists.
    padded_tokens = [pad_value] * key_size + list(tokens)
    for start in range(len(padded_tokens) - key_size):
        stop = start + key_size
        yield [tuple(padded_tokens[start:stop]), padded_tokens[stop]]

In [None]:
def generate_next_pitch_freqs(lookup_dict):
    """Yield ``(context, Counter)`` pairs tallying what followed each context.

    Args:
        lookup_dict: Mapping from a context key to an iterable of the values
            observed after that context (repeats allowed).

    Yields:
        A tuple ``(previous_pitches, pitch_freqs)`` where ``pitch_freqs`` is a
        ``collections.Counter`` of how often each value occurred.
    """
    for previous_pitches, next_pitches in lookup_dict.items():
        # Counter tallies an iterable directly; no manual increment loop needed.
        yield (previous_pitches, Counter(next_pitches))

In [None]:
def generate_next_pitch_probs(next_pitch_freq):
    """Yield ``(context, {pitch: probability})`` pairs from frequency counts.

    This function used to be defined twice in consecutive cells with identical
    bodies; only this single definition is kept.

    Args:
        next_pitch_freq: Mapping from a context key to a mapping of
            ``pitch -> count`` (e.g. a ``collections.Counter``).

    Yields:
        A tuple ``(previous_pitches, pitch_prob)`` where ``pitch_prob`` maps
        each pitch to ``count / total`` for that context.
    """
    for previous_pitches, next_pitch_freqs in next_pitch_freq.items():
        total_pitch_count = sum(next_pitch_freqs.values())
        pitch_prob = {
            pitch: count / total_pitch_count
            for pitch, count in next_pitch_freqs.items()
        }

        yield (previous_pitches, pitch_prob)

In [None]:
# Map each context (the two preceding pitch types, 'PAD'-filled at the start of
# an at-bat) to the list of pitch types that followed it across all at-bats.
lookup_dict = defaultdict(list)
# The loop variable is deliberately NOT called `pitches`: that name is the
# pitches DataFrame loaded earlier, and rebinding it to a list here would break
# any re-run of the cells that clean that frame.
for pitch_sequence in ab_pitch_dict.values():
    for previous_pitches, next_pitch in generate_pairs(pitch_sequence, key_size = 2):
        lookup_dict[previous_pitches].append(next_pitch)

In [None]:
# lookup_dict[('PAD', 'PAD')]

In [None]:
# Materialise the (context, Counter) pairs into a context -> Counter dict.
next_pitch_freq = dict(generate_next_pitch_freqs(lookup_dict))

In [None]:
# Counts stored under the all-padding context, i.e. the context generate_pairs
# produces for the first pitch of each sequence when key_size = 2.
next_pitch_freq[('PAD', 'PAD')]

In [None]:
# Materialise the (context, probabilities) pairs into a context -> {pitch: prob} dict.
next_pitch_prob = dict(generate_next_pitch_probs(next_pitch_freq))

In [None]:
# Display the complete context -> {pitch: probability} mapping.
next_pitch_prob

In [None]:
# Summary counts for the full merged dataset: distinct at-bats, pitch rows,
# and their ratio.
ab_id_keys = mlb_df['ab_id'].unique()
num_atbats = ab_id_keys.size
num_pitches = mlb_df.shape[0]
print(f"Number of at bats: {num_atbats}")
print(f"Number of pitches: {num_pitches}")
print(f"Average pitches/at bat: {num_pitches/num_atbats}")

In [None]:
# Inspect the column dtypes of the merged frame.
mlb_df.dtypes

In [None]:
# sample_ab = mlb_df[mlb_df['ab_id'] == 2015000001]['pitch_type'].tolist()
# sample_ab

In [None]:
# ab_dict = {}
# for ab_id in ab_id_keys:
#     ab_df = final_df[final_df['ab_id'] == ab_id]
#     ab_dict[ab_id] = ab_df

In [None]:
# ab_pitch_dict = {k:mlb_df[mlb_df['ab_id'] == k]['pitch_type'].tolist for k in ab_id_keys}