In [3]:
import pandas as pd

pd.set_option('display.max_columns', None)

def clean_dir(path):
    """Makes a clean directory, removes all files and folders in the specified path"""
    
    if not os.path.exists(path):
        os.mkdir(path)
    
    for filename in os.listdir(path):
        file_path = os.path.join(path, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print(f"Failed to delete {file_path}. Reason: {e}")   

This Notebook will be dedicated to parsing the Play-By-Play data from RetroSheets. This will allow us to gather data on a per-player basis to give us many more features with the hope of deriving more meaningful results from the models we train. The goal for this parser will to give us a boxscore-like DataFrame for each game from 2014-2019. There will be 18 batters for both NL and AL games. For the sake of simplicity, pitching stats are not included in the PBP for now, but this can easily be changed for the future. Let's walk through how we'll parse this data.

We will go through each play of each game, and update statistics for each player after each play. 

The fields of each player will be:

- (Home/Visitor) Player i:
    - player id
    - hits
    - singles
    - doubles
    - triples
    - walks
    - bunts
    - sacrifice hits
    - sacrifice flys
    - RBIs
    - at bats
    - number of stolen bases
    - number times caught stealing
    - number times picked off
    - number of errors
    
Therefore each player has 15 fields associated with them, and since there will be 18 batters, there will be a total of 270 columns of batter data. An example of one of these columns is "Visiting Player i Errors" where "i" is the ith player on the visiting team, which is determined by the "batting" flag in the event file and "Errors" is determined by summing the number of errors that player committed over the course of a single game.

In addition to the 270 batter data columns, there will be one column for the game ID, which be used to get the stats on a per-game basis.

In [4]:
import pandas as pd

column_labels = ["game id",
"visiting team",
"inning",
"batting team",
"outs",
"balls",
"strikes",
"pitch sequence",
"vis score",
"home score",
"batter",
"batter hand",
"res batter",
"res batter hand",
"pitcher",
"pitcher hand",
"res pitcher",
"res pitcher hand",
"catcher",
"first base",
"second base",
"third base",
"shortstop",
"left field",
"center field",
"right field",
"first runner",
"second runner",
"third runner",
"event text",
"leadoff flag",
"pinchhit flag",
"defensive position",
"lineup position",
"event type",
"batter event flag",
"ab flag",
"hit value",
"SH flag",
"SF flag",
"outs on play",
"double play flag",
"triple play flag",
"RBI on play",
"wild pitch flag",
"passed ball flag",
"fielded by",
"batted ball type",
"bunt flag",
"foul flag",
"hit location",
"num errors",
"1st error player",
"1st error type",
"2nd error player",
"2nd error type",
"3rd error player",
"3rd error type",
"batter dest",
"runner on 1st dest",
"runner on 2nd dest",
"runner on 3rd dest",
"play on batter",
"play on runner on 1st",
"play on runner on 2nd",
"play on runner on 3rd",
"SB for runner on 1st flag",
"SB for runner on 2nd flag",
"SB for runner on 3rd flag",
"CS for runner on 1st flag",
"CS for runner on 2nd flag",
"CS for runner on 3rd flag",
"PO for runner on 1st flag",
"PO for runner on 2nd flag",
"PO for runner on 3rd flag",
"Responsible pitcher for runner on 1st",
"Responsible pitcher for runner on 2nd",
"Responsible pitcher for runner on 3rd",
"New Game Flag",
"End Game Flag",
"Pinch-runner on 1st",
"Pinch-runner on 2nd",
"Pinch-runner on 3rd",
"Runner removed for pinch-runner on 1st",
"Runner removed for pinch-runner on 2nd",
"Runner removed for pinch-runner on 3rd",
"Batter removed for pinch-hitter",
"Position of batter removed for pinch-hitter",
"Fielder with First Putout (0 if none)",
"Fielder with Second Putout (0 if none)",
"Fielder with Third Putout (0 if none)",
"Fielder with First Assist (0 if none)",
"Fielder with Second Assist (0 if none)",
"Fielder with Third Assist (0 if none)",
"Fielder with Fourth Assist (0 if none)",
"Fielder with Fifth Assist (0 if none)",
"event num",]

mets_2014 = pd.read_csv('datasets/retro_sheets_pbp_filtered/2014NYN.EVN', names=column_labels)

mets_2014.head()

Unnamed: 0,game id,visiting team,inning,batting team,outs,balls,strikes,pitch sequence,vis score,home score,batter,batter hand,res batter,res batter hand,pitcher,pitcher hand,res pitcher,res pitcher hand,catcher,first base,second base,third base,shortstop,left field,center field,right field,first runner,second runner,third runner,event text,leadoff flag,pinchhit flag,defensive position,lineup position,event type,batter event flag,ab flag,hit value,SH flag,SF flag,outs on play,double play flag,triple play flag,RBI on play,wild pitch flag,passed ball flag,fielded by,batted ball type,bunt flag,foul flag,hit location,num errors,1st error player,1st error type,2nd error player,2nd error type,3rd error player,3rd error type,batter dest,runner on 1st dest,runner on 2nd dest,runner on 3rd dest,play on batter,play on runner on 1st,play on runner on 2nd,play on runner on 3rd,SB for runner on 1st flag,SB for runner on 2nd flag,SB for runner on 3rd flag,CS for runner on 1st flag,CS for runner on 2nd flag,CS for runner on 3rd flag,PO for runner on 1st flag,PO for runner on 2nd flag,PO for runner on 3rd flag,Responsible pitcher for runner on 1st,Responsible pitcher for runner on 2nd,Responsible pitcher for runner on 3rd,New Game Flag,End Game Flag,Pinch-runner on 1st,Pinch-runner on 2nd,Pinch-runner on 3rd,Runner removed for pinch-runner on 1st,Runner removed for pinch-runner on 2nd,Runner removed for pinch-runner on 3rd,Batter removed for pinch-hitter,Position of batter removed for pinch-hitter,Fielder with First Putout (0 if none),Fielder with Second Putout (0 if none),Fielder with Third Putout (0 if none),Fielder with First Assist (0 if none),Fielder with Second Assist (0 if none),Fielder with Third Assist (0 if none),Fielder with Fourth Assist (0 if none),Fielder with Fifth Assist (0 if none),event num
0,NYN201403310,WAS,1,0,0,2,2,BCSFBFFX,0,0,spand001,L,spand001,L,gee-d001,R,gee-d001,R,darnt001,davii001,youne003,wrigd002,tejar001,browa003,lagaj001,granc001,,,,5/P,T,F,8,1,2,T,T,0,F,F,1,F,F,0,F,F,5,P,F,F,,0,0,N,0,N,0,N,0,0,0,0,5.0,,,,F,F,F,F,F,F,F,F,F,,,,T,F,F,F,F,,,,,0,5,0,0,0,0,0,0,0,1
1,NYN201403310,WAS,1,0,1,1,1,BFX,0,0,zimmr001,R,zimmr001,R,gee-d001,R,gee-d001,R,darnt001,davii001,youne003,wrigd002,tejar001,browa003,lagaj001,granc001,,,,S7/G+,F,F,5,2,20,T,T,1,F,F,0,F,F,0,F,F,7,G,F,F,,0,0,N,0,N,0,N,1,0,0,0,,,,,F,F,F,F,F,F,F,F,F,,,,F,F,F,F,F,,,,,0,0,0,0,0,0,0,0,0,2
2,NYN201403310,WAS,1,0,1,2,2,BBCC1S,0,0,wertj001,R,wertj001,R,gee-d001,R,gee-d001,R,darnt001,davii001,youne003,wrigd002,tejar001,browa003,lagaj001,granc001,zimmr001,,,K,F,F,9,3,3,T,T,0,F,F,1,F,F,0,F,F,0,,F,F,,0,0,N,0,N,0,N,0,1,0,0,2.0,,,,F,F,F,F,F,F,F,F,F,gee-d001,,,F,F,F,F,F,,,,,0,2,0,0,0,0,0,0,0,3
3,NYN201403310,WAS,1,0,2,3,0,BBBX,0,0,ramow001,R,ramow001,R,gee-d001,R,gee-d001,R,darnt001,davii001,youne003,wrigd002,tejar001,browa003,lagaj001,granc001,zimmr001,,,53/G-,F,F,2,4,2,T,T,0,F,F,1,F,F,0,F,F,5,G,F,F,,0,0,N,0,N,0,N,0,1,0,0,53.0,,,,F,F,F,F,F,F,F,F,F,gee-d001,,,F,F,F,F,F,,,,,0,3,0,0,5,0,0,0,0,4
4,NYN201403310,WAS,1,1,0,2,2,CCBBS,0,0,youne003,L,youne003,L,stras001,R,stras001,R,ramow001,laroa001,renda001,zimmr001,desmi001,harpb003,spand001,wertj001,,,,K23,T,F,4,1,3,T,T,0,F,F,1,F,F,0,F,F,0,,F,F,,0,0,N,0,N,0,N,0,0,0,0,23.0,,,,F,F,F,F,F,F,F,F,F,,,,F,F,F,F,F,,,,,0,3,0,0,2,0,0,0,0,5


In [2]:
mets_2014.shape

(6263, 97)

In [6]:
# Let's start by getting Starting Lineups
import os

gl_path = 'datasets/retro_sheet_gls/'
sl_path = 'datasets/starting_lineups/'
ignore = ['.ipynb_checkpoints']

if 'starting_lineups' not in os.listdir('datasets/'):
    os.mkdir('datasets/starting_lineups')

for file in os.listdir(gl_path):
    if file not in ignore:
        gl = pd.read_csv(gl_path + file)
        gl['Game ID'] = gl['Home Team'].str.cat(
                gl['Date'].astype('string').str.cat(
                gl['Number of game'].astype('string')))
        starting_lineups = gl.filter(items=['Game ID', 'Visiting Team Score', 'Home Team Score'] + 
                                     [f'Visiting Team Player {i+1} ID' for i in range(9)] +
                                     ['Visiting Team Starting Pitcher ID'] +
                                     [f'Home Team Player {i+1} ID' for i in range(9)] +
                                     ['Home Team Starting Pitcher ID'],
                                    axis=1)
        starting_lineups.to_csv(sl_path + 'SL' + file[2:6] + '.csv')

In [4]:
STARTING_LINEUPS_DIR = './datasets/starting_lineups/'

from datetime import date

def get_starting_lineups(directory=STARTING_LINEUPS_DIR):
    """Function that returns a dataframe object of all of starting lineups"""
    
    def generate_date(game_id):
        """Helper function that parses a game id for the date"""
        
        # Parse the game_id
        year = int(game_id[3:7])
        month = int(game_id[7:9])
        day = int(game_id[9:11])
        
        return date(year, month, day)
    
    def generate_game_number(game_id):
        """Helper function that parses a game id and returns the game number"""
        return int(game_id[-1])
    
    
    dataframes = []
    for file in os.listdir(directory):
        dataframe = pd.read_csv(os.path.join(directory, file))
        dataframe['Date'] = dataframe.apply(lambda row : generate_date(row['Game ID']), axis=1) 
        dataframe['Game Number'] = dataframe.apply(lambda row : generate_game_number(row['Game ID']), axis=1) 
        dataframes.append(dataframe)

    return pd.concat(dataframes).drop(columns=['Unnamed: 0']).sort_values(by=['Date', 'Game Number'])

In [7]:
# get all play-by-play data
pbp_data_path = 'datasets/retro_sheets_pbp_filtered/'
pbp_data = []
for file in os.listdir(pbp_data_path):
    if '.EVN' in file or '.EVA' in file:
        pbp_data.append(pd.read_csv(pbp_data_path + file, names=column_labels))
pbp_data = pd.concat(pbp_data, ignore_index=True)

pbp_data

Unnamed: 0,game id,visiting team,inning,batting team,outs,balls,strikes,pitch sequence,vis score,home score,batter,batter hand,res batter,res batter hand,pitcher,pitcher hand,res pitcher,res pitcher hand,catcher,first base,second base,third base,shortstop,left field,center field,right field,first runner,second runner,third runner,event text,leadoff flag,pinchhit flag,defensive position,lineup position,event type,batter event flag,ab flag,hit value,SH flag,SF flag,outs on play,double play flag,triple play flag,RBI on play,wild pitch flag,passed ball flag,fielded by,batted ball type,bunt flag,foul flag,hit location,num errors,1st error player,1st error type,2nd error player,2nd error type,3rd error player,3rd error type,batter dest,runner on 1st dest,runner on 2nd dest,runner on 3rd dest,play on batter,play on runner on 1st,play on runner on 2nd,play on runner on 3rd,SB for runner on 1st flag,SB for runner on 2nd flag,SB for runner on 3rd flag,CS for runner on 1st flag,CS for runner on 2nd flag,CS for runner on 3rd flag,PO for runner on 1st flag,PO for runner on 2nd flag,PO for runner on 3rd flag,Responsible pitcher for runner on 1st,Responsible pitcher for runner on 2nd,Responsible pitcher for runner on 3rd,New Game Flag,End Game Flag,Pinch-runner on 1st,Pinch-runner on 2nd,Pinch-runner on 3rd,Runner removed for pinch-runner on 1st,Runner removed for pinch-runner on 2nd,Runner removed for pinch-runner on 3rd,Batter removed for pinch-hitter,Position of batter removed for pinch-hitter,Fielder with First Putout (0 if none),Fielder with Second Putout (0 if none),Fielder with Third Putout (0 if none),Fielder with First Assist (0 if none),Fielder with Second Assist (0 if none),Fielder with Third Assist (0 if none),Fielder with Fourth Assist (0 if none),Fielder with Fifth Assist (0 if none),event num
0,ANA201403310,SEA,1,0,0,3,2,CFBFBBX,0,0,almoa001,L,almoa001,L,weavj003,R,weavj003,R,iannc001,pujoa001,kendh001,freed001,aybae001,hamij003,troum001,calhk001,,,,S9/G+,T,F,8,1,20,T,T,1,F,F,0,F,F,0,F,F,9,G,F,F,,0,0,N,0,N,0,N,1,0,0,0,,,,,F,F,F,F,F,F,F,F,F,,,,T,F,F,F,F,,,,,0,0,0,0,0,0,0,0,0,1
1,ANA201403310,SEA,1,0,0,2,1,B11BC>B,0,0,millb002,L,millb002,L,weavj003,R,weavj003,R,iannc001,pujoa001,kendh001,freed001,aybae001,hamij003,troum001,calhk001,almoa001,,,CS2(26),F,F,6,2,6,F,F,0,F,F,1,F,F,0,F,F,0,,F,F,,0,0,N,0,N,0,N,0,0,0,0,,26,,,F,F,F,T,F,F,F,F,F,weavj003,,,F,F,F,F,F,,,,,0,6,0,0,2,0,0,0,0,2
2,ANA201403310,SEA,1,0,1,3,1,B11BC>B.X,0,0,millb002,L,millb002,L,weavj003,R,weavj003,R,iannc001,pujoa001,kendh001,freed001,aybae001,hamij003,troum001,calhk001,,,,5/P5F,F,F,6,2,2,T,T,0,F,F,1,F,F,0,F,F,5,P,F,T,5F,0,0,N,0,N,0,N,0,0,0,0,5.0,,,,F,F,F,F,F,F,F,F,F,,,,F,F,F,F,F,,,,,0,5,0,0,0,0,0,0,0,3
3,ANA201403310,SEA,1,0,2,3,2,BCBSBX,0,0,canor001,L,canor001,L,weavj003,R,weavj003,R,iannc001,pujoa001,kendh001,freed001,aybae001,hamij003,troum001,calhk001,,,,63/G,F,F,4,3,2,T,T,0,F,F,1,F,F,0,F,F,6,G,F,F,,0,0,N,0,N,0,N,0,0,0,0,63.0,,,,F,F,F,F,F,F,F,F,F,,,,F,F,F,F,F,,,,,0,3,0,0,6,0,0,0,0,4
4,ANA201403310,SEA,1,1,0,2,0,BBX,0,0,calhk001,L,calhk001,L,hernf002,R,hernf002,R,zunim001,smoaj001,canor001,seagk001,millb002,ackld001,almoa001,saunm001,,,,S8/L,T,F,9,1,20,T,T,1,F,F,0,F,F,0,F,F,8,L,F,F,,0,0,N,0,N,0,N,1,0,0,0,,,,,F,F,F,F,F,F,F,F,F,,,,F,F,F,F,F,,,,,0,0,0,0,0,0,0,0,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1144617,WAS201909290,CLE,8,1,1,1,2,SC*BS,2,8,steva001,L,steva001,L,carrc003,R,carrc003,R,haase001,bauej001,freem002,flahr001,chany001,alleg002,velaa001,zimmb001,parrg001,,difow001,K,F,F,7,2,3,T,T,0,F,F,1,F,F,0,F,F,0,,F,F,,0,0,N,0,N,0,N,0,1,0,3,2.0,,,,F,F,F,F,F,F,F,F,F,carrc003,,carrc003,F,F,F,F,F,,,,,0,2,0,0,0,0,0,0,0,76
1144618,WAS201909290,CLE,8,1,2,0,0,X,2,8,barrt004,R,barrt004,R,carrc003,R,carrc003,R,haase001,bauej001,freem002,flahr001,chany001,alleg002,velaa001,zimmb001,parrg001,,difow001,7/F,F,F,2,3,2,T,T,0,F,F,1,F,F,0,F,F,7,F,F,F,,0,0,N,0,N,0,N,0,1,0,3,7.0,,,,F,F,F,F,F,F,F,F,F,carrc003,,carrc003,F,F,F,F,F,,,,,0,7,0,0,0,0,0,0,0,77
1144619,WAS201909290,CLE,9,0,0,3,2,BBCFBB,2,8,bauej001,L,bauej001,L,fedde001,R,fedde001,R,barrt004,adamm002,dozib001,sanca007,difow001,steva001,taylm002,parrg001,,,,W,T,F,3,7,14,T,F,0,F,F,0,F,F,0,F,F,0,,F,F,,0,0,N,0,N,0,N,1,0,0,0,,,,,F,F,F,F,F,F,F,F,F,,,,F,F,F,F,F,,,,,0,0,0,0,0,0,0,0,0,78
1144620,WAS201909290,CLE,9,0,0,2,1,.BBFX,2,8,merco003,R,merco003,R,fedde001,R,fedde001,R,barrt004,adamm002,dozib001,sanca007,difow001,steva001,taylm002,parrg001,bauej001,,,54(1)3/GDP,F,T,11,8,2,T,T,0,F,F,2,T,F,0,F,F,5,G,F,F,,0,0,N,0,N,0,N,0,0,0,0,43.0,54,,,F,F,F,F,F,F,F,F,F,fedde001,,,F,F,F,F,F,,,,carrc003,1,4,3,0,5,4,0,0,0,79


In [6]:
# get all starting lineups
sl_path = 'datasets/starting_lineups/'
ignore = ['.ipynb_checkpoints']

sl = list()

for file_num in range(2014, 2020, 1):
    sl.append(pd.read_csv(f'datasets/starting_lineups/SL{file_num}.csv'))  
        
starting_lineups = pd.concat(sl, ignore_index=True).drop(columns='Unnamed: 0')

starting_lineups

Unnamed: 0,Game ID,Visiting Team Score,Home Team Score,Visiting Team Player 1 ID,Visiting Team Player 2 ID,Visiting Team Player 3 ID,Visiting Team Player 4 ID,Visiting Team Player 5 ID,Visiting Team Player 6 ID,Visiting Team Player 7 ID,...,Home Team Player 1 ID,Home Team Player 2 ID,Home Team Player 3 ID,Home Team Player 4 ID,Home Team Player 5 ID,Home Team Player 6 ID,Home Team Player 7 ID,Home Team Player 8 ID,Home Team Player 9 ID,Home Team Starting Pitcher ID
0,ARI201403220,3,1,puigy001,turnj001,ramih003,gonza003,vanss001,uribj002,ethia001,...,polla001,hilla001,goldp001,pradm001,trumm001,montm001,owinc001,parrg001,milew001,milew001
1,ARI201403230,7,5,gordd002,puigy001,ramih003,gonza003,ethia001,ellia001,baxtm001,...,polla001,hilla001,goldp001,pradm001,montm001,trumm001,parrg001,gregd001,cahit001,cahit001
2,SDN201403300,1,3,crawc002,puigy001,ramih003,gonza003,ethia001,uribj002,ellia001,...,cabre001,denoc001,headc001,gyorj001,alony001,medit001,venaw001,river003,casha001,casha001
3,ANA201403310,10,3,almoa001,millb002,canor001,smoaj001,morrl001,seagk001,saunm001,...,calhk001,troum001,pujoa001,hamij003,freed001,ibanr001,kendh001,iannc001,aybae001,weavj003
4,BAL201403310,1,2,navad002,pedrd001,ortid001,napom001,carpm001,sizeg001,bogax001,...,markn001,hardj003,jonea003,davic003,cruzn002,wietm001,yound003,flahr001,schoj001,tillc001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14572,CHA201909290,3,5,reyev001,mercj002,cabrm001,hickj001,rodrr009,stewc002,demet001,...,sancc001,andet001,abrej003,moncy001,jimee001,collz001,castw002,palkd001,engea001,detwr001
14573,KCA201909290,4,5,wadel001,polaj001,sanom001,cronc002,cavej001,schoj001,castj006,...,merrw001,solej001,dozih001,gorda001,mcbrr001,cuthc001,mejie001,arteh001,dinin001,lopej004
14574,SEA201909290,1,3,semim001,profj001,piscs001,davik003,brows003,phegj001,neuss001,...,longs001,crawj002,nolaa002,seagk001,lewik001,narvo001,voged001,smitm007,gordd002,dunnj002
14575,TEX201909290,1,6,lemad001,judga001,gardb001,stanm004,torrg001,sancg002,gregd001,...,choos001,andre001,calhw001,santd001,odorr001,solan001,guzmr001,deshd002,trevj001,lynnl001


In [None]:
# calculate a batters stats for a single game
def calculate_stats(data, player_id):
    stats = dict()
    # calculate batting stats
    h, s, d, t, hr, w = 0, 0, 0, 0, 0, 0
    for event in data[data['batter'] == player_id]['event type']:
        if event == 20:
            s, h = s+1, h+1
        elif event == 21:
            d, h = d+1, h+1
        elif event == 22:
            t, h = t+1, h+1
        elif event == 23:
            hr, h = hr+1, h+1
        elif event == 14 or event == 15:
            w += 1
    # calculate at-bats, sh, sh, RBIs and bunts
    tf = ['ab flag', 'SH flag', 'SF flag', 'bunt flag']
    tf_val = [0,0,0,0]
    for label_index in range(len(tf)):
        tf_val[label_index] += data[(data['batter'] == player_id) & (data[tf[label_index]] == 'T')].shape[0]
    ab, sh, sf, b = tf_val
    rbi = sum(data[data['batter'] == player_id]['RBI on play'])
    # calculate base running stats
    sb, cs, po = 0, 0, 0
    sb += data[(player_id == data['first runner']) & ('T' == data['SB for runner on 1st flag'])].shape[0]
    cs += data[(player_id == data['first runner']) & ('T' == data['CS for runner on 1st flag'])].shape[0]
    po += data[(player_id == data['first runner']) & ('T' == data['PO for runner on 1st flag'])].shape[0]

    sb += data[(player_id == data['second runner']) & ('T' == data['SB for runner on 2nd flag'])].shape[0]
    cs += data[(player_id == data['second runner']) & ('T' == data['CS for runner on 2nd flag'])].shape[0]
    po += data[(player_id == data['second runner']) & ('T' == data['PO for runner on 2nd flag'])].shape[0]

    sb += data[(player_id == data['third runner']) & ('T' == data['SB for runner on 3rd flag'])].shape[0]
    cs += data[(player_id == data['third runner']) & ('T' == data['CS for runner on 3rd flag'])].shape[0]
    po += data[(player_id == data['third runner']) & ('T' == data['PO for runner on 3rd flag'])].shape[0]
    # calculate errors
    err = 0
    err += data[(player_id == data['1st error player'])].shape[0]
    err += data[(player_id == data['2nd error player'])].shape[0]
    err += data[(player_id == data['3rd error player'])].shape[0]
    # fill new dictionary with stats
    stats[f'ID'] = [player_id]
    stats[f'Hits'] = [h]
    stats[f'Singles'] = [s]
    stats[f'Doubles'] = [d]
    stats[f'Triples'] = [t]
    stats[f'Home Runs'] = [hr]
    stats[f'Walks'] = [w]
    stats[f'Bunts'] = [b]
    stats[f'Sacrifice Bunts'] = [sh]
    stats[f'Sacrifice Flies'] = [sf]
    stats[f'RBIs'] = [rbi]
    stats[f'At-bats'] = [ab]
    stats[f'Stolen Bases'] = [sb]
    stats[f'Caught Stealing'] = [cs]
    stats[f'Picked Off'] = [po]
    stats[f'Errors'] = [err]
    
    # return stats
    return stats

In [None]:
def pbp_parser(pbp_data):
    # get column labels for new play-by-play dataframe
    defaults = ['Game ID']
    player_stats = ['Hits', 'Singles',
                    'Doubles', 'Triples', 'Home Runs', 
                    'Walks', 'Bunts', 'Sacrifice Bunts', 
                    'Sacrifice Flies', 'RBIs', 'At-bats', 
                    'Stolen Bases', 'Caught Stealing', 
                    'Picked Off', 'Errors']
    pbp_labels = defaults + player_stats
    
    # make a directory for individual player stats
    players_dir = 'datasets/player_stats/'
    if 'player_stats' not in os.listdir('datasets/'):
        os.mkdir(players_dir)
    
    # loop thru all pbp
    for i in range(pbp_data.shape[0]):
        batter_id = pbp_data.loc[i, 'batter']
        
        batter_filename = f'{batter_id}.csv'
        
        already_seen = set() # set to track which games we've already calculated stats for
        
        # check if we've already done this batter
        if batter_filename in os.listdir(players_dir):
            continue
        else:
            batter_stats = pd.DataFrame(columns=pbp_labels)
            
            batter_pbp = pbp_data[pbp_data['batter'] == batter_id].reset_index()
            for j in range(batter_pbp.shape[0]):
                # get game ID
                game_id = batter_pbp.loc[j, 'game id']
                # have we already done this game?
                if game_id in already_seen:
                    continue
                else:
                    # get game's play by play
                    current_game_pbp = batter_pbp[batter_pbp['game id'] == game_id]
                    # append this games stats to end of data frame
                    current_game_stats = pd.DataFrame(data=calculate_stats(data=current_game_pbp, player_id=batter_id))
                    current_game_stats['Game ID'] = game_id
                    batter_stats = pd.concat([batter_stats, current_game_stats])
                    # add to set of already seen games so no duplicates
                    already_seen.add(game_id)
                    
            # write batter's stats to csv
            batter_stats.to_csv(players_dir + batter_filename)
        
    return

In [None]:
pbp_parser(pbp_data)

In [None]:
batter = pbp_data.loc[0,'batter']
filename = f'{batter}.csv'
hey = pd.read_csv('datasets/player_stats/' + filename)
hey.head()

In [None]:
hey.shape

In [None]:
dir = 'datasets/player_stats/group_'
os.mkdir(dir + '1')
os.mkdir(dir + '2')
os.mkdir(dir + '3')

In [None]:
#os.mkdir('datasets/player_stats/group_1')
#os.mkdir('datasets/player_stats/group_2')
#os.mkdir('datasets/player_stats/group_3')

directory = 1
count = 0
ignore = ['group_1', 'group_2', 'group_3']
for file in os.listdir('datasets/player_stats/'):
    if file not in ignore:
        os.rename(f'datasets/player_stats/{file}', f'datasets/player_stats/group_{directory}/{file}')
        count += 1
        if count == 1000:
            directory +=1
            count = 0

In [None]:
len(os.listdir('datasets/player_stats/'))

In [None]:
print(len(os.listdir('datasets/player_stats/group_1')))
print(len(os.listdir('datasets/player_stats/group_2')))
print(len(os.listdir('datasets/player_stats/group_3')))

In [7]:
def calculate_pitcher_stats(data, pitcher_id):
    # balls and strikes need to be updated - incorrect - parse 'pitch sequence'
    balls = data[(data['pitcher'] == pitcher_id) & (data['batter event flag'] == 'T')]['balls'].sum()
    strikes = data[(data['pitcher'] == pitcher_id) & (data['batter event flag'] == 'T')]['strikes'].sum()
    strikeouts, pickoff_errs, pickoffs, wild_pitches, balks, walks, int_walks, hbp = 0, 0, 0, 0, 0, 0, 0, 0
    for event in data[data['pitcher'] == pitcher_id]['event type']:
        if event == 3: 
            strikeouts += 1
        elif event == 7:
            pickoff_errs += 1
        elif event == 8:
            pickoffs += 1
        elif event == 9:
            wild_pitches += 1
        elif event == 11:
            balks += 1
        elif event == 14:
            walks += 1
        elif event == 15:
            int_walks += 1
        elif event == 16:
            hbp += 1
    er = data[(data['pitcher'] == pitcher_id) & (data['batter dest'] == 4)].shape[0]
    er += data[(data['Responsible pitcher for runner on 1st'] == pitcher_id) & (data['runner on 1st dest'] == 4)].shape[0]
    er += data[(data['Responsible pitcher for runner on 2nd'] == pitcher_id) & (data['runner on 2nd dest'] == 4)].shape[0]
    er += data[(data['Responsible pitcher for runner on 3rd'] == pitcher_id) &(data['runner on 3rd dest'] == 4)].shape[0]
    ip = data[data['pitcher'] == pitcher_id]['inning'].max()
    era = er * 9 / ip
    
    stats = dict()
    stats['Balls'] = [balls]
    stats['Strikes'] = [strikes]
    stats['Strikeouts'] = [strikeouts]
    stats['Pickoff Errors'] = [pickoff_errs]
    stats['Pickoffs'] = [pickoffs]
    stats['Wild Pitches'] = [wild_pitches]
    stats['Balks'] = [balks]
    stats['Walks'] = [walks]
    stats['Intentional Walks'] = [int_walks]
    stats['Hit by Pitch'] = [hbp]
    stats['Earned Runs'] = [er]
    stats['Innings Pitched'] = [ip]
    stats['ERA'] = [era]
    
    return stats
    

In [8]:
def pbp_pitcher_parser(pbp_data, starting_lineups):
    # get column labels for new play-by-play dataframe
    defaults = ['Game ID']
    pitcher_stats = ['Balls', 'Strikes',
                    'Strikeouts', 'Pickoff Errors', 'Pickoffs', 
                    'Wild Pitches', 'Balks', 'Walks', 
                    'Intentional Walks', 'Hit by Pitch', 
                    'Earned Runs', 'Innings Pitched', 
                    'ERA',]
    pitcher_labels = defaults + pitcher_stats
    
    # make a directory for individual player stats
    pitcher_dir = 'datasets/pitcher_stats/'
    if 'pitcher_stats' not in os.listdir('datasets/'):
        os.mkdir(pitcher_dir)
        
    # loop thru all starting lineups
    for i in range(starting_lineups.shape[0]):
        v_label = 'Visiting Team Starting Pitcher ID'
        h_label = 'Home Team Starting Pitcher ID'
        v_pitcher_id = starting_lineups.loc[i, v_label]
        h_pitcher_id = starting_lineups.loc[i, h_label]
        
        # have we calculate stats for this pitcher?
        for pitcher_id in [v_pitcher_id, h_pitcher_id]:
            pitcher_filename = f'{pitcher_id}.csv'
        
            # check if we've already done this pitcher
            if pitcher_filename in os.listdir(pitcher_dir):
                continue
            else:
                pitcher_stats = pd.DataFrame(columns=pitcher_labels)
            
                for game_id in starting_lineups[(starting_lineups[v_label] == pitcher_id) | (starting_lineups[h_label] == pitcher_id)]['Game ID']:
                    # get game's play by play
                    current_game_pbp = pbp_data[pbp_data['game id'] == game_id]
                    # append this games stats to end of data frame
                    current_game_stats = pd.DataFrame(data=calculate_pitcher_stats(data=current_game_pbp, pitcher_id=pitcher_id))
                    current_game_stats['Game ID'] = game_id
                    pitcher_stats = pd.concat([pitcher_stats, current_game_stats])
                    
                # write pitcher's stats to csv
                pitcher_stats.to_csv(pitcher_dir + pitcher_filename)
        
    return

In [26]:
from tqdm.notebook import tqdm

PITCHER_STATS_DIR = "./datasets/pitcher_stats/"

def get_pitcher_stats_2(game_id, player_id, data):
    """Calculate the pitcher's stats in a particular game"""
    
    def get_num_inning_pitched():
        last_inning_pitcher = data.loc[data['pitcher'] == player_id].sort_values(by=['event num']).iloc[-1]
        
        if last_inning_pitcher['event type'] in {2, 3, 8}:
            return last_inning_pitcher["inning"] - 1 + ((last_inning_pitcher["outs"] + 1) / 3)
        else:
            return last_inning_pitcher["inning"] - 1 + (last_inning_pitcher["outs"] / 3)
        
    pitcher_data = data.loc[data['pitcher'] == player_id]

    # For some reason, starting pitcher doesn't pitch
    if pitcher_data.shape[0] == 0:
        return None
        
    # balls and strikes need to be updated - incorrect - parse 'pitch sequence'
    balls = pitcher_data[(pitcher_data['batter event flag'] == 'T')]['balls'].sum()
    strikes = pitcher_data[(pitcher_data['batter event flag'] == 'T')]['strikes'].sum()

    er = data[(data['pitcher'] == player_id) & (data['batter dest'] == 4)].shape[0]
    er += data[(data['Responsible pitcher for runner on 1st'] == player_id) & (data['runner on 1st dest'] == 4)].shape[0]
    er += data[(data['Responsible pitcher for runner on 2nd'] == player_id) & (data['runner on 2nd dest'] == 4)].shape[0]
    er += data[(data['Responsible pitcher for runner on 3rd'] == player_id) &(data['runner on 3rd dest'] == 4)].shape[0]
    ip = get_num_inning_pitched()
    
    # Get stats
    stats = dict()
    stats['Game ID'] = [game_id]
    stats['ID'] = [player_id]
    stats['Balls'] = [balls]
    stats['Strikes'] = [strikes]
    stats['Homeruns Allowed'] = [pitcher_data.loc[(pitcher_data['event type'] == 23)].shape[0]]
    stats['Hits Allowed'] = [pitcher_data.loc[(pitcher_data['event type'] == 20) |
                                              (pitcher_data['event type'] == 21) |
                                              (pitcher_data['event type'] == 22) |
                                              (pitcher_data['event type'] == 23)].shape[0]]
    stats['Strikeouts'] = [pitcher_data.loc[(pitcher_data['event type'] == 3)].shape[0]]
    stats['Pickoff Errors'] = [pitcher_data.loc[(pitcher_data['event type'] == 7)].shape[0]]
    stats['Pickoffs'] = [pitcher_data.loc[(pitcher_data['event type'] == 8)].shape[0]]
    stats['Wild Pitches'] = [pitcher_data.loc[(pitcher_data['event type'] == 9)].shape[0]]
    stats['Balks'] = [pitcher_data.loc[(pitcher_data['event type'] == 11)].shape[0]]
    stats['Walks'] = [pitcher_data.loc[(pitcher_data['event type'] == 14)].shape[0]]
    stats['Intentional Walks'] = [pitcher_data.loc[(pitcher_data['event type'] == 15)].shape[0]]
    stats['Hit by Pitch'] = [pitcher_data.loc[(pitcher_data['event type'] == 16)].shape[0]]
    stats['Earned Runs'] = [er]
    stats['Innings Pitched'] = [ip]
    
    # Return DataFrame object
    return pd.DataFrame(data=stats)
    

def pbp_pitcher_parser_2(pbp_data, starting_lineups, save_dir=PITCHER_STATS_DIR):
    """Get the stats for all starting pitchers"""

    # dictionary to store all of the dataframes
    pitchers = dict()
    
    pbp = pd.merge(pbp_data,
                   starting_lineups,
                   how='left',
                   left_on=['game id'],
                   right_on=['Game ID'])
    
    pbp = pbp.loc[(pbp['pitcher'] == pbp['Home Team Starting Pitcher ID']) |
                  (pbp['pitcher'] == pbp['Visiting Team Starting Pitcher ID'])]

    game_ids = set(pbp['game id'])
    
    for game_id in tqdm(game_ids):
        
        game_pbp = pbp.loc[pbp['game id'] == game_id]
    
        away_pitcher = game_pbp.iloc[0]['Visiting Team Starting Pitcher ID']
        home_pitcher = game_pbp.iloc[0]['Home Team Starting Pitcher ID']
        
        for pitcher_id in (away_pitcher, home_pitcher):
            pitcher_stats = get_pitcher_stats_2(game_id,
                                                pitcher_id,
                                                game_pbp)
            if pitcher_stats is None:
                continue
            
            if pitcher_id in pitchers.keys():
                pitchers[pitcher_id] = pd.concat([pitchers[pitcher_id], pitcher_stats])
            else:
                pitchers[pitcher_id] = pitcher_stats
                
    
    clean_dir(save_dir)
    
    for player_id, df in pitchers.items():
        df.to_csv(os.path.join(save_dir, f'{player_id}.csv'))

In [27]:
pbp_pitcher_parser_2(pbp_data, get_starting_lineups())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=14577.0), HTML(value='')))




In [28]:
pd.read_csv('./datasets/pitcher_stats/kersc001.csv')

Unnamed: 0.1,Unnamed: 0,Game ID,ID,Balls,Strikes,Homeruns Allowed,Hits Allowed,Strikeouts,Pickoff Errors,Pickoffs,Wild Pitches,Balks,Walks,Intentional Walks,Hit by Pitch,Earned Runs,Innings Pitched
0,0,LAN201704190,kersc001,26,41,0,5,10,0,0,0,0,1,0,0,2,7.000000
1,0,MIL201505040,kersc001,32,28,1,5,8,0,1,0,1,0,0,0,2,7.333333
2,0,COL201506010,kersc001,32,30,1,5,7,0,0,0,0,1,0,0,2,7.000000
3,0,SDN201909260,kersc001,23,31,0,2,7,0,0,1,0,1,0,0,0,6.000000
4,0,LAN201409240,kersc001,34,43,0,7,11,0,1,1,1,0,0,0,1,8.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157,0,COL201407040,kersc001,25,33,0,2,8,0,1,0,0,1,0,0,0,8.000000
158,0,OAK201508180,kersc001,38,41,0,5,7,0,0,0,0,2,0,0,1,7.000000
159,0,ARI201804030,kersc001,33,31,2,4,6,0,0,1,0,1,0,0,2,6.000000
160,0,ARI201403220,kersc001,28,40,0,5,7,0,0,1,0,1,0,0,1,6.666667


In [8]:
game_id = pbp_data.loc[0, 'game id']
pitcher_id = pbp_data.loc[0, 'pitcher']
pitcher_data = pbp_data[pbp_data['game id'] == game_id]
pitcher_data['runner on 1st dest']

0     0
1     0
2     0
3     0
4     0
     ..
81    4
82    0
83    0
84    0
85    0
Name: runner on 1st dest, Length: 86, dtype: int64

In [9]:
pitcher_data[pitcher_data['pitcher'] == pitcher_id]['inning'].max()

7

In [10]:
pbp_pitcher_parser(pbp_data, starting_lineups)

In [11]:
starting_lineups.shape

(14577, 23)

In [12]:
game_id = starting_lineups.loc[0, 'Game ID']
pitcher_id = starting_lineups.loc[0, 'Visiting Team Starting Pitcher ID']
pitcher_id

'kersc001'

In [13]:
game_pbp = pbp_data[pbp_data['game id'] == game_id]

game_pbp

Unnamed: 0,game id,visiting team,inning,batting team,outs,balls,strikes,pitch sequence,vis score,home score,...,Position of batter removed for pinch-hitter,Fielder with First Putout (0 if none),Fielder with Second Putout (0 if none),Fielder with Third Putout (0 if none),Fielder with First Assist (0 if none),Fielder with Second Assist (0 if none),Fielder with Third Assist (0 if none),Fielder with Fourth Assist (0 if none),Fielder with Fifth Assist (0 if none),event num
6457,ARI201403220,LAN,1,0,0,0,2,CFS,0,0,...,0,3,0,0,2,0,0,0,0,1
6458,ARI201403220,LAN,1,0,1,2,2,BBFSC,0,0,...,0,2,0,0,0,0,0,0,0,2
6459,ARI201403220,LAN,1,0,2,2,2,SBSBX,0,0,...,0,3,0,0,4,0,0,0,0,3
6460,ARI201403220,LAN,1,1,0,0,2,CSC,0,0,...,0,2,0,0,0,0,0,0,0,4
6461,ARI201403220,LAN,1,1,1,1,2,BCSX,0,0,...,0,0,0,0,0,0,0,0,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6525,ARI201403220,LAN,9,0,2,0,0,X,3,1,...,0,8,0,0,0,0,0,0,0,69
6526,ARI201403220,LAN,9,1,0,0,2,.CSS,3,1,...,0,2,0,0,0,0,0,0,0,70
6527,ARI201403220,LAN,9,1,1,1,2,FBSX,3,1,...,0,3,0,0,6,0,0,0,0,71
6528,ARI201403220,LAN,9,1,2,3,2,CBBBFB,3,1,...,0,0,0,0,0,0,0,0,0,72


In [14]:
pitcher_pbp = game_pbp[(game_pbp['pitcher'] == pitcher_id)]

pitcher_pbp

Unnamed: 0,game id,visiting team,inning,batting team,outs,balls,strikes,pitch sequence,vis score,home score,...,Position of batter removed for pinch-hitter,Fielder with First Putout (0 if none),Fielder with Second Putout (0 if none),Fielder with Third Putout (0 if none),Fielder with First Assist (0 if none),Fielder with Second Assist (0 if none),Fielder with Third Assist (0 if none),Fielder with Fourth Assist (0 if none),Fielder with Fifth Assist (0 if none),event num
6460,ARI201403220,LAN,1,1,0,0,2,CSC,0,0,...,0,2,0,0,0,0,0,0,0,4
6461,ARI201403220,LAN,1,1,1,1,2,BCSX,0,0,...,0,0,0,0,0,0,0,0,0,5
6462,ARI201403220,LAN,1,1,1,1,0,*BX,0,0,...,0,0,0,0,0,0,0,0,0,6
6463,ARI201403220,LAN,1,1,1,1,1,C*BX,0,0,...,0,3,0,0,1,0,0,0,0,7
6464,ARI201403220,LAN,1,1,2,2,1,BBCX,0,0,...,0,3,0,0,6,0,0,0,0,8
6470,ARI201403220,LAN,2,1,0,1,1,CBX,1,0,...,0,7,0,0,0,0,0,0,0,14
6471,ARI201403220,LAN,2,1,1,0,2,CCFS,1,0,...,0,2,0,0,0,0,0,0,0,15
6472,ARI201403220,LAN,2,1,2,1,2,CBSS,1,0,...,0,3,0,0,2,0,0,0,0,16
6476,ARI201403220,LAN,3,1,0,2,2,BBFFFFFFX,1,0,...,0,3,0,0,4,0,0,0,0,20
6477,ARI201403220,LAN,3,1,1,2,2,BBCFX,1,0,...,0,3,0,0,5,0,0,0,0,21


In [15]:
len(os.listdir('datasets/pitcher_stats/'))

746