In [None]:
import pandas as pd
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup, SoupStrainer
from datetime import timedelta, date
import os
from tqdm import tqdm
import time
from numpy import nan

In [None]:
def try_ap(l, dic, field, type=None):
    """Append ``dic[field]`` to ``l``, falling back to ``nan`` when unavailable.

    Parameters
    ----------
    l : list
        List that receives exactly one new element per call.
    dic : mapping-like
        Any object supporting ``dic[field]`` lookup.
    field : hashable
        Key to look up in ``dic``.
    type : callable, optional
        Converter applied to the looked-up value before appending
        (e.g. ``float``). When omitted/falsy the raw value is appended.

    Notes
    -----
    Only lookup and conversion failures are turned into ``nan``. Unlike a
    bare ``except:``, ``KeyboardInterrupt`` and ``SystemExit`` propagate, so
    a long-running loop calling this helper can still be interrupted.
    """
    try:
        value = dic[field]
        if type:
            value = type(value)
    except (LookupError, TypeError, ValueError, ArithmeticError):
        # Missing key or unparseable value: keep the column aligned with a nan.
        l.append(nan)
    else:
        l.append(value)

In [100]:
# Seasons to process; each one is read from a '<year>_xmls' directory.
years = [2015, 2016, 2017, 2018]

# <pitch> attributes stored as floats (nan when missing or unparseable).
keys_to_grab_floats = ['px', 'pz', 'start_speed', 'end_speed', 'spin_rate', 'spin_dir', 'break_angle',
                        'break_length', 'break_y', 'ax', 'ay', 'az', 'sz_bot', 'sz_top', 'type_confidence',
                        'vx0', 'vy0', 'vz0', 'x', 'x0', 'y', 'y0', 'z0', 'pfx_x', 'pfx_z', 'nasty', 'zone']

# <pitch> attributes stored as-is (nan when missing).
keys_to_grab_str = ['code', 'type', 'pitch_type']

# Pitch-level columns derived by the loop below rather than copied from the XML.
other_cols = ['b_score', 'ab_id', 'b_count', 's_count', 'outs', 'pitch_num', 'on_1b', 'on_2b', 'on_3b']

ab_keys = ['inning', 'top', 'ab_id', 'g_id', 'p_score', 'batter_id', 'pitcher_id', 'stand', 'p_throws']

game_keys = ['g_id', 'home_team', 'away_team', 'home_final_score', 'away_final_score']

# One list per output column; every row appended below must touch every column
# of its table so the lists stay the same length.
pitch_data_dict = {ele: [] for ele in keys_to_grab_floats + keys_to_grab_str + other_cols}
ab_data_dict = {ele: [] for ele in ab_keys}
game_data_dict = {ele: [] for ele in game_keys}

for year in years:
    print(year)
    xml_dir = str(year) + '_xmls'
    # the sorted is important to get chronological order
    game_files = [ele for ele in sorted(os.listdir(xml_dir)) if ele.startswith('gid')]

    # NOTE: the 1e5 / 1e6 literals make these ids floats (e.g. 201800007.0).
    # Kept as-is so the values written to the CSV files do not change.
    g_id = year*1e5 + 1
    ab_id = year*1e6 + 1

    for g in tqdm(game_files):
        # Close each file as soon as it has been read instead of leaking one
        # handle per game.
        with open(os.path.join(xml_dir, g)) as xml_file:
            soup = BeautifulSoup(xml_file.read(), 'xml')

        inning = 1
        top = True
        num_outs = 0

        first_inning = soup.findAll('inning')[0]
        game_data_dict['g_id'].append(g_id)
        game_data_dict['home_team'].append(first_inning['home_team'])
        game_data_dict['away_team'].append(first_inning['away_team'])

        home_team_runs = 0
        away_team_runs = 0

        for ab in soup.findAll('atbat'):
            pitches = ab.findAll(['pitch', 'runner'])
            count_balls = 0
            count_strikes = 0
            pitch_num_counter = 0

            ab_data_dict['top'].append(top)
            ab_data_dict['ab_id'].append(ab_id)
            ab_data_dict['g_id'].append(g_id)
            ab_data_dict['inning'].append(inning)
            # The pitching side is the home team in the top half, away team in the bottom.
            ab_data_dict['p_score'].append(home_team_runs if top else away_team_runs)
            ab_data_dict['batter_id'].append(ab['batter'])
            ab_data_dict['pitcher_id'].append(ab['pitcher'])
            ab_data_dict['stand'].append(ab['stand'])
            ab_data_dict['p_throws'].append(ab['p_throws'])

            for p in pitches:
                # outs and scores are only given at the ab level in the xml file
                # this represents the outs and score at the *end* of the ab
                # to get pitch-level, we keep track of what it was at the end of the last AB,
                # and increment as needed by events like caught stealing, scoring on wild pitches, etc.
                # To start an inning, the outs start at 0 (obviously), and we can trust whatever it says
                # on score because that can't change during the 1st AB (HR would mean the score happens after)
                # note that the continue implements this nicely: if it's the last event of an AB,
                # it will skip down to the end-of-AB section where we check the number of outs and runs
                # directly from the xml, and start a new inning if necessary. This means we effectively ignore
                # everything that happens in the field after the last pitch of the AB, and look to the XML
                # to tell us exactly what the situation is afterward.
                if p.name == 'runner':
                    if p['end'] == "":  # runner doesn't end on base
                        if p.has_attr('score'):  # runner scored
                            if top:
                                away_team_runs += 1
                            else:
                                home_team_runs += 1
                        else:  # runner is out
                            num_outs += 1
                    if p['start'] == "" and not p['id'] == ab['batter']:  # this means a pinch runner coming in
                        num_outs -= 1  # so we subtract an out because we added an out when the old runner came off
                    continue

                if num_outs < 0:
                    # A bare `raise` here has no active exception and only yields
                    # "No active exception to reraise"; raise the same exception
                    # type with a message that says what went wrong and where.
                    raise RuntimeError(
                        'negative out count ({}) in {} (atbat num {})'.format(num_outs, g, ab.get('num')))

                if num_outs > 2:
                    print('lol')
                    break

                pitch_num_counter += 1
                pitch_data_dict['ab_id'].append(ab_id)
                pitch_data_dict['outs'].append(num_outs)
                pitch_data_dict['b_score'].append(away_team_runs if top else home_team_runs)
                pitch_data_dict['on_1b'].append(p.has_attr('on_1b'))
                pitch_data_dict['on_2b'].append(p.has_attr('on_2b'))
                pitch_data_dict['on_3b'].append(p.has_attr('on_3b'))
                pitch_data_dict['pitch_num'].append(pitch_num_counter)
                # Count *before* this pitch, so the first pitch of an AB is 0-0.
                pitch_data_dict['b_count'].append(count_balls)
                pitch_data_dict['s_count'].append(count_strikes)

                for ele in keys_to_grab_floats:
                    try_ap(pitch_data_dict[ele], p, ele, float)

                for ele in keys_to_grab_str:
                    try_ap(pitch_data_dict[ele], p, ele)

                if p['type'] == 'B':
                    count_balls += 1
                # Strikes are capped at two while the at-bat continues.
                if p['type'] == 'S' and count_strikes < 2:
                    count_strikes += 1

            # end of AB: resync outs from the XML and verify the run tallies against it
            ab_id += 1
            num_outs = int(ab['o'])
            if not away_team_runs == int(ab['away_team_runs']):
                raise ValueError(
                    'away runs counted wrong in {} (atbat num {}): tracked {}, xml says {}'.format(
                        g, ab.get('num'), away_team_runs, ab['away_team_runs']))
            if not home_team_runs == int(ab['home_team_runs']):
                raise ValueError(
                    'home runs counted wrong in {} (atbat num {}): tracked {}, xml says {}'.format(
                        g, ab.get('num'), home_team_runs, ab['home_team_runs']))

            if num_outs == 3:
                if not top:
                    inning += 1
                top = not top
                num_outs = 0

        # end of game: the last at-bat carries the final score
        game_data_dict['away_final_score'].append(ab['away_team_runs'])
        game_data_dict['home_final_score'].append(ab['home_team_runs'])
        g_id += 1

pitches_df = pd.DataFrame(pitch_data_dict)
pitches_df.to_csv('pitches.csv', index=False)
ab_df = pd.DataFrame(ab_data_dict)
ab_df.to_csv('atbats.csv', index=False)
game_df = pd.DataFrame(game_data_dict)
game_df.to_csv('games.csv', index=False)

  0%|          | 3/2456 [00:00<01:43, 23.80it/s]

2015


100%|██████████| 2456/2456 [02:34<00:00, 15.89it/s]
  0%|          | 3/2445 [00:00<01:34, 25.84it/s]

2016


100%|██████████| 2445/2445 [03:04<00:00, 13.27it/s]
  0%|          | 2/2445 [00:00<02:02, 19.99it/s]

2017


100%|██████████| 2445/2445 [03:42<00:00, 11.00it/s]
  0%|          | 3/2455 [00:00<01:57, 20.85it/s]

2018


100%|██████████| 2455/2455 [04:12<00:00,  9.74it/s]


# Code definitions:

B - Ball

\*B - Ball in dirt

S - Swinging Strike

C - Called Strike

F - Foul

T - Foul Tip

L - Foul Bunt

I - Intentional Ball

W - Swinging Strike (Blocked)

M - Missed Bunt

P - Pitchout

Q - Swinging pitchout

R - Foul pitchout

## Codes that can only occur on the last pitch of an at-bat

X - In play, out(s)

D - In play, no out

E - In play, run(s)

In [20]:
# Validation: no pitch row should have been recorded with more than two outs.
invalid_outs = pitches_df['outs'].gt(2).sum()
if invalid_outs:
    print('Warning: invalid number of outs for some pitches')

In [28]:
# Spot check: home_team attribute of the first <inning> element in whichever
# game `soup` currently holds.
soup.findAll('inning')[0]['home_team']

'chn'

In [38]:
# Rows of game_df with g_id above 201800000 whose away team is 'hou'.
game_df[(game_df['g_id']>201800000) & (game_df['away_team']=='hou')]

Unnamed: 0,away_final_score,away_team,g_id,home_final_score,home_team
7352,4,hou,201800007.0,1,tex
7363,1,hou,201800018.0,5,tex
7376,9,hou,201800031.0,3,tex
7389,8,hou,201800044.0,2,tex
7488,2,hou,201800143.0,0,min
7500,1,hou,201800155.0,4,min
7515,8,hou,201800170.0,9,min
7574,1,hou,201800229.0,2,sea
7587,4,hou,201800242.0,1,sea
7603,7,hou,201800258.0,1,sea


In [49]:
# First ab_id recorded for the game with g_id 201800007.
ab_df[ab_df['g_id']==201800007]['ab_id'].values[0]

2018000478.0

In [102]:
# All pitch rows whose ab_id equals 2018000479.
pitches_df[pitches_df['ab_id']==2018000479.0]

Unnamed: 0,ab_id,ax,ay,az,b_count,b_score,break_angle,break_length,break_y,code,...,type_confidence,vx0,vy0,vz0,x,x0,y,y0,z0,zone
2146412,2018000000.0,9.092517,27.35666,-15.928903,0,1,-25.7,4.3,23.8,B,...,2.0,-4.92887,-131.013212,-4.20784,124.544331,2.445584,198.382812,50.0,6.155188,12.0
2146413,2018000000.0,9.968671,28.41754,-15.240553,1,1,-26.7,4.3,23.8,D,...,2.0,-7.951041,-131.121105,-5.956593,109.203216,2.302043,167.755371,50.0,5.995996,2.0


In [70]:
# Debug: running away-run tally next to the value stored on the current <atbat>.
away_team_runs, int(ab['away_team_runs'])

(3, 4)

In [71]:
# Debug: running home-run tally next to the value stored on the current <atbat>.
home_team_runs, int(ab['home_team_runs'])

(0, 0)

In [72]:
# Debug: dump the current <atbat> element as indented XML.
print(ab.prettify())

<atbat away_team_runs="4" b="0" b_height="5-11" batter="435559" des="Kurt Suzuki singles on a ground ball to shortstop Alexei Ramirez.   Torii Hunter scores.    Kurt Suzuki to 2nd.  Shane Robinson advances to 3rd, on a throwing error by shortstop Alexei Ramirez.  " des_es="Kurt Suzuki pega sencillo con rodado a campo corto Alexei Ramirez.   Torii Hunter anota  Kurt Suzuki a 2da.  Shane Robinson avanza a la 3ra, on error en tiro de campo corto Alexei Ramirez.  " end_tfs_zulu="2015-04-10T23:01:03Z" event="Single" event2="Error" event2_es="Error" event_es="Sencillo" event_num="589" home_team_runs="0" num="71" o="2" p_throws="R" pitcher="474029" play_guid="9155f4ab-e2ff-4077-af92-f6326138b7c8" s="1" score="T" stand="R" start_tfs="225940" start_tfs_zulu="2015-04-10T22:59:40Z">
 <runner end="" event="Single" event_num="589" id="542455" start="1B"/>
 <runner end="1B" event="Single" event_num="589" id="453203" start=""/>
 <pitch ax="-17.367" ay="27.145" az="-23.265" break_angle="33.8" break_le

In [73]:
# Debug: dump the whole parsed game document as indented XML.
print(soup.prettify())

<?xml version="1.0" encoding="utf-8"?>
<!--Copyright 2018 MLB Advanced Media, L.P.  Use of any content on this page acknowledges agreement to the terms posted here http://gdx.mlb.com/components/copyright.txt-->
<game atBat="547989" deck="541645" hole="425560" ind="F">
 <inning away_team="min" home_team="cha" next="Y" num="1">
  <top>
   <atbat away_team_runs="0" b="3" b_height="5-11" batter="572821" des="Brian Dozier strikes out swinging.  " des_es="Brian Dozier se poncha tirándole.  " end_tfs_zulu="2015-04-10T20:12:00Z" event="Strikeout" event_es="Ponche" event_num="11" home_team_runs="0" num="1" o="1" p_throws="R" pitcher="456051" play_guid="7f270877-f0f7-4dd7-af80-cc175b41b93a" s="3" stand="R" start_tfs="200958" start_tfs_zulu="2015-04-10T20:09:58Z">
    <pitch ax="-15.437" ay="31.448" az="-19.475" break_angle="32.4" break_length="5.4" break_y="23.8" cc="" code="C" des="Called Strike" des_es="Strike cantado" end_speed="85.3" event_num="3" id="3" mt="" nasty="54" pfx_x="-8.3" pfx_z="

In [106]:
# Second element matched among the <top>/<bottom> tags under the first <inning>.
soup.findAll('inning')[0].findAll(['top', 'bottom'])[1]

<bottom><atbat away_team_runs="0" b="0" b_height="6-1" batter="502517" des="Daniel Murphy flies out to center fielder Lorenzo Cain.  " des_es="Daniel Murphy batea elevado de out a jardinero central Lorenzo Cain.  " end_tfs_zulu="2018-10-01T17:13:26Z" event="Flyout" event_es="Elevado de Out" event_num="26" home_team_runs="0" num="4" o="1" p_throws="R" pitcher="468504" play_guid="fe596af2-5c34-48cb-82b1-2bfeb0bf9ef9" s="2" stand="L" start_tfs="171245" start_tfs_zulu="2018-10-01T17:12:45Z"><pitch ax="-20.9048709700689" ay="30.1293634341575" az="-24.2361341262639" break_angle="36.9" break_length="7.0" break_y="23.8" cc="" code="C" des="Called Strike" des_es="Strike cantado" end_speed="85.5" event_num="22" id="22" mt="" nasty="48" pfx_x="-11.261117645902404" pfx_z="4.276026787689708" pitch_type="FT" play_guid="f5aaf663-ccbf-404d-81b1-840511436770" px="0.395132500235353" pz="1.96780847647053" spin_dir="249.207" spin_rate="2393.768" start_speed="93.4" sv_id="181001_171243" sz_bot="1.771132194

In [76]:
# Debug: iterate over the current <atbat> element directly and print each
# yielded item as indented XML, followed by extra blank lines as a separator.
for x in ab:
    print(x.prettify())
    print('\n')

<runner end="" event="Single" event_num="589" id="542455" start="1B"/>



<runner end="1B" event="Single" event_num="589" id="453203" start=""/>



<pitch ax="-17.367" ay="27.145" az="-23.265" break_angle="33.8" break_length="6.8" break_y="23.8" cc="" code="C" des="Called Strike" des_es="Strike cantado" end_speed="82.7" event_num="579" id="579" mt="" nasty="48" on_1b="453203" on_2b="116338" pfx_x="-9.97" pfx_z="5.07" pitch_type="SI" play_guid="cb179e11-79eb-42db-acf5-dedd3aa5ed35" px="-0.305" pz="1.963" spin_dir="242.842" spin_rate="2161.130" start_speed="89.5" sv_id="150410_175950" sz_bot="1.44" sz_top="3.09" tfs="225949" tfs_zulu="2015-04-10T22:59:49Z" type="S" type_confidence="2.000" vx0="5.751" vy0="-131.031" vz0="-4.744" x="128.63" x0="-1.231" y="185.78" y0="50" z0="5.531" zone="7"/>



<pitch ax="-15.822" ay="26.513" az="-22.766" break_angle="31.8" break_length="6.4" break_y="23.8" cc="" code="E" des="In play, run(s)" des_es="En juego, carrera(s)" end_speed="83.7" event_num="580"

In [92]:
# Same lookup the extraction loop uses: all <pitch> and <runner> elements of the at-bat.
c = ab.findAll(['pitch', 'runner'])

In [89]:
# Debug: print the index of each item yielded by iterating over the at-bat element.
for a, b in enumerate(ab):
    print(a)

0
1
2
3
4
5
6


In [95]:
# Scratch: rebinds c to a plain integer.
c = 3

In [96]:
# Scratch: in-place decrement of c by one.
c-=1

In [97]:
# Scratch: display the current value of c.
c

2