In [None]:
import pandas as pd
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup, SoupStrainer
from datetime import timedelta, date
import os
from tqdm import tqdm
import time
from numpy import nan

In [None]:
def try_ap(l, dic, field):
    """Append ``dic[field]`` to ``l``, falling back to ``nan`` if the lookup fails.

    Parameters
    ----------
    l : list
        List that is extended in place by exactly one element.
    dic : object supporting ``dic[field]``
        Source of the value (a dict or any other subscriptable object).
    field : hashable
        Key to look up in ``dic``.

    Only lookup failures (``KeyError``/``IndexError``/``TypeError``) are turned
    into ``nan``. Anything else, e.g. ``KeyboardInterrupt``, propagates instead
    of being silently swallowed.
    """
    try:
        value = dic[field]
    except (LookupError, TypeError):
        value = nan
    l.append(value)
        
def get_delay(x):
    """Return the delay, in minutes, found in an elapsed-time string.

    A string without a ``'('`` has no delay and yields 0. Otherwise the second
    space-separated token is read as ``(H:MM`` (the hours part may be empty,
    as in ``(:25``) and converted to total minutes.
    """
    if '(' not in x:
        return 0

    # Second token looks like '(H:MM'; drop the opening parenthesis.
    parts = x.split(' ')[1][1:].split(':')
    total = int(parts[1])
    hours = parts[0]
    if hours:
        total += 60 * int(hours)
    return total

def get_gametime(x):
    """Return the game duration, in minutes, from an ``H:MM`` elapsed-time string.

    When the string carries a parenthesised suffix, only the first
    space-separated token is parsed.
    """
    clock = x.split(' ')[0] if '(' in x else x
    parts = clock.split(':')
    hours, mins = int(parts[0]), int(parts[1])
    return hours * 60 + mins

In [None]:
# Seasons to process.
years = list(range(2015, 2019))

# Attributes copied verbatim from each pitch element (missing ones become nan).
keys_to_grab = [
    'px', 'pz', 'start_speed', 'end_speed', 'spin_rate', 'spin_dir',
    'break_angle', 'break_length', 'break_y', 'ax', 'ay', 'az', 'sz_bot',
    'sz_top', 'type_confidence', 'vx0', 'vy0', 'vz0', 'x', 'x0', 'y', 'y0',
    'z0', 'pfx_x', 'pfx_z', 'nasty', 'zone', 'code', 'type', 'pitch_type',
]

# Pitch-level columns that are computed while walking the game.
other_cols = [
    'b_score', 'ab_id', 'b_count', 's_count', 'outs',
    'pitch_num', 'on_1b', 'on_2b', 'on_3b',
]

# At-bat-level columns.
ab_keys = [
    'inning', 'top', 'ab_id', 'g_id', 'p_score', 'batter_id',
    'pitcher_id', 'stand', 'p_throws', 'event', 'o',
]

# Game-level columns read directly from the boxscore element.
game_keys_simple = [
    'start_time', 'venue_name', 'weather', 'wind',
    'elapsed_time', 'attendance',
]

# Game-level columns that are assembled by the parsing loop.
game_keys = [
    'g_id', 'home_team', 'away_team', 'home_final_score',
    'away_final_score', 'date', 'umpire_HP', 'umpire_1B',
    'umpire_2B', 'umpire_3B',
]

# One empty list per output column; each dict later becomes a DataFrame.
pitch_data_dict = {col: [] for col in keys_to_grab + other_cols}
ab_data_dict = {col: [] for col in ab_keys}
game_data_dict = {col: [] for col in game_keys + game_keys_simple}
    
    
    
# Walk every game file of every season and fill the column lists in
# pitch_data_dict, ab_data_dict and game_data_dict (one entry per pitch,
# at-bat and game respectively).
for year in years:
    print(year)
    xml_dir = str(year) + '_xmls'
    boxscore_dir = str(year) + '_rawboxscores'
    # the sorted is important to get chronological order
    game_files = [ele for ele in sorted(os.listdir(xml_dir)) if ele[:3] == 'gid']

    # ids start at 1 within each season and carry the year as a prefix
    g_id = int(year*1e5) + 1
    ab_id = int(year*1e6) + 1

    for g in tqdm(game_files):
        # `with` closes each file as soon as it has been read instead of
        # leaving thousands of handles for the garbage collector.
        with open(os.path.join(xml_dir, g)) as xml_file:
            game_xml = BeautifulSoup(xml_file.read(), 'xml')
        game_ind = game_xml('game')[0]['ind']

        if (len(game_xml('pitch')) == 0) or game_ind == 'DR':  # game postponed
            continue

        with open(os.path.join(boxscore_dir, g)) as boxscore_file:
            game_boxscore = BeautifulSoup(boxscore_file.read(), 'xml')
        bs = game_boxscore('boxscore')[0]
        for ele in game_keys_simple:
            try_ap(game_data_dict[ele], bs, ele)

        ump_list = bs('umpire')
        if len(ump_list) == 4:
            for ump in ump_list:
                game_data_dict['umpire_' + ump['position']].append(ump['name'])
        elif len(ump_list) > 4:
            # extra umpires have positions without a matching column; skip them
            for ump in ump_list:
                try:
                    game_data_dict['umpire_' + ump['position']].append(ump['name'])
                except KeyError:
                    pass
        else:
            # fewer than four umpires: start from blanks, then fill the known ones
            for base in ['HP', '1B', '2B', '3B']:
                game_data_dict['umpire_' + base].append('')
            for ump in ump_list:
                game_data_dict['umpire_' + ump['position']][-1] = ump['name']

        top = False
        num_outs = 0

        all_innings = game_xml.findAll('inning')

        game_data_dict['g_id'].append(g_id)
        game_data_dict['home_team'].append(all_innings[0]['home_team'])
        game_data_dict['away_team'].append(all_innings[0]['away_team'])

        home_team_runs = 0
        away_team_runs = 0

        for inning_xml in all_innings:
            inning_num = int(inning_xml['num'])

            # every inning must start with the top half
            assert not top
            for half_xml in inning_xml.findAll(['top', 'bottom']):
                top = not top
                num_outs = 0

                for ab in half_xml.findAll('atbat'):
                    pitches = ab.findAll(['pitch', 'runner'])
                    count_balls = 0
                    count_strikes = 0
                    pitch_num_counter = 0

                    ab_data_dict['top'].append(top)
                    ab_data_dict['ab_id'].append(ab_id)
                    ab_data_dict['g_id'].append(g_id)
                    ab_data_dict['inning'].append(inning_num)
                    # the home team pitches in the top half, the away team in the bottom
                    ab_data_dict['p_score'].append(home_team_runs if top else away_team_runs)
                    ab_data_dict['batter_id'].append(ab['batter'])
                    ab_data_dict['pitcher_id'].append(ab['pitcher'])
                    ab_data_dict['stand'].append(ab['stand'])
                    ab_data_dict['p_throws'].append(ab['p_throws'])
                    ab_data_dict['event'].append(ab['event'])
                    ab_data_dict['o'].append(ab['o'])

                    # 'Fan interference' is not an outcome in itself; replace it with
                    # the ruling parsed from the play description.
                    fan_int = ab['event'] == 'Fan interference'
                    if fan_int:
                        des = ab['des']
                        if 'ground-rule double' in des:
                            ruling = 'Double'
                        elif ' doubles' in des:
                            ruling = 'Double'
                        elif ' triples ' in des:
                            ruling = 'Triple'
                        elif ' pops out' in des:
                            ruling = 'Pop Out'
                        elif ' flies out ' in des:
                            ruling = 'Flyout'
                        elif ' fielding error ' in des:
                            ruling = 'Field Error'
                        elif ' singles' in des:
                            ruling = 'Single'
                        elif ' homers' in des:
                            ruling = 'Home Run'
                        else:
                            raise ValueError('Unable to parse description ' +
                                             'for fan interference: {}'.format(des))
                        ab_data_dict['event'][-1] = ruling

                    for p in pitches:

                        # outs and scores are only given at the ab level in the xml file
                        # this represents the outs and score at the *end* of the ab
                        # to get pitch-level, we keep track of what it was at the end of the last AB,
                        # and increment as needed by events like caught stealing, scoring on wild pitches, etc.
                        # To start an inning, the outs start at 0 (obviously), and we can trust whatever it says
                        # on score because that can't change during the 1st AB (HR would mean the score happens after)
                        # note that the continue implements this nicely: if it's the last event of an AB,
                        # it will skip down to the end-of-AB section where we check the number of outs and runs
                        # directly from the xml, and start a new inning if necessary. This means we effectively ignore
                        # everything that happens in the field after the last pitch of the AB, and look to the XML
                        # to tell us exactly what the situation is afterward.
                        if p.name == 'runner':
                            if p['end'] == "":  # runner doesn't end on base
                                if p.has_attr('score'):  # runner scored
                                    if top:
                                        away_team_runs += 1
                                    else:
                                        home_team_runs += 1
                                else:  # runner is out
                                    num_outs += 1
                            elif p['start'] == "" and not p['id'] == ab['batter']:  # pinch runner substitution
                                num_outs -= 1  # subtract an out bc we added an out when old runner came off
                            continue

                        if num_outs < 0:
                            # a bare `raise` here would only produce
                            # "No active exception to reraise"; say what went wrong
                            raise RuntimeError(
                                'Negative out count ({}) in {} during at-bat {}'.format(
                                    num_outs, g, ab_id))

                        if num_outs > 2: # this should also be an assert, but at least one XML has
                                         # a weird error - after Adam Eaton was thrown out of the game
                            break        # it repeated half of the at-bat, this break will properly skip those
                                         # see gid_2016_05_20_kcamlb_chamlb_1.xml

                        pitch_num_counter += 1
                        pitch_data_dict['ab_id'].append(ab_id)
                        pitch_data_dict['outs'].append(num_outs)
                        pitch_data_dict['b_score'].append(away_team_runs if top else home_team_runs)
                        pitch_data_dict['on_1b'].append(p.has_attr('on_1b'))
                        pitch_data_dict['on_2b'].append(p.has_attr('on_2b'))
                        pitch_data_dict['on_3b'].append(p.has_attr('on_3b'))
                        pitch_data_dict['pitch_num'].append(pitch_num_counter)
                        pitch_data_dict['b_count'].append(count_balls)
                        pitch_data_dict['s_count'].append(count_strikes)

                        for ele in keys_to_grab:
                            try_ap(pitch_data_dict[ele], p, ele)

                        # the count is recorded *before* the pitch, so update it afterwards;
                        # strikes are capped at 2
                        if p['type'] == 'B':
                            count_balls += 1
                        if p['type'] == 'S' and (count_strikes < 2):
                            count_strikes += 1

                    # end of AB
                    # Relabel the final pitch of a hit-by-pitch at-bat. Only do so when this
                    # at-bat actually recorded a pitch; otherwise [-1] would point at the
                    # last pitch of an earlier at-bat (or at nothing at all).
                    if ab['event'] == 'Hit By Pitch' and pitch_num_counter > 0:
                        pitch_data_dict['code'][-1] = 'H'

                    ab_id += 1
                    # resynchronise the running totals with what the XML reports
                    assert num_outs >= int(ab['o'])-1
                    num_outs = int(ab['o'])
                    assert away_team_runs == int(ab['away_team_runs'])
                    assert home_team_runs == int(ab['home_team_runs'])

                    # these are commented out because the MLB xml files are often wrong
                    # ('often' means about one ab per few games)
                    #assert count_balls == int(ab['b'])
                    #assert count_strikes == int(ab['s'])

                # end of half-inning

            # end of inning

        # end of game
        # `ab` is still the last at-bat of the game, which carries the final score
        game_data_dict['away_final_score'].append(int(ab['away_team_runs']))
        game_data_dict['home_final_score'].append(int(ab['home_team_runs']))
        game_data_dict['date'].append("-".join(g.split('_')[1:4]))
        g_id += 1

# Pitch-level and at-bat-level tables are exported as collected.
pitches_df = pd.DataFrame(pitch_data_dict)
pitches_df.to_csv('pitches.csv', index=False)

ab_df = pd.DataFrame(ab_data_dict)
ab_df.to_csv('atbats.csv', index=False)

# The game-level table gets a few columns converted before export.
game_df = pd.DataFrame(game_data_dict)
game_df['date'] = pd.DatetimeIndex(game_df['date'])
# attendance arrives as text with thousands separators
game_df['attendance'] = game_df['attendance'].apply(
    lambda raw: int(raw.replace(',', ''))
)
# Derive the delay from the raw elapsed_time text *before* that column is
# overwritten with the numeric game time.
raw_elapsed = game_df['elapsed_time']
game_df['delay'] = raw_elapsed.apply(get_delay)
game_df['elapsed_time'] = raw_elapsed.apply(get_gametime)
game_df.to_csv('games.csv', index=False)

# Code definitions:

B - Ball

\*B - Ball in dirt

S - Swinging Strike

C - Called Strike

F - Foul

T - Foul Tip

L - Foul Bunt

I - Intentional Ball

W - Swinging Strike (Blocked)

M - Missed Bunt

P - Pitchout

Q - Swinging pitchout

R - Foul pitchout

## Codes that can only occur on the last pitch of an at-bat

X - In play, out(s)

D - In play, no out

E - In play, runs

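H - Hit By Pitch (not found in the xml files; assigned by the parsing code above to the last pitch of any at-bat whose event is 'Hit By Pitch')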

In [None]:
# Validation: tally how many games each team played per season.
def _games_by(team_col):
    """Count games per (year, team) using the given team column."""
    return game_df.groupby(['year', team_col])['g_id'].count()

# g_id is built as year*1e5 + running number, so floor division recovers the year
game_df['year'] = game_df['g_id'].astype(int) // 1e5
away_game_counts = _games_by('away_team')
home_game_counts = _games_by('home_team')
game_count = away_game_counts.add(home_game_counts)

In [None]:
# Show only the (year, team) pairs whose total differs from a full 162-game season.
game_count[game_count != 162]

### teams with other-than 162 games:

2015:

    161:
        cle
        det
        
            
2016:

    161:
        mia
        atl
        cle
        det
    (chn and pit played to a tie; the standings credit each with 161 games, but the stats from
        the tie still count, so this count shows 162 for both)
        
2017:
    none
        
2018:

    163:
        chn
        mil
        lan
        col
        
    161:
        pit
        mia




In [None]:
# Spot check: Mike Trout's 2018 at-bat outcomes.
is_trout = ab_df['batter_id'] == '545361'
# ab_id is built as year*1e6 + running number
in_2018 = ab_df['ab_id'] // 1e6 == 2018
mt_2018 = ab_df[is_trout & in_2018]
mt_2018['event'].value_counts()

According to bbref.com (https://www.baseball-reference.com/players/t/troutmi01.shtml), we should have:

- K: 124
- 1B: 80
- 2B: 24
- 3B: 4
- HR: 39
- BB: 122 (this is walks + intentional walks)
- IBB: 25