In [None]:
import pandas as pd
from time import time
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Load the at-bat and game tables, using the first CSV column of each file as
# the DataFrame index, and print how many seconds the two reads took.
load_started = time()
atbats = pd.read_csv('../input/atbats.csv', index_col=0)
games = pd.read_csv('../input/games.csv', index_col=0)
load_seconds = time() - load_started
print(load_seconds)

In [None]:
# Lookup table from the long event description stored in atbats['event'] to a
# short code. The short codes are what end up as one-hot column names
# (prefixed with '_') once the event column is encoded.
enc_dict = {
    'Single': '1B',
    'Double': '2B',
    'Triple': '3B',
    'Home Run': 'HR',
    'Walk': 'BB',
    'Intent Walk': 'IBB',
    'Hit By Pitch': 'HBP',
    'Strikeout': 'K',
    'Sac Fly': 'SF',
    'Grounded Into DP': 'GIDP',
    'Groundout': 'GO',
    'Lineout': 'LO',
    'Pop Out': 'PO',
    'Flyout': 'FO',
    'Fielders Choice': 'FC',
    'Sac Bunt': 'SAC',
    'Double Play': 'DP',
    'Triple Play': 'TP',
    'Batter Interference': 'BI',
    'Fan interference': 'FI',
    'Catcher Interference': 'CI',
    'Field Error': 'ROE',
    'Bunt Groundout': 'BGO',
    'Bunt Lineout': 'BLO',
    'Bunt Pop Out': 'BPO',
    'Fielders Choice Out': 'FCO',
    'Forceout': 'FORCE',
    'Sacrifice Bunt DP': 'SBDP',
    'Strikeout - DP': 'KDP',
    'Runner Out': 'RO',
    'Sac Fly DP': 'SFDP',
}

In [None]:
# Add a `year` column, shorten the event labels via enc_dict, and one-hot
# encode the events while keeping the (shortened) `event` column itself.
encode_started = time()

# Everything above the lowest six digits of the index is taken as the season.
# NOTE(review): assumes the index is a numeric at-bat id laid out that way.
season = atbats.index // 1e6
atbats['year'] = season.astype(int)

# Raises KeyError on any event description that is missing from enc_dict.
atbats['event'] = [enc_dict[description] for description in atbats['event']]

# get_dummies drops the encoded column, so hold on to it and put it back.
# With prefix='' the new columns are named '_<code>' (e.g. '_1B', '_HR').
short_events = atbats['event']
atbats = pd.get_dummies(atbats, prefix='', columns=['event'])
atbats['event'] = short_events

print(time() - encode_started)

# Compiling Basic Batter/Pitcher Statistics

Once we've got the data loaded, and one-hot encoded the event column, we need to do some categorization of events: 

- hits (single, double, triple, HR)

- number of total bases (1 for single, 2 for double, etc.)

- strikeouts, which are labeled as either strikeout or strikeout double play (includes a runner caught stealing)

- walks (intentional and non-intentional)

- at-bats (catcher's interference, sac bunt, walks, hit-by-pitch, sac fly, and runner getting thrown out to end inning don't count)

- plate appearances (a runner getting thrown out to end the inning doesn't count)

For these stats, I prefix each column name with an underscore to signify that this is just for the one at-bat. Later we'll have the same stats, but cumulative for the season.

In [None]:
# Per-at-bat indicator stats (leading underscore = "this one at-bat only").
#
# Everything is derived from the short `event` code rather than from the
# one-hot columns. The previous form (`_K = _K + _KDP`, `_BB = _BB + _IBB`)
# overwrote its own inputs, so running the cell a second time double-counted
# strikeout-double-plays and intentional walks. Deriving from `event` makes
# the cell idempotent and also avoids a KeyError when a rare event (e.g. 'CI')
# never occurs in the data and therefore has no dummy column.
event = atbats['event']

# Hits, and total bases (1 for a single up to 4 for a home run, else 0).
atbats['_H'] = event.isin(['1B', '2B', '3B', 'HR']).astype(int)
atbats['_TB'] = (event.map({'1B': 1, '2B': 2, '3B': 3, 'HR': 4})
                      .fillna(0)
                      .astype(int))

# Strikeouts include strikeout double plays; walks include intentional walks.
atbats['_K'] = event.isin(['K', 'KDP']).astype(int)
atbats['_BB'] = event.isin(['BB', 'IBB']).astype(int)

# An at-bat is every row except the events listed here.
# NOTE(review): 'SFDP' and 'SBDP' are still counted as at-bats, exactly as
# before - confirm that is intended.
not_an_at_bat = ['CI', 'SAC', 'BB', 'IBB', 'HBP', 'RO', 'SF']
atbats['_AB'] = 1 - event.isin(not_an_at_bat).astype(int)

# A plate appearance is every row except a runner being thrown out.
atbats['_PA'] = 1 - (event == 'RO').astype(int)

## Calculating a pitcher's IP (innings pitched)

At first glance, it seems like we should just assign a number of outs to every at-bat based on what happens (hit=0, flyout=1, double plays=2, etc). But this runs into a problem: sometimes runners get thrown out during the at-bat and this isn't reflected in the event because it doesn't happen on the last pitch. So we use the column atbats['o'], which says the number of outs at the end of the at-bat. Shifting this down (and filling the top with 0) gives the number at the start of the next one, and subtracting these gives what we want.

In [None]:
# Outs recorded during each at-bat = outs at the end of this at-bat minus the
# outs at the end of the previous at-bat in the same half-inning.
half_inning_outs = atbats.groupby(['g_id', 'inning', 'top'])['o']

# The first at-bat of a half-inning has no predecessor; treat that as 0 outs.
outs_before = half_inning_outs.shift(1).fillna(0)

atbats['_outs'] = atbats['o'] - outs_before

# Add up the stats!

Using the groupby function, we can easily get cumulative sums of each batter's stats, as well as those same stats for the batters each pitcher has faced.

(Note: I use pd.Index so that a prefix can be prepended to every element simply by adding a string to the Index object, e.g. `'thisprefix_' + stats_to_cum`.)

In [None]:
# Season-to-date (cumulative) counting stats for every at-bat row.
#
# `stats_to_cum` holds the bare stat names; adding a string to the Index
# prepends it to every element, giving the per-at-bat source columns ('_H',
# ...) and the pitcher-side destination columns ('opp_H', ...).
stats_to_cum = pd.Index(['H', 'TB', 'PA', 'AB', 'BB', 'HBP', 'IBB', 'K',
                         '2B', '3B', 'HR', 'GIDP', 'SF'])
per_ab_cols = list('_' + stats_to_cum)

batter_groups = atbats.groupby(['batter_id', 'year'])
pitcher_groups = atbats.groupby(['pitcher_id', 'year'])

# GroupBy.cumsum() is the built-in equivalent of
# .transform(pd.Series.cumsum): same values and index, but it uses the
# optimized group-wise implementation instead of calling a Python function
# once per group.
# The result's columns are the '_'-prefixed names; they are assigned to the
# target columns in the same order.
atbats[stats_to_cum] = batter_groups[per_ab_cols].cumsum().astype(int)

# Same running totals, but accumulated over the batters each pitcher faced.
atbats['opp_' + stats_to_cum] = pitcher_groups[per_ab_cols].cumsum().astype(int)

# Innings pitched = cumulative outs / 3 (left fractional, e.g. 6.333...).
atbats['IP'] = pitcher_groups['_outs'].cumsum() / 3

# Calculate the rate stats for batters

In [None]:
# Batter rate stats, computed from the season-to-date counting columns.
# Rows where a denominator is still 0 come out as NaN/inf rather than raising.

# AVG = H / AB, SLG = TB / AB
atbats['AVG'] = atbats['H'] / atbats['AB']
atbats['SLG'] = atbats['TB'] / atbats['AB']

# OBP = (H + BB + HBP) / (AB + BB + HBP + SF); BB already includes IBB.
times_on_base = atbats[['H', 'BB', 'HBP']].sum(axis=1)
obp_chances = atbats[['AB', 'BB', 'HBP', 'SF']].sum(axis=1)
atbats['OBP'] = times_on_base / obp_chances
atbats['OPS'] = atbats['SLG'] + atbats['OBP']

# Strikeout and walk rates are per plate appearance, not per at-bat: walks
# are excluded from AB, so dividing by AB inflates both rates.
atbats['K%'] = atbats['K'] / atbats['PA']
atbats['BB%'] = atbats['BB'] / atbats['PA']
atbats['K-BB%'] = atbats['K%'] - atbats['BB%']

# BABIP = (H - HR) / (AB - K - HR + SF)
atbats['BABIP'] = (atbats['H'] - atbats['HR']) / (atbats['AB'] + atbats['SF']
                                                  - atbats['HR'] - atbats['K'])

# ISO = SLG - AVG (extra bases per at-bat)
atbats['ISO'] = atbats['SLG'] - atbats['AVG']

# Now for pitchers

(Note: the FIP calculation isn't quite correct. The constant added at the end should change from season to season so that the league-average FIP equals the league-average ERA; here a fixed 3.2 is used for every year.)

In [None]:
# Pitcher rate stats, computed from the season-to-date totals of the batters
# each pitcher has faced (the 'opp_' columns) and from innings pitched.
# Rows where a denominator is still 0 come out as NaN/inf rather than raising.

atbats['opp_AVG'] = atbats['opp_H'] / atbats['opp_AB']
atbats['opp_SLG'] = atbats['opp_TB'] / atbats['opp_AB']

# opp_OBP = (H + BB + HBP) / (AB + BB + HBP + SF)
opp_times_on_base = atbats[['opp_H', 'opp_BB', 'opp_HBP']].sum(axis=1)
opp_obp_chances = atbats[['opp_AB', 'opp_BB', 'opp_HBP', 'opp_SF']].sum(axis=1)
atbats['opp_OBP'] = opp_times_on_base / opp_obp_chances
atbats['opp_OPS'] = atbats['opp_SLG'] + atbats['opp_OBP']

# Strikeout and walk rates are per batter faced (plate appearance), not per
# at-bat: walks are excluded from AB, so dividing by AB inflates both rates.
atbats['opp_K%'] = atbats['opp_K'] / atbats['opp_PA']
atbats['opp_BB%'] = atbats['opp_BB'] / atbats['opp_PA']
atbats['opp_K-BB%'] = atbats['opp_K%'] - atbats['opp_BB%']

# opp_BABIP = (H - HR) / (AB - K - HR + SF)
atbats['opp_BABIP'] = ((atbats['opp_H'] - atbats['opp_HR']) /
                       (atbats['opp_AB'] + atbats['opp_SF'] -
                        atbats['opp_HR'] - atbats['opp_K']))

# FIP = (13*HR + 3*(BB + HBP) - 2*K) / IP + constant.
# A fixed constant of 3.2 is used here for every season.
atbats['FIP'] = (13*atbats['opp_HR'] + 3*(atbats['opp_BB'] + atbats['opp_HBP'])
                 - 2.0*atbats['opp_K'])/atbats['IP'] + 3.2

# WHIP = (H + BB) / IP
atbats['WHIP'] = atbats[['opp_H', 'opp_BB']].sum(axis=1)/atbats['IP']

# Let's get player names instead of ids

In [None]:
# Attach batter and pitcher names to every at-bat row.
player_names = pd.read_csv('../input/player_names.csv', index_col=0)

# The name table is joined twice on its index: first against batter_id, then
# against pitcher_id. The first join brings the name columns in unsuffixed;
# the second join then sees those same column names on both sides, so the
# `suffixes` pair renames the existing ones to '*_batter' and the incoming
# ones to '*_pitcher'.
atbats = (
    atbats
    .merge(player_names, how='left', left_on='batter_id', right_index=True)
    .merge(player_names, how='left', left_on='pitcher_id', right_index=True,
           suffixes=['_batter', '_pitcher'])
)

# Now it's time to check the leaderboards

Let's take each player, group their data by season, and see which batters had the best individual seasons, as measured by OPS. Notice that we take the tail of each player/year group to get the stats at the end of the season; if that were all we wanted, it would have been faster to take a plain sum instead of the cumulative sum above.

In [None]:
# Top 20 batter seasons by OPS.
# Keep only rows where the batter already has more than 400 at-bats, then take
# the last such row of each (batter, year) group as the season-ending line.
qualified_batters = atbats.loc[atbats['AB'] > 400]
batter_season_end = qualified_batters.groupby(['batter_id', 'year']).tail(n=1)

batting_columns = ['AB', 'H', '2B', '3B', 'HR', 'BB',
                   'IBB', 'K', 'AVG', 'SLG', 'OBP', 'OPS',
                   'first_name_batter',
                   'last_name_batter']

(
    batter_season_end
    .sort_values(by='OPS', ascending=False)
    .set_index(['year', 'batter_id'])[batting_columns]
    .head(n=20)
)

## Pitchers

Earned runs, and thus ERA, aren't a part of this dataset, so we'll go by FIP instead

In [None]:
# Top 20 pitcher seasons by FIP (lowest first).
# Keep only rows where the pitcher already has more than 100 innings, then
# take the last such row of each (pitcher, year) group as the season-ending
# line.
qualified_pitchers = atbats.loc[atbats['IP'] > 100]
pitcher_season_end = qualified_pitchers.groupby(['pitcher_id', 'year']).tail(n=1)

pitching_columns = ['WHIP', 'FIP', 'IP', 'opp_K', 'opp_BB',
                    'first_name_pitcher', 'last_name_pitcher']

(
    pitcher_season_end
    .sort_values(by='FIP')
    .set_index(['year', 'pitcher_id'])[pitching_columns]
    .head(n=20)
)