# Capstone 1: Data Wrangling Part 2:
Calculating average stats leading into games, to start creating features for a predictive model.

In [1]:
%matplotlib inline
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from datetime import datetime
from scipy.stats import stats
import math

In [2]:
# NOTE: pickle.load can execute arbitrary code while unpickling; only load files
# that were written by this project's own notebooks.
# The context managers guarantee the file handles are closed (the previous
# `infile.close` without parentheses never actually called close()).
with open('../PickledFiles/gamelogs2', 'rb') as infile:
    gamelogs = pickle.load(infile)

with open('../PickledFiles/pitchlogs', 'rb') as infile:
    pitchlogs = pickle.load(infile)

<function BufferedReader.close>

In [3]:
def melt_gamelogs(other_id_vars):
    """Reshape the global ``gamelogs`` into one row per (game, team).

    ``HomeTeam`` and ``VisitingTeam`` are unpivoted into a single ``Team``
    column; the default ``variable`` column produced by ``pd.melt`` records
    which of the two each row came from.

    Parameters
    ----------
    other_id_vars : list of str
        Extra columns to carry along next to ``Date``, ``HomeWin`` and
        ``DoubleHeader``.

    Returns
    -------
    pandas.DataFrame
        The melted frame, sorted by ``Team``.
    """
    id_columns = ['Date', 'HomeWin', 'DoubleHeader'] + other_id_vars
    return pd.melt(
        gamelogs,
        id_vars=id_columns,
        value_vars=['HomeTeam', 'VisitingTeam'],
        value_name='Team',
    ).sort_values('Team')

def merge_twice(df1, df2, df1_home, df1_visit, df2col):
    """Join ``df2`` onto ``df1`` twice: once for the home team, once for the visitor.

    Both joins match on ``Date`` and ``DoubleHeader`` plus a team column
    (``df1_home`` / ``df1_visit`` on the left, ``df2col`` on the right) and use
    pandas' default inner join.  Columns of ``df2`` that collide in the second
    join end up with the suffixes ``_H`` (home) and ``_V`` (visitor).

    Parameters
    ----------
    df1 : pandas.DataFrame
        Game-level frame.
    df2 : pandas.DataFrame
        Team-level frame keyed by ``Date``, ``DoubleHeader`` and ``df2col``.
    df1_home, df1_visit : str
        Columns of ``df1`` naming the home and visiting team.
    df2col : str
        Column of ``df2`` naming the team.

    Returns
    -------
    pandas.DataFrame
        ``df1`` with home and visitor information from ``df2`` attached.
    """
    shared_keys = ['Date', 'DoubleHeader']
    team_keys = shared_keys + [df2col]
    with_home = pd.merge(
        df1, df2,
        left_on=shared_keys + [df1_home],
        right_on=team_keys,
    )
    return pd.merge(
        with_home, df2,
        left_on=shared_keys + [df1_visit],
        right_on=team_keys,
        suffixes=['_H', '_V'],
    )

def relavent_team_stat(df, home_col, visit_col):
    """Pick, for each row, the stat belonging to the team that row describes.

    Parameters
    ----------
    df : pandas.DataFrame
        Melted frame with a ``variable`` column; rows whose ``variable`` is
        ``'HomeTeam'`` describe the home team, all other rows the visitor.
    home_col, visit_col : str
        Columns holding the home-team and visiting-team value of the stat.

    Returns
    -------
    list
        One value per row of ``df``, in row order: ``df[home_col]`` for home
        rows, ``df[visit_col]`` otherwise.
    """
    # Walk the three columns in lockstep instead of using iterrows(), which
    # builds a Series per row (slow) and upcasts every row to a common dtype.
    return [
        home_value if side == 'HomeTeam' else visit_value
        for side, home_value, visit_value in zip(df['variable'], df[home_col], df[visit_col])
    ]

def calculate_teamavg(game):
    """Running team average going *into* each game.

    Takes the expanding mean over at least 3 observations and shifts it down
    one row, so the value on a row is built only from earlier rows.
    """
    return game.expanding(3).mean().shift()

def calculate_pitchavg(game):
    """Running average for a starting pitcher going *into* each start.

    Unlike ``calculate_teamavg`` there is no minimum number of observations
    (there is less data per starting pitcher): the expanding mean starts with
    the first row and is shifted down one row so that each value is built
    only from earlier rows.
    """
    return game.expanding().mean().shift()

def fill_with_EOPS_data(grouped_df, df, col, col_to_groupby, new_col, min_games):
    """Fill missing running averages with the end-of-prior-season average.

    A temporary column ``'EOS' + col`` holds, per group of ``grouped_df``, the
    expanding mean of ``col`` (requiring ``min_games`` observations).  It is
    forward-filled within each ``col_to_groupby`` value, so rows where the
    current group has no value yet inherit the last value of an earlier group,
    and then used to fill the NaNs of ``new_col``.

    Parameters
    ----------
    grouped_df : pandas GroupBy
        Grouping of ``df`` (in this notebook: by entity and season).
    df : pandas.DataFrame
        Frame to update; it is modified in place.
    col : str
        Raw stat column the averages are computed from.
    col_to_groupby : str
        Column identifying the entity (e.g. pitcher or team) across groups.
    new_col : str
        Existing running-average column whose NaNs are to be filled.
    min_games : int
        Minimum number of observations before the expanding mean is defined.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``new_col`` filled and the temporary
        column removed.
    """
    eos_col = 'EOS' + col
    # transform() always returns a result indexed like df, so the assignment
    # aligns row-for-row; groupby.apply() may prepend the group keys to the index.
    df[eos_col] = grouped_df[col].transform(lambda s: s.expanding(min_games).mean())
    # Carry the last available value forward within each entity.
    df[eos_col] = df.groupby(col_to_groupby)[eos_col].ffill()
    df[new_col] = df[new_col].fillna(df[eos_col])
    # In-place drop: callers rely on df being mutated and ignore the return value.
    df.drop(columns=eos_col, inplace=True)
    return df


In [4]:
pitchlogs.head()

Unnamed: 0,Game Date,Pitcher,Opponent,GS,CG,GF,W,L,Sv,Sho,...,H,R,ER,HR,BB,K,HBP,BK,WP,PitchAbbrev
40498,2009-04-05,Brett Myers,Vs. ATL,1,0,0,0,1,0,0,...,8,4,4,3,1,6,0,0,0,Bre Myers
24300,2009-04-05,Derek Lowe,At PHI,1,0,0,1,0,0,0,...,2,0,0,0,0,4,0,0,0,Der Lowe
0,2009-04-06,Joe Saunders,Vs. OAK,1,0,0,1,0,0,0,...,3,0,0,0,2,2,1,0,0,Joe Saunders
42118,2009-04-06,Paul Maholm,At SLN,1,0,0,0,0,0,0,...,7,2,1,0,1,1,1,0,1,Pau Maholm
38878,2009-04-06,Johan Santana,At CIN,1,0,0,1,0,0,0,...,3,1,1,0,4,7,0,0,0,Joh Santana


## Calculate Averages for Starting Pitchers

In [5]:
pitchlogs.columns

Index(['Game Date', 'Pitcher', 'Opponent', 'GS', 'CG', 'GF', 'W', 'L', 'Sv',
       'Sho', 'IP', 'H', 'R', 'ER', 'HR', 'BB', 'K', 'HBP', 'BK', 'WP',
       'PitchAbbrev'],
      dtype='object')

In [6]:
# Make sure the per-game count columns are numeric before averaging them.
cols = ['GS', 'CG', 'GF', 'W', 'L', 'Sv', 'Sho']
pitchlogs[cols] = pitchlogs[cols].apply(pd.to_numeric)
pitchlogs[cols].dtypes

GS     int64
CG     int64
GF     int64
W      int64
L      int64
Sv     int64
Sho    int64
dtype: object

In [7]:
pitchlogs[(pitchlogs.GS != 1) | (pitchlogs.Sv != 0)]

Unnamed: 0,Game Date,Pitcher,Opponent,GS,CG,GF,W,L,Sv,Sho,...,H,R,ER,HR,BB,K,HBP,BK,WP,PitchAbbrev


In [8]:
# These columns carry no information: the filter in the previous cell returns no rows,
# i.e. every starting pitcher has 1 in 'GS' (games started) and 0 in 'Sv' (saves),
# so we can drop them.
pitchlogs = pitchlogs.drop(['GS', 'Sv'], axis=1)
pitchlogs.columns

Index(['Game Date', 'Pitcher', 'Opponent', 'CG', 'GF', 'W', 'L', 'Sho', 'IP',
       'H', 'R', 'ER', 'HR', 'BB', 'K', 'HBP', 'BK', 'WP', 'PitchAbbrev'],
      dtype='object')

In [9]:
# Group by pitcher and season so the running stats restart each season for
# every individual starting pitcher.
bypitcher = pitchlogs.groupby(['Pitcher', pitchlogs['Game Date'].dt.year])

# Outcome flags become 'pct<stat>' columns, counting stats become 'Avg<stat>'.
stat_prefixes = [
    ('pct', ['CG', 'GF', 'W', 'L', 'Sho']),
    ('Avg', ['IP', 'H', 'R', 'ER', 'HR', 'BB', 'K', 'HBP', 'BK', 'WP']),
]
for prefix, stat_cols in stat_prefixes:
    for col in stat_cols:
        new_col = prefix + col
        # transform() returns a result indexed like pitchlogs, so the assignment
        # aligns row-for-row; groupby.apply() may prepend the group keys instead.
        pitchlogs[new_col] = bypitcher[col].transform(calculate_pitchavg)
        # At the start of a season, fall back to the previous season's
        # end-of-season average (needs at least 2 starts in that season).
        fill_with_EOPS_data(bypitcher, pitchlogs, col, 'Pitcher', new_col, 2)


In [10]:
# Leave every pitcher's first recorded start as NaN: there is no earlier game
# to average over.
pitchlogs = pitchlogs.sort_values('Game Date')
pitchlogs = pitchlogs.reset_index()
avg_cols = [col for col in pitchlogs.columns if col.startswith(('pct','Avg'))]
# duplicated() is False only for the first occurrence of each pitcher, which
# after the date sort is their earliest start.  One vectorised mask replaces
# a Python loop that rebuilt and scanned the whole column for every pitcher.
first_start = ~pitchlogs['Pitcher'].duplicated()
pitchlogs.loc[first_start, avg_cols] = np.nan

In [11]:
pitchlogs

Unnamed: 0,index,Game Date,Pitcher,Opponent,CG,GF,W,L,Sho,IP,...,AvgIP,AvgH,AvgR,AvgER,AvgHR,AvgBB,AvgK,AvgHBP,AvgBK,AvgWP
0,40498,2009-04-05,Brett Myers,Vs. ATL,0,0,0,1,0,6.000000,...,,,,,,,,,,
1,24300,2009-04-05,Derek Lowe,At PHI,0,0,1,0,0,8.000000,...,,,,,,,,,,
2,25919,2009-04-06,Carlos Zambrano,At HOU,0,0,1,0,0,6.000000,...,,,,,,,,,,
3,45356,2009-04-06,Jake Peavy,Vs. LAN,0,0,0,1,0,7.000000,...,,,,,,,,,,
4,43736,2009-04-06,Adam Wainwright,Vs. PIT,0,0,0,0,0,5.333333,...,,,,,,,,,,
5,6480,2009-04-06,Cliff Lee,At TEX,0,0,0,1,0,5.000000,...,,,,,,,,,,
6,8098,2009-04-06,Justin Verlander,At TOR,0,0,0,1,0,3.666667,...,,,,,,,,,,
7,11337,2009-04-06,Francisco Liriano,Vs. SEA,0,0,0,1,0,7.000000,...,,,,,,,,,,
8,14578,2009-04-06,Dallas Braden,At LAA,0,0,0,1,0,6.000000,...,,,,,,,,,,
9,16198,2009-04-06,Felix Hernandez,At MIN,0,0,1,0,0,8.000000,...,,,,,,,,,,


In [12]:
pitchlogs['AvgERA'] = pitchlogs['AvgER'] * 9 / pitchlogs['AvgIP']

In [13]:
#check that it worked
RP = pitchlogs.loc[pitchlogs.Pitcher == 'Rick Porcello']
RP[['Game Date', 'ER','IP', 'AvgER','AvgIP', 'AvgERA']].tail(36).head(10)


Unnamed: 0,Game Date,ER,IP,AvgER,AvgIP,AvgERA
43322,2017-09-16,0,7.333333,3.2,6.211111,4.636852
43455,2017-09-22,4,4.0,3.096774,6.247312,4.461274
43604,2017-09-27,5,5.666667,3.125,6.177083,4.55312
43796,2018-03-31,1,5.333333,3.181818,6.161616,4.647541
43951,2018-04-07,3,7.333333,1.0,5.333333,1.6875
44100,2018-04-12,0,7.0,2.0,6.333333,2.842105
44233,2018-04-18,0,6.0,1.333333,6.555556,1.830508
44387,2018-04-24,3,7.0,1.0,6.416667,1.402597
44528,2018-04-29,3,7.666667,1.4,6.533333,1.928571
44681,2018-05-04,1,6.0,1.666667,6.722222,2.231405


In [14]:
RP[['Game Date', 'ER','IP', 'AvgER','AvgIP', 'AvgERA']]


Unnamed: 0,Game Date,ER,IP,AvgER,AvgIP,AvgERA
80,2009-04-09,4,5.000000,,,
349,2009-04-19,1,7.000000,4.000000,5.000000,7.200000
462,2009-04-24,4,6.000000,2.500000,6.000000,3.750000
602,2009-04-29,6,3.666667,3.000000,6.000000,4.500000
770,2009-05-05,0,7.000000,3.750000,5.416667,6.230769
917,2009-05-10,1,5.000000,3.000000,5.733333,4.709302
1074,2009-05-16,1,6.000000,2.666667,5.611111,4.277228
1240,2009-05-22,1,6.000000,2.428571,5.666667,3.857143
1381,2009-05-27,2,6.000000,2.250000,5.708333,3.547445
1551,2009-06-02,3,4.333333,2.222222,5.740741,3.483871


In [15]:
RP.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 307 entries, 80 to 48582
Data columns (total 36 columns):
index          307 non-null int64
Game Date      307 non-null datetime64[ns]
Pitcher        307 non-null object
Opponent       307 non-null object
CG             307 non-null int64
GF             307 non-null int64
W              307 non-null int64
L              307 non-null int64
Sho            307 non-null int64
IP             307 non-null float64
H              307 non-null int64
R              307 non-null int64
ER             307 non-null int64
HR             307 non-null int64
BB             307 non-null int64
K              307 non-null int64
HBP            307 non-null int64
BK             307 non-null int64
WP             307 non-null int64
PitchAbbrev    307 non-null object
pctCG          306 non-null float64
pctGF          306 non-null float64
pctW           306 non-null float64
pctL           306 non-null float64
pctSho         306 non-null float64
AvgIP          306

Each pitcher should have one null row for their first start...

In [16]:
num_pitchers = pitchlogs['Pitcher'].nunique()
num_pitchers

946

In [17]:
pitchlogs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48596 entries, 0 to 48595
Data columns (total 36 columns):
index          48596 non-null int64
Game Date      48596 non-null datetime64[ns]
Pitcher        48596 non-null object
Opponent       48596 non-null object
CG             48596 non-null int64
GF             48596 non-null int64
W              48596 non-null int64
L              48596 non-null int64
Sho            48596 non-null int64
IP             48596 non-null float64
H              48596 non-null int64
R              48596 non-null int64
ER             48596 non-null int64
HR             48596 non-null int64
BB             48596 non-null int64
K              48596 non-null int64
HBP            48596 non-null int64
BK             48596 non-null int64
WP             48596 non-null int64
PitchAbbrev    48596 non-null object
pctCG          47590 non-null float64
pctGF          47590 non-null float64
pctW           47590 non-null float64
pctL           47590 non-null float64
pctSh

In [18]:
nulls = pitchlogs[pitchlogs['AvgIP'].isnull()]
print(nulls.shape)

(1006, 36)


In [19]:
nulls['Pitcher'].nunique()

946

In [20]:
dupes = nulls[nulls.duplicated(subset = 'Pitcher', keep = False)]
dupes.sort_values('Pitcher')

Unnamed: 0,index,Game Date,Pitcher,Opponent,CG,GF,W,L,Sho,IP,...,AvgH,AvgR,AvgER,AvgHR,AvgBB,AvgK,AvgHBP,AvgBK,AvgWP,AvgERA
37729,38515,2016-08-22,A.J. Cole,At BAL,0,0,0,1,0,7.000000,...,,,,,,,,,,
29762,38250,2015-04-28,A.J. Cole,At ATL,0,0,0,0,0,2.000000,...,,,,,,,,,,
32265,15654,2015-08-01,Aaron Brooks,Vs. CLE,0,0,1,0,0,7.333333,...,,,,,,,,,,
25941,10581,2014-05-31,Aaron Brooks,At TOR,0,0,0,1,0,0.666667,...,,,,,,,,,,
23221,13731,2013-08-21,Adam Warren,Vs. TOR,0,0,0,0,0,3.000000,...,,,,,,,,,,
16874,13519,2012-06-29,Adam Warren,Vs. CHA,0,0,0,0,0,2.333333,...,,,,,,,,,,
36056,17399,2016-06-18,Adrian Sampson,At BOS,0,0,0,1,0,4.666667,...,,,,,,,,,,
48072,21042,2018-09-11,Adrian Sampson,At LAA,0,0,0,1,0,5.000000,...,,,,,,,,,,
48179,21045,2018-09-15,Alexander Claudio,At SDN,0,0,0,0,0,1.000000,...,,,,,,,,,,
39658,20762,2017-05-02,Alexander Claudio,At HOU,0,0,0,0,0,4.000000,...,,,,,,,,,,


The reason for the extra nulls seems to be that certain pitchers only made one start in a year and did not start again until a later season.  Most of these people are typically relief pitchers, who start occasionally.

In [21]:
#dupes = nulls[nulls.duplicated(subset = 'Pitcher', keep = 'first')]
dupes.shape

(116, 36)

In [22]:
dupes['Pitcher'].nunique()

56

In [23]:
116-56+946

1006

Why are there two more null ERAs than there are other nulls?

In [24]:
pitchlogs = pitchlogs.replace(np.inf,np.nan)

In [25]:
# Rows that have an average IP but still no average ERA.
has_avg_ip = pitchlogs['AvgIP'].notnull()
missing_era = pitchlogs['AvgERA'].isnull()
nullERA = pitchlogs[has_avg_ip & missing_era]
nullERA


Unnamed: 0,index,Game Date,Pitcher,Opponent,CG,GF,W,L,Sho,IP,...,AvgH,AvgR,AvgER,AvgHR,AvgBB,AvgK,AvgHBP,AvgBK,AvgWP,AvgERA
1122,36,2009-05-18,John Lackey,At SEA,0,0,1,0,0,5.0,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,
13806,39336,2011-09-01,Miguel Batista,Vs. FLA,0,0,1,0,0,6.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,


In [26]:
JL = pitchlogs[pitchlogs['Pitcher']=='John Lackey']

JL.head(10)

Unnamed: 0,index,Game Date,Pitcher,Opponent,CG,GF,W,L,Sho,IP,...,AvgH,AvgR,AvgER,AvgHR,AvgBB,AvgK,AvgHBP,AvgBK,AvgWP,AvgERA
1085,34,2009-05-16,John Lackey,At TEX,0,0,0,0,0,0.0,...,,,,,,,,,,
1122,36,2009-05-18,John Lackey,At SEA,0,0,1,0,0,5.0,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,
1252,41,2009-05-23,John Lackey,At LAN,0,0,0,0,0,7.0,...,3.5,3.0,2.5,0.5,0.0,1.5,0.5,0.0,0.0,9.0
1413,46,2009-05-29,John Lackey,Vs. SEA,0,0,0,1,0,7.333333,...,4.666667,3.0,2.666667,0.333333,0.333333,2.666667,1.333333,0.0,0.333333,6.0
1578,51,2009-06-04,John Lackey,At TOR,0,0,0,0,0,7.0,...,6.0,3.5,3.25,0.5,0.75,2.5,1.0,0.0,0.25,6.051724
1752,56,2009-06-10,John Lackey,At TBA,0,0,0,1,0,5.0,...,6.2,3.2,3.0,0.4,1.2,3.2,0.8,0.0,0.4,5.126582
1889,61,2009-06-15,John Lackey,At SFN,0,0,1,0,0,7.0,...,7.0,4.166667,3.833333,0.666667,1.333333,3.0,0.666667,0.0,0.5,6.606383
2062,66,2009-06-21,John Lackey,Vs. LAN,0,0,0,1,0,8.0,...,7.428571,4.0,3.714286,0.714286,1.142857,4.0,0.714286,0.0,0.428571,6.104348
2205,71,2009-06-27,John Lackey,At ARI,0,0,0,0,0,7.0,...,7.625,4.0,3.75,0.75,1.5,4.125,0.625,0.0,0.375,5.827338
2336,76,2009-07-02,John Lackey,Vs. BAL,0,0,1,0,0,8.0,...,7.333333,3.666667,3.333333,0.666667,1.666667,4.666667,0.555556,0.0,0.333333,5.0625


In [27]:
MB = pitchlogs[pitchlogs['Pitcher']=='Miguel Batista']
MB.head(10)

Unnamed: 0,index,Game Date,Pitcher,Opponent,CG,GF,W,L,Sho,IP,...,AvgH,AvgR,AvgER,AvgHR,AvgBB,AvgK,AvgHBP,AvgBK,AvgWP,AvgERA
7857,37520,2010-07-27,Miguel Batista,Vs. ATL,0,0,1,0,0,5.0,...,,,,,,,,,,
10287,44079,2011-04-22,Miguel Batista,Vs. CIN,0,0,0,0,0,0.0,...,,,,,,,,,,
13806,39336,2011-09-01,Miguel Batista,Vs. FLA,0,0,1,0,0,6.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,
13959,39341,2011-09-06,Miguel Batista,At FLA,0,0,0,0,0,6.0,...,3.0,1.0,1.0,0.0,2.0,1.5,0.0,0.0,0.0,3.0
14067,39347,2011-09-11,Miguel Batista,Vs. CHN,0,0,0,0,0,5.0,...,3.666667,1.0,1.0,0.0,2.666667,2.0,0.0,0.0,0.0,2.25
14562,39363,2011-09-28,Miguel Batista,Vs. CIN,1,0,1,0,1,9.0,...,4.0,1.75,1.75,0.0,2.75,1.75,0.5,0.0,0.25,3.705882
15060,39378,2012-04-23,Miguel Batista,Vs. SFN,0,0,0,1,0,3.666667,...,3.6,1.4,1.4,0.0,2.6,2.4,0.4,0.0,0.2,2.423077
15475,39393,2012-05-08,Miguel Batista,At PHI,0,0,0,0,0,5.333333,...,8.0,6.0,6.0,2.0,3.0,2.0,0.0,0.0,0.0,14.727273
15628,39398,2012-05-14,Miguel Batista,Vs. MIL,0,0,1,0,0,7.0,...,8.0,5.0,4.0,1.5,2.5,1.5,0.0,0.0,0.0,8.0
15767,39403,2012-05-19,Miguel Batista,At TOR,0,0,0,0,0,2.0,...,6.666667,3.333333,2.666667,1.0,2.0,2.666667,0.0,0.0,0.0,4.5


We see that the NaN and `inf` ERA values occur when a starting pitcher records 0 innings in their first start of the season: going into the next start their average innings pitched is 0, so the ERA calculation divides by zero. A starting pitcher can finish with 0 innings pitched if, for example, they allow many runs without ever recording an out. If a 0-inning start happens later in the season, it does not produce NaN or `inf`, because the earlier starts keep the average innings pitched above zero.

In John Lackey's case on May 16, 2009, he threw two wild pitches, the second of which hit the batter, so he was ejected.  In Miguel Batista's case, he walked the first batter and then threw two more pitches before it started raining, so they put in another pitcher after the delay.

It's worth looking at other games where the starting pitcher had 0 IPs, 0 hits, and 0 walks.

In [28]:
# Starts in which the pitcher recorded no innings, allowed no hits and issued no walks.
noOuts = pitchlogs[pitchlogs[['IP', 'H', 'BB']].eq(0).all(axis=1)]
# Hide the derived running-average columns so only the raw game line is shown.
cols_to_drop = [col for col in noOuts.columns if col.startswith(('Avg','pct'))]
noOuts.drop(columns=cols_to_drop)

Unnamed: 0,index,Game Date,Pitcher,Opponent,CG,GF,W,L,Sho,IP,H,R,ER,HR,BB,K,HBP,BK,WP,PitchAbbrev
1085,34,2009-05-16,John Lackey,At TEX,0,0,0,0,0,0.0,0,1,1,0,0,0,1,0,0,Joh Lackey
9335,5171,2010-09-20,Gavin Floyd,At OAK,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,Gav Floyd
21626,46077,2013-06-21,Clayton Richard,Vs. LAN,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,Cla Richard
28531,10667,2014-09-06,Daniel Duffy,At NYA,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,Dan Duffy
31725,25357,2015-07-09,Alex Wood,At COL,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,Ale Wood
36755,15803,2016-07-17,Rich Hill,Vs. TOR,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,Ric Hill
45068,35519,2018-05-19,Rich Hill,At WSH,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,Ric Hill
47338,32359,2018-08-15,Jose Urena,At ATL,0,0,0,0,0,0.0,0,0,0,0,0,0,1,0,0,Jos Urena
48375,3232,2018-09-23,Alex Cobb,At NYA,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,Ale Cobb


In [29]:
# Remove the starting-pitcher columns (names ending in '_V' / '_H') from gamelogs
# so they are not duplicated when the pitching logs are merged in.
cols_to_drop = [col for col in gamelogs.columns if col.endswith(('_V', '_H'))]
gamelogs = gamelogs.drop(columns=cols_to_drop)

# Player-ID and position columns are not used either.
cols_to_drop = [col for col in gamelogs.columns if col.endswith(('Position','erID'))]
gamelogs = gamelogs.drop(columns=cols_to_drop)

In [30]:
gamelogs.columns
gamelogs.shape

(24298, 123)

In [31]:
# Attach each starter's pitching log to the game: home starter first, then the
# visiting starter.  Columns that collide in the second merge are suffixed
# '_HSP' (home starting pitcher) and '_VSP' (visiting starting pitcher).
# NOTE(review): the join key is (date, abbreviated pitcher name); this assumes
# that pair is unique in pitchlogs — confirm, otherwise rows would be duplicated.
gamelogs_hsp = pd.merge(
    gamelogs, pitchlogs,
    how='left',
    left_on=['Date', 'HomeSPAbbrev'],
    right_on=['Game Date', 'PitchAbbrev'],
)
gamelogs = pd.merge(
    gamelogs_hsp, pitchlogs,
    how='left',
    left_on=['Date', 'VisitSPAbbrev'],
    right_on=['Game Date', 'PitchAbbrev'],
    suffixes=['_HSP', '_VSP'],
)

In [32]:
gamelogs.shape

(24298, 195)

## Calculate Averages for Teams
~to get relief pitchers stats alone~

In [33]:
gamelogs['NumHalfInnings'] = gamelogs['LengthInOuts']/3

In [34]:
def HomeTeamOffenseInnings(lengthinouts, numhalfinnings):
    """Return the number of innings the home team spent on offense.

    Parameters
    ----------
    lengthinouts : number
        Total outs recorded in the game.
    numhalfinnings : number
        ``lengthinouts / 3``, i.e. the game length in half-innings.

    Returns
    -------
    number
        ``numhalfinnings // 2`` when the game ended on a complete half-inning;
        otherwise ``numhalfinnings`` minus the half-innings rounded up to the
        other side, which leaves a fractional inning for the home team.
    """
    if lengthinouts % 3 != 0:
        # Game ended mid-inning: the partial inning is credited to the home team.
        return numhalfinnings - math.ceil(numhalfinnings / 2)
    return numhalfinnings // 2

# Build the whole column in one pass; writing one cell at a time with
# gamelogs.loc[...] inside iterrows() is far slower and gives the same values.
gamelogs['HomeOffenseInnings'] = [
    HomeTeamOffenseInnings(outs, half_innings)
    for outs, half_innings in zip(gamelogs['LengthInOuts'], gamelogs['NumHalfInnings'])
]

# Whatever the home team did not bat belongs to the visitors.
gamelogs['VisitorOffenseInnings'] = gamelogs['NumHalfInnings']-gamelogs['HomeOffenseInnings']

In [35]:
gamelogs[['NumHalfInnings','HomeOffenseInnings','VisitorOffenseInnings']].head(20)

Unnamed: 0,NumHalfInnings,HomeOffenseInnings,VisitorOffenseInnings
0,18.0,9.0,9.0
1,17.0,8.0,9.0
2,18.0,9.0,9.0
3,17.0,8.0,9.0
4,18.0,9.0,9.0
5,18.0,9.0,9.0
6,18.0,9.0,9.0
7,17.0,8.0,9.0
8,17.0,8.0,9.0
9,18.0,9.0,9.0


In [36]:
# Create new columns with relief-pitching info: team total minus the starter's line.
# Each entry is (new column, team-total column, starting pitcher's column), listed
# in the order the columns are added.  Innings pitched by one side are taken from
# the other side's offensive innings; K and BB totals likewise come from the
# opposing side's columns.
relief_specs = [
    ('HomeReliefIP', 'VisitorOffenseInnings', 'IP_HSP'),
    ('VisitorReliefIP', 'HomeOffenseInnings', 'IP_VSP'),
    ('HomeReliefER', 'HomeER', 'ER_HSP'),
    ('VisitorReliefER', 'VisitorER', 'ER_VSP'),
    ('HomeReliefK', 'VisitorK', 'K_HSP'),
    ('VisitorReliefK', 'HomeK', 'K_VSP'),
    ('HomeReliefBB', 'VisitorBB', 'BB_HSP'),
    ('VisitorReliefBB', 'HomeBB', 'BB_VSP'),
]
for relief_col, total_col, starter_col in relief_specs:
    gamelogs[relief_col] = gamelogs[total_col] - gamelogs[starter_col]


In [37]:
# Times reaching base (RB = H + BB + HBP) and plate appearances
# (PA = AB + BB + HBP + SF) for each side.
for side in ('Home', 'Visitor'):
    gamelogs[side + 'RB'] = gamelogs[side + 'H'] + gamelogs[side + 'BB'] + gamelogs[side + 'HBP']
    gamelogs[side + 'PA'] = (
        gamelogs[side + 'AB'] + gamelogs[side + 'BB']
        + gamelogs[side + 'HBP'] + gamelogs[side + 'SF']
    )

# Extra bases: doubles count 1, triples 2, home runs 3.
for side in ('Home', 'Visitor'):
    gamelogs[side + 'Power'] = (
        gamelogs[side + 'D'] + 2 * gamelogs[side + 'T'] + 3 * gamelogs[side + 'HR']
    )

In [38]:
gamelogs = gamelogs.drop(['HomeOBP','VisitOBP'], axis=1)

In [39]:
gamelogs.rename(columns = {'HomeRunsScore': 'HomeRunsScored'}, inplace=True)


In [40]:
# Collect every home-side stat column that should get a running team average.
# Positions of the first ('HomeAB') and last ('HomeTP') column of the
# contiguous block of home-team stats in gamelogs.
home_idx1 = gamelogs.columns.get_loc('HomeAB')
home_idx2 = gamelogs.columns.get_loc('HomeTP')
print(home_idx1)
print(home_idx2)
cols1 = gamelogs['HomeRunsScored']
# +1 because iloc slicing excludes the end position.
cols2 = gamelogs.iloc[:, home_idx1 : home_idx2+1]
# Derived relief-pitching and offense columns created in the cells above.
cols3 = gamelogs[['HomeReliefIP','HomeReliefER', 'HomeReliefK', 'HomeReliefBB']]
cols4 = gamelogs[['HomeOffenseInnings','HomeRB','HomePA', 'HomePower']]
# Only the column names of this frame are used by the averaging loop below.
cols = pd.concat([cols1,cols2,cols3, cols4], axis=1)
cols.columns

49
76


Index(['HomeRunsScored', 'HomeAB', 'HomeH', 'HomeD', 'HomeT', 'HomeHR',
       'HomeRBI', 'HomeSH', 'HomeSF', 'HomeHBP', 'HomeBB', 'HomeIBB', 'HomeK',
       'HomeSB', 'HomeCS', 'HomeGDP', 'HomeCI', 'HomeLOB', 'HomePitchers',
       'HomeER', 'HomeTER', 'HomeWP', 'HomeBalks', 'HomePO', 'HomeA', 'HomeE',
       'HomePassed', 'HomeDB', 'HomeTP', 'HomeReliefIP', 'HomeReliefER',
       'HomeReliefK', 'HomeReliefBB', 'HomeOffenseInnings', 'HomeRB', 'HomePA',
       'HomePower'],
      dtype='object')

In [42]:
# Calculate team averages: for every home-side stat, build each team's running
# average going into a game and attach it to gamelogs for both teams
# ('Avg<stat>_H' for the home team, 'Avg<stat>_V' for the visitor).
for home_col in cols.columns:
    stat = home_col[4:]  # strip the 'Home' prefix
    visit_col = 'Visitor' + stat
    melted = melt_gamelogs([home_col, visit_col])
    melted['stat'] = relavent_team_stat(melted, home_col, visit_col)
    # Sort chronologically within each team.  DoubleHeader is part of the key
    # so two games on the same date get a deterministic order
    # (assumes it numbers same-day games in play order — TODO confirm).
    melted = melted.sort_values(['Team', 'Date', 'DoubleHeader'])
    new_col = 'Avg' + stat
    melted_grpd = melted.groupby(['Team', melted['Date'].dt.year])
    # transform() returns a result indexed like `melted`, so the assignment
    # aligns row-for-row; groupby.apply() may prepend the group keys instead.
    melted[new_col] = melted_grpd['stat'].transform(calculate_teamavg)
    # Early-season gaps fall back to the previous season's final average.
    fill_with_EOPS_data(melted_grpd, melted, 'stat', 'Team', new_col, 4)
    melted = melted[['Date','DoubleHeader','Team',new_col]]
    gamelogs = merge_twice(gamelogs, melted, 'HomeTeam', 'VisitingTeam', 'Team')

In [43]:
gamelogs.tail()

Unnamed: 0,Date,DoubleHeader,DayOfWeek,VisitingTeam,VisitingTeamLeague,VisitingTeamGameNumber,HomeTeam,HomeTeamLeague,HomeTeamGameNumber,VisitorRunsScored,...,Team_V,AvgRB_V,Team_H,AvgPA_H,Team_V.1,AvgPA_V,Team_H.1,AvgPower_H,Team_V.2,AvgPower_V
24293,2018-09-30,0,Sun,CHA,AL,162,MIN,AL,162,4,...,CHA,11.254658,MIN,37.900621,CHA,37.322981,MIN,5.285714,CHA,5.478261
24294,2018-09-30,0,Sun,TEX,AL,162,SEA,AL,162,1,...,TEX,12.093168,SEA,37.403727,TEX,37.89441,SEA,5.254658,TEX,5.546584
24295,2018-09-30,0,Sun,TOR,AL,162,TBA,AL,162,4,...,TOR,11.68323,TBA,38.049689,TOR,37.47205,TBA,5.012422,TOR,6.204969
24296,2018-10-01,0,Mon,MIL,NL,163,CHN,NL,163,3,...,MIL,12.228395,CHN,38.845679,MIL,37.925926,CHN,5.259259,MIL,5.882716
24297,2018-10-01,0,Mon,COL,NL,163,LAN,NL,163,2,...,COL,12.141975,LAN,38.790123,COL,37.666667,LAN,6.530864,COL,6.098765


In [44]:
# On-base percentage: average times reaching base per average plate appearance.
gamelogs['AvgOBP_H'] = gamelogs['AvgRB_H']/gamelogs['AvgPA_H']
gamelogs['AvgOBP_V'] = gamelogs['AvgRB_V']/gamelogs['AvgPA_V']

# Isolated power: average extra bases per average at-bat.
gamelogs['AvgISO_H'] = gamelogs['AvgPower_H']/gamelogs['AvgAB_H']
# Fixed: the visitor's ISO previously used the home team's power (AvgPower_H).
gamelogs['AvgISO_V'] = gamelogs['AvgPower_V']/gamelogs['AvgAB_V']

# Offense earned runs scored per 9 innings.
# NOTE(review): each side's rate uses the *other* side's AvgER column; if AvgER
# is a team's average earned runs allowed, this mixes the current opponent's
# pitching with this team's offensive innings — confirm this is intended.
gamelogs['AvgOffERunsPer9Inn_H'] = gamelogs['AvgER_V'] * 9 / gamelogs['AvgOffenseInnings_H']
gamelogs['AvgOffERunsPer9Inn_V'] = gamelogs['AvgER_H'] * 9 / gamelogs['AvgOffenseInnings_V']

# Relief ERA: average relief earned runs per 9 average relief innings.
gamelogs['AvgReliefERA_H'] = gamelogs['AvgReliefER_H'] * 9 / gamelogs['AvgReliefIP_H']
# Fixed: the visitor's relief ERA previously divided by the home team's relief innings.
gamelogs['AvgReliefERA_V'] = gamelogs['AvgReliefER_V'] * 9 / gamelogs['AvgReliefIP_V']

In [47]:
gamelogs.Attendance.head()

0    44532.0
1    48799.0
2    42177.0
3    34323.0
4    43827.0
Name: Attendance, dtype: float64

In [49]:
# Running average attendance per ballpark, restarted every season.
byparkyear = gamelogs.groupby(['ParkID',gamelogs['Date'].dt.year])
# transform() returns a result indexed like gamelogs, so the assignment aligns
# row-for-row; groupby.apply() may prepend the group keys instead.
gamelogs['AvgAttendance'] = byparkyear['Attendance'].transform(calculate_pitchavg)
# The first game of a season at a park has no average yet: carry the park's
# last available value forward (ffill() replaces the deprecated fillna(method='ffill')).
gamelogs['AvgAttendance'] = gamelogs.groupby('ParkID')['AvgAttendance'].ffill()

In [52]:
gamelogs[['Date','HomeTeam','AvgAttendance']].tail()

Unnamed: 0,Date,HomeTeam,AvgAttendance
24293,2018-09-30,MIN,24544.743109
24294,2018-09-30,SEA,28479.2875
24295,2018-09-30,TBA,14270.75
24296,2018-10-01,CHN,38798.012346
24297,2018-10-01,LAN,47033.135802


In [53]:
gamelogs.columns

Index(['Date', 'DoubleHeader', 'DayOfWeek', 'VisitingTeam',
       'VisitingTeamLeague', 'VisitingTeamGameNumber', 'HomeTeam',
       'HomeTeamLeague', 'HomeTeamGameNumber', 'VisitorRunsScored',
       ...
       'AvgPower_V', 'AvgOBP_H', 'AvgOBP_V', 'AvgISO_H', 'AvgISO_V',
       'AvgOffERunsPer9Inn_H', 'AvgOffERunsPer9Inn_V', 'AvgReliefERA_H',
       'AvgReliefERA_V', 'AvgAttendance'],
      dtype='object', length=367)

In [54]:
gamelogs = gamelogs.drop(['Team_V', 'Team_H'], axis=1)
gamelogs.shape

(24298, 293)

In [59]:
#check that it worked
RS = gamelogs[(gamelogs['HomeTeam']=='BOS') | (gamelogs['VisitingTeam']=='BOS')].sort_values('Date')
RS_H = RS[['Date', 'HomeTeam', 'VisitingTeam','HomeH','AvgH_H','VisitorH', 'AvgH_V']]
RS_H.iloc[155:170,:]

Unnamed: 0,Date,HomeTeam,VisitingTeam,HomeH,AvgH_H,VisitorH,AvgH_V
2338,2009-09-28,BOS,TOR,7,9.219355,14,9.282051
2351,2009-09-29,BOS,TOR,14,9.205128,11,9.312102
2367,2009-09-30,BOS,TOR,3,9.235669,17,9.322785
2380,2009-10-01,BOS,CLE,12,9.196203,3,9.107595
2393,2009-10-02,BOS,CLE,8,9.213836,8,9.069182
2408,2009-10-03,BOS,CLE,11,9.20625,10,9.0625
2423,2009-10-04,BOS,CLE,11,9.217391,8,9.068323
2430,2010-04-04,BOS,NYA,12,9.228395,12,9.901235
2445,2010-04-06,BOS,NYA,9,9.228395,9,9.901235
2452,2010-04-07,BOS,NYA,7,9.228395,6,9.901235


In [60]:
gamelogs = gamelogs.replace(np.inf,np.nan)

In [61]:
rel_cols = [col for col in gamelogs.columns if col.startswith(('pct','Avg'))]
gamelogs[rel_cols].shape

(24298, 115)

In [62]:
gamelogs[rel_cols].iloc[:,:60].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24298 entries, 0 to 24297
Data columns (total 60 columns):
pctCG_HSP          23772 non-null float64
pctGF_HSP          23772 non-null float64
pctW_HSP           23772 non-null float64
pctL_HSP           23772 non-null float64
pctSho_HSP         23772 non-null float64
AvgIP_HSP          23772 non-null float64
AvgH_HSP           23772 non-null float64
AvgR_HSP           23772 non-null float64
AvgER_HSP          23772 non-null float64
AvgHR_HSP          23772 non-null float64
AvgBB_HSP          23772 non-null float64
AvgK_HSP           23772 non-null float64
AvgHBP_HSP         23772 non-null float64
AvgBK_HSP          23772 non-null float64
AvgWP_HSP          23772 non-null float64
AvgERA_HSP         23771 non-null float64
pctCG_VSP          23818 non-null float64
pctGF_VSP          23818 non-null float64
pctW_VSP           23818 non-null float64
pctL_VSP           23818 non-null float64
pctSho_VSP         23818 non-null float64
AvgIP_VSP

In [63]:
gamelogs[rel_cols].iloc[:,60:].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24298 entries, 0 to 24297
Data columns (total 55 columns):
AvgCS_H                 24253 non-null float64
AvgCS_V                 24253 non-null float64
AvgGDP_H                24253 non-null float64
AvgGDP_V                24253 non-null float64
AvgCI_H                 24253 non-null float64
AvgCI_V                 24253 non-null float64
AvgLOB_H                24253 non-null float64
AvgLOB_V                24253 non-null float64
AvgPitchers_H           24253 non-null float64
AvgPitchers_V           24253 non-null float64
AvgER_H                 24253 non-null float64
AvgER_V                 24253 non-null float64
AvgTER_H                24253 non-null float64
AvgTER_V                24253 non-null float64
AvgWP_H                 24253 non-null float64
AvgWP_V                 24253 non-null float64
AvgBalks_H              24253 non-null float64
AvgBalks_V              24253 non-null float64
AvgPO_H                 24253 non-null float6

In [64]:
nullHR = gamelogs[gamelogs['AvgHR_H'].isnull()]
nullHR

Unnamed: 0,Date,DoubleHeader,DayOfWeek,VisitingTeam,VisitingTeamLeague,VisitingTeamGameNumber,HomeTeam,HomeTeamLeague,HomeTeamGameNumber,VisitorRunsScored,...,AvgPower_V,AvgOBP_H,AvgOBP_V,AvgISO_H,AvgISO_V,AvgOffERunsPer9Inn_H,AvgOffERunsPer9Inn_V,AvgReliefERA_H,AvgReliefERA_V,AvgAttendance
0,2009-04-05,0,Sun,ATL,NL,1,PHI,NL,1,4,...,,,,,,,,,,
1,2009-04-06,0,Mon,COL,NL,1,ARI,NL,1,8,...,,,,,,,,,,
2,2009-04-06,0,Mon,NYN,NL,1,CIN,NL,1,2,...,,,,,,,,,,
3,2009-04-06,0,Mon,WAS,NL,1,FLO,NL,1,6,...,,,,,,,,,,
4,2009-04-06,0,Mon,CHN,NL,1,HOU,NL,1,4,...,,,,,,,,,,
5,2009-04-06,0,Mon,LAN,NL,1,SDN,NL,1,4,...,,,,,,,,,,
6,2009-04-06,0,Mon,PIT,NL,1,SLN,NL,1,6,...,,,,,,,,,,
7,2009-04-06,0,Mon,OAK,AL,1,ANA,AL,1,0,...,,,,,,,,,,
8,2009-04-06,0,Mon,NYA,AL,1,BAL,AL,1,5,...,,,,,,,,,,
9,2009-04-06,0,Mon,SEA,AL,1,MIN,AL,1,6,...,,,,,,,,,,


In [66]:
# Count how often each team appears (home or away) among the games without an average.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
nullHR_teams = pd.concat([nullHR['HomeTeam'], nullHR['VisitingTeam']])
nullHR_teams.value_counts()

FLO    3
NYN    3
SDN    3
HOU    3
TOR    3
DET    3
NYA    3
ATL    3
MIN    3
ARI    3
TEX    3
KCA    3
WAS    3
TBA    3
CIN    3
SLN    3
BOS    3
CLE    3
LAN    3
OAK    3
BAL    3
SEA    3
CHA    3
PHI    3
CHN    3
ANA    3
MIL    3
SFN    3
COL    3
PIT    3
dtype: int64

In [67]:
gamelogs.head()

Unnamed: 0,Date,DoubleHeader,DayOfWeek,VisitingTeam,VisitingTeamLeague,VisitingTeamGameNumber,HomeTeam,HomeTeamLeague,HomeTeamGameNumber,VisitorRunsScored,...,AvgPower_V,AvgOBP_H,AvgOBP_V,AvgISO_H,AvgISO_V,AvgOffERunsPer9Inn_H,AvgOffERunsPer9Inn_V,AvgReliefERA_H,AvgReliefERA_V,AvgAttendance
0,2009-04-05,0,Sun,ATL,NL,1,PHI,NL,1,4,...,,,,,,,,,,
1,2009-04-06,0,Mon,COL,NL,1,ARI,NL,1,8,...,,,,,,,,,,
2,2009-04-06,0,Mon,NYN,NL,1,CIN,NL,1,2,...,,,,,,,,,,
3,2009-04-06,0,Mon,WAS,NL,1,FLO,NL,1,6,...,,,,,,,,,,
4,2009-04-06,0,Mon,CHN,NL,1,HOU,NL,1,4,...,,,,,,,,,,


Ready to pickle and move on to Inferential Statistics!

In [68]:
# The context manager closes the file even if pickle.dump raises.
with open('../PickledFiles/gamelogs3', 'wb') as outfile:
    pickle.dump(gamelogs, outfile)