# Feature Engineering - Rolling Averages

In [1]:
%matplotlib inline
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from datetime import datetime
from scipy.stats import stats
from scipy.stats import norm
from statsmodels.stats.weightstats import ztest
from statsmodels.stats.proportion import proportions_ztest
import math

### Create the Features - Data Wrangling to Get Rolling Averages


In [2]:
def melt_gamelogs(other_id_vars):
    """Reshape the global ``gamelogs`` frame to one row per (game, team).

    'HomeTeam' and 'VisitingTeam' are unpivoted into a single 'Team' column,
    so every game appears twice. The default melt label column 'variable'
    records which of the two source columns each row came from.

    Parameters
    ----------
    other_id_vars : list of str
        Extra columns to carry along next to 'Date', 'HomeWin' and
        'DoubleHeader'.

    Returns
    -------
    pandas.DataFrame
        The long-form frame, sorted by 'Team'.
    """
    id_columns = ['Date', 'HomeWin', 'DoubleHeader'] + other_id_vars
    long_form = pd.melt(
        gamelogs,
        id_vars=id_columns,
        value_vars=['HomeTeam', 'VisitingTeam'],
        value_name='Team',
    )
    return long_form.sort_values('Team')

def merge_twice(df1, df2, df1_home, df1_visit, df2col):
    """Join ``df2`` onto ``df1`` twice: once for the home team, once for the visitor.

    Both joins match on 'Date' and 'DoubleHeader' plus a team column
    (``df1_home`` / ``df1_visit`` on the left, ``df2col`` on the right).
    Columns of ``df2`` that collide in the second join receive the suffix
    '_H' (home copy) or '_V' (visitor copy).

    Parameters
    ----------
    df1 : pandas.DataFrame
        Game-level frame holding ``df1_home`` and ``df1_visit``.
    df2 : pandas.DataFrame
        Team-level frame holding ``df2col``.
    df1_home, df1_visit : str
        Names of the home-team and visiting-team columns in ``df1``.
    df2col : str
        Name of the team column in ``df2``.

    Returns
    -------
    pandas.DataFrame
        ``df1`` with the home and visitor versions of ``df2``'s columns.
    """
    shared_keys = ['Date', 'DoubleHeader']
    team_keys = shared_keys + [df2col]

    with_home = pd.merge(df1, df2, left_on=shared_keys + [df1_home], right_on=team_keys)
    with_both = pd.merge(
        with_home,
        df2,
        left_on=shared_keys + [df1_visit],
        right_on=team_keys,
        suffixes=['_H', '_V'],
    )
    return with_both

def relavent_team_stat(df, home_col, visit_col):
    """Pick, for every row, the stat belonging to the team that row describes.

    Rows whose 'variable' column equals 'HomeTeam' take their value from
    ``home_col``; all other rows take it from ``visit_col``.

    The selection is vectorized with ``np.where`` instead of looping over
    ``df.iterrows()``, which matters because this is called once per statistic
    on the full melted game log.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'variable' column plus ``home_col`` and ``visit_col``.
    home_col, visit_col : str
        Names of the home-team and visiting-team stat columns.

    Returns
    -------
    list
        One value per row of ``df``, in row order (empty list for empty input).
    """
    is_home_row = (df['variable'] == 'HomeTeam').values
    # Work on the raw arrays so the choice is purely positional (no index alignment).
    chosen = np.where(is_home_row, df[home_col].values, df[visit_col].values)
    return chosen.tolist()

def rolling_avg(game):
    """Trailing mean of a team's per-game values, excluding the current game.

    The window length comes from the module-level ``window``. At least two
    observations are required (``min_periods=2``), and the result is shifted
    down one row so each game only sees games that came before it; the first
    two entries are therefore NaN.
    """
    trailing_mean = game.rolling(window=window, min_periods=2).mean()
    return trailing_mean.shift()

def rolling_avg_pitch(start):
    """Trailing mean of a starting pitcher's per-start values, excluding the current start.

    The window length comes from the module-level ``window_pitch``. At least
    two observations are required (``min_periods=2``), and the result is
    shifted down one row so each start only sees earlier starts.
    """
    trailing_mean = start.rolling(window=window_pitch, min_periods=2).mean()
    return trailing_mean.shift()

def rolling_avg_hva(game):
    """Trailing mean used for home-only or away-only team statistics.

    The window length comes from the module-level ``window_hva``. At least
    two observations are required (``min_periods=2``), and the result is
    shifted down one row so each game only sees games that came before it.
    """
    trailing_mean = game.rolling(window=window_hva, min_periods=2).mean()
    return trailing_mean.shift()



In [3]:
# Load the game-level log and the starting-pitcher log from their pickle files.
# `with` guarantees each file is closed; the previous `infile.close` (no
# parentheses) only referenced the method and never closed anything.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open('../PickledFiles/gamelogs2', 'rb') as infile:
    gamelogs = pickle.load(infile)

with open('../PickledFiles/pitchlogs', 'rb') as infile:
    pitchlogs = pickle.load(infile)

<function BufferedReader.close>

In [4]:
# Number of most recent games in a team's trailing window (read by rolling_avg).
# NOTE(review): 162 presumably corresponds to one full regular season - confirm.
window = 162
# Number of most recent starts in a starting pitcher's trailing window (read by rolling_avg_pitch).
window_pitch = 30
# Window for home-only / away-only statistics (read by rolling_avg_hva): half of `window`.
window_hva = window//2

In [5]:
# Convert the per-start counting columns to numeric dtypes, then display the
# resulting dtypes as a sanity check.
cols = ['GS', 'CG', 'GF', 'W', 'L', 'Sv', 'Sho']
pitchlogs[cols] = pitchlogs[cols].apply(pd.to_numeric)
pitchlogs[cols].dtypes

GS     int64
CG     int64
GF     int64
W      int64
L      int64
Sv     int64
Sho    int64
dtype: object

In [6]:
# These columns are irrelevant: the starting pitcher always has 1 in 'GS' (game started)
# and 0 in 'Sv' (game saved), so neither column carries information and both can be dropped.
pitchlogs = pitchlogs.drop(['GS', 'Sv'], axis=1)
pitchlogs.columns

Index(['Game Date', 'Pitcher', 'Opponent', 'CG', 'GF', 'W', 'L', 'Sho', 'IP',
       'H', 'R', 'ER', 'HR', 'BB', 'K', 'HBP', 'BK', 'WP', 'PitchAbbrev'],
      dtype='object')

In [7]:
# Replace the existing index with a fresh 0..n-1 index, discarding the old one.
pitchlogs = pitchlogs.reset_index(drop=True)
pitchlogs.head()

Unnamed: 0,Game Date,Pitcher,Opponent,CG,GF,W,L,Sho,IP,H,R,ER,HR,BB,K,HBP,BK,WP,PitchAbbrev
0,2009-04-05,Brett Myers,Vs. ATL,0,0,0,1,0,6.0,8,4,4,3,1,6,0,0,0,Bre Myers
1,2009-04-05,Derek Lowe,At PHI,0,0,1,0,0,8.0,2,0,0,0,0,4,0,0,0,Der Lowe
2,2009-04-06,Joe Saunders,Vs. OAK,0,0,1,0,0,6.666667,3,0,0,0,2,2,1,0,0,Joe Saunders
3,2009-04-06,Paul Maholm,At SLN,0,0,0,0,0,6.666667,7,2,1,0,1,1,1,0,1,Pau Maholm
4,2009-04-06,Johan Santana,At CIN,0,0,1,0,0,5.666667,3,1,1,0,4,7,0,0,0,Joh Santana


In [8]:
# 'Opponent' looks like 'Vs. ATL' / 'At PHI'; keep only its last three characters
# (the opposing team's code) in a new 'Opp' column.
pitchlogs['Opp']=pitchlogs['Opponent'].str[-3:]
pitchlogs['Opp'].head()

0    ATL
1    PHI
2    OAK
3    SLN
4    CIN
Name: Opp, dtype: object

In [9]:
# Relabel the 'MIA' opponent strings as 'FLA' so both labels share one team code.
# `florida` has to be a dict mapping old -> new; the previous version was a *set*
# of 'old: new' strings, which matched no cell and left the data unchanged.
florida = {'Vs. MIA': 'Vs. FLA', 'At MIA': 'At FLA'}
pitchlogs['Opponent'] = pitchlogs['Opponent'].replace(florida)
# 'Opp' was derived from 'Opponent' before this relabel, so refresh it; otherwise
# the by-opponent grouping below would still split the two codes.
pitchlogs['Opp'] = pitchlogs['Opponent'].str[-3:]

# Group once by pitcher and once by opponent; both groupings feed the rolling features.
bypitcher = pitchlogs.groupby('Pitcher')
by_opp = pitchlogs.groupby('Opp')

In [10]:
# Names of the rolling features: 'pct*' for the 0/1 outcome columns, 'Avg*' for counting stats.
#not including 'CG' and 'GF' because we found they are very rare in Inferential Statistics
new_cols = ['pct' + col for col in ['W', 'L', 'Sho']]
new_cols += ['Avg' + col for col in ['IP', 'H', 'R', 'ER', 'HR', 'BB', 'K', 'HBP', 'BK', 'WP']]

for new_col in new_cols:
    # Both prefixes ('pct', 'Avg') are three characters long, so this recovers the raw column.
    col = new_col[3:]

    # transform() always returns a result indexed like pitchlogs. The earlier
    # groupby(...).apply(...) prepends the group key to the index on pandas >= 2.0,
    # which makes the column assignment fail.
    pitcher_avg = bypitcher[col].transform(rolling_avg_pitch)
    # Trailing mean of what starters did against this opponent.
    opp_avg = by_opp[col].transform(rolling_avg)
    # The pitcher's single previous start (NaN for a first start).
    prev_start = pitchlogs.groupby('Pitcher')[col].shift()
    # Blend used when a pitcher has exactly one earlier start.
    blended = (opp_avg + prev_start) / 2

    # Fallback chain: pitcher's own rolling mean -> blend -> opponent-only mean.
    pitchlogs[new_col] = pitcher_avg.fillna(blended).fillna(opp_avg)

In [11]:
#check it worked
RP = pitchlogs[pitchlogs.Pitcher == 'Rick Porcello']
RP[['Game Date', 'Opp','HR','AvgHR']].head(32)

Unnamed: 0,Game Date,Opp,HR,AvgHR
96,2009-04-09,TOR,2,0.666667
353,2009-04-19,SEA,1,1.291667
479,2009-04-24,KCA,2,1.5
622,2009-04-29,NYA,1,1.666667
789,2009-05-05,MIN,0,1.5
935,2009-05-10,CLE,0,1.2
1071,2009-05-16,OAK,0,1.0
1224,2009-05-22,COL,0,0.857143
1398,2009-05-27,KCA,1,0.75
1529,2009-06-02,BOS,1,0.777778


In [12]:
TOR = pitchlogs[pitchlogs.Opp == 'TOR']
TOR[['Game Date', 'Pitcher', 'Opp','HR','AvgHR']].head()

Unnamed: 0,Game Date,Pitcher,Opp,HR,AvgHR
19,2009-04-06,Justin Verlander,TOR,1,
29,2009-04-07,Edwin Jackson,TOR,1,
74,2009-04-08,Zach Miner,TOR,0,1.0
96,2009-04-09,Rick Porcello,TOR,2,0.666667
111,2009-04-10,Scott Lewis,TOR,2,1.0


In [13]:
# Rolling ERA: average earned runs scaled to nine innings (9 * ER / IP).
# A zero AvgIP produces inf (or NaN for 0/0) here.
pitchlogs['AvgERA'] = pitchlogs['AvgER'] * 9 / pitchlogs['AvgIP']

In [14]:
pitchlogs['AvgERA'].describe()

count    48536.000000
mean         4.126522
std          1.188707
min          0.000000
25%          3.401163
50%          4.035835
75%          4.700893
max         34.500000
Name: AvgERA, dtype: float64

In [15]:
CMW = pitchlogs[pitchlogs['Pitcher'] == 'Chien-Ming Wang']
CMW[['Game Date','ER','IP','AvgERA']].head()

Unnamed: 0,Game Date,ER,IP,AvgERA
59,2009-04-08,7,3.666667,
188,2009-04-13,8,1.0,9.384146
333,2009-04-18,8,1.333333,28.928571
1592,2009-06-04,5,4.666667,34.5
1760,2009-06-10,3,2.666667,23.625


In [16]:
# FIP without its additive constant: (13*HR + 3*(BB + HBP) - 2*K) / IP, built from the rolling averages.
# The numerator can be negative, so a zero AvgIP can produce -inf as well as +inf.
pitchlogs['FIPnumerator'] = 13*pitchlogs['AvgHR'] + 3*(pitchlogs['AvgBB']+pitchlogs['AvgHBP']) - 2*pitchlogs['AvgK']
pitchlogs['AvgFIPnoConst'] = pitchlogs['FIPnumerator']/pitchlogs['AvgIP']

In [17]:
pitchlogs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48596 entries, 0 to 48595
Data columns (total 36 columns):
Game Date        48596 non-null datetime64[ns]
Pitcher          48596 non-null object
Opponent         48596 non-null object
CG               48596 non-null int64
GF               48596 non-null int64
W                48596 non-null int64
L                48596 non-null int64
Sho              48596 non-null int64
IP               48596 non-null float64
H                48596 non-null int64
R                48596 non-null int64
ER               48596 non-null int64
HR               48596 non-null int64
BB               48596 non-null int64
K                48596 non-null int64
HBP              48596 non-null int64
BK               48596 non-null int64
WP               48596 non-null int64
PitchAbbrev      48596 non-null object
Opp              48596 non-null object
pctW             48536 non-null float64
pctL             48536 non-null float64
pctSho           48536 non-null floa

In [18]:
# Division by a zero AvgIP yields +inf or, when the numerator is negative
# (possible for the FIP numerator), -inf. Treat both as missing; the previous
# call only caught +inf.
pitchlogs.replace([np.inf, -np.inf], np.nan, inplace=True)

In [19]:
pitchlogs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48596 entries, 0 to 48595
Data columns (total 36 columns):
Game Date        48596 non-null datetime64[ns]
Pitcher          48596 non-null object
Opponent         48596 non-null object
CG               48596 non-null int64
GF               48596 non-null int64
W                48596 non-null int64
L                48596 non-null int64
Sho              48596 non-null int64
IP               48596 non-null float64
H                48596 non-null int64
R                48596 non-null int64
ER               48596 non-null int64
HR               48596 non-null int64
BB               48596 non-null int64
K                48596 non-null int64
HBP              48596 non-null int64
BK               48596 non-null int64
WP               48596 non-null int64
PitchAbbrev      48596 non-null object
Opp              48596 non-null object
pctW             48536 non-null float64
pctL             48536 non-null float64
pctSho           48536 non-null floa

In [20]:
nulls = pitchlogs[pitchlogs.AvgIP.isnull()]
nulls

Unnamed: 0,Game Date,Pitcher,Opponent,CG,GF,W,L,Sho,IP,H,...,AvgER,AvgHR,AvgBB,AvgK,AvgHBP,AvgBK,AvgWP,AvgERA,FIPnumerator,AvgFIPnoConst
0,2009-04-05,Brett Myers,Vs. ATL,0,0,0,1,0,6.0,8,...,,,,,,,,,,
1,2009-04-05,Derek Lowe,At PHI,0,0,1,0,0,8.0,2,...,,,,,,,,,,
2,2009-04-06,Joe Saunders,Vs. OAK,0,0,1,0,0,6.666667,3,...,,,,,,,,,,
3,2009-04-06,Paul Maholm,At SLN,0,0,0,0,0,6.666667,7,...,,,,,,,,,,
4,2009-04-06,Johan Santana,At CIN,0,0,1,0,0,5.666667,3,...,,,,,,,,,,
5,2009-04-06,John Lannan,At FLA,0,0,0,1,0,3.0,6,...,,,,,,,,,,
6,2009-04-06,Hiroki Kuroda,At SDN,0,0,1,0,0,5.666667,4,...,,,,,,,,,,
7,2009-04-06,Roy Oswalt,Vs. CHN,0,0,0,1,0,7.0,7,...,,,,,,,,,,
8,2009-04-06,Ricky Nolasco,Vs. WSH,0,0,1,0,0,6.0,7,...,,,,,,,,,,
9,2009-04-06,Jeremy Guthrie,Vs. NYA,0,0,1,0,0,6.0,7,...,,,,,,,,,,


In [21]:
# Drop every column whose name ends in '_V' or '_H', and also the player-ID and
# position columns (names ending in 'erID' or 'Position') because we will not be using them.
unused_suffixes = ('_V', '_H', 'Position', 'erID')
cols_to_drop = [name for name in gamelogs.columns if name.endswith(unused_suffixes)]
gamelogs = gamelogs.drop(cols_to_drop, axis=1)

In [22]:
gamelogs.columns
gamelogs.shape

(24298, 123)

In [23]:
# Attach starting-pitcher rows to each game with two left joins on (date, pitcher abbreviation):
# first the home starter, then the visiting starter. In the second join the pitchlogs columns
# collide with those added by the first, so they are suffixed '_HSP' (home) and '_VSP' (visitor).
# how='left' keeps every game even when no pitcher row matches.
# NOTE(review): the join key has no DoubleHeader component; a pitcher starting both games of a
# doubleheader would duplicate rows - the unchanged row count suggests this does not occur, confirm.
gamelogs_hsp = pd.merge(gamelogs, pitchlogs, how='left', left_on = ['Date', 'HomeSPAbbrev'], right_on=['Game Date', 'PitchAbbrev']) 
gamelogs = pd.merge(gamelogs_hsp, pitchlogs, how='left', left_on = ['Date', 'VisitSPAbbrev'], right_on=['Game Date', 'PitchAbbrev'], suffixes = ['_HSP', '_VSP'])

In [24]:
gamelogs.shape

(24298, 195)

In [25]:
# Convert game length from outs to half-innings (three outs each); fractional when the
# game ended mid-inning.
gamelogs['NumHalfInnings'] = gamelogs['LengthInOuts']/3

In [26]:
def HomeTeamOffenseInnings(lengthinouts, numhalfinnings):
    """Number of innings (possibly fractional) in which the home team batted.

    Parameters
    ----------
    lengthinouts : number
        Total outs recorded in the game.
    numhalfinnings : number
        ``lengthinouts / 3``.

    Returns
    -------
    number
        ``numhalfinnings // 2`` when the out count is a multiple of three;
        otherwise ``numhalfinnings`` minus the half-innings batted by the
        other side (``ceil(numhalfinnings / 2)``), which leaves the home
        team's partial final inning as a fraction.
    """
    if lengthinouts % 3 != 0:
        # Game ended mid-inning: the remainder after the visitors' complete innings is the home share.
        return numhalfinnings - math.ceil(numhalfinnings/2)
    return numhalfinnings // 2

# Compute the home team's offensive innings for every game in a single column assignment.
# (Setting one cell at a time with iterrows() + .loc is far slower and relies on a unique index.)
gamelogs['HomeOffenseInnings'] = [
    HomeTeamOffenseInnings(outs, half_innings)
    for outs, half_innings in zip(gamelogs['LengthInOuts'], gamelogs['NumHalfInnings'])
]

# The visitors bat in whatever half-innings remain; each side's defensive innings
# are the other side's offensive innings.
gamelogs['VisitorOffenseInnings'] = gamelogs['NumHalfInnings']-gamelogs['HomeOffenseInnings']
gamelogs['HomeDefInnings'] = gamelogs['VisitorOffenseInnings']
gamelogs['VisitorDefInnings'] = gamelogs['HomeOffenseInnings']

gamelogs[['NumHalfInnings','HomeOffenseInnings','VisitorOffenseInnings']].head(20)

Unnamed: 0,NumHalfInnings,HomeOffenseInnings,VisitorOffenseInnings
0,18.0,9.0,9.0
1,17.0,8.0,9.0
2,18.0,9.0,9.0
3,17.0,8.0,9.0
4,18.0,9.0,9.0
5,18.0,9.0,9.0
6,18.0,9.0,9.0
7,17.0,8.0,9.0
8,17.0,8.0,9.0
9,18.0,9.0,9.0


In [27]:
#create new columns with Relief Pitching Info
# Relief innings = innings the team spent pitching (the opponent's offensive innings)
# minus the innings thrown by its starter.
gamelogs['HomeReliefIP'] = gamelogs['VisitorOffenseInnings'] - gamelogs['IP_HSP']
gamelogs['VisitorReliefIP'] = gamelogs['HomeOffenseInnings'] - gamelogs['IP_VSP']

# WP and ER are pitching-side totals (they sit with Pitchers/ER/TER/Balks in the game log),
# so a team's relief share is its OWN total minus its starter's line.
# The previous code subtracted the starter's wild pitches from the opponent's WP total.
for stat in ['WP', 'ER']:
    gamelogs['HomeRelief' + stat] = gamelogs['Home' + stat] - gamelogs[stat + '_HSP']
    gamelogs['VisitorRelief' + stat] = gamelogs['Visitor' + stat] - gamelogs[stat + '_VSP']

# K, BB, HBP, HR and H are batting-side totals, so what a team's pitchers allowed is the
# OPPONENT's total; subtracting the starter's line leaves the relief share.
for stat in ['K', 'BB', 'HBP', 'HR', 'H']:
    gamelogs['HomeRelief' + stat] = gamelogs['Visitor' + stat] - gamelogs[stat + '_HSP']
    gamelogs['VisitorRelief' + stat] = gamelogs['Home' + stat] - gamelogs[stat + '_VSP']

# Strikeouts recorded by each pitching staff = strikeouts of the opposing batters.
gamelogs['HomePitchK'] = gamelogs['VisitorK']
gamelogs['VisitorPitchK'] = gamelogs['HomeK']

# Earned runs scored by each offense = earned runs charged to the opposing pitchers.
gamelogs['HomeEROff'] = gamelogs['VisitorER']
gamelogs['VisitorEROff'] = gamelogs['HomeER']

#calculate times reaching base and plate appearances
gamelogs['HomeRB'] = gamelogs['HomeH'] + gamelogs['HomeBB'] + gamelogs['HomeHBP']
gamelogs['HomePA'] = gamelogs['HomeAB']+gamelogs['HomeBB']+gamelogs['HomeHBP']+gamelogs['HomeSF']

gamelogs['VisitorRB'] = gamelogs['VisitorH'] + gamelogs['VisitorBB'] + gamelogs['VisitorHBP']
gamelogs['VisitorPA'] = gamelogs['VisitorAB']+gamelogs['VisitorBB']+gamelogs['VisitorHBP']+gamelogs['VisitorSF']

# Extra bases beyond a single: doubles + 2*triples + 3*home runs (numerator of isolated power).
gamelogs['HomePower'] = gamelogs['HomeD'] + 2 * gamelogs['HomeT'] + 3 * gamelogs['HomeHR']
gamelogs['VisitorPower'] = gamelogs['VisitorD'] + 2 * gamelogs['VisitorT'] + 3 * gamelogs['VisitorHR']

# Balls in play: AB - K - HR + SF + SH (denominator of BABIP).
gamelogs['HomeBIP'] = gamelogs['HomeAB']-gamelogs['HomeK']-gamelogs['HomeHR']+gamelogs['HomeSF']+gamelogs['HomeSH']
gamelogs['VisitorBIP'] = gamelogs['VisitorAB']-gamelogs['VisitorK']-gamelogs['VisitorHR']+gamelogs['VisitorSF']+gamelogs['VisitorSH']

# Balls in play faced by each pitching staff = balls in play by the opposing batters.
gamelogs['HomePitchBIP'] = gamelogs['VisitorBIP']
gamelogs['VisitorPitchBIP'] = gamelogs['HomeBIP']

In [28]:
# Non-home-run hits allowed by each defense: the opposing batters' hits minus their home runs.
for fielding_side, batting_side in (('Home', 'Visitor'), ('Visitor', 'Home')):
    gamelogs[fielding_side + 'DefHminusHR'] = gamelogs[batting_side + 'H'] - gamelogs[batting_side + 'HR']

# Fix the misspelled source column name, then derive each side's run differential.
gamelogs.rename(columns={'HomeRunsScore': 'HomeRunsScored'}, inplace=True)
home_margin = gamelogs['HomeRunsScored'] - gamelogs['VisitorRunsScored']
gamelogs['HomeRunDiff'] = home_margin
gamelogs['VisitorRunDiff'] = -1*gamelogs['HomeRunDiff']

In [29]:
# Remove the existing OBP columns; on-base percentage is recomputed later from the
# rolling averages (AvgOBP_H / AvgOBP_V).
gamelogs = gamelogs.drop(['HomeOBP','VisitOBP'], axis=1)

In [30]:
# Assemble the list of home-team stat columns that will each get a rolling team average.
# The contiguous run of raw game-log columns from 'HomeAB' through 'HomeTP' is taken by position.
home_idx1 = gamelogs.columns.get_loc('HomeAB')
home_idx2 = gamelogs.columns.get_loc('HomeTP')
print(home_idx1)
print(home_idx2)
cols1 = gamelogs[['HomeRunsScored', 'HomeEROff']]
# +1 because iloc's stop position is exclusive and 'HomeTP' must be included
cols2 = gamelogs.iloc[:, home_idx1 : home_idx2+1]
# every derived relief-pitching column for the home side
HomeRelCols = [col for col in gamelogs.columns if col.startswith('HomeRelief')]
cols3 = gamelogs[HomeRelCols]
cols4 = gamelogs[['HomeOffenseInnings','HomeDefInnings','HomeRB','HomePA', 'HomePower', 'HomeBIP', 'HomePitchBIP','HomeDefHminusHR','HomeRunDiff']]
# Only the column names of this frame are used afterwards; each name starts with 'Home'.
cols = pd.concat([cols1,cols2,cols3, cols4], axis=1)
cols.columns

49
76


Index(['HomeRunsScored', 'HomeEROff', 'HomeAB', 'HomeH', 'HomeD', 'HomeT',
       'HomeHR', 'HomeRBI', 'HomeSH', 'HomeSF', 'HomeHBP', 'HomeBB', 'HomeIBB',
       'HomeK', 'HomeSB', 'HomeCS', 'HomeGDP', 'HomeCI', 'HomeLOB',
       'HomePitchers', 'HomeER', 'HomeTER', 'HomeWP', 'HomeBalks', 'HomePO',
       'HomeA', 'HomeE', 'HomePassed', 'HomeDB', 'HomeTP', 'HomeReliefIP',
       'HomeReliefWP', 'HomeReliefER', 'HomeReliefK', 'HomeReliefBB',
       'HomeReliefHBP', 'HomeReliefHR', 'HomeReliefH', 'HomeOffenseInnings',
       'HomeDefInnings', 'HomeRB', 'HomePA', 'HomePower', 'HomeBIP',
       'HomePitchBIP', 'HomeDefHminusHR', 'HomeRunDiff'],
      dtype='object')

In [31]:
#calculate team averages
# For every home stat column, build the matching visitor column name, stack the games into
# one row per (game, team), compute each team's trailing average, and merge it back twice
# (as *_H for the home team and *_V for the visiting team).
for home_col in cols.columns:
    stat = home_col[4:]                      # strip the leading 'Home'
    visit_col = 'Visitor' + stat
    new_col = 'Avg' + stat

    melted = melt_gamelogs([home_col, visit_col])
    melted['stat'] = relavent_team_stat(melted, home_col, visit_col)

    # Order each team's games chronologically. 'DoubleHeader' is part of the key so that two
    # games on the same date have a deterministic order instead of depending on the sort
    # algorithm (assumes its values increase with game order within a day - TODO confirm).
    melted = melted.sort_values(['Team', 'Date', 'DoubleHeader'])

    # transform() returns a result indexed like `melted`; groupby(...).apply(...) prepends
    # the group key on pandas >= 2.0 and the assignment would then fail.
    melted[new_col] = melted.groupby('Team')['stat'].transform(rolling_avg)

    melted = melted[['Date', 'DoubleHeader', 'Team', new_col]]
    gamelogs = merge_twice(gamelogs, melted, 'HomeTeam', 'VisitingTeam', 'Team')

In [32]:
gamelogs.tail()

Unnamed: 0,Date,DoubleHeader,DayOfWeek,VisitingTeam,VisitingTeamLeague,VisitingTeamGameNumber,HomeTeam,HomeTeamLeague,HomeTeamGameNumber,VisitorRunsScored,...,Team_V,AvgPitchBIP_V,Team_H,AvgDefHminusHR_H,Team_V.1,AvgDefHminusHR_V,Team_H.1,AvgRunDiff_H,Team_V.2,AvgRunDiff_V
24293,2018-09-30,0,Sun,CHA,AL,162,MIN,AL,162,4,...,CHA,25.567901,MIN,7.561728,CHA,7.45679,MIN,-0.209877,CHA,-1.191358
24294,2018-09-30,0,Sun,TEX,AL,162,SEA,AL,162,1,...,TEX,26.697531,SEA,7.438272,TEX,8.0,SEA,-0.246914,TEX,-0.691358
24295,2018-09-30,0,Sun,TOR,AL,162,TBA,AL,162,4,...,TOR,25.5,TBA,6.580247,TOR,7.790123,TBA,0.438272,TOR,-0.722222
24296,2018-10-01,0,Mon,MIL,NL,163,CHN,NL,163,3,...,MIL,23.938272,CHN,7.098765,MIL,6.691358,CHN,0.728395,MIL,0.574074
24297,2018-10-01,0,Mon,COL,NL,163,LAN,NL,163,2,...,COL,24.438272,LAN,6.777778,COL,7.320988,LAN,1.179012,COL,0.234568


In [33]:
#calculate more stats
# Every ratio below divides one rolling average by another; '_H' columns describe the home
# team and '_V' columns the visiting team. A zero denominator yields inf/NaN.

#OBP - times reaching base (H + BB + HBP) per plate appearance (AB + BB + HBP + SF)
gamelogs['AvgOBP_H'] = gamelogs['AvgRB_H']/gamelogs['AvgPA_H']
gamelogs['AvgOBP_V'] = gamelogs['AvgRB_V']/gamelogs['AvgPA_V']

#isolated power - extra bases (D + 2*T + 3*HR) per at-bat
gamelogs['AvgISO_H'] = gamelogs['AvgPower_H']/gamelogs['AvgAB_H']
gamelogs['AvgISO_V'] = gamelogs['AvgPower_V']/gamelogs['AvgAB_V']

#Offense earned runs scored per 9 innings
gamelogs['AvgOffERunsPer9Inn_H'] = gamelogs['AvgEROff_H'] * 9 / gamelogs['AvgOffenseInnings_H']
gamelogs['AvgOffERunsPer9Inn_V'] = gamelogs['AvgEROff_V'] * 9 / gamelogs['AvgOffenseInnings_V']

#Relief ERA - relief earned runs per 9 relief innings
gamelogs['AvgReliefERA_H'] = gamelogs['AvgReliefER_H'] * 9 / gamelogs['AvgReliefIP_H']
gamelogs['AvgReliefERA_V'] = gamelogs['AvgReliefER_V'] * 9 / gamelogs['AvgReliefIP_V']

#Relief FIP no Constant - (13*HR + 3*(BB + HBP) - 2*K) / IP using the relief averages
gamelogs['FIPnumerator_H'] = 13*gamelogs['AvgReliefHR_H'] + 3*(gamelogs['AvgReliefBB_H']+gamelogs['AvgReliefHBP_H']) - 2*gamelogs['AvgReliefK_H']
gamelogs['AvgRelFIPnoConst_H'] = gamelogs['FIPnumerator_H']/gamelogs['AvgReliefIP_H']
gamelogs['FIPnumerator_V'] = 13*gamelogs['AvgReliefHR_V'] + 3*(gamelogs['AvgReliefBB_V']+gamelogs['AvgReliefHBP_V']) - 2*gamelogs['AvgReliefK_V']
gamelogs['AvgRelFIPnoConst_V'] = gamelogs['FIPnumerator_V']/gamelogs['AvgReliefIP_V']

#BABIP - (H - HR)/(AB - K - HR + SF + SH)
gamelogs['AvgBABIP_H'] = (gamelogs['AvgH_H']-gamelogs['AvgHR_H'])/gamelogs['AvgBIP_H']
gamelogs['AvgBABIP_V'] = (gamelogs['AvgH_V']-gamelogs['AvgHR_V'])/gamelogs['AvgBIP_V']

#Pitcher BABIP - non-home-run hits allowed per ball in play faced
gamelogs['AvgPitchBABIP_H'] = gamelogs['AvgDefHminusHR_H']/gamelogs['AvgPitchBIP_H']
gamelogs['AvgPitchBABIP_V'] = gamelogs['AvgDefHminusHR_V']/gamelogs['AvgPitchBIP_V']

#Avg assists per 9 innings
gamelogs['AvgAper9_H'] = gamelogs['AvgA_H']*9/gamelogs['AvgDefInnings_H']
gamelogs['AvgAper9_V'] = gamelogs['AvgA_V']*9/gamelogs['AvgDefInnings_V']

#Avg errors per 9 innings
gamelogs['AvgEper9_H'] = gamelogs['AvgE_H']*9/gamelogs['AvgDefInnings_H']
gamelogs['AvgEper9_V'] = gamelogs['AvgE_V']*9/gamelogs['AvgDefInnings_V']

#Avg strikeouts by offense per 9 innings
gamelogs['AvgKper9_H'] = gamelogs['AvgK_H']*9/gamelogs['AvgOffenseInnings_H']
gamelogs['AvgKper9_V'] = gamelogs['AvgK_V']*9/gamelogs['AvgOffenseInnings_V']

#pct games won by starting pitcher - percent games lost
gamelogs['pctWminL_HSP'] = gamelogs['pctW_HSP']-gamelogs['pctL_HSP']
gamelogs['pctWminL_VSP'] = gamelogs['pctW_VSP']-gamelogs['pctL_VSP']

In [34]:
gamelogs.Attendance.head()

0    44532.0
1    48799.0
2    42177.0
3    34323.0
4    43827.0
Name: Attendance, dtype: float64

In [35]:
# Trailing average attendance: prefer the average for the ballpark and fall back to the
# home team's average when the park-based value is missing.
bypark = gamelogs.groupby('ParkID')
byHome = gamelogs.groupby('HomeTeam')

# transform() keeps the result aligned with gamelogs' index; groupby(...).apply(...) prepends
# the group key on pandas >= 2.0 and the column assignment would then fail.
park_attendance = bypark['Attendance'].transform(rolling_avg_hva)
home_attendance = byHome['Attendance'].transform(rolling_avg_hva)

gamelogs['AvgAttendance'] = park_attendance.fillna(home_attendance)

In [36]:
gamelogs[['Date','HomeTeam','AvgAttendance']].tail()

Unnamed: 0,Date,HomeTeam,AvgAttendance
24293,2018-09-30,MIN,24848.555093
24294,2018-09-30,SEA,28420.222222
24295,2018-09-30,TBA,14292.320988
24296,2018-10-01,CHN,38798.012346
24297,2018-10-01,LAN,47033.135802


In [37]:
gamelogs.columns

Index(['Date', 'DoubleHeader', 'DayOfWeek', 'VisitingTeam',
       'VisitingTeamLeague', 'VisitingTeamGameNumber', 'HomeTeam',
       'HomeTeamLeague', 'HomeTeamGameNumber', 'VisitorRunsScored',
       ...
       'AvgPitchBABIP_V', 'AvgAper9_H', 'AvgAper9_V', 'AvgEper9_H',
       'AvgEper9_V', 'AvgKper9_H', 'AvgKper9_V', 'pctWminL_HSP',
       'pctWminL_VSP', 'AvgAttendance'],
      dtype='object', length=445)

In [38]:
# Each merge_twice call left behind a 'Team_H' and a 'Team_V' helper column; dropping by
# label removes every column carrying those (repeated) names at once.
gamelogs = gamelogs.drop(['Team_V', 'Team_H'], axis=1)
gamelogs.shape

(24298, 351)

In [39]:
# VisitorWin is the complement of HomeWin: |1 - HomeWin|, i.e. 1 when HomeWin is 0 and 0 when it is 1.
gamelogs['VisitorWin'] = np.abs(1-gamelogs['HomeWin'])

gamelogs[['HomeWin','VisitorWin']].head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,HomeWin,VisitorWin
0,0,1
1,1,0
2,0,1
3,1,0
4,0,1
5,0,1
6,0,1
7,1,0
8,1,0
9,0,1


In [40]:
# Home-only and road-only trailing averages: each home team's games at home and each
# visiting team's games on the road are averaged separately.
byHteam = gamelogs.groupby(['HomeTeam'])
byAteam = gamelogs.groupby(['VisitingTeam'])

# transform() keeps each result aligned with gamelogs' index; groupby(...).apply(...) prepends
# the group key on pandas >= 2.0 and the column assignment would then fail.
gamelogs['AvgRunDiffAtHome_H'] = byHteam['HomeRunDiff'].transform(rolling_avg_hva)
gamelogs['AvgRunDiffOnRoad_V'] = byAteam['VisitorRunDiff'].transform(rolling_avg_hva)
gamelogs['pctWinAtHome_H'] = byHteam['HomeWin'].transform(rolling_avg_hva)
gamelogs['pctWinOnRoad_V'] = byAteam['VisitorWin'].transform(rolling_avg_hva)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the ca

In [41]:
gamelogs[['AvgRunDiffAtHome_H','AvgRunDiffOnRoad_V','pctWinAtHome_H','pctWinOnRoad_V']].describe()

Unnamed: 0,AvgRunDiffAtHome_H,AvgRunDiffOnRoad_V,pctWinAtHome_H,pctWinOnRoad_V
count,24238.0,24238.0,24238.0,24238.0
mean,0.152084,-0.153548,0.537773,0.462221
std,0.754668,0.79297,0.083786,0.085034
min,-8.333333,-7.0,0.0,0.0
25%,-0.382716,-0.679012,0.481481,0.407407
50%,0.135802,-0.098765,0.54321,0.469136
75%,0.666667,0.382716,0.597403,0.518519
max,7.0,5.0,1.0,1.0


In [42]:
gamelogs[['AvgRunDiffAtHome_H','AvgRunDiffOnRoad_V','pctWinAtHome_H','pctWinOnRoad_V']].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24298 entries, 0 to 24297
Data columns (total 4 columns):
AvgRunDiffAtHome_H    24238 non-null float64
AvgRunDiffOnRoad_V    24238 non-null float64
pctWinAtHome_H        24238 non-null float64
pctWinOnRoad_V        24238 non-null float64
dtypes: float64(4)
memory usage: 949.1 KB


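Because the rolling average needs at least 2 prior games, each team's first 2 home games and first 2 away games of the 2009 season should be null. With 30 teams that is 30 × 2 = 60 nulls in the home columns and 60 nulls in the away columns, which matches the 24,238 non-null values out of 24,298 rows shown above.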

In [43]:
# Games where either the home-split or the road-split rolling value is missing.
nulls = gamelogs[(gamelogs['AvgRunDiffAtHome_H'].isnull())|(gamelogs['AvgRunDiffOnRoad_V'].isnull())]
# Stack both team columns to count how often each team appears in those games.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
teams = pd.concat([nulls['VisitingTeam'], nulls['HomeTeam']])
teams.value_counts()

NYA    8
SFN    8
SEA    6
ATL    6
ANA    6
CLE    6
WAS    6
TEX    6
DET    6
CHA    6
SDN    6
CHN    6
MIL    6
HOU    6
COL    6
ARI    6
PHI    6
SLN    6
KCA    4
FLO    4
TBA    4
MIN    4
BAL    4
OAK    4
BOS    4
NYN    4
LAN    4
TOR    4
PIT    4
CIN    4
dtype: int64

In [44]:
nulls.tail()

Unnamed: 0,Date,DoubleHeader,DayOfWeek,VisitingTeam,VisitingTeamLeague,VisitingTeamGameNumber,HomeTeam,HomeTeamLeague,HomeTeamGameNumber,VisitorRunsScored,...,AvgKper9_H,AvgKper9_V,pctWminL_HSP,pctWminL_VSP,AvgAttendance,VisitorWin,AvgRunDiffAtHome_H,AvgRunDiffOnRoad_V,pctWinAtHome_H,pctWinOnRoad_V
134,2009-04-16,0,Thu,PHI,NL,8,WAS,NL,8,2,...,10.125,5.080645,0.142857,0.428571,,0,,0.0,,0.75
136,2009-04-16,0,Thu,CLE,AL,10,NYA,AL,10,10,...,7.0,9.5625,0.0,-1.0,,1,,-3.666667,,0.166667
145,2009-04-17,0,Fri,ARI,NL,10,SFN,NL,10,0,...,8.43038,6.439331,-0.222222,-1.0,37852.666667,0,2.666667,,0.666667,
149,2009-04-17,0,Fri,CLE,AL,11,NYA,AL,11,5,...,6.9,9.303371,0.25,0.55,,0,,-2.0,,0.285714
160,2009-04-18,0,Sat,ARI,NL,11,SFN,NL,11,2,...,8.586207,6.496241,-0.5,-1.0,37114.0,1,2.5,,0.75,


In [45]:
#check that it worked
RS = gamelogs[(gamelogs['HomeTeam']=='BOS') | (gamelogs['VisitingTeam']=='BOS')].sort_values('Date')
RS_H = RS[['Date', 'HomeTeam', 'VisitingTeam','HomeH','AvgH_H','VisitorH', 'AvgH_V']]
RS_H.iloc[155:170,:]

Unnamed: 0,Date,HomeTeam,VisitingTeam,HomeH,AvgH_H,VisitorH,AvgH_V
2338,2009-09-28,BOS,TOR,7,9.219355,14,9.282051
2351,2009-09-29,BOS,TOR,14,9.205128,11,9.312102
2367,2009-09-30,BOS,TOR,3,9.235669,17,9.322785
2380,2009-10-01,BOS,CLE,12,9.196203,3,9.107595
2393,2009-10-02,BOS,CLE,8,9.213836,8,9.069182
2408,2009-10-03,BOS,CLE,11,9.20625,10,9.0625
2423,2009-10-04,BOS,CLE,11,9.217391,8,9.068323
2430,2010-04-04,BOS,NYA,12,9.228395,12,9.901235
2445,2010-04-06,BOS,NYA,9,9.246914,9,9.907407
2452,2010-04-07,BOS,NYA,7,9.253086,6,9.907407


In [46]:
# Ratios with a zero denominator come out as +inf, or -inf when the numerator is negative
# (possible for the FIP numerators and run differentials). Treat both as missing; the
# previous call only caught +inf.
gamelogs = gamelogs.replace([np.inf, -np.inf], np.nan)

In [47]:
rel_cols = [col for col in gamelogs.columns if col.startswith(('pct','Avg'))]
gamelogs[rel_cols].shape

(24298, 151)

In [48]:
gamelogs[rel_cols].iloc[:,:60].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24298 entries, 0 to 24297
Data columns (total 60 columns):
pctW_HSP             24268 non-null float64
pctL_HSP             24268 non-null float64
pctSho_HSP           24268 non-null float64
AvgIP_HSP            24268 non-null float64
AvgH_HSP             24268 non-null float64
AvgR_HSP             24268 non-null float64
AvgER_HSP            24268 non-null float64
AvgHR_HSP            24268 non-null float64
AvgBB_HSP            24268 non-null float64
AvgK_HSP             24268 non-null float64
AvgHBP_HSP           24268 non-null float64
AvgBK_HSP            24268 non-null float64
AvgWP_HSP            24268 non-null float64
AvgERA_HSP           24268 non-null float64
AvgFIPnoConst_HSP    24268 non-null float64
pctW_VSP             24268 non-null float64
pctL_VSP             24268 non-null float64
pctSho_VSP           24268 non-null float64
AvgIP_VSP            24268 non-null float64
AvgH_VSP             24268 non-null float64
AvgR_VSP   

In [49]:
gamelogs[rel_cols].iloc[:,60:].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24298 entries, 0 to 24297
Data columns (total 91 columns):
AvgCS_H                 24268 non-null float64
AvgCS_V                 24268 non-null float64
AvgGDP_H                24268 non-null float64
AvgGDP_V                24268 non-null float64
AvgCI_H                 24268 non-null float64
AvgCI_V                 24268 non-null float64
AvgLOB_H                24268 non-null float64
AvgLOB_V                24268 non-null float64
AvgPitchers_H           24268 non-null float64
AvgPitchers_V           24268 non-null float64
AvgER_H                 24268 non-null float64
AvgER_V                 24268 non-null float64
AvgTER_H                24268 non-null float64
AvgTER_V                24268 non-null float64
AvgWP_H                 24268 non-null float64
AvgWP_V                 24268 non-null float64
AvgBalks_H              24268 non-null float64
AvgBalks_V              24268 non-null float64
AvgPO_H                 24268 non-null float6

In [50]:
gamelogs.head()

Unnamed: 0,Date,DoubleHeader,DayOfWeek,VisitingTeam,VisitingTeamLeague,VisitingTeamGameNumber,HomeTeam,HomeTeamLeague,HomeTeamGameNumber,VisitorRunsScored,...,AvgKper9_H,AvgKper9_V,pctWminL_HSP,pctWminL_VSP,AvgAttendance,VisitorWin,AvgRunDiffAtHome_H,AvgRunDiffOnRoad_V,pctWinAtHome_H,pctWinOnRoad_V
0,2009-04-05,0,Sun,ATL,NL,1,PHI,NL,1,4,...,,,,,,1,,,,
1,2009-04-06,0,Mon,COL,NL,1,ARI,NL,1,8,...,,,,,,0,,,,
2,2009-04-06,0,Mon,NYN,NL,1,CIN,NL,1,2,...,,,,,,1,,,,
3,2009-04-06,0,Mon,WAS,NL,1,FLO,NL,1,6,...,,,,,,0,,,,
4,2009-04-06,0,Mon,CHN,NL,1,HOU,NL,1,4,...,,,,,,1,,,,


In [51]:
#rename it to make clear that these are avgs since 2009
# Note: this binds a second name to the same DataFrame object - no copy is made, so any
# later in-place change to one name is visible through the other.
gamelogsRoll = gamelogs

In [52]:
gamelogsRoll.head()


Unnamed: 0,Date,DoubleHeader,DayOfWeek,VisitingTeam,VisitingTeamLeague,VisitingTeamGameNumber,HomeTeam,HomeTeamLeague,HomeTeamGameNumber,VisitorRunsScored,...,AvgKper9_H,AvgKper9_V,pctWminL_HSP,pctWminL_VSP,AvgAttendance,VisitorWin,AvgRunDiffAtHome_H,AvgRunDiffOnRoad_V,pctWinAtHome_H,pctWinOnRoad_V
0,2009-04-05,0,Sun,ATL,NL,1,PHI,NL,1,4,...,,,,,,1,,,,
1,2009-04-06,0,Mon,COL,NL,1,ARI,NL,1,8,...,,,,,,0,,,,
2,2009-04-06,0,Mon,NYN,NL,1,CIN,NL,1,2,...,,,,,,1,,,,
3,2009-04-06,0,Mon,WAS,NL,1,FLO,NL,1,6,...,,,,,,0,,,,
4,2009-04-06,0,Mon,CHN,NL,1,HOU,NL,1,4,...,,,,,,1,,,,


In [53]:
# Persist the engineered game log. `with` closes the file even if pickling raises.
with open('../PickledFiles/gamelogsRoll', 'wb') as outfile:
    pickle.dump(gamelogsRoll, outfile)