# Capstone 1: Data Wrangling Part 2:
Calculating average stats leading into games, to start creating features for a predictive model.

In [1]:
%matplotlib inline
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from datetime import datetime
from scipy.stats import stats
import math

In [2]:
infile = open('../PickledFiles/gamelogs2', 'rb')
gamelogs = pickle.load(infile)
infile.close

infile = open('../PickledFiles/pitchlogs', 'rb')
pitchlogs = pickle.load(infile)
infile.close

<function BufferedReader.close>

In [3]:
def melt_gamelogs(other_id_vars):
    """Function to melt gamelogs so can perform calculation for each team"""
    melted = pd.melt(gamelogs, id_vars= ['Date','HomeWin','DoubleHeader'] + other_id_vars, value_vars = ['HomeTeam','VisitingTeam'], value_name = 'Team')
    melted = melted.sort_values('Team')
    return melted

def merge_twice(df1, df2, df1_home, df1_visit, df2col):
    """Function to merge df twice-once for info about home team, then for info about away team"""
    merged = pd.merge(df1,df2, left_on = ['Date', 'DoubleHeader',df1_home], right_on = ['Date','DoubleHeader', df2col])
    merged = pd.merge(merged,df2, left_on = ['Date', 'DoubleHeader',df1_visit], right_on = ['Date', 'DoubleHeader',df2col], suffixes = ['_H','_V'])
    return merged

def relavent_team_stat(df, home_col, visit_col):
    """Function to get the stat of either the home team or the away team """
    stats = []
    for i, row in df.iterrows():
        if row['variable'] == 'HomeTeam':
            stat = row[home_col]
        else:
            stat = row[visit_col]
        stats.append(stat)
    return stats

def calculate_teamavg(game):
    """Function used to calculate average statistics for team."""
    avg = game.expanding(3).mean().shift()
    return avg

def calculate_pitchavg(game):
    """Function used to calculate rolling averages for starting pitchers."""
    avg = game.expanding(3).mean().shift()
    return avg

def fill_with_EOPS_data(grouped_df, df, col, col_to_groupby, new_col, min_games, limit):
    """Fill NA with data from end of prior season."""
    EOS_col = 'EOS' + col
    df[EOS_col] = grouped_df[col].apply(lambda x: x.expanding(min_games).mean())
    #fill first 2 games of season with prior season data
    df[EOS_col] = df.groupby(col_to_groupby)[EOS_col].fillna(method='ffill',limit=limit)
    df[new_col] = df[new_col].fillna(df[EOS_col])
    df.drop(labels=EOS_col, axis=1, inplace=True)
 
    return df[new_col]

def fill_second_game(df, col, new_col, col_to_groupby):
    """Fill info for second game of season with average of first game and end of prior season"""
    temp_col = 'Temp'+col
    df[temp_col] = (df[new_col] + df[col])/2
    df[temp_col] = df.groupby(col_to_groupby)[temp_col].shift()
    df[new_col] = df[new_col].fillna(df[temp_col],limit=1)
    df.drop(labels=temp_col, axis=1, inplace=True)
    return df[new_col]

def fill_2ndand3rd_game(grouped_df, df, col, new_col, col_to_groupby):
    """Fill info for second game of season with average of first and second game and end of prior season
    All with equal weights, so 2/3 of weight is given to first and second game of current season when
    filling data for 3rd game.  Fills 3rd game and then second game"""
    temp_col = 'Temp'+ col
    temp_col2 = 'Temp2' + col
    temp_col3 = 'Temp3' + col
    temp_col4 = 'Temp4' + col
    #df[temp_col] = grouped_df[col].apply(lambda x: x.expanding(2).mean().shift())
    df[temp_col] = grouped_df[col].shift()
    df[temp_col2] = grouped_df[new_col].shift()
    df[temp_col3] = (df[col] + df[temp_col] + df[temp_col2])/3
    df[temp_col3] = df.groupby(col_to_groupby)[temp_col3].shift()
    df[temp_col4] = (df[col] + df[new_col])/2
    df[temp_col4] = df.groupby(col_to_groupby)[temp_col4].shift()
    df[temp_col3] = df[temp_col3].fillna(df[temp_col4])    
    df[new_col] = df[new_col].fillna(df[temp_col3])
    df.drop([temp_col, temp_col2,temp_col3, temp_col4], axis=1, inplace=True)
    return df[new_col]

In [4]:
pitchlogs.head()

Unnamed: 0,Game Date,Pitcher,Opponent,GS,CG,GF,W,L,Sv,Sho,...,H,R,ER,HR,BB,K,HBP,BK,WP,PitchAbbrev
40498,2009-04-05,Brett Myers,Vs. ATL,1,0,0,0,1,0,0,...,8,4,4,3,1,6,0,0,0,Bre Myers
24300,2009-04-05,Derek Lowe,At PHI,1,0,0,1,0,0,0,...,2,0,0,0,0,4,0,0,0,Der Lowe
0,2009-04-06,Joe Saunders,Vs. OAK,1,0,0,1,0,0,0,...,3,0,0,0,2,2,1,0,0,Joe Saunders
42118,2009-04-06,Paul Maholm,At SLN,1,0,0,0,0,0,0,...,7,2,1,0,1,1,1,0,1,Pau Maholm
38878,2009-04-06,Johan Santana,At CIN,1,0,0,1,0,0,0,...,3,1,1,0,4,7,0,0,0,Joh Santana


## Calculate Averages for Starting Pitchers

In [5]:
pitchlogs.columns

Index(['Game Date', 'Pitcher', 'Opponent', 'GS', 'CG', 'GF', 'W', 'L', 'Sv',
       'Sho', 'IP', 'H', 'R', 'ER', 'HR', 'BB', 'K', 'HBP', 'BK', 'WP',
       'PitchAbbrev'],
      dtype='object')

In [6]:
cols = ['GS', 'CG', 'GF', 'W', 'L', 'Sv','Sho']
for col in cols:
    pitchlogs[col] = pd.to_numeric(pitchlogs[col])
pitchlogs[cols].dtypes

GS     int64
CG     int64
GF     int64
W      int64
L      int64
Sv     int64
Sho    int64
dtype: object

In [7]:
pitchlogs[(pitchlogs.GS != 1) | (pitchlogs.Sv != 0)]

Unnamed: 0,Game Date,Pitcher,Opponent,GS,CG,GF,W,L,Sv,Sho,...,H,R,ER,HR,BB,K,HBP,BK,WP,PitchAbbrev


In [8]:
#These columns are irrelevant because the starting pitcher always will always have 1 in the 'GS' (game started) and O
#  in the Sv ('Game Saved') column, so we can drop them
pitchlogs = pitchlogs.drop(['GS', 'Sv'], axis=1)
pitchlogs.columns

Index(['Game Date', 'Pitcher', 'Opponent', 'CG', 'GF', 'W', 'L', 'Sho', 'IP',
       'H', 'R', 'ER', 'HR', 'BB', 'K', 'HBP', 'BK', 'WP', 'PitchAbbrev'],
      dtype='object')

In [9]:
pitchlogs['Opp']=pitchlogs['Opponent'].str[-3:]
pitchlogs['Opp'].head()

40498    ATL
24300    PHI
0        OAK
42118    SLN
38878    CIN
Name: Opp, dtype: object

In [10]:
pitchlogs = pitchlogs.reset_index(drop=True)
pitchlogs.head()

Unnamed: 0,Game Date,Pitcher,Opponent,CG,GF,W,L,Sho,IP,H,R,ER,HR,BB,K,HBP,BK,WP,PitchAbbrev,Opp
0,2009-04-05,Brett Myers,Vs. ATL,0,0,0,1,0,6.0,8,4,4,3,1,6,0,0,0,Bre Myers,ATL
1,2009-04-05,Derek Lowe,At PHI,0,0,1,0,0,8.0,2,0,0,0,0,4,0,0,0,Der Lowe,PHI
2,2009-04-06,Joe Saunders,Vs. OAK,0,0,1,0,0,6.666667,3,0,0,0,2,2,1,0,0,Joe Saunders,OAK
3,2009-04-06,Paul Maholm,At SLN,0,0,0,0,0,6.666667,7,2,1,0,1,1,1,0,1,Pau Maholm,SLN
4,2009-04-06,Johan Santana,At CIN,0,0,1,0,0,5.666667,3,1,1,0,4,7,0,0,0,Joh Santana,CIN


In [11]:
#group by pitcher and season so we can calculate stats per season for individual starting pitchers
bypitcher = pitchlogs.groupby(['Pitcher', pitchlogs['Game Date'].dt.year])
florida = {'Vs. MIA: Vs. FLA', 'At MIA: At FLA'}
pitchlogs.replace(florida, inplace=True)
by_opp = pitchlogs.groupby('Opp')

In [12]:
#pitchlogs.reset_index(inplace=True)
new_cols = []
for col in ['CG', 'GF', 'W', 'L', 'Sho']:
    new_col = 'pct'+col
    new_cols.append(new_col)
for col in ['IP','H', 'R', 'ER', 'HR', 'BB', 'K', 'HBP', 'BK', 'WP']:
    new_col = 'Avg'+col
    new_cols.append(new_col)
    
for new_col in new_cols:
    col = new_col[3:]
    pitchlogs[new_col] = bypitcher[col].apply(calculate_pitchavg)
    #fill first start of season with data from end of prior season
    pitchlogs[new_col] = fill_with_EOPS_data(bypitcher, pitchlogs, col,'Pitcher',new_col, 4, 1)
    #fill second start of season with avg of end of prior season and first start
    pitchlogs[new_col] = fill_2ndand3rd_game(bypitcher, pitchlogs, col, new_col, 'Pitcher')


    #fill second start with just first start info - this will cover second starts of careers
    #temp_col = 'Temp'+col
    #pitchlogs[temp_col] = pitchlogs.groupby('Pitcher')[col].apply(lambda x: x.expanding().mean().shift())
    #pitchlogs[new_col] = pitchlogs[new_col].fillna(pitchlogs[temp_col])
    #pitchlogs.drop([temp_col], axis=1, inplace=True)
    #now first second and third career start will be left blank.  To fill it we 
    temp_col = 'Temp' +col
    temp_col2 = 'Temp2' + col
    temp_col3 = 'Temp3' + col
    temp_col4 = 'Temp4' + col
    temp_col5 = 'Temp5' + col
    pitchlogs[temp_col] = by_opp[col].apply(lambda x: x.rolling(window = 162, min_periods = 5).mean().shift())
    pitchlogs[temp_col2] = pitchlogs.groupby('Pitcher')[col].shift()
    pitchlogs[temp_col3] = (pitchlogs[temp_col]+pitchlogs[temp_col2])/2
    pitchlogs[temp_col4] = pitchlogs.groupby('Pitcher')[col].shift(2)
    pitchlogs[temp_col5] = (pitchlogs[temp_col]+pitchlogs[temp_col2] + pitchlogs[temp_col4])/3
    pitchlogs[new_col] = pitchlogs[new_col].fillna(pitchlogs[temp_col5]).fillna(pitchlogs[temp_col3])
    pitchlogs[new_col] = pitchlogs[new_col].fillna(pitchlogs[temp_col])
    cols_to_drop = [col for col in pitchlogs.columns if col.startswith('Temp')]
    pitchlogs.drop(cols_to_drop, axis = 1, inplace=True)

In [13]:
pitchlogs.head()

Unnamed: 0,Game Date,Pitcher,Opponent,CG,GF,W,L,Sho,IP,H,...,AvgIP,AvgH,AvgR,AvgER,AvgHR,AvgBB,AvgK,AvgHBP,AvgBK,AvgWP
0,2009-04-05,Brett Myers,Vs. ATL,0,0,0,1,0,6.0,8,...,,,,,,,,,,
1,2009-04-05,Derek Lowe,At PHI,0,0,1,0,0,8.0,2,...,,,,,,,,,,
2,2009-04-06,Joe Saunders,Vs. OAK,0,0,1,0,0,6.666667,3,...,,,,,,,,,,
3,2009-04-06,Paul Maholm,At SLN,0,0,0,0,0,6.666667,7,...,,,,,,,,,,
4,2009-04-06,Johan Santana,At CIN,0,0,1,0,0,5.666667,3,...,,,,,,,,,,


In [14]:
pitchlogs['AvgERA'] = pitchlogs['AvgER'] * 9 / pitchlogs['AvgIP']

In [15]:
#check that it worked
RP = pitchlogs.loc[pitchlogs.Pitcher == 'Rick Porcello']
RP[['Game Date', 'ER','IP', 'AvgER','AvgIP', 'AvgERA']].tail(36).head(10)


Unnamed: 0,Game Date,ER,IP,AvgER,AvgIP,AvgERA
43306,2017-09-16,0,7.333333,3.2,6.211111,4.636852
43473,2017-09-22,4,4.0,3.096774,6.247312,4.461274
43622,2017-09-27,5,5.666667,3.125,6.177083,4.55312
43794,2018-03-31,1,5.333333,3.181818,6.161616,4.647541
43968,2018-04-07,3,7.333333,2.090909,5.747475,3.274165
44091,2018-04-12,0,7.0,2.393939,6.276094,3.43294
44248,2018-04-18,0,6.0,1.333333,6.555556,1.830508
44403,2018-04-24,3,7.0,1.0,6.416667,1.402597
44547,2018-04-29,3,7.666667,1.4,6.533333,1.928571
44665,2018-05-04,1,6.0,1.666667,6.722222,2.231405


In [16]:
(5.3333+6.161616+7.3333)/3

6.276072

In [17]:
RP[['Game Date', 'ER','IP', 'AvgER','AvgIP', 'AvgERA']].head(10)

Unnamed: 0,Game Date,ER,IP,AvgER,AvgIP,AvgERA
96,2009-04-09,4,5.0,,,
353,2009-04-19,1,7.0,3.458333,5.652778,5.506143
479,2009-04-24,4,6.0,2.422222,6.111111,3.567273
622,2009-04-29,6,3.666667,3.0,6.0,4.5
789,2009-05-05,0,7.0,3.75,5.416667,6.230769
935,2009-05-10,1,5.0,3.0,5.733333,4.709302
1071,2009-05-16,1,6.0,2.666667,5.611111,4.277228
1224,2009-05-22,1,6.0,2.428571,5.666667,3.857143
1398,2009-05-27,2,6.0,2.25,5.708333,3.547445
1529,2009-06-02,3,4.333333,2.222222,5.740741,3.483871


In [18]:
pitchlogs['AvgERA'].describe()

count    48446.000000
mean         4.131073
std          1.463543
min          0.000000
25%          3.182510
50%          3.970588
75%          4.858131
max         34.500000
Name: AvgERA, dtype: float64

In [19]:
RP.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 307 entries, 96 to 48565
Data columns (total 36 columns):
Game Date      307 non-null datetime64[ns]
Pitcher        307 non-null object
Opponent       307 non-null object
CG             307 non-null int64
GF             307 non-null int64
W              307 non-null int64
L              307 non-null int64
Sho            307 non-null int64
IP             307 non-null float64
H              307 non-null int64
R              307 non-null int64
ER             307 non-null int64
HR             307 non-null int64
BB             307 non-null int64
K              307 non-null int64
HBP            307 non-null int64
BK             307 non-null int64
WP             307 non-null int64
PitchAbbrev    307 non-null object
Opp            307 non-null object
pctCG          306 non-null float64
pctGF          306 non-null float64
pctW           306 non-null float64
pctL           306 non-null float64
pctSho         306 non-null float64
AvgIP          30

The first 5 starts against each team should be null... 5*30teams = 150

In [20]:
num_pitchers = pitchlogs['Pitcher'].nunique()
num_pitchers

946

In [21]:
pitchlogs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48596 entries, 0 to 48595
Data columns (total 36 columns):
Game Date      48596 non-null datetime64[ns]
Pitcher        48596 non-null object
Opponent       48596 non-null object
CG             48596 non-null int64
GF             48596 non-null int64
W              48596 non-null int64
L              48596 non-null int64
Sho            48596 non-null int64
IP             48596 non-null float64
H              48596 non-null int64
R              48596 non-null int64
ER             48596 non-null int64
HR             48596 non-null int64
BB             48596 non-null int64
K              48596 non-null int64
HBP            48596 non-null int64
BK             48596 non-null int64
WP             48596 non-null int64
PitchAbbrev    48596 non-null object
Opp            48596 non-null object
pctCG          48446 non-null float64
pctGF          48446 non-null float64
pctW           48446 non-null float64
pctL           48446 non-null float64
pctS

In [22]:
48596-48446

150

In [23]:
nulls = pitchlogs[pitchlogs.AvgIP.isnull()]
nulls

Unnamed: 0,Game Date,Pitcher,Opponent,CG,GF,W,L,Sho,IP,H,...,AvgH,AvgR,AvgER,AvgHR,AvgBB,AvgK,AvgHBP,AvgBK,AvgWP,AvgERA
0,2009-04-05,Brett Myers,Vs. ATL,0,0,0,1,0,6.000000,8,...,,,,,,,,,,
1,2009-04-05,Derek Lowe,At PHI,0,0,1,0,0,8.000000,2,...,,,,,,,,,,
2,2009-04-06,Joe Saunders,Vs. OAK,0,0,1,0,0,6.666667,3,...,,,,,,,,,,
3,2009-04-06,Paul Maholm,At SLN,0,0,0,0,0,6.666667,7,...,,,,,,,,,,
4,2009-04-06,Johan Santana,At CIN,0,0,1,0,0,5.666667,3,...,,,,,,,,,,
5,2009-04-06,John Lannan,At FLA,0,0,0,1,0,3.000000,6,...,,,,,,,,,,
6,2009-04-06,Hiroki Kuroda,At SDN,0,0,1,0,0,5.666667,4,...,,,,,,,,,,
7,2009-04-06,Roy Oswalt,Vs. CHN,0,0,0,1,0,7.000000,7,...,,,,,,,,,,
8,2009-04-06,Ricky Nolasco,Vs. WSH,0,0,1,0,0,6.000000,7,...,,,,,,,,,,
9,2009-04-06,Jeremy Guthrie,Vs. NYA,0,0,1,0,0,6.000000,7,...,,,,,,,,,,


In [24]:
pitchlogs['FIPnumerator'] = 13*pitchlogs['AvgHR'] + 3*(pitchlogs['AvgBB']+pitchlogs['AvgHBP']) - 2*pitchlogs['AvgK']
pitchlogs['AvgFIPnoConst'] = pitchlogs['FIPnumerator']/pitchlogs['AvgIP']

In [25]:
pitchlogs.replace(np.inf,np.nan, inplace=True)

In [26]:
pitchlogs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48596 entries, 0 to 48595
Data columns (total 38 columns):
Game Date        48596 non-null datetime64[ns]
Pitcher          48596 non-null object
Opponent         48596 non-null object
CG               48596 non-null int64
GF               48596 non-null int64
W                48596 non-null int64
L                48596 non-null int64
Sho              48596 non-null int64
IP               48596 non-null float64
H                48596 non-null int64
R                48596 non-null int64
ER               48596 non-null int64
HR               48596 non-null int64
BB               48596 non-null int64
K                48596 non-null int64
HBP              48596 non-null int64
BK               48596 non-null int64
WP               48596 non-null int64
PitchAbbrev      48596 non-null object
Opp              48596 non-null object
pctCG            48446 non-null float64
pctGF            48446 non-null float64
pctW             48446 non-null floa

In [27]:
pitchlogs[['AvgERA','AvgFIPnoConst']].describe()

Unnamed: 0,AvgERA,AvgFIPnoConst
count,48446.0,48446.0
mean,4.131073,1.03442
std,1.463543,1.012233
min,0.0,-2.7
25%,3.18251,0.373802
50%,3.970588,0.961165
75%,4.858131,1.584396
max,34.5,8.742857


In [28]:
highERA = pitchlogs[pitchlogs['AvgERA']>10]
print(highERA.shape)
highERA[['Game Date', 'Pitcher', 'IP', 'ER','AvgIP','AvgER','AvgERA']].head(10)

(190, 38)


Unnamed: 0,Game Date,Pitcher,IP,ER,AvgIP,AvgER,AvgERA
161,2009-04-12,Scott Olsen,5.0,5,3.9,6.2,14.307692
202,2009-04-13,Brian Moehler,2.333333,5,4.111111,4.583333,10.033784
226,2009-04-14,Carl Pavano,6.0,4,3.619048,5.214286,12.967105
248,2009-04-15,Kris Benson,6.0,4,4.979167,5.75,10.393305
255,2009-04-15,Oliver Perez,6.0,1,4.729167,5.625,10.704846
280,2009-04-17,Cole Hamels,6.0,5,4.433333,5.15,10.454887
322,2009-04-18,Scott Olsen,7.0,2,4.466667,5.5,11.08209
333,2009-04-18,Chien-Ming Wang,1.333333,8,3.383838,5.787879,15.39403
348,2009-04-19,Carl Pavano,6.0,1,4.25,5.361111,11.352941
363,2009-04-19,James McDonald,4.333333,0,3.75,4.35,10.44


In [29]:
highERA['Pitcher'].nunique()

118

In [30]:
highERA[['Game Date', 'Pitcher', 'IP', 'ER','AvgIP','AvgER','AvgERA']].tail()

Unnamed: 0,Game Date,Pitcher,IP,ER,AvgIP,AvgER,AvgERA
44951,2018-05-15,Wei-Yin Chen,4.333333,0,4.111111,4.666667,10.216216
45112,2018-05-21,Jason Vargas,5.0,0,4.111111,6.333333,13.864865
45369,2018-05-30,Jason Vargas,5.0,0,4.066667,4.8,10.622951
45423,2018-06-01,Sergio Romo,1.333333,0,0.833333,1.0,10.8
47745,2018-08-30,Cody Reed,4.666667,1,3.517147,4.23251,10.830538


In [31]:
pitchlogs[pitchlogs.Pitcher == 'Jason Vargas']

Unnamed: 0,Game Date,Pitcher,Opponent,CG,GF,W,L,Sho,IP,H,...,AvgER,AvgHR,AvgBB,AvgK,AvgHBP,AvgBK,AvgWP,AvgERA,FIPnumerator,AvgFIPnoConst
961,2009-05-12,Jason Vargas,At TEX,0,0,0,0,0,5.000000,5,...,3.354839,1.193548,1.677419,5.064516,0.129032,0.000000,0.096774,5.707317,10.806452,2.042683
1112,2009-05-17,Jason Vargas,Vs. BOS,0,0,0,0,0,5.333333,7,...,2.175676,0.878378,2.256757,3.648649,0.121622,0.013514,0.054054,3.640704,11.256757,2.092965
1226,2009-05-22,Jason Vargas,Vs. SFN,0,0,0,0,0,7.000000,2,...,1.483333,0.783333,2.283333,2.883333,0.116667,0.025000,0.091667,2.411440,11.616667,2.098344
1416,2009-05-29,Jason Vargas,At LAA,0,0,1,0,0,6.333333,4,...,1.000000,1.000000,2.000000,3.666667,0.000000,0.000000,0.000000,1.557692,11.666667,2.019231
1572,2009-06-03,Jason Vargas,Vs. BAL,0,0,0,0,0,5.333333,8,...,1.250000,1.000000,2.000000,3.250000,0.000000,0.000000,0.000000,1.901408,12.500000,2.112676
1733,2009-06-09,Jason Vargas,At BAL,0,0,0,1,0,5.666667,7,...,1.400000,1.000000,1.600000,3.000000,0.000000,0.000000,0.000000,2.172414,11.800000,2.034483
1856,2009-06-14,Jason Vargas,At COL,0,0,0,1,0,4.666667,12,...,1.666667,1.000000,1.500000,3.166667,0.000000,0.000000,0.000000,2.596154,11.166667,1.932692
2013,2009-06-20,Jason Vargas,Vs. ARI,0,0,1,0,0,7.000000,3,...,2.428571,0.857143,1.571429,3.285714,0.000000,0.000000,0.000000,3.889831,9.285714,1.652542
2169,2009-06-26,Jason Vargas,At LAN,0,0,0,1,0,4.666667,9,...,2.250000,0.875000,1.375000,3.375000,0.000000,0.000000,0.000000,3.496403,8.750000,1.510791
2341,2009-07-02,Jason Vargas,At NYA,0,0,0,0,0,4.000000,4,...,2.555556,1.000000,1.333333,3.222222,0.000000,0.000000,0.000000,4.058824,10.555556,1.862745


In [32]:
pitchlogs[pitchlogs.Pitcher == 'Wei-Yin Chen']

Unnamed: 0,Game Date,Pitcher,Opponent,CG,GF,W,L,Sho,IP,H,...,AvgER,AvgHR,AvgBB,AvgK,AvgHBP,AvgBK,AvgWP,AvgERA,FIPnumerator,AvgFIPnoConst
14714,2012-04-10,Wei-Yin Chen,Vs. NYA,0,0,0,0,0,5.666667,7,...,3.432099,0.882716,2.481481,4.388889,0.327160,0.000000,0.179012,5.386437,11.123457,1.939720
14879,2012-04-17,Wei-Yin Chen,At CHA,0,0,1,0,0,5.333333,6,...,2.265432,0.839506,1.435185,5.154321,0.669753,0.012346,0.098765,3.396983,6.919753,1.152897
15035,2012-04-22,Wei-Yin Chen,At LAA,0,0,0,0,0,6.333333,5,...,2.187243,0.541152,1.553498,4.833333,0.405350,0.004115,0.397119,3.426985,3.244856,0.564896
15176,2012-04-28,Wei-Yin Chen,Vs. OAK,0,0,1,0,0,7.000000,6,...,1.666667,0.333333,2.000000,5.000000,0.333333,0.000000,0.333333,2.596154,1.333333,0.230769
15342,2012-05-04,Wei-Yin Chen,At BOS,0,0,0,0,0,5.000000,5,...,1.500000,0.500000,2.000000,4.750000,0.250000,0.000000,0.250000,2.219178,3.750000,0.616438
15516,2012-05-10,Wei-Yin Chen,Vs. TEX,0,0,1,0,0,7.666667,6,...,1.800000,0.400000,2.200000,4.600000,0.200000,0.000000,0.200000,2.761364,3.200000,0.545455
15644,2012-05-15,Wei-Yin Chen,Vs. NYA,0,0,1,0,0,7.000000,4,...,1.666667,0.333333,2.000000,4.666667,0.166667,0.000000,0.166667,2.432432,1.500000,0.243243
15804,2012-05-20,Wei-Yin Chen,At WSH,0,0,0,1,0,4.333333,8,...,1.714286,0.428571,2.000000,4.571429,0.142857,0.000000,0.142857,2.454545,2.857143,0.454545
15953,2012-05-26,Wei-Yin Chen,Vs. KCA,0,0,0,0,0,6.000000,6,...,2.250000,0.625000,2.125000,4.625000,0.125000,0.000000,0.125000,3.351724,5.625000,0.931034
16122,2012-06-01,Wei-Yin Chen,At TBA,0,0,0,1,0,5.666667,5,...,2.222222,0.666667,2.000000,4.666667,0.111111,0.000000,0.111111,3.312883,5.666667,0.938650


In [33]:
#first start fpr each pitcher from 2009-2018 season
indexes = []
unique_starters =  pitchlogs['Pitcher'].unique()
for starter in unique_starters.flatten():
    idx = list(pitchlogs['Pitcher']).index(starter)
    indexes.append(idx)

In [34]:
first_start = pitchlogs.iloc[indexes,:]
first_start[['Game Date', 'Pitcher', 'IP', 'H', 'ER', 'BB', 'K']].head(10)

Unnamed: 0,Game Date,Pitcher,IP,H,ER,BB,K
0,2009-04-05,Brett Myers,6.0,8,4,1,6
1,2009-04-05,Derek Lowe,8.0,2,0,0,4
2,2009-04-06,Joe Saunders,6.666667,3,0,2,2
3,2009-04-06,Paul Maholm,6.666667,7,1,1,1
4,2009-04-06,Johan Santana,5.666667,3,1,4,7
5,2009-04-06,John Lannan,3.0,6,6,0,1
6,2009-04-06,Hiroki Kuroda,5.666667,4,1,1,2
7,2009-04-06,Roy Oswalt,7.0,7,3,1,2
8,2009-04-06,Ricky Nolasco,6.0,7,5,0,6
9,2009-04-06,Jeremy Guthrie,6.0,7,3,3,3


Since we started collecting data in 2009, most pitchers whose first start in the df was in early April 2004,  were not actually having their first career start.  

In [35]:
apr2009 = first_start[(first_start['Game Date'].dt.year == 2009) & (first_start['Game Date'].dt.month == 4)]
apr2009.shape

(173, 38)

In [36]:
pitchlogs[(pitchlogs['Game Date'].dt.year == 2009) & (pitchlogs['Game Date'].dt.month == 4)].shape[0]/2

323.0

There were 323 games played in April and 173 pitchers made their first starts (since 2009).
For most of these people it wasn't actually their first start of their career so, it might not make sense to leave them in the calculations. 

In [37]:
earlyApr2009 = apr2009[apr2009['Game Date'].dt.day<20]
earlyApr2009.shape

(160, 38)

In [38]:
lateApr2009 = apr2009[apr2009['Game Date'].dt.day>=20]
lateApr2009

Unnamed: 0,Game Date,Pitcher,Opponent,CG,GF,W,L,Sho,IP,H,...,AvgER,AvgHR,AvgBB,AvgK,AvgHBP,AvgBK,AvgWP,AvgERA,FIPnumerator,AvgFIPnoConst
367,2009-04-20,Jordan Zimmermann,Vs. ATL,0,0,1,0,0,6.0,6,...,3.333333,0.833333,2.083333,3.0,0.166667,0.083333,0.0,5.4,11.583333,2.085
370,2009-04-20,Justin Masterson,Vs. BAL,0,0,1,0,0,5.333333,4,...,4.25,0.583333,3.0,3.083333,0.166667,0.0,0.333333,7.443243,10.916667,2.124324
402,2009-04-21,Brad Bergesen,Vs. CHA,0,0,1,0,0,5.666667,4,...,3.333333,0.916667,2.75,4.25,0.25,0.0,0.083333,5.268293,12.416667,2.180488
427,2009-04-22,Brian Bannister,At CLE,0,0,1,0,0,6.0,4,...,3.071429,0.714286,3.214286,4.857143,0.285714,0.0,0.642857,5.425234,10.071429,1.976636
447,2009-04-23,Matt Palmer,Vs. DET,0,0,1,0,0,6.0,6,...,3.642857,0.928571,2.071429,3.928571,0.285714,0.0,0.142857,5.53012,11.285714,1.903614
494,2009-04-25,Anthony Ortega,Vs. SEA,0,0,0,1,0,5.0,5,...,2.882353,0.647059,1.588235,3.529412,0.176471,0.0,0.235294,4.186709,6.647059,1.072785
498,2009-04-25,Brian Burres,At CHA,0,0,0,1,0,4.333333,7,...,3.0,0.75,2.5625,4.4375,0.25,0.0,0.0625,4.645161,9.3125,1.602151
499,2009-04-25,Mitchell Boggs,Vs. CHN,0,0,1,0,0,5.666667,6,...,2.533333,0.466667,2.6,3.733333,0.333333,0.066667,0.066667,4.222222,7.4,1.37037
500,2009-04-25,Scott Feldman,At BAL,0,0,1,0,0,5.0,4,...,4.0,0.588235,2.882353,2.882353,0.176471,0.058824,0.235294,6.876404,11.058824,2.11236
543,2009-04-26,Graham Taylor,Vs. PHI,0,0,0,1,0,3.666667,4,...,2.125,0.5625,1.875,3.25,0.1875,0.0,0.125,3.1875,7.0,1.166667


In [39]:
print('Number of First Starts per Season')
for year in range(2009,2019):
    num_fs = first_start[first_start['Game Date'].dt.year == year].shape[0]
    print(year, ': ', num_fs)

Number of First Starts per Season
2009 :  306
2010 :  66
2011 :  66
2012 :  64
2013 :  64
2014 :  59
2015 :  77
2016 :  69
2017 :  73
2018 :  102


We see there were a lot more pitchers making their first start in 2009 compared to the other seasons- again, this we collected data starting in the 2009 season, so for many of them it wasn't actually their first start.  <br>
The reason there are so many first time starting pitchers in the 2018 season is because they started using "openers."

In [40]:
first_start[['IP', 'H', 'ER', 'K', 'BB', 'HBP', 'WP']].describe()

Unnamed: 0,IP,H,ER,K,BB,HBP,WP
count,946.0,946.0,946.0,946.0,946.0,946.0,946.0
mean,4.863284,4.997886,2.606765,3.567653,2.001057,0.221987,0.179704
std,1.499554,2.258903,1.988782,2.157838,1.380514,0.472954,0.454781
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,3.0,1.0,2.0,1.0,0.0,0.0
50%,5.0,5.0,2.0,3.0,2.0,0.0,0.0
75%,6.0,7.0,4.0,5.0,3.0,0.0,0.0
max,8.333333,12.0,9.0,14.0,7.0,3.0,4.0


In [41]:
fs_noApr2009 = first_start.drop(apr2009.index,axis=0)
fs_noApr2009.shape

(773, 38)

In [42]:
fs_noApr2009[['IP', 'H', 'ER', 'K', 'BB', 'HBP', 'WP']].describe()

Unnamed: 0,IP,H,ER,K,BB,HBP,WP
count,773.0,773.0,773.0,773.0,773.0,773.0,773.0
mean,4.739974,4.865459,2.531695,3.486417,1.945666,0.225097,0.177232
std,1.530096,2.288745,1.957606,2.141711,1.394715,0.478601,0.456274
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,3.0,1.0,2.0,1.0,0.0,0.0
50%,5.0,5.0,2.0,3.0,2.0,0.0,0.0
75%,6.0,6.0,4.0,5.0,3.0,0.0,0.0
max,8.333333,12.0,9.0,14.0,7.0,3.0,4.0


In [43]:
pitchlogs[['IP', 'H', 'ER', 'K', 'BB', 'HBP', 'WP']].describe()

Unnamed: 0,IP,H,ER,K,BB,HBP,WP
count,48596.0,48596.0,48596.0,48596.0,48596.0,48596.0,48596.0
mean,5.790346,5.76282,2.687217,4.700942,1.883859,0.206807,0.189357
std,1.463569,2.248152,1.990737,2.513381,1.359948,0.46185,0.452623
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5.0,4.0,1.0,3.0,1.0,0.0,0.0
50%,6.0,6.0,2.0,4.0,2.0,0.0,0.0
75%,7.0,7.0,4.0,6.0,3.0,0.0,0.0
max,10.0,15.0,13.0,20.0,9.0,4.0,5.0


In [44]:
pitchlogs.replace(np.inf,np.nan, inplace=True)

In [45]:
pitchlogs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48596 entries, 0 to 48595
Data columns (total 38 columns):
Game Date        48596 non-null datetime64[ns]
Pitcher          48596 non-null object
Opponent         48596 non-null object
CG               48596 non-null int64
GF               48596 non-null int64
W                48596 non-null int64
L                48596 non-null int64
Sho              48596 non-null int64
IP               48596 non-null float64
H                48596 non-null int64
R                48596 non-null int64
ER               48596 non-null int64
HR               48596 non-null int64
BB               48596 non-null int64
K                48596 non-null int64
HBP              48596 non-null int64
BK               48596 non-null int64
WP               48596 non-null int64
PitchAbbrev      48596 non-null object
Opp              48596 non-null object
pctCG            48446 non-null float64
pctGF            48446 non-null float64
pctW             48446 non-null floa

In [46]:
#Delete columns from gamelogs with starting pitcher info so they don't duplicate when we merge
cols_to_drop = [col for col in gamelogs.columns if col[-2:] in ['_V','_H']]
gamelogs = gamelogs.drop(cols_to_drop, axis=1)

#also delete columns with player IDs and position because we will not be using them
cols_to_drop = [col for col in gamelogs.columns if col.endswith(('Position','erID'))]
gamelogs = gamelogs.drop(cols_to_drop, axis=1) 

In [47]:
gamelogs.columns
gamelogs.shape

(24298, 123)

In [48]:
gamelogs_hsp = pd.merge(gamelogs, pitchlogs, how='left', left_on = ['Date', 'HomeSPAbbrev'], right_on=['Game Date', 'PitchAbbrev']) 
gamelogs = pd.merge(gamelogs_hsp, pitchlogs, how='left', left_on = ['Date', 'VisitSPAbbrev'], right_on=['Game Date', 'PitchAbbrev'], suffixes = ['_HSP', '_VSP'])

In [49]:
gamelogs.shape

(24298, 199)

## Calculate Averages for Teams
~to get relief pitchers stats alone~

In [50]:
gamelogs['NumHalfInnings'] = gamelogs['LengthInOuts']/3

In [51]:
def HomeTeamOffenseInnings(lengthinouts, numhalfinnings):
    if lengthinouts % 3 == 0:
        HomeTeamOffenseInnings = numhalfinnings // 2
    else:
        HomeTeamOffenseInnings = numhalfinnings - math.ceil(numhalfinnings/2)
    return HomeTeamOffenseInnings

for idx, game in gamelogs.iterrows():
    gamelogs.loc[idx,'HomeOffenseInnings'] = HomeTeamOffenseInnings(game['LengthInOuts'], game['NumHalfInnings'])

gamelogs['VisitorOffenseInnings'] = gamelogs['NumHalfInnings']-gamelogs['HomeOffenseInnings']
gamelogs['HomeDefInnings'] = gamelogs['VisitorOffenseInnings']
gamelogs['VisitorDefInnings'] = gamelogs['HomeOffenseInnings']

In [52]:
gamelogs[['NumHalfInnings','HomeOffenseInnings','VisitorOffenseInnings']].head(20)

Unnamed: 0,NumHalfInnings,HomeOffenseInnings,VisitorOffenseInnings
0,18.0,9.0,9.0
1,17.0,8.0,9.0
2,18.0,9.0,9.0
3,17.0,8.0,9.0
4,18.0,9.0,9.0
5,18.0,9.0,9.0
6,18.0,9.0,9.0
7,17.0,8.0,9.0
8,17.0,8.0,9.0
9,18.0,9.0,9.0


In [53]:
#create new columns with Relief Pitching Info
gamelogs['HomeReliefIP'] = gamelogs['VisitorOffenseInnings'] - gamelogs['IP_HSP']
gamelogs['VisitorReliefIP'] = gamelogs['HomeOffenseInnings'] - gamelogs['IP_VSP']

gamelogs['HomeReliefWP'] = gamelogs['VisitorWP'] - gamelogs['WP_HSP']
gamelogs['VisitorReliefWP'] = gamelogs['HomeWP'] - gamelogs['WP_VSP']

gamelogs['HomeReliefER'] = gamelogs['HomeER'] - gamelogs['ER_HSP']
gamelogs['VisitorReliefER'] = gamelogs['VisitorER'] - gamelogs['ER_VSP']

gamelogs['HomeReliefK'] = gamelogs['VisitorK'] - gamelogs['K_HSP']
gamelogs['VisitorReliefK'] = gamelogs['HomeK'] - gamelogs['K_VSP']

gamelogs['HomeReliefBB'] = gamelogs['VisitorBB'] - gamelogs['BB_HSP']
gamelogs['VisitorReliefBB'] = gamelogs['HomeBB'] - gamelogs['BB_VSP']

gamelogs['HomeReliefHBP'] = gamelogs['VisitorHBP'] - gamelogs['HBP_HSP']
gamelogs['VisitorReliefHBP'] = gamelogs['HomeHBP'] - gamelogs['HBP_VSP']

gamelogs['HomeReliefHR'] = gamelogs['VisitorHR'] - gamelogs['HR_HSP']
gamelogs['VisitorReliefHR'] = gamelogs['HomeHR'] - gamelogs['HR_VSP']

gamelogs['HomeReliefH'] = gamelogs['VisitorH'] - gamelogs['H_HSP']
gamelogs['VisitorReliefH'] = gamelogs['HomeH'] - gamelogs['H_VSP']

gamelogs['HomePitchK'] = gamelogs['VisitorK']
gamelogs['VisitorPitchK'] = gamelogs['HomeK']

gamelogs['HomeEROff'] = gamelogs['VisitorER']
gamelogs['VisitorEROff'] = gamelogs['HomeER']

In [54]:
#calculate times reaching base and plate appearances
gamelogs['HomeRB'] = gamelogs['HomeH'] + gamelogs['HomeBB'] + gamelogs['HomeHBP']
gamelogs['HomePA'] = gamelogs['HomeAB']+gamelogs['HomeBB']+gamelogs['HomeHBP']+gamelogs['HomeSF']

gamelogs['VisitorRB'] = gamelogs['VisitorH'] + gamelogs['VisitorBB'] + gamelogs['VisitorHBP']
gamelogs['VisitorPA'] = gamelogs['VisitorAB']+gamelogs['VisitorBB']+gamelogs['VisitorHBP']+gamelogs['VisitorSF']

gamelogs['HomePower'] = gamelogs['HomeD'] + 2 * gamelogs['HomeT'] + 3 * gamelogs['HomeHR']
gamelogs['VisitorPower'] = gamelogs['VisitorD'] + 2 * gamelogs['VisitorT'] + 3 * gamelogs['VisitorHR']

gamelogs['HomeBIP'] = gamelogs['HomeAB']-gamelogs['HomeK']-gamelogs['HomeHR']+gamelogs['HomeSF']+gamelogs['HomeSH']
gamelogs['VisitorBIP'] = gamelogs['VisitorAB']-gamelogs['VisitorK']-gamelogs['VisitorHR']+gamelogs['VisitorSF']+gamelogs['VisitorSH']

gamelogs['HomePitchBIP'] = gamelogs['VisitorBIP']
gamelogs['VisitorPitchBIP'] = gamelogs['HomeBIP']

#hits allowed by defense
gamelogs['HomeDefHminusHR'] = gamelogs['VisitorH']- gamelogs['VisitorHR']
gamelogs['VisitorDefHminusHR'] = gamelogs['HomeH'] - gamelogs['HomeHR']

In [55]:
gamelogs.rename(columns = {'HomeRunsScore': 'HomeRunsScored'}, inplace=True)
gamelogs['HomeRunDiff'] = gamelogs['HomeRunsScored'] - gamelogs['VisitorRunsScored']
gamelogs['VisitorRunDiff'] = -1*gamelogs['HomeRunDiff']

In [56]:
gamelogs = gamelogs.drop(['HomeOBP','VisitOBP'], axis=1)

In [57]:
home_idx1 = gamelogs.columns.get_loc('HomeAB')
home_idx2 = gamelogs.columns.get_loc('HomeTP')
print(home_idx1)
print(home_idx2)
cols1 = gamelogs[['HomeRunsScored', 'HomeEROff']]
cols2 = gamelogs.iloc[:, home_idx1 : home_idx2+1]
HomeRelCols = [col for col in gamelogs.columns if col.startswith('HomeRelief')]
cols3 = gamelogs[HomeRelCols]
cols4 = gamelogs[['HomeOffenseInnings','HomeDefInnings','HomeRB','HomePA', 'HomePower', 'HomeBIP', 'HomePitchBIP','HomeDefHminusHR','HomeRunDiff']]
cols = pd.concat([cols1,cols2,cols3, cols4], axis=1)
cols.columns

49
76


Index(['HomeRunsScored', 'HomeEROff', 'HomeAB', 'HomeH', 'HomeD', 'HomeT',
       'HomeHR', 'HomeRBI', 'HomeSH', 'HomeSF', 'HomeHBP', 'HomeBB', 'HomeIBB',
       'HomeK', 'HomeSB', 'HomeCS', 'HomeGDP', 'HomeCI', 'HomeLOB',
       'HomePitchers', 'HomeER', 'HomeTER', 'HomeWP', 'HomeBalks', 'HomePO',
       'HomeA', 'HomeE', 'HomePassed', 'HomeDB', 'HomeTP', 'HomeReliefIP',
       'HomeReliefWP', 'HomeReliefER', 'HomeReliefK', 'HomeReliefBB',
       'HomeReliefHBP', 'HomeReliefHR', 'HomeReliefH', 'HomeOffenseInnings',
       'HomeDefInnings', 'HomeRB', 'HomePA', 'HomePower', 'HomeBIP',
       'HomePitchBIP', 'HomeDefHminusHR', 'HomeRunDiff'],
      dtype='object')

In [58]:
#calculate team averages
for home_col in cols.columns:
    stat = home_col[4:]
    visit_col = 'Visitor' + stat
    melted = melt_gamelogs([home_col, visit_col])  
    melted['stat'] = relavent_team_stat(melted, home_col, visit_col)
    melted = melted.sort_values(['Team','Date'])
    new_col = 'Avg' + stat
    melted_grpd = melted.groupby(['Team', melted['Date'].dt.year])
    melted[new_col] = melted_grpd['stat'].apply(calculate_teamavg)
    melted[new_col] = fill_with_EOPS_data(melted_grpd, melted, 'stat', 'Team', new_col, 4, 1)
    #fill second start of season with avg of end of prior season and first game
    melted[new_col] = fill_2ndand3rd_game(melted_grpd, melted, 'stat', new_col, 'Team')
    #melted[new_col] = melted.groupby('Team')[new_col].fillna(method='ffill')                                                                                   
    melted = melted[['Date','DoubleHeader','Team',new_col]]                                                                                
    gamelogs = merge_twice(gamelogs, melted, 'HomeTeam', 'VisitingTeam', 'Team')                                                                                    

In [59]:
gamelogs.tail()

Unnamed: 0,Date,DoubleHeader,DayOfWeek,VisitingTeam,VisitingTeamLeague,VisitingTeamGameNumber,HomeTeam,HomeTeamLeague,HomeTeamGameNumber,VisitorRunsScored,...,Team_V,AvgPitchBIP_V,Team_H,AvgDefHminusHR_H,Team_V.1,AvgDefHminusHR_V,Team_H.1,AvgRunDiff_H,Team_V.2,AvgRunDiff_V
24293,2018-09-30,0,Sun,CHA,AL,162,MIN,AL,162,4,...,CHA,25.559006,MIN,7.57764,CHA,7.47205,MIN,-0.236025,CHA,-1.186335
24294,2018-09-30,0,Sun,TEX,AL,162,SEA,AL,162,1,...,TEX,26.701863,SEA,7.440994,TEX,7.993789,SEA,-0.223602,TEX,-0.677019
24295,2018-09-30,0,Sun,TOR,AL,162,TBA,AL,162,4,...,TOR,25.521739,TBA,6.602484,TOR,7.813665,TBA,0.403727,TOR,-0.732919
24296,2018-10-01,0,Mon,MIL,NL,163,CHN,NL,163,3,...,MIL,23.938272,CHN,7.098765,MIL,6.691358,CHN,0.728395,MIL,0.574074
24297,2018-10-01,0,Mon,COL,NL,163,LAN,NL,163,2,...,COL,24.438272,LAN,6.777778,COL,7.320988,LAN,1.179012,COL,0.234568


In [60]:
#OBP
gamelogs['AvgOBP_H'] = gamelogs['AvgRB_H']/gamelogs['AvgPA_H']
gamelogs['AvgOBP_V'] = gamelogs['AvgRB_V']/gamelogs['AvgPA_V']

#isolated power
gamelogs['AvgISO_H'] = gamelogs['AvgPower_H']/gamelogs['AvgAB_H']
gamelogs['AvgISO_V'] = gamelogs['AvgPower_V']/gamelogs['AvgAB_V']

#Offense earned runs scored per 9 innings
gamelogs['AvgOffERunsPer9Inn_H'] = gamelogs['AvgEROff_H'] * 9 / gamelogs['AvgOffenseInnings_H']
gamelogs['AvgOffERunsPer9Inn_V'] = gamelogs['AvgEROff_V'] * 9 / gamelogs['AvgOffenseInnings_V']

#Relief ERA
gamelogs['AvgReliefERA_H'] = gamelogs['AvgReliefER_H'] * 9 / gamelogs['AvgReliefIP_H']
gamelogs['AvgReliefERA_V'] = gamelogs['AvgReliefER_V'] * 9 / gamelogs['AvgReliefIP_V']

#Relief FIP no Constant
gamelogs['FIPnumerator_H'] = 13*gamelogs['AvgReliefHR_H'] + 3*(gamelogs['AvgReliefBB_H']+gamelogs['AvgReliefHBP_H']) - 2*gamelogs['AvgReliefK_H']
gamelogs['AvgRelFIPnoConst_H'] = gamelogs['FIPnumerator_H']/gamelogs['AvgReliefIP_H']
gamelogs['FIPnumerator_V'] = 13*gamelogs['AvgReliefHR_V'] + 3*(gamelogs['AvgReliefBB_V']+gamelogs['AvgReliefHBP_V']) - 2*gamelogs['AvgReliefK_V']
gamelogs['AvgRelFIPnoConst_V'] = gamelogs['FIPnumerator_V']/gamelogs['AvgReliefIP_V']

#BABIP - (H - HR)/(AB - K - HR + SF + SH)
gamelogs['AvgBABIP_H'] = (gamelogs['AvgH_H']-gamelogs['AvgHR_H'])/gamelogs['AvgBIP_H']
gamelogs['AvgBABIP_V'] = (gamelogs['AvgH_V']-gamelogs['AvgHR_V'])/gamelogs['AvgBIP_V']

#Pitcher BABIP - outs per ba ball in play
gamelogs['AvgPitchBABIP_H'] = gamelogs['AvgDefHminusHR_H']/gamelogs['AvgPitchBIP_H']
gamelogs['AvgPitchBABIP_V'] = gamelogs['AvgDefHminusHR_V']/gamelogs['AvgPitchBIP_V']

#Avg assists per 9 innings
gamelogs['AvgAper9_H'] = gamelogs['AvgA_H']*9/gamelogs['AvgDefInnings_H']
gamelogs['AvgAper9_V'] = gamelogs['AvgA_V']*9/gamelogs['AvgDefInnings_V']

In [61]:
gamelogs.Attendance.head()

0    44532.0
1    48799.0
2    42177.0
3    34323.0
4    43827.0
Name: Attendance, dtype: float64

In [62]:
byparkyear = gamelogs.groupby(['ParkID',gamelogs['Date'].dt.year])
byHomeyear = gamelogs.groupby(['HomeTeam',gamelogs['Date'].dt.year])

gamelogs['AvgAttendance'] = byparkyear['Attendance'].apply(lambda x: x.expanding().mean().shift())
gamelogs['AvgAttendance'] = gamelogs.groupby('ParkID')['AvgAttendance'].fillna(method='ffill')
gamelogs['temp'] = byHomeyear['Attendance'].apply(lambda x: x.expanding().mean().shift())
gamelogs['temp'] = gamelogs.groupby('HomeTeam')['AvgAttendance'].fillna(method='ffill')
gamelogs['AvgAttendance'] = gamelogs['AvgAttendance'].fillna(gamelogs['temp'])
gamelogs.drop('temp', axis=1,inplace=True)

In [63]:
gamelogs[['Date','HomeTeam','AvgAttendance']].tail()

Unnamed: 0,Date,HomeTeam,AvgAttendance
24293,2018-09-30,MIN,24544.743109
24294,2018-09-30,SEA,28479.2875
24295,2018-09-30,TBA,14270.75
24296,2018-10-01,CHN,38798.012346
24297,2018-10-01,LAN,47033.135802


In [64]:
gamelogs.columns

Index(['Date', 'DoubleHeader', 'DayOfWeek', 'VisitingTeam',
       'VisitingTeamLeague', 'VisitingTeamGameNumber', 'HomeTeam',
       'HomeTeamLeague', 'HomeTeamGameNumber', 'VisitorRunsScored',
       ...
       'AvgRelFIPnoConst_H', 'FIPnumerator_V', 'AvgRelFIPnoConst_V',
       'AvgBABIP_H', 'AvgBABIP_V', 'AvgPitchBABIP_H', 'AvgPitchBABIP_V',
       'AvgAper9_H', 'AvgAper9_V', 'AvgAttendance'],
      dtype='object', length=443)

In [65]:
gamelogs = gamelogs.drop(['Team_V', 'Team_H'], axis=1)
gamelogs.shape

(24298, 349)

In [66]:
gamelogs['VisitorWin'] = np.abs(1-gamelogs['HomeWin'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [67]:
gamelogs[['HomeWin','VisitorWin']].head(10)

Unnamed: 0,HomeWin,VisitorWin
0,0,1
1,1,0
2,0,1
3,1,0
4,0,1
5,0,1
6,0,1
7,1,0
8,1,0
9,0,1


In [68]:
byHteamyear = gamelogs.groupby(['HomeTeam', gamelogs['Date'].dt.year])
byAteamyear = gamelogs.groupby(['VisitingTeam', gamelogs['Date'].dt.year])
gamelogs['AvgRunDiffAtHome_H'] = byHteamyear['HomeRunDiff'].apply(calculate_teamavg)
gamelogs['AvgRunDiffOnRoad_V'] = byAteamyear['VisitorRunDiff'].apply(calculate_teamavg)
gamelogs['pctWinAtHome_H'] = byHteamyear['HomeWin'].apply(calculate_teamavg)
gamelogs['pctWinOnRoad_V'] = byAteamyear['VisitorWin'].apply(calculate_teamavg)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the ca

In [69]:
for avg_col in ['AvgRunDiffAtHome_H','AvgRunDiffOnRoad_V','pctWinAtHome_H','pctWinOnRoad_V']:
    if avg_col.endswith('_H'):
        team = 'Home'
        col_to_groupby = 'HomeTeam'
    else: 
        team = 'Visitor'
        col_to_groupby = 'VisitingTeam'
    if avg_col.startswith('pct'):
        stat = 'Win'
    else:
        stat = 'RunDiff'
    col = team + stat
    grouped_df = gamelogs.groupby([col_to_groupby, gamelogs['Date'].dt.year])
    gamelogs[avg_col] = fill_with_EOPS_data(grouped_df, gamelogs, col, col_to_groupby, avg_col, 4, 1)
    gamelogs[avg_col] = fill_2ndand3rd_game(grouped_df, gamelogs, col, avg_col, col_to_groupby)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a 

In [70]:
gamelogs[['AvgRunDiffAtHome_H','AvgRunDiffOnRoad_V','pctWinAtHome_H','pctWinOnRoad_V']].describe()

Unnamed: 0,AvgRunDiffAtHome_H,AvgRunDiffOnRoad_V,pctWinAtHome_H,pctWinOnRoad_V
count,24208.0,24208.0,24208.0,24208.0
mean,0.115918,-0.120507,0.531998,0.468115
std,1.06969,1.105634,0.122217,0.120694
min,-8.333333,-8.193416,0.0,0.0
25%,-0.513514,-0.762054,0.461538,0.397059
50%,0.12,-0.09375,0.534247,0.467742
75%,0.741518,0.5,0.607595,0.535714
max,7.909465,8.333333,1.0,1.0


In [71]:
gamelogs[['AvgRunDiffAtHome_H','AvgRunDiffOnRoad_V','pctWinAtHome_H','pctWinOnRoad_V']].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24298 entries, 0 to 24297
Data columns (total 4 columns):
AvgRunDiffAtHome_H    24208 non-null float64
AvgRunDiffOnRoad_V    24208 non-null float64
pctWinAtHome_H        24208 non-null float64
pctWinOnRoad_V        24208 non-null float64
dtypes: float64(4)
memory usage: 1.6 MB


Each team's first 3 home games and first 3 away games of the 2009 season should be null. 30*3 = 90 - 90 null in Home cols and 90 null in Away cols 

In [72]:
nulls = gamelogs[(gamelogs['AvgRunDiffAtHome_H'].isnull())|(gamelogs['AvgRunDiffOnRoad_V'].isnull())]
teams = nulls['VisitingTeam'].append(nulls['HomeTeam'])
teams.value_counts()

NYA    12
SFN    12
SLN    10
ARI     9
SEA     9
CHN     9
CLE     9
TEX     9
ANA     9
CHA     9
WAS     9
MIL     9
ATL     9
HOU     9
SDN     9
PHI     8
DET     8
COL     8
TBA     7
FLO     7
KCA     6
PIT     6
CIN     6
BAL     6
OAK     6
LAN     6
TOR     6
BOS     6
NYN     6
MIN     6
dtype: int64

In [73]:
nulls.tail()

Unnamed: 0,Date,DoubleHeader,DayOfWeek,VisitingTeam,VisitingTeamLeague,VisitingTeamGameNumber,HomeTeam,HomeTeamLeague,HomeTeamGameNumber,VisitorRunsScored,...,AvgPitchBABIP_H,AvgPitchBABIP_V,AvgAper9_H,AvgAper9_V,AvgAttendance,VisitorWin,AvgRunDiffAtHome_H,AvgRunDiffOnRoad_V,pctWinAtHome_H,pctWinOnRoad_V
146,2009-04-17,0,Fri,FLO,NL,10,WAS,NL,9,3,...,0.316964,0.286885,10.425743,9.555556,30440.0,1,,4.666667,,1.0
149,2009-04-17,0,Fri,CLE,AL,11,NYA,AL,11,5,...,0.304348,0.330798,9.0,8.152941,48271.0,0,,-2.0,,0.285714
160,2009-04-18,0,Sat,ARI,NL,11,SFN,NL,11,2,...,0.308642,0.269784,7.577075,10.8,37114.0,1,2.5,,0.75,
164,2009-04-18,0,Sat,CLE,AL,12,NYA,AL,12,22,...,0.307971,0.314685,9.094737,8.032258,46686.0,1,,-1.875,,0.25
174,2009-04-19,0,Sun,ARI,NL,12,SFN,NL,12,0,...,0.312977,0.274834,7.714286,10.636364,37173.0,0,1.6,,0.6,


In [74]:
#check that it worked
RS = gamelogs[(gamelogs['HomeTeam']=='BOS') | (gamelogs['VisitingTeam']=='BOS')].sort_values('Date')
RS_H = RS[['Date', 'HomeTeam', 'VisitingTeam','HomeH','AvgH_H','VisitorH', 'AvgH_V']]
RS_H.iloc[155:170,:]

Unnamed: 0,Date,HomeTeam,VisitingTeam,HomeH,AvgH_H,VisitorH,AvgH_V
2338,2009-09-28,BOS,TOR,7,9.219355,14,9.282051
2351,2009-09-29,BOS,TOR,14,9.205128,11,9.312102
2367,2009-09-30,BOS,TOR,3,9.235669,17,9.322785
2380,2009-10-01,BOS,CLE,12,9.196203,3,9.107595
2393,2009-10-02,BOS,CLE,8,9.213836,8,9.069182
2408,2009-10-03,BOS,CLE,11,9.20625,10,9.0625
2423,2009-10-04,BOS,CLE,11,9.217391,8,9.068323
2430,2010-04-04,BOS,NYA,12,9.228395,12,9.901235
2445,2010-04-06,BOS,NYA,9,10.614198,9,10.950617
2452,2010-04-07,BOS,NYA,7,10.076132,6,10.300412


In [75]:
gamelogs = gamelogs.replace(np.inf,np.nan)

In [76]:
rel_cols = [col for col in gamelogs.columns if col.startswith(('pct','Avg'))]
gamelogs[rel_cols].shape

(24298, 149)

In [77]:
gamelogs[rel_cols].iloc[:,:60].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24298 entries, 0 to 24297
Data columns (total 60 columns):
pctCG_HSP            24224 non-null float64
pctGF_HSP            24224 non-null float64
pctW_HSP             24224 non-null float64
pctL_HSP             24224 non-null float64
pctSho_HSP           24224 non-null float64
AvgIP_HSP            24224 non-null float64
AvgH_HSP             24224 non-null float64
AvgR_HSP             24224 non-null float64
AvgER_HSP            24224 non-null float64
AvgHR_HSP            24224 non-null float64
AvgBB_HSP            24224 non-null float64
AvgK_HSP             24224 non-null float64
AvgHBP_HSP           24224 non-null float64
AvgBK_HSP            24224 non-null float64
AvgWP_HSP            24224 non-null float64
AvgERA_HSP           24224 non-null float64
AvgFIPnoConst_HSP    24224 non-null float64
pctCG_VSP            24222 non-null float64
pctGF_VSP            24222 non-null float64
pctW_VSP             24222 non-null float64
pctL_VSP   

In [78]:
gamelogs[rel_cols].iloc[:,60:].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24298 entries, 0 to 24297
Data columns (total 89 columns):
AvgK_H                  24253 non-null float64
AvgK_V                  24253 non-null float64
AvgSB_H                 24253 non-null float64
AvgSB_V                 24253 non-null float64
AvgCS_H                 24253 non-null float64
AvgCS_V                 24253 non-null float64
AvgGDP_H                24253 non-null float64
AvgGDP_V                24253 non-null float64
AvgCI_H                 24253 non-null float64
AvgCI_V                 24253 non-null float64
AvgLOB_H                24253 non-null float64
AvgLOB_V                24253 non-null float64
AvgPitchers_H           24253 non-null float64
AvgPitchers_V           24253 non-null float64
AvgER_H                 24253 non-null float64
AvgER_V                 24253 non-null float64
AvgTER_H                24253 non-null float64
AvgTER_V                24253 non-null float64
AvgWP_H                 24253 non-null float6

In [79]:
gamelogs.head()

Unnamed: 0,Date,DoubleHeader,DayOfWeek,VisitingTeam,VisitingTeamLeague,VisitingTeamGameNumber,HomeTeam,HomeTeamLeague,HomeTeamGameNumber,VisitorRunsScored,...,AvgPitchBABIP_H,AvgPitchBABIP_V,AvgAper9_H,AvgAper9_V,AvgAttendance,VisitorWin,AvgRunDiffAtHome_H,AvgRunDiffOnRoad_V,pctWinAtHome_H,pctWinOnRoad_V
0,2009-04-05,0,Sun,ATL,NL,1,PHI,NL,1,4,...,,,,,,1,,,,
1,2009-04-06,0,Mon,COL,NL,1,ARI,NL,1,8,...,,,,,,0,,,,
2,2009-04-06,0,Mon,NYN,NL,1,CIN,NL,1,2,...,,,,,,1,,,,
3,2009-04-06,0,Mon,WAS,NL,1,FLO,NL,1,6,...,,,,,,0,,,,
4,2009-04-06,0,Mon,CHN,NL,1,HOU,NL,1,4,...,,,,,,1,,,,


Ready to pickle and move on to Inferential Statistics!

In [80]:
outfile = open('../PickledFiles/gamelogs3', 'wb')
pickle.dump(gamelogs, outfile)
outfile.close()