In [1]:
import sys
import os
import pandas as pd
import numpy as np
import datetime, time
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
from statsmodels.formula.api import ols
from statsmodels.iolib.summary2 import summary_col
from statsmodels.stats.outliers_influence import variance_inflation_factor
from pylab import hist, show
import scipy
import zipfile


# Notebook-wide display limits: render at most 50 rows and 200 columns per frame.
pd.options.display.max_rows = 50
pd.options.display.max_columns = 200

#### $Win = \beta_{0} + \beta_{1}MeanC + \beta_{2}MeanW + \beta_{3}MeanD + e_{s}$

- A balanced roster will have one player ranked in each centre position (C1, C2, C3, C4), two wingers ranked on every line (LW1, RW1, LW2, RW2, etc.) and two defensemen ranked in each of the three defensive pairings. 
- The mean ranking of a balanced roster is 2.5 [(1+2+3+4)/4] for centres, 2.5 [(1+1+2+2+3+3+4+4)/8] for wingers and 2 [(1+1+2+2+3+3)/6] for defensemen.

- Since players are ranked from 1 to 4 for forwards and 1 to 3 for defensemen, 1 being the highest ranked, a team is considered to have an above average roster when the **mean of each forward position is smaller than 2.5 and the mean of defensive pairings is smaller than 2**. 

### games with 4 centers, 8 wingers and 6 defensemen

In [32]:
# Player-level ranking file for the "4 centres, 8 wingers, 6 defensemen" games.
# NOTE(review): the path is an absolute location on one collaborator's machine;
# the other collaborator's copy lives under
# '/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/'.
# 'Unnamed: 0' is presumably the index column written out by an earlier
# DataFrame.to_csv call -- it is discarded right after loading.
dc = (
    pd.read_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/4c_8w_6d_game_team_player_ranking_nhl_positions.csv')
    .drop('Unnamed: 0', axis='columns')
)
dc.shape

(1620, 17)

In [33]:
# Use the shorter column name 'Position' from here on.
dc = dc.rename(columns={'PlayerPosition': 'Position'})

# playercount: number of rows each player has within a season / game / team.
player_keys = ['Season', 'GameNumber', 'TeamCode', 'PlayerNumber']
dc['playercount'] = dc.groupby(player_keys)['PlayerNumber'].transform('count')

# rosterposition: playercount summed over all rows sharing the same
# position and rank within a season / game / team.
slot_keys = ['Season', 'GameNumber', 'TeamCode', 'Position', 'Rank']
dc['rosterposition'] = dc.groupby(slot_keys)['playercount'].transform('sum')

dc.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,PlayerName,Position,Rank,GF,GA,GD,WinTeam,LossTeam,RosterCount,PositionCount,CCount,WCount,DCount,playercount,rosterposition
0,2010,20023,LA,11.0,KOPITAR,C,1.0,1,1,0,LA,VAN,18.0,4.0,4.0,8.0,6.0,1.0,1.0
1,2010,20023,LA,23.0,BROWN,W,2.0,1,1,0,LA,VAN,18.0,8.0,4.0,8.0,6.0,1.0,3.0
2,2010,20023,LA,94.0,SMYTH,W,2.0,1,1,0,LA,VAN,18.0,8.0,4.0,8.0,6.0,1.0,3.0
3,2010,20023,LA,8.0,DOUGHTY,D,1.0,1,1,0,LA,VAN,18.0,6.0,4.0,8.0,6.0,1.0,2.0
4,2010,20023,LA,33.0,MITCHELL,D,2.0,1,1,0,LA,VAN,18.0,6.0,4.0,8.0,6.0,1.0,3.0


#### pivot table

- the next step is to group players by gamenumber, teamcode, position and rank, to display the quality of players each team has per position. **Pivot table** by player position and rank using roster position values. Game number and team are the indexes. We want to join the levels to generate columns by roster position and rank. 

In [34]:
# Reshape to one row per team-game with one column per (Position, Rank) pair.
# pivot_table aggregates with the mean by default; rosterposition was built with
# a group-wise transform over the same keys, so it should be constant within
# each cell and the mean simply returns that value.
dc = pd.pivot_table(
    dc,
    index=['Season', 'GameNumber', 'WinTeam', 'LossTeam', 'GF', 'GA', 'GD',
           'TeamCode', 'RosterCount', 'CCount', 'WCount', 'DCount'],
    columns=['Position', 'Rank'],
    values=['rosterposition'],
)
dc = dc.reset_index()

# Flatten the column MultiIndex into 'rosterposition_<Position>_<Rank>' labels.
# Empty levels (the former index columns) are skipped by the `if s` filter.
dc.columns = ['_'.join(str(s).strip() for s in col if s) for col in dc.columns]

# A (Position, Rank) combination a team did not dress shows up as NaN -> 0.
# (The original cell also called dc.reset_index() here without keeping the
# result; that statement had no effect and has been removed.)
dc = dc.fillna(0)

# Rank is stored as a float, hence the '.0' suffix in the flattened labels.
dc = dc.rename(columns={
    'rosterposition_C_1.0': 'C1',
    'rosterposition_C_2.0': 'C2',
    'rosterposition_C_3.0': 'C3',
    'rosterposition_C_4.0': 'C4',
    'rosterposition_W_1.0': 'W1',
    'rosterposition_W_2.0': 'W2',
    'rosterposition_W_3.0': 'W3',
    'rosterposition_W_4.0': 'W4',
    'rosterposition_D_1.0': 'D1',
    'rosterposition_D_2.0': 'D2',
    'rosterposition_D_3.0': 'D3',
})
dc.head(10)

Unnamed: 0,Season,GameNumber,WinTeam,LossTeam,GF,GA,GD,TeamCode,RosterCount,CCount,WCount,DCount,C1,C2,C3,C4,D1,D2,D3,W1,W2,W3,W4
0,2010,20023,LA,VAN,1,1,0,LA,18.0,4.0,8.0,6.0,1.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,3.0,2.0,2.0
1,2010,20023,LA,VAN,1,1,0,VAN,18.0,4.0,8.0,6.0,2.0,0.0,1.0,1.0,4.0,0.0,2.0,3.0,0.0,3.0,2.0
2,2010,20025,CGY,LA,1,3,2,LA,18.0,4.0,8.0,6.0,1.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,3.0,2.0,2.0
3,2010,20025,CGY,LA,3,1,2,CGY,18.0,4.0,8.0,6.0,1.0,2.0,0.0,1.0,3.0,2.0,1.0,3.0,3.0,0.0,2.0
4,2010,20037,WSH,NYI,1,2,1,NYI,18.0,4.0,8.0,6.0,0.0,3.0,0.0,1.0,1.0,4.0,1.0,1.0,3.0,1.0,3.0
5,2010,20037,WSH,NYI,2,1,1,WSH,18.0,4.0,8.0,6.0,1.0,0.0,2.0,1.0,2.0,3.0,1.0,3.0,4.0,0.0,1.0
6,2010,20041,ANA,VAN,3,4,1,VAN,18.0,4.0,8.0,6.0,2.0,0.0,1.0,1.0,4.0,0.0,2.0,3.0,0.0,3.0,2.0
7,2010,20041,ANA,VAN,4,3,1,ANA,18.0,4.0,8.0,6.0,1.0,1.0,2.0,0.0,2.0,2.0,2.0,3.0,1.0,2.0,2.0
8,2010,20053,ATL,ANA,4,5,-1,ANA,18.0,4.0,8.0,6.0,1.0,1.0,2.0,0.0,2.0,3.0,1.0,3.0,1.0,2.0,2.0
9,2010,20053,ATL,ANA,5,4,-1,ATL,18.0,4.0,8.0,6.0,0.0,3.0,1.0,0.0,2.0,3.0,1.0,0.0,5.0,2.0,1.0


- Assign a value of 1 to the team that won the game and a value of 0 to the team that lost. Compute the mean rank by position per team for each game.

In [35]:
# Win indicator: 1 when this row's team is the game's winner, 0 otherwise.
# A vectorised comparison replaces the row-wise apply; it gives the same 0/1
# values and also works when the frame is empty.
dc['Win'] = (dc['WinTeam'] == dc['TeamCode']).astype(int)

# Rank-weighted mean per position: each Ck / Wk / Dk column (the pivoted roster
# count for rank k) is weighted by k and divided by the position's head count.
dc['MeanC'] = (dc['C1'] * 1 + dc['C2'] * 2 + dc['C3'] * 3 + dc['C4'] * 4) / dc['CCount']
dc['MeanW'] = (dc['W1'] * 1 + dc['W2'] * 2 + dc['W3'] * 3 + dc['W4'] * 4) / dc['WCount']
dc['MeanD'] = (dc['D1'] * 1 + dc['D2'] * 2 + dc['D3'] * 3) / dc['DCount']
dc['Sum'] = dc['MeanC'] + dc['MeanW'] + dc['MeanD']

# Reassign instead of sorting in place so re-running the cell stays predictable.
dc = dc.sort_values('GameNumber')
dc.head()

Unnamed: 0,Season,GameNumber,WinTeam,LossTeam,GF,GA,GD,TeamCode,RosterCount,CCount,WCount,DCount,C1,C2,C3,C4,D1,D2,D3,W1,W2,W3,W4,Win,MeanC,MeanW,MeanD,Sum
0,2010,20023,LA,VAN,1,1,0,LA,18.0,4.0,8.0,6.0,1.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,3.0,2.0,2.0,1,2.5,2.625,1.833333,6.958333
1,2010,20023,LA,VAN,1,1,0,VAN,18.0,4.0,8.0,6.0,2.0,0.0,1.0,1.0,4.0,0.0,2.0,3.0,0.0,3.0,2.0,0,2.25,2.5,1.666667,6.416667
2,2010,20025,CGY,LA,1,3,2,LA,18.0,4.0,8.0,6.0,1.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,3.0,2.0,2.0,0,2.5,2.625,1.833333,6.958333
3,2010,20025,CGY,LA,3,1,2,CGY,18.0,4.0,8.0,6.0,1.0,2.0,0.0,1.0,3.0,2.0,1.0,3.0,3.0,0.0,2.0,1,2.25,2.125,1.666667,6.041667
4,2010,20037,WSH,NYI,1,2,1,NYI,18.0,4.0,8.0,6.0,0.0,3.0,0.0,1.0,1.0,4.0,1.0,1.0,3.0,1.0,3.0,0,2.5,2.75,2.0,7.25


- summary analysis

In [36]:
# Summary statistics of the position means, split by game outcome.
# Columns are selected with a list: selecting several columns with a bare tuple
# (['MeanC', 'MeanW', 'MeanD'] without the inner brackets) was deprecated in
# pandas 1.0 and raises in pandas 2.0.
dc.groupby(['Win'])[['MeanC', 'MeanW', 'MeanD']].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,MeanC,MeanW,MeanD
Win,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,count,45.0,45.0,45.0
0,mean,2.233333,2.319444,1.744444
0,std,0.338781,0.344967,0.179787
0,min,1.5,1.5,1.333333
0,25%,2.25,2.25,1.666667
0,50%,2.25,2.375,1.666667
0,75%,2.5,2.5,1.833333
0,max,2.75,2.75,2.166667
1,count,45.0,45.0,45.0
1,mean,2.261111,2.297222,1.848148


### model estimation

- regress **win** on the mean of players by position for games with 4 centers, 8 wingers and 6 defensemen.

In [37]:
# Deviation of each position mean from its balanced-roster value
# (2.5 for centres and wingers, 2 for defensemen): meanc, meanw, meand.
for pos, baseline in (('C', 2.5), ('W', 2.5), ('D', 2)):
    dc['mean' + pos.lower()] = baseline - dc['Mean' + pos]

# Count differences relative to the lowest rank of each position
# (rank 4 for C and W, rank 3 for D): DC1..DC3, DW1..DW3, DD1..DD2.
for pos, lowest in (('C', 4), ('W', 4), ('D', 3)):
    for rank in range(1, lowest):
        dc['D{}{}'.format(pos, rank)] = (
            dc['{}{}'.format(pos, rank)] - dc['{}{}'.format(pos, lowest)]
        )


In [38]:
# Dependent variable: 1 if the team won the game, 0 otherwise.
y = dc['Win']

# Design matrices.
#   X1 / X4: raw position means, with / without an intercept
#   X2 / X3: deviations from the balanced-roster means, with / without an intercept
#   X6 / X5: rank-count differences, with / without an intercept
X1 = sm.add_constant(dc[['MeanC', 'MeanW', 'MeanD']])
X2 = sm.add_constant(dc[['meanc', 'meanw', 'meand']])

X3 = dc[['meanc', 'meanw', 'meand']]
X4 = dc[['MeanC', 'MeanW', 'MeanD']]

X5 = dc[['DC1', 'DC2', 'DC3', 'DW1', 'DW2', 'DW3', 'DD1', 'DD2']]
X6 = sm.add_constant(dc[['DC1', 'DC2', 'DC3', 'DW1', 'DW2', 'DW3', 'DD1', 'DD2']])

# Linear probability models, one per design matrix.
# The original cell also fitted m7 / m8 on X7 / X8, which are never defined in
# this notebook and raise NameError on a fresh kernel; those fits were removed.
m1 = sm.OLS(y, X1).fit()
m2 = sm.OLS(y, X2).fit()
m3 = sm.OLS(y, X3).fit()
m4 = sm.OLS(y, X4).fit()
m5 = sm.OLS(y, X5).fit()
m6 = sm.OLS(y, X6).fit()

# Only m3 is displayed; swap in m1, m2, m4, m5 or m6 to inspect the others.
m3.summary()

0,1,2,3
Dep. Variable:,Win,R-squared:,0.211
Model:,OLS,Adj. R-squared:,0.183
Method:,Least Squares,F-statistic:,7.74
Date:,"Wed, 02 May 2018",Prob (F-statistic):,0.000122
Time:,21:36:32,Log-Likelihood:,-85.867
No. Observations:,90,AIC:,177.7
Df Residuals:,87,BIC:,185.2
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
meanc,0.3523,0.195,1.809,0.074,-0.035 0.739
meanw,0.4271,0.243,1.761,0.082,-0.055 0.909
meand,0.2629,0.293,0.896,0.373,-0.320 0.846

0,1,2,3
Omnibus:,37.028,Durbin-Watson:,2.441
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5.926
Skew:,-0.036,Prob(JB):,0.0517
Kurtosis:,1.745,Cond. No.,2.61


In [39]:
print('win contribution of each roster position')

# No-intercept regression of Win on the deviations from the balanced-roster means.
y = dc['Win']
X = dc[['meanc', 'meanw', 'meand']]
result = sm.OLS(y, X).fit()
print(result.summary())

# Minimal LaTeX wrapper around the regression table.
beginningtex = """\\documentclass{report}
\\usepackage{booktabs}
\\begin{document}"""
# Raw string: "\e" is not a valid escape sequence, and newer Python versions
# warn about it. The resulting text is unchanged.
endtex = r"\end{document}"

# NOTE(review): absolute path on one collaborator's machine.
# The context manager closes the file even if one of the writes fails.
with open('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/analysis/game_level_win_ols_mean_position.tex', 'w') as f:
    f.write(beginningtex)
    f.write(result.summary().as_latex())
    f.write(endtex)

win contribution of each roster position
                            OLS Regression Results                            
Dep. Variable:                    Win   R-squared:                       0.211
Model:                            OLS   Adj. R-squared:                  0.183
Method:                 Least Squares   F-statistic:                     7.740
Date:                Wed, 02 May 2018   Prob (F-statistic):           0.000122
Time:                        21:36:32   Log-Likelihood:                -85.867
No. Observations:                  90   AIC:                             177.7
Df Residuals:                      87   BIC:                             185.2
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
meanc      

### calculate  and inspect Variance Inflation Factor (VIF)

- Not correlated: $VIF=1$
- Moderately correlated: **$1<VIF<5$** or at a more conservative level of **$1<VIF<2.5$**
- Highly correlated: **$VIF \geq 5$** or at a more conservative level **$VIF \geq 2.5$**

#### $Win = \beta_{0} + \beta_{1}MeanC + \beta_{2}MeanW + \beta_{3}MeanD + e_{s}$

In [40]:
# VIF of every column of X1 (intercept included), listed next to its column name.
vif1 = pd.DataFrame(
    {
        'VIF Factor': [variance_inflation_factor(X1.values, col) for col in range(len(X1.columns))],
        'features': list(X1.columns),
    },
    columns=['VIF Factor', 'features'],
)
vif1.round(3)

Unnamed: 0,VIF Factor,features
0,102.747,const
1,1.134,MeanC
2,1.233,MeanW
3,1.179,MeanD


the variance inflation factor for the constant of this model is **not acceptable** as $VIF>2.5$

In [41]:
# VIF of every column of X2 (intercept included), listed next to its column name.
vif2 = pd.DataFrame(
    {
        'VIF Factor': [variance_inflation_factor(X2.values, col) for col in range(len(X2.columns))],
        'features': list(X2.columns),
    },
    columns=['VIF Factor', 'features'],
)
vif2.round(3)

Unnamed: 0,VIF Factor,features
0,2.142,const
1,1.134,meanc
2,1.233,meanw
3,1.179,meand


the variance inflation factors for this model are **acceptable** as $VIF<2.5$

In [42]:
# VIF of every column of X3 (no intercept column), listed next to its column name.
vif3 = pd.DataFrame(
    {
        'VIF Factor': [variance_inflation_factor(X3.values, col) for col in range(len(X3.columns))],
        'features': list(X3.columns),
    },
    columns=['VIF Factor', 'features'],
)
vif3.round(3)

Unnamed: 0,VIF Factor,features
0,1.561,meanc
1,1.688,meanw
2,1.721,meand


In [13]:
# Minimal LaTeX wrapper around the VIF table of the no-intercept model (vif3).
beginningtex = """\\documentclass{report}
\\usepackage{booktabs}
\\begin{document}"""
# Raw string: "\e" is not a valid escape sequence, and newer Python versions
# warn about it. The resulting text is unchanged.
endtex = r"\end{document}"

# NOTE(review): absolute path on one collaborator's machine.
# The context manager closes the file even if one of the writes fails.
with open('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/analysis/vif_2010-2011.tex', 'w') as f:
    f.write(beginningtex)
    f.write(vif3.to_latex())
    f.write(endtex)

the variance inflation factors for this model are **acceptable** as $VIF<2.5$

In [43]:
# VIF of every column of X4 (no intercept column), listed next to its column name.
vif4 = pd.DataFrame(
    {
        'VIF Factor': [variance_inflation_factor(X4.values, col) for col in range(len(X4.columns))],
        'features': list(X4.columns),
    },
    columns=['VIF Factor', 'features'],
)
vif4.round(3)

Unnamed: 0,VIF Factor,features
0,42.19,MeanC
1,60.911,MeanW
2,58.034,MeanD


the variance inflation factors for all variables of this model are **not acceptable** as $VIF>2.5$

In [44]:
# VIF of every column of X5 (no intercept column), listed next to its column name.
vif5 = pd.DataFrame(
    {
        'VIF Factor': [variance_inflation_factor(X5.values, col) for col in range(len(X5.columns))],
        'features': list(X5.columns),
    },
    columns=['VIF Factor', 'features'],
)
vif5.round(3)

Unnamed: 0,VIF Factor,features
0,2.071,DC1
1,2.489,DC2
2,1.544,DC3
3,2.098,DW1
4,3.018,DW2
5,2.233,DW3
6,2.74,DD1
7,2.705,DD2


the variance inflation factors for DW2, DD1 and DD2 of this model are **not acceptable** as $VIF>2.5$

In [45]:
# VIF of every column of X6 (intercept included), listed next to its column name.
vif6 = pd.DataFrame(
    {
        'VIF Factor': [variance_inflation_factor(X6.values, col) for col in range(len(X6.columns))],
        'features': list(X6.columns),
    },
    columns=['VIF Factor', 'features'],
)
vif6.round(3)

Unnamed: 0,VIF Factor,features
0,3.467,const
1,1.914,DC1
2,1.396,DC2
3,1.246,DC3
4,1.883,DW1
5,1.674,DW2
6,1.706,DW3
7,1.554,DD1
8,2.007,DD2


the variance inflation factor for the constant of this model is **not acceptable** as $VIF>2.5$

### games with 12 forwards and 6 defensemen

In [46]:
# Player-level ranking file for the "12 forwards, 6 defensemen" games.
# NOTE(review): the path is an absolute location on one collaborator's machine;
# the other collaborator's copy lives under
# '/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/'.
# 'Unnamed: 0' is presumably the index column written out by an earlier
# DataFrame.to_csv call -- it is discarded right after loading.
df = (
    pd.read_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/12f_6d_game_team_player_ranking_nhl_positions.csv')
    .drop('Unnamed: 0', axis='columns')
)
df.shape

(32940, 17)

In [47]:
# Use the shorter column name 'Position' from here on.
df = df.rename(columns={'PlayerPosition': 'Position'})

# playercount: number of rows each player has within a season / game / team.
player_keys = ['Season', 'GameNumber', 'TeamCode', 'PlayerNumber']
df['playercount'] = df.groupby(player_keys)['PlayerNumber'].transform('count')

# rosterposition: playercount summed over all rows sharing the same
# position and rank within a season / game / team.
slot_keys = ['Season', 'GameNumber', 'TeamCode', 'Position', 'Rank']
df['rosterposition'] = df.groupby(slot_keys)['playercount'].transform('sum')

df.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,PlayerName,Position,Rank,GF,GA,GD,WinTeam,LossTeam,RosterCount,PositionCount,CCount,WCount,DCount,playercount,rosterposition
0,2010,20001,MTL,11.0,GOMEZ,C,2.0,2,3,1,TOR,MTL,18.0,7.0,7.0,5.0,6.0,1.0,3.0
1,2010,20001,MTL,21.0,GIONTA,W,1.0,2,3,1,TOR,MTL,18.0,5.0,7.0,5.0,6.0,1.0,2.0
2,2010,20001,MTL,57.0,POULIOT,W,2.0,2,3,1,TOR,MTL,18.0,5.0,7.0,5.0,6.0,1.0,3.0
3,2010,20001,MTL,26.0,GORGES,D,2.0,2,3,1,TOR,MTL,18.0,6.0,7.0,5.0,6.0,1.0,4.0
4,2010,20001,MTL,75.0,GILL,D,2.0,2,3,1,TOR,MTL,18.0,6.0,7.0,5.0,6.0,1.0,4.0


- pivot table

- the next step is to group players by gamenumber, teamcode, position and rank, to display the quality of players each team has per position. **Pivot table** by player position and rank using roster position values. Game number and team are the indexes. We want to join the levels to generate columns by roster position and rank. 

In [48]:
# Reshape to one row per team-game with one column per (Position, Rank) pair.
# pivot_table aggregates with the mean by default; rosterposition was built with
# a group-wise transform over the same keys, so it should be constant within
# each cell and the mean simply returns that value.
df = pd.pivot_table(
    df,
    index=['Season', 'GameNumber', 'WinTeam', 'LossTeam', 'GF', 'GA', 'GD',
           'TeamCode', 'RosterCount', 'CCount', 'WCount', 'DCount'],
    columns=['Position', 'Rank'],
    values=['rosterposition'],
)
df = df.reset_index()

# Flatten the column MultiIndex into 'rosterposition_<Position>_<Rank>' labels.
# Empty levels (the former index columns) are skipped by the `if s` filter.
df.columns = ['_'.join(str(s).strip() for s in col if s) for col in df.columns]

# A (Position, Rank) combination a team did not dress shows up as NaN -> 0.
# (The original cell also called df.reset_index() here without keeping the
# result; that statement had no effect and has been removed.)
df = df.fillna(0)

# Rank is stored as a float, hence the '.0' suffix in the flattened labels.
df = df.rename(columns={
    'rosterposition_C_1.0': 'C1',
    'rosterposition_C_2.0': 'C2',
    'rosterposition_C_3.0': 'C3',
    'rosterposition_C_4.0': 'C4',
    'rosterposition_W_1.0': 'W1',
    'rosterposition_W_2.0': 'W2',
    'rosterposition_W_3.0': 'W3',
    'rosterposition_W_4.0': 'W4',
    'rosterposition_D_1.0': 'D1',
    'rosterposition_D_2.0': 'D2',
    'rosterposition_D_3.0': 'D3',
})
df.head(10)

Unnamed: 0,Season,GameNumber,WinTeam,LossTeam,GF,GA,GD,TeamCode,RosterCount,CCount,WCount,DCount,C1,C2,C3,C4,D1,D2,D3,W1,W2,W3,W4
0,2010,20001,TOR,MTL,2,3,1,MTL,18.0,7.0,5.0,6.0,0.0,3.0,2.0,2.0,2.0,4.0,0.0,2.0,3.0,0.0,0.0
1,2010,20001,TOR,MTL,3,2,1,TOR,18.0,5.0,7.0,6.0,1.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0
2,2010,20002,PHI,PIT,2,3,-1,PIT,18.0,8.0,4.0,6.0,3.0,2.0,1.0,2.0,3.0,1.0,2.0,1.0,1.0,2.0,0.0
3,2010,20002,PHI,PIT,3,2,-1,PHI,18.0,5.0,7.0,6.0,3.0,0.0,2.0,0.0,3.0,2.0,1.0,3.0,2.0,0.0,2.0
4,2010,20003,CAR,MIN,3,4,-1,MIN,18.0,4.0,8.0,6.0,1.0,2.0,1.0,0.0,1.0,3.0,2.0,1.0,4.0,1.0,2.0
5,2010,20003,CAR,MIN,4,3,-1,CAR,18.0,6.0,6.0,6.0,2.0,1.0,1.0,2.0,4.0,0.0,2.0,2.0,1.0,3.0,0.0
6,2010,20004,COL,CHI,3,4,1,CHI,18.0,3.0,9.0,6.0,1.0,1.0,1.0,0.0,2.0,3.0,1.0,3.0,2.0,3.0,1.0
7,2010,20004,COL,CHI,4,3,1,COL,18.0,6.0,6.0,6.0,1.0,3.0,2.0,0.0,2.0,3.0,1.0,2.0,2.0,2.0,0.0
8,2010,20005,EDM,CGY,0,4,4,CGY,18.0,3.0,9.0,6.0,1.0,2.0,0.0,0.0,3.0,2.0,1.0,3.0,3.0,0.0,3.0
9,2010,20005,EDM,CGY,4,0,4,EDM,18.0,6.0,6.0,6.0,0.0,3.0,2.0,1.0,2.0,1.0,3.0,1.0,3.0,1.0,1.0


- Assign a value of 1 to the team that won the game and a value of 0 to the team that lost. Compute the mean rank by position per team for each game.

In [49]:
# Win indicator: 1 when this row's team is the game's winner, 0 otherwise.
# A vectorised comparison replaces the row-wise apply; it gives the same 0/1
# values and also works when the frame is empty.
df['Win'] = (df['WinTeam'] == df['TeamCode']).astype(int)

# Rank-weighted mean per position: each Ck / Wk / Dk column (the pivoted roster
# count for rank k) is weighted by k and divided by the position's head count.
df['MeanC'] = (df['C1'] * 1 + df['C2'] * 2 + df['C3'] * 3 + df['C4'] * 4) / df['CCount']
df['MeanW'] = (df['W1'] * 1 + df['W2'] * 2 + df['W3'] * 3 + df['W4'] * 4) / df['WCount']
df['MeanD'] = (df['D1'] * 1 + df['D2'] * 2 + df['D3'] * 3) / df['DCount']
df['Sum'] = df['MeanC'] + df['MeanW'] + df['MeanD']

# Reassign instead of sorting in place so re-running the cell stays predictable.
df = df.sort_values('GameNumber')
df.head()

Unnamed: 0,Season,GameNumber,WinTeam,LossTeam,GF,GA,GD,TeamCode,RosterCount,CCount,WCount,DCount,C1,C2,C3,C4,D1,D2,D3,W1,W2,W3,W4,Win,MeanC,MeanW,MeanD,Sum
0,2010,20001,TOR,MTL,2,3,1,MTL,18.0,7.0,5.0,6.0,0.0,3.0,2.0,2.0,2.0,4.0,0.0,2.0,3.0,0.0,0.0,0,2.857143,1.6,1.666667,6.12381
1,2010,20001,TOR,MTL,3,2,1,TOR,18.0,5.0,7.0,6.0,1.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,1,2.6,2.571429,2.0,7.171429
2,2010,20002,PHI,PIT,2,3,-1,PIT,18.0,8.0,4.0,6.0,3.0,2.0,1.0,2.0,3.0,1.0,2.0,1.0,1.0,2.0,0.0,0,2.25,2.25,1.833333,6.333333
3,2010,20002,PHI,PIT,3,2,-1,PHI,18.0,5.0,7.0,6.0,3.0,0.0,2.0,0.0,3.0,2.0,1.0,3.0,2.0,0.0,2.0,1,1.8,2.142857,1.666667,5.609524
4,2010,20003,CAR,MIN,3,4,-1,MIN,18.0,4.0,8.0,6.0,1.0,2.0,1.0,0.0,1.0,3.0,2.0,1.0,4.0,1.0,2.0,0,2.0,2.5,2.166667,6.666667


- summary analysis

In [50]:
# Summary statistics of the position means, split by game outcome.
# Columns are selected with a list: selecting several columns with a bare tuple
# (['MeanC', 'MeanW', 'MeanD'] without the inner brackets) was deprecated in
# pandas 1.0 and raises in pandas 2.0.
df.groupby(['Win'])[['MeanC', 'MeanW', 'MeanD']].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,MeanC,MeanW,MeanD
Win,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,count,915.0,915.0,915.0
0,mean,2.271302,2.216352,1.827687
0,std,0.34791,0.326599,0.271081
0,min,1.0,1.2,1.333333
0,25%,2.0,2.0,1.666667
0,50%,2.25,2.25,1.833333
0,75%,2.5,2.428571,2.0
0,max,3.142857,3.5,2.833333
1,count,915.0,915.0,915.0
1,mean,2.190364,2.18607,1.794536


### model estimation

- regress **win** on the mean of players by position for games with 12 forwards and 6 defensemen.

In [51]:
# Deviation of each position mean from its balanced-roster value
# (2.5 for centres and wingers, 2 for defensemen): meanc, meanw, meand.
for pos, baseline in (('C', 2.5), ('W', 2.5), ('D', 2)):
    df['mean' + pos.lower()] = baseline - df['Mean' + pos]

# Count differences relative to the lowest rank of each position
# (rank 4 for C and W, rank 3 for D): DC1..DC3, DW1..DW3, DD1..DD2.
for pos, lowest in (('C', 4), ('W', 4), ('D', 3)):
    for rank in range(1, lowest):
        df['D{}{}'.format(pos, rank)] = (
            df['{}{}'.format(pos, rank)] - df['{}{}'.format(pos, lowest)]
        )


In [53]:
# Dependent variable: 1 if the team won the game, 0 otherwise.
w = df['Win']

# Each design matrix is followed directly by the OLS fit that uses it.

# Raw position means, with an intercept.
Z1 = sm.add_constant(df[['MeanC', 'MeanW', 'MeanD']])
n1 = sm.OLS(w, Z1).fit()

# Deviations from the balanced-roster means, with an intercept.
Z2 = sm.add_constant(df[['meanc', 'meanw', 'meand']])
n2 = sm.OLS(w, Z2).fit()

# Deviations from the balanced-roster means, no intercept.
Z3 = df[['meanc', 'meanw', 'meand']]
n3 = sm.OLS(w, Z3).fit()

# Raw position means, no intercept.
Z4 = df[['MeanC', 'MeanW', 'MeanD']]
n4 = sm.OLS(w, Z4).fit()

# Rank-count differences, no intercept.
Z5 = df[['DC1', 'DC2', 'DC3', 'DW1', 'DW2', 'DW3', 'DD1', 'DD2']]
n5 = sm.OLS(w, Z5).fit()

# Rank-count differences, with an intercept.
Z6 = sm.add_constant(df[['DC1', 'DC2', 'DC3', 'DW1', 'DW2', 'DW3', 'DD1', 'DD2']])
n6 = sm.OLS(w, Z6).fit()

# Only n3 is displayed; swap in n1, n2, n4, n5 or n6 to inspect the others.
n3.summary()


0,1,2,3
Dep. Variable:,Win,R-squared:,0.342
Model:,OLS,Adj. R-squared:,0.341
Method:,Least Squares,F-statistic:,316.8
Date:,"Wed, 02 May 2018",Prob (F-statistic):,1.3199999999999998e-165
Time:,21:45:07,Log-Likelihood:,-1579.2
No. Observations:,1830,AIC:,3164.0
Df Residuals:,1827,BIC:,3181.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
meanc,0.4011,0.039,10.303,0.000,0.325 0.477
meanw,0.4972,0.037,13.426,0.000,0.425 0.570
meand,0.3014,0.054,5.550,0.000,0.195 0.408

0,1,2,3
Omnibus:,955.243,Durbin-Watson:,2.786
Prob(Omnibus):,0.0,Jarque-Bera (JB):,100.073
Skew:,-0.021,Prob(JB):,1.86e-22
Kurtosis:,1.855,Cond. No.,2.6


### calculate  and inspect Variance Inflation Factor (VIF)

- Not correlated: $VIF=1$
- Moderately correlated: **$1<VIF<5$** or at a more conservative level of **$1<VIF<2.5$**
- Highly correlated: **$VIF \geq 5$** or at a more conservative level **$VIF \geq 2.5$**

#### $Win = \beta_{0} + \beta_{1}MeanC + \beta_{2}MeanW + \beta_{3}MeanD + e_{s}$

In [54]:
# VIF of every column of Z1 (intercept included), listed next to its column name.
vif11 = pd.DataFrame(
    {
        'VIF Factor': [variance_inflation_factor(Z1.values, col) for col in range(len(Z1.columns))],
        'features': list(Z1.columns),
    },
    columns=['VIF Factor', 'features'],
)
vif11.round(3)

Unnamed: 0,VIF Factor,features
0,91.785,const
1,1.177,MeanC
2,1.047,MeanW
3,1.197,MeanD


the variance inflation factor for the constant of this model is **not acceptable** as $VIF>2.5$

In [55]:
# VIF of every column of Z2 (intercept included), listed next to its column name.
vif12 = pd.DataFrame(
    {
        'VIF Factor': [variance_inflation_factor(Z2.values, col) for col in range(len(Z2.columns))],
        'features': list(Z2.columns),
    },
    columns=['VIF Factor', 'features'],
)
vif12.round(3)

Unnamed: 0,VIF Factor,features
0,2.368,const
1,1.177,meanc
2,1.047,meanw
3,1.197,meand


the variance inflation factors for all variables of this model are **acceptable** as $VIF<2.5$

In [56]:
# VIF of every column of Z3 (no intercept column), listed next to its column name.
vif13 = pd.DataFrame(
    {
        'VIF Factor': [variance_inflation_factor(Z3.values, col) for col in range(len(Z3.columns))],
        'features': list(Z3.columns),
    },
    columns=['VIF Factor', 'features'],
)
vif13.round(3)

Unnamed: 0,VIF Factor,features
0,1.684,meanc
1,1.469,meanw
2,1.722,meand


the variance inflation factors for all variables of this model are **acceptable** as $VIF<2.5$

In [57]:
# VIF of every column of Z4 (no intercept column), listed next to its column name.
vif14 = pd.DataFrame(
    {
        'VIF Factor': [variance_inflation_factor(Z4.values, col) for col in range(len(Z4.columns))],
        'features': list(Z4.columns),
    },
    columns=['VIF Factor', 'features'],
)
vif14.round(3)

Unnamed: 0,VIF Factor,features
0,40.894,MeanC
1,34.708,MeanW
2,47.679,MeanD


the variance inflation factors for all variables of this model are **not acceptable** as $VIF>2.5$

In [58]:
# VIF of every column of Z5 (no intercept column), listed next to its column name.
vif15 = pd.DataFrame(
    {
        'VIF Factor': [variance_inflation_factor(Z5.values, col) for col in range(len(Z5.columns))],
        'features': list(Z5.columns),
    },
    columns=['VIF Factor', 'features'],
)
vif15.round(3)

Unnamed: 0,VIF Factor,features
0,1.574,DC1
1,2.256,DC2
2,1.635,DC3
3,1.915,DW1
4,2.161,DW2
5,1.916,DW3
6,2.13,DD1
7,1.564,DD2


the variance inflation factors for all variables of this model are **acceptable** as $VIF<2.5$

In [60]:
# VIF of every column of Z6 (intercept included), listed next to its column name.
vif16 = pd.DataFrame(
    {
        'VIF Factor': [variance_inflation_factor(Z6.values, col) for col in range(len(Z6.columns))],
        'features': list(Z6.columns),
    },
    columns=['VIF Factor', 'features'],
)
vif16.round(3)

Unnamed: 0,VIF Factor,features
0,3.557,const
1,1.304,DC1
2,1.244,DC2
3,1.338,DC3
4,1.347,DW1
5,1.248,DW2
6,1.375,DW3
7,1.437,DD1
8,1.171,DD2


the variance inflation factor for the constant of this model is **not acceptable** as $VIF>2.5$