In [2]:
import sys
import os
import pandas as pd
import numpy as np
import datetime, time
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
from statsmodels.formula.api import ols
from statsmodels.iolib.summary2 import summary_col
from statsmodels.stats.outliers_influence import variance_inflation_factor
from pylab import hist, show
import scipy
import zipfile


# Notebook-wide display limits: show at most 50 rows and 200 columns of a frame.
pd.options.display.max_rows = 50
pd.options.display.max_columns = 200

#### $Win = \beta_{0} + \beta_{1}MeanC + \beta_{2}MeanW + \beta_{3}MeanD + \beta_{4}MeanG + e_{s}$

- A balanced roster will have one player ranked in each centre position (C1, C2, C3, C4), two wingers ranked on every line (LW1, RW1, LW2,RW2, etc) and two defensemen ranked in all three defensive pairings. 
- The mean rank of a balanced roster is 2.5 [(1+2+3+4)/4] for centres, 2.5 [(1+1+2+2+3+3+4+4)/8] for wingers and 2 [(1+1+2+2+3+3)/6] for defensemen.

- Since players are ranked from 1 to 4 for forwards and 1 to 3 for defensemen, 1 being the highest ranked, a team is considered to have an above average roster when the **mean of each forward position is smaller than 2.5 and the mean of defensive pairings is smaller than 2**. 

### games with 4 centers, 8 wingers, 6 defensemen and 1 goaltender

In [5]:
# Player-level ranking rows; per the file name these are games in which a team
# dressed 4 centres, 8 wingers, 6 defensemen and 1 goaltender.
# Alternative location of the shared folder on the collaborator's machine
# (NOTE(review): this file name says 4d, not 6d_1g -- confirm before using):
#dc = pd.read_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/4c_8w_4d_game_team_player_ranking_nhl_positions.csv')
dc = pd.read_csv(
    '/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/4c_8w_6d_1g_game_team_player_ranking_nhl_positions.csv'
).drop('Unnamed: 0', axis=1)  # presumably the index saved by an earlier export
dc.shape

(1406, 18)

In [6]:
# Build the two count columns in one chain:
#   playercount    - rows per (season, game, team, player number)
#   rosterposition - sum of playercount per (season, game, team, position, rank)
dc = (
    dc.rename(columns={'PlayerPosition': 'Position'})
      .assign(playercount=lambda t: t.groupby(
          ['Season', 'GameNumber', 'TeamCode', 'PlayerNumber'])['PlayerNumber'].transform('count'))
      .assign(rosterposition=lambda t: t.groupby(
          ['Season', 'GameNumber', 'TeamCode', 'Position', 'Rank'])['playercount'].transform('sum'))
)
dc.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,PlayerName,Position,Rank,GF,GA,GD,WinTeam,LossTeam,RosterCount,PositionCount,CCount,WCount,DCount,GCount,playercount,rosterposition
0,2010,20023,LA,11.0,KOPITAR,C,1.0,1,1,0,LA,VAN,19.0,4.0,4.0,8.0,6.0,1.0,1.0,1.0
1,2010,20023,LA,23.0,BROWN,W,2.0,1,1,0,LA,VAN,19.0,8.0,4.0,8.0,6.0,1.0,1.0,4.0
2,2010,20023,LA,94.0,SMYTH,W,2.0,1,1,0,LA,VAN,19.0,8.0,4.0,8.0,6.0,1.0,1.0,4.0
3,2010,20023,LA,8.0,DOUGHTY,D,1.0,1,1,0,LA,VAN,19.0,6.0,4.0,8.0,6.0,1.0,1.0,2.0
4,2010,20023,LA,33.0,MITCHELL,D,2.0,1,1,0,LA,VAN,19.0,6.0,4.0,8.0,6.0,1.0,1.0,3.0


#### pivot table

- the next step is to group players by gamenumber, teamcode, position and rank, to display the quality of players each team has per position. **Pivot table** by player position and rank using roster position values. Game number and team are the indexes. We want to join the levels to generate columns by roster position and rank. 

In [7]:
# Reshape to one row per (game, team): the roster-position counts are spread
# into one column per (Position, Rank) pair. pivot_table aggregates with its
# default (mean) when several rows share a cell.
dc = pd.pivot_table(
    dc,
    index=['Season', 'GameNumber', 'WinTeam', 'LossTeam', 'GF', 'GA', 'GD',
           'TeamCode', 'RosterCount', 'CCount', 'WCount', 'DCount', 'GCount'],
    columns=['Position', 'Rank'],
    values=['rosterposition'],
).reset_index()

# Flatten the column MultiIndex: join the non-empty levels with '_'
# (e.g. ('rosterposition', 'C', 1.0) -> 'rosterposition_C_1.0').
dc.columns = ['_'.join(str(level).strip() for level in key if level) for key in dc.columns]

# A (position, rank) slot that a team did not fill is NaN after the pivot; count it as 0.
# (The original also called dc.reset_index() here and discarded the result; that
# no-op has been removed.)
dc = dc.fillna(0).rename(columns={
    'rosterposition_C_1.0': 'C1', 'rosterposition_C_2.0': 'C2',
    'rosterposition_C_3.0': 'C3', 'rosterposition_C_4.0': 'C4',
    'rosterposition_W_1.0': 'W1', 'rosterposition_W_2.0': 'W2',
    'rosterposition_W_3.0': 'W3', 'rosterposition_W_4.0': 'W4',
    'rosterposition_D_1.0': 'D1', 'rosterposition_D_2.0': 'D2',
    'rosterposition_D_3.0': 'D3',
    'rosterposition_G_1.0': 'G1', 'rosterposition_G_2.0': 'G2',
    'rosterposition_G_3.0': 'G3',
})
dc.head(10)

Unnamed: 0,Season,GameNumber,WinTeam,LossTeam,GF,GA,GD,TeamCode,RosterCount,CCount,WCount,DCount,GCount,C1,C2,C3,C4,D1,D2,D3,G1,G2,G3,W1,W2,W3,W4
0,2010,20023,LA,VAN,1,1,0,LA,19.0,4.0,8.0,6.0,1.0,1.0,1.0,1.0,1.0,2.0,3.0,1.0,0.0,1.0,0.0,1.0,4.0,2.0,1.0
1,2010,20023,LA,VAN,1,1,0,VAN,19.0,4.0,8.0,6.0,1.0,2.0,0.0,1.0,1.0,3.0,2.0,1.0,1.0,0.0,0.0,3.0,2.0,1.0,2.0
2,2010,20025,CGY,LA,1,3,2,LA,19.0,4.0,8.0,6.0,1.0,1.0,1.0,1.0,1.0,2.0,3.0,1.0,0.0,1.0,0.0,1.0,4.0,2.0,1.0
3,2010,20025,CGY,LA,3,1,2,CGY,19.0,4.0,8.0,6.0,1.0,2.0,1.0,0.0,1.0,1.0,3.0,2.0,0.0,1.0,0.0,2.0,4.0,0.0,2.0
4,2010,20037,WSH,NYI,1,2,1,NYI,19.0,4.0,8.0,6.0,1.0,1.0,2.0,0.0,1.0,0.0,4.0,2.0,0.0,1.0,0.0,1.0,3.0,2.0,2.0
5,2010,20037,WSH,NYI,2,1,1,WSH,19.0,4.0,8.0,6.0,1.0,1.0,0.0,2.0,1.0,2.0,2.0,2.0,0.0,1.0,0.0,3.0,4.0,0.0,1.0
6,2010,20041,ANA,VAN,3,4,1,VAN,19.0,4.0,8.0,6.0,1.0,2.0,0.0,1.0,1.0,3.0,2.0,1.0,1.0,0.0,0.0,3.0,2.0,1.0,2.0
7,2010,20041,ANA,VAN,4,3,1,ANA,19.0,4.0,8.0,6.0,1.0,1.0,1.0,2.0,0.0,2.0,1.0,3.0,1.0,0.0,0.0,3.0,1.0,3.0,1.0
8,2010,20053,ATL,ANA,4,5,-1,ANA,19.0,4.0,8.0,6.0,1.0,1.0,1.0,2.0,0.0,2.0,2.0,2.0,1.0,0.0,0.0,3.0,1.0,3.0,1.0
9,2010,20053,ATL,ANA,5,4,-1,ATL,19.0,4.0,8.0,6.0,1.0,0.0,4.0,0.0,0.0,2.0,3.0,1.0,0.0,0.0,1.0,0.0,4.0,3.0,1.0


- Assign a value of 1 to the team that won the game and a value of 0 to the team that lost. Compute the mean by position per team for each game.

In [8]:
# Win indicator: 1 when this row's team is the game's winner, else 0.
# Vectorised comparison replaces the row-wise apply/lambda.
dc['Win'] = (dc['WinTeam'] == dc['TeamCode']).astype('int64')

# Mean rank per position: rank-weighted slot counts divided by the number of
# players at that position (rank 1 is the best, so lower is better).
dc['MeanC'] = (dc['C1'] * 1 + dc['C2'] * 2 + dc['C3'] * 3 + dc['C4'] * 4) / dc['CCount']
dc['MeanW'] = (dc['W1'] * 1 + dc['W2'] * 2 + dc['W3'] * 3 + dc['W4'] * 4) / dc['WCount']
dc['MeanD'] = (dc['D1'] * 1 + dc['D2'] * 2 + dc['D3'] * 3) / dc['DCount']
dc['MeanG'] = (dc['G1'] * 1 + dc['G2'] * 2 + dc['G3'] * 3) / dc['GCount']
dc['Sum'] = dc['MeanC'] + dc['MeanW'] + dc['MeanD'] + dc['MeanG']

# Reassign instead of sorting in place so the cell reads as a pure transformation.
dc = dc.sort_values(['GameNumber'], ascending=[True])
dc.head()

Unnamed: 0,Season,GameNumber,WinTeam,LossTeam,GF,GA,GD,TeamCode,RosterCount,CCount,WCount,DCount,GCount,C1,C2,C3,C4,D1,D2,D3,G1,G2,G3,W1,W2,W3,W4,Win,MeanC,MeanW,MeanD,MeanG,Sum
0,2010,20023,LA,VAN,1,1,0,LA,19.0,4.0,8.0,6.0,1.0,1.0,1.0,1.0,1.0,2.0,3.0,1.0,0.0,1.0,0.0,1.0,4.0,2.0,1.0,1,2.5,2.375,1.833333,2.0,8.708333
1,2010,20023,LA,VAN,1,1,0,VAN,19.0,4.0,8.0,6.0,1.0,2.0,0.0,1.0,1.0,3.0,2.0,1.0,1.0,0.0,0.0,3.0,2.0,1.0,2.0,0,2.25,2.25,1.666667,1.0,7.166667
2,2010,20025,CGY,LA,1,3,2,LA,19.0,4.0,8.0,6.0,1.0,1.0,1.0,1.0,1.0,2.0,3.0,1.0,0.0,1.0,0.0,1.0,4.0,2.0,1.0,0,2.5,2.375,1.833333,2.0,8.708333
3,2010,20025,CGY,LA,3,1,2,CGY,19.0,4.0,8.0,6.0,1.0,2.0,1.0,0.0,1.0,1.0,3.0,2.0,0.0,1.0,0.0,2.0,4.0,0.0,2.0,1,2.0,2.25,2.166667,2.0,8.416667
4,2010,20037,WSH,NYI,1,2,1,NYI,19.0,4.0,8.0,6.0,1.0,1.0,2.0,0.0,1.0,0.0,4.0,2.0,0.0,1.0,0.0,1.0,3.0,2.0,2.0,0,2.25,2.625,2.333333,2.0,9.208333


- summary analysis

In [9]:
# Summary statistics of the positional mean ranks, split by game outcome.
# The columns are selected with a list: the tuple form (['A', 'B'] without the
# extra brackets) is deprecated and raises in pandas 2.x.
dc.groupby(['Win'])[['MeanC', 'MeanW', 'MeanD', 'MeanG']].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,MeanC,MeanW,MeanD,MeanG
Win,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,count,37.0,37.0,37.0,37.0
0,mean,2.101351,2.358108,1.873874,1.864865
0,std,0.383964,0.328298,0.223625,0.630791
0,min,1.0,1.75,1.5,1.0
0,25%,2.0,2.125,1.666667,1.0
0,50%,2.25,2.375,1.833333,2.0
0,75%,2.25,2.5,2.0,2.0
0,max,2.75,3.125,2.333333,3.0
1,count,37.0,37.0,37.0,37.0
1,mean,2.135135,2.277027,1.923423,1.783784


### model estimation

- regress **team win percent** on the mean of players by position for games with 4 centers, 8 wingers and 6 defensemen.

In [12]:
# Centre each positional mean on the balanced-roster value used in this
# notebook (2.5 for C and W, 2 for D and G). rsub(k) computes k - column,
# so a positive value means a better-than-balanced roster.
dc['meanc'] = dc['MeanC'].rsub(2.5)
dc['meanw'] = dc['MeanW'].rsub(2.5)
dc['meand'] = dc['MeanD'].rsub(2)
dc['meang'] = dc['MeanG'].rsub(2)

# Slot counts expressed relative to the lowest-ranked slot of each position.
# Centres: baseline C4
dc['DC1'] = dc['C1'].sub(dc['C4'])
dc['DC2'] = dc['C2'].sub(dc['C4'])
dc['DC3'] = dc['C3'].sub(dc['C4'])

# Wingers: baseline W4
dc['DW1'] = dc['W1'].sub(dc['W4'])
dc['DW2'] = dc['W2'].sub(dc['W4'])
dc['DW3'] = dc['W3'].sub(dc['W4'])

# Defensemen: baseline D3
dc['DD1'] = dc['D1'].sub(dc['D3'])
dc['DD2'] = dc['D2'].sub(dc['D3'])

# Goaltenders: baseline G3
dc['DG1'] = dc['G1'].sub(dc['G3'])
dc['DG2'] = dc['G2'].sub(dc['G3'])

In [22]:
# Dependent variable: the 0/1 win indicator.
y = dc['Win']

# Six design matrices:
#   X1 / X4 - raw positional means, with / without an intercept
#   X2 / X3 - centred positional means, with / without an intercept
#   X6 / X5 - slot-count differences, with / without an intercept
X1 = sm.add_constant(dc[['MeanC', 'MeanW', 'MeanD', 'MeanG']])
X4 = dc[['MeanC', 'MeanW', 'MeanD', 'MeanG']]

X2 = sm.add_constant(dc[['meanc', 'meanw', 'meand', 'meang']])
X3 = dc[['meanc', 'meanw', 'meand', 'meang']]

X5 = dc[['DC1', 'DC2', 'DC3', 'DW1', 'DW2', 'DW3', 'DD1', 'DD2', 'DG1', 'DG2']]
X6 = sm.add_constant(dc[['DC1', 'DC2', 'DC3', 'DW1', 'DW2', 'DW3', 'DD1', 'DD2', 'DG1', 'DG2']])

# Fit one OLS model per design matrix, in order.
m1, m2, m3, m4, m5, m6 = (
    sm.OLS(y, design).fit() for design in (X1, X2, X3, X4, X5, X6)
)

# Displayed specification; replace m3 with m1, m2, m4, m5 or m6 to inspect another.
m3.summary()


0,1,2,3
Dep. Variable:,Win,R-squared:,0.34
Model:,OLS,Adj. R-squared:,0.303
Method:,Least Squares,F-statistic:,9.027
Date:,"Wed, 02 May 2018",Prob (F-statistic):,6.14e-06
Time:,13:11:05,Log-Likelihood:,-63.964
No. Observations:,74,AIC:,135.9
Df Residuals:,70,BIC:,145.1
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
meanc,0.5606,0.148,3.792,0.000,0.266 0.855
meanw,0.6710,0.256,2.621,0.011,0.160 1.182
meand,-0.4962,0.321,-1.543,0.127,-1.137 0.145
meang,0.1116,0.108,1.037,0.303,-0.103 0.326

0,1,2,3
Omnibus:,5.391,Durbin-Watson:,2.501
Prob(Omnibus):,0.068,Jarque-Bera (JB):,2.808
Skew:,-0.227,Prob(JB):,0.246
Kurtosis:,2.16,Cond. No.,3.7


In [8]:
# OLS of the win indicator on the centred positional means (no intercept),
# printed and exported as a standalone LaTeX document.
# NOTE(review): this specification omits meang, unlike model m3 -- confirm
# that is intended. The output stored with this cell reports a different
# number of observations than m3's, so it may stem from an earlier run.
print('win contribution of each roster position')
y = dc['Win']
X = dc[['meanc', 'meanw', 'meand']]
result = sm.OLS(y, X).fit()
print(result.summary())

beginningtex = """\\documentclass{report}
\\usepackage{booktabs}
\\begin{document}"""
# Escaped backslash: "\e" is not a valid escape sequence and triggers a warning
# on current Python versions. The resulting string is unchanged.
endtex = "\\end{document}"

# The context manager closes the file even if one of the writes raises.
with open('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/analysis/game_level_win_ols_mean_position.tex', 'w') as f:
    f.write(beginningtex)
    f.write(result.summary().as_latex())
    f.write(endtex)

win contribution of each roster position
                            OLS Regression Results                            
Dep. Variable:                    Win   R-squared:                       0.211
Model:                            OLS   Adj. R-squared:                  0.183
Method:                 Least Squares   F-statistic:                     7.740
Date:                Wed, 02 May 2018   Prob (F-statistic):           0.000122
Time:                        12:04:02   Log-Likelihood:                -85.867
No. Observations:                  90   AIC:                             177.7
Df Residuals:                      87   BIC:                             185.2
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
meanc      

### calculate  and inspect Variance Inflation Factor (VIF)

- Not correlated: $VIF=1$
- Moderately correlated: **$1<VIF<5$** or at a more conservative level of **$1<VIF <2.5$**
- Highly correlated: **$VIF>=5$** or at a more conservative level **$VIF>=2.5$**

#### $Win = \beta_{0} + \beta_{1}MeanC + \beta_{2}MeanW + \beta_{3}MeanD + \beta_{4}MeanG + e_{s}$

In [23]:
# For each X1, calculate VIF and save in dataframe
# Variance inflation factor of every column of design matrix X1 (intercept included).
vif1 = pd.DataFrame({
    'VIF Factor': [variance_inflation_factor(X1.values, col_idx)
                   for col_idx in range(len(X1.columns))],
    'features': X1.columns,
})
vif1.round(3)

Unnamed: 0,VIF Factor,features
0,98.338,const
1,1.065,MeanC
2,1.493,MeanW
3,1.498,MeanD
4,1.066,MeanG


the variance inflation factor for the constant of this model is **not acceptable** as $VIF > 2.5$

In [24]:
# For each X2, calculate VIF and save in dataframe
# Variance inflation factor of every column of design matrix X2 (intercept included).
vif2 = pd.DataFrame({
    'VIF Factor': [variance_inflation_factor(X2.values, col_idx)
                   for col_idx in range(len(X2.columns))],
    'features': X2.columns,
})
vif2.round(3)

Unnamed: 0,VIF Factor,features
0,2.483,const
1,1.065,meanc
2,1.493,meanw
3,1.498,meand
4,1.066,meang


the variance inflation factors for this model are **acceptable** as $VIF<2.5$

In [25]:
# For each X3, calculate VIF and save in dataframe
# Variance inflation factor of every column of design matrix X3 (no intercept).
vif3 = pd.DataFrame({
    'VIF Factor': [variance_inflation_factor(X3.values, col_idx)
                   for col_idx in range(len(X3.columns))],
    'features': X3.columns,
})
vif3.round(3)

Unnamed: 0,VIF Factor,features
0,1.266,meanc
1,1.874,meanw
2,1.721,meand
3,1.095,meang


In [13]:
# Export the VIF table of design matrix X3 as a standalone LaTeX document.
beginningtex = """\\documentclass{report}
\\usepackage{booktabs}
\\begin{document}"""
# Escaped backslash: "\e" is not a valid escape sequence and triggers a warning
# on current Python versions. The resulting string is unchanged.
endtex = "\\end{document}"

# The context manager closes the file even if one of the writes raises.
with open('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/analysis/vif_2010-2011.tex', 'w') as f:
    f.write(beginningtex)
    f.write(vif3.to_latex())
    f.write(endtex)

the variance inflation factors for this model are **acceptable** as $VIF<2.5$

In [26]:
# For each X4, calculate VIF and save in dataframe
# Variance inflation factor of every column of design matrix X4 (no intercept).
vif4 = pd.DataFrame({
    'VIF Factor': [variance_inflation_factor(X4.values, col_idx)
                   for col_idx in range(len(X4.columns))],
    'features': X4.columns,
})
vif4.round(3)

Unnamed: 0,VIF Factor,features
0,29.265,MeanC
1,71.427,MeanW
2,71.774,MeanD
3,8.827,MeanG


the variance inflation factors for this model are **not acceptable** as $VIF > 2.5$

In [27]:
# For each X5, calculate VIF and save in dataframe
# Variance inflation factor of every column of design matrix X5 (no intercept).
vif5 = pd.DataFrame({
    'VIF Factor': [variance_inflation_factor(X5.values, col_idx)
                   for col_idx in range(len(X5.columns))],
    'features': X5.columns,
})
vif5.round(3)

Unnamed: 0,VIF Factor,features
0,1.741,DC1
1,2.899,DC2
2,2.093,DC3
3,2.053,DW1
4,3.12,DW2
5,3.055,DW3
6,2.992,DD1
7,2.246,DD2
8,1.255,DG1
9,2.097,DG2


the variance inflation factors for DC2, DW2, DW3, DD1 in this model are **not acceptable** as $VIF > 2.5$ 

In [28]:
# For each X6, calculate VIF and save in dataframe
# Variance inflation factor of every column of design matrix X6 (intercept included).
vif6 = pd.DataFrame({
    'VIF Factor': [variance_inflation_factor(X6.values, col_idx)
                   for col_idx in range(len(X6.columns))],
    'features': X6.columns,
})
vif6.round(3)

Unnamed: 0,VIF Factor,features
0,5.514,const
1,1.675,DC1
2,2.153,DC2
3,1.772,DC3
4,1.871,DW1
5,1.916,DW2
6,2.311,DW3
7,2.707,DD1
8,2.157,DD2
9,1.325,DG1


the variance inflation factors for the constant and DD1 in this model are **not acceptable** as $VIF > 2.5$

## games with 12 forwards and 6 defensemen

In [43]:
# Player-level ranking rows; per the file name these are games in which a team
# dressed 12 forwards and 6 defensemen.
# Alternative location of the shared folder on the collaborator's machine:
#df = pd.read_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/12f_6d_g_game_team_player_ranking_nhl_positions.csv')
df = pd.read_csv(
    '/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/12f_6d_g_game_team_player_ranking_nhl_positions.csv'
).drop('Unnamed: 0', axis=1)  # presumably the index saved by an earlier export
df.shape

(34917, 19)

In [44]:
# Build the two count columns in one chain:
#   playercount    - rows per (season, game, team, player number)
#   rosterposition - sum of playercount per (season, game, team, position, rank)
df = (
    df.rename(columns={'PlayerPosition': 'Position'})
      .assign(playercount=lambda t: t.groupby(
          ['Season', 'GameNumber', 'TeamCode', 'PlayerNumber'])['PlayerNumber'].transform('count'))
      .assign(rosterposition=lambda t: t.groupby(
          ['Season', 'GameNumber', 'TeamCode', 'Position', 'Rank'])['playercount'].transform('sum'))
)
df.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,PlayerName,Position,Rank,GF,GA,GD,WinTeam,LossTeam,RosterCount,PositionCount,CCount,WCount,DCount,GCount,FCount,playercount,rosterposition
0,2010,20001,MTL,11.0,GOMEZ,C,2.0,2,3,1,TOR,MTL,19.0,7.0,7.0,5.0,6.0,1.0,12.0,1.0,2.0
1,2010,20001,MTL,21.0,GIONTA,W,2.0,2,3,1,TOR,MTL,19.0,5.0,7.0,5.0,6.0,1.0,12.0,1.0,3.0
2,2010,20001,MTL,57.0,POULIOT,W,2.0,2,3,1,TOR,MTL,19.0,5.0,7.0,5.0,6.0,1.0,12.0,1.0,3.0
3,2010,20001,MTL,26.0,GORGES,D,2.0,2,3,1,TOR,MTL,19.0,6.0,7.0,5.0,6.0,1.0,12.0,1.0,4.0
4,2010,20001,MTL,75.0,GILL,D,2.0,2,3,1,TOR,MTL,19.0,6.0,7.0,5.0,6.0,1.0,12.0,1.0,4.0


#### pivot table

- the next step is to group players by gamenumber, teamcode, position and rank, to display the quality of players each team has per position. **Pivot table** by player position and rank using roster position values. Game number and team are the indexes. We want to join the levels to generate columns by roster position and rank. 

In [45]:
# Reshape to one row per (game, team): the roster-position counts are spread
# into one column per (Position, Rank) pair. pivot_table aggregates with its
# default (mean) when several rows share a cell.
df = pd.pivot_table(
    df,
    index=['Season', 'GameNumber', 'WinTeam', 'LossTeam', 'GF', 'GA', 'GD',
           'TeamCode', 'RosterCount', 'CCount', 'WCount', 'DCount', 'GCount'],
    columns=['Position', 'Rank'],
    values=['rosterposition'],
).reset_index()

# Flatten the column MultiIndex: join the non-empty levels with '_'
# (e.g. ('rosterposition', 'C', 1.0) -> 'rosterposition_C_1.0').
df.columns = ['_'.join(str(level).strip() for level in key if level) for key in df.columns]

# A (position, rank) slot that a team did not fill is NaN after the pivot; count it as 0.
# (The original also called df.reset_index() here and discarded the result; that
# no-op has been removed.)
df = df.fillna(0).rename(columns={
    'rosterposition_C_1.0': 'C1', 'rosterposition_C_2.0': 'C2',
    'rosterposition_C_3.0': 'C3', 'rosterposition_C_4.0': 'C4',
    'rosterposition_W_1.0': 'W1', 'rosterposition_W_2.0': 'W2',
    'rosterposition_W_3.0': 'W3', 'rosterposition_W_4.0': 'W4',
    'rosterposition_D_1.0': 'D1', 'rosterposition_D_2.0': 'D2',
    'rosterposition_D_3.0': 'D3',
    'rosterposition_G_1.0': 'G1', 'rosterposition_G_2.0': 'G2',
    'rosterposition_G_3.0': 'G3',
})
df.head(10)

Unnamed: 0,Season,GameNumber,WinTeam,LossTeam,GF,GA,GD,TeamCode,RosterCount,CCount,WCount,DCount,GCount,C1,C2,C3,C4,D1,D2,D3,G1,G2,G3,W1,W2,W3,W4
0,2010,20001,TOR,MTL,2,3,1,MTL,19.0,7.0,5.0,6.0,1.0,1.0,2.0,2.0,2.0,2.0,4.0,0.0,1.0,0.0,0.0,1.0,3.0,1.0,0.0
1,2010,20001,TOR,MTL,3,2,1,TOR,19.0,5.0,7.0,6.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0,2.0,0.0,0.0,1.0,2.0,2.0,2.0,1.0
2,2010,20002,PHI,PIT,2,3,-1,PIT,19.0,8.0,4.0,6.0,1.0,4.0,1.0,1.0,2.0,2.0,2.0,2.0,0.0,1.0,0.0,1.0,2.0,1.0,0.0
3,2010,20002,PHI,PIT,3,2,-1,PHI,19.0,5.0,7.0,6.0,1.0,3.0,0.0,2.0,0.0,3.0,1.0,2.0,0.0,1.0,0.0,3.0,2.0,0.0,2.0
4,2010,20003,CAR,MIN,3,4,-1,MIN,19.0,4.0,8.0,6.0,1.0,1.0,2.0,1.0,0.0,1.0,1.0,4.0,0.0,1.0,0.0,1.0,2.0,3.0,2.0
5,2010,20003,CAR,MIN,4,3,-1,CAR,19.0,6.0,6.0,6.0,1.0,2.0,0.0,2.0,2.0,4.0,0.0,2.0,1.0,0.0,0.0,2.0,1.0,3.0,0.0
6,2010,20004,COL,CHI,3,4,1,CHI,19.0,3.0,9.0,6.0,1.0,2.0,0.0,1.0,0.0,2.0,2.0,2.0,0.0,0.0,1.0,3.0,5.0,1.0,0.0
7,2010,20004,COL,CHI,4,3,1,COL,19.0,6.0,6.0,6.0,1.0,1.0,4.0,1.0,0.0,2.0,1.0,3.0,0.0,1.0,0.0,2.0,2.0,1.0,1.0
8,2010,20005,EDM,CGY,0,4,4,CGY,19.0,3.0,9.0,6.0,1.0,2.0,1.0,0.0,0.0,1.0,3.0,2.0,0.0,1.0,0.0,2.0,4.0,0.0,3.0
9,2010,20005,EDM,CGY,4,0,4,EDM,19.0,6.0,6.0,6.0,1.0,0.0,4.0,1.0,1.0,2.0,1.0,3.0,0.0,0.0,1.0,1.0,3.0,1.0,1.0


- Assign a value of 1 to the team that won the game and a value of 0 to the team that lost. Compute the mean by position per team for each game.

In [46]:
# Win indicator: 1 when this row's team is the game's winner, else 0.
# Vectorised comparison replaces the row-wise apply/lambda.
df['Win'] = (df['WinTeam'] == df['TeamCode']).astype('int64')

# Mean rank per position: rank-weighted slot counts divided by the number of
# players at that position (rank 1 is the best, so lower is better).
df['MeanC'] = (df['C1'] * 1 + df['C2'] * 2 + df['C3'] * 3 + df['C4'] * 4) / df['CCount']
df['MeanW'] = (df['W1'] * 1 + df['W2'] * 2 + df['W3'] * 3 + df['W4'] * 4) / df['WCount']
df['MeanD'] = (df['D1'] * 1 + df['D2'] * 2 + df['D3'] * 3) / df['DCount']
df['MeanG'] = (df['G1'] * 1 + df['G2'] * 2 + df['G3'] * 3) / df['GCount']
df['Sum'] = df['MeanC'] + df['MeanW'] + df['MeanD'] + df['MeanG']

# Reassign instead of sorting in place so the cell reads as a pure transformation.
df = df.sort_values(['GameNumber'], ascending=[True])
df.head()

Unnamed: 0,Season,GameNumber,WinTeam,LossTeam,GF,GA,GD,TeamCode,RosterCount,CCount,WCount,DCount,GCount,C1,C2,C3,C4,D1,D2,D3,G1,G2,G3,W1,W2,W3,W4,Win,MeanC,MeanW,MeanD,MeanG,Sum
0,2010,20001,TOR,MTL,2,3,1,MTL,19.0,7.0,5.0,6.0,1.0,1.0,2.0,2.0,2.0,2.0,4.0,0.0,1.0,0.0,0.0,1.0,3.0,1.0,0.0,0,2.714286,2.0,1.666667,1.0,7.380952
1,2010,20001,TOR,MTL,3,2,1,TOR,19.0,5.0,7.0,6.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0,2.0,0.0,0.0,1.0,2.0,2.0,2.0,1.0,1,2.6,2.285714,2.0,3.0,9.885714
2,2010,20002,PHI,PIT,2,3,-1,PIT,19.0,8.0,4.0,6.0,1.0,4.0,1.0,1.0,2.0,2.0,2.0,2.0,0.0,1.0,0.0,1.0,2.0,1.0,0.0,0,2.125,2.0,2.0,2.0,8.125
3,2010,20002,PHI,PIT,3,2,-1,PHI,19.0,5.0,7.0,6.0,1.0,3.0,0.0,2.0,0.0,3.0,1.0,2.0,0.0,1.0,0.0,3.0,2.0,0.0,2.0,1,1.8,2.142857,1.833333,2.0,7.77619
4,2010,20003,CAR,MIN,3,4,-1,MIN,19.0,4.0,8.0,6.0,1.0,1.0,2.0,1.0,0.0,1.0,1.0,4.0,0.0,1.0,0.0,1.0,2.0,3.0,2.0,0,2.0,2.75,2.5,2.0,9.25


- summary analysis

In [48]:
# Summary statistics of the positional mean ranks, split by game outcome.
# The columns are selected with a list: the tuple form (['A', 'B'] without the
# extra brackets) is deprecated and raises in pandas 2.x.
df.groupby(['Win'])[['MeanC', 'MeanW', 'MeanD', 'MeanG']].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,MeanC,MeanW,MeanD,MeanG
Win,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,count,915.0,915.0,915.0,915.0
0,mean,2.164813,2.22839,1.959563,1.902732
0,std,0.381268,0.353588,0.290153,0.686971
0,min,1.0,1.2,1.333333,1.0
0,25%,2.0,2.0,1.666667,1.0
0,50%,2.2,2.285714,1.833333,2.0
0,75%,2.4,2.5,2.166667,2.0
0,max,3.166667,3.4,2.833333,3.0
1,count,915.0,915.0,915.0,915.0
1,mean,2.095431,2.147842,1.907286,1.733333


### model estimation

- regress **team win percent** on the mean of players by position for games with 12 forwards and 6 defensemen.

In [49]:
# Centre each positional mean on the balanced-roster value used in this
# notebook (2.5 for C and W, 2 for D and G). rsub(k) computes k - column,
# so a positive value means a better-than-balanced roster.
df['meanc'] = df['MeanC'].rsub(2.5)
df['meanw'] = df['MeanW'].rsub(2.5)
df['meand'] = df['MeanD'].rsub(2)
df['meang'] = df['MeanG'].rsub(2)

# Slot counts expressed relative to the lowest-ranked slot of each position.
# Centres: baseline C4
df['DC1'] = df['C1'].sub(df['C4'])
df['DC2'] = df['C2'].sub(df['C4'])
df['DC3'] = df['C3'].sub(df['C4'])

# Wingers: baseline W4
df['DW1'] = df['W1'].sub(df['W4'])
df['DW2'] = df['W2'].sub(df['W4'])
df['DW3'] = df['W3'].sub(df['W4'])

# Defensemen: baseline D3
df['DD1'] = df['D1'].sub(df['D3'])
df['DD2'] = df['D2'].sub(df['D3'])

# Goaltenders: baseline G3
df['DG1'] = df['G1'].sub(df['G3'])
df['DG2'] = df['G2'].sub(df['G3'])

In [63]:
# Dependent variable: the 0/1 win indicator.
w = df['Win']

# Six design matrices:
#   Z1 / Z4 - raw positional means, with / without an intercept
#   Z2 / Z3 - centred positional means, with / without an intercept
#   Z6 / Z5 - slot-count differences, with / without an intercept
Z1 = sm.add_constant(df[['MeanC', 'MeanW', 'MeanD', 'MeanG']])
Z4 = df[['MeanC', 'MeanW', 'MeanD', 'MeanG']]

Z2 = sm.add_constant(df[['meanc', 'meanw', 'meand', 'meang']])
Z3 = df[['meanc', 'meanw', 'meand', 'meang']]

Z5 = df[['DC1', 'DC2', 'DC3', 'DW1', 'DW2', 'DW3', 'DD1', 'DD2', 'DG1', 'DG2']]
Z6 = sm.add_constant(df[['DC1', 'DC2', 'DC3', 'DW1', 'DW2', 'DW3', 'DD1', 'DD2', 'DG1', 'DG2']])

# Fit one OLS model per design matrix, in order.
n1, n2, n3, n4, n5, n6 = (
    sm.OLS(w, design).fit() for design in (Z1, Z2, Z3, Z4, Z5, Z6)
)

# Displayed specification; replace n3 with n1, n2, n4, n5 or n6 to inspect another.
n3.summary()


0,1,2,3
Dep. Variable:,Win,R-squared:,0.382
Model:,OLS,Adj. R-squared:,0.381
Method:,Least Squares,F-statistic:,282.3
Date:,"Wed, 02 May 2018",Prob (F-statistic):,4.71e-189
Time:,17:13:40,Log-Likelihood:,-1522.0
No. Observations:,1830,AIC:,3052.0
Df Residuals:,1826,BIC:,3074.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
meanc,0.4879,0.031,15.819,0.000,0.427 0.548
meanw,0.5019,0.036,13.789,0.000,0.431 0.573
meand,-0.2267,0.052,-4.368,0.000,-0.329 -0.125
meang,0.1086,0.020,5.488,0.000,0.070 0.147

0,1,2,3
Omnibus:,708.275,Durbin-Watson:,2.845
Prob(Omnibus):,0.0,Jarque-Bera (JB):,92.868
Skew:,-0.05,Prob(JB):,6.820000000000001e-21
Kurtosis:,1.901,Cond. No.,3.14


### calculate  and inspect Variance Inflation Factor (VIF)

- Not correlated: $VIF=1$
- Moderately correlated: **$1<VIF<5$** or at a more conservative level of **$1<VIF <2.5$**
- Highly correlated: **$VIF>=5$** or at a more conservative level **$VIF>=2.5$**

In [54]:
# For each Z1, calculate VIF and save in dataframe
# Variance inflation factor of every column of design matrix Z1 (intercept included).
vif11 = pd.DataFrame({
    'VIF Factor': [variance_inflation_factor(Z1.values, col_idx)
                   for col_idx in range(len(Z1.columns))],
    'features': Z1.columns,
})
vif11.round(3)

Unnamed: 0,VIF Factor,features
0,74.942,const
1,1.197,MeanC
2,1.211,MeanW
3,1.337,MeanD
4,1.053,MeanG


the variance inflation factor for the constant in this model is **not acceptable** as $VIF > 2.5$.

In [56]:
# For each Z2, calculate VIF and save in dataframe
# Variance inflation factor of every column of design matrix Z2 (intercept included).
vif12 = pd.DataFrame({
    'VIF Factor': [variance_inflation_factor(Z2.values, col_idx)
                   for col_idx in range(len(Z2.columns))],
    'features': Z2.columns,
})
vif12.round(3)

Unnamed: 0,VIF Factor,features
0,2.652,const
1,1.197,meanc
2,1.211,meanw
3,1.337,meand
4,1.053,meang


the variance inflation factor for the constant in this model is **not acceptable** as $VIF > 2.5$

In [57]:
# For each Z3, calculate VIF and save in dataframe
# Variance inflation factor of every column of design matrix Z3 (no intercept).
vif13 = pd.DataFrame({
    'VIF Factor': [variance_inflation_factor(Z3.values, col_idx)
                   for col_idx in range(len(Z3.columns))],
    'features': Z3.columns,
})
vif13.round(3)

Unnamed: 0,VIF Factor,features
0,1.599,meanc
1,1.688,meanw
2,1.315,meand
3,1.124,meang


the variance inflation factors for this model are **acceptable** as $VIF < 2.5$

In [58]:
# For each Z4, calculate VIF and save in dataframe
# Variance inflation factor of every column of design matrix Z4 (no intercept).
vif14 = pd.DataFrame({
    'VIF Factor': [variance_inflation_factor(Z4.values, col_idx)
                   for col_idx in range(len(Z4.columns))],
    'features': Z4.columns,
})
vif14.round(3)

Unnamed: 0,VIF Factor,features
0,33.817,MeanC
1,41.657,MeanW
2,54.643,MeanD
3,8.543,MeanG


the variance inflation factors for all variables in this model are **not acceptable** as $VIF > 2.5$

In [59]:
# For each Z5, calculate VIF and save in dataframe
# Variance inflation factor of every column of design matrix Z5 (no intercept).
vif15 = pd.DataFrame({
    'VIF Factor': [variance_inflation_factor(Z5.values, col_idx)
                   for col_idx in range(len(Z5.columns))],
    'features': Z5.columns,
})
vif15.round(3)

Unnamed: 0,VIF Factor,features
0,1.867,DC1
1,1.66,DC2
2,1.544,DC3
3,2.079,DW1
4,2.24,DW2
5,1.613,DW3
6,1.648,DD1
7,1.615,DD2
8,1.259,DG1
9,1.319,DG2


the variance inflation factors for all variables in this model are **acceptable** as $VIF < 2.5$

In [61]:
# For each Z6, calculate VIF and save in dataframe
# Variance inflation factor of every column of design matrix Z6 (intercept included).
vif16 = pd.DataFrame({
    'VIF Factor': [variance_inflation_factor(Z6.values, col_idx)
                   for col_idx in range(len(Z6.columns))],
    'features': Z6.columns,
})
vif16.round(3)

Unnamed: 0,VIF Factor,features
0,4.282,const
1,1.368,DC1
2,1.178,DC2
3,1.247,DC3
4,1.389,DW1
5,1.259,DW2
6,1.123,DW3
7,1.615,DD1
8,1.476,DD2
9,1.185,DG1


the variance inflation factor for the constant in this model is **not acceptable** as $VIF > 2.5$