In [1]:
import sys
import os
import pandas as pd
import numpy as np
import datetime, time
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
from statsmodels.formula.api import ols
from statsmodels.iolib.summary2 import summary_col
from statsmodels.stats.outliers_influence import variance_inflation_factor
from pylab import hist, show
import scipy
import zipfile


# Notebook-wide display limits: show at most 50 rows and 200 columns of any frame.
for _option, _limit in (('display.max_rows', 50), ('display.max_columns', 200)):
    pd.set_option(_option, _limit)

#### $Win = \beta_{0} + \beta_{1}MeanC + \beta_{2}MeanW + \beta_{3}MeanD + \beta_{4}MeanG + e_{s}$

- A balanced roster will have one player ranked in each centre position (C1, C2, C3, C4), two wingers ranked on every line (LW1, RW1, LW2,RW2, etc) and two defensemen ranked in all three defensive pairings. 
- The ranking of a balanced roster is 2.5 [(1+2+3+4)/4] for centres, 2.5 [(1+1+2+2+3+3+4+4)/8] for wingers and 2 [(1+1+2+2+3+3)/6] for defensemen.

- Since players are ranked from 1 to 4 for forwards and 1 to 3 for defensemen, 1 being the highest ranked, a team is considered to have an above average roster when the **mean of each forward position is smaller than 2.5 and the mean of defensive pairings is smaller than 2**. 

### games with 4 centers, 8 wingers, 6 defensemen and 1 goaltender

In [2]:
# Load the player-level game/team file for games dressed with 4 C, 8 W, 6 D and 1 G.
# NOTE(review): absolute, user-specific path; the alternative below is for the other collaborator's machine.
#   pd.read_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/2011_2017_4c_8w_6d_1g_game_team.csv')
dc = pd.read_csv(
    '/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/2011_2017_4c_8w_6d_1g_game_team.csv'
)
# Discard the stray index column that was saved into the CSV.
dc = dc.drop(['Unnamed: 0'], axis='columns')
dc.shape

(10982, 17)

In [3]:
# playercount: number of rows a player has within one (season, game, team).
# rosterposition: those counts summed over every (position, rank) slot of the same team-game.
player_keys = ['Season', 'GameNumber', 'TeamCode', 'PlayerName']
slot_keys = ['Season', 'GameNumber', 'TeamCode', 'Position', 'Rank']

per_player = dc.groupby(player_keys)['PlayerName']
dc['playercount'] = per_player.transform('count')

per_slot = dc.groupby(slot_keys)['playercount']
dc['rosterposition'] = per_slot.transform('sum')

dc.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerName,Position,Rank,GF,GA,GD,WinTeam,LossTeam,RosterCount,PositionCount,CCount,WCount,DCount,GCount,playercount,rosterposition
0,2011,20014,FLA,STEPHEN WEISS,C,2.0,2,0,-2,FLA,NYI,19,4,4.0,8.0,6.0,1.0,1,3
1,2011,20014,FLA,SCOTTIE UPSHALL,W,3.0,2,0,-2,FLA,NYI,19,8,4.0,8.0,6.0,1.0,1,3
2,2011,20014,FLA,MARCEL GOC,C,2.0,2,0,-2,FLA,NYI,19,4,4.0,8.0,6.0,1.0,1,3
3,2011,20014,FLA,RYAN CARTER,C,4.0,2,0,-2,FLA,NYI,19,4,4.0,8.0,6.0,1.0,1,1
4,2011,20014,FLA,SHAWN MATTHIAS,C,2.0,2,0,-2,FLA,NYI,19,4,4.0,8.0,6.0,1.0,1,3


#### pivot table

- the next step is to group players by gamenumber, teamcode, position and rank, to display the quality of players each team has per position. **Pivot table** by player position and rank using roster position values. Game number and team are the indexes. We want to join the levels to generate columns by roster position and rank. 

In [4]:
# Reshape to one row per team-game with one column per (position, rank) slot.
# pivot_table aggregates with its default (mean); rosterposition was built per
# (Season, GameNumber, TeamCode, Position, Rank), so it is constant within each cell.
game_keys = ['Season', 'GameNumber', 'WinTeam', 'LossTeam', 'GF', 'GA', 'GD',
             'TeamCode', 'RosterCount', 'CCount', 'WCount', 'DCount', 'GCount']
dc = pd.pivot_table(dc, index=game_keys, columns=['Position', 'Rank'], values=['rosterposition'])
dc = dc.reset_index()

# Flatten the multi-level column labels, e.g. ('rosterposition', 'C', 1.0) -> 'rosterposition_C_1.0';
# empty levels (the former index columns) are skipped by the `if s` filter.
dc.columns = ['_'.join(str(s).strip() for s in col if s) for col in dc.columns]

# A slot that no player fills comes out of the pivot as NaN; treat it as a count of zero.
dc = dc.fillna(0)

# Short slot names: 'rosterposition_C_1.0' -> 'C1', ... (C and W have 4 ranks, D and G have 3).
slot_names = {
    'rosterposition_{}_{}.0'.format(position, rank): '{}{}'.format(position, rank)
    for position, n_ranks in (('C', 4), ('W', 4), ('D', 3), ('G', 3))
    for rank in range(1, n_ranks + 1)
}
dc = dc.rename(columns=slot_names)
dc.head(10)

Unnamed: 0,Season,GameNumber,WinTeam,LossTeam,GF,GA,GD,TeamCode,RosterCount,CCount,WCount,DCount,GCount,C1,C2,C3,C4,D1,D2,D3,G1,G2,G3,W1,W2,W3,W4
0,2011,20014,FLA,NYI,0,2,-2,NYI,19,4.0,8.0,6.0,1.0,1.0,1.0,2.0,0.0,1.0,2.0,3.0,0.0,0.0,1.0,2.0,3.0,2.0,1.0
1,2011,20014,FLA,NYI,2,0,-2,FLA,19,4.0,8.0,6.0,1.0,0.0,3.0,0.0,1.0,3.0,2.0,1.0,0.0,1.0,0.0,2.0,3.0,3.0,0.0
2,2011,20024,EDM,PIT,2,3,1,PIT,19,4.0,8.0,6.0,1.0,2.0,1.0,0.0,1.0,2.0,3.0,1.0,0.0,0.0,1.0,4.0,2.0,1.0,1.0
3,2011,20024,EDM,PIT,3,2,1,EDM,19,4.0,8.0,6.0,1.0,1.0,0.0,3.0,0.0,0.0,2.0,4.0,0.0,1.0,0.0,2.0,4.0,0.0,2.0
4,2011,20032,PIT,FLA,2,4,2,FLA,19,4.0,8.0,6.0,1.0,0.0,3.0,0.0,1.0,3.0,2.0,1.0,0.0,1.0,0.0,2.0,3.0,3.0,0.0
5,2011,20032,PIT,FLA,4,2,2,PIT,19,4.0,8.0,6.0,1.0,2.0,1.0,0.0,1.0,2.0,3.0,1.0,0.0,1.0,0.0,4.0,3.0,1.0,0.0
6,2011,20046,DAL,STL,2,3,1,STL,19,4.0,8.0,6.0,1.0,1.0,2.0,0.0,1.0,4.0,1.0,1.0,1.0,0.0,0.0,3.0,4.0,0.0,1.0
7,2011,20046,DAL,STL,3,2,1,DAL,19,4.0,8.0,6.0,1.0,1.0,0.0,2.0,1.0,2.0,2.0,2.0,0.0,1.0,0.0,3.0,3.0,1.0,1.0
8,2011,20061,VAN,EDM,3,4,-1,EDM,19,4.0,8.0,6.0,1.0,1.0,0.0,3.0,0.0,0.0,2.0,4.0,0.0,1.0,0.0,2.0,3.0,1.0,2.0
9,2011,20061,VAN,EDM,4,3,-1,VAN,19,4.0,8.0,6.0,1.0,2.0,0.0,1.0,1.0,4.0,0.0,2.0,0.0,1.0,0.0,5.0,0.0,1.0,2.0


- Assign a value of 1 to the team that won the game and a value of 0 to the team that lost. Compute the mean rank by position per team for each game.

In [5]:
# Outcome flag: 1 when the row's team is the recorded winner, 0 otherwise.
# Vectorised comparison instead of a Python-level row-wise apply.
dc['Win'] = (dc['WinTeam'] == dc['TeamCode']).astype(int)

# Count-weighted mean rank per position group (lower = better-ranked players dressed).
dc['MeanC'] = ((dc['C1']*1) + (dc['C2']*2) + (dc['C3']*3) + (dc['C4']*4)) / dc['CCount']
dc['MeanW'] = ((dc['W1']*1) + (dc['W2']*2) + (dc['W3']*3) + (dc['W4']*4)) / dc['WCount']
dc['MeanD'] = ((dc['D1']*1) + (dc['D2']*2) + (dc['D3']*3)) / dc['DCount']
dc['MeanG'] = ((dc['G1']*1) + (dc['G2']*2) + (dc['G3']*3)) / dc['GCount']
dc['Sum'] = dc['MeanC'] + dc['MeanW'] + dc['MeanD'] + dc['MeanG']

# Rebind instead of sorting in place so re-running the cell cannot leave dc half-mutated.
dc = dc.sort_values(['GameNumber'], ascending=[True])
dc.head()

Unnamed: 0,Season,GameNumber,WinTeam,LossTeam,GF,GA,GD,TeamCode,RosterCount,CCount,WCount,DCount,GCount,C1,C2,C3,C4,D1,D2,D3,G1,G2,G3,W1,W2,W3,W4,Win,MeanC,MeanW,MeanD,MeanG,Sum
502,2017,20003,EDM,CGY,0,3,3,CGY,19,4.0,8.0,6.0,1.0,1.0,0.0,3.0,0.0,2.0,2.0,2.0,1.0,0.0,0.0,2.0,3.0,2.0,1.0,0,2.5,2.25,2.0,1.0,7.75
503,2017,20003,EDM,CGY,3,0,3,EDM,19,4.0,8.0,6.0,1.0,3.0,1.0,0.0,0.0,0.0,4.0,2.0,0.0,1.0,0.0,0.0,5.0,2.0,1.0,1,1.25,2.5,2.333333,2.0,8.083333
424,2016,20007,PIT,WSH,4,5,1,WSH,19,4.0,8.0,6.0,1.0,2.0,2.0,0.0,0.0,4.0,2.0,0.0,1.0,0.0,0.0,5.0,1.0,1.0,1.0,0,1.5,1.75,1.333333,1.0,5.583333
425,2016,20007,PIT,WSH,5,4,1,PIT,19,4.0,8.0,6.0,1.0,2.0,2.0,0.0,0.0,2.0,4.0,0.0,0.0,1.0,0.0,3.0,4.0,0.0,1.0,1,1.5,1.875,1.666667,2.0,7.041667
188,2013,20008,PIT,N.J,0,3,3,N.J,19,4.0,8.0,6.0,1.0,0.0,3.0,1.0,0.0,1.0,5.0,0.0,0.0,1.0,0.0,2.0,4.0,2.0,0.0,0,2.25,2.0,1.833333,2.0,8.083333


In [6]:
# (rows, columns) of the team-game table at this point in the notebook.
dc.shape

(578, 33)

In [42]:
# Persist the team-game table.
# index must be the boolean False: the string 'False' is truthy, so pandas still wrote the
# row index, which is what shows up as an 'Unnamed: 0' column when the file is read back.
# NOTE(review): any downstream reader that drops 'Unnamed: 0' from this file must be updated.
dc.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/2011_2017_games_with_4c_8w_6d_1g.csv', index=False, sep=',')
#dc.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/22011_2017_games_with_4c_8w_6d_1g.csv', index=False, sep=',')

- summary analysis

In [7]:
# Descriptive statistics of the positional mean ranks, split by game outcome (Win = 0 / 1).
# Several columns must be selected from a GroupBy with a list; the bare-tuple form
# (['MeanC', 'MeanW', ...] without the inner brackets) is deprecated and fails on current pandas.
dc.groupby(['Win'])[['MeanC', 'MeanW', 'MeanD', 'MeanG']].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,MeanC,MeanW,MeanD,MeanG
Win,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,count,289.0,289.0,289.0,289.0
0,mean,2.16436,2.143166,1.950404,1.927336
0,std,0.465635,0.334138,0.328731,0.675653
0,min,1.0,1.0,1.333333,1.0
0,25%,1.75,2.0,1.666667,1.0
0,50%,2.25,2.125,2.0,2.0
0,75%,2.5,2.375,2.166667,2.0
0,max,3.5,3.125,2.666667,3.0
1,count,289.0,289.0,289.0,289.0
1,mean,2.049308,2.057526,1.831603,1.785467


### model estimation

- regress **team win percent** on the mean of players by position for games with 4 centers, 8 wingers and 6 defensemen.

In [8]:
# Centre every positional mean on its balanced-roster value (2.5 for C and W, 2 for D and G),
# so a positive value means a better-than-balanced group.
for _pos, _balanced in (('C', 2.5), ('W', 2.5), ('D', 2), ('G', 2)):
    dc['mean' + _pos.lower()] = _balanced - dc['Mean' + _pos]

# Rank counts expressed relative to the lowest rank of each position:
# DC1..DC3 and DW1..DW3 are measured against rank 4, DD1..DD2 and DG1..DG2 against rank 3.
for _pos, _lowest in (('C', 4), ('W', 4), ('D', 3), ('G', 3)):
    _baseline = _pos + str(_lowest)
    for _rank in range(1, _lowest):
        _slot = _pos + str(_rank)
        dc['D' + _slot] = dc[_slot] - dc[_baseline]

In [43]:
# Dependent variable: the 0/1 win flag.
y = dc['Win']

raw_means = ['MeanC', 'MeanW', 'MeanD', 'MeanG']
centred_means = ['meanc', 'meanw', 'meand', 'meang']

# Design matrices: X1/X2 carry an intercept, X3/X4 do not.
X1 = sm.add_constant(dc[raw_means])
X2 = sm.add_constant(dc[centred_means])
X3 = dc[centred_means]
X4 = dc[raw_means]

# One OLS fit per design matrix, in the same order.
m1, m2, m3, m4 = [sm.OLS(y, design).fit() for design in (X1, X2, X3, X4)]

# Only the no-intercept, centred-means model is displayed; m1, m2 and m4 remain available
# via their own .summary().
# NOTE(review): for a model without a constant, statsmodels reports an uncentred R-squared,
# so it is not comparable with the R-squared of m1/m2. Confirm before reporting.
m3.summary()

0,1,2,3
Dep. Variable:,Win,R-squared:,0.399
Model:,OLS,Adj. R-squared:,0.395
Method:,Least Squares,F-statistic:,95.26
Date:,"Thu, 10 May 2018",Prob (F-statistic):,4e-62
Time:,18:22:58,Log-Likelihood:,-472.68
No. Observations:,578,AIC:,953.4
Df Residuals:,574,BIC:,970.8
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
meanc,0.2773,0.049,5.642,0.000,0.181 0.374
meanw,0.5691,0.056,10.089,0.000,0.458 0.680
meand,0.0902,0.078,1.161,0.246,-0.062 0.243
meang,0.0327,0.035,0.935,0.350,-0.036 0.101

0,1,2,3
Omnibus:,143.674,Durbin-Watson:,2.501
Prob(Omnibus):,0.0,Jarque-Bera (JB):,26.124
Skew:,0.036,Prob(JB):,2.12e-06
Kurtosis:,1.961,Cond. No.,2.9


In [44]:
# Rank-difference regressors (each rank count relative to the position's lowest rank).
rank_diffs = ['DC1', 'DC2', 'DC3', 'DW1', 'DW2', 'DW3', 'DD1', 'DD2', 'DG1', 'DG2']

X5 = dc[rank_diffs]                    # no intercept
X6 = sm.add_constant(dc[rank_diffs])   # with intercept

m5, m6 = [sm.OLS(y, design).fit() for design in (X5, X6)]

# Display the no-intercept fit; m6.summary() shows the intercept variant.
m5.summary()

0,1,2,3
Dep. Variable:,Win,R-squared:,0.441
Model:,OLS,Adj. R-squared:,0.431
Method:,Least Squares,F-statistic:,44.84
Date:,"Thu, 10 May 2018",Prob (F-statistic):,1.83e-65
Time:,18:23:03,Log-Likelihood:,-451.66
No. Observations:,578,AIC:,923.3
Df Residuals:,568,BIC:,966.9
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
DC1,0.0566,0.020,2.786,0.006,0.017 0.097
DC2,0.0710,0.017,4.134,0.000,0.037 0.105
DC3,-0.0032,0.021,-0.152,0.879,-0.044 0.038
DW1,0.0649,0.012,5.192,0.000,0.040 0.089
DW2,0.0650,0.012,5.518,0.000,0.042 0.088
DW3,-0.0116,0.016,-0.726,0.468,-0.043 0.020
DD1,0.0242,0.015,1.641,0.101,-0.005 0.053
DD2,0.0078,0.013,0.593,0.553,-0.018 0.034
DG1,0.0293,0.035,0.830,0.407,-0.040 0.099

0,1,2,3
Omnibus:,745.023,Durbin-Watson:,2.726
Prob(Omnibus):,0.0,Jarque-Bera (JB):,39.499
Skew:,0.043,Prob(JB):,2.65e-09
Kurtosis:,1.722,Cond. No.,6.86


### calculate  and inspect Variance Inflation Factor (VIF)

- Not correlated: $VIF=1$
- Moderately correlated: **$1<VIF<5$** or at a more conservative level of **$1<VIF<2.5$**
- Highly correlated: **$VIF \geq 5$** or at a more conservative level **$VIF \geq 2.5$**

#### $Win = \beta_{0} + \beta_{1}MeanC + \beta_{2}MeanW + \beta_{3}MeanD + \beta_{4}MeanG + e_{s}$

In [10]:
# VIF of every column of X1 (intercept included), listed next to the column name.
vif1 = pd.DataFrame({
    'VIF Factor': [variance_inflation_factor(X1.values, col) for col in range(X1.shape[1])],
    'features': X1.columns.tolist(),
})
vif1.round(3)

Unnamed: 0,VIF Factor,features
0,60.261,const
1,1.165,MeanC
2,1.142,MeanW
3,1.248,MeanD
4,1.078,MeanG


The variance inflation factor for the constant of this model is **not acceptable** as $VIF > 2.5$.

In [11]:
# VIF of every column of X2 (intercept included), listed next to the column name.
vif2 = pd.DataFrame({
    'VIF Factor': [variance_inflation_factor(X2.values, col) for col in range(X2.shape[1])],
    'features': X2.columns.tolist(),
})
vif2.round(3)

Unnamed: 0,VIF Factor,features
0,2.771,const
1,1.165,meanc
2,1.142,meanw
3,1.248,meand
4,1.078,meang


The variance inflation factors for the regressors in this model are **acceptable** as $VIF<2.5$; the constant (2.771) is slightly above that threshold.

In [12]:
# VIF of every column of X3 (centred means, no intercept), listed next to the column name.
vif3 = pd.DataFrame({
    'VIF Factor': [variance_inflation_factor(X3.values, col) for col in range(X3.shape[1])],
    'features': X3.columns.tolist(),
})
vif3.round(3)

Unnamed: 0,VIF Factor,features
0,1.711,meanc
1,1.68,meanw
2,1.364,meand
3,1.125,meang


In [13]:
# Write the X3 VIF table as a minimal stand-alone LaTeX document.
beginningtex = """\\documentclass{report}
\\usepackage{booktabs}
\\begin{document}"""
# The backslash is escaped explicitly: "\e" is not a valid escape sequence and newer Python
# versions warn about it. The resulting string is unchanged.
endtex = "\\end{document}"

# The context manager closes the file even if one of the writes raises.
with open('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/analysis/vif_2010-2011.tex', 'w') as f:
    f.write(beginningtex)
    f.write(vif3.to_latex())
    f.write(endtex)

the variance inflation factors for this model are **acceptable** as $VIF<2.5$

In [14]:
# VIF of every column of X4 (raw means, no intercept), listed next to the column name.
vif4 = pd.DataFrame({
    'VIF Factor': [variance_inflation_factor(X4.values, col) for col in range(X4.shape[1])],
    'features': X4.columns.tolist(),
})
vif4.round(3)

Unnamed: 0,VIF Factor,features
0,23.38,MeanC
1,31.807,MeanW
2,36.354,MeanD
3,9.018,MeanG


the variance inflation factors for this model are **not acceptable** as $VIF > 2.5$

In [15]:
# VIF of every column of X5 (rank differences, no intercept), listed next to the column name.
vif5 = pd.DataFrame({
    'VIF Factor': [variance_inflation_factor(X5.values, col) for col in range(X5.shape[1])],
    'features': X5.columns.tolist(),
})
vif5.round(3)

Unnamed: 0,VIF Factor,features
0,1.71,DC1
1,1.856,DC2
2,1.364,DC3
3,1.982,DW1
4,2.019,DW2
5,1.412,DW3
6,1.887,DD1
7,2.026,DD2
8,1.226,DG1
9,1.299,DG2


The variance inflation factors for all variables in this model are **acceptable** as $VIF < 2.5$ (the largest values, for DD2 and DW2, are about 2.03 and 2.02).

In [16]:
# VIF of every column of X6 (rank differences plus intercept), listed next to the column name.
vif6 = pd.DataFrame({
    'VIF Factor': [variance_inflation_factor(X6.values, col) for col in range(X6.shape[1])],
    'features': X6.columns.tolist(),
})
vif6.round(3)

Unnamed: 0,VIF Factor,features
0,5.055,const
1,1.253,DC1
2,1.081,DC2
3,1.229,DC3
4,1.282,DW1
5,1.092,DW2
6,1.137,DW3
7,1.71,DD1
8,1.584,DD2
9,1.175,DG1


the variance inflation factors for the constant in this model is **not acceptable** as $VIF > 2.5$

## games with 12 forwards and 6 defensemen

In [45]:
# Load the player-level game/team file for games dressed with 12 forwards and 6 defensemen.
# NOTE(review): absolute, user-specific path; the alternative below is for the other collaborator's machine.
#   pd.read_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/2011_2017_12f_6d_g_game_team.csv')
df = pd.read_csv(
    '/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/2011_2017_12f_6d_g_game_team.csv'
)
# Discard the stray index column that was saved into the CSV.
df = df.drop(['Unnamed: 0'], axis='columns')
df.shape

(277752, 18)

In [46]:
# playercount: number of rows a player has within one (season, game, team).
# rosterposition: those counts summed over every (position, rank) slot of the same team-game.
by_player = ['Season', 'GameNumber', 'TeamCode', 'PlayerName']
by_slot = ['Season', 'GameNumber', 'TeamCode', 'Position', 'Rank']

df['playercount'] = df.groupby(by_player)['PlayerName'].transform('count')
df['rosterposition'] = df.groupby(by_slot)['playercount'].transform('sum')

df.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerName,Position,Rank,GF,GA,GD,WinTeam,LossTeam,RosterCount,PositionCount,CCount,WCount,DCount,GCount,FCount,playercount,rosterposition
0,2011,20001,PHI,DANNY BRIERE,W,1.0,2,1,-1,PHI,BOS,19,9,3.0,9.0,6.0,1.0,12.0,1,8
1,2011,20001,PHI,CLAUDE GIROUX,W,1.0,2,1,-1,PHI,BOS,19,9,3.0,9.0,6.0,1.0,12.0,1,8
2,2011,20001,PHI,SEAN COUTURIER,C,2.0,2,1,-1,PHI,BOS,19,3,3.0,9.0,6.0,1.0,12.0,1,2
3,2011,20001,PHI,MAX TALBOT,C,2.0,2,1,-1,PHI,BOS,19,3,3.0,9.0,6.0,1.0,12.0,1,2
4,2011,20001,PHI,ZAC RINALDO,C,4.0,2,1,-1,PHI,BOS,19,3,3.0,9.0,6.0,1.0,12.0,1,1


#### pivot table

- the next step is to group players by gamenumber, teamcode, position and rank, to display the quality of players each team has per position. **Pivot table** by player position and rank using roster position values. Game number and team are the indexes. We want to join the levels to generate columns by roster position and rank. 

In [47]:
# Reshape to one row per team-game with one column per (position, rank) slot.
# pivot_table aggregates with its default (mean); rosterposition was built per
# (Season, GameNumber, TeamCode, Position, Rank), so it is constant within each cell.
game_keys = ['Season', 'GameNumber', 'WinTeam', 'LossTeam', 'GF', 'GA', 'GD',
             'TeamCode', 'RosterCount', 'CCount', 'WCount', 'DCount', 'GCount']
df = pd.pivot_table(df, index=game_keys, columns=['Position', 'Rank'], values=['rosterposition'])
df = df.reset_index()

# Flatten the multi-level column labels, e.g. ('rosterposition', 'C', 1.0) -> 'rosterposition_C_1.0';
# empty levels (the former index columns) are skipped by the `if s` filter.
df.columns = ['_'.join(str(s).strip() for s in col if s) for col in df.columns]

# A slot that no player fills comes out of the pivot as NaN; treat it as a count of zero.
df = df.fillna(0)

# Short slot names: 'rosterposition_C_1.0' -> 'C1', ... (C and W have 4 ranks, D and G have 3).
slot_names = {
    'rosterposition_{}_{}.0'.format(position, rank): '{}{}'.format(position, rank)
    for position, n_ranks in (('C', 4), ('W', 4), ('D', 3), ('G', 3))
    for rank in range(1, n_ranks + 1)
}
df = df.rename(columns=slot_names)
df.head(10)

Unnamed: 0,Season,GameNumber,WinTeam,LossTeam,GF,GA,GD,TeamCode,RosterCount,CCount,WCount,DCount,GCount,C1,C2,C3,C4,D1,D2,D3,G1,G2,G3,W1,W2,W3,W4
0,2011,20001,PHI,BOS,1,2,-1,BOS,19,6.0,6.0,6.0,1.0,4.0,1.0,1.0,0.0,3.0,2.0,1.0,0.0,1.0,0.0,3.0,1.0,1.0,1.0
1,2011,20001,PHI,BOS,2,1,-1,PHI,19,3.0,9.0,6.0,1.0,0.0,2.0,0.0,1.0,4.0,1.0,1.0,0.0,1.0,0.0,8.0,0.0,1.0,0.0
2,2011,20003,PIT,VAN,3,5,-2,VAN,19,4.0,8.0,6.0,1.0,2.0,0.0,1.0,1.0,4.0,1.0,1.0,0.0,1.0,0.0,5.0,0.0,1.0,2.0
3,2011,20003,PIT,VAN,5,3,-2,PIT,19,5.0,7.0,6.0,1.0,3.0,1.0,0.0,1.0,2.0,3.0,1.0,0.0,1.0,0.0,4.0,2.0,1.0,0.0
4,2011,20004,T.B,CAR,1,5,-4,CAR,19,7.0,5.0,6.0,1.0,2.0,3.0,1.0,1.0,2.0,3.0,1.0,0.0,1.0,0.0,0.0,4.0,1.0,0.0
5,2011,20004,T.B,CAR,5,1,-4,T.B,19,5.0,7.0,6.0,1.0,2.0,0.0,3.0,0.0,2.0,1.0,3.0,0.0,0.0,1.0,2.0,3.0,2.0,0.0
6,2011,20005,DET,OTT,3,5,2,OTT,19,4.0,8.0,6.0,1.0,1.0,1.0,1.0,1.0,3.0,2.0,1.0,0.0,1.0,0.0,3.0,3.0,0.0,2.0
7,2011,20005,DET,OTT,5,3,2,DET,19,5.0,7.0,6.0,1.0,3.0,2.0,0.0,0.0,4.0,2.0,0.0,0.0,1.0,0.0,3.0,4.0,0.0,0.0
8,2011,20006,NSH,CBJ,2,3,-1,CBJ,19,6.0,6.0,6.0,1.0,1.0,3.0,1.0,1.0,1.0,3.0,2.0,0.0,0.0,1.0,1.0,3.0,0.0,2.0
9,2011,20006,NSH,CBJ,3,2,-1,NSH,19,5.0,7.0,6.0,1.0,3.0,0.0,2.0,0.0,2.0,3.0,1.0,0.0,1.0,0.0,4.0,1.0,2.0,0.0


In [48]:
# (rows, columns) of the pivoted team-game table.
df.shape

(14558, 27)

- Assign a value of 1 to the team that won the game and a value of 0 to the team that lost. Compute the mean rank by position per team for each game.

In [49]:
# Outcome flag: 1 when the row's team is the recorded winner, 0 otherwise.
# Vectorised comparison instead of a Python-level row-wise apply.
df['Win'] = (df['WinTeam'] == df['TeamCode']).astype(int)

# Count-weighted mean rank per position group (lower = better-ranked players dressed).
df['MeanC'] = ((df['C1']*1) + (df['C2']*2) + (df['C3']*3) + (df['C4']*4)) / df['CCount']
df['MeanW'] = ((df['W1']*1) + (df['W2']*2) + (df['W3']*3) + (df['W4']*4)) / df['WCount']
df['MeanD'] = ((df['D1']*1) + (df['D2']*2) + (df['D3']*3)) / df['DCount']
df['MeanG'] = ((df['G1']*1) + (df['G2']*2) + (df['G3']*3)) / df['GCount']
df['Sum'] = df['MeanC'] + df['MeanW'] + df['MeanD'] + df['MeanG']

# Rebind instead of sorting in place so re-running the cell cannot leave df half-mutated.
df = df.sort_values(['GameNumber'], ascending=[True])
df.head()

Unnamed: 0,Season,GameNumber,WinTeam,LossTeam,GF,GA,GD,TeamCode,RosterCount,CCount,WCount,DCount,GCount,C1,C2,C3,C4,D1,D2,D3,G1,G2,G3,W1,W2,W3,W4,Win,MeanC,MeanW,MeanD,MeanG,Sum
0,2011,20001,PHI,BOS,1,2,-1,BOS,19,6.0,6.0,6.0,1.0,4.0,1.0,1.0,0.0,3.0,2.0,1.0,0.0,1.0,0.0,3.0,1.0,1.0,1.0,0,1.5,2.0,1.666667,2.0,7.166667
9977,2016,20001,OTT,TOR,5,4,1,OTT,19,6.0,6.0,6.0,1.0,1.0,3.0,1.0,1.0,2.0,3.0,1.0,1.0,0.0,0.0,2.0,2.0,1.0,1.0,1,2.333333,2.166667,1.833333,1.0,7.333333
9976,2016,20001,OTT,TOR,4,5,1,TOR,19,8.0,4.0,6.0,1.0,5.0,2.0,1.0,0.0,1.0,5.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,2.0,0,1.5,2.75,1.833333,1.0,7.083333
7675,2015,20001,MTL,TOR,3,1,-2,MTL,19,6.0,6.0,6.0,1.0,2.0,3.0,1.0,0.0,2.0,3.0,1.0,1.0,0.0,0.0,2.0,4.0,0.0,0.0,1,1.833333,1.666667,1.833333,1.0,6.333333
7674,2015,20001,MTL,TOR,1,3,-2,TOR,19,6.0,6.0,6.0,1.0,0.0,4.0,2.0,0.0,1.0,4.0,1.0,0.0,1.0,0.0,2.0,4.0,0.0,0.0,0,2.333333,1.666667,2.0,2.0,8.0


In [50]:
# Persist the team-game table.
# index must be the boolean False: the string 'False' is truthy, so pandas still wrote the
# row index, which is what shows up as an 'Unnamed: 0' column when the file is read back.
# NOTE(review): any downstream reader that drops 'Unnamed: 0' from this file must be updated.
df.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/2011_2017_games_with_12f_6d.csv', index=False, sep=',')
#df.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/22011_2017_games_with_12f_6d.csv', index=False, sep=',')

- summary analysis

In [22]:
# Descriptive statistics of the positional mean ranks, split by game outcome (Win = 0 / 1).
# Several columns must be selected from a GroupBy with a list; the bare-tuple form
# (['MeanC', 'MeanW', ...] without the inner brackets) is deprecated and fails on current pandas.
df.groupby(['Win'])[['MeanC', 'MeanW', 'MeanD', 'MeanG']].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,MeanC,MeanW,MeanD,MeanG
Win,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,count,7279.0,7279.0,7279.0,7279.0
0,mean,2.191407,2.142543,1.929569,1.862344
0,std,0.446942,0.364668,0.298823,0.661899
0,min,1.0,1.0,1.166667,1.0
0,25%,1.857143,1.857143,1.666667,1.0
0,50%,2.2,2.142857,1.833333,2.0
0,75%,2.5,2.375,2.166667,2.0
0,max,4.0,3.666667,3.0,3.0
1,count,7279.0,7279.0,7279.0,7279.0
1,mean,2.106741,2.064838,1.865504,1.706759


### model estimation

- regress **team win percent** on the mean of players by position for games with 12 forwards and 6 defensemen.

In [23]:
# Centre every positional mean on its balanced-roster value (2.5 for C and W, 2 for D and G),
# so a positive value means a better-than-balanced group.
for _pos, _balanced in (('C', 2.5), ('W', 2.5), ('D', 2), ('G', 2)):
    df['mean' + _pos.lower()] = _balanced - df['Mean' + _pos]

# Rank counts expressed relative to the lowest rank of each position:
# DC1..DC3 and DW1..DW3 are measured against rank 4, DD1..DD2 and DG1..DG2 against rank 3.
for _pos, _lowest in (('C', 4), ('W', 4), ('D', 3), ('G', 3)):
    _baseline = _pos + str(_lowest)
    for _rank in range(1, _lowest):
        _slot = _pos + str(_rank)
        df['D' + _slot] = df[_slot] - df[_baseline]

In [39]:
# Dependent variable: the 0/1 win flag.
w = df['Win']

raw_means = ['MeanC', 'MeanW', 'MeanD', 'MeanG']
centred_means = ['meanc', 'meanw', 'meand', 'meang']

# Design matrices: Z1/Z2 carry an intercept, Z3/Z4 do not.
Z1 = sm.add_constant(df[raw_means])
Z2 = sm.add_constant(df[centred_means])
Z3 = df[centred_means]
Z4 = df[raw_means]

# One OLS fit per design matrix, in the same order.
n1, n2, n3, n4 = [sm.OLS(w, design).fit() for design in (Z1, Z2, Z3, Z4)]

# Only the no-intercept, centred-means model is displayed; n1, n2 and n4 remain available
# via their own .summary().
# NOTE(review): for a model without a constant, statsmodels reports an uncentred R-squared,
# so it is not comparable with the R-squared of n1/n2. Confirm before reporting.
n3.summary()

0,1,2,3
Dep. Variable:,Win,R-squared:,0.374
Model:,OLS,Adj. R-squared:,0.374
Method:,Least Squares,F-statistic:,2173.0
Date:,"Thu, 10 May 2018",Prob (F-statistic):,0.0
Time:,18:20:06,Log-Likelihood:,-12202.0
No. Observations:,14558,AIC:,24410.0
Df Residuals:,14554,BIC:,24440.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
meanc,0.2950,0.010,28.678,0.000,0.275 0.315
meanw,0.5186,0.011,46.686,0.000,0.497 0.540
meand,0.0419,0.017,2.446,0.014,0.008 0.075
meang,0.0909,0.007,12.473,0.000,0.077 0.105

0,1,2,3
Omnibus:,5391.852,Durbin-Watson:,2.181
Prob(Omnibus):,0.0,Jarque-Bera (JB):,712.792
Skew:,0.012,Prob(JB):,1.66e-155
Kurtosis:,1.916,Cond. No.,3.09


In [40]:
# Rank-difference regressors (each rank count relative to the position's lowest rank).
rank_diffs = ['DC1', 'DC2', 'DC3', 'DW1', 'DW2', 'DW3', 'DD1', 'DD2', 'DG1', 'DG2']

Z5 = df[rank_diffs]                    # no intercept
Z6 = sm.add_constant(df[rank_diffs])   # with intercept

n5, n6 = [sm.OLS(w, design).fit() for design in (Z5, Z6)]

# Display the no-intercept fit; n6.summary() shows the intercept variant.
n5.summary()

0,1,2,3
Dep. Variable:,Win,R-squared:,0.427
Model:,OLS,Adj. R-squared:,0.427
Method:,Least Squares,F-statistic:,1086.0
Date:,"Thu, 10 May 2018",Prob (F-statistic):,0.0
Time:,18:20:23,Log-Likelihood:,-11554.0
No. Observations:,14558,AIC:,23130.0
Df Residuals:,14548,BIC:,23200.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
DC1,0.0499,0.004,14.031,0.000,0.043 0.057
DC2,0.0578,0.003,20.591,0.000,0.052 0.063
DC3,0.0040,0.003,1.168,0.243,-0.003 0.011
DW1,0.0667,0.003,22.912,0.000,0.061 0.072
DW2,0.0620,0.002,26.743,0.000,0.057 0.067
DW3,-0.0058,0.003,-1.754,0.079,-0.012 0.001
DD1,0.0106,0.003,3.371,0.001,0.004 0.017
DD2,0.0185,0.002,7.621,0.000,0.014 0.023
DG1,0.0735,0.007,11.212,0.000,0.061 0.086

0,1,2,3
Omnibus:,101218.584,Durbin-Watson:,2.35
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1053.511
Skew:,0.002,Prob(JB):,1.71e-229
Kurtosis:,1.682,Cond. No.,5.99


### calculate  and inspect Variance Inflation Factor (VIF)

- Not correlated: $VIF=1$
- Moderately correlated: **$1<VIF<5$** or at a more conservative level of **$1<VIF<2.5$**
- Highly correlated: **$VIF \geq 5$** or at a more conservative level **$VIF \geq 2.5$**

In [37]:
# VIF of every column of Z1 (intercept included), listed next to the column name.
vif11 = pd.DataFrame({
    'VIF Factor': [variance_inflation_factor(Z1.values, col) for col in range(Z1.shape[1])],
    'features': Z1.columns.tolist(),
})
vif11.round(3)

Unnamed: 0,VIF Factor,features
0,67.28,const
1,1.103,MeanC
2,1.144,MeanW
3,1.165,MeanD
4,1.06,MeanG


the variance inflation factors for the constant in this model is **not acceptable** as $VIF > 2.5$. 

In [38]:
# VIF of every column of Z2 (intercept included), listed next to the column name.
vif12 = pd.DataFrame({
    'VIF Factor': [variance_inflation_factor(Z2.values, col) for col in range(Z2.shape[1])],
    'features': Z2.columns.tolist(),
})
vif12.round(3)

Unnamed: 0,VIF Factor,features
0,2.517,const
1,1.103,meanc
2,1.144,meanw
3,1.165,meand
4,1.06,meang


the variance inflation factors for the constant in this model is **not acceptable** as $VIF > 2.5$

In [27]:
# VIF of every column of Z3 (centred means, no intercept), listed next to the column name.
vif13 = pd.DataFrame({
    'VIF Factor': [variance_inflation_factor(Z3.values, col) for col in range(Z3.shape[1])],
    'features': Z3.columns.tolist(),
})
vif13.round(3)

Unnamed: 0,VIF Factor,features
0,1.561,meanc
1,1.674,meanw
2,1.303,meand
3,1.171,meang


The variance inflation factors for this model are **acceptable** as $VIF < 2.5$.

In [28]:
# VIF of every column of Z4 (raw means, no intercept), listed next to the column name.
vif14 = pd.DataFrame({
    'VIF Factor': [variance_inflation_factor(Z4.values, col) for col in range(Z4.shape[1])],
    'features': Z4.columns.tolist(),
})
vif14.round(3)

Unnamed: 0,VIF Factor,features
0,24.45,MeanC
1,32.141,MeanW
2,37.202,MeanD
3,8.854,MeanG


the variance inflation factors for all variables in this model are **not acceptable** as $VIF > 2.5$

In [29]:
# VIF of every column of Z5 (rank differences, no intercept), listed next to the column name.
vif15 = pd.DataFrame({
    'VIF Factor': [variance_inflation_factor(Z5.values, col) for col in range(Z5.shape[1])],
    'features': Z5.columns.tolist(),
})
vif15.round(3)

Unnamed: 0,VIF Factor,features
0,1.811,DC1
1,1.609,DC2
2,1.292,DC3
3,1.997,DW1
4,1.735,DW2
5,1.421,DW3
6,1.72,DD1
7,1.814,DD2
8,1.197,DG1
9,1.258,DG2


The variance inflation factors for all variables in this model are **acceptable** as $VIF < 2.5$.

In [30]:
# VIF of every column of Z6 (rank differences plus intercept), listed next to the column name.
vif16 = pd.DataFrame({
    'VIF Factor': [variance_inflation_factor(Z6.values, col) for col in range(Z6.shape[1])],
    'features': Z6.columns.tolist(),
})
vif16.round(3)

Unnamed: 0,VIF Factor,features
0,4.474,const
1,1.276,DC1
2,1.048,DC2
3,1.143,DC3
4,1.271,DW1
5,1.08,DW2
6,1.17,DW3
7,1.535,DD1
8,1.341,DD2
9,1.099,DG1


the variance inflation factors for the constant in this model is **not acceptable** as $VIF > 2.5$