In [12]:
import sys
import os
import pandas as pd
import numpy as np
import datetime, time
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
from statsmodels.formula.api import ols
from statsmodels.iolib.summary2 import summary_col
from statsmodels.stats.outliers_influence import variance_inflation_factor
from pylab import hist, show
import scipy
import zipfile


# Notebook-wide display limits: at most 50 rows and 200 columns are rendered
# when a DataFrame is shown as a cell output.
for option_name, limit in (
    ('display.max_rows', 50),
    ('display.max_columns', 200),
):
    pd.set_option(option_name, limit)

#### $WinPc = \beta_{0} + \beta_{1}MeanC + \beta_{2}MeanW + \beta_{3}MeanD + e_{s}$

- A balanced roster will have one player ranked in each centre position (C1, C2, C3, C4), two wingers ranked on every line (LW1, RW1, LW2,RW2, etc) and two defensemen ranked in all three defensive pairings. 
- The mean ranking of a balanced roster is 2.5 [(1+2+3+4)/4] for centres, 2.5 [(1+1+2+2+3+3+4+4)/8] for wingers and 2 [(1+1+2+2+3+3)/6] for defensemen.

- Since players are ranked from 1 to 4 for forwards and 1 to 3 for defensemen, 1 being the highest ranked, a team is considered to have an above average roster when the **mean of each forward position is smaller than 2.5 and the mean of defensive pairings is smaller than 2**. 

### games with 4 centers, 8 wingers and 6 defensemen

In [13]:
# Load the season/team roster-ranking table.
#
# The directory defaults to the original author's shared folder. To run the
# notebook on another machine, set the STEPHANOS_SHARE_DATA_DIR environment
# variable to your own copy of the shared "out/data" folder instead of editing
# this cell, e.g.
#   '/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data'
DATA_DIR = os.environ.get(
    'STEPHANOS_SHARE_DATA_DIR',
    '/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data',
)
dc = pd.read_csv(os.path.join(DATA_DIR, 'season_team_roster_ranking_nhl_positions.csv'))

# 'Unnamed: 0' is the header pandas gives to a nameless first CSV column
# (presumably a saved row index -- confirm against the file); it is not used
# in the analysis.
dc = dc.drop('Unnamed: 0', axis=1)
dc.shape

(19, 16)

In [14]:
# Composite roster score: the three positional mean ranks added together.
# Rank 1 is the best rank, so a smaller Sum indicates a stronger roster.
dc['Sum'] = dc['MeanC'].add(dc['MeanW']).add(dc['MeanD'])

# Order teams from the smallest (strongest) to the largest composite score.
dc = dc.sort_values('Sum', ascending=True)
dc.head(20)

Unnamed: 0,Season,TeamCode,GP,W,L,GF,GA,WinPc,LossPc,MeanC,MeanW,MeanD,RankWin,RankC,RankW,RankD,Sum
12,2010,CGY,7,2,5,22,25,0.285714,0.714286,1.821429,2.267857,1.904762,13.0,2.0,6.0,2.0,5.994048
13,2010,PHI,4,1,3,12,16,0.25,0.75,2.0,2.21875,1.791667,14.5,3.5,5.0,1.0,6.010417
4,2010,CAR,9,6,3,30,26,0.666667,0.333333,1.777778,2.361111,1.962963,6.5,1.0,7.0,3.0,6.101852
6,2010,CHI,6,4,2,21,11,0.666667,0.333333,2.916667,1.75,2.027778,6.5,12.5,1.0,7.5,6.694444
16,2010,TOR,1,0,1,1,4,0.0,1.0,2.0,2.75,2.0,18.0,3.5,16.0,5.0,6.75
3,2010,VAN,9,8,1,34,14,0.888889,0.111111,2.416667,2.152778,2.203704,4.0,6.5,4.0,10.0,6.773148
9,2010,STL,6,2,4,12,18,0.333333,0.666667,2.125,2.708333,2.166667,11.0,5.0,15.0,9.0,7.0
18,2010,PHX,2,0,2,3,10,0.0,1.0,3.0,2.0,2.0,18.0,15.5,2.0,5.0,7.0
0,2010,ANA,6,6,0,18,9,1.0,0.0,2.625,2.479167,2.027778,2.0,10.0,9.0,7.5,7.131944
10,2010,NSH,3,1,2,5,5,0.333333,0.666667,2.916667,2.416667,2.0,11.0,12.5,8.0,5.0,7.333333


- summary analysis

In [15]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of dc, including the composite 'Sum' column.
dc.describe()

Unnamed: 0,Season,GP,W,L,GF,GA,WinPc,LossPc,MeanC,MeanW,MeanD,RankWin,RankC,RankW,RankD,Sum
count,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0
mean,2010.0,4.631579,2.315789,2.315789,12.631579,12.631579,0.477026,0.522974,2.58702,2.496481,2.264237,10.0,10.0,10.0,10.0,7.347737
std,0.0,2.650168,2.26207,1.887168,9.534793,7.544023,0.340239,0.340239,0.458195,0.359575,0.305583,5.570258,5.617433,5.624846,5.587685,0.874223
min,2010.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.777778,1.75,1.791667,2.0,1.0,1.0,1.0,5.994048
25%,2010.0,2.5,1.0,1.0,5.5,6.5,0.25,0.333333,2.270833,2.243304,2.0,6.5,5.75,5.5,5.0,6.761574
50%,2010.0,4.0,2.0,2.0,10.0,12.0,0.333333,0.666667,2.625,2.575,2.203704,11.0,10.0,10.0,10.0,7.333333
75%,2010.0,6.0,3.0,3.0,19.5,17.0,0.666667,0.75,2.975,2.697917,2.5,14.5,14.75,14.5,13.5,8.10625
max,2010.0,9.0,8.0,7.0,34.0,26.0,1.0,1.0,3.166667,3.15625,2.75,18.0,19.0,19.0,19.0,8.885417


### model estimation

- regress **team win percent** on the mean of players by position for games with 4 centers, 8 wingers and 6 defensemen.

In [16]:
# Deviation of each positional mean rank from its balanced-roster value
# (2.5 for centres and wingers, 2 for defensemen). Because rank 1 is best,
# a positive deviation means the position is ranked better than balanced.
for derived, baseline, source in (
    ('meanc', 2.5, 'MeanC'),
    ('meanw', 2.5, 'MeanW'),
    ('meand', 2, 'MeanD'),
):
    dc[derived] = baseline - dc[source]

# Pairwise differentials between the deviations, each pair taken relative to
# one reference position:
#   d1, d2 -> centres and defensemen relative to wingers
#   D1, D2 -> centres and wingers relative to defensemen
#   dw, dd -> wingers and defensemen relative to centres
# Note: the three pairs are linear recombinations of one another
# (e.g. D1 = d1 - d2, D2 = -d2, dw = -d1, dd = d2 - d1).
for derived, minuend, subtrahend in (
    ('d1', 'meanc', 'meanw'),
    ('d2', 'meand', 'meanw'),
    ('D1', 'meanc', 'meand'),
    ('D2', 'meanw', 'meand'),
    ('dw', 'meanw', 'meanc'),
    ('dd', 'meand', 'meanc'),
):
    dc[derived] = dc[minuend] - dc[subtrahend]

In [18]:
# Response variable: team win percentage.
y = dc['WinPc']

# Design matrices. Every specification includes an intercept except X3.
X1 = sm.add_constant(dc[['MeanC', 'MeanW', 'MeanD']])  # raw positional mean ranks
X2 = sm.add_constant(dc[['meanc', 'meanw', 'meand']])  # deviations from the balanced roster
X3 = dc[['meanc', 'meanw', 'meand']]                   # same deviations, no intercept
X4 = sm.add_constant(dc[['d1', 'd2']])                 # differentials relative to wingers
X5 = sm.add_constant(dc[['D1', 'D2']])                 # differentials relative to defensemen
X6 = sm.add_constant(dc[['dw', 'dd']])                 # differentials relative to centres

# Fit one ordinary-least-squares model per design matrix, in the same order.
m1, m2, m3, m4, m5, m6 = (
    sm.OLS(y, design).fit() for design in (X1, X2, X3, X4, X5, X6)
)

# Only the intercept-plus-deviations model is displayed here; replace m2 with
# m1, m3, m4, m5 or m6 to inspect any of the other fits.
m2.summary()


  "anyway, n=%i" % int(n))


0,1,2,3
Dep. Variable:,WinPc,R-squared:,0.28
Model:,OLS,Adj. R-squared:,0.136
Method:,Least Squares,F-statistic:,1.947
Date:,"Mon, 19 Mar 2018",Prob (F-statistic):,0.165
Time:,13:35:17,Log-Likelihood:,-2.8382
No. Observations:,19,AIC:,13.68
Df Residuals:,15,BIC:,17.45
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.2430,0.125,1.947,0.070,-0.023 0.509
meanc,0.2439,0.225,1.085,0.295,-0.235 0.723
meanw,0.6187,0.295,2.099,0.053,-0.010 1.247
meand,-0.9578,0.429,-2.235,0.041,-1.871 -0.044

0,1,2,3
Omnibus:,3.27,Durbin-Watson:,2.397
Prob(Omnibus):,0.195,Jarque-Bera (JB):,1.373
Skew:,0.448,Prob(JB):,0.503
Kurtosis:,3.966,Cond. No.,7.5


### calculate  and inspect Variance Inflation Factor (VIF)

- Not correlated: $VIF=1$
- Moderately correlated: **$1<VIF<5$** or at a more conservative level of **$1<VIF<2.5$**
- Highly correlated: **$VIF \geq 5$** or at a more conservative level **$VIF \geq 2.5$**

#### $WinPc = \beta_{0} + \beta_{1}meanc + \beta_{2}meanw + \beta_{3}meand + e_{s}$

In [19]:
# Variance inflation factor of every column of the X2 design matrix
# (intercept included), tabulated next to the matching column name.
vif = pd.DataFrame(
    {
        'VIF Factor': [
            variance_inflation_factor(X2.values, col_idx)
            for col_idx in range(X2.shape[1])
        ],
        'features': X2.columns,
    },
    columns=['VIF Factor', 'features'],
)
vif.round(3)

Unnamed: 0,VIF Factor,features
0,2.96,const
1,1.911,meanc
2,2.023,meanw
3,3.088,meand


- $meand$ is highly correlated at the conservative level as $VIF>2.5$

#### $WinPc = \beta_{1}meanc + \beta_{2}meanw + \beta_{3}meand + e_{s}$

In [20]:
# Variance inflation factor of every column of the X3 design matrix,
# tabulated next to the matching column name. X3 was built without an
# intercept column, so the table has no 'const' entry.
vif3 = pd.DataFrame(
    {
        'VIF Factor': [
            variance_inflation_factor(X3.values, col_idx)
            for col_idx in range(X3.shape[1])
        ],
        'features': X3.columns,
    },
    columns=['VIF Factor', 'features'],
)
vif3.round(3)

Unnamed: 0,VIF Factor,features
0,1.544,meanc
1,1.338,meanw
2,1.939,meand


the variance inflation factors for this model are **acceptable** as $VIF<2.5$

#### $WinPc = \beta_{0} + \beta_{1}d1 + \beta_{2}d2 + e_{s}$

differential in centers, defensemen with respect to wingers

In [21]:
# Variance inflation factor of every column of the X4 design matrix
# (intercept, d1, d2), tabulated next to the matching column name.
vif4 = pd.DataFrame(
    {
        'VIF Factor': [
            variance_inflation_factor(X4.values, col_idx)
            for col_idx in range(X4.shape[1])
        ],
        'features': X4.columns,
    },
    columns=['VIF Factor', 'features'],
)
vif4.round(3)

Unnamed: 0,VIF Factor,features
0,2.759,const
1,2.67,d1
2,2.67,d2


the variance inflation factors for this model are **not acceptable** as $VIF>2.5$

#### $WinPc = \beta_{0} + \beta_{1}D1 + \beta_{2}D2 + e_{s}$

differential in centers, wingers with respect to defensemen

In [22]:
# Variance inflation factor of every column of the X5 design matrix
# (intercept, D1, D2), tabulated next to the matching column name.
vif5 = pd.DataFrame(
    {
        'VIF Factor': [
            variance_inflation_factor(X5.values, col_idx)
            for col_idx in range(X5.shape[1])
        ],
        'features': X5.columns,
    },
    columns=['VIF Factor', 'features'],
)
vif5.round(3)

Unnamed: 0,VIF Factor,features
0,2.759,const
1,1.186,D1
2,1.186,D2


the variance inflation factors for this model are **acceptable** as $VIF<2.5$

#### $WinPc = \beta_{0} + \beta_{1}dw + \beta_{2}dd + e_{s}$

differential in wingers, defensemen with respect to centers

In [23]:
# Variance inflation factor of every column of the X6 design matrix
# (intercept, dw, dd), tabulated next to the matching column name.
vif6 = pd.DataFrame(
    {
        'VIF Factor': [
            variance_inflation_factor(X6.values, col_idx)
            for col_idx in range(X6.shape[1])
        ],
        'features': X6.columns,
    },
    columns=['VIF Factor', 'features'],
)
vif6.round(3)

Unnamed: 0,VIF Factor,features
0,2.759,const
1,4.267,dw
2,4.267,dd


the variance inflation factors for this model are **not acceptable** as $VIF>2.5$