In [23]:
import sys
import os
import pandas as pd
import numpy as np
import datetime, time
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
from statsmodels.formula.api import ols
from statsmodels.iolib.summary2 import summary_col
from statsmodels.stats.outliers_influence import variance_inflation_factor
from pylab import hist, show
import scipy
import zipfile


# Notebook display limits: show at most 50 rows and 200 columns of any frame.
pd.options.display.max_rows = 50
pd.options.display.max_columns = 200

#### $Win = \beta_{0} + \beta_{1}MeanC + \beta_{2}MeanW + \beta_{3}MeanD + e_{s}$

- A balanced roster will have one player ranked in each centre position (C1, C2, C3, C4), two wingers ranked on every line (LW1, RW1, LW2,RW2, etc) and two defensemen ranked in all three defensive pairings. 
- The ranking of a balanced roster is 2.5 [(1+2+3+4)/4] for centres, 2.5 [(1+1+2+2+3+3+4+4)/8] for wingers and 2 [(1+1+2+2+3+3)/6] for defensemen.

- Since players are ranked from 1 to 4 for forwards and 1 to 3 for defensemen, 1 being the highest ranked, a team is considered to have an above average roster when the **mean of each forward position is smaller than 2.5 and the mean of defensive pairings is smaller than 2**. 

### games with 4 centers, 8 wingers and 6 defensemen

In [24]:
# Path of the ranked-roster game file. The default is the original author's
# synced folder; on another machine set the NHL_RANKING_CSV environment
# variable to the local copy instead of editing this cell.
ranking_csv = os.environ.get(
    'NHL_RANKING_CSV',
    '/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/4c_8w_4d_game_team_player_ranking_nhl_positions.csv',
)
dc = pd.read_csv(ranking_csv)
# 'Unnamed: 0' is presumably the row index written out when the CSV was saved;
# it carries no information for the analysis, so drop it.
dc = dc.drop('Unnamed: 0', axis=1)
dc.shape

(1584, 17)

In [25]:
dc = dc.rename(columns={'PlayerPosition': 'Position'})

# Number of rows each player has within a team-game.
player_keys = ['Season', 'GameNumber', 'TeamCode', 'PlayerNumber']
dc['playercount'] = dc.groupby(player_keys)['PlayerNumber'].transform('count')

# Total of those player counts for every (team-game, position, rank) slot.
slot_keys = ['Season', 'GameNumber', 'TeamCode', 'Position', 'Rank']
dc['rosterposition'] = dc.groupby(slot_keys)['playercount'].transform('sum')

dc.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,PlayerName,Position,Rank,GF,GA,GD,WinTeam,LossTeam,RosterCount,PositionCount,CCount,WCount,DCount,playercount,rosterposition
0,2010,20025,LA,11.0,KOPITAR,C,1.0,1,3,2,CGY,LA,18.0,4.0,4.0,8.0,6.0,1.0,1.0
1,2010,20025,LA,23.0,BROWN,W,1.0,1,3,2,CGY,LA,18.0,8.0,4.0,8.0,6.0,1.0,2.0
2,2010,20025,LA,94.0,SMYTH,W,2.0,1,3,2,CGY,LA,18.0,8.0,4.0,8.0,6.0,1.0,2.0
3,2010,20025,LA,8.0,DOUGHTY,D,1.0,1,3,2,CGY,LA,18.0,6.0,4.0,8.0,6.0,1.0,2.0
4,2010,20025,LA,33.0,MITCHELL,D,3.0,1,3,2,CGY,LA,18.0,6.0,4.0,8.0,6.0,1.0,4.0


#### pivot table

- the next step is to group players by gamenumber, teamcode, position and rank, to display the quality of players each team has per position. **Pivot table** by player position and rank using roster position values. Game number and team are the indexes. We want to join the levels to generate columns by roster position and rank. 

In [26]:
# One row per team-game, one column per (Position, Rank) slot.
# pivot_table averages by default; 'rosterposition' is constant within each
# (team-game, Position, Rank) group, so the mean simply returns that count.
game_keys = ['Season', 'GameNumber', 'WinTeam', 'LossTeam', 'GF', 'GA', 'GD',
             'TeamCode', 'RosterCount', 'CCount', 'WCount', 'DCount']
dc = pd.pivot_table(dc, index=game_keys, columns=['Position', 'Rank'], values=['rosterposition'])
dc = dc.reset_index()

# Flatten the (value, Position, Rank) column MultiIndex into single names such
# as 'rosterposition_C_1.0'; empty levels (the former index columns) are skipped.
dc.columns = ['_'.join(str(s).strip() for s in col if s) for col in dc.columns]

# A slot with no player in a given game comes out of the pivot as NaN; record it as 0.
dc = dc.fillna(0)

# Short names C1..C4, W1..W4, D1..D3 for the flattened slot columns.
ranks_per_position = (('C', 4), ('W', 4), ('D', 3))
slot_names = {
    'rosterposition_{}_{}.0'.format(pos, rank): '{}{}'.format(pos, rank)
    for pos, n_ranks in ranks_per_position
    for rank in range(1, n_ranks + 1)
}
dc = dc.rename(columns=slot_names)
dc.head(10)

Unnamed: 0,Season,GameNumber,WinTeam,LossTeam,GF,GA,GD,TeamCode,RosterCount,CCount,WCount,DCount,C1,C2,C3,C4,D1,D2,D3,W1,W2,W3,W4
0,2010,20025,CGY,LA,1,3,2,LA,18.0,4.0,8.0,6.0,1.0,0.0,2.0,1.0,2.0,0.0,4.0,2.0,2.0,1.0,3.0
1,2010,20025,CGY,LA,3,1,2,CGY,18.0,4.0,8.0,6.0,1.0,2.0,0.0,1.0,1.0,2.0,3.0,2.0,3.0,1.0,2.0
2,2010,20037,WSH,NYI,1,2,1,NYI,18.0,4.0,8.0,6.0,0.0,0.0,3.0,1.0,0.0,3.0,3.0,3.0,1.0,0.0,4.0
3,2010,20037,WSH,NYI,2,1,1,WSH,18.0,4.0,8.0,6.0,1.0,1.0,0.0,2.0,1.0,1.0,4.0,2.0,4.0,1.0,1.0
4,2010,20076,MIN,VAN,2,6,4,VAN,18.0,4.0,8.0,6.0,2.0,0.0,0.0,2.0,2.0,1.0,3.0,2.0,3.0,1.0,2.0
5,2010,20076,MIN,VAN,6,2,4,MIN,18.0,4.0,8.0,6.0,1.0,0.0,3.0,0.0,1.0,1.0,4.0,1.0,3.0,2.0,2.0
6,2010,20116,LA,MIN,4,5,-1,MIN,18.0,4.0,8.0,6.0,1.0,0.0,3.0,0.0,1.0,1.0,4.0,1.0,3.0,2.0,2.0
7,2010,20116,LA,MIN,5,4,-1,LA,18.0,4.0,8.0,6.0,1.0,0.0,3.0,0.0,1.0,0.0,5.0,2.0,2.0,1.0,3.0
8,2010,20363,VAN,CGY,2,7,-5,CGY,18.0,4.0,8.0,6.0,1.0,3.0,0.0,0.0,2.0,3.0,1.0,2.0,3.0,2.0,1.0
9,2010,20363,VAN,CGY,7,2,-5,VAN,18.0,4.0,8.0,6.0,2.0,0.0,0.0,2.0,2.0,2.0,2.0,3.0,3.0,1.0,1.0


- Assign a value of 1 to the team that won the game and a value of 0 to the team that lost. Compute the mean rank by position per team for each game.

In [27]:
# Win indicator: 1 when this row's team is the game's winning team, else 0.
# Vectorized comparison instead of a row-wise apply.
dc['Win'] = (dc['WinTeam'] == dc['TeamCode']).astype(int)

# Mean rank per position: each rank weighted by the number of players holding
# it, divided by the number of players dressed at that position.
dc['MeanC'] = (dc['C1'] * 1 + dc['C2'] * 2 + dc['C3'] * 3 + dc['C4'] * 4) / dc['CCount']
dc['MeanW'] = (dc['W1'] * 1 + dc['W2'] * 2 + dc['W3'] * 3 + dc['W4'] * 4) / dc['WCount']
dc['MeanD'] = (dc['D1'] * 1 + dc['D2'] * 2 + dc['D3'] * 3) / dc['DCount']

# Rank 1 is the best rank, so the smallest sums come first.
dc['Sum'] = dc['MeanC'] + dc['MeanW'] + dc['MeanD']
dc = dc.sort_values('Sum')
dc.head()

Unnamed: 0,Season,GameNumber,WinTeam,LossTeam,GF,GA,GD,TeamCode,RosterCount,CCount,WCount,DCount,C1,C2,C3,C4,D1,D2,D3,W1,W2,W3,W4,Win,MeanC,MeanW,MeanD,Sum
85,2010,21218,PHI,NYI,7,4,3,PHI,18.0,4.0,8.0,6.0,2.0,1.0,1.0,0.0,2.0,3.0,1.0,3.0,2.0,2.0,1.0,1,1.75,2.125,1.833333,5.708333
24,2010,20636,CAR,CGY,5,6,1,CGY,18.0,4.0,8.0,6.0,1.0,3.0,0.0,0.0,2.0,3.0,1.0,2.0,3.0,3.0,0.0,0,1.75,2.125,1.833333,5.708333
81,2010,21208,CAR,ATL,6,1,-5,CAR,18.0,4.0,8.0,6.0,2.0,1.0,1.0,0.0,3.0,2.0,1.0,3.0,2.0,0.0,3.0,1,1.75,2.375,1.666667,5.791667
77,2010,21170,CAR,NYI,4,2,-2,CAR,18.0,4.0,8.0,6.0,2.0,1.0,1.0,0.0,3.0,2.0,1.0,3.0,2.0,0.0,3.0,1,1.75,2.375,1.666667,5.791667
46,2010,20955,CHI,CGY,4,6,2,CGY,18.0,4.0,8.0,6.0,1.0,3.0,0.0,0.0,2.0,3.0,1.0,2.0,3.0,2.0,1.0,0,1.75,2.25,1.833333,5.833333


- summary analysis

In [28]:
# Descriptive statistics of the position means, split by game outcome.
# The columns are selected with a list: indexing a groupby with a bare tuple of
# names was deprecated and raises in pandas 2.0 and later.
dc.groupby('Win')[['MeanC', 'MeanW', 'MeanD']].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,MeanC,MeanW,MeanD
Win,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,count,44.0,44.0,44.0
0,mean,2.482955,2.525568,2.208333
0,std,0.545589,0.343735,0.307371
0,min,1.25,1.625,1.666667
0,25%,2.0,2.34375,2.0
0,50%,2.5,2.5,2.166667
0,75%,3.0,2.75,2.5
0,max,3.25,3.25,2.833333
1,count,44.0,44.0,44.0
1,mean,2.482955,2.420455,2.223485


### model estimation

- regress **team win** (1 = win, 0 = loss) on the mean rank of players by position for games with 4 centers, 8 wingers and 6 defensemen.

In [29]:
# Re-express each position mean as its distance from the balanced-roster value
# (2.5 for centers and wingers, 2 for defensemen); positive = better than balanced.
balanced_mean = {'C': 2.5, 'W': 2.5, 'D': 2}
for position, benchmark in balanced_mean.items():
    dc['mean' + position.lower()] = benchmark - dc['Mean' + position]

# Pairwise differentials, written as new column -> (minuend, subtrahend):
#   d1, d2 : centers, defensemen relative to wingers
#   D1, D2 : centers, wingers relative to defensemen
#   dw, dd : wingers, defensemen relative to centers
# NOTE(review): 'D1' and 'D2' reuse the names of the rank-1 / rank-2 defensemen
# count columns created by the pivot, so those counts are overwritten here;
# re-running the MeanD computation after this cell would pick up the
# differentials instead of the counts.
differentials = {
    'd1': ('meanc', 'meanw'),
    'd2': ('meand', 'meanw'),
    'D1': ('meanc', 'meand'),
    'D2': ('meanw', 'meand'),
    'dw': ('meanw', 'meanc'),
    'dd': ('meand', 'meanc'),
}
for column, (minuend, subtrahend) in differentials.items():
    dc[column] = dc[minuend] - dc[subtrahend]

In [30]:
# Dependent variable: the 0/1 win indicator.
y = dc['Win']

raw_means = ['MeanC', 'MeanW', 'MeanD']
centered_means = ['meanc', 'meanw', 'meand']

# Design matrices; every specification except X3 includes an intercept.
X1 = sm.add_constant(dc[raw_means])
X2 = sm.add_constant(dc[centered_means])
X3 = dc[centered_means]
X4 = sm.add_constant(dc[['d1', 'd2']])
X5 = sm.add_constant(dc[['D1', 'D2']])
X6 = sm.add_constant(dc[['dw', 'dd']])

# Fit one OLS model per design matrix, in the same order.
m1, m2, m3, m4, m5, m6 = (sm.OLS(y, design).fit() for design in (X1, X2, X3, X4, X5, X6))

# Show the centered-means model; call .summary() on m1, m3, m4, m5 or m6 to
# inspect the other specifications.
m2.summary()



0,1,2,3
Dep. Variable:,Win,R-squared:,0.042
Model:,OLS,Adj. R-squared:,0.007
Method:,Least Squares,F-statistic:,1.217
Date:,"Mon, 19 Mar 2018",Prob (F-statistic):,0.309
Time:,13:29:19,Log-Likelihood:,-61.997
No. Observations:,88,AIC:,132.0
Df Residuals:,84,BIC:,141.9
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.4170,0.079,5.288,0.000,0.260 0.574
meanc,0.0659,0.126,0.522,0.603,-0.185 0.317
meanw,0.3480,0.184,1.893,0.062,-0.018 0.714
meand,-0.3357,0.249,-1.347,0.182,-0.831 0.160

0,1,2,3
Omnibus:,0.014,Durbin-Watson:,2.157
Prob(Omnibus):,0.993,Jarque-Bera (JB):,12.558
Skew:,0.028,Prob(JB):,0.00188
Kurtosis:,1.15,Cond. No.,5.64


### calculate  and inspect Variance Inflation Factor (VIF)

- Not correlated: $VIF=1$
- Moderately correlated: **$1<VIF<5$** or at a more conservative level of **$1<VIF<2.5$**
- Highly correlated: **$VIF>=5$** or at a more conservative level **$VIF>=2.5$**

#### $Win = \beta_{0} + \beta_{1}meanc + \beta_{2}meanw + \beta_{3}meand + e_{s}$

In [31]:
# VIF of every column of X2 (constant included), listed next to the column name.
vif2 = pd.DataFrame({
    'VIF Factor': [variance_inflation_factor(X2.values, col) for col in range(X2.shape[1])],
    'features': X2.columns,
})
vif2.round(3)

Unnamed: 0,VIF Factor,features
0,2.18,const
1,1.389,meanc
2,1.597,meanw
3,2.077,meand


the variance inflation factors for this model are **acceptable** as $VIF<2.5$

#### $Win = \beta_{1}meanc + \beta_{2}meanw + \beta_{3}meand + e_{s}$

In [32]:
# VIF of every column of X3 (the no-intercept design), listed next to the column name.
vif3 = pd.DataFrame({
    'VIF Factor': [variance_inflation_factor(X3.values, col) for col in range(X3.shape[1])],
    'features': X3.columns,
})
vif3.round(3)

Unnamed: 0,VIF Factor,features
0,1.178,meanc
1,1.239,meanw
2,1.426,meand


the variance inflation factors for this model are **acceptable** as $VIF<2.5$

#### $Win = \beta_{0} + \beta_{1}d1 + \beta_{2}d2 + e_{s}$

differential in mean of centers, defensemen with respect to wingers

In [33]:
# VIF of every column of X4 (constant included), listed next to the column name.
vif4 = pd.DataFrame({
    'VIF Factor': [variance_inflation_factor(X4.values, col) for col in range(X4.shape[1])],
    'features': X4.columns,
})
vif4.round(3)

Unnamed: 0,VIF Factor,features
0,2.048,const
1,1.77,d1
2,1.77,d2


the variance inflation factors for this model are **acceptable** as $VIF<2.5$

#### $Win = \beta_{0} + \beta_{1}D1 + \beta_{2}D2 + e_{s}$

differential in mean of centers, wingers with respect to defensemen

In [34]:
# VIF of every column of X5 (constant included), listed next to the column name.
vif5 = pd.DataFrame({
    'VIF Factor': [variance_inflation_factor(X5.values, col) for col in range(X5.shape[1])],
    'features': X5.columns,
})
vif5.round(3)

Unnamed: 0,VIF Factor,features
0,2.048,const
1,1.024,D1
2,1.024,D2


the variance inflation factors for this model are **acceptable** as $VIF<2.5$

#### $Win = \beta_{0} + \beta_{1}dw + \beta_{2}dd + e_{s}$

differential in mean of wingers, defensemen with respect to centers

In [35]:
# VIF of every column of X6 (constant included), listed next to the column name.
vif6 = pd.DataFrame({
    'VIF Factor': [variance_inflation_factor(X6.values, col) for col in range(X6.shape[1])],
    'features': X6.columns,
})
vif6.round(3)

Unnamed: 0,VIF Factor,features
0,2.048,const
1,3.483,dw
2,3.483,dd


the variance inflation factors for this model are **not acceptable** as $VIF>2.5$