In [68]:
import sys
import os
import pandas as pd
import numpy as np
import datetime, time
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
from statsmodels.formula.api import ols
from statsmodels.iolib.summary2 import summary_col
from pylab import hist, show
import scipy
import zipfile


# Notebook display limits: show at most 50 rows and 200 columns of any frame.
pd.options.display.max_rows = 50
pd.options.display.max_columns = 200

#### $WinPc = \beta_{0} + \beta_{1}MeanC + \beta_{2}MeanW + \beta_{3}MeanD + e_{s}$

- A balanced roster has one player ranked in each centre slot (C1, C2, C3, C4), two wingers ranked on each of the four lines (LW1, RW1, LW2, RW2, etc.) and two defensemen ranked in each of the three defensive pairings. 
- The mean ranking of a balanced roster is therefore 2.5 [(1+2+3+4)/4] for centres, 2.5 [(1+1+2+2+3+3+4+4)/8] for wingers and 2 [(1+1+2+2+3+3)/6] for defensemen.

- Since players are ranked from 1 to 4 for forwards and 1 to 3 for defensemen, 1 being the highest ranked, a team is considered to have an above average roster when the **mean of each forward position is smaller than 2.5 and the mean of defensive pairings is smaller than 2**. 

### regular season 

In [69]:
# Team roster rankings for the full season, read from a local copy of the shared folder.
# The same file on the other collaborator's machine:
#   '/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/full_season_team_roster_ranking_nhl_positions.csv'
da = pd.read_csv(
    '/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/full_season_team_roster_ranking_nhl_positions.csv'
).drop(labels='Unnamed: 0', axis='columns')  # presumably the row index written out with the CSV
da.head()

Unnamed: 0,Season,TeamCode,GP,W,L,GF,GA,WinPc,LossPc,MeanC,MeanW,MeanD,RankWin,RankC,RankW,RankD
0,2010,VAN,82,54,28,268,190,0.658537,0.341463,2.296748,2.136929,2.228804,1.0,9.0,5.0,14.0
1,2010,PIT,82,49,33,244,203,0.597561,0.402439,2.320093,2.650697,2.022358,2.0,10.0,24.0,8.0
2,2010,SJ,82,48,34,253,216,0.585366,0.414634,1.775465,2.519541,2.382259,3.5,1.0,18.0,17.0
3,2010,WSH,82,48,34,230,203,0.585366,0.414634,2.700958,2.20045,2.521196,3.5,21.0,11.0,26.0
4,2010,DET,82,47,35,263,241,0.573171,0.426829,1.859059,2.170267,1.813298,6.0,2.0,7.0,2.0


- summary analysis

In [82]:
# Summary statistics for every numeric column of the regular-season frame.
# NOTE(review): this cell's execution count (82) is higher than that of the cells that follow,
# and the displayed table already contains meanc/meanw/meand, which are only created in the
# next code cell. Restart the kernel and run all cells so the output matches the cell order.
da.describe()

Unnamed: 0,Season,GP,W,L,GF,GA,WinPc,LossPc,MeanC,MeanW,MeanD,RankWin,RankC,RankW,RankD,meanc,meanw,meand
count,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
mean,2010.0,82.0,41.0,41.0,234.766667,234.766667,0.5,0.5,2.496457,2.405195,2.260576,15.5,15.5,15.5,15.5,0.003543,0.094805,-0.260576
std,0.0,0.0,6.93815,6.93815,22.535311,26.039338,0.084612,0.084612,0.40163,0.277651,0.249653,8.777911,8.803408,8.803408,8.803408,0.40163,0.277651,0.249653
min,2010.0,82.0,25.0,28.0,177.0,190.0,0.304878,0.341463,1.775465,1.986184,1.735772,1.0,1.0,1.0,1.0,-0.694309,-0.496951,-0.621951
25%,2010.0,82.0,37.25,36.0,223.5,212.25,0.454268,0.439024,2.244004,2.187652,2.030372,9.0,8.25,8.25,8.25,-0.240803,-0.118852,-0.46883
50%,2010.0,82.0,43.0,39.0,238.0,237.0,0.52439,0.47561,2.564097,2.3464,2.235192,15.5,15.5,15.5,15.5,-0.064097,0.1536,-0.235192
75%,2010.0,82.0,46.0,44.75,252.0,246.75,0.560976,0.545732,2.740803,2.618852,2.46883,22.625,22.75,22.75,22.75,0.255996,0.312348,-0.030372
max,2010.0,82.0,54.0,57.0,268.0,289.0,0.658537,0.695122,3.194309,2.996951,2.621951,30.0,30.0,30.0,30.0,0.724535,0.513816,0.264228


In [71]:
# Re-express each positional mean as its distance from the balanced-roster value
# (2.5 for centres and wingers, 2 for defensemen). Because rank 1 is the best rank,
# a positive deviation means the team is better than a balanced roster at that position.
for derived, source, balanced in (
    ('meanc', 'MeanC', 2.5),
    ('meanw', 'MeanW', 2.5),
    ('meand', 'MeanD', 2),
):
    da[derived] = balanced - da[source]

#### model estimation 

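- Regress **team win percentage** on the mean player ranking at each position (centres, wingers, defensemen), expressed as a deviation from the balanced-roster value, for the full regular season.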

In [72]:
# Dependent variable: team win percentage over the regular season.
y = da.loc[:, 'WinPc']

# Regressors: the three positional deviations, with an intercept column added.
X1 = sm.add_constant(da.loc[:, ['meanc', 'meanw', 'meand']])

# Ordinary least squares fit; the summary table is the cell's output.
m1 = sm.OLS(endog=y, exog=X1).fit()
m1.summary()


0,1,2,3
Dep. Variable:,WinPc,R-squared:,0.487
Model:,OLS,Adj. R-squared:,0.428
Method:,Least Squares,F-statistic:,8.221
Date:,"Mon, 12 Mar 2018",Prob (F-statistic):,0.000518
Time:,19:17:42,Log-Likelihood:,42.037
No. Observations:,30,AIC:,-76.07
Df Residuals:,26,BIC:,-70.47
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.5037,0.021,24.436,0.000,0.461 0.546
meanc,0.0482,0.033,1.467,0.154,-0.019 0.116
meanw,0.1500,0.046,3.227,0.003,0.054 0.245
meand,0.0696,0.057,1.224,0.232,-0.047 0.186

0,1,2,3
Omnibus:,0.253,Durbin-Watson:,1.108
Prob(Omnibus):,0.881,Jarque-Bera (JB):,0.446
Skew:,-0.011,Prob(JB):,0.8
Kurtosis:,2.403,Cond. No.,5.81


### games with 12 forwards and 6 defensemen

In [73]:
# Team roster rankings restricted to games with 12 forwards and 6 defensemen,
# read from a local copy of the shared folder.
# The same file on the other collaborator's machine:
#   '/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/season_roster_ranking_nhl_positions.csv'
db = pd.read_csv(
    '/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/season_roster_ranking_nhl_positions.csv'
).drop(labels='Unnamed: 0', axis='columns')  # presumably the row index written out with the CSV
db.head()

Unnamed: 0,Season,TeamCode,GP,W,L,GF,GA,WinPc,LossPc,MeanC,MeanW,MeanD,RankWin,RankC,RankW,RankD
0,2010,VAN,59,41,18,201,138,0.694915,0.305085,2.331921,2.146388,2.223164,1.0,10.0,5.0,14.0
1,2010,DET,60,36,24,196,176,0.6,0.4,1.884762,2.209841,1.808333,2.0,3.0,11.0,2.0
2,2010,CHI,47,28,19,157,127,0.595745,0.404255,2.643617,1.940012,2.148936,3.0,19.0,1.0,11.0
3,2010,ANA,59,35,24,165,163,0.59322,0.40678,3.040234,2.283757,2.09322,4.0,26.0,13.0,10.0
4,2010,BOS,64,37,27,196,156,0.578125,0.421875,2.617708,1.992411,1.960937,5.0,18.0,2.0,5.0


- summary analysis

In [74]:
# Summary statistics for every numeric column of the 12-forward / 6-defenseman frame.
db.describe()

Unnamed: 0,Season,GP,W,L,GF,GA,WinPc,LossPc,MeanC,MeanW,MeanD,RankWin,RankC,RankW,RankD
count,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
mean,2010.0,56.733333,28.366667,28.366667,161.533333,161.533333,0.492724,0.507276,2.510886,2.41582,2.261042,15.5,15.5,15.5,15.5
std,0.0,13.866191,9.338402,8.715754,45.825556,45.017034,0.107344,0.107344,0.396132,0.295547,0.261604,8.79263,8.803408,8.803408,8.803408
min,2010.0,20.0,5.0,13.0,38.0,62.0,0.238095,0.305085,1.866169,1.940012,1.742647,1.0,1.0,1.0,1.0
25%,2010.0,56.5,21.75,22.5,157.0,142.0,0.448903,0.439394,2.277835,2.190339,2.028299,8.5,8.25,8.25,8.25
50%,2010.0,61.5,30.5,29.0,174.0,166.0,0.520513,0.479487,2.550397,2.328876,2.238819,15.5,15.5,15.5,15.5
75%,2010.0,65.75,35.75,33.0,184.75,193.25,0.560606,0.551097,2.722287,2.629899,2.48319,22.75,22.75,22.75,22.75
max,2010.0,69.0,41.0,44.0,221.0,224.0,0.694915,0.761905,3.175,3.10582,2.65873,30.0,30.0,30.0,30.0


In [75]:
# Re-express each positional mean as its distance from the balanced-roster value
# (2.5 for centres and wingers, 2 for defensemen). Because rank 1 is the best rank,
# a positive deviation means the team is better than a balanced roster at that position.
for derived, source, balanced in (
    ('meanc', 'MeanC', 2.5),
    ('meanw', 'MeanW', 2.5),
    ('meand', 'MeanD', 2),
):
    db[derived] = balanced - db[source]

#### model estimation 

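- Regress **team win percentage** on the mean player ranking at each position (centres, wingers, defensemen), expressed as a deviation from the balanced-roster value, for games with 12 forwards and 6 defensemen.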

In [76]:
# Dependent variable: team win percentage in games with 12 forwards and 6 defensemen.
y = db.loc[:, 'WinPc']

# Regressors: the three positional deviations, with an intercept column added.
X1 = sm.add_constant(db.loc[:, ['meanc', 'meanw', 'meand']])

# Ordinary least squares fit; the summary table is the cell's output.
m1 = sm.OLS(endog=y, exog=X1).fit()
m1.summary()

0,1,2,3
Dep. Variable:,WinPc,R-squared:,0.543
Model:,OLS,Adj. R-squared:,0.49
Method:,Least Squares,F-statistic:,10.29
Date:,"Mon, 12 Mar 2018",Prob (F-statistic):,0.000121
Time:,19:17:42,Log-Likelihood:,36.631
No. Observations:,30,AIC:,-65.26
Df Residuals:,26,BIC:,-59.66
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.5130,0.025,20.477,0.000,0.462 0.565
meanc,0.0234,0.041,0.566,0.576,-0.062 0.109
meanw,0.1785,0.055,3.243,0.003,0.065 0.292
meand,0.1344,0.070,1.908,0.068,-0.010 0.279

0,1,2,3
Omnibus:,1.651,Durbin-Watson:,1.07
Prob(Omnibus):,0.438,Jarque-Bera (JB):,0.96
Skew:,-0.435,Prob(JB):,0.619
Kurtosis:,3.096,Cond. No.,6.17


### games with 4 centers, 8 wingers and 6 defensemen

In [77]:
# Team roster rankings restricted to games with 4 centers, 8 wingers and 6 defensemen,
# read from a local copy of the shared folder.
dc = pd.read_csv(
    '/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/season_team_roster_ranking_nhl_positions.csv'
).drop(labels='Unnamed: 0', axis='columns')  # presumably the row index written out with the CSV
# Alternate location on the other collaborator's machine.
# NOTE(review): this file name lacks the "_ranking" part present in the active path above;
# confirm which name is correct before enabling the line.
#dc = pd.read_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/season_team_roster_nhl_positions.csv')
dc.head()

Unnamed: 0,Season,TeamCode,GP,W,L,GF,GA,WinPc,LossPc,MeanC,MeanW,MeanD,RankWin,RankC,RankW,RankD
0,2010,ANA,6,6,0,18,9,1.0,0.0,2.625,2.479167,2.027778,2.0,10.0,9.0,7.5
1,2010,WSH,1,1,0,2,1,1.0,0.0,2.75,2.125,2.5,2.0,11.0,3.0,13.5
2,2010,EDM,2,2,0,6,4,1.0,0.0,3.125,2.6875,2.75,2.0,18.0,14.0,19.0
3,2010,VAN,9,8,1,34,14,0.888889,0.111111,2.416667,2.152778,2.203704,4.0,6.5,4.0,10.0
4,2010,CAR,9,6,3,30,26,0.666667,0.333333,1.777778,2.361111,1.962963,6.5,1.0,7.0,3.0


- summary analysis

In [78]:
# Summary statistics for every numeric column of the 4-center / 8-winger / 6-defenseman frame.
dc.describe()

Unnamed: 0,Season,GP,W,L,GF,GA,WinPc,LossPc,MeanC,MeanW,MeanD,RankWin,RankC,RankW,RankD
count,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0
mean,2010.0,4.631579,2.315789,2.315789,12.631579,12.631579,0.477026,0.522974,2.58702,2.496481,2.264237,10.0,10.0,10.0,10.0
std,0.0,2.650168,2.26207,1.887168,9.534793,7.544023,0.340239,0.340239,0.458195,0.359575,0.305583,5.570258,5.617433,5.624846,5.587685
min,2010.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.777778,1.75,1.791667,2.0,1.0,1.0,1.0
25%,2010.0,2.5,1.0,1.0,5.5,6.5,0.25,0.333333,2.270833,2.243304,2.0,6.5,5.75,5.5,5.0
50%,2010.0,4.0,2.0,2.0,10.0,12.0,0.333333,0.666667,2.625,2.575,2.203704,11.0,10.0,10.0,10.0
75%,2010.0,6.0,3.0,3.0,19.5,17.0,0.666667,0.75,2.975,2.697917,2.5,14.5,14.75,14.5,13.5
max,2010.0,9.0,8.0,7.0,34.0,26.0,1.0,1.0,3.166667,3.15625,2.75,18.0,19.0,19.0,19.0


#### model estimation

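- Regress **team win percentage** on the mean player ranking at each position (centers, wingers, defensemen), expressed as a deviation from the balanced-roster value, for games with 4 centers, 8 wingers and 6 defensemen.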

In [79]:
# Re-express each positional mean as its distance from the balanced-roster value
# (2.5 for centers and wingers, 2 for defensemen). Because rank 1 is the best rank,
# a positive deviation means the team is better than a balanced roster at that position.
for derived, source, balanced in (
    ('meanc', 'MeanC', 2.5),
    ('meanw', 'MeanW', 2.5),
    ('meand', 'MeanD', 2),
):
    dc[derived] = balanced - dc[source]

In [80]:
# Dependent variable: team win percentage in the 4-center / 8-winger / 6-defenseman games.
y = dc.loc[:, 'WinPc']

# m1: raw positional means as regressors (intercept added).
X1 = sm.add_constant(dc.loc[:, ['MeanC', 'MeanW', 'MeanD']])
m1 = sm.OLS(endog=y, exog=X1).fit()

# m2: deviations from the balanced-roster values as regressors (intercept added).
X2 = sm.add_constant(dc.loc[:, ['meanc', 'meanw', 'meand']])
m2 = sm.OLS(endog=y, exog=X2).fit()

# Only the deviation model is displayed; call m1.summary() to inspect the raw-mean fit.
m2.summary()

  "anyway, n=%i" % int(n))


0,1,2,3
Dep. Variable:,WinPc,R-squared:,0.28
Model:,OLS,Adj. R-squared:,0.136
Method:,Least Squares,F-statistic:,1.947
Date:,"Mon, 12 Mar 2018",Prob (F-statistic):,0.165
Time:,19:17:42,Log-Likelihood:,-2.8382
No. Observations:,19,AIC:,13.68
Df Residuals:,15,BIC:,17.45
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.2430,0.125,1.947,0.070,-0.023 0.509
meanc,0.2439,0.225,1.085,0.295,-0.235 0.723
meanw,0.6187,0.295,2.099,0.053,-0.010 1.247
meand,-0.9578,0.429,-2.235,0.041,-1.871 -0.044

0,1,2,3
Omnibus:,3.27,Durbin-Watson:,0.601
Prob(Omnibus):,0.195,Jarque-Bera (JB):,1.373
Skew:,0.448,Prob(JB):,0.503
Kurtosis:,3.966,Cond. No.,7.5
