In [1]:
import sys
import os
import pandas as pd
import numpy as np
import datetime, time
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
from statsmodels.formula.api import ols
from statsmodels.iolib.summary2 import summary_col
from pylab import hist, show
import scipy
import zipfile


# Notebook display limits: show at most 50 rows and 200 columns of a frame
# before pandas truncates the rendered table.
pd.options.display.max_rows = 50
pd.options.display.max_columns = 200

#### $WinPc = \beta_{0} + \beta_{1}C + \beta_{2}W + \beta_{3}D + e_{s}$

- A balanced roster has one player ranked in each centre position (C1, C2, C3, C4), two wingers ranked on every line (LW1, RW1, LW2, RW2, etc.) and two defensemen ranked in each of the three defensive pairings.
- The mean ranking of a balanced roster is 2.5 [(1+2+3+4)/4] for centres, 2.5 [(1+1+2+2+3+3+4+4)/8] for wingers and 2 [(1+1+2+2+3+3)/6] for defensemen.

- Since players are ranked from 1 to 4 for forwards and 1 to 3 for defensemen, 1 being the highest ranked, a team is considered to have an above-average roster when the **mean ranking of each forward position is smaller than 2.5 and the mean ranking of the defensive pairings is smaller than 2**.

### regular season 

In [9]:
# Load the full regular-season team roster rankings (centres, wingers, defensemen).
#
# The data directory defaults to the original author's shared folder. To run the
# notebook on another machine, set the ROSTER_DATA_DIR environment variable rather
# than editing this cell, e.g.
#   ROSTER_DATA_DIR='/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data'
data_dir = os.environ.get(
    'ROSTER_DATA_DIR',
    '/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data'
)
da = pd.read_csv(os.path.join(data_dir, 'full_season_team_roster_ranking_centers_wingers_defensemen.csv'))
# 'Unnamed: 0' is presumably the row index written out when the CSV was saved;
# it is dropped so that it is not treated as a variable.
da = da.drop('Unnamed: 0', axis=1)
da.head()

Unnamed: 0,Season,TeamCode,GP,W,L,GF,GA,WinPc,LossPc,MeanC,MeanW,MeanD,RankWin,RankC,RankW,RankD
0,2010,VAN,82,54,28,268,190,0.658537,0.341463,1.801626,2.37079,2.228804,1.0,4.0,9.0,13.0
1,2010,PIT,82,49,33,244,203,0.597561,0.402439,2.79126,2.77727,2.022358,2.0,25.0,23.0,8.0
2,2010,SJ,82,48,34,253,216,0.585366,0.414634,1.583537,2.310414,2.382259,3.5,1.0,6.0,18.0
3,2010,WSH,82,48,34,230,203,0.585366,0.414634,2.379065,2.595901,2.521196,3.5,17.0,18.0,26.0
4,2010,PHI,82,47,35,266,233,0.573171,0.426829,1.72561,2.327575,1.735772,6.0,2.0,7.0,1.0


- summary analysis

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the regular-season table.
da.describe()

Unnamed: 0,Season,GP,W,L,GF,GA,WinPc,LossPc,MeanC,MeanW,MeanD,RankWin,RankC,RankW,RankD
count,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
mean,2010.0,82.0,41.0,41.0,234.766667,234.766667,0.5,0.5,2.326917,2.547296,2.266273,15.5,15.5,15.5,15.5
std,0.0,0.0,6.93815,6.93815,22.535311,26.039338,0.084612,0.084612,0.42399,0.315056,0.253673,8.777911,8.803408,8.803408,8.803408
min,2010.0,82.0,25.0,28.0,177.0,190.0,0.304878,0.341463,1.583537,1.7214,1.735772,1.0,1.0,1.0,1.0
25%,2010.0,82.0,37.25,36.0,223.5,212.25,0.454268,0.439024,2.025661,2.363277,2.030372,9.0,8.25,8.25,8.25
50%,2010.0,82.0,43.0,39.0,238.0,237.0,0.52439,0.47561,2.256504,2.573961,2.242995,15.5,15.5,15.5,15.5
75%,2010.0,82.0,46.0,44.75,252.0,246.75,0.560976,0.545732,2.748882,2.772825,2.442581,22.625,22.75,22.75,22.75
max,2010.0,82.0,54.0,57.0,268.0,289.0,0.658537,0.695122,2.985163,3.129065,2.704007,30.0,30.0,30.0,30.0


In [103]:
# Centre each positional mean ranking on its balanced-roster value
# (2.5 for centres and wingers, 2 for defensemen). Because rank 1 is the best,
# a positive value means the team is stronger than a balanced roster there.
for centred_col, mean_col, balanced_mean in (
    ('C', 'MeanC', 2.5),
    ('W', 'MeanW', 2.5),
    ('D', 'MeanD', 2),
):
    da[centred_col] = balanced_mean - da[mean_col]

#### model estimation 

- regress **team win percent** on the mean ranking of players by position (centred on the balanced-roster values: C, W and D) for the full regular season.

In [104]:
# Response: each team's regular-season win percentage.
y = da.loc[:, 'WinPc']
# Design matrix: an intercept plus the centred positional rankings C, W and D.
X1 = sm.add_constant(da.loc[:, ['C', 'W', 'D']])
# Fit by ordinary least squares and display the regression table.
m1 = sm.OLS(endog=y, exog=X1).fit()
m1.summary()

0,1,2,3
Dep. Variable:,WinPc,R-squared:,0.379
Model:,OLS,Adj. R-squared:,0.308
Method:,Least Squares,F-statistic:,5.296
Date:,"Sat, 10 Mar 2018",Prob (F-statistic):,0.00552
Time:,03:26:14,Log-Likelihood:,39.184
No. Observations:,30,AIC:,-70.37
Df Residuals:,26,BIC:,-64.76
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.5185,0.022,24.028,0.000,0.474 0.563
C,0.0087,0.032,0.271,0.789,-0.057 0.074
W,0.1338,0.052,2.551,0.017,0.026 0.242
D,0.0514,0.064,0.798,0.432,-0.081 0.184

0,1,2,3
Omnibus:,0.708,Durbin-Watson:,1.023
Prob(Omnibus):,0.702,Jarque-Bera (JB):,0.732
Skew:,0.147,Prob(JB):,0.694
Kurtosis:,2.293,Cond. No.,6.21


### games with 12 forwards and 6 defensemen

In [105]:
# Load the roster rankings restricted to games with 12 forwards and 6 defensemen.
#
# The data directory defaults to the original author's shared folder. To run the
# notebook on another machine, set the ROSTER_DATA_DIR environment variable rather
# than editing this cell, e.g.
#   ROSTER_DATA_DIR='/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data'
data_dir = os.environ.get(
    'ROSTER_DATA_DIR',
    '/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data'
)
db = pd.read_csv(os.path.join(data_dir, 'season_roster_ranking_forwards_defensemen.csv'))
# 'Unnamed: 0' is presumably the row index written out when the CSV was saved;
# it is dropped so that it is not treated as a variable.
db = db.drop('Unnamed: 0', axis=1)
db.head()

Unnamed: 0,Season,TeamCode,GP,W,L,GF,GA,WinPc,LossPc,MeanC,MeanW,MeanD,RankWin,RankC,RankW,RankD
0,2010,VAN,58,40,18,198,137,0.689655,0.310345,1.825862,2.377292,2.221264,1.0,4.0,9.0,15.0
1,2010,CHI,47,28,19,157,127,0.595745,0.404255,2.097872,1.87272,2.148936,2.0,9.0,2.0,10.0
2,2010,PHI,66,39,27,219,188,0.590909,0.409091,1.719697,2.323653,1.739899,3.0,1.0,6.0,1.0
3,2010,DET,61,36,25,197,181,0.590164,0.409836,2.064754,2.210122,1.806011,4.0,8.0,4.0,2.0
4,2010,ANA,60,35,25,169,168,0.583333,0.416667,3.0075,2.537434,2.208333,5.0,30.0,13.0,12.0


- summary analysis

In [106]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the 12-forward / 6-defensemen games table.
db.describe()

Unnamed: 0,Season,GP,W,L,GF,GA,WinPc,LossPc,MeanC,MeanW,MeanD,RankWin,RankC,RankW,RankD
count,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
mean,2010.0,58.6,29.3,29.3,167.7,167.7,0.494979,0.505021,2.33894,2.551102,2.262987,15.5,15.5,15.5,15.5
std,0.0,12.237872,8.50213,8.183878,40.87251,41.150816,0.104917,0.104917,0.409036,0.316061,0.263292,8.80145,8.803408,8.803408,8.803408
min,2010.0,19.0,4.0,15.0,32.0,56.0,0.210526,0.310345,1.719697,1.717833,1.739899,1.0,1.0,1.0,1.0
25%,2010.0,58.0,22.5,25.0,158.5,157.0,0.460623,0.438796,2.073034,2.369532,2.026281,8.25,8.25,8.25,8.25
50%,2010.0,63.5,32.0,29.5,177.0,172.0,0.520398,0.479602,2.264628,2.581613,2.229122,15.5,15.5,15.5,15.5
75%,2010.0,66.0,35.75,33.0,192.5,191.0,0.561204,0.539377,2.744976,2.757698,2.466667,22.75,22.75,22.75,22.75
max,2010.0,70.0,40.0,44.0,219.0,230.0,0.689655,0.789474,3.0075,3.202485,2.688034,30.0,30.0,30.0,30.0


In [107]:
# Centre each positional mean ranking on its balanced-roster value
# (2.5 for centres and wingers, 2 for defensemen). Because rank 1 is the best,
# a positive value means the team is stronger than a balanced roster there.
for centred_col, mean_col, balanced_mean in (
    ('C', 'MeanC', 2.5),
    ('W', 'MeanW', 2.5),
    ('D', 'MeanD', 2),
):
    db[centred_col] = balanced_mean - db[mean_col]

#### model estimation 

- regress **team win percent** on the mean ranking of players by position (centred on the balanced-roster values: C, W and D) for games with 12 forwards and 6 defensemen.

In [108]:
# Response: each team's win percentage in games with 12 forwards and 6 defensemen.
y = db.loc[:, 'WinPc']
# Design matrix: an intercept plus the centred positional rankings C, W and D.
X1 = sm.add_constant(db.loc[:, ['C', 'W', 'D']])
# Fit by ordinary least squares and display the regression table.
m1 = sm.OLS(endog=y, exog=X1).fit()
m1.summary()

0,1,2,3
Dep. Variable:,WinPc,R-squared:,0.496
Model:,OLS,Adj. R-squared:,0.438
Method:,Least Squares,F-statistic:,8.535
Date:,"Sat, 10 Mar 2018",Prob (F-statistic):,0.000411
Time:,03:26:24,Log-Likelihood:,35.861
No. Observations:,30,AIC:,-63.72
Df Residuals:,26,BIC:,-58.12
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.5291,0.024,22.023,0.000,0.480 0.578
C,-0.0076,0.037,-0.207,0.838,-0.083 0.068
W,0.1799,0.062,2.924,0.007,0.053 0.306
D,0.0901,0.074,1.220,0.233,-0.062 0.242

0,1,2,3
Omnibus:,0.684,Durbin-Watson:,0.945
Prob(Omnibus):,0.71,Jarque-Bera (JB):,0.693
Skew:,0.076,Prob(JB):,0.707
Kurtosis:,2.271,Cond. No.,6.51


### games with 4 centers, 8 wingers and 6 defensemen

In [109]:
# Load the roster rankings restricted to games with 4 centers, 8 wingers and
# 6 defensemen.
#
# The data directory defaults to the original author's shared folder. To run the
# notebook on another machine, set the ROSTER_DATA_DIR environment variable rather
# than editing this cell, e.g.
#   ROSTER_DATA_DIR='/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data'
data_dir = os.environ.get(
    'ROSTER_DATA_DIR',
    '/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data'
)
dc = pd.read_csv(os.path.join(data_dir, 'season_team_roster_ranking_centers_wingers_defensemen.csv'))
# 'Unnamed: 0' is presumably the row index written out when the CSV was saved;
# it is dropped so that it is not treated as a variable.
dc = dc.drop('Unnamed: 0', axis=1)
dc.head()

Unnamed: 0,Season,TeamCode,GP,W,L,GF,GA,WinPc,LossPc,MeanC,MeanW,MeanD,RankWin,RankC,RankW,RankD
0,2010,NSH,16,13,3,55,29,0.8125,0.1875,2.78125,2.546875,2.0,1.0,23.0,14.0,5.5
1,2010,VAN,26,21,5,87,54,0.807692,0.192308,2.009615,2.350962,2.211538,2.0,9.0,8.0,12.0
2,2010,DET,25,18,7,93,63,0.72,0.28,2.0,2.205,1.78,3.0,8.0,5.0,2.0
3,2010,LA,25,17,8,68,54,0.68,0.32,2.59,2.595,2.253333,4.0,20.0,17.0,14.0
4,2010,BUF,18,12,6,62,52,0.666667,0.333333,2.944444,1.923611,2.037037,5.0,27.0,3.0,8.0


- summary analysis

In [110]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the 4-center / 8-winger / 6-defensemen games table.
dc.describe()

Unnamed: 0,Season,GP,W,L,GF,GA,WinPc,LossPc,MeanC,MeanW,MeanD,RankWin,RankC,RankW,RankD
count,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
mean,2010.0,18.733333,9.366667,9.366667,53.133333,53.133333,0.480995,0.519005,2.355433,2.541531,2.273266,15.5,15.5,15.5,15.5
std,0.0,7.750121,5.327116,4.810071,26.654537,25.429732,0.182702,0.182702,0.42722,0.33592,0.264274,8.80047,8.802429,8.802429,8.802429
min,2010.0,4.0,0.0,2.0,7.0,10.0,0.0,0.1875,1.71875,1.75,1.725806,1.0,1.0,1.0,1.0
25%,2010.0,13.25,6.0,6.25,31.0,33.25,0.387311,0.402273,2.002404,2.363221,2.040033,7.875,8.25,8.25,8.25
50%,2010.0,17.0,9.0,8.0,58.0,49.0,0.481066,0.518934,2.319648,2.571094,2.31016,15.5,15.5,15.5,15.5
75%,2010.0,25.0,12.75,12.0,74.25,72.0,0.597727,0.612689,2.767187,2.815385,2.480556,22.75,22.75,22.75,22.75
max,2010.0,33.0,21.0,22.0,102.0,106.0,0.8125,1.0,3.0,3.221154,2.690476,30.0,29.5,30.0,30.0


In [111]:
# Centre each positional mean ranking on its balanced-roster value
# (2.5 for centres and wingers, 2 for defensemen). Because rank 1 is the best,
# a positive value means the team is stronger than a balanced roster there.
for centred_col, mean_col, balanced_mean in (
    ('C', 'MeanC', 2.5),
    ('W', 'MeanW', 2.5),
    ('D', 'MeanD', 2),
):
    dc[centred_col] = balanced_mean - dc[mean_col]

#### model estimation

- regress **team win percent** on the mean ranking of players by position (centred on the balanced-roster values: C, W and D) for games with 4 centers, 8 wingers and 6 defensemen.

In [112]:
# Response: each team's win percentage in games with 4 centers, 8 wingers and 6 defensemen.
y = dc.loc[:, 'WinPc']
# Design matrix: an intercept plus the centred positional rankings C, W and D.
X1 = sm.add_constant(dc.loc[:, ['C', 'W', 'D']])
# Fit by ordinary least squares and display the regression table.
m1 = sm.OLS(endog=y, exog=X1).fit()
m1.summary()

0,1,2,3
Dep. Variable:,WinPc,R-squared:,0.257
Model:,OLS,Adj. R-squared:,0.172
Method:,Least Squares,F-statistic:,3.003
Date:,"Sat, 10 Mar 2018",Prob (F-statistic):,0.0486
Time:,03:26:32,Log-Likelihood:,13.4
No. Observations:,30,AIC:,-18.8
Df Residuals:,26,BIC:,-13.2
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.5157,0.052,9.884,0.000,0.408 0.623
C,-0.0338,0.075,-0.454,0.654,-0.187 0.119
W,0.2418,0.121,2.006,0.055,-0.006 0.490
D,0.0725,0.155,0.469,0.643,-0.245 0.390

0,1,2,3
Omnibus:,0.669,Durbin-Watson:,0.681
Prob(Omnibus):,0.716,Jarque-Bera (JB):,0.286
Skew:,-0.239,Prob(JB):,0.867
Kurtosis:,3.013,Cond. No.,6.32
