In [1]:
import sys
import os
import pandas as pd
import numpy as np
import datetime, time
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
from statsmodels.formula.api import ols
from statsmodels.iolib.summary2 import summary_col
from pylab import hist, show
import scipy
import zipfile


# Notebook-wide pandas display limits: show at most 50 rows and 200 columns
# when a DataFrame is rendered.
for option_name, option_value in (
    ('display.max_rows', 50),
    ('display.max_columns', 200),
):
    pd.set_option(option_name, option_value)

#### $WinPc = \beta_{0} + \beta_{1}MeanC + \beta_{2}MeanW + \beta_{3}MeanD + e_{s}$

- A balanced roster will have one player ranked in each centre position (C1, C2, C3, C4), two wingers ranked on every line (LW1, RW1, LW2, RW2, etc.) and two defensemen ranked in each of the three defensive pairings.
- The mean ranking of a balanced roster is 2.5 [(1+2+3+4)/4] for centres, 2.5 [(1+1+2+2+3+3+4+4)/8] for wingers and 2 [(1+1+2+2+3+3)/6] for defensemen.

- Since players are ranked from 1 to 4 for forwards and 1 to 3 for defensemen, 1 being the highest ranked, a team is considered to have an above average roster when the **mean of each forward position is smaller than 2.5 and the mean of defensive pairings is smaller than 2**. 

### games with 4 centers, 8 wingers and 6 defensemen

In [2]:
# Root of the shared project folder. The default is the original author's
# local path; a collaborator can set the STEPHANOS_SHARE_DIR environment
# variable to their own copy of the folder instead of editing this cell.
share_dir = os.environ.get(
    'STEPHANOS_SHARE_DIR',
    '/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare',
)
roster_csv = os.path.join(
    share_dir, 'out', 'data', 'season_team_roster_ranking_nhl_positions.csv'
)

# Load the season/team roster rankings and discard the 'Unnamed: 0' column
# (presumably a row index written out when the CSV was saved -- confirm).
dc = pd.read_csv(roster_csv)
dc = dc.drop('Unnamed: 0', axis=1)
dc.shape

(19, 16)

In [3]:
# 'Sum' adds the three positional mean rankings into one roster score.
# Players are ranked with 1 as the best, so a smaller Sum is a stronger
# roster; sorting ascending puts the strongest rosters first.
# The frame is rebuilt and rebound rather than sorted with inplace=True, so
# re-running this cell always produces the same `dc`.
dc = (
    dc
    .assign(Sum=dc['MeanC'] + dc['MeanW'] + dc['MeanD'])
    .sort_values('Sum')
)
dc.head(20)

Unnamed: 0,Season,TeamCode,GP,W,L,GF,GA,WinPc,LossPc,MeanC,MeanW,MeanD,RankWin,RankC,RankW,RankD,Sum
12,2010,CGY,7,2,5,22,25,0.285714,0.714286,1.821429,2.267857,1.904762,13.0,2.0,6.0,2.0,5.994048
13,2010,PHI,4,1,3,12,16,0.25,0.75,2.0,2.21875,1.791667,14.5,3.5,5.0,1.0,6.010417
4,2010,CAR,9,6,3,30,26,0.666667,0.333333,1.777778,2.361111,1.962963,6.5,1.0,7.0,3.0,6.101852
6,2010,CHI,6,4,2,21,11,0.666667,0.333333,2.916667,1.75,2.027778,6.5,12.5,1.0,7.5,6.694444
16,2010,TOR,1,0,1,1,4,0.0,1.0,2.0,2.75,2.0,18.0,3.5,16.0,5.0,6.75
3,2010,VAN,9,8,1,34,14,0.888889,0.111111,2.416667,2.152778,2.203704,4.0,6.5,4.0,10.0,6.773148
9,2010,STL,6,2,4,12,18,0.333333,0.666667,2.125,2.708333,2.166667,11.0,5.0,15.0,9.0,7.0
18,2010,PHX,2,0,2,3,10,0.0,1.0,3.0,2.0,2.0,18.0,15.5,2.0,5.0,7.0
0,2010,ANA,6,6,0,18,9,1.0,0.0,2.625,2.479167,2.027778,2.0,10.0,9.0,7.5,7.131944
10,2010,NSH,3,1,2,5,5,0.333333,0.666667,2.916667,2.416667,2.0,11.0,12.5,8.0,5.0,7.333333


- summary analysis

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of dc, used to compare the observed positional means with the
# balanced-roster benchmarks.
dc.describe()

Unnamed: 0,Season,GP,W,L,GF,GA,WinPc,LossPc,MeanC,MeanW,MeanD,RankWin,RankC,RankW,RankD,Sum
count,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0
mean,2010.0,4.631579,2.315789,2.315789,12.631579,12.631579,0.477026,0.522974,2.58702,2.496481,2.264237,10.0,10.0,10.0,10.0,7.347737
std,0.0,2.650168,2.26207,1.887168,9.534793,7.544023,0.340239,0.340239,0.458195,0.359575,0.305583,5.570258,5.617433,5.624846,5.587685,0.874223
min,2010.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.777778,1.75,1.791667,2.0,1.0,1.0,1.0,5.994048
25%,2010.0,2.5,1.0,1.0,5.5,6.5,0.25,0.333333,2.270833,2.243304,2.0,6.5,5.75,5.5,5.0,6.761574
50%,2010.0,4.0,2.0,2.0,10.0,12.0,0.333333,0.666667,2.625,2.575,2.203704,11.0,10.0,10.0,10.0,7.333333
75%,2010.0,6.0,3.0,3.0,19.5,17.0,0.666667,0.75,2.975,2.697917,2.5,14.5,14.75,14.5,13.5,8.10625
max,2010.0,9.0,8.0,7.0,34.0,26.0,1.0,1.0,3.166667,3.15625,2.75,18.0,19.0,19.0,19.0,8.885417


#### model estimation

- regress **team win percent** on the mean of players by position for games with 4 centers, 8 wingers and 6 defensemen.

In [5]:
# Re-express each positional mean as its distance from the balanced-roster
# benchmark (2.5 for centres and wingers, 2 for defensemen). Because rank 1
# is the best, a positive value means the roster is better than balanced at
# that position and a negative value means it is worse.
for centred_col, raw_col, balanced_mean in (
    ('meanc', 'MeanC', 2.5),
    ('meanw', 'MeanW', 2.5),
    ('meand', 'MeanD', 2),
):
    dc[centred_col] = balanced_mean - dc[raw_col]

In [6]:
# Dependent variable: team win percentage.
y = dc['WinPc']

raw_means = ['MeanC', 'MeanW', 'MeanD']          # positional means as ranked
centred_means = ['meanc', 'meanw', 'meand']      # benchmark minus positional mean

# Three specifications of the same regression:
#   X1 -- raw positional means plus a constant
#   X2 -- benchmark-centred means plus a constant
#   X3 -- benchmark-centred means only (no constant column, so m3 is fit
#         without an intercept and statsmodels reports an uncentred R-squared)
X1 = sm.add_constant(dc[raw_means])
X2 = sm.add_constant(dc[centred_means])
X3 = dc[centred_means]

m1, m2, m3 = (sm.OLS(y, design).fit() for design in (X1, X2, X3))

# Only the no-intercept model is displayed; replace the line below with
# m1.summary() or m2.summary() to inspect the other two specifications.
m3.summary()

  "anyway, n=%i" % int(n))


0,1,2,3
Dep. Variable:,WinPc,R-squared:,0.707
Model:,OLS,Adj. R-squared:,0.652
Method:,Least Squares,F-statistic:,12.85
Date:,"Thu, 15 Mar 2018",Prob (F-statistic):,0.000157
Time:,12:33:11,Log-Likelihood:,-4.9789
No. Observations:,19,AIC:,15.96
Df Residuals:,16,BIC:,18.79
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
meanc,0.4499,0.215,2.093,0.053,-0.006 0.906
meanw,0.9527,0.260,3.667,0.002,0.402 1.504
meand,-1.6300,0.275,-5.924,0.000,-2.213 -1.047

0,1,2,3
Omnibus:,9.349,Durbin-Watson:,1.744
Prob(Omnibus):,0.009,Jarque-Bera (JB):,7.42
Skew:,0.935,Prob(JB):,0.0245
Kurtosis:,5.424,Cond. No.,2.45
