In [92]:
import sys
import os
import pandas as pd
import numpy as np
import datetime, time
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
from statsmodels.formula.api import ols
from statsmodels.iolib.summary2 import summary_col
from statsmodels.stats.outliers_influence import variance_inflation_factor
from pylab import hist, show
import scipy
import zipfile
from math import log


pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 200)

#### $Win = \beta_{0} + \beta_{1}MeanC + \beta_{2}MeanW + \beta_{3}MeanD + e_{s}$

- A balanced roster will have one player ranked in each centre position (C1, C2, C3, C4), two wingers ranked on every line (LW1, RW1, LW2,RW2, etc) and two defensemen ranked in all three defensive pairings. 
- The ranking of a balanced roster is 2.5 [(1+2+3+4)/4] centres, 2.5 [(1+1+2+2+3+3+4+4)/8] for wingers and 2 [(1+1+2+2+3+3)/6] for defensemen.

- Since players are ranked from 1 to 4 for forwards and 1 to 3 for defensemen, 1 being the highest ranked, a team is considered to have an above average roster when the **mean of each forward position is smaller than 2.5 and the mean of defensive pairings is smaller than 2**. 

### games with 4 centers, 8 wingers, 6 defensemen

In [21]:
da = pd.read_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/2010_2017_4c_8w_6d_game_team.csv')
#da = pd.readcsv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/2010_2017_games_with_4c_8w_6d_1g.csv')
da = da.drop('Unnamed: 0', axis=1)

In [22]:
da.shape

(15794, 17)

In [23]:
da['playercount'] = da.groupby(['Season', 'GameNumber', 'TeamCode', 'PlayerName',])['PlayerName'].transform('count')
da['rosterposition'] = da.groupby(['Season', 'GameNumber', 'TeamCode', 'Position', 'Rank'])['playercount'].transform('sum')

da.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerName,Position,Rank,GF,GA,GD,WinTeam,LossTeam,RosterCount,PositionCount,CCount,WCount,DCount,GCount,playercount,rosterposition
0,2010,20041,VAN,RYAN KESLER,C,1.0,3,4,-1,ANA,VAN,19,4,4.0,8.0,6.0,1.0,1,2
1,2010,20041,VAN,MANNY MALHOTRA,C,3.0,3,4,-1,ANA,VAN,19,4,4.0,8.0,6.0,1.0,1,1
2,2010,20041,VAN,JANNIK HANSEN,W,2.0,3,4,-1,ANA,VAN,19,8,4.0,8.0,6.0,1.0,1,2
3,2010,20041,VAN,HENRIK SEDIN,C,1.0,3,4,-1,ANA,VAN,19,4,4.0,8.0,6.0,1.0,1,2
4,2010,20041,VAN,RICK RYPIEN,C,4.0,3,4,-1,ANA,VAN,19,4,4.0,8.0,6.0,1.0,1,1


#### pivot table

- the next step is to group players by gamenumber, teamcode, position and rank, to display the quality of players each team has per position. **Pivot table** by player position and rank using roster position values. Game number and team are the indexes. We want to join the levels to generate columns by roster position and rank. 

In [24]:
da = pd.pivot_table(da, index=['Season', 'GameNumber', 'WinTeam', 'LossTeam', 'GF', 'GA', 'GD', 'TeamCode', 'RosterCount', 'CCount', 'WCount', 'DCount', 'GCount'], columns=['Position', 'Rank'], values=['rosterposition'])
da = da.reset_index()
da.columns = ['_'.join(str(s).strip() for s in col if s) for col in da.columns]
da.reset_index()
da = da.fillna(0)
da = da.rename(columns={'rosterposition_C_1.0': 'C1', 'rosterposition_C_2.0': 'C2', 'rosterposition_C_3.0': 'C3', 'rosterposition_C_4.0': 'C4', 'rosterposition_W_1.0': 'W1', 'rosterposition_W_2.0': 'W2', 'rosterposition_W_3.0': 'W3', 'rosterposition_W_4.0': 'W4', 'rosterposition_D_1.0': 'D1', 'rosterposition_D_2.0': 'D2', 'rosterposition_D_3.0': 'D3', 'rosterposition_G_1.0': 'G1', 'rosterposition_G_2.0': 'G2', 'rosterposition_G_3.0': 'G3' })
da = da[['Season', 'GameNumber', 'TeamCode', 'GF', 'GA', 'GD', 'WinTeam', 'LossTeam', 'RosterCount', 'CCount', 'WCount', 'DCount', 'GCount', 'C1', 'C2', 'C3', 'C4', 'D1', 'D2', 'D3', 'G1', 'G2', 'G3', 'W1', 'W2', 'W3', 'W4']]
da.head(10)

Unnamed: 0,Season,GameNumber,TeamCode,GF,GA,GD,WinTeam,LossTeam,RosterCount,CCount,WCount,DCount,GCount,C1,C2,C3,C4,D1,D2,D3,G1,G2,G3,W1,W2,W3,W4
0,2010,20041,VAN,3,4,-1,ANA,VAN,19,4.0,8.0,6.0,1.0,2.0,0.0,1.0,1.0,2.0,3.0,1.0,1.0,0.0,0.0,3.0,2.0,1.0,2.0
1,2010,20041,ANA,4,3,1,ANA,VAN,19,4.0,8.0,6.0,1.0,1.0,1.0,2.0,0.0,2.0,0.0,4.0,1.0,0.0,0.0,3.0,1.0,3.0,1.0
2,2010,20061,MIN,2,3,-1,CBJ,MIN,19,4.0,8.0,6.0,1.0,1.0,2.0,1.0,0.0,2.0,1.0,3.0,0.0,1.0,0.0,1.0,2.0,3.0,2.0
3,2010,20061,CBJ,3,2,1,CBJ,MIN,19,4.0,8.0,6.0,1.0,1.0,2.0,1.0,0.0,0.0,4.0,2.0,0.0,0.0,1.0,1.0,4.0,3.0,0.0
4,2010,20076,VAN,2,6,-4,MIN,VAN,20,4.0,8.0,6.0,2.0,2.0,0.0,1.0,1.0,2.0,1.0,3.0,2.0,0.0,0.0,3.0,2.0,1.0,2.0
5,2010,20076,MIN,6,2,4,MIN,VAN,19,4.0,8.0,6.0,1.0,1.0,2.0,1.0,0.0,2.0,1.0,3.0,0.0,1.0,0.0,1.0,2.0,3.0,2.0
6,2010,20084,PHI,2,3,-1,ANA,PHI,19,4.0,8.0,6.0,1.0,2.0,0.0,2.0,0.0,1.0,5.0,0.0,0.0,1.0,0.0,5.0,2.0,0.0,1.0
7,2010,20084,ANA,3,2,1,ANA,PHI,19,4.0,8.0,6.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,0.0,0.0,1.0,3.0,1.0,3.0,1.0
8,2010,20101,TOR,2,5,-3,PHI,TOR,19,4.0,8.0,6.0,1.0,1.0,1.0,2.0,0.0,2.0,2.0,2.0,0.0,0.0,1.0,2.0,3.0,2.0,1.0
9,2010,20101,PHI,5,2,3,PHI,TOR,19,4.0,8.0,6.0,1.0,2.0,0.0,2.0,0.0,1.0,5.0,0.0,0.0,1.0,0.0,5.0,1.0,0.0,2.0


- Assign a value of 1 to the team that won the game and a value of 0 to the team that loss. Compute the mean by position per team for each game.

In [25]:
da['Win'] = da.apply(lambda x: 1 if x['WinTeam']== x['TeamCode'] else 0, 1)
da['MeanC'] = ((da['C1']*1) + (da['C2']*2) + (da['C3']*3) + (da['C4'] *4))/da['CCount']
da['MeanW'] = ((da['W1']*1) + (da['W2']*2) + (da['W3']*3) + (da['W4'] *4))/da['WCount']
da['MeanD'] = ((da['D1']*1) + (da['D2']*2) + (da['D3']*3))/da['DCount']
da['MeanG'] = ((da['G1']*1) + (da['G2']*2) + (da['G3']*3))/da['GCount']
da.sort_values(['GameNumber'], ascending=[True], inplace=True)
da.head()

Unnamed: 0,Season,GameNumber,TeamCode,GF,GA,GD,WinTeam,LossTeam,RosterCount,CCount,WCount,DCount,GCount,C1,C2,C3,C4,D1,D2,D3,G1,G2,G3,W1,W2,W3,W4,Win,MeanC,MeanW,MeanD,MeanG
726,2017,20003,CGY,0,3,-3,EDM,CGY,19,4.0,8.0,6.0,1.0,1.0,0.0,3.0,0.0,2.0,2.0,2.0,1.0,0.0,0.0,2.0,3.0,2.0,1.0,0,2.5,2.25,2.0,1.0
727,2017,20003,EDM,3,0,3,EDM,CGY,19,4.0,8.0,6.0,1.0,3.0,1.0,0.0,0.0,0.0,4.0,2.0,0.0,1.0,0.0,0.0,3.0,4.0,1.0,1,1.25,2.75,2.333333,2.0
640,2016,20007,WSH,4,5,-1,PIT,WSH,19,4.0,8.0,6.0,1.0,2.0,2.0,0.0,0.0,4.0,2.0,0.0,1.0,0.0,0.0,5.0,1.0,1.0,1.0,0,1.5,1.75,1.333333,1.0
641,2016,20007,PIT,5,4,1,PIT,WSH,19,4.0,8.0,6.0,1.0,2.0,2.0,0.0,0.0,2.0,4.0,0.0,0.0,1.0,0.0,3.0,4.0,0.0,1.0,1,1.5,1.875,1.666667,2.0
370,2013,20008,N.J,0,3,-3,PIT,N.J,19,4.0,8.0,6.0,1.0,1.0,2.0,1.0,0.0,5.0,1.0,0.0,0.0,1.0,0.0,2.0,4.0,2.0,0.0,0,2.0,2.0,1.166667,2.0


In [26]:
me = da.copy()
me = me[me['Season'] == 2010]
me.shape

(156, 32)

In [27]:
da.shape

(828, 32)

In [28]:
da['Season'].value_counts()

2010    156
2011    154
2017    102
2014    102
2013    100
2016     86
2015     68
2012     60
Name: Season, dtype: int64

In [63]:
df = da.groupby(['Win'])['MeanC',  'MeanW',  'MeanD'].mean()
df =  df.T
df['bf']  =  df[1]/df[0]
df

Win,0,1,bf
MeanC,2.146739,2.036836,0.948805
MeanW,2.171498,2.063104,0.950083
MeanD,1.955314,1.845411,0.943792


### summary analysis

In [70]:
da.groupby(['Win'])['MeanC', 'MeanW', 'MeanD', 'MeanG'].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,MeanC,MeanW,MeanD,MeanG
Win,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,count,414.0,414.0,414.0,414.0
0,mean,2.146739,2.171498,1.955314,1.96256
0,std,0.446328,0.347439,0.356277,0.677679
0,min,1.0,1.0,1.166667,1.0
0,25%,1.75,2.0,1.666667,1.5
0,50%,2.25,2.125,2.0,2.0
0,75%,2.5,2.375,2.166667,2.0
0,max,3.5,3.125,2.833333,3.0
1,count,414.0,414.0,414.0,414.0
1,mean,2.036836,2.063104,1.845411,1.758454


### model estimation

- regress **team win percent** on the mean of players by position for games with 4 centers, 8 wingers and 6 defensemen.

In [71]:
da['meanc'] = 2.5 - da['MeanC']
da['meanw'] = (2.5 - da['MeanW'])*2
da['meand'] = (2 - da['MeanD'])*2
da['meang'] = 2 - da['MeanG']

In [112]:
y1 = da['Win'] 
y2 = da['GF']
y3 = da['GA']
y4 = da['GD']

X1 = sm.add_constant(da[['meanc', 'meanw', 'meand']] )

m1 = sm.OLS(y1, X1).fit()
m2 = sm.OLS(y2, X1).fit()
m3 = sm.OLS(y3, X1).fit()
m4 = sm.OLS(y4, X1).fit()

#m1.summary()
m2.summary()
#m3.summary()
#m4.summary()

0,1,2,3
Dep. Variable:,GF,R-squared:,0.022
Model:,OLS,Adj. R-squared:,0.018
Method:,Least Squares,F-statistic:,6.046
Date:,"Thu, 05 Jul 2018",Prob (F-statistic):,0.00045
Time:,14:58:45,Log-Likelihood:,-1607.7
No. Observations:,828,AIC:,3223.0
Df Residuals:,824,BIC:,3242.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,2.5879,0.098,26.341,0.000,2.395 2.781
meanc,0.2763,0.142,1.952,0.051,-0.002 0.554
meanw,0.2427,0.090,2.685,0.007,0.065 0.420
meand,0.0605,0.093,0.650,0.516,-0.122 0.243

0,1,2,3
Omnibus:,16.272,Durbin-Watson:,1.958
Prob(Omnibus):,0.0,Jarque-Bera (JB):,16.573
Skew:,0.328,Prob(JB):,0.000252
Kurtosis:,2.774,Cond. No.,3.84


In [122]:
da['nC1'] = da['C1'].astype(float)
da['lnPC1'] = log(da['nC1'])
#da['lnPW1'] = (2.5 - da['MeanW'])*2
#da['lnPD1'] = (2 - da['MeanD'])*2

TypeError: cannot convert the series to <class 'float'>

In [None]:
y1 = da['Win'] 
y2 = da['GF']
y3 = da['GA']
y4 = da['GD']

X1 = sm.add_constant(da[['meanc', 'meanw', 'meand']] )

m1 = sm.OLS(y1, X1).fit()
m2 = sm.OLS(y2, X1).fit()
m3 = sm.OLS(y3, X1).fit()
m4 = sm.OLS(y4, X1).fit()

#m1.summary()
m2.summary()
#m3.summary()
#m4.summary()

In [120]:
c = da['meanc']
w = da['meanw']
d = da['meand']

c0 = (da['C1'] == 0)
c1 = (da['C1'] == 1)
c2 = (da['C1'] == 2)
c3 = (da['C1'] == 3)
c4 = (da['C1'] == 4)



NameError: name 'C1' is not defined

### calculate  and inspect Variance Inflation Factor (VIF)

- Not correlated: $VIF=1$
- Moderately correlated: **$1<VIF<5$** or at a more conservative level of **$1<VIF <2.5$**
- Highly correlated: **$VIF>=5$** or at a more conservative level **$VIF>=5$**

#### $Win = \beta_{0} + \beta_{1}MeanC + \beta_{2}MeanW + \beta_{3}MeanD + e_{s}$

In [34]:
# For each X1, calculate VIF and save in dataframe
vif1 = pd.DataFrame()
vif1['VIF Factor'] = [variance_inflation_factor(X1.values, i) for i in range(X1.shape[1])]
vif1['features'] = X1.columns
vif1.round(3)

Unnamed: 0,VIF Factor,features
0,57.323,const
1,1.127,MeanC
2,1.158,MeanW
3,1.229,MeanD


the variance inflation factors for the constant of this model is **not acceptable** as $VIF<5$

In [62]:
# For each X2, calculate VIF and save in dataframe
vif2 = pd.DataFrame()
vif2['VIF Factor'] = [variance_inflation_factor(X2.values, i) for i in range(X2.shape[1])]
vif2['features'] = X2.columns
vif2.round(3)

Unnamed: 0,VIF Factor,features
0,2.796,const
1,1.127,meanc
2,1.158,meanw
3,1.229,meand


the variance inflation factors for this model are **acceptable** as $VIF<5$

In [36]:
beginningtex = """\\documentclass{report}
\\usepackage{booktabs}
\\begin{document}"""
endtex = "\end{document}"

f = open('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/analysis/vif_2010-2017.tex', 'w')
f.write(beginningtex)
f.write(vif2.to_latex())
f.write(endtex)
f.close()