In [1]:
import sys
import os
import pandas as pd
import numpy as np
import datetime, time
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
from statsmodels.formula.api import ols
from statsmodels.iolib.summary2 import summary_col
from pylab import hist, show
import scipy
import zipfile


# Notebook-wide display limits: how many rows / columns pandas renders before truncating.
for option_name, limit in (('display.max_rows', 50), ('display.max_columns', 200)):
    pd.set_option(option_name, limit)

#### $Win = \beta_{0} + \beta_{1}MeanC + \beta_{2}MeanW + \beta_{3}MeanD + e_{s}$

## regular season 

In [2]:
# Load the per-player, per-game ranking data for the full regular season.
# The CSV sits in a shared folder whose absolute location differs between collaborators,
# so try each known location in turn instead of commenting paths in and out.
da_path_candidates = [
    '/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/full_season_team_game_player_rank_nhl_positions.csv',
    '/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/full_season_team_game_player_rank_nhl_positions.csv',
]
# Fall back to the first candidate so that a missing file still raises read_csv's usual error.
da_path = next((p for p in da_path_candidates if os.path.exists(p)), da_path_candidates[0])
da = pd.read_csv(da_path)
# 'Unnamed: 0' is presumably the row index written by an earlier to_csv call; it is not used below.
da = da.drop('Unnamed: 0', axis=1)
da.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank,GF,GA,GD,WinTeam,LossTeam,RosterCount,PositionCount,CCount,WCount,DCount
0,2010,20001,MTL,11.0,GOMEZ,C,3.0,2,3,1,TOR,MTL,18.0,7.0,7.0,5.0,6.0
1,2010,20001,MTL,21.0,GIONTA,W,2.0,2,3,1,TOR,MTL,18.0,5.0,7.0,5.0,6.0
2,2010,20001,MTL,57.0,POULIOT,W,2.0,2,3,1,TOR,MTL,18.0,5.0,7.0,5.0,6.0
3,2010,20001,MTL,26.0,GORGES,D,3.0,2,3,1,TOR,MTL,18.0,6.0,7.0,5.0,6.0
4,2010,20001,MTL,75.0,GILL,D,3.0,2,3,1,TOR,MTL,18.0,6.0,7.0,5.0,6.0


In [3]:
# Grouping keys: one player in one game, and one (position, rank) slot of one team in one game.
player_keys = ['Season', 'GameNumber', 'TeamCode', 'PlayerNumber']
slot_keys = ['Season', 'GameNumber', 'TeamCode', 'Position', 'Rank']

da = da.rename(columns={'PlayerPosition': 'Position'})
# Number of rows each player has within a game.
da['playercount'] = da.groupby(player_keys)['PlayerNumber'].transform('count')
# Total of those counts within the team's (position, rank) slot, broadcast back to every row.
da['rosterposition'] = da.groupby(slot_keys)['playercount'].transform('sum')
da.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,PlayerName,Position,Rank,GF,GA,GD,WinTeam,LossTeam,RosterCount,PositionCount,CCount,WCount,DCount,playercount,rosterposition
0,2010,20001,MTL,11.0,GOMEZ,C,3.0,2,3,1,TOR,MTL,18.0,7.0,7.0,5.0,6.0,1.0,4.0
1,2010,20001,MTL,21.0,GIONTA,W,2.0,2,3,1,TOR,MTL,18.0,5.0,7.0,5.0,6.0,1.0,3.0
2,2010,20001,MTL,57.0,POULIOT,W,2.0,2,3,1,TOR,MTL,18.0,5.0,7.0,5.0,6.0,1.0,3.0
3,2010,20001,MTL,26.0,GORGES,D,3.0,2,3,1,TOR,MTL,18.0,6.0,7.0,5.0,6.0,1.0,3.0
4,2010,20001,MTL,75.0,GILL,D,3.0,2,3,1,TOR,MTL,18.0,6.0,7.0,5.0,6.0,1.0,3.0


#### pivot table

- The next step is to group players by game number, team code, position and rank, to show the quality of players each team has at each position. Build a **pivot table** by player position and rank using the roster-position values, with game number and team as the index, then join the column levels to generate one column per position and rank.

In [4]:
# Reshape from one row per player to one row per team per game, with one column per
# (Position, Rank) pair holding the 'rosterposition' value for that slot.
# pivot_table aggregates with the mean by default; 'rosterposition' was computed with a
# transform over (Season, GameNumber, TeamCode, Position, Rank), so it is constant inside
# every pivot cell and the mean returns that same value.
da = pd.pivot_table(
    da,
    index=['Season', 'GameNumber', 'WinTeam', 'LossTeam', 'GF', 'GA', 'GD', 'TeamCode', 'RosterCount', 'CCount', 'WCount', 'DCount'],
    columns=['Position', 'Rank'],
    values=['rosterposition'],
)
da = da.reset_index()
# Flatten the multi-level column labels, e.g. ('rosterposition', 'C', 1.0) -> 'rosterposition_C_1.0'.
# The former index columns have empty lower levels, which `if s` drops, so their names are unchanged.
da.columns = ['_'.join(str(s).strip() for s in col if s) for col in da.columns]
# (A bare `da.reset_index()` used to sit here; its result was discarded, so it has been removed.)
# A (position, rank) slot that a team did not fill is NaN after the pivot; treat it as zero players.
da = da.fillna(0)
da = da.rename(columns={
    'rosterposition_C_1.0': 'C1', 'rosterposition_C_2.0': 'C2', 'rosterposition_C_3.0': 'C3', 'rosterposition_C_4.0': 'C4',
    'rosterposition_W_1.0': 'W1', 'rosterposition_W_2.0': 'W2', 'rosterposition_W_3.0': 'W3', 'rosterposition_W_4.0': 'W4',
    'rosterposition_D_1.0': 'D1', 'rosterposition_D_2.0': 'D2', 'rosterposition_D_3.0': 'D3',
})
da.head(10)

Unnamed: 0,Season,GameNumber,WinTeam,LossTeam,GF,GA,GD,TeamCode,RosterCount,CCount,WCount,DCount,C1,C2,C3,C4,D1,D2,D3,W1,W2,W3,W4
0,2010,20001,TOR,MTL,2,3,1,MTL,18.0,7.0,5.0,6.0,1.0,1.0,4.0,1.0,2.0,1.0,3.0,0.0,3.0,1.0,1.0
1,2010,20001,TOR,MTL,3,2,1,TOR,18.0,5.0,7.0,6.0,2.0,1.0,0.0,2.0,1.0,4.0,1.0,2.0,2.0,0.0,3.0
2,2010,20002,PHI,PIT,2,3,-1,PIT,18.0,8.0,4.0,6.0,3.0,1.0,2.0,2.0,2.0,2.0,2.0,0.0,3.0,0.0,1.0
3,2010,20002,PHI,PIT,3,2,-1,PHI,18.0,5.0,7.0,6.0,2.0,1.0,2.0,0.0,2.0,3.0,1.0,3.0,1.0,1.0,2.0
4,2010,20003,CAR,MIN,3,4,-1,MIN,18.0,4.0,8.0,6.0,1.0,0.0,3.0,0.0,1.0,0.0,5.0,1.0,3.0,2.0,2.0
5,2010,20003,CAR,MIN,4,3,-1,CAR,18.0,6.0,6.0,6.0,2.0,0.0,3.0,1.0,3.0,1.0,2.0,3.0,1.0,0.0,2.0
6,2010,20004,COL,CHI,3,4,1,CHI,18.0,3.0,9.0,6.0,1.0,1.0,0.0,1.0,2.0,0.0,4.0,3.0,4.0,1.0,1.0
7,2010,20004,COL,CHI,4,3,1,COL,18.0,6.0,6.0,6.0,1.0,3.0,1.0,1.0,1.0,0.0,5.0,2.0,2.0,1.0,1.0
8,2010,20005,EDM,CGY,0,4,4,CGY,18.0,3.0,9.0,6.0,1.0,2.0,0.0,0.0,1.0,2.0,3.0,2.0,3.0,1.0,3.0
9,2010,20005,EDM,CGY,4,0,4,EDM,18.0,6.0,6.0,6.0,0.0,1.0,4.0,1.0,0.0,3.0,3.0,1.0,3.0,1.0,1.0


In [5]:
# (rows, columns) of the pivoted full-season frame.
da.shape

(2460, 23)

- Assign a value of 1 to the team that won the game and a value of 0 to the team that lost. Compute the mean rank by position for each team in each game.

In [6]:
# Win indicator: 1 when this row's team is the game's winning team, otherwise 0.
# The column comparison is vectorised; it replaces a row-wise apply(lambda ...) and
# produces the same 0/1 values without a Python-level loop over rows.
da['Win'] = (da['WinTeam'] == da['TeamCode']).astype(int)

# Mean rank per position group: sum over ranks of (rank * players at that rank),
# divided by the number of players at the position. C and W use ranks 1-4, D uses ranks 1-3.
da['MeanC'] = sum(rank * da['C{}'.format(rank)] for rank in range(1, 5)) / da['CCount']
da['MeanW'] = sum(rank * da['W{}'.format(rank)] for rank in range(1, 5)) / da['WCount']
da['MeanD'] = sum(rank * da['D{}'.format(rank)] for rank in range(1, 4)) / da['DCount']
da.head()

Unnamed: 0,Season,GameNumber,WinTeam,LossTeam,GF,GA,GD,TeamCode,RosterCount,CCount,WCount,DCount,C1,C2,C3,C4,D1,D2,D3,W1,W2,W3,W4,Win,MeanC,MeanW,MeanD
0,2010,20001,TOR,MTL,2,3,1,MTL,18.0,7.0,5.0,6.0,1.0,1.0,4.0,1.0,2.0,1.0,3.0,0.0,3.0,1.0,1.0,0,2.714286,2.6,2.166667
1,2010,20001,TOR,MTL,3,2,1,TOR,18.0,5.0,7.0,6.0,2.0,1.0,0.0,2.0,1.0,4.0,1.0,2.0,2.0,0.0,3.0,1,2.4,2.571429,2.0
2,2010,20002,PHI,PIT,2,3,-1,PIT,18.0,8.0,4.0,6.0,3.0,1.0,2.0,2.0,2.0,2.0,2.0,0.0,3.0,0.0,1.0,0,2.375,2.5,2.0
3,2010,20002,PHI,PIT,3,2,-1,PHI,18.0,5.0,7.0,6.0,2.0,1.0,2.0,0.0,2.0,3.0,1.0,3.0,1.0,1.0,2.0,1,2.0,2.285714,1.833333
4,2010,20003,CAR,MIN,3,4,-1,MIN,18.0,4.0,8.0,6.0,1.0,0.0,3.0,0.0,1.0,0.0,5.0,1.0,3.0,2.0,2.0,0,2.5,2.625,2.666667


#### summary analysis

In [7]:
# Summary statistics of the three mean-rank measures, split by game outcome (Win = 0 / 1).
# The columns are selected with a list: the bare form ['MeanC', 'MeanW', 'MeanD'] is parsed
# as a single tuple key, which newer pandas releases deprecate and then reject.
da.groupby(['Win'])[['MeanC', 'MeanW', 'MeanD']].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,MeanC,MeanW,MeanD
Win,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,count,1230.0,1230.0,1230.0
0,mean,2.528284,2.437863,2.28548
0,std,0.4558,0.351467,0.281204
0,min,1.2,1.4,1.666667
0,25%,2.2,2.142857,2.0
0,50%,2.571429,2.428571,2.333333
0,75%,2.857143,2.666667,2.5
0,max,3.666667,3.6,3.0
1,count,1230.0,1230.0,1230.0
1,mean,2.464629,2.372526,2.235673


In [8]:
# Centre each mean rank on the midpoint of its rank scale (2.5 for ranks 1-4, 2 for ranks 1-3)
# with the sign flipped, so a positive value means a mean rank below the midpoint.
da['meanc'] = 2.5 - da['MeanC']
da['meanw'] = 2.5 - da['MeanW']
da['meand'] = 2 - da['MeanD']

y = da['Win']
X1 = sm.add_constant(da[['MeanC', 'MeanW', 'MeanD']])
X2 = sm.add_constant(da[['meanc', 'meanw', 'meand']])

# OLS of the 0/1 Win indicator on the raw (m1) and centred (m2) mean ranks.
# NOTE(review): each game contributes two rows (one per team) with opposite outcomes, so the
# residuals are unlikely to be independent; consider standard errors clustered by game - TODO confirm.
m1 = sm.OLS(y, X1).fit()
m2 = sm.OLS(y, X2).fit()

# A notebook cell renders only its last expression, so a bare `m1.summary()` placed before
# `m2.summary()` was silently discarded. Print it explicitly so both models are shown.
print(m1.summary())
m2.summary()

0,1,2,3
Dep. Variable:,Win,R-squared:,0.015
Model:,OLS,Adj. R-squared:,0.014
Method:,Least Squares,F-statistic:,12.3
Date:,"Wed, 14 Mar 2018",Prob (F-statistic):,5.52e-08
Time:,11:58:08,Log-Likelihood:,-1767.1
No. Observations:,2460,AIC:,3542.0
Df Residuals:,2456,BIC:,3565.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.5120,0.015,33.615,0.000,0.482 0.542
meanc,0.0500,0.024,2.123,0.034,0.004 0.096
meanw,0.1113,0.031,3.620,0.000,0.051 0.172
meand,0.0872,0.039,2.213,0.027,0.010 0.164

0,1,2,3
Omnibus:,0.003,Durbin-Watson:,3.965
Prob(Omnibus):,0.999,Jarque-Bera (JB):,386.33
Skew:,-0.003,Prob(JB):,1.29e-84
Kurtosis:,1.059,Cond. No.,4.54


### games with 12 forwards and 6 defensemen

In [9]:
# Load the per-player, per-game ranking data for the 12-forward / 6-defenseman games.
# The CSV sits in a shared folder whose absolute location differs between collaborators,
# so try each known location in turn instead of commenting paths in and out.
db_path_candidates = [
    '/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/12f_6d_game_team_player_ranking_nhl_positions.csv',
    '/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/12f_6d_game_team_player_ranking_nhl_positions.csv',
]
# Fall back to the first candidate so that a missing file still raises read_csv's usual error.
db_path = next((p for p in db_path_candidates if os.path.exists(p)), db_path_candidates[0])
db = pd.read_csv(db_path)
# 'Unnamed: 0' is presumably the row index written by an earlier to_csv call; it is not used below.
db = db.drop('Unnamed: 0', axis=1)
db.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank,GF,GA,GD,WinTeam,LossTeam,RosterCount,PositionCount,CCount,WCount,DCount
0,2010,20001,MTL,11.0,GOMEZ,C,3.0,2,3,1,TOR,MTL,18.0,7.0,7.0,5.0,6.0
1,2010,20001,MTL,21.0,GIONTA,W,2.0,2,3,1,TOR,MTL,18.0,5.0,7.0,5.0,6.0
2,2010,20001,MTL,57.0,POULIOT,W,2.0,2,3,1,TOR,MTL,18.0,5.0,7.0,5.0,6.0
3,2010,20001,MTL,26.0,GORGES,D,3.0,2,3,1,TOR,MTL,18.0,6.0,7.0,5.0,6.0
4,2010,20001,MTL,75.0,GILL,D,3.0,2,3,1,TOR,MTL,18.0,6.0,7.0,5.0,6.0


In [10]:
# Grouping keys: one player in one game, and one (position, rank) slot of one team in one game.
player_keys = ['Season', 'GameNumber', 'TeamCode', 'PlayerNumber']
slot_keys = ['Season', 'GameNumber', 'TeamCode', 'Position', 'Rank']

db = db.rename(columns={'PlayerPosition': 'Position'})
# Number of rows each player has within a game.
db['playercount'] = db.groupby(player_keys)['PlayerNumber'].transform('count')
# Total of those counts within the team's (position, rank) slot, broadcast back to every row.
db['rosterposition'] = db.groupby(slot_keys)['playercount'].transform('sum')
db.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,PlayerName,Position,Rank,GF,GA,GD,WinTeam,LossTeam,RosterCount,PositionCount,CCount,WCount,DCount,playercount,rosterposition
0,2010,20001,MTL,11.0,GOMEZ,C,3.0,2,3,1,TOR,MTL,18.0,7.0,7.0,5.0,6.0,1.0,4.0
1,2010,20001,MTL,21.0,GIONTA,W,2.0,2,3,1,TOR,MTL,18.0,5.0,7.0,5.0,6.0,1.0,3.0
2,2010,20001,MTL,57.0,POULIOT,W,2.0,2,3,1,TOR,MTL,18.0,5.0,7.0,5.0,6.0,1.0,3.0
3,2010,20001,MTL,26.0,GORGES,D,3.0,2,3,1,TOR,MTL,18.0,6.0,7.0,5.0,6.0,1.0,3.0
4,2010,20001,MTL,75.0,GILL,D,3.0,2,3,1,TOR,MTL,18.0,6.0,7.0,5.0,6.0,1.0,3.0


#### pivot table

- The next step is to group players by game number, team code, position and rank, to show the quality of players each team has at each position. Build a **pivot table** by player position and rank using the roster-position values, with game number and team as the index, then join the column levels to generate one column per position and rank.

In [11]:
# Reshape from one row per player to one row per team per game, with one column per
# (Position, Rank) pair holding the 'rosterposition' value for that slot.
# pivot_table aggregates with the mean by default; 'rosterposition' was computed with a
# transform over (Season, GameNumber, TeamCode, Position, Rank), so it is constant inside
# every pivot cell and the mean returns that same value.
db = pd.pivot_table(
    db,
    index=['Season', 'GameNumber', 'WinTeam', 'LossTeam', 'GF', 'GA', 'GD', 'TeamCode', 'RosterCount', 'CCount', 'WCount', 'DCount'],
    columns=['Position', 'Rank'],
    values=['rosterposition'],
)
db = db.reset_index()
# Flatten the multi-level column labels, e.g. ('rosterposition', 'C', 1.0) -> 'rosterposition_C_1.0'.
# The former index columns have empty lower levels, which `if s` drops, so their names are unchanged.
db.columns = ['_'.join(str(s).strip() for s in col if s) for col in db.columns]
# (A bare `db.reset_index()` used to sit here; its result was discarded, so it has been removed.)
# A (position, rank) slot that a team did not fill is NaN after the pivot; treat it as zero players.
db = db.fillna(0)
db = db.rename(columns={
    'rosterposition_C_1.0': 'C1', 'rosterposition_C_2.0': 'C2', 'rosterposition_C_3.0': 'C3', 'rosterposition_C_4.0': 'C4',
    'rosterposition_W_1.0': 'W1', 'rosterposition_W_2.0': 'W2', 'rosterposition_W_3.0': 'W3', 'rosterposition_W_4.0': 'W4',
    'rosterposition_D_1.0': 'D1', 'rosterposition_D_2.0': 'D2', 'rosterposition_D_3.0': 'D3',
})
db.head(10)

Unnamed: 0,Season,GameNumber,WinTeam,LossTeam,GF,GA,GD,TeamCode,RosterCount,CCount,WCount,DCount,C1,C2,C3,C4,D1,D2,D3,W1,W2,W3,W4
0,2010,20001,TOR,MTL,2,3,1,MTL,18.0,7.0,5.0,6.0,1.0,1.0,4.0,1.0,2.0,1.0,3.0,0.0,3.0,1.0,1.0
1,2010,20001,TOR,MTL,3,2,1,TOR,18.0,5.0,7.0,6.0,2.0,1.0,0.0,2.0,1.0,4.0,1.0,2.0,2.0,0.0,3.0
2,2010,20002,PHI,PIT,2,3,-1,PIT,18.0,8.0,4.0,6.0,3.0,1.0,2.0,2.0,2.0,2.0,2.0,0.0,3.0,0.0,1.0
3,2010,20002,PHI,PIT,3,2,-1,PHI,18.0,5.0,7.0,6.0,2.0,1.0,2.0,0.0,2.0,3.0,1.0,3.0,1.0,1.0,2.0
4,2010,20003,CAR,MIN,3,4,-1,MIN,18.0,4.0,8.0,6.0,1.0,0.0,3.0,0.0,1.0,0.0,5.0,1.0,3.0,2.0,2.0
5,2010,20003,CAR,MIN,4,3,-1,CAR,18.0,6.0,6.0,6.0,2.0,0.0,3.0,1.0,3.0,1.0,2.0,3.0,1.0,0.0,2.0
6,2010,20004,COL,CHI,3,4,1,CHI,18.0,3.0,9.0,6.0,1.0,1.0,0.0,1.0,2.0,0.0,4.0,3.0,4.0,1.0,1.0
7,2010,20004,COL,CHI,4,3,1,COL,18.0,6.0,6.0,6.0,1.0,3.0,1.0,1.0,1.0,0.0,5.0,2.0,2.0,1.0,1.0
8,2010,20005,EDM,CGY,0,4,4,CGY,18.0,3.0,9.0,6.0,1.0,2.0,0.0,0.0,1.0,2.0,3.0,2.0,3.0,1.0,3.0
9,2010,20005,EDM,CGY,4,0,4,EDM,18.0,6.0,6.0,6.0,0.0,1.0,4.0,1.0,0.0,3.0,3.0,1.0,3.0,1.0,1.0


In [12]:
# (rows, columns) of the pivoted 12-forward / 6-defenseman frame.
db.shape

(1702, 23)

- Assign a value of 1 to the team that won the game and a value of 0 to the team that lost. Compute the mean rank by position for each team in each game.

In [13]:
# Win indicator: 1 when this row's team is the game's winning team, otherwise 0.
# The column comparison is vectorised; it replaces a row-wise apply(lambda ...) and
# produces the same 0/1 values without a Python-level loop over rows.
db['Win'] = (db['WinTeam'] == db['TeamCode']).astype(int)

# Mean rank per position group: sum over ranks of (rank * players at that rank),
# divided by the number of players at the position. C and W use ranks 1-4, D uses ranks 1-3.
db['MeanC'] = sum(rank * db['C{}'.format(rank)] for rank in range(1, 5)) / db['CCount']
db['MeanW'] = sum(rank * db['W{}'.format(rank)] for rank in range(1, 5)) / db['WCount']
db['MeanD'] = sum(rank * db['D{}'.format(rank)] for rank in range(1, 4)) / db['DCount']
db.head()

Unnamed: 0,Season,GameNumber,WinTeam,LossTeam,GF,GA,GD,TeamCode,RosterCount,CCount,WCount,DCount,C1,C2,C3,C4,D1,D2,D3,W1,W2,W3,W4,Win,MeanC,MeanW,MeanD
0,2010,20001,TOR,MTL,2,3,1,MTL,18.0,7.0,5.0,6.0,1.0,1.0,4.0,1.0,2.0,1.0,3.0,0.0,3.0,1.0,1.0,0,2.714286,2.6,2.166667
1,2010,20001,TOR,MTL,3,2,1,TOR,18.0,5.0,7.0,6.0,2.0,1.0,0.0,2.0,1.0,4.0,1.0,2.0,2.0,0.0,3.0,1,2.4,2.571429,2.0
2,2010,20002,PHI,PIT,2,3,-1,PIT,18.0,8.0,4.0,6.0,3.0,1.0,2.0,2.0,2.0,2.0,2.0,0.0,3.0,0.0,1.0,0,2.375,2.5,2.0
3,2010,20002,PHI,PIT,3,2,-1,PHI,18.0,5.0,7.0,6.0,2.0,1.0,2.0,0.0,2.0,3.0,1.0,3.0,1.0,1.0,2.0,1,2.0,2.285714,1.833333
4,2010,20003,CAR,MIN,3,4,-1,MIN,18.0,4.0,8.0,6.0,1.0,0.0,3.0,0.0,1.0,0.0,5.0,1.0,3.0,2.0,2.0,0,2.5,2.625,2.666667


#### summary analysis

In [14]:
# Summary statistics of the three mean-rank measures, split by game outcome (Win = 0 / 1).
# The columns are selected with a list: the bare form ['MeanC', 'MeanW', 'MeanD'] is parsed
# as a single tuple key, which newer pandas releases deprecate and then reject.
db.groupby(['Win'])[['MeanC', 'MeanW', 'MeanD']].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,MeanC,MeanW,MeanD
Win,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,count,851.0,851.0,851.0
0,mean,2.524687,2.43335,2.267137
0,std,0.440502,0.342716,0.288888
0,min,1.25,1.4,1.666667
0,25%,2.2,2.142857,2.0
0,50%,2.5,2.428571,2.333333
0,75%,2.833333,2.666667,2.5
0,max,3.625,3.6,3.0
1,count,851.0,851.0,851.0
1,mean,2.471935,2.364086,2.204269


In [15]:
# Centre each mean rank on the midpoint of its rank scale (2.5 for ranks 1-4, 2 for ranks 1-3)
# with the sign flipped, so a positive value means a mean rank below the midpoint.
db['meanc'] = 2.5 - db['MeanC']
db['meanw'] = 2.5 - db['MeanW']
db['meand'] = 2 - db['MeanD']

y = db['Win']
X1 = sm.add_constant(db[['MeanC', 'MeanW', 'MeanD']])
X2 = sm.add_constant(db[['meanc', 'meanw', 'meand']])

# OLS of the 0/1 Win indicator on the raw (m1) and centred (m2) mean ranks.
# NOTE(review): each game contributes two rows (one per team) with opposite outcomes, so the
# residuals are unlikely to be independent; consider standard errors clustered by game - TODO confirm.
m1 = sm.OLS(y, X1).fit()
m2 = sm.OLS(y, X2).fit()

# A notebook cell renders only its last expression, so a bare `m1.summary()` placed before
# `m2.summary()` was silently discarded. Print it explicitly so both models are shown.
print(m1.summary())
m2.summary()

0,1,2,3
Dep. Variable:,Win,R-squared:,0.018
Model:,OLS,Adj. R-squared:,0.016
Method:,Least Squares,F-statistic:,10.14
Date:,"Wed, 14 Mar 2018",Prob (F-statistic):,1.28e-06
Time:,11:58:08,Log-Likelihood:,-1220.2
No. Observations:,1702,AIC:,2448.0
Df Residuals:,1698,BIC:,2470.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.5158,0.018,28.787,0.000,0.481 0.551
meanc,0.0359,0.030,1.192,0.234,-0.023 0.095
meanw,0.1191,0.038,3.099,0.002,0.044 0.194
meand,0.1186,0.048,2.459,0.014,0.024 0.213

0,1,2,3
Omnibus:,0.001,Durbin-Watson:,3.963
Prob(Omnibus):,0.999,Jarque-Bera (JB):,264.198
Skew:,-0.002,Prob(JB):,4.27e-58
Kurtosis:,1.07,Cond. No.,4.75


### games with 4 centers, 8 wingers and 6 defensemen

In [16]:
# Load the per-player, per-game ranking data for the 4-center / 8-winger games.
# The CSV sits in a shared folder whose absolute location differs between collaborators,
# so try each known location in turn instead of commenting paths in and out.
dc_path_candidates = [
    '/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/4c_8w_4d_game_team_player_ranking_nhl_positions.csv',
    '/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/4c_8w_4d_game_team_player_ranking_nhl_positions.csv',
]
# Fall back to the first candidate so that a missing file still raises read_csv's usual error.
dc_path = next((p for p in dc_path_candidates if os.path.exists(p)), dc_path_candidates[0])
dc = pd.read_csv(dc_path)
# 'Unnamed: 0' is presumably the row index written by an earlier to_csv call; it is not used below.
dc = dc.drop('Unnamed: 0', axis=1)
dc.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank,GF,GA,GD,WinTeam,LossTeam,RosterCount,PositionCount,CCount,WCount,DCount
0,2010,20025,LA,11.0,KOPITAR,C,1.0,1,3,2,CGY,LA,18.0,4.0,4.0,8.0,6.0
1,2010,20025,LA,23.0,BROWN,W,1.0,1,3,2,CGY,LA,18.0,8.0,4.0,8.0,6.0
2,2010,20025,LA,94.0,SMYTH,W,2.0,1,3,2,CGY,LA,18.0,8.0,4.0,8.0,6.0
3,2010,20025,LA,8.0,DOUGHTY,D,1.0,1,3,2,CGY,LA,18.0,6.0,4.0,8.0,6.0
4,2010,20025,LA,33.0,MITCHELL,D,3.0,1,3,2,CGY,LA,18.0,6.0,4.0,8.0,6.0


In [17]:
# Grouping keys: one player in one game, and one (position, rank) slot of one team in one game.
player_keys = ['Season', 'GameNumber', 'TeamCode', 'PlayerNumber']
slot_keys = ['Season', 'GameNumber', 'TeamCode', 'Position', 'Rank']

dc = dc.rename(columns={'PlayerPosition': 'Position'})
# Number of rows each player has within a game.
dc['playercount'] = dc.groupby(player_keys)['PlayerNumber'].transform('count')
# Total of those counts within the team's (position, rank) slot, broadcast back to every row.
dc['rosterposition'] = dc.groupby(slot_keys)['playercount'].transform('sum')
dc.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,PlayerName,Position,Rank,GF,GA,GD,WinTeam,LossTeam,RosterCount,PositionCount,CCount,WCount,DCount,playercount,rosterposition
0,2010,20025,LA,11.0,KOPITAR,C,1.0,1,3,2,CGY,LA,18.0,4.0,4.0,8.0,6.0,1.0,1.0
1,2010,20025,LA,23.0,BROWN,W,1.0,1,3,2,CGY,LA,18.0,8.0,4.0,8.0,6.0,1.0,2.0
2,2010,20025,LA,94.0,SMYTH,W,2.0,1,3,2,CGY,LA,18.0,8.0,4.0,8.0,6.0,1.0,2.0
3,2010,20025,LA,8.0,DOUGHTY,D,1.0,1,3,2,CGY,LA,18.0,6.0,4.0,8.0,6.0,1.0,2.0
4,2010,20025,LA,33.0,MITCHELL,D,3.0,1,3,2,CGY,LA,18.0,6.0,4.0,8.0,6.0,1.0,4.0


#### pivot table

- The next step is to group players by game number, team code, position and rank, to show the quality of players each team has at each position. Build a **pivot table** by player position and rank using the roster-position values, with game number and team as the index, then join the column levels to generate one column per position and rank.

In [18]:
# Reshape from one row per player to one row per team per game, with one column per
# (Position, Rank) pair holding the 'rosterposition' value for that slot.
# pivot_table aggregates with the mean by default; 'rosterposition' was computed with a
# transform over (Season, GameNumber, TeamCode, Position, Rank), so it is constant inside
# every pivot cell and the mean returns that same value.
dc = pd.pivot_table(
    dc,
    index=['Season', 'GameNumber', 'WinTeam', 'LossTeam', 'GF', 'GA', 'GD', 'TeamCode', 'RosterCount', 'CCount', 'WCount', 'DCount'],
    columns=['Position', 'Rank'],
    values=['rosterposition'],
)
dc = dc.reset_index()
# Flatten the multi-level column labels, e.g. ('rosterposition', 'C', 1.0) -> 'rosterposition_C_1.0'.
# The former index columns have empty lower levels, which `if s` drops, so their names are unchanged.
dc.columns = ['_'.join(str(s).strip() for s in col if s) for col in dc.columns]
# (A bare `dc.reset_index()` used to sit here; its result was discarded, so it has been removed.)
# A (position, rank) slot that a team did not fill is NaN after the pivot; treat it as zero players.
dc = dc.fillna(0)
dc = dc.rename(columns={
    'rosterposition_C_1.0': 'C1', 'rosterposition_C_2.0': 'C2', 'rosterposition_C_3.0': 'C3', 'rosterposition_C_4.0': 'C4',
    'rosterposition_W_1.0': 'W1', 'rosterposition_W_2.0': 'W2', 'rosterposition_W_3.0': 'W3', 'rosterposition_W_4.0': 'W4',
    'rosterposition_D_1.0': 'D1', 'rosterposition_D_2.0': 'D2', 'rosterposition_D_3.0': 'D3',
})
dc.head(10)

Unnamed: 0,Season,GameNumber,WinTeam,LossTeam,GF,GA,GD,TeamCode,RosterCount,CCount,WCount,DCount,C1,C2,C3,C4,D1,D2,D3,W1,W2,W3,W4
0,2010,20025,CGY,LA,1,3,2,LA,18.0,4.0,8.0,6.0,1.0,0.0,2.0,1.0,2.0,0.0,4.0,2.0,2.0,1.0,3.0
1,2010,20025,CGY,LA,3,1,2,CGY,18.0,4.0,8.0,6.0,1.0,2.0,0.0,1.0,1.0,2.0,3.0,2.0,3.0,1.0,2.0
2,2010,20037,WSH,NYI,1,2,1,NYI,18.0,4.0,8.0,6.0,0.0,0.0,3.0,1.0,0.0,3.0,3.0,3.0,1.0,0.0,4.0
3,2010,20037,WSH,NYI,2,1,1,WSH,18.0,4.0,8.0,6.0,1.0,1.0,0.0,2.0,1.0,1.0,4.0,2.0,4.0,1.0,1.0
4,2010,20076,MIN,VAN,2,6,4,VAN,18.0,4.0,8.0,6.0,2.0,0.0,0.0,2.0,2.0,1.0,3.0,2.0,3.0,1.0,2.0
5,2010,20076,MIN,VAN,6,2,4,MIN,18.0,4.0,8.0,6.0,1.0,0.0,3.0,0.0,1.0,1.0,4.0,1.0,3.0,2.0,2.0
6,2010,20116,LA,MIN,4,5,-1,MIN,18.0,4.0,8.0,6.0,1.0,0.0,3.0,0.0,1.0,1.0,4.0,1.0,3.0,2.0,2.0
7,2010,20116,LA,MIN,5,4,-1,LA,18.0,4.0,8.0,6.0,1.0,0.0,3.0,0.0,1.0,0.0,5.0,2.0,2.0,1.0,3.0
8,2010,20363,VAN,CGY,2,7,-5,CGY,18.0,4.0,8.0,6.0,1.0,3.0,0.0,0.0,2.0,3.0,1.0,2.0,3.0,2.0,1.0
9,2010,20363,VAN,CGY,7,2,-5,VAN,18.0,4.0,8.0,6.0,2.0,0.0,0.0,2.0,2.0,2.0,2.0,3.0,3.0,1.0,1.0


In [19]:
# (rows, columns) of the pivoted 4-center / 8-winger frame.
dc.shape

(88, 23)

- Assign a value of 1 to the team that won the game and a value of 0 to the team that lost. Compute the mean rank by position for each team in each game.

In [20]:
# Win indicator: 1 when this row's team is the game's winning team, otherwise 0.
# The column comparison is vectorised; it replaces a row-wise apply(lambda ...) and
# produces the same 0/1 values without a Python-level loop over rows.
dc['Win'] = (dc['WinTeam'] == dc['TeamCode']).astype(int)

# Mean rank per position group: sum over ranks of (rank * players at that rank),
# divided by the number of players at the position. C and W use ranks 1-4, D uses ranks 1-3.
dc['MeanC'] = sum(rank * dc['C{}'.format(rank)] for rank in range(1, 5)) / dc['CCount']
dc['MeanW'] = sum(rank * dc['W{}'.format(rank)] for rank in range(1, 5)) / dc['WCount']
dc['MeanD'] = sum(rank * dc['D{}'.format(rank)] for rank in range(1, 4)) / dc['DCount']
dc.head()

Unnamed: 0,Season,GameNumber,WinTeam,LossTeam,GF,GA,GD,TeamCode,RosterCount,CCount,WCount,DCount,C1,C2,C3,C4,D1,D2,D3,W1,W2,W3,W4,Win,MeanC,MeanW,MeanD
0,2010,20025,CGY,LA,1,3,2,LA,18.0,4.0,8.0,6.0,1.0,0.0,2.0,1.0,2.0,0.0,4.0,2.0,2.0,1.0,3.0,0,2.75,2.625,2.333333
1,2010,20025,CGY,LA,3,1,2,CGY,18.0,4.0,8.0,6.0,1.0,2.0,0.0,1.0,1.0,2.0,3.0,2.0,3.0,1.0,2.0,1,2.25,2.375,2.333333
2,2010,20037,WSH,NYI,1,2,1,NYI,18.0,4.0,8.0,6.0,0.0,0.0,3.0,1.0,0.0,3.0,3.0,3.0,1.0,0.0,4.0,0,3.25,2.625,2.5
3,2010,20037,WSH,NYI,2,1,1,WSH,18.0,4.0,8.0,6.0,1.0,1.0,0.0,2.0,1.0,1.0,4.0,2.0,4.0,1.0,1.0,1,2.75,2.125,2.5
4,2010,20076,MIN,VAN,2,6,4,VAN,18.0,4.0,8.0,6.0,2.0,0.0,0.0,2.0,2.0,1.0,3.0,2.0,3.0,1.0,2.0,0,2.5,2.375,2.166667


#### summary analysis

In [21]:
# Summary statistics of the three mean-rank measures, split by game outcome (Win = 0 / 1).
# The columns are selected with a list: the bare form ['MeanC', 'MeanW', 'MeanD'] is parsed
# as a single tuple key, which newer pandas releases deprecate and then reject.
dc.groupby(['Win'])[['MeanC', 'MeanW', 'MeanD']].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,MeanC,MeanW,MeanD
Win,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,count,44.0,44.0,44.0
0,mean,2.482955,2.525568,2.208333
0,std,0.545589,0.343735,0.307371
0,min,1.25,1.625,1.666667
0,25%,2.0,2.34375,2.0
0,50%,2.5,2.5,2.166667
0,75%,3.0,2.75,2.5
0,max,3.25,3.25,2.833333
1,count,44.0,44.0,44.0
1,mean,2.482955,2.420455,2.223485


In [31]:
# Centre each mean rank on the midpoint of its rank scale (2.5 for ranks 1-4, 2 for ranks 1-3)
# with the sign flipped, so a positive value means a mean rank below the midpoint.
for centred_name, raw_name, midpoint in (
    ('meanc', 'MeanC', 2.5),
    ('meanw', 'MeanW', 2.5),
    ('meand', 'MeanD', 2),
):
    dc[centred_name] = midpoint - dc[raw_name]

y = dc['Win']
centred_cols = ['meanc', 'meanw', 'meand']
X1 = sm.add_constant(dc[['MeanC', 'MeanW', 'MeanD']])   # raw means plus intercept
X2 = sm.add_constant(dc[centred_cols])                  # centred means plus intercept
X3 = dc[centred_cols]                                   # centred means, no constant column

# Fit the three OLS specifications in the same order as their design matrices.
m1, m2, m3 = (sm.OLS(y, design).fit() for design in (X1, X2, X3))

# m1 and m2 are fitted but their summaries are not shown; only the no-intercept model m3 is displayed.
# NOTE(review): statsmodels is documented to report an uncentred R-squared when the model has
# no constant, so m3's R-squared is presumably not comparable with the intercept models - confirm.
m3.summary()


0,1,2,3
Dep. Variable:,Win,R-squared:,0.361
Model:,OLS,Adj. R-squared:,0.339
Method:,Least Squares,F-statistic:,16.03
Date:,"Wed, 14 Mar 2018",Prob (F-statistic):,2.41e-08
Time:,12:03:29,Log-Likelihood:,-74.641
No. Observations:,88,AIC:,155.3
Df Residuals:,85,BIC:,162.7
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
meanc,0.3270,0.133,2.451,0.016,0.062 0.592
meanw,0.8126,0.185,4.384,0.000,0.444 1.181
meand,-1.3026,0.194,-6.707,0.000,-1.689 -0.916

0,1,2,3
Omnibus:,27.985,Durbin-Watson:,2.895
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5.425
Skew:,0.097,Prob(JB):,0.0664
Kurtosis:,1.799,Cond. No.,2.06
