# Data

## season_game_level_data

In [161]:
import sys
import os
import pandas as pd
import numpy as np
import datetime, time
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
from pylab import hist, show
import scipy
import zipfile


pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 200)

**data frames used in this notebook:**
- da = pbp
- dg = season_games_data
- dm = play_by_play
- dp = player_rankings
- dw = team_roster_player_rank
- dv = season_team_roster_ranking
- dx = season_game_team_roster
- dz = season_team
- dy = season_game_roster

**for analysis:**
- ds = season_level
- dl = season_game_level
- dt = season_game_team_level

In [162]:
pwd

'/Users/stefanostselios/Desktop/nhl_roster_design-master'

### import play by play data set

In [163]:
da = pd.read_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/pbp_merged.csv')
#da = pd.read_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/pbp_merged.csv')
da = da.drop('Unnamed: 0', axis=1)
da = da.rename(columns={'TeamCode': 'EventTeamCode'})


keep regular season games

In [164]:
da = da[da['GameNumber'] <= 21230]

drop irrelevant data

In [165]:
da = da[da['EventType']!='STOP']
da = da[da['EventType']!='EISTR']
da = da[da['EventType']!='EIEND']

- display goals for each game and drop duplicates.

In [166]:
dh = da[da['EventTeamCode']==da['HTeamCode']]
dh = dh.rename(columns={'EventTeamCode': 'HTeam'})
dh['goal'] = dh.apply(lambda x: 1 if (x['EventType'] == 'GOAL') else 0, axis=1)
dh['HGF'] = dh.groupby(['Season', 'GameNumber', 'HTeam'])['goal'].transform('sum')
dh = dh[['Season', 'GameNumber', 'HGF']]
dh = dh.drop_duplicates(['Season', 'GameNumber'])

In [167]:
dv = da[da['EventTeamCode']==da['VTeamCode']]
dv = dv.rename(columns={'EventTeamCode': 'VTeam'})
dv['goal'] = dv.apply(lambda x: 1 if (x['EventType'] == 'GOAL') else 0, axis=1)
dv['VGF'] = dv.groupby(['Season', 'GameNumber', 'VTeam'])['goal'].transform('sum')
dv = dv[['Season', 'GameNumber', 'VGF']]
dv = dv.drop_duplicates(['Season', 'GameNumber'])

Merge into season-game data

In [168]:
# dg = dp[['Season', 'GameNumber', 'EventTeamCode', 'VTeamCode', 'HTeamCode']]
# dg = dg.drop_duplicates(['Season', 'GameNumber',  'EventTeamCode'])
# dg = dg.rename(columns={'EventTeamCode': 'Team'})
# dg['Opp'] = dg.apply(lambda x: x['HTeamCode'] if x['Team'] == x['HTeamCode'] else x['VTeamCode'], axis=1)
# dg['Designation'] = dg.apply(lambda x: 'home' if x['Team'] == x['HTeamCode'] else 'away', axis=1)
# dg = dg[['Season', 'GameNumber', 'Team', 'Opp', 'Designation']]

In [169]:
dg = da[['Season', 'GameNumber', 'VTeamCode', 'HTeamCode']]
dg = dg.drop_duplicates(['Season', 'GameNumber'])

In [170]:
dg = pd.merge(dg, dh, on=['Season', 'GameNumber'], how='left')
dg = pd.merge(dg, dv, on=['Season', 'GameNumber'], how='left')

- find the goal differential per game with respect to home team.

In [171]:
dg['GD'] = dg['HGF'] - dg['VGF']
dg['WinTeam'] = dg.apply(lambda x: x['HTeamCode'] if x['GD'] > 0 else x['VTeamCode'], axis=1)
dg['LossTeam'] = dg.apply(lambda x: x['HTeamCode'] if x['WinTeam'] != x['HTeamCode'] else x['VTeamCode'], axis=1)

In [172]:
dg.head()

Unnamed: 0,Season,GameNumber,VTeamCode,HTeamCode,HGF,VGF,GD,WinTeam,LossTeam
0,2010,20001,MTL,TOR,3,2,1,TOR,MTL
1,2010,20002,PHI,PIT,2,3,-1,PHI,PIT
2,2010,20003,CAR,MIN,3,4,-1,CAR,MIN
3,2010,20004,CHI,COL,4,3,1,COL,CHI
4,2010,20005,CGY,EDM,4,0,4,EDM,CGY


- display goals againest per team.

In [173]:
dg['VGA'] = dg['HGF']
dg['HGA'] = dg['VGF']

In [174]:
dg = dg[['Season', 'GameNumber', 'VTeamCode', 'HTeamCode', 'VGF', 'HGF', 'GD', 'VGA', 'HGA', 'WinTeam', 'LossTeam']]
dg.head()

Unnamed: 0,Season,GameNumber,VTeamCode,HTeamCode,VGF,HGF,GD,VGA,HGA,WinTeam,LossTeam
0,2010,20001,MTL,TOR,2,3,1,3,2,TOR,MTL
1,2010,20002,PHI,PIT,3,2,-1,2,3,PHI,PIT
2,2010,20003,CAR,MIN,4,3,-1,3,4,CAR,MIN
3,2010,20004,CHI,COL,3,4,1,4,3,COL,CHI
4,2010,20005,CGY,EDM,0,4,4,4,0,EDM,CGY


In [175]:
dg.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/season_game_data.csv', index='False', sep=',')
#dg.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/season_game_data.csv', index='False', sep=',')

## season_level_data

In [176]:
dm = da

events that happened in regulation time only

In [177]:
dm = dm[dm['Period'] <= 3]
dm = dm[dm['Period'] >= 1]

- **reshape the data set from wide to long.**

In [178]:
dm = dm.sort_values(['Season', 'GameNumber', 'Period', 'EventNumber'], ascending=[True, True, True, True])

In [179]:
a = [col for col in dm.columns if 'VPlayer' in col]
b = [col for col in dm.columns if 'HPlayer' in col]
c = [col for col in dm.columns if 'VPosition' in col]
d = [col for col in dm.columns if 'HPosition' in col]
dm = pd.lreshape(dm, {'VPlayer' : a, 'HPlayer' : b, 'VPosition' : c, 'HPosition': d})

In [180]:
dm.shape

(1796745, 24)

In [181]:
dm.columns

Index(['AdvantageType', 'EventDetail', 'EventNumber', 'EventTeamCode',
       'EventTimeFromTwenty', 'EventTimeFromZero', 'EventType', 'GameDate',
       'GameNumber', 'HTeamCode', 'Length', 'PenaltyType', 'Period',
       'PlayerName', 'PlayerNumber', 'Season', 'ShotResult', 'ShotType',
       'VTeamCode', 'Zone', 'VPlayer', 'HPlayer', 'VPosition', 'HPosition'],
      dtype='object')

In [182]:
dm = dm.rename(columns={'PlayerNumber': 'EventPlayerNumber', 'TeamCode': 'EventTeamCode', 'PlayerName': 'EventPlayerName' })
dm = dm[['Season', 'GameNumber', 'GameDate', 'Period', 'AdvantageType', 'Zone', 'EventNumber', 'EventType', 'EventDetail', 'EventTeamCode', 'EventPlayerNumber', 'EventPlayerName', 'EventTimeFromZero', 'EventTimeFromTwenty', 'VTeamCode', 'VPlayer', 'VPosition', 'HTeamCode', 'HPlayer', 'HPosition', 'ShotType', 'ShotResult', 'Length', 'PenaltyType']]
dm = dm.sort_values(['Season', 'GameNumber', 'Period', 'EventNumber'], ascending=[True, True, True, True])

- fill in advantage type with even strength 'EV' and event player number with 'TEAM'

In [183]:
dm['AdvantageType'] = dm['AdvantageType'].fillna('EV')
dm['EventPlayerNumber'] = dm['EventPlayerNumber'].fillna('TEAM')

- save new datast as play by play

In [184]:
dm.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/play_by_play.csv', index='False', sep=',')
#dm.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/play_by_play.csv', index='False', sep=',')

#### create new data set and keep variables: 
- (a) game number.
- (b) visitor team information.
- (c) home team information.

In [185]:
df = dm[['Season', 'GameNumber', 'VTeamCode', 'VPlayer', 'VPosition', 'HTeamCode', 'HPlayer', 'HPosition']]
df = df.sort_values(['Season', 'GameNumber'], ascending=[True, True])
df.head()

Unnamed: 0,Season,GameNumber,VTeamCode,VPlayer,VPosition,HTeamCode,HPlayer,HPosition
0,2010,20001,MTL,11.0,C,TOR,37.0,C
310113,2010,20001,MTL,21.0,R,TOR,9.0,R
620126,2010,20001,MTL,57.0,L,TOR,11.0,L
930061,2010,20001,MTL,26.0,D,TOR,3.0,D
1239931,2010,20001,MTL,75.0,D,TOR,22.0,D


- merge season_game_data (dg) on new dataset

In [186]:
df = pd.merge(df, dg, on=['Season', 'GameNumber', 'VTeamCode', 'HTeamCode'], how='left')
df.head()    

Unnamed: 0,Season,GameNumber,VTeamCode,VPlayer,VPosition,HTeamCode,HPlayer,HPosition,VGF,HGF,GD,VGA,HGA,WinTeam,LossTeam
0,2010,20001,MTL,11.0,C,TOR,37.0,C,2,3,1,3,2,TOR,MTL
1,2010,20001,MTL,21.0,R,TOR,9.0,R,2,3,1,3,2,TOR,MTL
2,2010,20001,MTL,57.0,L,TOR,11.0,L,2,3,1,3,2,TOR,MTL
3,2010,20001,MTL,26.0,D,TOR,3.0,D,2,3,1,3,2,TOR,MTL
4,2010,20001,MTL,75.0,D,TOR,22.0,D,2,3,1,3,2,TOR,MTL


- reshape the data to have home and visitor team observatons under the same coloumns. 

In [187]:
a = [col for col in df.columns if 'Player' in col]
b = [col for col in df.columns if 'Position' in col]
c = [col for col in df.columns if 'TeamCode' in col]
d = [col for col in df.columns if 'GF' in col]
e = [col for col in df.columns if 'GA' in col]
df = pd.lreshape(df, {'PlayerNumber' : a, 'PlayerPosition' : b, 'TeamCode' : c, 'GF' : d, 'GA' : e })
df = df[['Season', 'GameNumber', 'TeamCode', 'PlayerNumber', 'GF', 'GA', 'GD', 'WinTeam', 'LossTeam']]
df.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,GF,GA,GD,WinTeam,LossTeam
0,2010,20001,MTL,11.0,2,3,1,TOR,MTL
1,2010,20001,MTL,21.0,2,3,1,TOR,MTL
2,2010,20001,MTL,57.0,2,3,1,TOR,MTL
3,2010,20001,MTL,26.0,2,3,1,TOR,MTL
4,2010,20001,MTL,75.0,2,3,1,TOR,MTL


### import player position and rankings

In [188]:
dp = pd.read_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/stats_per_player.csv')
#dp = pd.read_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/stats_per_player.csv')
dp = dp.drop('Unnamed: 0', axis=1)
dp = dp.rename(columns={'Position': 'PlayerPosition'})

In [189]:
dp = dp[['Season', 'TeamCode', 'PlayerNumber', 'PlayerName', 'PlayerPosition']]
dp.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,PlayerPosition
0,2010,MTL,11.0,GOMEZ,C
1,2010,TOR,37.0,BRENT,C
2,2010,MTL,14.0,PLEKANEC,C
3,2010,MTL,76.0,SUBBAN,LD
4,2010,TOR,35.0,GIGUERE,G


In [190]:
dp.shape

(1058, 5)

In [191]:
dr = pd.read_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/player_rank_by_points_per_time_on_ice.csv')
#dr = pd.read_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/player_rank_by_points_per_time_on_ice.csv')
dr = dr.drop('Unnamed: 0', axis=1)
dr = dr.rename(columns={'Position': 'PlayerPosition'})

In [192]:
dr = dr[['Season', 'TeamCode', 'PlayerNumber', 'PlayerName', 'PlayerPosition', 'Rank']]
dr = dr.sort_values(['TeamCode'], ascending=[True])
dr.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank
709,2010,ANA,5.0,SBISA,RD,3
503,2010,ANA,28.0,CHIPCHURA,C,4
313,2010,ANA,42.0,SEXTON,LW,3
731,2010,ANA,3.0,LILJA,RD,3
496,2010,ANA,22.0,MARCHANT,C,3


- merge player position and player rankings and drop goaltenders

In [193]:
ds = pd.merge(dp, dr, on=['Season', 'TeamCode', 'PlayerNumber', 'PlayerName', 'PlayerPosition'], how='left')
ds['Rank'] = ds.groupby(['Season', 'TeamCode', 'PlayerName'])['Rank'].apply(lambda x: x.ffill().bfill())
ds.shape

(1058, 6)

In [194]:
ds.isnull().sum()

Season              0
TeamCode            0
PlayerNumber        0
PlayerName          0
PlayerPosition      0
Rank              256
dtype: int64

skaters that have played less than 9 games were not included in the clusters and therefore not ranked. Since the failed to make the roster on a regular basis, forwards are assigned to the 4th line and defensemen to the bottom (3rd) pairing.

In [195]:
ds['Rank'] = ds.apply(lambda x: 1 if ((x['PlayerPosition'] == 'G') & (pd.isnull(x.Rank))) else 3 if ((x['PlayerPosition'] == 'RD') & (pd.isnull(x.Rank))) else 3 if ((x['PlayerPosition'] == 'LD') & (pd.isnull(x.Rank))) else 4 if ((x['PlayerPosition'] == 'LW') & (pd.isnull(x.Rank))) else 4 if ((x['PlayerPosition'] == 'RW') & (pd.isnull(x.Rank))) else 4 if ((x['PlayerPosition'] == 'C') & (pd.isnull(x.Rank))) else x['Rank'], axis=1)
ds = ds.sort_values(['TeamCode'], ascending=[True])
ds.shape

(1058, 6)

In [196]:
ds1 = ds[(ds['PlayerPosition'] == 'RD') & (ds['Rank'] == 4)]
ds1.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank


- Derek Smith, a defenseman for the Ottawa Senators has a ranking of 4, which is incorrect since we have 3 defensive pairings. For that reason, he is assigned a rank of 3 which represents the bottom defensive pairing

In [197]:
ds['Rank'] = ds.apply(lambda x: 3 if ((x['PlayerPosition'] == 'RD') & (x['TeamCode'] == 'OTT') & (x['PlayerName'] == 'SMITH') & (x['PlayerNumber'] == 51.0)) else x['Rank'], axis=1)

In [198]:
ds1 = ds[(ds['PlayerPosition'] == 'RD') & (ds['Rank'] == 4)]
ds1.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank


- **display each player by team per game. Drop duplicates.**

In [199]:
dw = pd.merge(df, ds, on=['Season', 'TeamCode', 'PlayerNumber'], how='left')
dw = dw[['Season', 'GameNumber', 'TeamCode', 'PlayerNumber', 'PlayerName', 'PlayerPosition', 'Rank', 'GF', 'GA', 'GD', 'WinTeam', 'LossTeam']]
dw.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank,GF,GA,GD,WinTeam,LossTeam
0,2010,20001,MTL,11.0,GOMEZ,C,2.0,2,3,1,TOR,MTL
1,2010,20001,MTL,21.0,GIONTA,RW,2.0,2,3,1,TOR,MTL
2,2010,20001,MTL,57.0,POULIOT,LW,3.0,2,3,1,TOR,MTL
3,2010,20001,MTL,26.0,GORGES,RD,2.0,2,3,1,TOR,MTL
4,2010,20001,MTL,75.0,GILL,RD,2.0,2,3,1,TOR,MTL


- drop observations that have no player name, position nor ranking. Exclude goaltenders from the roster.

In [200]:
dw = dw[pd.notnull(dw['PlayerPosition'])]
dw.isnull().sum()

Season            0
GameNumber        0
TeamCode          0
PlayerNumber      0
PlayerName        0
PlayerPosition    0
Rank              0
GF                0
GA                0
GD                0
WinTeam           0
LossTeam          0
dtype: int64

In [201]:
dw.shape

(3688734, 12)

In [202]:
dw = dw[dw['PlayerPosition'] != 'G']
dw.shape

(3130215, 12)

- create column that displays the position and roster count by team per game. To simplify matters, we categorize skaters into forwards and defensemen.

In [203]:
dw = dw.drop_duplicates(['Season', 'GameNumber', 'TeamCode', 'PlayerNumber'])
dw['RosterCount'] = dw.groupby(['Season', 'GameNumber', 'TeamCode'])['PlayerNumber'].transform('count')
dw['Position'] = dw.apply(lambda x: 'D' if (x['PlayerPosition']=='LD') else 'D' if (x['PlayerPosition']=='RD') else 'F', 1)
dw['PositionCount'] = dw.groupby(['Season', 'GameNumber', 'TeamCode', 'Position'])['PlayerNumber'].transform('count')
dw.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank,GF,GA,GD,WinTeam,LossTeam,RosterCount,Position,PositionCount
0,2010,20001,MTL,11.0,GOMEZ,C,2.0,2,3,1,TOR,MTL,18.0,F,12.0
1,2010,20001,MTL,21.0,GIONTA,RW,2.0,2,3,1,TOR,MTL,18.0,F,12.0
2,2010,20001,MTL,57.0,POULIOT,LW,3.0,2,3,1,TOR,MTL,18.0,F,12.0
3,2010,20001,MTL,26.0,GORGES,RD,2.0,2,3,1,TOR,MTL,18.0,D,6.0
4,2010,20001,MTL,75.0,GILL,RD,2.0,2,3,1,TOR,MTL,18.0,D,6.0


- count the amount of forwards and defensemen by team per game.

In [204]:
dw['FCount'] = dw.apply(lambda x: x['PositionCount'] if x['Position']=='F' else np.NaN, 1)
dw['DCount'] = dw.apply(lambda x: x['PositionCount'] if x['Position']=='D' else np.NaN, 1)
dw['FCount'] = dw.groupby(['Season','GameNumber', 'TeamCode'])['FCount'].apply(lambda x: x.ffill().bfill())
dw['DCount'] = dw.groupby(['Season','GameNumber', 'TeamCode'])['DCount'].apply(lambda x: x.ffill().bfill())
dw.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank,GF,GA,GD,WinTeam,LossTeam,RosterCount,Position,PositionCount,FCount,DCount
0,2010,20001,MTL,11.0,GOMEZ,C,2.0,2,3,1,TOR,MTL,18.0,F,12.0,12.0,6.0
1,2010,20001,MTL,21.0,GIONTA,RW,2.0,2,3,1,TOR,MTL,18.0,F,12.0,12.0,6.0
2,2010,20001,MTL,57.0,POULIOT,LW,3.0,2,3,1,TOR,MTL,18.0,F,12.0,12.0,6.0
3,2010,20001,MTL,26.0,GORGES,RD,2.0,2,3,1,TOR,MTL,18.0,D,6.0,12.0,6.0
4,2010,20001,MTL,75.0,GILL,RD,2.0,2,3,1,TOR,MTL,18.0,D,6.0,12.0,6.0


### keep games that have only 12 F and 6 D per team!!!!

In [205]:
dw = dw.groupby(['Season', 'GameNumber']).filter(lambda x: ((x['FCount'] == 12) & (x['DCount'] == 6)).all())

In [206]:
dw.shape

(31644, 17)

In [207]:
dw.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/roster/team_roster_player_rank_by_points_per_toi.csv', index='False', sep=',')
#dw.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/roster/team_roster_player_rank_by_points_per_toi.csv', index='False', sep=',')

- create a new dataset using team roster player rank

In [208]:
dv = dw

In [209]:
dv = dv.groupby(['Season', 'GameNumber', 'TeamCode', 'Position', 'GF', 'GA', 'RosterCount', 'PositionCount', 'FCount', 'DCount', 'WinTeam', 'LossTeam'], as_index=False)['Rank'].mean()
dv.head(10)

Unnamed: 0,Season,GameNumber,TeamCode,Position,GF,GA,RosterCount,PositionCount,FCount,DCount,WinTeam,LossTeam,Rank
0,2010,20001,MTL,D,2,3,18.0,6.0,12.0,6.0,TOR,MTL,1.833333
1,2010,20001,MTL,F,2,3,18.0,12.0,12.0,6.0,TOR,MTL,2.75
2,2010,20001,TOR,D,3,2,18.0,6.0,12.0,6.0,TOR,MTL,1.833333
3,2010,20001,TOR,F,3,2,18.0,12.0,12.0,6.0,TOR,MTL,2.25
4,2010,20002,PHI,D,3,2,18.0,6.0,12.0,6.0,PHI,PIT,1.833333
5,2010,20002,PHI,F,3,2,18.0,12.0,12.0,6.0,PHI,PIT,2.25
6,2010,20002,PIT,D,2,3,18.0,6.0,12.0,6.0,PHI,PIT,1.833333
7,2010,20002,PIT,F,2,3,18.0,12.0,12.0,6.0,PHI,PIT,2.166667
8,2010,20003,CAR,D,4,3,18.0,6.0,12.0,6.0,CAR,MIN,1.666667
9,2010,20003,CAR,F,4,3,18.0,12.0,12.0,6.0,CAR,MIN,2.25


In [210]:
dv.shape

(3516, 13)

- create columns for team win and team loss. 

In [211]:
dv['TeamWin'] =  dv.apply(lambda x: 1 if x['TeamCode']==x['WinTeam'] else 0, 1)
dv['TeamLos'] =  dv.apply(lambda x: 1 if x['TeamCode']!=x['WinTeam'] else 0, 1)
dv.head()

Unnamed: 0,Season,GameNumber,TeamCode,Position,GF,GA,RosterCount,PositionCount,FCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos
0,2010,20001,MTL,D,2,3,18.0,6.0,12.0,6.0,TOR,MTL,1.833333,0,1
1,2010,20001,MTL,F,2,3,18.0,12.0,12.0,6.0,TOR,MTL,2.75,0,1
2,2010,20001,TOR,D,3,2,18.0,6.0,12.0,6.0,TOR,MTL,1.833333,1,0
3,2010,20001,TOR,F,3,2,18.0,12.0,12.0,6.0,TOR,MTL,2.25,1,0
4,2010,20002,PHI,D,3,2,18.0,6.0,12.0,6.0,PHI,PIT,1.833333,1,0


- display games played, games won, games loss, goals for and goals against by team for the season.

In [212]:
dv['GP'] = dv.groupby(['Season', 'Position', 'TeamCode'])['GameNumber'].transform('count')
dv['GW'] = dv.groupby(['Season', 'Position', 'WinTeam'])['TeamWin'].transform('sum')
dv['GL'] = dv.groupby(['Season', 'Position', 'LossTeam'])['TeamLos'].transform('sum')
dv['GF'] = dv.groupby(['Season', 'Position', 'TeamCode'])['GF'].transform('sum')
dv['GA'] = dv.groupby(['Season', 'Position', 'TeamCode'])['GA'].transform('sum')
dv.head()

Unnamed: 0,Season,GameNumber,TeamCode,Position,GF,GA,RosterCount,PositionCount,FCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL
0,2010,20001,MTL,D,165,169,18.0,6.0,12.0,6.0,TOR,MTL,1.833333,0,1,60,31,30
1,2010,20001,MTL,F,165,169,18.0,12.0,12.0,6.0,TOR,MTL,2.75,0,1,60,31,30
2,2010,20001,TOR,D,179,202,18.0,6.0,12.0,6.0,TOR,MTL,1.833333,1,0,64,31,30
3,2010,20001,TOR,F,179,202,18.0,12.0,12.0,6.0,TOR,MTL,2.25,1,0,64,31,30
4,2010,20002,PHI,D,219,188,18.0,6.0,12.0,6.0,PHI,PIT,1.833333,1,0,66,39,30


- create columns with the mean ranking for forward and defenseman by team per game.

In [213]:
dv['Rank_F'] = dv.apply(lambda x: x['Rank'] if x['Position']=='F' else np.NaN, 1)
dv['Rank_D'] = dv.apply(lambda x: x['Rank'] if x['Position']=='D' else np.NaN, 1)
dv['Rank_F'] = dv.groupby(['Season','GameNumber', 'TeamCode'])['Rank_F'].apply(lambda x: x.ffill().bfill())
dv['Rank_D'] = dv.groupby(['Season','GameNumber', 'TeamCode'])['Rank_D'].apply(lambda x: x.ffill().bfill())
dv.head()

Unnamed: 0,Season,GameNumber,TeamCode,Position,GF,GA,RosterCount,PositionCount,FCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL,Rank_F,Rank_D
0,2010,20001,MTL,D,165,169,18.0,6.0,12.0,6.0,TOR,MTL,1.833333,0,1,60,31,30,2.75,1.833333
1,2010,20001,MTL,F,165,169,18.0,12.0,12.0,6.0,TOR,MTL,2.75,0,1,60,31,30,2.75,1.833333
2,2010,20001,TOR,D,179,202,18.0,6.0,12.0,6.0,TOR,MTL,1.833333,1,0,64,31,30,2.25,1.833333
3,2010,20001,TOR,F,179,202,18.0,12.0,12.0,6.0,TOR,MTL,2.25,1,0,64,31,30,2.25,1.833333
4,2010,20002,PHI,D,219,188,18.0,6.0,12.0,6.0,PHI,PIT,1.833333,1,0,66,39,30,2.25,1.833333


- compute the mean per position by team for the season.

In [214]:
dv['Mean_F']= dv.groupby(['Season', 'TeamCode'])['Rank_F'].transform('mean')
dv['Mean_D']= dv.groupby(['Season', 'TeamCode'])['Rank_D'].transform('mean')
dv.head()

Unnamed: 0,Season,GameNumber,TeamCode,Position,GF,GA,RosterCount,PositionCount,FCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL,Rank_F,Rank_D,Mean_F,Mean_D
0,2010,20001,MTL,D,165,169,18.0,6.0,12.0,6.0,TOR,MTL,1.833333,0,1,60,31,30,2.75,1.833333,2.494444,1.827778
1,2010,20001,MTL,F,165,169,18.0,12.0,12.0,6.0,TOR,MTL,2.75,0,1,60,31,30,2.75,1.833333,2.494444,1.827778
2,2010,20001,TOR,D,179,202,18.0,6.0,12.0,6.0,TOR,MTL,1.833333,1,0,64,31,30,2.25,1.833333,2.19401,2.007812
3,2010,20001,TOR,F,179,202,18.0,12.0,12.0,6.0,TOR,MTL,2.25,1,0,64,31,30,2.25,1.833333,2.19401,2.007812
4,2010,20002,PHI,D,219,188,18.0,6.0,12.0,6.0,PHI,PIT,1.833333,1,0,66,39,30,2.25,1.833333,2.204545,1.641414


- display the quantity of wins and losses per team ( roster of 12 forwards and 6 defensemen)

In [215]:
dv['L'] = dv.apply(lambda x: x['GL'] if x['TeamCode']== x['LossTeam'] else (x['GP'] - x['GW']), 1)
dv['W'] = dv.apply(lambda x: x['GW'] if x['TeamCode']== x['WinTeam'] else (x['GP'] - x['GL']), 1)
dv.head()

Unnamed: 0,Season,GameNumber,TeamCode,Position,GF,GA,RosterCount,PositionCount,FCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL,Rank_F,Rank_D,Mean_F,Mean_D,L,W
0,2010,20001,MTL,D,165,169,18.0,6.0,12.0,6.0,TOR,MTL,1.833333,0,1,60,31,30,2.75,1.833333,2.494444,1.827778,30,30
1,2010,20001,MTL,F,165,169,18.0,12.0,12.0,6.0,TOR,MTL,2.75,0,1,60,31,30,2.75,1.833333,2.494444,1.827778,30,30
2,2010,20001,TOR,D,179,202,18.0,6.0,12.0,6.0,TOR,MTL,1.833333,1,0,64,31,30,2.25,1.833333,2.19401,2.007812,33,31
3,2010,20001,TOR,F,179,202,18.0,12.0,12.0,6.0,TOR,MTL,2.25,1,0,64,31,30,2.25,1.833333,2.19401,2.007812,33,31
4,2010,20002,PHI,D,219,188,18.0,6.0,12.0,6.0,PHI,PIT,1.833333,1,0,66,39,30,2.25,1.833333,2.204545,1.641414,27,39


- compute win and loss percent by team. Drop duplicate observations.

In [216]:
dv = dv[['Season', 'TeamCode', 'GP', 'L', 'W', 'GF', 'GA', 'Mean_F', 'Mean_D']]
dv = dv.drop_duplicates(['Season', 'TeamCode'])
dv['WinPc'] = dv['W']/ dv['GP']
dv['LossPc'] = dv['L']/ dv['GP']

dv = dv[['Season', 'TeamCode', 'GP','W', 'L', 'GF', 'GA', 'WinPc', 'LossPc', 'Mean_F', 'Mean_D']]
dv.head()

Unnamed: 0,Season,TeamCode,GP,W,L,GF,GA,WinPc,LossPc,Mean_F,Mean_D
0,2010,MTL,60,30,30,165,169,0.5,0.5,2.494444,1.827778
2,2010,TOR,64,31,33,179,202,0.484375,0.515625,2.19401,2.007812
4,2010,PHI,66,39,27,219,188,0.590909,0.409091,2.204545,1.641414
6,2010,PIT,70,40,30,200,176,0.571429,0.428571,2.321429,1.769048
8,2010,CAR,65,32,33,183,183,0.492308,0.507692,2.09359,1.735897


- rank teams based on win percent, mean forwards and mean defensemen. 

In [217]:
dv['Rank_W'] = dv.groupby(['Season'])['WinPc'].rank(ascending=False)
dv['Rank_F'] = dv.groupby(['Season'])['Mean_F'].rank(ascending=True)
dv['Rank_D'] = dv.groupby(['Season'])['Mean_D'].rank(ascending=True)
dv = dv.sort_values(['Season', 'Rank_W', 'Rank_F', 'Rank_D'], ascending=[True, True, True, True])
dv.head(30)

Unnamed: 0,Season,TeamCode,GP,W,L,GF,GA,WinPc,LossPc,Mean_F,Mean_D,Rank_W,Rank_F,Rank_D
198,2010,VAN,58,40,18,198,137,0.689655,0.310345,2.172414,2.166667,1.0,8.0,27.0
12,2010,CHI,47,28,19,157,127,0.595745,0.404255,2.079787,1.801418,2.0,3.0,9.0
4,2010,PHI,66,39,27,219,188,0.590909,0.409091,2.204545,1.641414,3.0,12.0,1.0
30,2010,DET,61,36,25,197,181,0.590164,0.409836,2.185792,1.653005,4.0,9.0,2.0
28,2010,ANA,60,35,25,169,168,0.583333,0.416667,2.448611,2.091667,5.0,25.0,23.0
58,2010,TB,35,20,15,112,109,0.571429,0.428571,2.07619,1.971429,6.5,2.0,17.0
6,2010,PIT,70,40,30,200,176,0.571429,0.428571,2.321429,1.769048,6.5,19.0,8.0
178,2010,PHX,57,32,25,173,157,0.561404,0.438596,2.150585,2.011696,8.0,5.0,21.0
188,2010,BOS,66,37,29,200,163,0.560606,0.439394,2.017677,1.868687,9.0,1.0,12.0
16,2010,CGY,67,37,30,213,191,0.552239,0.447761,2.208955,1.91791,10.0,13.0,16.0


In [218]:
dv.shape

(30, 14)

In [219]:
#dv.to_csv('season_team_roster_ranking.csv', index='False')

In [220]:
dv.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/roster/season_team_roster_ranking_by_points_per_toi.csv', index='False', sep=',')
#dv.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/roster/season_team_roster_ranking_by_points_per_toi.csv', index='False', sep=',')

## season_game_team_roster_data

- use the team roster player rank dataset (dw) to display the roster quality by team per game

In [221]:
dx = dw

In [222]:
dx = dx[['Season', 'GameNumber', 'TeamCode', 'RosterCount', 'PlayerNumber', 'PlayerPosition', 'Rank']]
dx = dx.rename(columns={'PlayerPosition': 'Position'})
dx.head()

Unnamed: 0,Season,GameNumber,TeamCode,RosterCount,PlayerNumber,Position,Rank
0,2010,20001,MTL,18.0,11.0,C,2.0
1,2010,20001,MTL,18.0,21.0,RW,2.0
2,2010,20001,MTL,18.0,57.0,LW,3.0
3,2010,20001,MTL,18.0,26.0,RD,2.0
4,2010,20001,MTL,18.0,75.0,RD,2.0


- group by season, game number, team and player to count the occurance of each player per game and sum up the observations of players. There should be 19 players per team and 38 per game for the dataset to be correct.

In [223]:
dx['playercount'] = dx.groupby(['Season', 'GameNumber', 'TeamCode', 'PlayerNumber',])['PlayerNumber'].transform('count')

In [224]:
dx['rosterposition'] = dx.groupby(['Season', 'GameNumber', 'TeamCode', 'Position', 'Rank'])['playercount'].transform('sum')

#### pivot table

- the next step is to group players by gamenumber, teamcode, position and rank, to display the quality of players each team has per position. **Pivot table** by player position and rank using roster position values. Game number and team are the indexes. We want to join the levels to generate columns by roster position and rank (10 columns). 


In [225]:
dx = pd.pivot_table(dx, index=['Season', 'GameNumber', 'TeamCode', 'RosterCount'], columns=['Position', 'Rank'], values=['rosterposition'])
dx = dx.reset_index()
dx.columns = ['_'.join(str(s).strip() for s in col if s) for col in dx.columns]
dx.reset_index()
dx = dx.fillna(0)
dx = dx.rename(columns={'rosterposition_C_1.0': 'C1', 'rosterposition_C_2.0': 'C2', 'rosterposition_C_3.0': 'C3', 'rosterposition_C_4.0': 'C4', 'rosterposition_LW_1.0': 'LW1', 'rosterposition_LW_2.0': 'LW2', 'rosterposition_LW_3.0': 'LW3', 'rosterposition_LW_4.0': 'LW4', 'rosterposition_RW_1.0': 'RW1', 'rosterposition_RW_2.0': 'RW2', 'rosterposition_RW_3.0': 'RW3', 'rosterposition_RW_4.0': 'RW4', 'rosterposition_LD_1.0': 'LD1', 'rosterposition_LD_2.0': 'LD2', 'rosterposition_LD_3.0': 'LD3', 'rosterposition_LD_4.0': 'LD4', 'rosterposition_RD_1.0': 'RD1', 'rosterposition_RD_2.0': 'RD2', 'rosterposition_RD_3.0': 'RD3', 'rosterposition_RD_4.0': 'RD4' })
dx.head(10)

Unnamed: 0,Season,GameNumber,TeamCode,RosterCount,C1,C2,C3,C4,LD1,LD2,LD3,LW1,LW2,LW3,LW4,RD1,RD2,RD3,RW1,RW2,RW3,RW4
0,2010,20001,MTL,18.0,1.0,1.0,2.0,1.0,1.0,0.0,0.0,0.0,1.0,2.0,0.0,1.0,3.0,1.0,0.0,1.0,2.0,1.0
1,2010,20001,TOR,18.0,1.0,1.0,1.0,1.0,1.0,2.0,0.0,2.0,0.0,0.0,2.0,1.0,1.0,1.0,2.0,1.0,1.0,0.0
2,2010,20002,PHI,18.0,3.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,2.0,1.0,2.0,0.0,1.0,2.0,1.0,1.0,1.0,0.0
3,2010,20002,PIT,18.0,1.0,1.0,3.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,2.0,0.0,1.0,2.0,2.0,0.0
4,2010,20003,CAR,18.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0
5,2010,20003,MIN,18.0,1.0,3.0,0.0,0.0,0.0,2.0,1.0,1.0,2.0,1.0,0.0,1.0,0.0,2.0,0.0,2.0,1.0,1.0
6,2010,20004,CHI,18.0,2.0,1.0,1.0,0.0,1.0,0.0,2.0,1.0,2.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,2.0,0.0
7,2010,20004,COL,18.0,2.0,0.0,1.0,0.0,0.0,4.0,0.0,1.0,2.0,0.0,1.0,2.0,0.0,0.0,1.0,3.0,1.0,0.0
8,2010,20005,CGY,18.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0
9,2010,20005,EDM,18.0,0.0,3.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,1.0,1.0,1.0,3.0,0.0,2.0,1.0,1.0,0.0


In [226]:
dx.shape

(1758, 22)

In [227]:
#dx.to_csv('season_game_team_roster.csv', index='False', sep=',')

In [228]:
dx.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/roster/season_game_team_roster_by_points_per_toi.csv', index='False', sep=',')
#dx.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/roster/season_game_team_roster_by_points_per_toi.csv', index='False', sep=',')

- create a dataset that will display the mean of forwards and defencemen by season per team

In [229]:
dz = dx

In [230]:
# mean centers ranking per team
dz['MeanC1'] = dz.groupby(['Season', 'TeamCode'])['C1'].transform('mean')
dz['MeanC2'] = dz.groupby(['Season', 'TeamCode'])['C2'].transform('mean')
dz['MeanC3'] = dz.groupby(['Season', 'TeamCode'])['C3'].transform('mean')
dz['MeanC4'] = dz.groupby(['Season', 'TeamCode'])['C4'].transform('mean')

# mean left wing ranking per team
dz['MeanLW1'] = dz.groupby(['Season', 'TeamCode'])['LW1'].transform('mean')
dz['MeanLW2'] = dz.groupby(['Season', 'TeamCode'])['LW2'].transform('mean')
dz['MeanLW3'] = dz.groupby(['Season', 'TeamCode'])['LW3'].transform('mean')
dz['MeanLW4'] = dz.groupby(['Season', 'TeamCode'])['LW4'].transform('mean')

# mean right wing ranking per team
dz['MeanRW1'] = dz.groupby(['Season', 'TeamCode'])['RW1'].transform('mean')
dz['MeanRW2'] = dz.groupby(['Season', 'TeamCode'])['RW2'].transform('mean')
dz['MeanRW3'] = dz.groupby(['Season', 'TeamCode'])['RW3'].transform('mean')
dz['MeanRW4'] = dz.groupby(['Season', 'TeamCode'])['RW4'].transform('mean')

# mean left defense ranking per team
dz['MeanLD1'] = dz.groupby(['Season', 'TeamCode'])['LD1'].transform('mean')
dz['MeanLD2'] = dz.groupby(['Season', 'TeamCode'])['LD2'].transform('mean')
dz['MeanLD3'] = dz.groupby(['Season', 'TeamCode'])['LD3'].transform('mean')

# mean right defense ranking per team
dz['MeanRD1'] = dz.groupby(['Season', 'TeamCode'])['RD1'].transform('mean')
dz['MeanRD2'] = dz.groupby(['Season', 'TeamCode'])['RD2'].transform('mean')
dz['MeanRD3'] = dz.groupby(['Season', 'TeamCode'])['RD3'].transform('mean')

dz.head()

Unnamed: 0,Season,GameNumber,TeamCode,RosterCount,C1,C2,C3,C4,LD1,LD2,LD3,LW1,LW2,LW3,LW4,RD1,RD2,RD3,RW1,RW2,RW3,RW4,MeanC1,MeanC2,MeanC3,MeanC4,MeanLW1,MeanLW2,MeanLW3,MeanLW4,MeanRW1,MeanRW2,MeanRW3,MeanRW4,MeanLD1,MeanLD2,MeanLD3,MeanRD1,MeanRD2,MeanRD3
0,2010,20001,MTL,18.0,1.0,1.0,2.0,1.0,1.0,0.0,0.0,0.0,1.0,2.0,0.0,1.0,3.0,1.0,0.0,1.0,2.0,1.0,0.933333,1.566667,1.816667,0.516667,0.0,1.466667,1.95,0.05,0.766667,1.0,1.133333,0.8,1.916667,0.0,0.683333,0.6,2.0,0.8
1,2010,20001,TOR,18.0,1.0,1.0,1.0,1.0,1.0,2.0,0.0,2.0,0.0,0.0,2.0,1.0,1.0,1.0,2.0,1.0,1.0,0.0,0.984375,1.0,1.5625,0.21875,2.015625,0.0,0.578125,2.078125,2.0,0.96875,0.59375,0.0,0.671875,1.953125,0.640625,0.828125,1.0,0.90625
2,2010,20002,PHI,18.0,3.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,2.0,1.0,2.0,0.0,1.0,2.0,1.0,1.0,1.0,0.0,2.878788,0.0,0.893939,0.030303,0.0,1.924242,0.69697,1.439394,1.0,1.333333,1.80303,0.0,2.984848,0.0,0.212121,0.590909,1.0,1.212121
3,2010,20002,PIT,18.0,1.0,1.0,3.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,2.0,0.0,1.0,2.0,2.0,0.0,1.028571,1.1,2.242857,0.085714,0.8,0.842857,1.285714,0.414286,0.514286,2.185714,1.328571,0.171429,1.528571,0.957143,0.9,0.757143,1.857143,0.0
4,2010,20003,CAR,18.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.984615,1.0,0.169231,0.892308,2.107692,0.738462,0.261538,0.523077,1.0,1.0,1.692308,0.630769,1.938462,0.4,1.061538,0.907692,1.492308,0.2


- drop duplicates by season and team

In [231]:
dz = dz.drop_duplicates(['Season', 'TeamCode'])

In [232]:
dz = dz[['Season', 'TeamCode', 'RosterCount', 'MeanC1', 'MeanC2', 'MeanC3', 'MeanC4', 'MeanLW1', 'MeanLW2', 'MeanLW3', 'MeanLW4', 'MeanRW1', 'MeanRW2', 'MeanRW3', 'MeanRW4', 'MeanLD1', 'MeanLD2', 'MeanLD3', 'MeanRD1', 'MeanRD2', 'MeanRD3']]
dz.head()

Unnamed: 0,Season,TeamCode,RosterCount,MeanC1,MeanC2,MeanC3,MeanC4,MeanLW1,MeanLW2,MeanLW3,MeanLW4,MeanRW1,MeanRW2,MeanRW3,MeanRW4,MeanLD1,MeanLD2,MeanLD3,MeanRD1,MeanRD2,MeanRD3
0,2010,MTL,18.0,0.933333,1.566667,1.816667,0.516667,0.0,1.466667,1.95,0.05,0.766667,1.0,1.133333,0.8,1.916667,0.0,0.683333,0.6,2.0,0.8
1,2010,TOR,18.0,0.984375,1.0,1.5625,0.21875,2.015625,0.0,0.578125,2.078125,2.0,0.96875,0.59375,0.0,0.671875,1.953125,0.640625,0.828125,1.0,0.90625
2,2010,PHI,18.0,2.878788,0.0,0.893939,0.030303,0.0,1.924242,0.69697,1.439394,1.0,1.333333,1.80303,0.0,2.984848,0.0,0.212121,0.590909,1.0,1.212121
3,2010,PIT,18.0,1.028571,1.1,2.242857,0.085714,0.8,0.842857,1.285714,0.414286,0.514286,2.185714,1.328571,0.171429,1.528571,0.957143,0.9,0.757143,1.857143,0.0
4,2010,CAR,18.0,1.984615,1.0,0.169231,0.892308,2.107692,0.738462,0.261538,0.523077,1.0,1.0,1.692308,0.630769,1.938462,0.4,1.061538,0.907692,1.492308,0.2


In [233]:
dz.shape

(30, 21)

In [234]:
#dz.to_csv('season_team.csv', index='False', sep=',')

In [235]:
dz.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/roster/season_team_by_points_per_toi.csv', index='False', sep=',')
#dz.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/roster/season_team_by_points_per_toi.csv', index='False', sep=',')

- create an index variable to deterime if a team is considered visitor or home for a given game. The column will be named "A". The 1st observation per game is the visitor team and will be assigned a value of 1. The 2nd and final observation per game, is the home team, so we fill in NaN with a value of 2 (home team)

In [236]:
dy = dx

In [237]:
dy.loc[dy.groupby('GameNumber',as_index=False).head(1).index,'A'] = 1
dy = dy.fillna(2)

In [238]:
dy.head()

Unnamed: 0,Season,GameNumber,TeamCode,RosterCount,C1,C2,C3,C4,LD1,LD2,LD3,LW1,LW2,LW3,LW4,RD1,RD2,RD3,RW1,RW2,RW3,RW4,MeanC1,MeanC2,MeanC3,MeanC4,MeanLW1,MeanLW2,MeanLW3,MeanLW4,MeanRW1,MeanRW2,MeanRW3,MeanRW4,MeanLD1,MeanLD2,MeanLD3,MeanRD1,MeanRD2,MeanRD3,A
0,2010,20001,MTL,18.0,1.0,1.0,2.0,1.0,1.0,0.0,0.0,0.0,1.0,2.0,0.0,1.0,3.0,1.0,0.0,1.0,2.0,1.0,0.933333,1.566667,1.816667,0.516667,0.0,1.466667,1.95,0.05,0.766667,1.0,1.133333,0.8,1.916667,0.0,0.683333,0.6,2.0,0.8,1.0
1,2010,20001,TOR,18.0,1.0,1.0,1.0,1.0,1.0,2.0,0.0,2.0,0.0,0.0,2.0,1.0,1.0,1.0,2.0,1.0,1.0,0.0,0.984375,1.0,1.5625,0.21875,2.015625,0.0,0.578125,2.078125,2.0,0.96875,0.59375,0.0,0.671875,1.953125,0.640625,0.828125,1.0,0.90625,2.0
2,2010,20002,PHI,18.0,3.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,2.0,1.0,2.0,0.0,1.0,2.0,1.0,1.0,1.0,0.0,2.878788,0.0,0.893939,0.030303,0.0,1.924242,0.69697,1.439394,1.0,1.333333,1.80303,0.0,2.984848,0.0,0.212121,0.590909,1.0,1.212121,1.0
3,2010,20002,PIT,18.0,1.0,1.0,3.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,2.0,0.0,1.0,2.0,2.0,0.0,1.028571,1.1,2.242857,0.085714,0.8,0.842857,1.285714,0.414286,0.514286,2.185714,1.328571,0.171429,1.528571,0.957143,0.9,0.757143,1.857143,0.0,2.0
4,2010,20003,CAR,18.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.984615,1.0,0.169231,0.892308,2.107692,0.738462,0.261538,0.523077,1.0,1.0,1.692308,0.630769,1.938462,0.4,1.061538,0.907692,1.492308,0.2,1.0


- **pivot table using game number as index by whether a team is visitor (1) or home (2)**. The table will display the quality of each player per position and team. The next step is to join columns by team and player quality value. We will have for each team 10 columns ( 5 positions x 2 type of player quality). We will rename the columns as following: VC1 shows the amount of elite centers for the visitor team, HC1 displays the amount of elite centers for the home team etc. We rename the columns and sort them based on team, position and quality. 

In [239]:
dy = pd.pivot_table(dy, index=['Season', 'GameNumber'], columns=['A'], values=['C1', 'C2', 'C3', 'C4', 'LW1', 'LW2', 'LW3', 'LW4', 'RW1', 'RW2', 'RW3', 'RW4', 'LD1', 'LD2', 'LD3', 'RD1', 'RD2', 'RD3'])
dy = dy.reset_index()
dy.columns = ['_'.join(str(s).strip() for s in col if s) for col in dy.columns]
dy = dy.reset_index()
dy = dy.rename(columns={'C1_1.0': 'VC1', 'C2_1.0': 'VC2', 'C3_1.0': 'VC3', 'C4_1.0': 'VC4', 'LW1_1.0': 'VLW1', 'LW2_1.0': 'VLW2', 'LW3_1.0': 'VLW3', 'LW4_1.0': 'VLW4', 'RW1_1.0': 'VRW1', 'RW2_1.0': 'VRW2', 'RW3_1.0': 'VRW3', 'RW4_1.0': 'VRW4', 'LD1_1.0': 'VLD1', 'LD2_1.0': 'VLD2', 'LD3_1.0': 'VLD3', 'RD1_1.0': 'VRD1', 'RD2_1.0': 'VRD2', 'RD3_1.0': 'VRD3', 'C1_2.0': 'HC1', 'C2_2.0': 'HC2', 'C3_2.0': 'HC3', 'C4_2.0': 'HC4', 'LW1_2.0': 'HLW1', 'LW2_2.0': 'HLW2', 'LW3_2.0': 'HLW3', 'LW4_2.0': 'HLW4', 'RW1_2.0': 'HRW1', 'RW2_2.0': 'HRW2', 'RW3_2.0': 'HRW3', 'RW4_2.0': 'HRW4', 'LD1_2.0': 'HLD1', 'LD2_2.0': 'HLD2', 'LD3_2.0': 'HLD3', 'RD1_2.0': 'HRD1', 'RD2_2.0': 'HRD2', 'RD3_2.0': 'HRD3'})
dy = dy[['Season', 'GameNumber', 'VC1', 'VC2', 'VC3', 'VC4', 'VLW1', 'VLW2', 'VLW3', 'VLW4', 'VRW1', 'VRW2', 'VRW3', 'VRW4', 'VLD1', 'VLD2', 'VLD3', 'VRD1', 'VRD2', 'VRD3', 'HC1', 'HC2', 'HC3', 'HC4', 'HLW1', 'HLW2', 'HLW3', 'HLW4', 'HRW1', 'HRW2', 'HRW3', 'HRW4', 'HLD1', 'HLD2', 'HLD3', 'HRD1', 'HRD2', 'HRD3']]
dy.sort_values(['Season', 'GameNumber'], ascending=[True, True], inplace=True)
dy.head()

Unnamed: 0,Season,GameNumber,VC1,VC2,VC3,VC4,VLW1,VLW2,VLW3,VLW4,VRW1,VRW2,VRW3,VRW4,VLD1,VLD2,VLD3,VRD1,VRD2,VRD3,HC1,HC2,HC3,HC4,HLW1,HLW2,HLW3,HLW4,HRW1,HRW2,HRW3,HRW4,HLD1,HLD2,HLD3,HRD1,HRD2,HRD3
0,2010,20001,1.0,1.0,2.0,1.0,0.0,1.0,2.0,0.0,0.0,1.0,2.0,1.0,1.0,0.0,0.0,1.0,3.0,1.0,1.0,1.0,1.0,1.0,2.0,0.0,0.0,2.0,2.0,1.0,1.0,0.0,1.0,2.0,0.0,1.0,1.0,1.0
1,2010,20002,3.0,0.0,1.0,0.0,0.0,2.0,1.0,2.0,1.0,1.0,1.0,0.0,3.0,0.0,0.0,0.0,1.0,2.0,1.0,1.0,3.0,0.0,1.0,1.0,0.0,0.0,1.0,2.0,2.0,0.0,1.0,1.0,1.0,1.0,2.0,0.0
2,2010,20003,2.0,1.0,1.0,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,2.0,1.0,1.0,1.0,1.0,0.0,1.0,3.0,0.0,0.0,1.0,2.0,1.0,0.0,0.0,2.0,1.0,1.0,0.0,2.0,1.0,1.0,0.0,2.0
3,2010,20004,2.0,1.0,1.0,0.0,1.0,2.0,1.0,0.0,1.0,1.0,2.0,0.0,1.0,0.0,2.0,1.0,1.0,1.0,2.0,0.0,1.0,0.0,1.0,2.0,0.0,1.0,1.0,3.0,1.0,0.0,0.0,4.0,0.0,2.0,0.0,0.0
4,2010,20005,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,2.0,0.0,3.0,0.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,0.0,0.0,2.0,0.0,1.0,3.0,0.0


In [240]:
dy.shape

(879, 38)

In [241]:
#dy.to_csv('season_game_roster.csv', index='False', sep=',')

In [242]:
dy.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/roster/season_game_roster_by_points_per_toi.csv', index='False', sep=',')
#dy.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/season_game_roster.csv', index='False', sep=',')

# Roster Analysis

## season_level_analysis

#### $WinPc = \beta_{0} + \beta_{1}MeanC_{1} + \beta_{2}MeanC_{2}+ \beta_{3}MeanC_{3} + \beta_{4}MeanC_{4} + \beta_{5}MeanLW_{1} + \beta_{6}MeanLW_{2}+ \beta_{7}MeanLW_{3} + \beta_{8}MeanLW_{4} + \beta_{9}MeanRW_{1} + \beta_{10}MeanRW_{2}+ \beta_{11}MeanRW_{3} + \beta_{12}MeanRW_{4} + \beta_{13}MeanLD_{1} + \beta_{14}MeanLD_{2}+ \beta_{15}MeanLD_{3} + \beta_{16}MeanRD_{1} + \beta_{17}MeanRD_{2}+ \beta_{18}MeanRD_{3} + e_{s}$

- merge season_team dataset (dz) and season_team_roster_ranking (dv) for roster analysis at the season level. Use **ds** as the merging dataset.

In [243]:
ds = dv.merge(dz, on=['Season', 'TeamCode'], how='left')
ds.head()

Unnamed: 0,Season,TeamCode,GP,W,L,GF,GA,WinPc,LossPc,Mean_F,Mean_D,Rank_W,Rank_F,Rank_D,RosterCount,MeanC1,MeanC2,MeanC3,MeanC4,MeanLW1,MeanLW2,MeanLW3,MeanLW4,MeanRW1,MeanRW2,MeanRW3,MeanRW4,MeanLD1,MeanLD2,MeanLD3,MeanRD1,MeanRD2,MeanRD3
0,2010,VAN,58,40,18,198,137,0.689655,0.310345,2.172414,2.166667,1.0,8.0,27.0,18.0,2.0,0.896552,0.0,0.844828,1.0,0.810345,1.206897,1.155172,1.827586,1.0,0.827586,0.431034,0.568966,0.793103,1.741379,0.965517,1.137931,0.793103
1,2010,CHI,47,28,19,157,127,0.595745,0.404255,2.079787,1.801418,2.0,3.0,9.0,18.0,1.851064,0.787234,1.489362,0.148936,1.0,1.978723,1.0,0.0,0.93617,0.93617,1.787234,0.085106,1.808511,0.0,1.255319,1.297872,0.978723,0.659574
2,2010,PHI,66,39,27,219,188,0.590909,0.409091,2.204545,1.641414,3.0,12.0,1.0,18.0,2.878788,0.0,0.893939,0.030303,0.0,1.924242,0.69697,1.439394,1.0,1.333333,1.80303,0.0,2.984848,0.0,0.212121,0.590909,1.0,1.212121
3,2010,DET,61,36,25,197,181,0.590164,0.409836,2.185792,1.653005,4.0,9.0,2.0,18.0,0.606557,1.803279,1.606557,0.180328,0.967213,1.836066,1.704918,0.311475,1.737705,0.0,1.245902,0.0,0.918033,1.688525,0.0,1.819672,0.918033,0.655738
4,2010,ANA,60,35,25,169,168,0.583333,0.416667,2.448611,2.091667,5.0,25.0,23.0,18.0,0.8,0.983333,1.433333,1.066667,1.0,0.95,1.55,0.5,1.9,0.0,0.666667,1.15,0.2,1.65,1.066667,1.7,0.0,1.383333


- display the diffence in quality of forwards (DF) and defensemen (DD) per team.

In [244]:
ds['DC'] = ds['MeanC1'] - ds['MeanC2'] - ds['MeanC3'] - ds['MeanC4']
ds['DLW'] = ds['MeanLW1'] - ds['MeanLW2'] - ds['MeanLW3'] - ds['MeanLW4']
ds['DRW'] = ds['MeanRW1'] - ds['MeanRW2'] - ds['MeanRW3'] - ds['MeanRW4']
ds['DLD'] = ds['MeanLD1'] - ds['MeanLD2'] - ds['MeanLD3'] 
ds['DRD'] = ds['MeanRD1'] - ds['MeanRD2'] - ds['MeanRD3']

- mean goals for and mean goals against per team.

In [245]:
ds['meanGF'] = ds['GF']/ ds['GP']
ds['meanGA'] = ds['GA']/ ds['GP']

In [246]:
ds.shape

(30, 40)

### summary analysis

In [247]:
ds.describe()

Unnamed: 0,Season,GP,W,L,GF,GA,WinPc,LossPc,Mean_F,Mean_D,Rank_W,Rank_F,Rank_D,RosterCount,MeanC1,MeanC2,MeanC3,MeanC4,MeanLW1,MeanLW2,MeanLW3,MeanLW4,MeanRW1,MeanRW2,MeanRW3,MeanRW4,MeanLD1,MeanLD2,MeanLD3,MeanRD1,MeanRD2,MeanRD3,DC,DLW,DRW,DLD,DRD,meanGF,meanGA
count,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
mean,2010.0,58.6,29.3,29.3,167.7,167.7,0.494979,0.505021,2.278944,1.933031,15.5,15.5,15.5,18.0,1.171449,1.369626,1.18488,0.392586,1.105346,1.114163,1.060573,0.634086,0.8197,1.466723,1.216712,0.464158,0.818907,1.33259,0.801328,1.106871,1.217667,0.722636,-1.775643,-1.703475,-2.327893,-1.315012,-0.833433,2.836002,2.859379
std,0.0,12.237872,8.50213,8.183878,40.87251,41.150816,0.104917,0.104917,0.149173,0.181503,8.80145,8.803408,8.803408,0.0,0.733614,0.764752,0.779848,0.447899,0.771059,0.763588,0.572048,0.562951,0.646767,0.794448,0.665441,0.408031,0.816321,0.824077,0.453673,0.636709,0.813734,0.508891,1.501463,1.383219,1.353787,1.506458,1.255639,0.371673,0.335736
min,2010.0,19.0,4.0,15.0,32.0,56.0,0.210526,0.310345,2.017677,1.641414,1.0,1.0,1.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.966667,-4.105263,-4.514286,-3.105263,-3.432836,1.684211,2.362069
25%,2010.0,58.0,22.5,25.0,158.5,157.0,0.460623,0.438796,2.175758,1.77714,8.25,8.25,8.25,18.0,0.793284,0.917857,0.773455,0.044156,0.91817,0.73956,0.725852,0.171429,0.120833,0.966325,0.693182,0.089949,0.028571,0.789185,0.450216,0.639286,0.81392,0.239744,-2.827083,-2.213558,-3.285511,-2.547024,-1.533005,2.713258,2.569866
50%,2010.0,63.5,32.0,29.5,177.0,172.0,0.520398,0.479602,2.229735,1.911541,15.5,15.5,15.5,18.0,0.954885,1.430704,1.277273,0.25,0.974658,0.962227,1.015873,0.408635,0.843863,1.576659,1.275125,0.400527,0.62042,1.491667,0.828358,0.992754,1.036232,0.738218,-2.157792,-1.85431,-2.374327,-1.91808,-0.903811,2.853945,2.840909
75%,2010.0,66.0,35.75,33.0,192.5,191.0,0.561204,0.539377,2.407968,2.085677,22.75,22.75,22.75,18.0,1.824012,1.935497,1.811218,0.503879,1.169246,1.819078,1.441026,0.981138,1.0,1.923951,1.669231,0.799219,1.521429,1.910109,1.130469,1.633373,1.789409,1.124774,-0.674351,-1.07619,-1.731917,-0.345567,0.144936,3.056391,3.077518
max,2010.0,70.0,40.0,44.0,219.0,230.0,0.689655,0.789474,2.587719,2.25641,30.0,30.0,30.0,18.0,2.878788,2.878788,2.432836,1.925373,3.567164,2.439394,2.0,2.078125,2.0,3.614035,3.015152,1.15,2.984848,2.877193,1.741379,2.318182,2.74359,1.628571,1.954545,2.701493,0.491803,2.772727,1.409091,3.413793,3.666667


### estimate roster model 

- regress **team win percent** on the mean of players by position and quality (predictor variables). Add a constant to the predictors and use **OLS**. The purpose is to deterimine the impact each roster position has on team winning percent.

In [248]:
print ('season level analysis (win percent) by mean roster position')
y = ds['WinPc']  
X = sm.add_constant(ds[['MeanC1', 'MeanC2', 'MeanC3', 'MeanC4','MeanLW1', 'MeanLW2', 'MeanLW3', 'MeanLW4', 'MeanRW1', 'MeanRW2', 'MeanRW3', 'MeanRW4', 'MeanLD1', 'MeanLD2', 'MeanLD3', 'MeanRD1', 'MeanRD2', 'MeanRD3' ]] )
result = sm.OLS(y, X).fit()
print(result.summary())

beginningtex = """\\documentclass{report}
\\usepackage{booktabs}
\\begin{document}"""
endtex = "\end{document}"

f = open('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/latex/roster/points_per_toi/season_level_analysis.tex', 'w')
f.write(beginningtex)
f.write(result.summary().as_latex())
f.write(endtex)
f.close()

season level analysis (win percent) by mean roster position
                            OLS Regression Results                            
Dep. Variable:                  WinPc   R-squared:                       0.756
Model:                            OLS   Adj. R-squared:                  0.456
Method:                 Least Squares   F-statistic:                     2.522
Date:                Thu, 22 Feb 2018   Prob (F-statistic):             0.0495
Time:                        03:47:28   Log-Likelihood:                 46.758
No. Observations:                  30   AIC:                            -59.52
Df Residuals:                      13   BIC:                            -35.70
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
-----------------------------------------------------------------------

- regress **team win percent** on the mean of top forwards. Add a constant to the predictors and use **OLS**. The purpose is to deterimine the impact each roster position has on team winning percent.

In [249]:
y = ds['WinPc']  
X = sm.add_constant(ds[['MeanC1']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,WinPc,R-squared:,0.053
Model:,OLS,Adj. R-squared:,0.019
Method:,Least Squares,F-statistic:,1.554
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.223
Time:,03:47:28,Log-Likelihood:,26.388
No. Observations:,30,AIC:,-48.78
Df Residuals:,28,BIC:,-45.97
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.4566,0.036,12.616,0.000,0.382 0.531
MeanC1,0.0328,0.026,1.247,0.223,-0.021 0.087

0,1,2,3
Omnibus:,6.049,Durbin-Watson:,0.172
Prob(Omnibus):,0.049,Jarque-Bera (JB):,4.664
Skew:,-0.945,Prob(JB):,0.0971
Kurtosis:,3.402,Cond. No.,3.74


- regress **team win percent** on the mean of players by position and quality (predictor variables). Add a constant to the predictors and use **Logit**.  The purpose is to deterimine the impact each roster position has on team winning percent.

In [250]:
y = ds['WinPc']  
X = sm.add_constant(ds[['MeanC1', 'MeanC2', 'MeanC3', 'MeanC4','MeanLW1', 'MeanLW2', 'MeanLW3', 'MeanLW4', 'MeanRW1', 'MeanRW2', 'MeanRW3', 'MeanRW4', 'MeanLD1', 'MeanLD2', 'MeanLD3', 'MeanRD1', 'MeanRD2', 'MeanRD3']] )
result = sm.Logit(y, X).fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.662019
         Iterations 5


  return np.sqrt(np.diag(self.cov_params()))
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


0,1,2,3
Dep. Variable:,WinPc,No. Observations:,30.0
Model:,Logit,Df Residuals:,13.0
Method:,MLE,Df Model:,16.0
Date:,"Thu, 22 Feb 2018",Pseudo R-squ.:,0.04477
Time:,03:47:28,Log-Likelihood:,-19.861
converged:,True,LL-Null:,-20.791
,,LLR p-value:,1.0

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,0.1562,3.12e+07,5.01e-09,1.000,-6.11e+07 6.11e+07
MeanC1,0.0974,,,,nan nan
MeanC2,-0.1036,,,,nan nan
MeanC3,-0.0310,,,,nan nan
MeanC4,-0.1067,,,,nan nan
MeanLW1,0.1725,,,,nan nan
MeanLW2,0.1939,,,,nan nan
MeanLW3,0.2680,,,,nan nan
MeanLW4,-0.3178,,,,nan nan


- regress **team win percent** on the mean of top forwards. Add a constant to the predictors and use **Logit**. The purpose is to deterimine the impact each roster position has on team winning percent.

In [251]:
y = ds['WinPc']  
X = sm.add_constant(ds[['MeanC1']] )
result = sm.Logit(y, X).fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.690851
         Iterations 3


0,1,2,3
Dep. Variable:,WinPc,No. Observations:,30.0
Model:,Logit,Df Residuals:,28.0
Method:,MLE,Df Model:,1.0
Date:,"Thu, 22 Feb 2018",Pseudo R-squ.:,0.003171
Time:,03:47:28,Log-Likelihood:,-20.726
converged:,True,LL-Null:,-20.791
,,LLR p-value:,0.7165

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,-0.1741,0.698,-0.249,0.803,-1.542 1.194
MeanC1,0.1315,0.508,0.259,0.796,-0.864 1.127


- regress **team win percent** on the difference in the mean quality of forwards (DF). Add a constant to the predictors and use **OLS**. The purpose is to deterimine the impact each roster positi0n has on team win percent.

In [252]:
print ('season level analysis (win percent) by roster position differential')
y = ds['WinPc']  
X = sm.add_constant(ds[['DC', 'DLW', 'DRW', 'DLD', 'DRD']] )
result = sm.OLS(y, X).fit()
print(result.summary())

beginningtex = """\\documentclass{report}
\\usepackage{booktabs}
\\begin{document}"""
endtex = "\end{document}"

f = open('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/latex/roster/points_per_toi/season_winpc_with_roster_position_differential.tex', 'w')
f.write(beginningtex)
f.write(result.summary().as_latex())
f.write(endtex)
f.close()

season level analysis (win percent) by roster position differential
                            OLS Regression Results                            
Dep. Variable:                  WinPc   R-squared:                       0.312
Model:                            OLS   Adj. R-squared:                  0.168
Method:                 Least Squares   F-statistic:                     2.173
Date:                Thu, 22 Feb 2018   Prob (F-statistic):             0.0910
Time:                        03:47:28   Log-Likelihood:                 31.180
No. Observations:                  30   AIC:                            -50.36
Df Residuals:                      24   BIC:                            -41.95
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
---------------------------------------------------------------

- regress **team win percent** on the difference in the mean quality of forwards (DF). Add a constant to the predictors and use **Logit**. The purpose is to deterimine the impact each roster positi0n has on team win percent.

In [253]:
y = ds['WinPc']  
X = sm.add_constant(ds[['DC', 'DLW', 'DRW']] )
result = sm.Logit(y, X).fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.680415
         Iterations 4


0,1,2,3
Dep. Variable:,WinPc,No. Observations:,30.0
Model:,Logit,Df Residuals:,26.0
Method:,MLE,Df Model:,3.0
Date:,"Thu, 22 Feb 2018",Pseudo R-squ.:,0.01823
Time:,03:47:28,Log-Likelihood:,-20.412
converged:,True,LL-Null:,-20.791
,,LLR p-value:,0.8595

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,0.4328,0.937,0.462,0.644,-1.404 2.270
DC,0.0631,0.251,0.252,0.801,-0.428 0.554
DLW,-0.0152,0.272,-0.056,0.955,-0.548 0.518
DRW,0.1575,0.279,0.565,0.572,-0.389 0.704


- regress **team win percent** on the difference in the mean quality of defensemen (DD). Add a constant to the predictors and use **OLS**. The purpose is to deterimine the impact each roster positi0n has on team win percent.

In [254]:
y = ds['WinPc']  
X = sm.add_constant(ds[['DLD', 'DRD']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,WinPc,R-squared:,0.066
Model:,OLS,Adj. R-squared:,-0.004
Method:,Least Squares,F-statistic:,0.9487
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.4
Time:,03:47:28,Log-Likelihood:,26.597
No. Observations:,30,AIC:,-47.19
Df Residuals:,27,BIC:,-42.99
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.5249,0.030,17.377,0.000,0.463 0.587
DLD,0.0104,0.013,0.789,0.437,-0.017 0.037
DRD,0.0196,0.016,1.242,0.225,-0.013 0.052

0,1,2,3
Omnibus:,7.312,Durbin-Watson:,0.216
Prob(Omnibus):,0.026,Jarque-Bera (JB):,5.567
Skew:,-0.953,Prob(JB):,0.0618
Kurtosis:,3.907,Cond. No.,3.75


- regress **team win percent** on the difference in the mean quality of defensemen (DD). Add a constant to the predictors and use **Logit**. The purpose is to deterimine the impact each roster positi0n has on team win percent.

In [255]:
y = ds['WinPc']  
X = sm.add_constant(ds[['DLD', 'DRD']] )
result = sm.Logit(y, X).fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.690288
         Iterations 3


0,1,2,3
Dep. Variable:,WinPc,No. Observations:,30.0
Model:,Logit,Df Residuals:,27.0
Method:,MLE,Df Model:,2.0
Date:,"Thu, 22 Feb 2018",Pseudo R-squ.:,0.003984
Time:,03:47:28,Log-Likelihood:,-20.709
converged:,True,LL-Null:,-20.791
,,LLR p-value:,0.9205

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,0.0999,0.576,0.174,0.862,-1.029 1.229
DLD,0.0416,0.250,0.166,0.868,-0.449 0.532
DRD,0.0785,0.301,0.261,0.794,-0.511 0.668


In [256]:
y = ds['WinPc']  
X = sm.add_constant(ds[['DC', 'DLW', 'DRW', 'DLD', 'DRD']] )
result = sm.Logit(y, X).fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.680038
         Iterations 4


0,1,2,3
Dep. Variable:,WinPc,No. Observations:,30.0
Model:,Logit,Df Residuals:,24.0
Method:,MLE,Df Model:,5.0
Date:,"Thu, 22 Feb 2018",Pseudo R-squ.:,0.01877
Time:,03:47:28,Log-Likelihood:,-20.401
converged:,True,LL-Null:,-20.791
,,LLR p-value:,0.9783

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,0.4732,1.039,0.456,0.649,-1.563 2.509
DC,0.0573,0.261,0.219,0.826,-0.454 0.569
DLW,-0.0026,0.302,-0.009,0.993,-0.594 0.589
DRW,0.1486,0.291,0.510,0.610,-0.423 0.720
DLD,0.0176,0.279,0.063,0.950,-0.530 0.565
DRD,0.0326,0.329,0.099,0.921,-0.612 0.677


#### mean goals regression

- regress **mean goals for** on the mean of players by position and quality (predictor variables). Add a constant to the predictors and use **OLS**. The purpose is to deterimine the impact each roster position has on team winning percent.

In [257]:
y = ds['meanGF']  
X = sm.add_constant(ds[['MeanC1', 'MeanC2', 'MeanC3', 'MeanC4','MeanLW1', 'MeanLW2', 'MeanLW3', 'MeanLW4', 'MeanRW1', 'MeanRW2', 'MeanRW3', 'MeanRW4', 'MeanLD1', 'MeanLD2', 'MeanLD3', 'MeanRD1', 'MeanRD2', 'MeanRD3']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,meanGF,R-squared:,0.858
Model:,OLS,Adj. R-squared:,0.684
Method:,Least Squares,F-statistic:,4.929
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.00298
Time:,03:47:28,Log-Likelihood:,16.962
No. Observations:,30,AIC:,0.07547
Df Residuals:,13,BIC:,23.9
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.1325,0.007,18.862,0.000,0.117 0.148
MeanC1,0.4594,0.116,3.948,0.002,0.208 0.711
MeanC2,0.2222,0.101,2.211,0.046,0.005 0.439
MeanC3,0.0400,0.072,0.553,0.590,-0.116 0.196
MeanC4,0.2151,0.161,1.336,0.204,-0.133 0.563
MeanLW1,0.1904,0.096,1.985,0.069,-0.017 0.398
MeanLW2,0.3415,0.114,2.996,0.010,0.095 0.588
MeanLW3,0.3129,0.103,3.036,0.010,0.090 0.536
MeanLW4,-0.0964,0.104,-0.925,0.372,-0.321 0.129

0,1,2,3
Omnibus:,2.633,Durbin-Watson:,2.369
Prob(Omnibus):,0.268,Jarque-Bera (JB):,2.221
Skew:,-0.651,Prob(JB):,0.329
Kurtosis:,2.714,Cond. No.,5.4e+16


- regress **mean goals against** on the mean of players by position and quality (predictor variables). Add a constant to the predictors and use **OLS**. The purpose is to deterimine the impact each roster position has on team winning percent.

In [258]:
y = ds['meanGA']  
X = sm.add_constant(ds[['MeanC1', 'MeanC2', 'MeanC3', 'MeanC4','MeanLW1', 'MeanLW2', 'MeanLW3', 'MeanLW4', 'MeanRW1', 'MeanRW2', 'MeanRW3', 'MeanRW4', 'MeanLD1', 'MeanLD2', 'MeanLD3', 'MeanRD1', 'MeanRD2', 'MeanRD3']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,meanGA,R-squared:,0.662
Model:,OLS,Adj. R-squared:,0.247
Method:,Least Squares,F-statistic:,1.593
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.201
Time:,03:47:28,Log-Likelihood:,6.9648
No. Observations:,30,AIC:,20.07
Df Residuals:,13,BIC:,43.89
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.1456,0.010,14.851,0.000,0.124 0.167
MeanC1,0.3075,0.162,1.894,0.081,-0.043 0.658
MeanC2,0.3647,0.140,2.600,0.022,0.062 0.668
MeanC3,0.2074,0.101,2.054,0.061,-0.011 0.426
MeanC4,0.2016,0.225,0.897,0.386,-0.284 0.687
MeanLW1,0.0076,0.134,0.057,0.956,-0.282 0.297
MeanLW2,0.0600,0.159,0.377,0.712,-0.284 0.404
MeanLW3,-0.1017,0.144,-0.707,0.492,-0.412 0.209
MeanLW4,0.2078,0.145,1.430,0.176,-0.106 0.522

0,1,2,3
Omnibus:,3.433,Durbin-Watson:,1.66
Prob(Omnibus):,0.18,Jarque-Bera (JB):,1.649
Skew:,0.225,Prob(JB):,0.438
Kurtosis:,1.944,Cond. No.,5.4e+16


- regress **mean goals for** on the differential of players by position and quality (predictor variables). Add a constant to the predictors and use **OLS**. The purpose is to deterimine the impact each roster position has on team winning percent.

In [259]:
y = ds['meanGF']  
X = sm.add_constant(ds[['DC', 'DLW', 'DRW', 'DLD', 'DRD']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,meanGF,R-squared:,0.442
Model:,OLS,Adj. R-squared:,0.325
Method:,Least Squares,F-statistic:,3.799
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.0112
Time:,03:47:28,Log-Likelihood:,-3.6226
No. Observations:,30,AIC:,19.25
Df Residuals:,24,BIC:,27.65
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,3.4201,0.157,21.759,0.000,3.096 3.745
DC,0.0889,0.040,2.248,0.034,0.007 0.171
DLW,0.0434,0.046,0.948,0.353,-0.051 0.138
DRW,0.0937,0.044,2.132,0.043,0.003 0.184
DLD,0.0542,0.042,1.281,0.213,-0.033 0.142
DRD,0.0755,0.050,1.514,0.143,-0.027 0.178

0,1,2,3
Omnibus:,3.236,Durbin-Watson:,1.342
Prob(Omnibus):,0.198,Jarque-Bera (JB):,1.822
Skew:,-0.502,Prob(JB):,0.402
Kurtosis:,3.671,Cond. No.,12.0


- regress **mean goals against** on the differential of players by position and quality (predictor variables). Add a constant to the predictors and use **OLS**. The purpose is to deterimine the impact each roster position has on team winning percent.

In [260]:
y = ds['meanGA']  
X = sm.add_constant(ds[['DC', 'DLW', 'DRW', 'DLD', 'DRD']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,meanGA,R-squared:,0.075
Model:,OLS,Adj. R-squared:,-0.117
Method:,Least Squares,F-statistic:,0.3918
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.85
Time:,03:47:28,Log-Likelihood:,-8.1399
No. Observations:,30,AIC:,28.28
Df Residuals:,24,BIC:,36.69
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,2.8031,0.183,15.341,0.000,2.426 3.180
DC,-0.0273,0.046,-0.595,0.558,-0.122 0.068
DLW,0.0458,0.053,0.860,0.399,-0.064 0.156
DRW,-0.0509,0.051,-0.996,0.329,-0.156 0.055
DLD,0.0248,0.049,0.504,0.619,-0.077 0.126
DRD,0.0001,0.058,0.002,0.998,-0.119 0.120

0,1,2,3
Omnibus:,1.409,Durbin-Watson:,1.173
Prob(Omnibus):,0.494,Jarque-Bera (JB):,0.977
Skew:,0.44,Prob(JB):,0.614
Kurtosis:,2.907,Cond. No.,12.0


## season_game_level_analysis

#### $HomeWin = \beta_{0} + \beta_{1}DC_{1} + \beta_{2}DC_{2} + \beta_{3}DC_{3} + \beta_{4}DC_{4} + \beta_{5}DLW_{1} + \beta_{6}DLW_{2} + \beta_{7}DLW_{3} + \beta_{8}DLW_{4} + + \beta_{9}DRW_{1} + \beta_{10}DRW_{2} + \beta_{11}DRW_{3} + \beta_{12}DRW_{4} + \beta_{13}DLD_{1} + \beta_{14}DLD_{2} + \beta_{15}DLD_{3} + \beta_{16}DRD_{1}+ \beta_{17}DRD_{2} + \beta_{18}DRD_{3}+ e_{s,g}$

- merge season game data (dg) and season game roster (dy).

In [261]:
dl = dg.merge(dy, on=['Season', 'GameNumber'], how='left')
dl.head()

Unnamed: 0,Season,GameNumber,VTeamCode,HTeamCode,VGF,HGF,GD,VGA,HGA,WinTeam,LossTeam,VC1,VC2,VC3,VC4,VLW1,VLW2,VLW3,VLW4,VRW1,VRW2,VRW3,VRW4,VLD1,VLD2,VLD3,VRD1,VRD2,VRD3,HC1,HC2,HC3,HC4,HLW1,HLW2,HLW3,HLW4,HRW1,HRW2,HRW3,HRW4,HLD1,HLD2,HLD3,HRD1,HRD2,HRD3
0,2010,20001,MTL,TOR,2,3,1,3,2,TOR,MTL,1.0,1.0,2.0,1.0,0.0,1.0,2.0,0.0,0.0,1.0,2.0,1.0,1.0,0.0,0.0,1.0,3.0,1.0,1.0,1.0,1.0,1.0,2.0,0.0,0.0,2.0,2.0,1.0,1.0,0.0,1.0,2.0,0.0,1.0,1.0,1.0
1,2010,20002,PHI,PIT,3,2,-1,2,3,PHI,PIT,3.0,0.0,1.0,0.0,0.0,2.0,1.0,2.0,1.0,1.0,1.0,0.0,3.0,0.0,0.0,0.0,1.0,2.0,1.0,1.0,3.0,0.0,1.0,1.0,0.0,0.0,1.0,2.0,2.0,0.0,1.0,1.0,1.0,1.0,2.0,0.0
2,2010,20003,CAR,MIN,4,3,-1,3,4,CAR,MIN,2.0,1.0,1.0,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,2.0,1.0,1.0,1.0,1.0,0.0,1.0,3.0,0.0,0.0,1.0,2.0,1.0,0.0,0.0,2.0,1.0,1.0,0.0,2.0,1.0,1.0,0.0,2.0
3,2010,20004,CHI,COL,3,4,1,4,3,COL,CHI,2.0,1.0,1.0,0.0,1.0,2.0,1.0,0.0,1.0,1.0,2.0,0.0,1.0,0.0,2.0,1.0,1.0,1.0,2.0,0.0,1.0,0.0,1.0,2.0,0.0,1.0,1.0,3.0,1.0,0.0,0.0,4.0,0.0,2.0,0.0,0.0
4,2010,20005,CGY,EDM,0,4,4,4,0,EDM,CGY,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,2.0,0.0,3.0,0.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,0.0,0.0,2.0,0.0,1.0,3.0,0.0


- determine if the home or away team won the game.

In [262]:
dl['WinTeam'] = dl.apply(lambda x: 'HOME' if x['GD'] > 0 else 'AWAY', axis=1)

- Calculate the difference between player quality per game for all positions with respect to home team ( Home Team - Visitor Team). There are 5 positions and 2 types of player quality. This will give us a total of 10 differenecs. 

In [263]:
dl.shape

(1230, 47)

- total of forwards and defensemen by team per game.

In [264]:
dl['VF'] = dl['VC1'] + dl['VC2'] + dl['VC3'] + dl['VC4'] + dl['VLW1'] + dl['VLW2'] + dl['VLW3'] + dl['VLW4'] + dl['VRW1'] + dl['VRW2'] + dl['VRW3'] + dl['VRW4']
dl['VD'] = dl['VLD1'] + dl['VLD2'] + dl['VLD3'] + dl['VRD1'] + dl['VRD2'] + dl['VRD3']
dl['HF'] = dl['HC1'] + dl['HC2'] + dl['HC3'] + dl['HC4'] + dl['HLW1'] + dl['HLW2'] + dl['HLW3'] + dl['HLW4'] + dl['HRW1'] + dl['HRW2'] + dl['HRW3'] + dl['HRW4']
dl['HD'] = dl['HLD1'] + dl['HLD2'] + dl['HLD3'] + dl['HRD1'] + dl['HRD2'] + dl['HRD3']

- total of forwards and defensemen per game.

In [265]:
dl['F'] = dl['VF'] + dl['HF']
dl['D'] = dl['VD'] + dl['HD']
dl.head()

Unnamed: 0,Season,GameNumber,VTeamCode,HTeamCode,VGF,HGF,GD,VGA,HGA,WinTeam,LossTeam,VC1,VC2,VC3,VC4,VLW1,VLW2,VLW3,VLW4,VRW1,VRW2,VRW3,VRW4,VLD1,VLD2,VLD3,VRD1,VRD2,VRD3,HC1,HC2,HC3,HC4,HLW1,HLW2,HLW3,HLW4,HRW1,HRW2,HRW3,HRW4,HLD1,HLD2,HLD3,HRD1,HRD2,HRD3,VF,VD,HF,HD,F,D
0,2010,20001,MTL,TOR,2,3,1,3,2,HOME,MTL,1.0,1.0,2.0,1.0,0.0,1.0,2.0,0.0,0.0,1.0,2.0,1.0,1.0,0.0,0.0,1.0,3.0,1.0,1.0,1.0,1.0,1.0,2.0,0.0,0.0,2.0,2.0,1.0,1.0,0.0,1.0,2.0,0.0,1.0,1.0,1.0,12.0,6.0,12.0,6.0,24.0,12.0
1,2010,20002,PHI,PIT,3,2,-1,2,3,AWAY,PIT,3.0,0.0,1.0,0.0,0.0,2.0,1.0,2.0,1.0,1.0,1.0,0.0,3.0,0.0,0.0,0.0,1.0,2.0,1.0,1.0,3.0,0.0,1.0,1.0,0.0,0.0,1.0,2.0,2.0,0.0,1.0,1.0,1.0,1.0,2.0,0.0,12.0,6.0,12.0,6.0,24.0,12.0
2,2010,20003,CAR,MIN,4,3,-1,3,4,AWAY,MIN,2.0,1.0,1.0,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,2.0,1.0,1.0,1.0,1.0,0.0,1.0,3.0,0.0,0.0,1.0,2.0,1.0,0.0,0.0,2.0,1.0,1.0,0.0,2.0,1.0,1.0,0.0,2.0,12.0,6.0,12.0,6.0,24.0,12.0
3,2010,20004,CHI,COL,3,4,1,4,3,HOME,CHI,2.0,1.0,1.0,0.0,1.0,2.0,1.0,0.0,1.0,1.0,2.0,0.0,1.0,0.0,2.0,1.0,1.0,1.0,2.0,0.0,1.0,0.0,1.0,2.0,0.0,1.0,1.0,3.0,1.0,0.0,0.0,4.0,0.0,2.0,0.0,0.0,12.0,6.0,12.0,6.0,24.0,12.0
4,2010,20005,CGY,EDM,0,4,4,4,0,HOME,CGY,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,2.0,0.0,3.0,0.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,0.0,0.0,2.0,0.0,1.0,3.0,0.0,12.0,6.0,12.0,6.0,24.0,12.0


- **keep games with 12 forwards and 6 defensemen per team.**

In [266]:
dl = dl[((dl['VF'] == 12) & (dl['VD'] == 6) & (dl['HF'] == 12) & (dl['HD'] == 6))]

In [267]:
dl.shape

(879, 53)

In [268]:
dl['VF'].value_counts()

12.0    879
Name: VF, dtype: int64

In [269]:
dl['VD'].value_counts()

6.0    879
Name: VD, dtype: int64

In [270]:
dl['HF'].value_counts()

12.0    879
Name: HF, dtype: int64

In [271]:
dl['HD'].value_counts()

6.0    879
Name: HD, dtype: int64

### summary analysis

In [272]:
dl.describe()

Unnamed: 0,Season,GameNumber,VGF,HGF,GD,VGA,HGA,VC1,VC2,VC3,VC4,VLW1,VLW2,VLW3,VLW4,VRW1,VRW2,VRW3,VRW4,VLD1,VLD2,VLD3,VRD1,VRD2,VRD3,HC1,HC2,HC3,HC4,HLW1,HLW2,HLW3,HLW4,HRW1,HRW2,HRW3,HRW4,HLD1,HLD2,HLD3,HRD1,HRD2,HRD3,VF,VD,HF,HD,F,D
count,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0
mean,2010.0,20613.112628,2.763367,2.960182,0.196815,2.960182,2.763367,1.119454,1.435722,1.284414,0.426621,1.139932,1.098976,1.050057,0.534699,0.803185,1.394767,1.208191,0.503982,0.789534,1.415245,0.805461,1.185438,1.170648,0.633675,1.203641,1.320819,1.097838,0.372014,1.059158,1.141069,1.063709,0.660978,0.904437,1.508532,1.240046,0.427759,0.936291,1.227531,0.810011,1.023891,1.255973,0.746303,12.0,6.0,12.0,6.0,24.0,12.0
std,0.0,349.491101,1.672088,1.727312,2.455284,1.727312,1.672088,0.722607,0.863139,0.855703,0.66905,0.829268,0.774295,0.743841,0.675551,0.738581,0.811377,0.809188,0.609096,0.863103,0.960161,0.693062,0.78335,0.970172,0.657914,0.790424,0.837292,0.931348,0.553876,0.799291,0.86603,0.665136,0.790729,0.694899,0.925275,0.829654,0.587612,0.905835,0.893213,0.729521,0.725785,0.891242,0.68703,0.0,0.0,0.0,0.0,0.0,0.0
min,2010.0,20001.0,0.0,0.0,-8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,6.0,12.0,6.0,24.0,12.0
25%,2010.0,20317.5,2.0,2.0,-1.0,2.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,12.0,6.0,12.0,6.0,24.0,12.0
50%,2010.0,20613.0,3.0,3.0,1.0,3.0,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,2.0,1.0,0.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,12.0,6.0,12.0,6.0,24.0,12.0
75%,2010.0,20913.5,4.0,4.0,2.0,4.0,4.0,2.0,2.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,1.0,2.0,1.0,12.0,6.0,12.0,6.0,24.0,12.0
max,2010.0,21230.0,10.0,9.0,7.0,9.0,10.0,3.0,3.0,3.0,4.0,4.0,3.0,3.0,4.0,3.0,4.0,4.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,3.0,3.0,4.0,3.0,4.0,4.0,3.0,3.0,4.0,4.0,3.0,3.0,3.0,12.0,6.0,12.0,6.0,24.0,12.0


In [273]:
#dl = dl[['Season', 'GameNumber', 'VTeamCode', 'HTeamCode', 'HGF', 'VGF', 'GD','WinTeam', 'VF1', 'VF2', 'VD1', 'VD2', 'HF1', 'HF2', 'HD1', 'HD2']]

- determine if a game was won by the home or visitor team.
- compute the difference in quality of forwards and defensemen between home and visitor team per game (DF1, DF2, DD1, DD2). 

In [274]:
dl['HomeWin'] = dl.apply(lambda x: 1 if x['WinTeam']=='HOME' else 0, axis=1)

# difference in center position per game
dl['DC1'] = dl['HC1'] - dl['VC1']
dl['DC2'] = dl['HC2'] - dl['VC2']
dl['DC3'] = dl['HC3'] - dl['VC3']
dl['DC4'] = dl['HC4'] - dl['VC4']

# difference in left position per game
dl['DLW1'] = dl['HLW1'] - dl['VLW1']
dl['DLW2'] = dl['HLW2'] - dl['VLW2']
dl['DLW3'] = dl['HLW3'] - dl['VLW3']
dl['DLW4'] = dl['HLW4'] - dl['VLW4']

# difference in right wing position per game
dl['DRW1'] = dl['HRW1'] - dl['VRW1']
dl['DRW2'] = dl['HRW2'] - dl['VRW2']
dl['DRW3'] = dl['HRW3'] - dl['VRW3']
dl['DRW4'] = dl['HRW4'] - dl['VRW4']

# difference in left defence position per game
dl['DLD1'] = dl['HLD1'] - dl['VLD1']
dl['DLD2'] = dl['HLD2'] - dl['VLD2']
dl['DLD3'] = dl['HLD3'] - dl['VLD3']

# difference in right defence position per game
dl['DRD1'] = dl['HRD1'] - dl['VRD1']
dl['DRD2'] = dl['HRD2'] - dl['VRD2']
dl['DRD3'] = dl['HRD3'] - dl['VRD3']

In [275]:
dl.groupby(['WinTeam'])['DC1', 'DC2', 'DC3', 'DC4', 'DLW1', 'DLW2', 'DLW3', 'DLW4', 'DRW1', 'DRW2', 'DRW3', 'DRW4', 'DLD1', 'DLD2', 'DLD3', 'DRD1', 'DRD2', 'DRD3', ].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,DC1,DC2,DC3,DC4,DLW1,DLW2,DLW3,DLW4,DRW1,DRW2,DRW3,DRW4,DLD1,DLD2,DLD3,DRD1,DRD2,DRD3
WinTeam,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
AWAY,count,422.0,422.0,422.0,422.0,422.0,422.0,422.0,422.0,422.0,422.0,422.0,422.0,422.0,422.0,422.0,422.0,422.0,422.0
AWAY,mean,0.014218,-0.097156,-0.135071,-0.049763,-0.087678,0.007109,-0.00237,0.13981,0.082938,0.132701,0.049763,-0.054502,0.175355,-0.180095,-0.021327,-0.208531,0.120853,0.113744
AWAY,std,1.107001,1.262483,1.288816,0.887971,1.15924,1.174733,0.988049,0.982932,1.047673,1.243098,1.124078,0.803417,1.228157,1.215803,1.009231,1.08285,1.33724,1.027577
AWAY,min,-3.0,-3.0,-3.0,-4.0,-4.0,-3.0,-2.0,-2.0,-3.0,-3.0,-3.0,-3.0,-3.0,-3.0,-3.0,-3.0,-3.0,-3.0
AWAY,25%,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
AWAY,50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AWAY,75%,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0
AWAY,max,3.0,3.0,3.0,3.0,3.0,3.0,2.0,3.0,2.0,4.0,3.0,2.0,3.0,3.0,2.0,2.0,3.0,3.0
HOME,count,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0
HOME,mean,0.148796,-0.131291,-0.234136,-0.059081,-0.074398,0.074398,0.028446,0.113786,0.118162,0.09628,0.015317,-0.09628,0.12035,-0.194748,0.028446,-0.118162,0.052516,0.111597


### estimate roster model

- regress **home win** on the difference in number of home and visitor players by position and quality. Add a constant to the predictors and use OLS. The purpose is to deterimine the impact each roster position has on home team success.

In [276]:
print ('season game level analysis (home win) by roster position differential')
y = dl['HomeWin']  
X = sm.add_constant(dl[['DC1', 'DC2', 'DC3', 'DC4', 'DLW1', 'DLW2', 'DLW3', 'DLW4', 'DRW1', 'DRW2', 'DRW3', 'DRW4', 'DLD1', 'DLD2', 'DLD3', 'DRD1', 'DRD2', 'DRD3']] )
result = sm.OLS(y, X).fit()
print(result.summary())

beginningtex = """\\documentclass{report}
\\usepackage{booktabs}
\\begin{document}"""
endtex = "\end{document}"

f = open('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/latex/roster/points_per_toi/season_game_level_analysis.tex', 'w')
f.write(beginningtex)
f.write(result.summary().as_latex())
f.write(endtex)
f.close()

season game level analysis (home win) by roster position differential
                            OLS Regression Results                            
Dep. Variable:                HomeWin   R-squared:                       0.012
Model:                            OLS   Adj. R-squared:                 -0.006
Method:                 Least Squares   F-statistic:                    0.6731
Date:                Thu, 22 Feb 2018   Prob (F-statistic):              0.822
Time:                        03:47:30   Log-Likelihood:                -631.82
No. Observations:                 879   AIC:                             1298.
Df Residuals:                     862   BIC:                             1379.
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
-------------------------------------------------------------

- By increasing the differential of **elite center** (home team – visitor team) by one unit, home win **increases** by 2.5 games.
- By increasing the differential of **secondary center** (home team – visitor team) by one unit, home win **increases** by 2 games respectfully.

- regress **home win** on the difference in number of elite home and visitor players by position (DC1, DLW1, DRW1, DLD1, DRD1). Add a constant to the predictors and use **OLS**. The purpose is to deterimine the impact each roster position has on home team success.

In [277]:
y = dl['HomeWin']  
X = sm.add_constant(dl[['DC1', 'DC2', 'DC3', 'DC4']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,HomeWin,R-squared:,0.006
Model:,OLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,1.212
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.304
Time:,03:47:30,Log-Likelihood:,-634.84
No. Observations:,879,AIC:,1280.0
Df Residuals:,874,BIC:,1304.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.5156,0.017,29.586,0.000,0.481 0.550
DC1,0.0350,0.023,1.536,0.125,-0.010 0.080
DC2,0.0109,0.021,0.515,0.607,-0.031 0.052
DC3,-0.0109,0.017,-0.640,0.523,-0.044 0.022
DC4,-0.0109,0.020,-0.533,0.594,-0.051 0.029

0,1,2,3
Omnibus:,0.943,Durbin-Watson:,1.907
Prob(Omnibus):,0.624,Jarque-Bera (JB):,143.315
Skew:,-0.08,Prob(JB):,7.58e-32
Kurtosis:,1.028,Cond. No.,2.84


In [278]:
y = dl['HomeWin']  
X = sm.add_constant(dl[['DLW1', 'DLW2', 'DLW3', 'DLW4']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,HomeWin,R-squared:,0.002
Model:,OLS,Adj. R-squared:,-0.002
Method:,Least Squares,F-statistic:,0.5063
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.731
Time:,03:47:30,Log-Likelihood:,-636.26
No. Observations:,879,AIC:,1283.0
Df Residuals:,874,BIC:,1306.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.5193,0.017,30.342,0.000,0.486 0.553
DLW1,0.0188,0.019,1.013,0.311,-0.018 0.055
DLW2,0.0256,0.020,1.303,0.193,-0.013 0.064
DLW3,0.0166,0.020,0.840,0.401,-0.022 0.055
DLW4,0.0066,0.019,0.344,0.731,-0.031 0.044

0,1,2,3
Omnibus:,0.935,Durbin-Watson:,1.888
Prob(Omnibus):,0.627,Jarque-Bera (JB):,145.15
Skew:,-0.079,Prob(JB):,3.03e-32
Kurtosis:,1.016,Cond. No.,2.61


In [279]:
y = dl['HomeWin']  
X = sm.add_constant(dl[['DRW1', 'DRW2', 'DRW3', 'DRW4']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,HomeWin,R-squared:,0.001
Model:,OLS,Adj. R-squared:,-0.003
Method:,Least Squares,F-statistic:,0.3056
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.874
Time:,03:47:30,Log-Likelihood:,-636.66
No. Observations:,879,AIC:,1283.0
Df Residuals:,874,BIC:,1307.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.5196,0.017,30.214,0.000,0.486 0.553
DRW1,0.0018,0.018,0.099,0.921,-0.034 0.038
DRW2,-0.0076,0.015,-0.502,0.616,-0.037 0.022
DRW3,-0.0095,0.016,-0.607,0.544,-0.040 0.021
DRW4,-0.0174,0.021,-0.820,0.413,-0.059 0.024

0,1,2,3
Omnibus:,0.943,Durbin-Watson:,1.888
Prob(Omnibus):,0.624,Jarque-Bera (JB):,145.686
Skew:,-0.08,Prob(JB):,2.32e-32
Kurtosis:,1.012,Cond. No.,1.88


In [280]:
y = dl['HomeWin']  
X = sm.add_constant(dl[['DLD1', 'DLD2', 'DLD3']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,HomeWin,R-squared:,0.001
Model:,OLS,Adj. R-squared:,-0.002
Method:,Least Squares,F-statistic:,0.3375
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.798
Time:,03:47:30,Log-Likelihood:,-636.77
No. Observations:,879,AIC:,1282.0
Df Residuals:,875,BIC:,1301.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.5203,0.017,30.432,0.000,0.487 0.554
DLD1,-0.0136,0.020,-0.692,0.489,-0.052 0.025
DLD2,-0.0083,0.020,-0.411,0.682,-0.048 0.031
DLD3,0.0057,0.020,0.279,0.781,-0.034 0.046

0,1,2,3
Omnibus:,0.939,Durbin-Watson:,1.882
Prob(Omnibus):,0.625,Jarque-Bera (JB):,145.825
Skew:,-0.08,Prob(JB):,2.16e-32
Kurtosis:,1.011,Cond. No.,2.78


In [281]:
y = dl['HomeWin']  
X = sm.add_constant(dl[['DRD1', 'DRD2', 'DRD3']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,HomeWin,R-squared:,0.002
Model:,OLS,Adj. R-squared:,-0.002
Method:,Least Squares,F-statistic:,0.5194
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.669
Time:,03:47:30,Log-Likelihood:,-636.49
No. Observations:,879,AIC:,1281.0
Df Residuals:,875,BIC:,1300.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.5232,0.017,30.433,0.000,0.489 0.557
DRD1,0.0193,0.022,0.898,0.369,-0.023 0.062
DRD2,-0.0004,0.019,-0.022,0.982,-0.038 0.038
DRD3,-0.0009,0.021,-0.042,0.966,-0.042 0.040

0,1,2,3
Omnibus:,0.941,Durbin-Watson:,1.889
Prob(Omnibus):,0.625,Jarque-Bera (JB):,145.467
Skew:,-0.08,Prob(JB):,2.58e-32
Kurtosis:,1.013,Cond. No.,2.79


- regress **home win** on the difference in number of elite home and visitor players by position (DC1, DLW1, DRW1, DLD1, DRD1). Add a constant to the predictors and use **Logit**. The purpose is to deterimine the impact each roster position has on home team success.

In [282]:
y = dl['HomeWin']  
X = sm.add_constant(dl[['DC1', 'DLW1', 'DRW1', 'DLD1', 'DRD1']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,HomeWin,R-squared:,0.008
Model:,OLS,Adj. R-squared:,0.002
Method:,Least Squares,F-statistic:,1.332
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.248
Time:,03:47:30,Log-Likelihood:,-633.93
No. Observations:,879,AIC:,1280.0
Df Residuals:,873,BIC:,1309.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.5207,0.017,30.037,0.000,0.487 0.555
DC1,0.0371,0.017,2.177,0.030,0.004 0.070
DLW1,-0.0063,0.016,-0.392,0.695,-0.038 0.025
DRW1,0.0077,0.017,0.452,0.651,-0.026 0.041
DLD1,-0.0185,0.016,-1.146,0.252,-0.050 0.013
DRD1,0.0153,0.017,0.898,0.369,-0.018 0.049

0,1,2,3
Omnibus:,0.936,Durbin-Watson:,1.906
Prob(Omnibus):,0.626,Jarque-Bera (JB):,142.082
Skew:,-0.079,Prob(JB):,1.4000000000000002e-31
Kurtosis:,1.037,Cond. No.,1.91


In [283]:
y = dl['HomeWin']  
X = sm.add_constant(dl[['DC1', 'DLW1', 'DRW1', 'DLD1', 'DRD1']] )
result = sm.Logit(y, X).fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.688556
         Iterations 4


0,1,2,3
Dep. Variable:,HomeWin,No. Observations:,879.0
Model:,Logit,Df Residuals:,873.0
Method:,MLE,Df Model:,5.0
Date:,"Thu, 22 Feb 2018",Pseudo R-squ.:,0.005486
Time:,03:47:30,Log-Likelihood:,-605.24
converged:,True,LL-Null:,-608.58
,,LLR p-value:,0.2457

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,0.0833,0.070,1.195,0.232,-0.053 0.220
DC1,0.1495,0.069,2.173,0.030,0.015 0.284
DLW1,-0.0253,0.064,-0.393,0.694,-0.152 0.101
DRW1,0.0310,0.068,0.454,0.650,-0.103 0.165
DLD1,-0.0746,0.065,-1.147,0.252,-0.202 0.053
DRD1,0.0616,0.069,0.899,0.369,-0.073 0.196


- regress **home win** on the difference in number of secondary quality home and visitor players by position (DC2, DLW2, DRW2, DLD2, DRD2). Add a constant to the predictors and use **OLS** and **Logit**. The purpose is to deterimine the impact each roster position has on home team success.

In [284]:
y = dl['HomeWin']  
X = sm.add_constant(dl[['DC2', 'DLW2', 'DRW2', 'DLD2', 'DRD2']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,HomeWin,R-squared:,0.002
Model:,OLS,Adj. R-squared:,-0.004
Method:,Least Squares,F-statistic:,0.3022
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.912
Time:,03:47:30,Log-Likelihood:,-636.51
No. Observations:,879,AIC:,1285.0
Df Residuals:,873,BIC:,1314.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.5203,0.017,30.060,0.000,0.486 0.554
DC2,-0.0066,0.014,-0.471,0.638,-0.034 0.021
DLW2,0.0108,0.015,0.717,0.474,-0.019 0.040
DRW2,-0.0056,0.014,-0.387,0.699,-0.034 0.023
DLD2,0.0013,0.015,0.092,0.927,-0.027 0.030
DRD2,-0.0087,0.013,-0.658,0.511,-0.035 0.017

0,1,2,3
Omnibus:,0.94,Durbin-Watson:,1.891
Prob(Omnibus):,0.625,Jarque-Bera (JB):,145.488
Skew:,-0.08,Prob(JB):,2.5600000000000004e-32
Kurtosis:,1.013,Cond. No.,1.63


In [285]:
y = dl['HomeWin']  
X = sm.add_constant(dl[['DC2', 'DLW2', 'DRW2', 'DLD2', 'DRD2']] )
result = sm.Logit(y, X).fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.691490
         Iterations 3


0,1,2,3
Dep. Variable:,HomeWin,No. Observations:,879.0
Model:,Logit,Df Residuals:,873.0
Method:,MLE,Df Model:,5.0
Date:,"Thu, 22 Feb 2018",Pseudo R-squ.:,0.001249
Time:,03:47:30,Log-Likelihood:,-607.82
converged:,True,LL-Null:,-608.58
,,LLR p-value:,0.9108

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,0.0815,0.069,1.177,0.239,-0.054 0.217
DC2,-0.0263,0.056,-0.472,0.637,-0.136 0.083
DLW2,0.0431,0.060,0.719,0.472,-0.074 0.161
DRW2,-0.0224,0.058,-0.388,0.698,-0.135 0.091
DLD2,0.0054,0.059,0.092,0.927,-0.109 0.120
DRD2,-0.0350,0.053,-0.660,0.509,-0.139 0.069


- regress **home win** on the difference in number of third quality home and visitor players by position (DC3, DLW3, DRW3, DLD3, DRD3). Add a constant to the predictors and use **OLS** and **Logit**. The purpose is to deterimine the impact each roster position has on home team success.

In [286]:
y = dl['HomeWin']  
X = sm.add_constant(dl[['DC3', 'DLW3', 'DRW3', 'DLD3', 'DRD3']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,HomeWin,R-squared:,0.002
Model:,OLS,Adj. R-squared:,-0.003
Method:,Least Squares,F-statistic:,0.4208
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.834
Time:,03:47:30,Log-Likelihood:,-636.22
No. Observations:,879,AIC:,1284.0
Df Residuals:,873,BIC:,1313.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.5172,0.017,30.119,0.000,0.483 0.551
DC3,-0.0154,0.014,-1.108,0.268,-0.043 0.012
DLW3,0.0084,0.017,0.486,0.627,-0.026 0.042
DRW3,-0.0049,0.015,-0.330,0.742,-0.034 0.024
DLD3,0.0090,0.017,0.522,0.602,-0.025 0.043
DRD3,-0.0014,0.018,-0.079,0.937,-0.037 0.034

0,1,2,3
Omnibus:,0.943,Durbin-Watson:,1.891
Prob(Omnibus):,0.624,Jarque-Bera (JB):,145.094
Skew:,-0.08,Prob(JB):,3.11e-32
Kurtosis:,1.016,Cond. No.,1.59


In [287]:
y = dl['HomeWin']  
X = sm.add_constant(dl[['DC3', 'DLW3', 'DRW3', 'DLD3', 'DRD3']] )
result = sm.Logit(y, X).fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.691150
         Iterations 4


0,1,2,3
Dep. Variable:,HomeWin,No. Observations:,879.0
Model:,Logit,Df Residuals:,873.0
Method:,MLE,Df Model:,5.0
Date:,"Thu, 22 Feb 2018",Pseudo R-squ.:,0.001739
Time:,03:47:30,Log-Likelihood:,-607.52
converged:,True,LL-Null:,-608.58
,,LLR p-value:,0.8328

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,0.0690,0.069,1.004,0.315,-0.066 0.204
DC3,-0.0618,0.056,-1.111,0.267,-0.171 0.047
DLW3,0.0338,0.069,0.487,0.626,-0.102 0.170
DRW3,-0.0197,0.059,-0.331,0.741,-0.136 0.097
DLD3,0.0362,0.069,0.524,0.600,-0.099 0.171
DRD3,-0.0058,0.073,-0.079,0.937,-0.149 0.137


- regress **home win** on the difference in number of bottom quality home and visitor forwards (DC4, DLW4, DRW4). Add a constant to the predictors and use **OLS** and **Logit**. The purpose is to deterimine the impact each roster position has on home team success.

In [288]:
y = dl['HomeWin']  
X = sm.add_constant(dl[['DC4', 'DLW4', 'DRW4']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,HomeWin,R-squared:,0.001
Model:,OLS,Adj. R-squared:,-0.002
Method:,Least Squares,F-statistic:,0.2879
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.834
Time:,03:47:30,Log-Likelihood:,-636.84
No. Observations:,879,AIC:,1282.0
Df Residuals:,875,BIC:,1301.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.5197,0.017,30.450,0.000,0.486 0.553
DC4,0.0003,0.019,0.017,0.986,-0.038 0.038
DLW4,-0.0090,0.017,-0.543,0.587,-0.042 0.024
DRW4,-0.0180,0.021,-0.840,0.401,-0.060 0.024

0,1,2,3
Omnibus:,0.934,Durbin-Watson:,1.888
Prob(Omnibus):,0.627,Jarque-Bera (JB):,145.925
Skew:,-0.079,Prob(JB):,2.0500000000000001e-32
Kurtosis:,1.01,Cond. No.,1.54


In [289]:
y = dl['HomeWin']  
X = sm.add_constant(dl[['DC4', 'DLW4', 'DRW4']] )
result = sm.Logit(y, X).fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.691861
         Iterations 3


0,1,2,3
Dep. Variable:,HomeWin,No. Observations:,879.0
Model:,Logit,Df Residuals:,875.0
Method:,MLE,Df Model:,3.0
Date:,"Thu, 22 Feb 2018",Pseudo R-squ.:,0.0007125
Time:,03:47:30,Log-Likelihood:,-608.15
converged:,True,LL-Null:,-608.58
,,LLR p-value:,0.8333

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,0.0789,0.068,1.155,0.248,-0.055 0.213
DC4,0.0013,0.078,0.017,0.986,-0.151 0.154
DLW4,-0.0362,0.067,-0.544,0.586,-0.167 0.094
DRW4,-0.0724,0.086,-0.841,0.401,-0.241 0.096


#### goal differential 

- regress **goal differential** on the difference in number of home and visitor players by position and quality (DF1, DF2, DD1, DD2). Add a constant to the predictors and use OLS. The purpose is to deterimine the impact each roster position has on goal differential.

In [290]:
y = dl['GD']  
X = sm.add_constant(dl[['DC1', 'DC2', 'DC3', 'DC4', 'DLW1', 'DLW2', 'DLW3', 'DLW4', 'DRW1', 'DRW2', 'DRW3', 'DRW4', 'DLD1', 'DLD2', 'DLD3', 'DRD1', 'DRD2', 'DRD3']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,GD,R-squared:,0.01
Model:,OLS,Adj. R-squared:,-0.008
Method:,Least Squares,F-statistic:,0.5398
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.927
Time:,03:47:30,Log-Likelihood:,-2031.9
No. Observations:,879,AIC:,4098.0
Df Residuals:,862,BIC:,4179.0
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.2213,0.091,2.428,0.015,0.042 0.400
DC1,0.1849,0.108,1.719,0.086,-0.026 0.396
DC2,0.0548,0.088,0.626,0.532,-0.117 0.227
DC3,-0.0406,0.072,-0.563,0.574,-0.182 0.101
DC4,-0.0325,0.106,-0.308,0.758,-0.240 0.175
DLW1,-0.0073,0.096,-0.076,0.939,-0.195 0.181
DLW2,0.0504,0.088,0.571,0.568,-0.123 0.224
DLW3,0.0348,0.088,0.394,0.694,-0.138 0.208
DLW4,-0.1204,0.086,-1.407,0.160,-0.288 0.048

0,1,2,3
Omnibus:,0.722,Durbin-Watson:,2.001
Prob(Omnibus):,0.697,Jarque-Bera (JB):,0.806
Skew:,0.034,Prob(JB):,0.668
Kurtosis:,2.868,Cond. No.,8010000000000000.0


## season_game_team_level_analysis

#### $Win = \beta_{0} + \beta_{1}C_{1} + \beta_{2}C_{2} + \beta_{3}C_{3} + \beta_{4}C_{4} + \beta_{5}LW_{1} + \beta_{6}LW_{2} + \beta_{7}LW_{3} + \beta_{8}LW_{4} + \beta_{9}RW_{1} + \beta_{10}RW_{2} + \beta_{11}RW_{3} + \beta_{12}RW_{4} + \beta_{13}LD_{1} + \beta_{14}LD_{2} + \beta_{15}LD_{3} + \beta_{16}RD_{1} + \beta_{17}RD_{2} + \beta_{18}RD_{3} + e_{s,g,t}$

- use season game data (dg) and season game team roster (dx) to conduct season game team level analysis (dt).

In [291]:
dg.head()

Unnamed: 0,Season,GameNumber,VTeamCode,HTeamCode,VGF,HGF,GD,VGA,HGA,WinTeam,LossTeam
0,2010,20001,MTL,TOR,2,3,1,3,2,TOR,MTL
1,2010,20002,PHI,PIT,3,2,-1,2,3,PHI,PIT
2,2010,20003,CAR,MIN,4,3,-1,3,4,CAR,MIN
3,2010,20004,CHI,COL,3,4,1,4,3,COL,CHI
4,2010,20005,CGY,EDM,0,4,4,4,0,EDM,CGY


In [292]:
dt = dg.merge(dx, on=['Season', 'GameNumber'], how='left')
dt.head()

Unnamed: 0,Season,GameNumber,VTeamCode,HTeamCode,VGF,HGF,GD,VGA,HGA,WinTeam,LossTeam,TeamCode,RosterCount,C1,C2,C3,C4,LD1,LD2,LD3,LW1,LW2,LW3,LW4,RD1,RD2,RD3,RW1,RW2,RW3,RW4,MeanC1,MeanC2,MeanC3,MeanC4,MeanLW1,MeanLW2,MeanLW3,MeanLW4,MeanRW1,MeanRW2,MeanRW3,MeanRW4,MeanLD1,MeanLD2,MeanLD3,MeanRD1,MeanRD2,MeanRD3,A
0,2010,20001,MTL,TOR,2,3,1,3,2,TOR,MTL,MTL,18.0,1.0,1.0,2.0,1.0,1.0,0.0,0.0,0.0,1.0,2.0,0.0,1.0,3.0,1.0,0.0,1.0,2.0,1.0,0.933333,1.566667,1.816667,0.516667,0.0,1.466667,1.95,0.05,0.766667,1.0,1.133333,0.8,1.916667,0.0,0.683333,0.6,2.0,0.8,1.0
1,2010,20001,MTL,TOR,2,3,1,3,2,TOR,MTL,TOR,18.0,1.0,1.0,1.0,1.0,1.0,2.0,0.0,2.0,0.0,0.0,2.0,1.0,1.0,1.0,2.0,1.0,1.0,0.0,0.984375,1.0,1.5625,0.21875,2.015625,0.0,0.578125,2.078125,2.0,0.96875,0.59375,0.0,0.671875,1.953125,0.640625,0.828125,1.0,0.90625,
2,2010,20002,PHI,PIT,3,2,-1,2,3,PHI,PIT,PHI,18.0,3.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,2.0,1.0,2.0,0.0,1.0,2.0,1.0,1.0,1.0,0.0,2.878788,0.0,0.893939,0.030303,0.0,1.924242,0.69697,1.439394,1.0,1.333333,1.80303,0.0,2.984848,0.0,0.212121,0.590909,1.0,1.212121,1.0
3,2010,20002,PHI,PIT,3,2,-1,2,3,PHI,PIT,PIT,18.0,1.0,1.0,3.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,2.0,0.0,1.0,2.0,2.0,0.0,1.028571,1.1,2.242857,0.085714,0.8,0.842857,1.285714,0.414286,0.514286,2.185714,1.328571,0.171429,1.528571,0.957143,0.9,0.757143,1.857143,0.0,
4,2010,20003,CAR,MIN,4,3,-1,3,4,CAR,MIN,CAR,18.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.984615,1.0,0.169231,0.892308,2.107692,0.738462,0.261538,0.523077,1.0,1.0,1.692308,0.630769,1.938462,0.4,1.061538,0.907692,1.492308,0.2,1.0


In [293]:
dt.shape

(2109, 50)

- Sum up goals for and against by team per game and find the goal differential (GD) per game. Assign a value of 1 to the team that won the game. 

In [294]:
dt['GD'] = dt.apply(lambda x: (x['HGF'] - x['VGF']) if x['HTeamCode']== x['TeamCode'] else (x['VGF'] - x['HGF']), 1)
dt['Win'] = dt.apply(lambda x: 1 if x['WinTeam']== x['TeamCode'] else 0, 1)
dt['GF'] = dt.apply(lambda x: x['HGF'] if x['HTeamCode']== x['TeamCode'] else x['VGF'], 1)
dt['GA'] = dt.apply(lambda x: x['HGF'] if x['HTeamCode']!= x['TeamCode'] else x['VGF'], 1)
dt.head()

Unnamed: 0,Season,GameNumber,VTeamCode,HTeamCode,VGF,HGF,GD,VGA,HGA,WinTeam,LossTeam,TeamCode,RosterCount,C1,C2,C3,C4,LD1,LD2,LD3,LW1,LW2,LW3,LW4,RD1,RD2,RD3,RW1,RW2,RW3,RW4,MeanC1,MeanC2,MeanC3,MeanC4,MeanLW1,MeanLW2,MeanLW3,MeanLW4,MeanRW1,MeanRW2,MeanRW3,MeanRW4,MeanLD1,MeanLD2,MeanLD3,MeanRD1,MeanRD2,MeanRD3,A,Win,GF,GA
0,2010,20001,MTL,TOR,2,3,-1,3,2,TOR,MTL,MTL,18.0,1.0,1.0,2.0,1.0,1.0,0.0,0.0,0.0,1.0,2.0,0.0,1.0,3.0,1.0,0.0,1.0,2.0,1.0,0.933333,1.566667,1.816667,0.516667,0.0,1.466667,1.95,0.05,0.766667,1.0,1.133333,0.8,1.916667,0.0,0.683333,0.6,2.0,0.8,1.0,0,2,3
1,2010,20001,MTL,TOR,2,3,1,3,2,TOR,MTL,TOR,18.0,1.0,1.0,1.0,1.0,1.0,2.0,0.0,2.0,0.0,0.0,2.0,1.0,1.0,1.0,2.0,1.0,1.0,0.0,0.984375,1.0,1.5625,0.21875,2.015625,0.0,0.578125,2.078125,2.0,0.96875,0.59375,0.0,0.671875,1.953125,0.640625,0.828125,1.0,0.90625,,1,3,2
2,2010,20002,PHI,PIT,3,2,1,2,3,PHI,PIT,PHI,18.0,3.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,2.0,1.0,2.0,0.0,1.0,2.0,1.0,1.0,1.0,0.0,2.878788,0.0,0.893939,0.030303,0.0,1.924242,0.69697,1.439394,1.0,1.333333,1.80303,0.0,2.984848,0.0,0.212121,0.590909,1.0,1.212121,1.0,1,3,2
3,2010,20002,PHI,PIT,3,2,-1,2,3,PHI,PIT,PIT,18.0,1.0,1.0,3.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,2.0,0.0,1.0,2.0,2.0,0.0,1.028571,1.1,2.242857,0.085714,0.8,0.842857,1.285714,0.414286,0.514286,2.185714,1.328571,0.171429,1.528571,0.957143,0.9,0.757143,1.857143,0.0,,0,2,3
4,2010,20003,CAR,MIN,4,3,1,3,4,CAR,MIN,CAR,18.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.984615,1.0,0.169231,0.892308,2.107692,0.738462,0.261538,0.523077,1.0,1.0,1.692308,0.630769,1.938462,0.4,1.061538,0.907692,1.492308,0.2,1.0,1,4,3


- total of forwards and defensemen by team per game.

In [295]:
dt['F'] = dt['C1'] + dt['C2'] + dt['C3'] + dt['C4'] + dt['LW1'] + dt['LW2'] + dt['LW3'] + dt['LW4'] + dt['RW1'] + dt['RW2'] + dt['RW3'] + dt['RW4']   
dt['D'] = dt['LD1'] + dt['LD2'] + dt['LD3'] + dt['RD1'] + dt['RD2'] + dt['RD3']

- display the difference of quality per position.

In [296]:
dt['DC'] = dt['C1'] - dt['C2'] - dt['C3'] - dt['C4']
dt['DLW'] = dt['LW1'] - dt['LW2'] - dt['LW3'] - dt['LW4']
dt['DRW'] = dt['RW1'] - dt['RW2'] - dt['RW3'] - dt['RW4']
dt['DLD'] = dt['LD1'] - dt['LD2'] - dt['LD3']
dt['DRD'] = dt['RD1'] - dt['RD2'] - dt['RD3']

In [297]:
dt['F'].value_counts()

12.0    1758
Name: F, dtype: int64

In [298]:
dt['D'].value_counts()

6.0    1758
Name: D, dtype: int64

-**keep only games that have 12 forwards and 6 defensemen.**

In [299]:
dt = dt.groupby(['Season', 'GameNumber']).filter(lambda x: ((x['F'] == 12) & (x['D'] == 6)).all())

In [300]:
dt['F'].value_counts()

12.0    1758
Name: F, dtype: int64

In [301]:
dt['D'].value_counts()

6.0    1758
Name: D, dtype: int64

In [302]:
dt.shape

(1758, 60)

### summary analysis

In [303]:
dt.describe()

Unnamed: 0,Season,GameNumber,VGF,HGF,GD,VGA,HGA,RosterCount,C1,C2,C3,C4,LD1,LD2,LD3,LW1,LW2,LW3,LW4,RD1,RD2,RD3,RW1,RW2,RW3,RW4,MeanC1,MeanC2,MeanC3,MeanC4,MeanLW1,MeanLW2,MeanLW3,MeanLW4,MeanRW1,MeanRW2,MeanRW3,MeanRW4,MeanLD1,MeanLD2,MeanLD3,MeanRD1,MeanRD2,MeanRD3,A,Win,GF,GA,F,D,DC,DLW,DRW,DLD,DRD
count,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,879.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0
mean,2010.0,20613.112628,2.763367,2.960182,0.0,2.960182,2.763367,18.0,1.161547,1.378271,1.191126,0.399317,0.862912,1.321388,0.807736,1.099545,1.120023,1.056883,0.597838,1.104664,1.213311,0.689989,0.853811,1.45165,1.224118,0.46587,1.161547,1.378271,1.191126,0.399317,1.099545,1.120023,1.056883,0.597838,0.853811,1.45165,1.224118,0.46587,0.862912,1.321388,0.807736,1.104664,1.213311,0.689989,1.0,0.5,2.861775,2.861775,12.0,6.0,-1.807167,-1.675199,-2.287827,-1.266212,-0.798635
std,0.0,349.39163,1.671612,1.72682,2.462467,1.72682,1.671612,0.0,0.75823,0.852012,0.898928,0.614601,0.887516,0.931769,0.711326,0.815187,0.82148,0.705419,0.737897,0.759213,0.932255,0.674794,0.718655,0.871803,0.819407,0.599493,0.696254,0.749818,0.781773,0.451236,0.759265,0.736456,0.54259,0.527924,0.625569,0.775336,0.644196,0.40361,0.810268,0.82296,0.447606,0.639184,0.804996,0.499287,0.0,0.500142,1.702289,1.702289,0.0,0.0,1.625359,1.551532,1.586863,1.694582,1.512369
min,2010.0,20001.0,0.0,0.0,-8.0,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,12.0,6.0,-6.0,-6.0,-6.0,-4.0,-5.0
25%,2010.0,20317.25,2.0,2.0,-2.0,2.0,2.0,18.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.8,0.928571,0.768116,0.030303,0.914286,0.738462,0.69697,0.171429,0.483333,0.965517,0.666667,0.085106,0.114286,0.787879,0.515152,0.6,0.8125,0.114286,1.0,0.0,2.0,2.0,12.0,6.0,-3.0,-3.0,-3.0,-3.0,-2.0
50%,2010.0,20613.0,3.0,3.0,0.0,3.0,3.0,18.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.955224,1.428571,1.121212,0.28125,0.970149,0.969231,1.015873,0.402985,0.890625,1.521739,1.245902,0.424242,0.671875,1.333333,0.828358,0.985507,1.0,0.683333,1.0,0.5,3.0,3.0,12.0,6.0,-2.0,-2.0,-2.0,-2.0,-1.0
75%,2010.0,20913.75,4.0,4.0,2.0,4.0,4.0,18.0,2.0,2.0,2.0,1.0,2.0,2.0,1.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,1.0,2.0,2.0,1.0,1.851064,1.948718,1.816667,0.516667,1.126984,1.836066,1.476923,0.984375,1.0,1.924242,1.692308,0.8,1.528571,1.910448,1.140625,1.651515,1.857143,1.020833,1.0,1.0,4.0,4.0,12.0,6.0,-1.0,-1.0,-1.0,0.0,0.0
max,2010.0,21230.0,10.0,9.0,8.0,9.0,10.0,18.0,3.0,3.0,3.0,4.0,3.0,4.0,4.0,4.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,4.0,4.0,3.0,2.878788,2.878788,2.432836,1.925373,3.567164,2.439394,2.0,2.078125,2.0,3.614035,3.015152,1.15,2.984848,2.877193,1.741379,2.318182,2.74359,1.628571,1.0,1.0,10.0,10.0,12.0,6.0,3.0,4.0,2.0,3.0,3.0


In [304]:
dt.groupby(['Win'])['C1', 'C2', 'C3', 'C4', 'LW1', 'LW2', 'LW3', 'LW4', 'RW1', 'RW2', 'RW3', 'RW4', 'LD1', 'LD2', 'LD3', 'RD1', 'RD2', 'RD3'].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,C1,C2,C3,C4,LW1,LW2,LW3,LW4,RW1,RW2,RW3,RW4,LD1,LD2,LD3,RD1,RD2,RD3
Win,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,count,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0
0,mean,1.116041,1.405006,1.193402,0.435722,1.085324,1.102389,1.025028,0.624573,0.784983,1.500569,1.232082,0.494881,0.844141,1.32992,0.819113,1.077361,1.282139,0.647327
0,std,0.744122,0.866892,0.91875,0.638655,0.813189,0.811917,0.715473,0.766697,0.716199,0.860252,0.82849,0.609087,0.883697,0.943943,0.719275,0.768648,0.958138,0.675533
0,min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,25%,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
0,50%,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,2.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
0,75%,2.0,2.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0
0,max,3.0,3.0,3.0,4.0,4.0,3.0,3.0,4.0,3.0,4.0,4.0,3.0,3.0,4.0,4.0,3.0,3.0,3.0
1,count,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0
1,mean,1.207053,1.351536,1.188851,0.362912,1.113766,1.137656,1.088737,0.571104,0.922639,1.40273,1.216155,0.43686,0.881684,1.312856,0.796359,1.131968,1.144482,0.732651


### estimate roster model

- regress **win** on the difference in number of players by position and quality per team. Add a constant to the predictors and use **OLS**. The purpose is to deterimine the impact each roster position has on home team success.

In [305]:
print ('season game team level analysis (win) by roster position')
y = dt['Win']  
X = sm.add_constant(dt[['C1', 'C2', 'C3', 'C4', 'LW1', 'LW2', 'LW3', 'LW4','RW1', 'RW2', 'RW3', 'RW4', 'LD1', 'LD2', 'LD3', 'RD1', 'RD2', 'RD3']] )
result = sm.OLS(y, X).fit()
print(result.summary())

beginningtex = """\\documentclass{report}
\\usepackage{booktabs}
\\begin{document}"""
endtex = "\end{document}"

f = open('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/latex/roster/points_per_toi/season_game_team_level_analysis.tex', 'w')
f.write(beginningtex)
f.write(result.summary().as_latex())
f.write(endtex)
f.close()

season game team level analysis (win) by roster position
                            OLS Regression Results                            
Dep. Variable:                    Win   R-squared:                       0.033
Model:                            OLS   Adj. R-squared:                  0.024
Method:                 Least Squares   F-statistic:                     3.686
Date:                Thu, 22 Feb 2018   Prob (F-statistic):           1.04e-06
Time:                        03:47:34   Log-Likelihood:                -1246.7
No. Observations:                1758   AIC:                             2527.
Df Residuals:                    1741   BIC:                             2620.
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------------------------------

- regress **win** on the difference in number of players by position and quality per team. Add a constant to the predictors and use **Logit**. The purpose is to deterimine the impact each roster position has on home team success

In [306]:
y = dt['Win']  
X = sm.add_constant(dt[['C1', 'C2', 'C3', 'C4', 'LW1', 'LW2', 'LW3', 'LW4','RW1', 'RW2', 'RW3', 'RW4', 'LD1', 'LD2', 'LD3', 'RD1', 'RD2', 'RD3']] )
result = sm.Logit(y, X).fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.676475
         Iterations 34


  return np.sqrt(np.diag(self.cov_params()))
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


0,1,2,3
Dep. Variable:,Win,No. Observations:,1758.0
Model:,Logit,Df Residuals:,1741.0
Method:,MLE,Df Model:,16.0
Date:,"Thu, 22 Feb 2018",Pseudo R-squ.:,0.02405
Time:,03:47:34,Log-Likelihood:,-1189.2
converged:,True,LL-Null:,-1218.6
,,LLR p-value:,8.922e-07

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,-0.0095,9.01e+06,-1.05e-09,1.000,-1.77e+07 1.77e+07
C1,0.1535,,,,nan nan
C2,-0.0541,,,,nan nan
C3,-0.0165,,,,nan nan
C4,-0.2992,,,,nan nan
LW1,0.1662,,,,nan nan
LW2,0.0638,,,,nan nan
LW3,0.1175,,,,nan nan
LW4,-0.2004,,,,nan nan


#### goal differential

- regress **goal differential** on the difference in number of players by position and quality per team. Add a constant to the predictors and use **OLS**. The purpose is to deterimine the impact each roster position has on home team success

In [307]:
y = dt['GD']  
X = sm.add_constant(dt[['C1', 'C2', 'C3', 'C4', 'LW1', 'LW2', 'LW3', 'LW4','RW1', 'RW2', 'RW3', 'RW4', 'LD1', 'LD2', 'LD3', 'RD1', 'RD2', 'RD3']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,GD,R-squared:,0.028
Model:,OLS,Adj. R-squared:,0.019
Method:,Least Squares,F-statistic:,3.169
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,2.15e-05
Time:,03:47:34,Log-Likelihood:,-4053.0
No. Observations:,1758,AIC:,8140.0
Df Residuals:,1741,BIC:,8233.0
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,-0.0099,0.006,-1.567,0.117,-0.022 0.002
C1,0.1937,0.102,1.901,0.058,-0.006 0.394
C2,-0.1044,0.084,-1.250,0.212,-0.268 0.059
C3,-0.0920,0.071,-1.304,0.192,-0.230 0.046
C4,-0.2855,0.110,-2.603,0.009,-0.501 -0.070
LW1,0.1796,0.095,1.891,0.059,-0.007 0.366
LW2,0.0932,0.087,1.069,0.285,-0.078 0.264
LW3,0.1131,0.087,1.306,0.192,-0.057 0.283
LW4,-0.2093,0.085,-2.458,0.014,-0.376 -0.042

0,1,2,3
Omnibus:,0.085,Durbin-Watson:,3.003
Prob(Omnibus):,0.958,Jarque-Bera (JB):,0.132
Skew:,-0.007,Prob(JB):,0.936
Kurtosis:,2.96,Cond. No.,4.24e+16


- regress **win** on the differential of forwards and defensemen per team. Add a constant to the predictors and use **OLS**.

In [308]:
y = dt['Win']  
X = sm.add_constant(dt[['DC', 'DLW', 'DRW', 'DLD', 'DRD']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,Win,R-squared:,0.015
Model:,OLS,Adj. R-squared:,0.013
Method:,Least Squares,F-statistic:,5.462
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,5.4e-05
Time:,03:47:34,Log-Likelihood:,-1262.3
No. Observations:,1758,AIC:,2537.0
Df Residuals:,1752,BIC:,2570.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.6205,0.031,20.173,0.000,0.560 0.681
DC,0.0216,0.007,2.904,0.004,0.007 0.036
DLW,0.0001,0.008,0.015,0.988,-0.016 0.016
DRW,0.0309,0.008,4.051,0.000,0.016 0.046
DLD,0.0045,0.008,0.603,0.547,-0.010 0.019
DRD,0.0060,0.008,0.712,0.477,-0.010 0.022

0,1,2,3
Omnibus:,0.0,Durbin-Watson:,2.988
Prob(Omnibus):,1.0,Jarque-Bera (JB):,275.365
Skew:,-0.001,Prob(JB):,1.6e-60
Kurtosis:,1.061,Cond. No.,10.9


- regress **win** on the differential of forwards and defensemen per team. Add a constant to the predictors and use **Logit**.

In [309]:
y = dt['Win']  
X = sm.add_constant(dt[['DC', 'DLW', 'DRW', 'DLD', 'DRD']] )
result = sm.Logit(y, X).fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.685429
         Iterations 4


0,1,2,3
Dep. Variable:,Win,No. Observations:,1758.0
Model:,Logit,Df Residuals:,1752.0
Method:,MLE,Df Model:,5.0
Date:,"Thu, 22 Feb 2018",Pseudo R-squ.:,0.01114
Time:,03:47:34,Log-Likelihood:,-1205.0
converged:,True,LL-Null:,-1218.6
,,LLR p-value:,5.361e-05

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,0.4873,0.125,3.893,0.000,0.242 0.733
DC,0.0876,0.030,2.889,0.004,0.028 0.147
DLW,0.0003,0.033,0.010,0.992,-0.064 0.064
DRW,0.1250,0.031,4.020,0.000,0.064 0.186
DLD,0.0183,0.031,0.600,0.549,-0.042 0.078
DRD,0.0241,0.034,0.709,0.478,-0.042 0.091


- regress **win** on the differential of forwards per team. Add a constant to the predictors and use **OLS**.

In [310]:
y = dt['Win']  
X = sm.add_constant(dt[['DC', 'DLW', 'DRW']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,Win,R-squared:,0.015
Model:,OLS,Adj. R-squared:,0.013
Method:,Least Squares,F-statistic:,8.876
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,7.73e-06
Time:,03:47:34,Log-Likelihood:,-1262.7
No. Observations:,1758,AIC:,2533.0
Df Residuals:,1754,BIC:,2555.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.6108,0.028,21.701,0.000,0.556 0.666
DC,0.0227,0.007,3.104,0.002,0.008 0.037
DLW,-0.0020,0.008,-0.256,0.798,-0.017 0.013
DRW,0.0320,0.007,4.274,0.000,0.017 0.047

0,1,2,3
Omnibus:,0.0,Durbin-Watson:,2.988
Prob(Omnibus):,1.0,Jarque-Bera (JB):,275.778
Skew:,-0.0,Prob(JB):,1.3e-60
Kurtosis:,1.06,Cond. No.,9.34


- regress **win** on the differential of forwards per team. Add a constant to the predictors and use **Logit**.

In [311]:
y = dt['Win']  
X = sm.add_constant(dt[['DC', 'DLW', 'DRD']] )
result = sm.Logit(y, X).fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.690257
         Iterations 4


0,1,2,3
Dep. Variable:,Win,No. Observations:,1758.0
Model:,Logit,Df Residuals:,1754.0
Method:,MLE,Df Model:,3.0
Date:,"Thu, 22 Feb 2018",Pseudo R-squ.:,0.00417
Time:,03:47:34,Log-Likelihood:,-1213.5
converged:,True,LL-Null:,-1218.6
,,LLR p-value:,0.01724

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,0.1849,0.094,1.964,0.050,0.000 0.369
DC,0.0832,0.030,2.812,0.005,0.025 0.141
DLW,-0.0004,0.031,-0.013,0.989,-0.062 0.061
DRD,0.0441,0.032,1.373,0.170,-0.019 0.107


- regress **win** on the differential of defensemen per team. Add a constant to the predictors and use **OLS**.

In [312]:
y = dt['Win']  
X = sm.add_constant(dt[['DLD', 'DRD']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,Win,R-squared:,0.002
Model:,OLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,1.892
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.151
Time:,03:47:34,Log-Likelihood:,-1274.0
No. Observations:,1758,AIC:,2554.0
Df Residuals:,1755,BIC:,2571.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.5223,0.017,30.806,0.000,0.489 0.556
DLD,0.0090,0.007,1.259,0.208,-0.005 0.023
DRD,0.0136,0.008,1.693,0.091,-0.002 0.029

0,1,2,3
Omnibus:,0.0,Durbin-Watson:,3.009
Prob(Omnibus):,1.0,Jarque-Bera (JB):,290.48
Skew:,-0.0,Prob(JB):,8.380000000000001e-64
Kurtosis:,1.009,Cond. No.,3.43


- regress **win** on the differential of defensemen per team. Add a constant to the predictors and use **Logit**.

In [313]:
y = dt['Win']  
X = sm.add_constant(dt[['DLD', 'DRD']] )
result = sm.Logit(y, X).fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.692070
         Iterations 3


0,1,2,3
Dep. Variable:,Win,No. Observations:,1758.0
Model:,Logit,Df Residuals:,1755.0
Method:,MLE,Df Model:,2.0
Date:,"Thu, 22 Feb 2018",Pseudo R-squ.:,0.001553
Time:,03:47:34,Log-Likelihood:,-1216.7
converged:,True,LL-Null:,-1218.6
,,LLR p-value:,0.1506

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,0.0893,0.068,1.314,0.189,-0.044 0.222
DLD,0.0361,0.029,1.259,0.208,-0.020 0.092
DRD,0.0545,0.032,1.692,0.091,-0.009 0.118


- regress **goal differential** on the differential of forwards and defensemen per team. Add a constant to the predictors and use **OLS**.

In [314]:
y = dt['GD']
X = sm.add_constant(dt[['DC', 'DLW', 'DRW', 'DLD', 'DRD']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,GD,R-squared:,0.018
Model:,OLS,Adj. R-squared:,0.015
Method:,Least Squares,F-statistic:,6.291
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,8.55e-06
Time:,03:47:34,Log-Likelihood:,-4062.6
No. Observations:,1758,AIC:,8137.0
Df Residuals:,1752,BIC:,8170.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.6351,0.151,4.198,0.000,0.338 0.932
DC,0.1353,0.037,3.694,0.000,0.063 0.207
DLW,0.0086,0.040,0.218,0.828,-0.069 0.086
DRW,0.1120,0.038,2.985,0.003,0.038 0.186
DLD,0.0439,0.037,1.187,0.235,-0.029 0.116
DRD,0.0805,0.041,1.957,0.050,-0.000 0.161

0,1,2,3
Omnibus:,0.57,Durbin-Watson:,3.023
Prob(Omnibus):,0.752,Jarque-Bera (JB):,0.637
Skew:,-0.004,Prob(JB):,0.727
Kurtosis:,2.907,Cond. No.,10.9


- regress **goal differential** on the differential of forwards per team. Add a constant to the predictors and use **OLS**.

In [315]:
y = dt['GD']  
X = sm.add_constant(dt[['DC', 'DLW', 'DRW']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,GD,R-squared:,0.015
Model:,OLS,Adj. R-squared:,0.014
Method:,Least Squares,F-statistic:,9.021
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,6.29e-06
Time:,03:47:34,Log-Likelihood:,-4064.8
No. Observations:,1758,AIC:,8138.0
Df Residuals:,1754,BIC:,8159.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.5277,0.139,3.808,0.000,0.256 0.799
DC,0.1462,0.036,4.067,0.000,0.076 0.217
DLW,-0.0153,0.038,-0.407,0.684,-0.089 0.058
DRW,0.1264,0.037,3.432,0.001,0.054 0.199

0,1,2,3
Omnibus:,0.537,Durbin-Watson:,3.024
Prob(Omnibus):,0.765,Jarque-Bera (JB):,0.605
Skew:,-0.004,Prob(JB):,0.739
Kurtosis:,2.909,Cond. No.,9.34


- regress **goal differential** on the differential of defensemen per team. Add a constant to the predictors and use **OLS**.

In [316]:
y = dt['GD']  
X = sm.add_constant(dt[['DLD', 'DRD']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,GD,R-squared:,0.006
Model:,OLS,Adj. R-squared:,0.004
Method:,Least Squares,F-statistic:,4.938
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.00727
Time:,03:47:34,Log-Likelihood:,-4073.3
No. Observations:,1758,AIC:,8153.0
Df Residuals:,1755,BIC:,8169.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.1743,0.083,2.092,0.037,0.011 0.338
DLD,0.0681,0.035,1.932,0.053,-0.001 0.137
DRD,0.1104,0.039,2.797,0.005,0.033 0.188

0,1,2,3
Omnibus:,1.305,Durbin-Watson:,3.038
Prob(Omnibus):,0.521,Jarque-Bera (JB):,1.301
Skew:,-0.001,Prob(JB):,0.522
Kurtosis:,2.867,Cond. No.,3.43


- regress **goals for ** on the differential of forwards per team. Add a constant to the predictors and use **OLS**.

In [317]:
y = dt['GF']  
X = sm.add_constant(dt[['DC', 'DLW', 'DRW']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,GF,R-squared:,0.016
Model:,OLS,Adj. R-squared:,0.015
Method:,Least Squares,F-statistic:,9.651
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,2.56e-06
Time:,03:47:34,Log-Likelihood:,-3414.8
No. Observations:,1758,AIC:,6838.0
Df Residuals:,1754,BIC:,6860.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,3.2265,0.096,33.702,0.000,3.039 3.414
DC,0.1095,0.025,4.408,0.000,0.061 0.158
DLW,-0.0141,0.026,-0.543,0.588,-0.065 0.037
DRW,0.0832,0.025,3.272,0.001,0.033 0.133

0,1,2,3
Omnibus:,43.25,Durbin-Watson:,2.074
Prob(Omnibus):,0.0,Jarque-Bera (JB):,45.967
Skew:,0.395,Prob(JB):,1.04e-10
Kurtosis:,3.05,Cond. No.,9.34


- regress **goals for** on the differential of defensemen per team. Add a constant to the predictors and use **OLS**.

In [318]:
y = dt['GF']  
X = sm.add_constant(dt[['DLD', 'DRD']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,GF,R-squared:,0.009
Model:,OLS,Adj. R-squared:,0.007
Method:,Least Squares,F-statistic:,7.61
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.000512
Time:,03:47:34,Log-Likelihood:,-3421.6
No. Observations:,1758,AIC:,6849.0
Df Residuals:,1755,BIC:,6866.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,3.0041,0.058,52.231,0.000,2.891 3.117
DLD,0.0498,0.024,2.049,0.041,0.002 0.097
DRD,0.0993,0.027,3.646,0.000,0.046 0.153

0,1,2,3
Omnibus:,43.233,Durbin-Watson:,2.091
Prob(Omnibus):,0.0,Jarque-Bera (JB):,46.057
Skew:,0.396,Prob(JB):,9.97e-11
Kurtosis:,2.995,Cond. No.,3.43


- regress **goals against ** on the differential of forwards per team. Add a constant to the predictors and use **OLS**.

In [319]:
y = dt['GA']  
X = sm.add_constant(dt[['DC', 'DLW', 'DRW']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,GA,R-squared:,0.003
Model:,OLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,1.581
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.192
Time:,03:47:34,Log-Likelihood:,-3426.8
No. Observations:,1758,AIC:,6862.0
Df Residuals:,1754,BIC:,6884.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,2.6988,0.096,27.998,0.000,2.510 2.888
DC,-0.0367,0.025,-1.469,0.142,-0.086 0.012
DLW,0.0012,0.026,0.047,0.963,-0.050 0.053
DRW,-0.0431,0.026,-1.683,0.093,-0.093 0.007

0,1,2,3
Omnibus:,44.472,Durbin-Watson:,2.014
Prob(Omnibus):,0.0,Jarque-Bera (JB):,47.433
Skew:,0.402,Prob(JB):,5.01e-11
Kurtosis:,3.018,Cond. No.,9.34


- regress **goals against** on the differential of defensemen per team. Add a constant to the predictors and use **OLS**.

In [320]:
y = dt['GA']  
X = sm.add_constant(dt[['DLD', 'DRD']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,GA,R-squared:,0.0
Model:,OLS,Adj. R-squared:,-0.001
Method:,Least Squares,F-statistic:,0.3156
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.729
Time:,03:47:34,Log-Likelihood:,-3428.9
No. Observations:,1758,AIC:,6864.0
Df Residuals:,1755,BIC:,6880.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,2.8298,0.058,48.998,0.000,2.717 2.943
DLD,-0.0182,0.024,-0.747,0.455,-0.066 0.030
DRD,-0.0111,0.027,-0.405,0.686,-0.065 0.043

0,1,2,3
Omnibus:,45.976,Durbin-Watson:,2.024
Prob(Omnibus):,0.0,Jarque-Bera (JB):,49.156
Skew:,0.409,Prob(JB):,2.12e-11
Kurtosis:,3.019,Cond. No.,3.43
