# Data

## season_game_level_data

In [2]:
import sys
import os
import pandas as pd
import numpy as np
import datetime, time
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
from pylab import hist, show
import scipy
import zipfile


pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 200)

**data frames used in this notebook:**
- da = pbp
- dg = season_games_data
- dm = play_by_play
- dp = player_rankings
- dw = team_roster_player_rank
- dv = season_team_roster_ranking
- dx = season_game_team_roster
- dz = season_team
- dy = season_game_roster

**for analysis:**
- ds = season_level
- dl = season_game_level
- dt = season_game_team_level

In [3]:
pwd

'/Users/stefanostselios/Desktop/nhl_roster_design-master'

### import play by play data set

In [4]:
da = pd.read_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/pbp_merged.csv')
#da = pd.read_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/pbp_merged.csv')
da = da.drop('Unnamed: 0', axis=1)
da = da.rename(columns={'TeamCode': 'EventTeamCode'})


keep regular season games

In [5]:
da = da[da['GameNumber'] <= 21230]

drop irrelevant data

In [6]:
da = da[da['EventType']!='STOP']
da = da[da['EventType']!='EISTR']
da = da[da['EventType']!='EIEND']

- display goals for each game and drop duplicates.

In [7]:
dh = da[da['EventTeamCode']==da['HTeamCode']]
dh = dh.rename(columns={'EventTeamCode': 'HTeam'})
dh['goal'] = dh.apply(lambda x: 1 if (x['EventType'] == 'GOAL') else 0, axis=1)
dh['HGF'] = dh.groupby(['Season', 'GameNumber', 'HTeam'])['goal'].transform('sum')
dh = dh[['Season', 'GameNumber', 'HGF']]
dh = dh.drop_duplicates(['Season', 'GameNumber'])

In [8]:
dv = da[da['EventTeamCode']==da['VTeamCode']]
dv = dv.rename(columns={'EventTeamCode': 'VTeam'})
dv['goal'] = dv.apply(lambda x: 1 if (x['EventType'] == 'GOAL') else 0, axis=1)
dv['VGF'] = dv.groupby(['Season', 'GameNumber', 'VTeam'])['goal'].transform('sum')
dv = dv[['Season', 'GameNumber', 'VGF']]
dv = dv.drop_duplicates(['Season', 'GameNumber'])

Merge into season-game data

In [9]:
# dg = dp[['Season', 'GameNumber', 'EventTeamCode', 'VTeamCode', 'HTeamCode']]
# dg = dg.drop_duplicates(['Season', 'GameNumber',  'EventTeamCode'])
# dg = dg.rename(columns={'EventTeamCode': 'Team'})
# dg['Opp'] = dg.apply(lambda x: x['HTeamCode'] if x['Team'] == x['HTeamCode'] else x['VTeamCode'], axis=1)
# dg['Designation'] = dg.apply(lambda x: 'home' if x['Team'] == x['HTeamCode'] else 'away', axis=1)
# dg = dg[['Season', 'GameNumber', 'Team', 'Opp', 'Designation']]

In [10]:
dg = da[['Season', 'GameNumber', 'VTeamCode', 'HTeamCode']]
dg = dg.drop_duplicates(['Season', 'GameNumber'])

In [11]:
dg = pd.merge(dg, dh, on=['Season', 'GameNumber'], how='left')
dg = pd.merge(dg, dv, on=['Season', 'GameNumber'], how='left')

- find the goal differential per game with respect to home team.

In [12]:
dg['GD'] = dg['HGF'] - dg['VGF']
dg['WinTeam'] = dg.apply(lambda x: x['HTeamCode'] if x['GD'] > 0 else x['VTeamCode'], axis=1)
dg['LossTeam'] = dg.apply(lambda x: x['HTeamCode'] if x['WinTeam'] != x['HTeamCode'] else x['VTeamCode'], axis=1)

In [13]:
dg.head()

Unnamed: 0,Season,GameNumber,VTeamCode,HTeamCode,HGF,VGF,GD,WinTeam,LossTeam
0,2010,20001,MTL,TOR,3,2,1,TOR,MTL
1,2010,20002,PHI,PIT,2,3,-1,PHI,PIT
2,2010,20003,CAR,MIN,3,4,-1,CAR,MIN
3,2010,20004,CHI,COL,4,3,1,COL,CHI
4,2010,20005,CGY,EDM,4,0,4,EDM,CGY


- display goals againest per team.

In [14]:
dg['VGA'] = dg['HGF']
dg['HGA'] = dg['VGF']

In [15]:
dg = dg[['Season', 'GameNumber', 'VTeamCode', 'HTeamCode', 'VGF', 'HGF', 'GD', 'VGA', 'HGA', 'WinTeam', 'LossTeam']]
dg.head()

Unnamed: 0,Season,GameNumber,VTeamCode,HTeamCode,VGF,HGF,GD,VGA,HGA,WinTeam,LossTeam
0,2010,20001,MTL,TOR,2,3,1,3,2,TOR,MTL
1,2010,20002,PHI,PIT,3,2,-1,2,3,PHI,PIT
2,2010,20003,CAR,MIN,4,3,-1,3,4,CAR,MIN
3,2010,20004,CHI,COL,3,4,1,4,3,COL,CHI
4,2010,20005,CGY,EDM,0,4,4,4,0,EDM,CGY


In [16]:
dg.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/season_game_data.csv', index='False', sep=',')
#dg.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/season_game_data.csv', index='False', sep=',')

## season_level_data

In [17]:
dm = da

events that happened in regulation time only

In [18]:
dm = dm[dm['Period'] <= 3]
dm = dm[dm['Period'] >= 1]

- **reshape the data set from wide to long.**

In [19]:
dm = dm.sort_values(['Season', 'GameNumber', 'Period', 'EventNumber'], ascending=[True, True, True, True])

In [20]:
a = [col for col in dm.columns if 'VPlayer' in col]
b = [col for col in dm.columns if 'HPlayer' in col]
c = [col for col in dm.columns if 'VPosition' in col]
d = [col for col in dm.columns if 'HPosition' in col]
dm = pd.lreshape(dm, {'VPlayer' : a, 'HPlayer' : b, 'VPosition' : c, 'HPosition': d})

In [21]:
dm.shape

(1796745, 24)

In [22]:
dm.columns

Index(['AdvantageType', 'EventDetail', 'EventNumber', 'EventTeamCode',
       'EventTimeFromTwenty', 'EventTimeFromZero', 'EventType', 'GameDate',
       'GameNumber', 'HTeamCode', 'Length', 'PenaltyType', 'Period',
       'PlayerName', 'PlayerNumber', 'Season', 'ShotResult', 'ShotType',
       'VTeamCode', 'Zone', 'VPlayer', 'HPlayer', 'VPosition', 'HPosition'],
      dtype='object')

In [23]:
dm = dm.rename(columns={'PlayerNumber': 'EventPlayerNumber', 'TeamCode': 'EventTeamCode', 'PlayerName': 'EventPlayerName' })
dm = dm[['Season', 'GameNumber', 'GameDate', 'Period', 'AdvantageType', 'Zone', 'EventNumber', 'EventType', 'EventDetail', 'EventTeamCode', 'EventPlayerNumber', 'EventPlayerName', 'EventTimeFromZero', 'EventTimeFromTwenty', 'VTeamCode', 'VPlayer', 'VPosition', 'HTeamCode', 'HPlayer', 'HPosition', 'ShotType', 'ShotResult', 'Length', 'PenaltyType']]
dm = dm.sort_values(['Season', 'GameNumber', 'Period', 'EventNumber'], ascending=[True, True, True, True])

- fill in advantage type with even strength 'EV' and event player number with 'TEAM'

In [24]:
dm['AdvantageType'] = dm['AdvantageType'].fillna('EV')
dm['EventPlayerNumber'] = dm['EventPlayerNumber'].fillna('TEAM')

- save new datast as play by play

In [25]:
dm.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/play_by_play.csv', index='False', sep=',')
#dm.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/play_by_play.csv', index='False', sep=',')

#### create new data set and keep variables: 
- (a) game number.
- (b) visitor team information.
- (c) home team information.

In [26]:
df = dm[['Season', 'GameNumber', 'VTeamCode', 'VPlayer', 'VPosition', 'HTeamCode', 'HPlayer', 'HPosition']]
df = df.sort_values(['Season', 'GameNumber'], ascending=[True, True])
df.head()

Unnamed: 0,Season,GameNumber,VTeamCode,VPlayer,VPosition,HTeamCode,HPlayer,HPosition
0,2010,20001,MTL,11.0,C,TOR,37.0,C
310113,2010,20001,MTL,21.0,R,TOR,9.0,R
620126,2010,20001,MTL,57.0,L,TOR,11.0,L
930061,2010,20001,MTL,26.0,D,TOR,3.0,D
1239931,2010,20001,MTL,75.0,D,TOR,22.0,D


- merge season_game_data (dg) on new dataset

In [27]:
df = pd.merge(df, dg, on=['Season', 'GameNumber', 'VTeamCode', 'HTeamCode'], how='left')
df.head()    

Unnamed: 0,Season,GameNumber,VTeamCode,VPlayer,VPosition,HTeamCode,HPlayer,HPosition,VGF,HGF,GD,VGA,HGA,WinTeam,LossTeam
0,2010,20001,MTL,11.0,C,TOR,37.0,C,2,3,1,3,2,TOR,MTL
1,2010,20001,MTL,21.0,R,TOR,9.0,R,2,3,1,3,2,TOR,MTL
2,2010,20001,MTL,57.0,L,TOR,11.0,L,2,3,1,3,2,TOR,MTL
3,2010,20001,MTL,26.0,D,TOR,3.0,D,2,3,1,3,2,TOR,MTL
4,2010,20001,MTL,75.0,D,TOR,22.0,D,2,3,1,3,2,TOR,MTL


- reshape the data to have home and visitor team observatons under the same coloumns. 

In [28]:
a = [col for col in df.columns if 'Player' in col]
b = [col for col in df.columns if 'Position' in col]
c = [col for col in df.columns if 'TeamCode' in col]
d = [col for col in df.columns if 'GF' in col]
e = [col for col in df.columns if 'GA' in col]
df = pd.lreshape(df, {'PlayerNumber' : a, 'PlayerPosition' : b, 'TeamCode' : c, 'GF' : d, 'GA' : e })
df = df[['Season', 'GameNumber', 'TeamCode', 'PlayerNumber', 'GF', 'GA', 'GD', 'WinTeam', 'LossTeam']]
df.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,GF,GA,GD,WinTeam,LossTeam
0,2010,20001,MTL,11.0,2,3,1,TOR,MTL
1,2010,20001,MTL,21.0,2,3,1,TOR,MTL
2,2010,20001,MTL,57.0,2,3,1,TOR,MTL
3,2010,20001,MTL,26.0,2,3,1,TOR,MTL
4,2010,20001,MTL,75.0,2,3,1,TOR,MTL


### import player position and rankings

In [34]:
dp = pd.read_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/stats_per_player_centers_wingers_defensemen.csv')
#dp = pd.read_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/stats_per_player_centers_wingers_defensemen.csv')
dp = dp.drop('Unnamed: 0', axis=1)
dp = dp.rename(columns={'Position': 'PlayerPosition'})

In [35]:
dp = dp[['Season', 'TeamCode', 'PlayerNumber', 'PlayerName', 'PlayerPosition']]
dp.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,PlayerPosition
0,2010,MTL,11.0,GOMEZ,C
1,2010,TOR,37.0,BRENT,C
2,2010,MTL,14.0,PLEKANEC,C
3,2010,MTL,76.0,SUBBAN,D
4,2010,TOR,35.0,GIGUERE,G


In [36]:
dp.shape

(1058, 5)

In [54]:
dr = pd.read_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/player_rank_by_goals_assists_points_per_time_on_ice_for_centers_wingers_defensemen.csv')
#dr = pd.read_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/player_rank_by_goals_assists_points_per_time_on_ice_for_centers_wingers_defensemen.csv')
dr = dr.drop('Unnamed: 0', axis=1)
dr = dr.rename(columns={'Position': 'PlayerPosition'})

In [57]:
dr = dr[['Season', 'TeamCode', 'PlayerNumber', 'PlayerName', 'PlayerPosition', 'Rank']]
dr = dr.sort_values(['TeamCode'], ascending=[True])
dr.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank
709,2010,ANA,5.0,SBISA,D,3
634,2010,ANA,32.0,LYDMAN,D,2
541,2010,ANA,54.0,FOWLER,D,1
789,2010,ANA,23.0,MARA,D,3
7,2010,ANA,10.0,PERRY,W,1


- merge player position and player rankings and drop goaltenders

In [58]:
ds = pd.merge(dp, dr, on=['Season', 'TeamCode', 'PlayerNumber', 'PlayerName', 'PlayerPosition'], how='left')
ds['Rank'] = ds.groupby(['Season', 'TeamCode', 'PlayerName'])['Rank'].apply(lambda x: x.ffill().bfill())
ds.shape

(1058, 6)

In [59]:
ds.isnull().sum()

Season              0
TeamCode            0
PlayerNumber        0
PlayerName          0
PlayerPosition      0
Rank              256
dtype: int64

skaters that have played less than 9 games were not included in the clusters and therefore not ranked. Since the failed to make the roster on a regular basis, forwards are assigned to the 4th line and defensemen to the bottom (3rd) pairing.

In [60]:
ds['Rank'] = ds.apply(lambda x: 1 if ((x['PlayerPosition'] == 'G') & (pd.isnull(x.Rank))) else 3 if ((x['PlayerPosition'] == 'D') & (pd.isnull(x.Rank))) else 3 if ((x['PlayerPosition'] == 'D') & (pd.isnull(x.Rank))) else 4 if ((x['PlayerPosition'] == 'W') & (pd.isnull(x.Rank))) else 4 if ((x['PlayerPosition'] == 'W') & (pd.isnull(x.Rank))) else 4 if ((x['PlayerPosition'] == 'C') & (pd.isnull(x.Rank))) else x['Rank'], axis=1)
ds = ds.sort_values(['TeamCode'], ascending=[True])
ds.shape

(1058, 6)

In [61]:
ds1 = ds[(ds['PlayerPosition'] == 'D') & (ds['Rank'] == 4)]
ds1.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank
996,2010,OTT,51.0,SMITH,D,4.0


- Derek Smith, a defenseman for the Ottawa Senators has a ranking of 4, which is incorrect since we have 3 defensive pairings. For that reason, he is assigned a rank of 3 which represents the bottom defensive pairing

In [62]:
ds['Rank'] = ds.apply(lambda x: 3 if ((x['PlayerPosition'] == 'D') & (x['TeamCode'] == 'OTT') & (x['PlayerName'] == 'SMITH') & (x['PlayerNumber'] == 51.0)) else x['Rank'], axis=1)

In [63]:
ds1 = ds[(ds['PlayerPosition'] == 'D') & (ds['Rank'] == 4)]
ds1.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank


- **display each player by team per game. Drop duplicates.**

In [64]:
dw = pd.merge(df, ds, on=['Season', 'TeamCode', 'PlayerNumber'], how='left')
dw = dw[['Season', 'GameNumber', 'TeamCode', 'PlayerNumber', 'PlayerName', 'PlayerPosition', 'Rank', 'GF', 'GA', 'GD', 'WinTeam', 'LossTeam']]
dw.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank,GF,GA,GD,WinTeam,LossTeam
0,2010,20001,MTL,11.0,GOMEZ,C,2.0,2,3,1,TOR,MTL
1,2010,20001,MTL,21.0,GIONTA,W,2.0,2,3,1,TOR,MTL
2,2010,20001,MTL,57.0,POULIOT,W,3.0,2,3,1,TOR,MTL
3,2010,20001,MTL,26.0,GORGES,D,2.0,2,3,1,TOR,MTL
4,2010,20001,MTL,75.0,GILL,D,3.0,2,3,1,TOR,MTL


- drop observations that have no player name, position nor ranking. Exclude goaltenders from the roster.

In [65]:
dw = dw[pd.notnull(dw['PlayerPosition'])]
dw.isnull().sum()

Season            0
GameNumber        0
TeamCode          0
PlayerNumber      0
PlayerName        0
PlayerPosition    0
Rank              0
GF                0
GA                0
GD                0
WinTeam           0
LossTeam          0
dtype: int64

In [66]:
dw.shape

(3688734, 12)

In [67]:
dw = dw[dw['PlayerPosition'] != 'G']
dw.shape

(3130215, 12)

- create column that displays the position and roster count by team per game. To simplify matters, we categorize skaters into forwards and defensemen.

In [68]:
dw = dw.drop_duplicates(['Season', 'GameNumber', 'TeamCode', 'PlayerNumber'])
dw['RosterCount'] = dw.groupby(['Season', 'GameNumber', 'TeamCode'])['PlayerNumber'].transform('count')
#dw['Position'] = dw.apply(lambda x: 'D' if (x['PlayerPosition']=='D') else 'F', 1)
dw['PositionCount'] = dw.groupby(['Season', 'GameNumber', 'TeamCode', 'PlayerPosition'])['PlayerNumber'].transform('count')
dw.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank,GF,GA,GD,WinTeam,LossTeam,RosterCount,PositionCount
0,2010,20001,MTL,11.0,GOMEZ,C,2.0,2,3,1,TOR,MTL,18.0,5.0
1,2010,20001,MTL,21.0,GIONTA,W,2.0,2,3,1,TOR,MTL,18.0,7.0
2,2010,20001,MTL,57.0,POULIOT,W,3.0,2,3,1,TOR,MTL,18.0,7.0
3,2010,20001,MTL,26.0,GORGES,D,2.0,2,3,1,TOR,MTL,18.0,6.0
4,2010,20001,MTL,75.0,GILL,D,3.0,2,3,1,TOR,MTL,18.0,6.0


- count the amount of forwards and defensemen by team per game.

In [69]:
dw['CCount'] = dw.apply(lambda x: x['PositionCount'] if x['PlayerPosition']=='C' else np.NaN, 1)
dw['WCount'] = dw.apply(lambda x: x['PositionCount'] if x['PlayerPosition']=='W' else np.NaN, 1)
dw['DCount'] = dw.apply(lambda x: x['PositionCount'] if x['PlayerPosition']=='D' else np.NaN, 1)
dw['CCount'] = dw.groupby(['Season','GameNumber', 'TeamCode'])['CCount'].apply(lambda x: x.ffill().bfill())
dw['WCount'] = dw.groupby(['Season','GameNumber', 'TeamCode'])['WCount'].apply(lambda x: x.ffill().bfill())
dw['DCount'] = dw.groupby(['Season','GameNumber', 'TeamCode'])['DCount'].apply(lambda x: x.ffill().bfill())
dw.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank,GF,GA,GD,WinTeam,LossTeam,RosterCount,PositionCount,CCount,WCount,DCount
0,2010,20001,MTL,11.0,GOMEZ,C,2.0,2,3,1,TOR,MTL,18.0,5.0,5.0,7.0,6.0
1,2010,20001,MTL,21.0,GIONTA,W,2.0,2,3,1,TOR,MTL,18.0,7.0,5.0,7.0,6.0
2,2010,20001,MTL,57.0,POULIOT,W,3.0,2,3,1,TOR,MTL,18.0,7.0,5.0,7.0,6.0
3,2010,20001,MTL,26.0,GORGES,D,2.0,2,3,1,TOR,MTL,18.0,6.0,5.0,7.0,6.0
4,2010,20001,MTL,75.0,GILL,D,3.0,2,3,1,TOR,MTL,18.0,6.0,5.0,7.0,6.0


### keep games that have 4 C, 8 W  and 6 D per team!!!!

In [70]:
dw = dw.groupby(['Season', 'GameNumber']).filter(lambda x: ((x['CCount'] == 4) & (x['WCount'] == 8) & (x['DCount'] == 6)).all())

In [71]:
dw.shape

(10116, 17)

In [72]:
dw.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/team_roster_player_rank_centers_wingers_defensemen.csv', index='False', sep=',')
#dw.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/team_roster_player_rank_centers_wingers_defensemen.csv', index='False', sep=',')

- create a new dataset using team roster player rank

In [73]:
dv = dw

In [75]:
dv = dv.groupby(['Season', 'GameNumber', 'TeamCode', 'PlayerPosition', 'GF', 'GA', 'RosterCount', 'PositionCount', 'CCount', 'WCount', 'DCount', 'WinTeam', 'LossTeam'], as_index=False)['Rank'].mean()
dv.head(10)

Unnamed: 0,Season,GameNumber,TeamCode,PlayerPosition,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank
0,2010,20005,CGY,C,0,4,18.0,4.0,4.0,8.0,6.0,EDM,CGY,2.75
1,2010,20005,CGY,D,0,4,18.0,6.0,4.0,8.0,6.0,EDM,CGY,2.166667
2,2010,20005,CGY,W,0,4,18.0,8.0,4.0,8.0,6.0,EDM,CGY,2.5
3,2010,20005,EDM,C,4,0,18.0,4.0,4.0,8.0,6.0,EDM,CGY,2.75
4,2010,20005,EDM,D,4,0,18.0,6.0,4.0,8.0,6.0,EDM,CGY,2.0
5,2010,20005,EDM,W,4,0,18.0,8.0,4.0,8.0,6.0,EDM,CGY,2.375
6,2010,20006,CBJ,C,2,3,18.0,4.0,4.0,8.0,6.0,SJ,CBJ,2.5
7,2010,20006,CBJ,D,2,3,18.0,6.0,4.0,8.0,6.0,SJ,CBJ,2.0
8,2010,20006,CBJ,W,2,3,18.0,8.0,4.0,8.0,6.0,SJ,CBJ,2.625
9,2010,20006,SJ,C,3,2,18.0,4.0,4.0,8.0,6.0,SJ,CBJ,2.0


In [76]:
dv.shape

(1686, 14)

- create columns for team win and team loss. 

In [77]:
dv['TeamWin'] =  dv.apply(lambda x: 1 if x['TeamCode']==x['WinTeam'] else 0, 1)
dv['TeamLos'] =  dv.apply(lambda x: 1 if x['TeamCode']!=x['WinTeam'] else 0, 1)
dv.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerPosition,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos
0,2010,20005,CGY,C,0,4,18.0,4.0,4.0,8.0,6.0,EDM,CGY,2.75,0,1
1,2010,20005,CGY,D,0,4,18.0,6.0,4.0,8.0,6.0,EDM,CGY,2.166667,0,1
2,2010,20005,CGY,W,0,4,18.0,8.0,4.0,8.0,6.0,EDM,CGY,2.5,0,1
3,2010,20005,EDM,C,4,0,18.0,4.0,4.0,8.0,6.0,EDM,CGY,2.75,1,0
4,2010,20005,EDM,D,4,0,18.0,6.0,4.0,8.0,6.0,EDM,CGY,2.0,1,0


- display games played, games won, games loss, goals for and goals against by team for the season.

In [79]:
dv['GP'] = dv.groupby(['Season', 'PlayerPosition', 'TeamCode'])['GameNumber'].transform('count')
dv['GW'] = dv.groupby(['Season', 'PlayerPosition', 'WinTeam'])['TeamWin'].transform('sum')
dv['GL'] = dv.groupby(['Season', 'PlayerPosition', 'LossTeam'])['TeamLos'].transform('sum')
dv['GF'] = dv.groupby(['Season', 'PlayerPosition', 'TeamCode'])['GF'].transform('sum')
dv['GA'] = dv.groupby(['Season', 'PlayerPosition', 'TeamCode'])['GA'].transform('sum')
dv.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerPosition,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL
0,2010,20005,CGY,C,75,75,18.0,4.0,4.0,8.0,6.0,EDM,CGY,2.75,0,1,23,9,12
1,2010,20005,CGY,D,75,75,18.0,6.0,4.0,8.0,6.0,EDM,CGY,2.166667,0,1,23,9,12
2,2010,20005,CGY,W,75,75,18.0,8.0,4.0,8.0,6.0,EDM,CGY,2.5,0,1,23,9,12
3,2010,20005,EDM,C,67,88,18.0,4.0,4.0,8.0,6.0,EDM,CGY,2.75,1,0,26,9,12
4,2010,20005,EDM,D,67,88,18.0,6.0,4.0,8.0,6.0,EDM,CGY,2.0,1,0,26,9,12


- create columns with the mean ranking for forward and defenseman by team per game.

In [81]:
dv['Rank_C'] = dv.apply(lambda x: x['Rank'] if x['PlayerPosition']=='C' else np.NaN, 1)
dv['Rank_W'] = dv.apply(lambda x: x['Rank'] if x['PlayerPosition']=='W' else np.NaN, 1)
dv['Rank_D'] = dv.apply(lambda x: x['Rank'] if x['PlayerPosition']=='D' else np.NaN, 1)
dv['Rank_C'] = dv.groupby(['Season','GameNumber', 'TeamCode'])['Rank_C'].apply(lambda x: x.ffill().bfill())
dv['Rank_W'] = dv.groupby(['Season','GameNumber', 'TeamCode'])['Rank_W'].apply(lambda x: x.ffill().bfill())
dv['Rank_D'] = dv.groupby(['Season','GameNumber', 'TeamCode'])['Rank_D'].apply(lambda x: x.ffill().bfill())
dv.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerPosition,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL,Rank_C,Rank_W,Rank_D
0,2010,20005,CGY,C,75,75,18.0,4.0,4.0,8.0,6.0,EDM,CGY,2.75,0,1,23,9,12,2.75,2.5,2.166667
1,2010,20005,CGY,D,75,75,18.0,6.0,4.0,8.0,6.0,EDM,CGY,2.166667,0,1,23,9,12,2.75,2.5,2.166667
2,2010,20005,CGY,W,75,75,18.0,8.0,4.0,8.0,6.0,EDM,CGY,2.5,0,1,23,9,12,2.75,2.5,2.166667
3,2010,20005,EDM,C,67,88,18.0,4.0,4.0,8.0,6.0,EDM,CGY,2.75,1,0,26,9,12,2.75,2.375,2.0
4,2010,20005,EDM,D,67,88,18.0,6.0,4.0,8.0,6.0,EDM,CGY,2.0,1,0,26,9,12,2.75,2.375,2.0


- compute the mean per position by team for the season.

In [82]:
dv['Mean_C']= dv.groupby(['Season', 'TeamCode'])['Rank_C'].transform('mean')
dv['Mean_W']= dv.groupby(['Season', 'TeamCode'])['Rank_W'].transform('mean')
dv['Mean_D']= dv.groupby(['Season', 'TeamCode'])['Rank_D'].transform('mean')
dv.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerPosition,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL,Rank_C,Rank_W,Rank_D,Mean_C,Mean_W,Mean_D
0,2010,20005,CGY,C,75,75,18.0,4.0,4.0,8.0,6.0,EDM,CGY,2.75,0,1,23,9,12,2.75,2.5,2.166667,2.608696,2.201087,1.76087
1,2010,20005,CGY,D,75,75,18.0,6.0,4.0,8.0,6.0,EDM,CGY,2.166667,0,1,23,9,12,2.75,2.5,2.166667,2.608696,2.201087,1.76087
2,2010,20005,CGY,W,75,75,18.0,8.0,4.0,8.0,6.0,EDM,CGY,2.5,0,1,23,9,12,2.75,2.5,2.166667,2.608696,2.201087,1.76087
3,2010,20005,EDM,C,67,88,18.0,4.0,4.0,8.0,6.0,EDM,CGY,2.75,1,0,26,9,12,2.75,2.375,2.0,2.826923,2.524038,2.179487
4,2010,20005,EDM,D,67,88,18.0,6.0,4.0,8.0,6.0,EDM,CGY,2.0,1,0,26,9,12,2.75,2.375,2.0,2.826923,2.524038,2.179487


- display the quantity of wins and losses per team ( roster of 12 forwards and 6 defensemen)

In [83]:
dv['L'] = dv.apply(lambda x: x['GL'] if x['TeamCode']== x['LossTeam'] else (x['GP'] - x['GW']), 1)
dv['W'] = dv.apply(lambda x: x['GW'] if x['TeamCode']== x['WinTeam'] else (x['GP'] - x['GL']), 1)
dv.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerPosition,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL,Rank_C,Rank_W,Rank_D,Mean_C,Mean_W,Mean_D,L,W
0,2010,20005,CGY,C,75,75,18.0,4.0,4.0,8.0,6.0,EDM,CGY,2.75,0,1,23,9,12,2.75,2.5,2.166667,2.608696,2.201087,1.76087,12,11
1,2010,20005,CGY,D,75,75,18.0,6.0,4.0,8.0,6.0,EDM,CGY,2.166667,0,1,23,9,12,2.75,2.5,2.166667,2.608696,2.201087,1.76087,12,11
2,2010,20005,CGY,W,75,75,18.0,8.0,4.0,8.0,6.0,EDM,CGY,2.5,0,1,23,9,12,2.75,2.5,2.166667,2.608696,2.201087,1.76087,12,11
3,2010,20005,EDM,C,67,88,18.0,4.0,4.0,8.0,6.0,EDM,CGY,2.75,1,0,26,9,12,2.75,2.375,2.0,2.826923,2.524038,2.179487,17,9
4,2010,20005,EDM,D,67,88,18.0,6.0,4.0,8.0,6.0,EDM,CGY,2.0,1,0,26,9,12,2.75,2.375,2.0,2.826923,2.524038,2.179487,17,9


- compute win and loss percent by team. Drop duplicate observations.

In [84]:
dv = dv[['Season', 'TeamCode', 'GP', 'L', 'W', 'GF', 'GA', 'Mean_C', 'Mean_W', 'Mean_D']]
dv = dv.drop_duplicates(['Season', 'TeamCode'])
dv['WinPc'] = dv['W']/ dv['GP']
dv['LossPc'] = dv['L']/ dv['GP']

dv = dv[['Season', 'TeamCode', 'GP','W', 'L', 'GF', 'GA', 'WinPc', 'LossPc', 'Mean_C', 'Mean_W', 'Mean_D']]
dv.head()

Unnamed: 0,Season,TeamCode,GP,W,L,GF,GA,WinPc,LossPc,Mean_C,Mean_W,Mean_D
0,2010,CGY,23,11,12,75,75,0.478261,0.521739,2.608696,2.201087,1.76087
3,2010,EDM,26,9,17,67,88,0.346154,0.653846,2.826923,2.524038,2.179487
6,2010,CBJ,25,9,16,69,86,0.36,0.64,2.53,2.63,1.986667
9,2010,SJ,17,10,7,61,47,0.588235,0.411765,2.044118,2.522059,1.931373
12,2010,ANA,10,3,7,18,32,0.3,0.7,2.825,2.4875,2.233333


- rank teams based on win percent, mean forwards and mean defensemen. 

In [85]:
dv['Rank_Win'] = dv.groupby(['Season'])['WinPc'].rank(ascending=False)
dv['Rank_C'] = dv.groupby(['Season'])['Mean_C'].rank(ascending=True)
dv['Rank_W'] = dv.groupby(['Season'])['Mean_W'].rank(ascending=True)
dv['Rank_D'] = dv.groupby(['Season'])['Mean_D'].rank(ascending=True)
dv = dv.sort_values(['Season', 'Rank_Win', 'Rank_C', 'Rank_W', 'Rank_D'], ascending=[True, True, True, True, True])
dv.head(30)

Unnamed: 0,Season,TeamCode,GP,W,L,GF,GA,WinPc,LossPc,Mean_C,Mean_W,Mean_D,Rank_Win,Rank_C,Rank_W,Rank_D
39,2010,NSH,16,13,3,55,29,0.8125,0.1875,2.46875,2.453125,1.5625,1.0,16.0,11.0,2.0
105,2010,VAN,26,21,5,87,54,0.807692,0.192308,2.288462,2.600962,2.160256,2.0,11.0,20.0,22.0
15,2010,DET,25,18,7,93,63,0.72,0.28,2.25,2.32,1.78,3.0,9.0,8.0,6.0
45,2010,LA,25,17,8,68,54,0.68,0.32,2.56,2.755,1.926667,4.0,20.0,27.0,14.0
18,2010,BUF,18,12,6,62,52,0.666667,0.333333,2.694444,2.090278,1.907407,5.0,25.0,2.0,13.0
51,2010,PHI,31,19,12,102,82,0.612903,0.387097,1.774194,2.423387,1.451613,6.0,1.0,9.0,1.0
66,2010,CHI,20,12,8,68,54,0.6,0.4,1.9875,2.16875,1.858333,7.5,2.0,4.0,12.0
573,2010,OTT,10,6,4,22,22,0.6,0.4,2.925,2.725,2.233333,7.5,29.0,26.0,25.0
21,2010,NYR,22,13,9,72,56,0.590909,0.409091,2.0,2.215909,2.022727,9.0,3.0,7.0,18.0
9,2010,SJ,17,10,7,61,47,0.588235,0.411765,2.044118,2.522059,1.931373,10.0,4.0,16.0,15.0


In [86]:
dv.shape

(30, 16)

In [59]:
#dv.to_csv('season_team_roster_ranking.csv', index='False')

In [87]:
dv.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/season_team_roster_ranking_centers_wingers_defensemen.csv', index='False', sep=',')
#dv.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/season_team_roster_ranking_centers_wingers_defensemen.csv', index='False', sep=',')

## season_game_team_roster_data

- use the team roster player rank dataset (dw) to display the roster quality by team per game

In [88]:
dx = dw

In [89]:
dx = dx[['Season', 'GameNumber', 'TeamCode', 'RosterCount', 'PlayerNumber', 'PlayerPosition', 'Rank']]
dx = dx.rename(columns={'PlayerPosition': 'Position'})
dx.head()

Unnamed: 0,Season,GameNumber,TeamCode,RosterCount,PlayerNumber,Position,Rank
6426,2010,20005,CGY,18.0,13.0,C,2.0
6427,2010,20005,CGY,18.0,12.0,W,1.0
6428,2010,20005,CGY,18.0,40.0,W,1.0
6429,2010,20005,CGY,18.0,3.0,D,3.0
6431,2010,20005,CGY,18.0,28.0,D,2.0


- group by season, game number, team and player to count the occurance of each player per game and sum up the observations of players. There should be 19 players per team and 38 per game for the dataset to be correct.

In [90]:
dx['playercount'] = dx.groupby(['Season', 'GameNumber', 'TeamCode', 'PlayerNumber',])['PlayerNumber'].transform('count')

In [91]:
dx['rosterposition'] = dx.groupby(['Season', 'GameNumber', 'TeamCode', 'Position', 'Rank'])['playercount'].transform('sum')

#### pivot table

- the next step is to group players by gamenumber, teamcode, position and rank, to display the quality of players each team has per position. **Pivot table** by player position and rank using roster position values. Game number and team are the indexes. We want to join the levels to generate columns by roster position and rank (10 columns). 


In [92]:
dx = pd.pivot_table(dx, index=['Season', 'GameNumber', 'TeamCode', 'RosterCount'], columns=['Position', 'Rank'], values=['rosterposition'])
dx = dx.reset_index()
dx.columns = ['_'.join(str(s).strip() for s in col if s) for col in dx.columns]
dx.reset_index()
dx = dx.fillna(0)
dx = dx.rename(columns={'rosterposition_C_1.0': 'C1', 'rosterposition_C_2.0': 'C2', 'rosterposition_C_3.0': 'C3', 'rosterposition_C_4.0': 'C4', 'rosterposition_W_1.0': 'W1', 'rosterposition_W_2.0': 'W2', 'rosterposition_W_3.0': 'W3', 'rosterposition_W_4.0': 'W4', 'rosterposition_D_1.0': 'D1', 'rosterposition_D_2.0': 'D2', 'rosterposition_D_3.0': 'D3' })
dx.head(10)

Unnamed: 0,Season,GameNumber,TeamCode,RosterCount,C1,C2,C3,C4,D1,D2,D3,W1,W2,W3,W4
0,2010,20005,CGY,18.0,0.0,2.0,1.0,1.0,1.0,3.0,2.0,2.0,2.0,2.0,2.0
1,2010,20005,EDM,18.0,0.0,2.0,1.0,1.0,1.0,4.0,1.0,1.0,4.0,2.0,1.0
2,2010,20006,CBJ,18.0,0.0,2.0,2.0,0.0,0.0,6.0,0.0,1.0,3.0,2.0,2.0
3,2010,20006,SJ,18.0,2.0,1.0,0.0,1.0,1.0,4.0,1.0,3.0,1.0,1.0,3.0
4,2010,20009,ANA,18.0,1.0,1.0,0.0,2.0,2.0,0.0,4.0,3.0,1.0,2.0,2.0
5,2010,20009,DET,18.0,1.0,2.0,1.0,0.0,3.0,2.0,1.0,1.0,4.0,3.0,0.0
6,2010,20013,BUF,18.0,1.0,1.0,2.0,0.0,2.0,2.0,2.0,2.0,4.0,1.0,1.0
7,2010,20013,NYR,18.0,0.0,4.0,0.0,0.0,1.0,4.0,1.0,2.0,1.0,4.0,1.0
8,2010,20015,DAL,18.0,2.0,0.0,1.0,1.0,0.0,2.0,4.0,2.0,2.0,1.0,3.0
9,2010,20015,NYI,18.0,1.0,1.0,1.0,1.0,3.0,1.0,2.0,0.0,4.0,1.0,3.0


In [93]:
dx.shape

(562, 15)

In [67]:
#dx.to_csv('season_game_team_roster.csv', index='False', sep=',')

In [94]:
dx.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/season_game_team_roster_centers_wingers_defensemen.csv', index='False', sep=',')
#dx.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/season_game_team_roster_centers_wingers_defensemen.csv', index='False', sep=',')

- create a dataset that will display the mean of forwards and defencemen by season per team

In [95]:
dz = dx

In [96]:
# mean centers ranking per team
dz['MeanC1'] = dz.groupby(['Season', 'TeamCode'])['C1'].transform('mean')
dz['MeanC2'] = dz.groupby(['Season', 'TeamCode'])['C2'].transform('mean')
dz['MeanC3'] = dz.groupby(['Season', 'TeamCode'])['C3'].transform('mean')
dz['MeanC4'] = dz.groupby(['Season', 'TeamCode'])['C4'].transform('mean')

# mean wing ranking per team
dz['MeanW1'] = dz.groupby(['Season', 'TeamCode'])['W1'].transform('mean')
dz['MeanW2'] = dz.groupby(['Season', 'TeamCode'])['W2'].transform('mean')
dz['MeanW3'] = dz.groupby(['Season', 'TeamCode'])['W3'].transform('mean')
dz['MeanW4'] = dz.groupby(['Season', 'TeamCode'])['W4'].transform('mean')


# meandefense ranking per team
dz['MeanD1'] = dz.groupby(['Season', 'TeamCode'])['D1'].transform('mean')
dz['MeanD2'] = dz.groupby(['Season', 'TeamCode'])['D2'].transform('mean')
dz['MeanD3'] = dz.groupby(['Season', 'TeamCode'])['D3'].transform('mean')

dz.head()

Unnamed: 0,Season,GameNumber,TeamCode,RosterCount,C1,C2,C3,C4,D1,D2,D3,W1,W2,W3,W4,MeanC1,MeanC2,MeanC3,MeanC4,MeanW1,MeanW2,MeanW3,MeanW4,MeanD1,MeanD2,MeanD3
0,2010,20005,CGY,18.0,0.0,2.0,1.0,1.0,1.0,3.0,2.0,2.0,2.0,2.0,2.0,0.0,1.869565,1.826087,0.304348,2.0,2.73913,2.913043,0.347826,1.913043,3.608696,0.478261
1,2010,20005,EDM,18.0,0.0,2.0,1.0,1.0,1.0,4.0,1.0,1.0,4.0,2.0,1.0,0.0,1.692308,1.307692,1.0,0.538462,4.115385,1.961538,1.384615,0.692308,3.538462,1.769231
2,2010,20006,CBJ,18.0,0.0,2.0,2.0,0.0,0.0,6.0,0.0,1.0,3.0,2.0,2.0,0.0,1.88,2.12,0.0,0.84,3.0,2.44,1.72,0.4,5.28,0.32
3,2010,20006,SJ,18.0,2.0,1.0,0.0,1.0,1.0,4.0,1.0,3.0,1.0,1.0,3.0,1.941176,1.0,0.0,1.058824,2.941176,0.764706,1.470588,2.823529,1.411765,3.588235,1.0
4,2010,20009,ANA,18.0,1.0,1.0,0.0,2.0,2.0,0.0,4.0,3.0,1.0,2.0,2.0,0.9,1.0,0.0,2.1,2.7,1.0,2.0,2.3,1.9,0.8,3.3


- drop duplicates by season and team

In [97]:
dz = dz.drop_duplicates(['Season', 'TeamCode'])

In [98]:
dz = dz[['Season', 'TeamCode', 'RosterCount', 'MeanC1', 'MeanC2', 'MeanC3', 'MeanC4', 'MeanW1', 'MeanW2', 'MeanW3', 'MeanW4', 'MeanD1', 'MeanD2', 'MeanD3']]
dz.head()

Unnamed: 0,Season,TeamCode,RosterCount,MeanC1,MeanC2,MeanC3,MeanC4,MeanW1,MeanW2,MeanW3,MeanW4,MeanD1,MeanD2,MeanD3
0,2010,CGY,18.0,0.0,1.869565,1.826087,0.304348,2.0,2.73913,2.913043,0.347826,1.913043,3.608696,0.478261
1,2010,EDM,18.0,0.0,1.692308,1.307692,1.0,0.538462,4.115385,1.961538,1.384615,0.692308,3.538462,1.769231
2,2010,CBJ,18.0,0.0,1.88,2.12,0.0,0.84,3.0,2.44,1.72,0.4,5.28,0.32
3,2010,SJ,18.0,1.941176,1.0,0.0,1.058824,2.941176,0.764706,1.470588,2.823529,1.411765,3.588235,1.0
4,2010,ANA,18.0,0.9,1.0,0.0,2.1,2.7,1.0,2.0,2.3,1.9,0.8,3.3


In [99]:
dz.shape

(30, 14)

In [74]:
#dz.to_csv('season_team.csv', index='False', sep=',')

In [100]:
dz.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/season_team_centers_wingers_defensemen.csv', index='False', sep=',')
#dz.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/season_team_centers_wingers_defensemen.csv', index='False', sep=',')

- create an index variable to deterime if a team is considered visitor or home for a given game. The column will be named "A". The 1st observation per game is the visitor team and will be assigned a value of 1. The 2nd and final observation per game, is the home team, so we fill in NaN with a value of 2 (home team)

In [101]:
dy = dx

In [102]:
dy.loc[dy.groupby('GameNumber',as_index=False).head(1).index,'A'] = 1
dy = dy.fillna(2)

In [103]:
dy.head()

Unnamed: 0,Season,GameNumber,TeamCode,RosterCount,C1,C2,C3,C4,D1,D2,D3,W1,W2,W3,W4,MeanC1,MeanC2,MeanC3,MeanC4,MeanW1,MeanW2,MeanW3,MeanW4,MeanD1,MeanD2,MeanD3,A
0,2010,20005,CGY,18.0,0.0,2.0,1.0,1.0,1.0,3.0,2.0,2.0,2.0,2.0,2.0,0.0,1.869565,1.826087,0.304348,2.0,2.73913,2.913043,0.347826,1.913043,3.608696,0.478261,1.0
1,2010,20005,EDM,18.0,0.0,2.0,1.0,1.0,1.0,4.0,1.0,1.0,4.0,2.0,1.0,0.0,1.692308,1.307692,1.0,0.538462,4.115385,1.961538,1.384615,0.692308,3.538462,1.769231,2.0
2,2010,20006,CBJ,18.0,0.0,2.0,2.0,0.0,0.0,6.0,0.0,1.0,3.0,2.0,2.0,0.0,1.88,2.12,0.0,0.84,3.0,2.44,1.72,0.4,5.28,0.32,1.0
3,2010,20006,SJ,18.0,2.0,1.0,0.0,1.0,1.0,4.0,1.0,3.0,1.0,1.0,3.0,1.941176,1.0,0.0,1.058824,2.941176,0.764706,1.470588,2.823529,1.411765,3.588235,1.0,2.0
4,2010,20009,ANA,18.0,1.0,1.0,0.0,2.0,2.0,0.0,4.0,3.0,1.0,2.0,2.0,0.9,1.0,0.0,2.1,2.7,1.0,2.0,2.3,1.9,0.8,3.3,1.0


- **pivot table using game number as index by whether a team is visitor (1) or home (2)**. The table will display the quality of each player per position and team. The next step is to join columns by team and player quality value. We will have for each team 10 columns ( 5 positions x 2 type of player quality). We will rename the columns as following: VC1 shows the amount of elite centers for the visitor team, HC1 displays the amount of elite centers for the home team etc. We rename the columns and sort them based on team, position and quality. 

In [104]:
dy = pd.pivot_table(dy, index=['Season', 'GameNumber'], columns=['A'], values=['C1', 'C2', 'C3', 'C4', 'W1', 'W2', 'W3', 'W4', 'D1', 'D2', 'D3'])
dy = dy.reset_index()
dy.columns = ['_'.join(str(s).strip() for s in col if s) for col in dy.columns]
dy = dy.reset_index()
dy = dy.rename(columns={'C1_1.0': 'VC1', 'C2_1.0': 'VC2', 'C3_1.0': 'VC3', 'C4_1.0': 'VC4', 'W1_1.0': 'VW1', 'W2_1.0': 'VW2', 'W3_1.0': 'VW3', 'W4_1.0': 'VW4', 'D1_1.0': 'VD1', 'D2_1.0': 'VD2', 'D3_1.0': 'VD3', 'C1_2.0': 'HC1', 'C2_2.0': 'HC2', 'C3_2.0': 'HC3', 'C4_2.0': 'HC4', 'W1_2.0': 'HW1', 'W2_2.0': 'HW2', 'W3_2.0': 'HW3', 'W4_2.0': 'HW4', 'D1_2.0': 'HD1', 'D2_2.0': 'HD2', 'D3_2.0': 'HD3'})
dy = dy[['Season', 'GameNumber', 'VC1', 'VC2', 'VC3', 'VC4', 'VW1', 'VW2', 'VW3', 'VW4', 'VD1', 'VD2', 'VD3', 'HC1', 'HC2', 'HC3', 'HC4', 'HW1', 'HW2', 'HW3', 'HW4', 'HD1', 'HD2', 'HD3']]
dy.sort_values(['Season', 'GameNumber'], ascending=[True, True], inplace=True)
dy.head()

Unnamed: 0,Season,GameNumber,VC1,VC2,VC3,VC4,VW1,VW2,VW3,VW4,VD1,VD2,VD3,HC1,HC2,HC3,HC4,HW1,HW2,HW3,HW4,HD1,HD2,HD3
0,2010,20005,0.0,2.0,1.0,1.0,2.0,2.0,2.0,2.0,1.0,3.0,2.0,0.0,2.0,1.0,1.0,1.0,4.0,2.0,1.0,1.0,4.0,1.0
1,2010,20006,0.0,2.0,2.0,0.0,1.0,3.0,2.0,2.0,0.0,6.0,0.0,2.0,1.0,0.0,1.0,3.0,1.0,1.0,3.0,1.0,4.0,1.0
2,2010,20009,1.0,1.0,0.0,2.0,3.0,1.0,2.0,2.0,2.0,0.0,4.0,1.0,2.0,1.0,0.0,1.0,4.0,3.0,0.0,3.0,2.0,1.0
3,2010,20013,1.0,1.0,2.0,0.0,2.0,4.0,1.0,1.0,2.0,2.0,2.0,0.0,4.0,0.0,0.0,2.0,1.0,4.0,1.0,1.0,4.0,1.0
4,2010,20015,2.0,0.0,1.0,1.0,2.0,2.0,1.0,3.0,0.0,2.0,4.0,1.0,1.0,1.0,1.0,0.0,4.0,1.0,3.0,3.0,1.0,2.0


In [105]:
dy.shape

(281, 24)

In [81]:
#dy.to_csv('season_game_roster.csv', index='False', sep=',')

In [106]:
dy.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/season_game_roster_center_winger_defensemen.csv', index='False', sep=',')
#dy.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/season_game_roster_center_winger_defensemen.csv', index='False', sep=',')

# Roster Analysis

## season_level_analysis

#### $WinPc = \beta_{0} + \beta_{1}MeanC_{1} + \beta_{2}MeanC_{2}+ \beta_{3}MeanC_{3} + \beta_{4}MeanC_{4} + \beta_{5}MeanW_{1} + \beta_{6}MeanW_{2}+ \beta_{7}MeanW_{3} + \beta_{8}MeanW_{4} + \beta_{9}MeanD_{1} + \beta_{10}MeanD_{2}+ \beta_{11}MeanD_{3} + e_{s}$

- merge season_team dataset (dz) and season_team_roster_ranking (dv) for roster analysis at the season level. Use **ds** as the merging dataset.

In [107]:
ds = dv.merge(dz, on=['Season', 'TeamCode'], how='left')
ds.head()

Unnamed: 0,Season,TeamCode,GP,W,L,GF,GA,WinPc,LossPc,Mean_C,Mean_W,Mean_D,Rank_Win,Rank_C,Rank_W,Rank_D,RosterCount,MeanC1,MeanC2,MeanC3,MeanC4,MeanW1,MeanW2,MeanW3,MeanW4,MeanD1,MeanD2,MeanD3
0,2010,NSH,16,13,3,55,29,0.8125,0.1875,2.46875,2.453125,1.5625,1.0,16.0,11.0,2.0,18.0,0.0,2.25,1.625,0.125,0.6875,3.5,3.3125,0.5,3.625,1.375,1.0
1,2010,VAN,26,21,5,87,54,0.807692,0.192308,2.288462,2.600962,2.160256,2.0,11.0,20.0,22.0,18.0,2.0,0.0,0.846154,1.153846,1.0,2.576923,3.038462,1.384615,1.5,2.038462,2.461538
2,2010,DET,25,18,7,93,63,0.72,0.28,2.25,2.32,1.78,3.0,9.0,8.0,6.0,18.0,0.64,1.8,1.48,0.08,0.96,3.72,3.12,0.2,2.76,1.8,1.44
3,2010,LA,25,17,8,68,54,0.68,0.32,2.56,2.755,1.926667,4.0,20.0,27.0,14.0,18.0,0.88,1.0,1.12,1.0,0.88,2.0,3.32,1.8,1.92,2.6,1.48
4,2010,BUF,18,12,6,62,52,0.666667,0.333333,2.694444,2.090278,1.907407,5.0,25.0,2.0,13.0,18.0,0.111111,1.0,2.888889,0.0,1.888889,4.111111,1.388889,0.611111,1.666667,3.222222,1.111111


- mean goals for and mean goals against per team.

In [108]:
ds['meanGF'] = ds['GF']/ ds['GP']
ds['meanGA'] = ds['GA']/ ds['GP']

In [109]:
ds.shape

(30, 30)

### summary analysis

In [110]:
ds.describe()

Unnamed: 0,Season,GP,W,L,GF,GA,WinPc,LossPc,Mean_C,Mean_W,Mean_D,Rank_Win,Rank_C,Rank_W,Rank_D,RosterCount,MeanC1,MeanC2,MeanC3,MeanC4,MeanW1,MeanW2,MeanW3,MeanW4,MeanD1,MeanD2,MeanD3,meanGF,meanGA
count,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
mean,2010.0,18.733333,9.366667,9.366667,53.133333,53.133333,0.480995,0.519005,2.423752,2.48852,1.98981,15.5,15.5,15.5,15.5,18.0,0.820393,1.249826,1.344161,0.58562,1.256348,2.975693,2.371412,1.396547,1.678715,2.703709,1.617576,2.717741,2.814605
std,0.0,7.750121,5.327116,4.810071,26.654537,25.429732,0.182702,0.182702,0.305459,0.236092,0.252466,8.80047,8.803408,8.802429,8.802429,0.0,0.847599,0.961599,0.807958,0.569055,0.917852,1.162235,0.921894,0.804711,0.902039,1.169535,1.008729,0.610354,0.478229
min,2010.0,4.0,0.0,2.0,7.0,10.0,0.0,0.1875,1.774194,2.0,1.451613,1.0,1.0,1.0,1.0,18.0,0.0,0.0,0.0,0.0,0.0,0.764706,0.0,0.0,0.0,0.636364,0.0,1.615385,1.8125
25%,2010.0,13.25,6.0,6.25,31.0,33.25,0.387311,0.402273,2.226562,2.345847,1.833333,7.875,8.25,8.25,8.5,18.0,0.0,0.779167,0.909848,0.09125,0.575721,2.053322,1.845223,0.75,1.016667,1.85,0.910985,2.270089,2.526364
50%,2010.0,17.0,9.0,8.0,58.0,49.0,0.481066,0.518934,2.464375,2.519363,1.947829,15.5,15.5,15.5,15.5,18.0,0.833333,1.0,1.393846,0.348485,1.0,2.95,2.603896,1.358974,1.651515,2.78341,1.353333,2.74,2.861111
75%,2010.0,25.0,12.75,12.0,74.25,72.0,0.597727,0.612689,2.600852,2.6575,2.174679,22.75,22.75,22.75,22.75,18.0,1.35,1.852174,1.977273,1.0,1.934722,3.913306,3.077797,1.955882,2.0,3.472028,2.423077,3.269763,3.132051
max,2010.0,33.0,21.0,22.0,102.0,106.0,0.8125,1.0,3.0,2.990385,2.52381,30.0,30.0,30.0,30.0,18.0,2.967742,4.0,2.888889,2.1,2.941176,5.0,3.923077,2.823529,3.645161,5.28,3.727273,3.72,3.833333


### estimate roster model 

- regress **team win percent** on the mean of players by position and quality (predictor variables). Add a constant to the predictors and use **OLS**. The purpose is to deterimine the impact each roster position has on team winning percent.

In [111]:
print ('season level analysis (win percent) by mean roster position')
y = ds['WinPc']  
X = sm.add_constant(ds[['MeanC1', 'MeanC2', 'MeanC3', 'MeanC4','MeanW1', 'MeanW2', 'MeanW3', 'MeanW4', 'MeanD1', 'MeanD2', 'MeanD3' ]] )
result = sm.OLS(y, X).fit()
print(result.summary())

beginningtex = """\\documentclass{report}
\\usepackage{booktabs}
\\begin{document}"""
endtex = "\end{document}"

f = open('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/latex/roster/all_events_per_toi/winpc_ols_by_mean_roster_position_centers_wingers_defensemen.tex', 'w')
f.write(beginningtex)
f.write(result.summary().as_latex())
f.write(endtex)
f.close()

season level analysis (win percent) by mean roster position
                            OLS Regression Results                            
Dep. Variable:                  WinPc   R-squared:                       0.264
Model:                            OLS   Adj. R-squared:                 -0.016
Method:                 Least Squares   F-statistic:                    0.9420
Date:                Tue, 27 Feb 2018   Prob (F-statistic):              0.504
Time:                        13:02:33   Log-Likelihood:                 13.537
No. Observations:                  30   AIC:                            -9.074
Df Residuals:                      21   BIC:                             3.537
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
-----------------------------------------------------------------------

FileNotFoundError: [Errno 2] No such file or directory: '/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/latex/roster/all_events_per_toi/winpc_ols_by_mean_roster_position_centers_wingers_defensemen.tex'

In [113]:
result.params

const     0.015575
MeanC1    0.001274
MeanC2   -0.015880
MeanC3    0.000002
MeanC4    0.076903
MeanW1    0.078236
MeanW2    0.045018
MeanW3    0.033770
MeanW4   -0.032428
MeanD1    0.067630
MeanD2    0.015509
MeanD3    0.010308
dtype: float64

- regress **team win percent** on the mean of top forwards. Add a constant to the predictors and use **OLS**. The purpose is to deterimine the impact each roster position has on team winning percent.

In [114]:
#y = ds['WinPc']  
#X = sm.add_constant(ds[['MeanC1']] )
#result = sm.OLS(y, X).fit()
#result.summary()

- regress **team win percent** on the mean of players by position and quality (predictor variables). Add a constant to the predictors and use **Logit**.  The purpose is to deterimine the impact each roster position has on team winning percent.

In [115]:
y = ds['WinPc']  
X = sm.add_constant(ds[['MeanC1', 'MeanC2', 'MeanC3', 'MeanC4','MeanW1', 'MeanW2', 'MeanW3', 'MeanW4', 'MeanD1', 'MeanD2', 'MeanD3']] )
result = sm.Logit(y, X).fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.660514
         Iterations 19


  return np.sqrt(np.diag(self.cov_params()))
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


0,1,2,3
Dep. Variable:,WinPc,No. Observations:,30.0
Model:,Logit,Df Residuals:,21.0
Method:,MLE,Df Model:,8.0
Date:,"Tue, 27 Feb 2018",Pseudo R-squ.:,0.04522
Time:,13:04:20,Log-Likelihood:,-19.815
converged:,True,LL-Null:,-20.754
,,LLR p-value:,0.9846

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,2.4910,,,,nan nan
MeanC1,-0.6769,,,,nan nan
MeanC2,-0.7494,,,,nan nan
MeanC3,-0.6856,,,,nan nan
MeanC4,-0.3702,,,,nan nan
MeanW1,0.1982,4.26e+06,4.66e-08,1.000,-8.34e+06 8.34e+06
MeanW2,0.0620,4.3e+06,1.44e-08,1.000,-8.42e+06 8.42e+06
MeanW3,0.0111,4.22e+06,2.64e-09,1.000,-8.27e+06 8.27e+06
MeanW4,-0.2573,4.3e+06,-5.98e-08,1.000,-8.43e+06 8.43e+06


- regress **team win percent** on the mean of top forwards. Add a constant to the predictors and use **Logit**. The purpose is to deterimine the impact each roster position has on team winning percent.

In [92]:
#y = ds['WinPc']  
#X = sm.add_constant(ds[['MeanC1']] )
#result = sm.Logit(y, X).fit()
#result.summary()

Optimization terminated successfully.
         Current function value: 0.690507
         Iterations 4


0,1,2,3
Dep. Variable:,WinPc,No. Observations:,30.0
Model:,Logit,Df Residuals:,28.0
Method:,MLE,Df Model:,1.0
Date:,"Tue, 20 Feb 2018",Pseudo R-squ.:,0.003666
Time:,13:29:19,Log-Likelihood:,-20.715
converged:,True,LL-Null:,-20.791
,,LLR p-value:,0.6962

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,-0.2654,0.954,-0.278,0.781,-2.136 1.605
MeanC1,0.1490,0.535,0.278,0.781,-0.900 1.198


- regress **mean goals for** on the mean of players by position and quality (predictor variables). Add a constant to the predictors and use **OLS**. The purpose is to deterimine the impact each roster position has on team winning percent.

In [116]:
y = ds['meanGF']  
X = sm.add_constant(ds[['MeanC1', 'MeanC2', 'MeanC3', 'MeanC4','MeanW1', 'MeanW2', 'MeanW3', 'MeanW4', 'MeanD1', 'MeanD2', 'MeanD3']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,meanGF,R-squared:,0.327
Model:,OLS,Adj. R-squared:,0.071
Method:,Least Squares,F-statistic:,1.276
Date:,"Tue, 27 Feb 2018",Prob (F-statistic):,0.308
Time:,13:05:26,Log-Likelihood:,-21.305
No. Observations:,30,AIC:,60.61
Df Residuals:,21,BIC:,73.22
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.0828,0.006,13.876,0.000,0.070 0.095
MeanC1,0.2658,0.163,1.630,0.118,-0.073 0.605
MeanC2,0.1222,0.121,1.007,0.326,-0.130 0.375
MeanC3,0.0805,0.114,0.706,0.488,-0.157 0.318
MeanC4,-0.1371,0.213,-0.644,0.526,-0.579 0.305
MeanW1,0.2229,0.112,1.989,0.060,-0.010 0.456
MeanW2,0.1500,0.085,1.761,0.093,-0.027 0.327
MeanW3,0.1605,0.094,1.716,0.101,-0.034 0.355
MeanW4,0.1293,0.130,0.995,0.331,-0.141 0.400

0,1,2,3
Omnibus:,5.226,Durbin-Watson:,1.319
Prob(Omnibus):,0.073,Jarque-Bera (JB):,2.024
Skew:,-0.25,Prob(JB):,0.363
Kurtosis:,1.829,Cond. No.,1.28e+17


- regress **mean goals against** on the mean of players by position and quality (predictor variables). Add a constant to the predictors and use **OLS**. The purpose is to deterimine the impact each roster position has on team winning percent.

In [117]:
y = ds['meanGA']  
X = sm.add_constant(ds[['MeanC1', 'MeanC2', 'MeanC3', 'MeanC4','MeanW1', 'MeanW2', 'MeanW3', 'MeanW4', 'MeanD1', 'MeanD2', 'MeanD3']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,meanGA,R-squared:,0.229
Model:,OLS,Adj. R-squared:,-0.065
Method:,Least Squares,F-statistic:,0.779
Date:,"Tue, 27 Feb 2018",Prob (F-statistic):,0.626
Time:,13:06:21,Log-Likelihood:,-16.032
No. Observations:,30,AIC:,50.06
Df Residuals:,21,BIC:,62.67
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.0840,0.005,16.774,0.000,0.074 0.094
MeanC1,0.1928,0.137,1.410,0.173,-0.092 0.477
MeanC2,0.1642,0.102,1.612,0.122,-0.048 0.376
MeanC3,0.1588,0.096,1.660,0.112,-0.040 0.358
MeanC4,-0.1797,0.178,-1.007,0.325,-0.551 0.191
MeanW1,0.1180,0.094,1.255,0.223,-0.077 0.313
MeanW2,0.1336,0.071,1.870,0.075,-0.015 0.282
MeanW3,0.0764,0.078,0.974,0.341,-0.087 0.240
MeanW4,0.3440,0.109,3.154,0.005,0.117 0.571

0,1,2,3
Omnibus:,0.878,Durbin-Watson:,0.713
Prob(Omnibus):,0.645,Jarque-Bera (JB):,0.663
Skew:,0.352,Prob(JB):,0.718
Kurtosis:,2.81,Cond. No.,1.28e+17


## season_game_level_analysis

#### $HomeWin = \beta_{0} + \beta_{1}DC_{1} + \beta_{2}DC_{2} + \beta_{3}DC_{3} + \beta_{4}DC_{4} + \beta_{5}DW_{1} + \beta_{6}DW_{2} + \beta_{7}DW_{3} + \beta_{8}DW_{4} + + \beta_{9}DD_{1} + \beta_{10}DD_{2} + \beta_{11}DD_{3} + e_{s,g}$

- merge season game data (dg) and season game roster (dy).

In [122]:
dg.head()

Unnamed: 0,Season,GameNumber,VTeamCode,HTeamCode,VGF,HGF,GD,VGA,HGA,WinTeam,LossTeam
0,2010,20001,MTL,TOR,2,3,1,3,2,TOR,MTL
1,2010,20002,PHI,PIT,3,2,-1,2,3,PHI,PIT
2,2010,20003,CAR,MIN,4,3,-1,3,4,CAR,MIN
3,2010,20004,CHI,COL,3,4,1,4,3,COL,CHI
4,2010,20005,CGY,EDM,0,4,4,4,0,EDM,CGY


In [123]:
dy.head()

Unnamed: 0,Season,GameNumber,VC1,VC2,VC3,VC4,VW1,VW2,VW3,VW4,VD1,VD2,VD3,HC1,HC2,HC3,HC4,HW1,HW2,HW3,HW4,HD1,HD2,HD3
0,2010,20005,0.0,2.0,1.0,1.0,2.0,2.0,2.0,2.0,1.0,3.0,2.0,0.0,2.0,1.0,1.0,1.0,4.0,2.0,1.0,1.0,4.0,1.0
1,2010,20006,0.0,2.0,2.0,0.0,1.0,3.0,2.0,2.0,0.0,6.0,0.0,2.0,1.0,0.0,1.0,3.0,1.0,1.0,3.0,1.0,4.0,1.0
2,2010,20009,1.0,1.0,0.0,2.0,3.0,1.0,2.0,2.0,2.0,0.0,4.0,1.0,2.0,1.0,0.0,1.0,4.0,3.0,0.0,3.0,2.0,1.0
3,2010,20013,1.0,1.0,2.0,0.0,2.0,4.0,1.0,1.0,2.0,2.0,2.0,0.0,4.0,0.0,0.0,2.0,1.0,4.0,1.0,1.0,4.0,1.0
4,2010,20015,2.0,0.0,1.0,1.0,2.0,2.0,1.0,3.0,0.0,2.0,4.0,1.0,1.0,1.0,1.0,0.0,4.0,1.0,3.0,3.0,1.0,2.0


In [118]:
dl = dg.merge(dy, on=['Season', 'GameNumber'], how='left')
dl.head()

Unnamed: 0,Season,GameNumber,VTeamCode,HTeamCode,VGF,HGF,GD,VGA,HGA,WinTeam,LossTeam,VC1,VC2,VC3,VC4,VW1,VW2,VW3,VW4,VD1,VD2,VD3,HC1,HC2,HC3,HC4,HW1,HW2,HW3,HW4,HD1,HD2,HD3
0,2010,20001,MTL,TOR,2,3,1,3,2,TOR,MTL,,,,,,,,,,,,,,,,,,,,,,
1,2010,20002,PHI,PIT,3,2,-1,2,3,PHI,PIT,,,,,,,,,,,,,,,,,,,,,,
2,2010,20003,CAR,MIN,4,3,-1,3,4,CAR,MIN,,,,,,,,,,,,,,,,,,,,,,
3,2010,20004,CHI,COL,3,4,1,4,3,COL,CHI,,,,,,,,,,,,,,,,,,,,,,
4,2010,20005,CGY,EDM,0,4,4,4,0,EDM,CGY,0.0,2.0,1.0,1.0,2.0,2.0,2.0,2.0,1.0,3.0,2.0,0.0,2.0,1.0,1.0,1.0,4.0,2.0,1.0,1.0,4.0,1.0


- determine if the home or away team won the game.

In [119]:
dl['WinTeam'] = dl.apply(lambda x: 'HOME' if x['GD'] > 0 else 'AWAY', axis=1)

- Calculate the difference between player quality per game for all positions with respect to home team ( Home Team - Visitor Team). There are 5 positions and 2 types of player quality. This will give us a total of 10 differenecs. 

In [103]:
dl.shape

(1230, 47)

- total of forwards and defensemen by team per game.

In [120]:
dl['VC'] = dl['VC1'] + dl['VC2'] + dl['VC3'] + dl['VC4']
dl['VW'] = dl['VW1'] + dl['VW2'] + dl['VW3'] + dl['VW4']
dl['VD'] = dl['VD1'] + dl['VD2'] + dl['VD3'] 
dl['HC'] = dl['HC1'] + dl['HC2'] + dl['HC3'] + dl['HC4']
dl['HW'] = dl['HW1'] + dl['HW2'] + dl['HW3'] + dl['HW4']
dl['HD'] = dl['HD1'] + dl['HD2'] + dl['HD3']

- total of forwards and defensemen per game.

In [121]:
dl['C'] = dl['VC'] + dl['HC']
dl['W'] = dl['VW'] + dl['HW']
dl['D'] = dl['VD'] + dl['HD']
dl.head()

Unnamed: 0,Season,GameNumber,VTeamCode,HTeamCode,VGF,HGF,GD,VGA,HGA,WinTeam,LossTeam,VC1,VC2,VC3,VC4,VW1,VW2,VW3,VW4,VD1,VD2,VD3,HC1,HC2,HC3,HC4,HW1,HW2,HW3,HW4,HD1,HD2,HD3,VC,VW,VD,HC,HW,HD,C,W,D
0,2010,20001,MTL,TOR,2,3,1,3,2,HOME,MTL,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2010,20002,PHI,PIT,3,2,-1,2,3,AWAY,PIT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2010,20003,CAR,MIN,4,3,-1,3,4,AWAY,MIN,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2010,20004,CHI,COL,3,4,1,4,3,HOME,CHI,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2010,20005,CGY,EDM,0,4,4,4,0,HOME,CGY,0.0,2.0,1.0,1.0,2.0,2.0,2.0,2.0,1.0,3.0,2.0,0.0,2.0,1.0,1.0,1.0,4.0,2.0,1.0,1.0,4.0,1.0,4.0,8.0,6.0,4.0,8.0,6.0,8.0,16.0,12.0


- **keep games with 4 C, 8 W and 6 defensemen per team.**

In [106]:
dl = dl[((dl['VC'] == 4) & (dl['VW'] == 6) & (dl['VD'] == 6) & (dl['HC'] == 12) & (dl['HW'] == 6)  & (dl['HD'] == 6))]

In [107]:
dl.shape

(879, 53)

In [108]:
dl['VF'].value_counts()

12.0    879
Name: VF, dtype: int64

In [109]:
dl['VD'].value_counts()

6.0    879
Name: VD, dtype: int64

In [110]:
dl['HF'].value_counts()

12.0    879
Name: HF, dtype: int64

In [111]:
dl['HD'].value_counts()

6.0    879
Name: HD, dtype: int64

### summary analysis

In [112]:
dl.describe()

Unnamed: 0,Season,GameNumber,VGF,HGF,GD,VGA,HGA,VC1,VC2,VC3,VC4,VLW1,VLW2,VLW3,VLW4,VRW1,VRW2,VRW3,VRW4,VLD1,VLD2,VLD3,VRD1,VRD2,VRD3,HC1,HC2,HC3,HC4,HLW1,HLW2,HLW3,HLW4,HRW1,HRW2,HRW3,HRW4,HLD1,HLD2,HLD3,HRD1,HRD2,HRD3,VF,VD,HF,HD,F,D
count,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0
mean,2010.0,20613.112628,2.763367,2.960182,0.196815,2.960182,2.763367,1.722412,1.331058,0.971559,0.241183,1.970421,0.954494,0.517634,0.381115,1.67463,1.346985,0.362912,0.525597,0.910125,1.307167,0.792947,1.152446,1.152446,0.684869,1.607509,1.385666,0.744027,0.25711,1.784983,0.955631,0.502844,0.681456,1.788396,1.39818,0.448237,0.445961,1.080774,1.356086,0.536974,1.010239,1.249147,0.76678,12.0,6.0,12.0,6.0,24.0,12.0
std,0.0,349.491101,1.672088,1.727312,2.455284,1.727312,1.672088,0.747257,1.000377,0.62438,0.51056,0.946899,0.848541,0.66603,0.630766,1.067721,0.976471,0.492808,0.712062,0.885147,0.888464,0.725642,0.74977,0.985991,0.6628,0.830505,1.034848,0.660819,0.493573,0.918279,0.816452,0.601576,0.761254,0.990065,0.839403,0.515583,0.6502,0.878278,0.926419,0.61728,0.842296,0.956001,0.790886,0.0,0.0,0.0,0.0,0.0,0.0
min,2010.0,20001.0,0.0,0.0,-8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,6.0,12.0,6.0,24.0,12.0
25%,2010.0,20317.5,2.0,2.0,-1.0,2.0,2.0,1.0,0.5,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,12.0,6.0,12.0,6.0,24.0,12.0
50%,2010.0,20613.0,3.0,3.0,1.0,3.0,3.0,2.0,1.0,1.0,0.0,2.0,1.0,0.0,0.0,2.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,0.0,2.0,1.0,0.0,1.0,2.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,12.0,6.0,12.0,6.0,24.0,12.0
75%,2010.0,20913.5,4.0,4.0,2.0,4.0,4.0,2.0,2.0,1.0,0.0,3.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,0.0,2.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,12.0,6.0,12.0,6.0,24.0,12.0
max,2010.0,21230.0,10.0,9.0,7.0,9.0,10.0,3.0,4.0,3.0,2.0,4.0,3.0,3.0,3.0,5.0,4.0,2.0,3.0,3.0,4.0,3.0,3.0,4.0,3.0,3.0,4.0,3.0,2.0,4.0,4.0,2.0,3.0,5.0,5.0,2.0,3.0,3.0,4.0,2.0,3.0,4.0,3.0,12.0,6.0,12.0,6.0,24.0,12.0


In [113]:
#dl = dl[['Season', 'GameNumber', 'VTeamCode', 'HTeamCode', 'HGF', 'VGF', 'GD','WinTeam', 'VF1', 'VF2', 'VD1', 'VD2', 'HF1', 'HF2', 'HD1', 'HD2']]

- determine if a game was won by the home or visitor team.
- compute the difference in quality of forwards and defensemen between home and visitor team per game (DF1, DF2, DD1, DD2). 

In [114]:
dl['HomeWin'] = dl.apply(lambda x: 1 if x['WinTeam']=='HOME' else 0, axis=1)

# difference in center position per game
dl['DC1'] = dl['HC1'] - dl['VC1']
dl['DC2'] = dl['HC2'] - dl['VC2']
dl['DC3'] = dl['HC3'] - dl['VC3']
dl['DC4'] = dl['HC4'] - dl['VC4']

# difference in left position per game
dl['DLW1'] = dl['HLW1'] - dl['VLW1']
dl['DLW2'] = dl['HLW2'] - dl['VLW2']
dl['DLW3'] = dl['HLW3'] - dl['VLW3']
dl['DLW4'] = dl['HLW4'] - dl['VLW4']

# difference in right wing position per game
dl['DRW1'] = dl['HRW1'] - dl['VRW1']
dl['DRW2'] = dl['HRW2'] - dl['VRW2']
dl['DRW3'] = dl['HRW3'] - dl['VRW3']
dl['DRW4'] = dl['HRW4'] - dl['VRW4']

# difference in left defence position per game
dl['DLD1'] = dl['HLD1'] - dl['VLD1']
dl['DLD2'] = dl['HLD2'] - dl['VLD2']
dl['DLD3'] = dl['HLD3'] - dl['VLD3']

# difference in right defence position per game
dl['DRD1'] = dl['HRD1'] - dl['VRD1']
dl['DRD2'] = dl['HRD2'] - dl['VRD2']
dl['DRD3'] = dl['HRD3'] - dl['VRD3']

In [115]:
dl.groupby(['WinTeam'])['DC1', 'DC2', 'DC3', 'DC4', 'DLW1', 'DLW2', 'DLW3', 'DLW4', 'DRW1', 'DRW2', 'DRW3', 'DRW4', 'DLD1', 'DLD2', 'DLD3', 'DRD1', 'DRD2', 'DRD3', ].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,DC1,DC2,DC3,DC4,DLW1,DLW2,DLW3,DLW4,DRW1,DRW2,DRW3,DRW4,DLD1,DLD2,DLD3,DRD1,DRD2,DRD3
WinTeam,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
AWAY,count,422.0,422.0,422.0,422.0,422.0,422.0,422.0,422.0,422.0,422.0,422.0,422.0,422.0,422.0,422.0,422.0,422.0,422.0
AWAY,mean,-0.220379,0.156398,-0.203791,0.0,-0.244076,0.042654,-0.028436,0.28673,0.054502,0.116114,0.075829,-0.035545,0.187204,0.014218,-0.227488,-0.165877,0.106635,0.085308
AWAY,std,1.116179,1.456966,0.965527,0.732678,1.350517,1.236069,0.947128,0.914784,1.465151,1.301662,0.678081,0.963055,1.244666,1.215414,0.904149,1.059275,1.306117,1.055389
AWAY,min,-3.0,-3.0,-3.0,-2.0,-4.0,-3.0,-3.0,-2.0,-4.0,-4.0,-2.0,-3.0,-3.0,-3.0,-2.0,-2.0,-4.0,-2.0
AWAY,25%,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
AWAY,50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AWAY,75%,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
AWAY,max,3.0,3.0,3.0,2.0,3.0,4.0,2.0,3.0,4.0,4.0,1.0,3.0,3.0,3.0,2.0,2.0,4.0,3.0
HOME,count,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0
HOME,mean,-0.017505,-0.039387,-0.249453,0.030635,-0.131291,-0.037199,-0.002188,0.31291,0.16849,-0.008753,0.094092,-0.12035,0.155361,0.080963,-0.282276,-0.12035,0.087527,0.078775


### estimate roster model

- regress **home win** on the difference in number of home and visitor players by position and quality. Add a constant to the predictors and use OLS. The purpose is to deterimine the impact each roster position has on home team success.

In [116]:
y = dl['HomeWin']  
X = sm.add_constant(dl[['DC1', 'DC2', 'DC3', 'DC4', 'DLW1', 'DLW2', 'DLW3', 'DLW4', 'DRW1', 'DRW2', 'DRW3', 'DRW4', 'DLD1', 'DLD2', 'DLD3', 'DRD1', 'DRD2', 'DRD3']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,HomeWin,R-squared:,0.017
Model:,OLS,Adj. R-squared:,-0.002
Method:,Least Squares,F-statistic:,0.9146
Date:,"Tue, 20 Feb 2018",Prob (F-statistic):,0.552
Time:,13:48:00,Log-Likelihood:,-629.87
No. Observations:,879,AIC:,1294.0
Df Residuals:,862,BIC:,1375.0
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.5197,0.020,26.561,0.000,0.481 0.558
DC1,0.0264,0.020,1.340,0.180,-0.012 0.065
DC2,-0.0160,0.015,-1.038,0.299,-0.046 0.014
DC3,-0.0229,0.020,-1.131,0.258,-0.063 0.017
DC4,0.0253,0.025,1.016,0.310,-0.024 0.074
DLW1,0.0087,0.015,0.588,0.556,-0.020 0.038
DLW2,-0.0003,0.016,-0.020,0.984,-0.033 0.032
DLW3,-0.0011,0.021,-0.054,0.957,-0.042 0.040
DLW4,0.0053,0.019,0.283,0.778,-0.032 0.042

0,1,2,3
Omnibus:,0.828,Durbin-Watson:,1.903
Prob(Omnibus):,0.661,Jarque-Bera (JB):,136.834
Skew:,-0.075,Prob(JB):,1.94e-30
Kurtosis:,1.073,Cond. No.,8480000000000000.0


In [118]:
result.params

const    0.519712
DC1      0.026447
DC2     -0.016003
DC3     -0.022937
DC4      0.025272
DLW1     0.008666
DLW2    -0.000324
DLW3    -0.001123
DLW4     0.005334
DRW1     0.010842
DRW2    -0.010709
DRW3    -0.008840
DRW4    -0.016627
DLD1    -0.014807
DLD2     0.005874
DLD3    -0.003115
DRD1     0.010779
DRD2     0.006260
DRD3    -0.004991
dtype: float64

- By increasing the differential of **elite** player quality in forwards and defense (home team – visitor team) by one unit, home win **increases** by 0.4 and 1 game respectfully.
- By increasing the differential of **secondary** player quality in forwards and defense (home team – visitor team) by one unit, home win **decreases** by 0.4 and 1 game respectfully.

- regress **home win** on the difference in number of elite home and visitor players by position (DC1, DLW1, DRW1, DLD1, DRD1). Add a constant to the predictors and use **OLS**. The purpose is to deterimine the impact each roster position has on home team success.

In [119]:
y = dl['HomeWin']  
X = sm.add_constant(dl[['DC1', 'DLW1', 'DRW1', 'DLD1', 'DRD1']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,HomeWin,R-squared:,0.012
Model:,OLS,Adj. R-squared:,0.007
Method:,Least Squares,F-statistic:,2.161
Date:,"Tue, 20 Feb 2018",Prob (F-statistic):,0.0564
Time:,13:49:22,Log-Likelihood:,-631.87
No. Observations:,879,AIC:,1276.0
Df Residuals:,873,BIC:,1304.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.5285,0.017,30.487,0.000,0.494 0.563
DC1,0.0403,0.016,2.559,0.011,0.009 0.071
DLW1,0.0124,0.013,0.933,0.351,-0.014 0.039
DRW1,0.0168,0.012,1.396,0.163,-0.007 0.040
DLD1,-0.0179,0.014,-1.243,0.214,-0.046 0.010
DRD1,0.0037,0.016,0.230,0.818,-0.028 0.035

0,1,2,3
Omnibus:,0.847,Durbin-Watson:,1.9
Prob(Omnibus):,0.655,Jarque-Bera (JB):,139.504
Skew:,-0.075,Prob(JB):,5.09e-31
Kurtosis:,1.054,Cond. No.,1.8


- regress **home win** on the difference in number of elite home and visitor players by position (DC1, DLW1, DRW1, DLD1, DRD1). Add a constant to the predictors and use **Logit**. The purpose is to deterimine the impact each roster position has on home team success.

In [120]:
y = dl['HomeWin']  
X = sm.add_constant(dl[['DC1', 'DLW1', 'DRW2', 'DLD1', 'DRD1']] )
result = sm.Logit(y, X).fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.686660
         Iterations 4


0,1,2,3
Dep. Variable:,HomeWin,No. Observations:,879.0
Model:,Logit,Df Residuals:,873.0
Method:,MLE,Df Model:,5.0
Date:,"Tue, 20 Feb 2018",Pseudo R-squ.:,0.008224
Time:,13:54:54,Log-Likelihood:,-603.57
converged:,True,LL-Null:,-608.58
,,LLR p-value:,0.07496

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,0.1225,0.070,1.746,0.081,-0.015 0.260
DC1,0.1583,0.065,2.450,0.014,0.032 0.285
DLW1,0.0291,0.052,0.561,0.575,-0.073 0.131
DRW2,-0.0567,0.053,-1.075,0.282,-0.160 0.047
DLD1,-0.0647,0.058,-1.119,0.263,-0.178 0.049
DRD1,0.0302,0.064,0.476,0.634,-0.094 0.155


- regress **home win** on the difference in number of secondary quality home and visitor players by position (DC2, DLW2, DRW2, DLD2, DRD2). Add a constant to the predictors and use **OLS** and **Logit**. The purpose is to deterimine the impact each roster position has on home team success.

In [122]:
y = dl['HomeWin']  
X = sm.add_constant(dl[['DC2', 'DLW2', 'DRW2', 'DLD2', 'DRD2']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,HomeWin,R-squared:,0.008
Model:,OLS,Adj. R-squared:,0.002
Method:,Least Squares,F-statistic:,1.408
Date:,"Tue, 20 Feb 2018",Prob (F-statistic):,0.219
Time:,13:58:21,Log-Likelihood:,-633.74
No. Observations:,879,AIC:,1279.0
Df Residuals:,873,BIC:,1308.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.5211,0.017,30.753,0.000,0.488 0.554
DC2,-0.0225,0.012,-1.956,0.051,-0.045 7.19e-05
DLW2,-0.0128,0.014,-0.917,0.359,-0.040 0.015
DRW2,-0.0165,0.013,-1.280,0.201,-0.042 0.009
DLD2,0.0090,0.014,0.639,0.523,-0.019 0.037
DRD2,0.0051,0.013,0.378,0.706,-0.021 0.031

0,1,2,3
Omnibus:,0.894,Durbin-Watson:,1.897
Prob(Omnibus):,0.64,Jarque-Bera (JB):,141.924
Skew:,-0.078,Prob(JB):,1.52e-31
Kurtosis:,1.038,Cond. No.,1.63


In [121]:
y = dl['HomeWin']  
X = sm.add_constant(dl[['DC2', 'DLW2', 'DRW2', 'DLD2', 'DRD2']] )
result = sm.Logit(y, X).fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.688334
         Iterations 4


0,1,2,3
Dep. Variable:,HomeWin,No. Observations:,879.0
Model:,Logit,Df Residuals:,873.0
Method:,MLE,Df Model:,5.0
Date:,"Tue, 20 Feb 2018",Pseudo R-squ.:,0.005806
Time:,13:55:48,Log-Likelihood:,-605.05
converged:,True,LL-Null:,-608.58
,,LLR p-value:,0.2157

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,0.0851,0.068,1.248,0.212,-0.049 0.219
DC2,-0.0909,0.046,-1.957,0.050,-0.182 0.000
DLW2,-0.0520,0.056,-0.922,0.356,-0.163 0.059
DRW2,-0.0670,0.052,-1.284,0.199,-0.169 0.035
DLD2,0.0363,0.057,0.639,0.523,-0.075 0.148
DRD2,0.0204,0.054,0.377,0.706,-0.086 0.126


- regress **home win** on the difference in number of third quality home and visitor players by position (DC3, DLW3, DRW3, DLD3, DRD3). Add a constant to the predictors and use **OLS** and **Logit**. The purpose is to deterimine the impact each roster position has on home team success.

In [123]:
y = dl['HomeWin']  
X = sm.add_constant(dl[['DC3', 'DLW3', 'DRW3', 'DLD3', 'DRD3']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,HomeWin,R-squared:,0.002
Model:,OLS,Adj. R-squared:,-0.004
Method:,Least Squares,F-statistic:,0.3065
Date:,"Tue, 20 Feb 2018",Prob (F-statistic):,0.909
Time:,14:07:24,Log-Likelihood:,-636.5
No. Observations:,879,AIC:,1285.0
Df Residuals:,873,BIC:,1314.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.5123,0.018,27.964,0.000,0.476 0.548
DC3,-0.0129,0.019,-0.668,0.504,-0.051 0.025
DLW3,0.0038,0.020,0.197,0.844,-0.034 0.042
DRW3,0.0103,0.025,0.410,0.682,-0.039 0.059
DLD3,-0.0157,0.018,-0.854,0.393,-0.052 0.020
DRD3,-0.0022,0.017,-0.128,0.899,-0.037 0.032

0,1,2,3
Omnibus:,0.936,Durbin-Watson:,1.895
Prob(Omnibus):,0.626,Jarque-Bera (JB):,145.48
Skew:,-0.079,Prob(JB):,2.57e-32
Kurtosis:,1.013,Cond. No.,1.85


In [124]:
y = dl['HomeWin']  
X = sm.add_constant(dl[['DC3', 'DLW3', 'DRW3', 'DLD3', 'DRD3']] )
result = sm.Logit(y, X).fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.691477
         Iterations 3


0,1,2,3
Dep. Variable:,HomeWin,No. Observations:,879.0
Model:,Logit,Df Residuals:,873.0
Method:,MLE,Df Model:,5.0
Date:,"Tue, 20 Feb 2018",Pseudo R-squ.:,0.001266
Time:,14:07:35,Log-Likelihood:,-607.81
converged:,True,LL-Null:,-608.58
,,LLR p-value:,0.9082

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,0.0494,0.073,0.674,0.500,-0.094 0.193
DC3,-0.0519,0.077,-0.670,0.503,-0.204 0.100
DLW3,0.0154,0.078,0.198,0.843,-0.137 0.168
DRW3,0.0411,0.100,0.411,0.681,-0.155 0.238
DLD3,-0.0629,0.073,-0.856,0.392,-0.207 0.081
DRD3,-0.0090,0.070,-0.128,0.898,-0.146 0.128


- regress **home win** on the difference in number of bottom quality home and visitor forwards (DC4, DLW4, DRW4). Add a constant to the predictors and use **OLS** and **Logit**. The purpose is to deterimine the impact each roster position has on home team success.

In [125]:
y = dl['HomeWin']  
X = sm.add_constant(dl[['DC4', 'DLW4', 'DRW4']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,HomeWin,R-squared:,0.003
Model:,OLS,Adj. R-squared:,-0.001
Method:,Least Squares,F-statistic:,0.7699
Date:,"Tue, 20 Feb 2018",Prob (F-statistic):,0.511
Time:,14:07:43,Log-Likelihood:,-636.11
No. Observations:,879,AIC:,1280.0
Df Residuals:,875,BIC:,1299.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.5174,0.018,29.250,0.000,0.483 0.552
DC4,0.0182,0.024,0.771,0.441,-0.028 0.064
DLW4,0.0010,0.018,0.054,0.957,-0.035 0.037
DRW4,-0.0242,0.018,-1.346,0.179,-0.059 0.011

0,1,2,3
Omnibus:,0.936,Durbin-Watson:,1.886
Prob(Omnibus):,0.626,Jarque-Bera (JB):,144.969
Skew:,-0.079,Prob(JB):,3.31e-32
Kurtosis:,1.017,Cond. No.,1.71


In [126]:
y = dl['HomeWin']  
X = sm.add_constant(dl[['DC4', 'DLW4', 'DRW4']] )
result = sm.Logit(y, X).fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.691036
         Iterations 4


0,1,2,3
Dep. Variable:,HomeWin,No. Observations:,879.0
Model:,Logit,Df Residuals:,875.0
Method:,MLE,Df Model:,3.0
Date:,"Tue, 20 Feb 2018",Pseudo R-squ.:,0.001904
Time:,14:07:50,Log-Likelihood:,-607.42
converged:,True,LL-Null:,-608.58
,,LLR p-value:,0.5092

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,0.0698,0.071,0.985,0.325,-0.069 0.209
DC4,0.0729,0.094,0.772,0.440,-0.112 0.258
DLW4,0.0040,0.073,0.054,0.957,-0.139 0.147
DRW4,-0.0971,0.072,-1.346,0.178,-0.239 0.044


#### goal differential 

- regress **goal differential** on the difference in number of home and visitor players by position and quality (DF1, DF2, DD1, DD2). Add a constant to the predictors and use OLS. The purpose is to deterimine the impact each roster position has on goal differential.

In [127]:
y = dl['GD']  
X = sm.add_constant(dl[['DC1', 'DC2', 'DC3', 'DC4', 'DLW1', 'DLW2', 'DLW3', 'DLW4', 'DRW1', 'DRW2', 'DRW3', 'DRW4', 'DLD1', 'DLD2', 'DLD3', 'DRD1', 'DRD2', 'DRD3']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,GD,R-squared:,0.011
Model:,OLS,Adj. R-squared:,-0.007
Method:,Least Squares,F-statistic:,0.6169
Date:,"Tue, 20 Feb 2018",Prob (F-statistic):,0.872
Time:,14:09:27,Log-Likelihood:,-2031.3
No. Observations:,879,AIC:,4097.0
Df Residuals:,862,BIC:,4178.0
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.2494,0.096,2.588,0.010,0.060 0.439
DC1,0.1370,0.097,1.410,0.159,-0.054 0.328
DC2,-0.0339,0.076,-0.447,0.655,-0.183 0.115
DC3,-0.0637,0.100,-0.637,0.524,-0.260 0.132
DC4,0.0539,0.123,0.440,0.660,-0.187 0.294
DLW1,0.0600,0.073,0.828,0.408,-0.082 0.202
DLW2,-0.0397,0.081,-0.490,0.624,-0.199 0.119
DLW3,-0.0274,0.102,-0.269,0.788,-0.228 0.173
DLW4,-0.0732,0.093,-0.787,0.431,-0.256 0.109

0,1,2,3
Omnibus:,0.219,Durbin-Watson:,1.978
Prob(Omnibus):,0.896,Jarque-Bera (JB):,0.306
Skew:,0.015,Prob(JB):,0.858
Kurtosis:,2.914,Cond. No.,8480000000000000.0


## season_game_team_level_analysis

#### $Win = \beta_{0} + \beta_{1}C_{1} + \beta_{2}C_{2} + \beta_{3}C_{3} + \beta_{4}C_{4} + \beta_{5}LW_{1} + \beta_{6}LW_{2} + \beta_{7}LW_{3} + \beta_{8}LW_{4} + \beta_{9}RW_{1} + \beta_{10}RW_{2} + \beta_{11}RW_{3} + \beta_{12}RW_{4} + \beta_{13}LD_{1} + \beta_{14}LD_{2} + \beta_{15}LD_{3} + \beta_{16}RD_{1} + \beta_{17}RD_{2} + \beta_{18}RD_{3} + e_{s,g,t}$

- use season game data (dg) and season game team roster (dx) to conduct season game team level analysis (dt).

In [128]:
dg.head()

Unnamed: 0,Season,GameNumber,VTeamCode,HTeamCode,VGF,HGF,GD,VGA,HGA,WinTeam,LossTeam
0,2010,20001,MTL,TOR,2,3,1,3,2,TOR,MTL
1,2010,20002,PHI,PIT,3,2,-1,2,3,PHI,PIT
2,2010,20003,CAR,MIN,4,3,-1,3,4,CAR,MIN
3,2010,20004,CHI,COL,3,4,1,4,3,COL,CHI
4,2010,20005,CGY,EDM,0,4,4,4,0,EDM,CGY


In [129]:
dt = dg.merge(dx, on=['Season', 'GameNumber'], how='left')
dt.head()

Unnamed: 0,Season,GameNumber,VTeamCode,HTeamCode,VGF,HGF,GD,VGA,HGA,WinTeam,LossTeam,TeamCode,RosterCount,C1,C2,C3,C4,LD1,LD2,LD3,LW1,LW2,LW3,LW4,RD1,RD2,RD3,RW1,RW2,RW3,RW4,MeanC1,MeanC2,MeanC3,MeanC4,MeanLW1,MeanLW2,MeanLW3,MeanLW4,MeanRW1,MeanRW2,MeanRW3,MeanRW4,MeanLD1,MeanLD2,MeanLD3,MeanRD1,MeanRD2,MeanRD3,A
0,2010,20001,MTL,TOR,2,3,1,3,2,TOR,MTL,MTL,18.0,1.0,2.0,2.0,0.0,1.0,0.0,0.0,2.0,0.0,1.0,0.0,1.0,4.0,0.0,1.0,1.0,1.0,1.0,1.516667,1.916667,1.033333,0.366667,2.416667,0.0,1.0,0.05,1.483333,1.0,0.716667,0.5,2.383333,0.133333,0.083333,0.6,2.516667,0.283333,1.0
1,2010,20001,MTL,TOR,2,3,1,3,2,TOR,MTL,TOR,18.0,1.0,1.0,1.0,1.0,1.0,2.0,0.0,2.0,0.0,1.0,1.0,0.0,0.0,3.0,2.0,1.0,1.0,0.0,0.984375,1.578125,0.984375,0.21875,2.015625,0.578125,0.859375,1.21875,2.0,0.96875,0.59375,0.0,0.671875,2.046875,0.546875,0.0,0.0,2.734375,
2,2010,20002,PHI,PIT,3,2,-1,2,3,PHI,PIT,PHI,18.0,3.0,0.0,1.0,0.0,3.0,0.0,0.0,3.0,0.0,0.0,2.0,0.0,2.0,1.0,2.0,0.0,1.0,0.0,2.878788,0.0,0.893939,0.030303,2.621212,0.0,0.0,1.439394,2.0,1.151515,0.984848,0.0,2.984848,0.0,0.212121,0.590909,1.151515,1.060606,1.0
3,2010,20002,PHI,PIT,3,2,-1,2,3,PHI,PIT,PIT,18.0,1.0,3.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,0.0,1.357143,2.014286,0.0,1.085714,0.8,0.771429,0.0,1.771429,1.485714,1.571429,0.971429,0.171429,1.528571,0.957143,0.9,0.757143,1.085714,0.771429,
4,2010,20003,CAR,MIN,4,3,-1,3,4,CAR,MIN,CAR,18.0,3.0,1.0,1.0,1.0,2.0,1.0,1.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,2.153846,0.615385,1.153846,0.123077,2.846154,0.538462,0.246154,0.0,1.0,1.723077,0.969231,0.630769,1.938462,0.4,1.061538,0.907692,0.492308,1.2,1.0


In [130]:
dt.shape

(2109, 50)

- Sum up goals for and against by team per game and find the goal differential (GD) per game. Assign a value of 1 to the team that won the game. 

In [131]:
dt['GD'] = dt.apply(lambda x: (x['HGF'] - x['VGF']) if x['HTeamCode']== x['TeamCode'] else (x['VGF'] - x['HGF']), 1)
dt['Win'] = dt.apply(lambda x: 1 if x['WinTeam']== x['TeamCode'] else 0, 1)
dt['GF'] = dt.apply(lambda x: x['HGF'] if x['HTeamCode']== x['TeamCode'] else x['VGF'], 1)
dt['GA'] = dt.apply(lambda x: x['HGF'] if x['HTeamCode']!= x['TeamCode'] else x['VGF'], 1)
dt.head()

Unnamed: 0,Season,GameNumber,VTeamCode,HTeamCode,VGF,HGF,GD,VGA,HGA,WinTeam,LossTeam,TeamCode,RosterCount,C1,C2,C3,C4,LD1,LD2,LD3,LW1,LW2,LW3,LW4,RD1,RD2,RD3,RW1,RW2,RW3,RW4,MeanC1,MeanC2,MeanC3,MeanC4,MeanLW1,MeanLW2,MeanLW3,MeanLW4,MeanRW1,MeanRW2,MeanRW3,MeanRW4,MeanLD1,MeanLD2,MeanLD3,MeanRD1,MeanRD2,MeanRD3,A,Win,GF,GA
0,2010,20001,MTL,TOR,2,3,-1,3,2,TOR,MTL,MTL,18.0,1.0,2.0,2.0,0.0,1.0,0.0,0.0,2.0,0.0,1.0,0.0,1.0,4.0,0.0,1.0,1.0,1.0,1.0,1.516667,1.916667,1.033333,0.366667,2.416667,0.0,1.0,0.05,1.483333,1.0,0.716667,0.5,2.383333,0.133333,0.083333,0.6,2.516667,0.283333,1.0,0,2,3
1,2010,20001,MTL,TOR,2,3,1,3,2,TOR,MTL,TOR,18.0,1.0,1.0,1.0,1.0,1.0,2.0,0.0,2.0,0.0,1.0,1.0,0.0,0.0,3.0,2.0,1.0,1.0,0.0,0.984375,1.578125,0.984375,0.21875,2.015625,0.578125,0.859375,1.21875,2.0,0.96875,0.59375,0.0,0.671875,2.046875,0.546875,0.0,0.0,2.734375,,1,3,2
2,2010,20002,PHI,PIT,3,2,1,2,3,PHI,PIT,PHI,18.0,3.0,0.0,1.0,0.0,3.0,0.0,0.0,3.0,0.0,0.0,2.0,0.0,2.0,1.0,2.0,0.0,1.0,0.0,2.878788,0.0,0.893939,0.030303,2.621212,0.0,0.0,1.439394,2.0,1.151515,0.984848,0.0,2.984848,0.0,0.212121,0.590909,1.151515,1.060606,1.0,1,3,2
3,2010,20002,PHI,PIT,3,2,-1,2,3,PHI,PIT,PIT,18.0,1.0,3.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,0.0,1.357143,2.014286,0.0,1.085714,0.8,0.771429,0.0,1.771429,1.485714,1.571429,0.971429,0.171429,1.528571,0.957143,0.9,0.757143,1.085714,0.771429,,0,2,3
4,2010,20003,CAR,MIN,4,3,1,3,4,CAR,MIN,CAR,18.0,3.0,1.0,1.0,1.0,2.0,1.0,1.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,2.153846,0.615385,1.153846,0.123077,2.846154,0.538462,0.246154,0.0,1.0,1.723077,0.969231,0.630769,1.938462,0.4,1.061538,0.907692,0.492308,1.2,1.0,1,4,3


- total of forwards and defensemen by team per game.

In [132]:
dt['F'] = dt['C1'] + dt['C2'] + dt['C3'] + dt['C4'] + dt['LW1'] + dt['LW2'] + dt['LW3'] + dt['LW4'] + dt['RW1'] + dt['RW2'] + dt['RW3'] + dt['RW4']   
dt['D'] = dt['LD1'] + dt['LD2'] + dt['LD3'] + dt['RD1'] + dt['RD2'] + dt['RD3']

- display the difference of quality per position.

In [133]:
dt['DC'] = dt['C1'] - dt['C2'] - dt['C3'] - dt['C4']
dt['DLW'] = dt['LW1'] - dt['LW2'] - dt['LW3'] - dt['LW4']
dt['DRW'] = dt['RW1'] - dt['RW2'] - dt['RW3'] - dt['RW4']
dt['DLD'] = dt['LD1'] - dt['LD2'] - dt['LD3']
dt['DRD'] = dt['RD1'] - dt['RD2'] - dt['RD3']

In [134]:
dt['F'].value_counts()

12.0    1758
Name: F, dtype: int64

In [135]:
dt['D'].value_counts()

6.0    1758
Name: D, dtype: int64

-**keep only games that have 12 forwards and 6 defensemen.**

In [136]:
dt = dt.groupby(['Season', 'GameNumber']).filter(lambda x: ((x['F'] == 12) & (x['D'] == 6)).all())

In [137]:
dt['F'].value_counts()

12.0    1758
Name: F, dtype: int64

In [138]:
dt['D'].value_counts()

6.0    1758
Name: D, dtype: int64

In [139]:
dt.shape

(1758, 60)

### summary analysis

In [140]:
dt.describe()

Unnamed: 0,Season,GameNumber,VGF,HGF,GD,VGA,HGA,RosterCount,C1,C2,C3,C4,LD1,LD2,LD3,LW1,LW2,LW3,LW4,RD1,RD2,RD3,RW1,RW2,RW3,RW4,MeanC1,MeanC2,MeanC3,MeanC4,MeanLW1,MeanLW2,MeanLW3,MeanLW4,MeanRW1,MeanRW2,MeanRW3,MeanRW4,MeanLD1,MeanLD2,MeanLD3,MeanRD1,MeanRD2,MeanRD3,A,Win,GF,GA,F,D,DC,DLW,DRW,DLD,DRD
count,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,879.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0
mean,2010.0,20613.112628,2.763367,2.960182,0.0,2.960182,2.763367,18.0,1.66496,1.358362,0.857793,0.249147,0.995449,1.331627,0.66496,1.877702,0.955063,0.510239,0.531286,1.081342,1.200796,0.725825,1.731513,1.372582,0.405575,0.485779,1.66496,1.358362,0.857793,0.249147,1.877702,0.955063,0.510239,0.531286,1.731513,1.372582,0.405575,0.485779,0.995449,1.331627,0.66496,1.081342,1.200796,0.725825,1.0,0.5,2.861775,2.861775,12.0,6.0,-0.800341,-0.118885,-0.532423,-1.001138,-0.845279
std,0.0,349.39163,1.671612,1.72682,2.462467,1.72682,1.671612,0.0,0.791841,1.017835,0.652672,0.502058,0.885591,0.907711,0.685512,0.937035,0.832415,0.634484,0.714824,0.800316,0.972039,0.730601,1.030904,0.910621,0.505983,0.682801,0.652006,0.897918,0.529614,0.387714,0.832274,0.697164,0.542237,0.550689,0.902707,0.74096,0.443234,0.544944,0.78963,0.7779,0.528827,0.68326,0.82986,0.603885,0.0,0.500142,1.702289,1.702289,0.0,0.0,1.61399,1.774119,1.785869,1.700714,1.608868
min,2010.0,20001.0,0.0,0.0,-8.0,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,12.0,6.0,-6.0,-5.0,-5.0,-4.0,-5.0
25%,2010.0,20317.25,2.0,2.0,-2.0,2.0,2.0,18.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.030303,0.615385,0.363636,0.0,1.266667,0.416667,0.0,0.05,1.0,0.833333,0.0,0.028571,0.362319,0.833333,0.212121,0.590909,0.492308,0.090909,1.0,0.0,2.0,2.0,12.0,6.0,-2.0,-1.0,-2.0,-2.0,-2.0
50%,2010.0,20613.0,3.0,3.0,0.0,3.0,3.0,18.0,2.0,1.0,1.0,0.0,1.0,1.0,1.0,2.0,1.0,0.0,0.0,1.0,1.0,1.0,2.0,1.0,0.0,0.0,1.783333,1.428571,0.984375,0.123077,1.925373,0.942857,0.358209,0.289855,1.742857,1.257143,0.181818,0.227273,0.878788,1.545455,0.603448,1.0,1.085714,0.742857,1.0,0.5,3.0,3.0,12.0,6.0,-1.0,0.0,0.0,-1.0,-1.0
75%,2010.0,20913.75,4.0,4.0,2.0,4.0,4.0,18.0,2.0,2.0,1.0,0.0,2.0,2.0,1.0,3.0,2.0,1.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,1.0,2.0,1.984848,1.215385,0.28125,2.6,1.438241,0.87931,0.984375,2.318841,1.787234,0.912281,0.796875,1.528571,1.861538,1.016667,1.7,1.628571,1.014286,1.0,1.0,4.0,4.0,12.0,6.0,0.0,1.0,1.0,0.0,1.0
max,2010.0,21230.0,10.0,9.0,8.0,9.0,10.0,18.0,3.0,4.0,3.0,2.0,3.0,4.0,3.0,4.0,4.0,3.0,3.0,3.0,4.0,3.0,5.0,5.0,2.0,3.0,2.878788,3.0625,1.641791,1.626866,3.567164,2.820513,1.716667,1.894737,3.783333,3.384615,1.257143,1.871429,2.984848,2.877193,1.974359,2.242424,3.343284,2.734375,1.0,1.0,10.0,10.0,12.0,6.0,3.0,4.0,5.0,3.0,3.0


In [141]:
dt.groupby(['Win'])['C1', 'C2', 'C3', 'C4', 'LW1', 'LW2', 'LW3', 'LW4', 'RW1', 'RW2', 'RW3', 'RW4', 'LD1', 'LD2', 'LD3', 'RD1', 'RD2', 'RD3'].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,C1,C2,C3,C4,LW1,LW2,LW3,LW4,RW1,RW2,RW3,RW4,LD1,LD2,LD3,RD1,RD2,RD3
Win,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,count,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0
0,mean,1.623436,1.390216,0.872582,0.263936,1.8157,0.960182,0.494881,0.566553,1.688282,1.407281,0.382253,0.534699,0.98521,1.341297,0.666667,1.055745,1.236633,0.714448
0,std,0.787399,0.996095,0.636521,0.514889,0.95344,0.83462,0.618366,0.733091,1.040517,0.942657,0.506859,0.713274,0.887016,0.922764,0.696013,0.804506,0.986688,0.720135
0,min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,25%,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
0,50%,2.0,1.0,1.0,0.0,2.0,1.0,0.0,0.0,2.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
0,75%,2.0,2.0,1.0,0.0,2.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0
0,max,3.0,4.0,3.0,2.0,4.0,4.0,3.0,3.0,5.0,5.0,2.0,3.0,3.0,4.0,3.0,3.0,4.0,3.0
1,count,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0
1,mean,1.706485,1.326507,0.843003,0.234357,1.939704,0.949943,0.525597,0.496018,1.774744,1.337884,0.428896,0.43686,1.005688,1.321957,0.663254,1.10694,1.16496,0.737201


### estimate roster model

- regress **win** on the difference in number of players by position and quality per team. Add a constant to the predictors and use **OLS**. The purpose is to deterimine the impact each roster position has on home team success.

In [142]:
y = dt['Win']  
X = sm.add_constant(dt[['C1', 'C2', 'C3', 'C4', 'LW1', 'LW2', 'LW3', 'LW4','RW1', 'RW2', 'RW3', 'RW4', 'LD1', 'LD2', 'LD3', 'RD1', 'RD2', 'RD3']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,Win,R-squared:,0.018
Model:,OLS,Adj. R-squared:,0.009
Method:,Least Squares,F-statistic:,1.98
Date:,"Tue, 20 Feb 2018",Prob (F-statistic):,0.0116
Time:,14:44:29,Log-Likelihood:,-1260.1
No. Observations:,1758,AIC:,2554.0
Df Residuals:,1741,BIC:,2647.0
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.0236,0.002,11.841,0.000,0.020 0.028
C1,0.0334,0.019,1.800,0.072,-0.003 0.070
C2,0.0074,0.014,0.515,0.607,-0.021 0.036
C3,-0.0073,0.020,-0.373,0.709,-0.046 0.031
C4,0.0186,0.025,0.733,0.464,-0.031 0.068
LW1,0.0596,0.014,4.115,0.000,0.031 0.088
LW2,0.0498,0.016,3.083,0.002,0.018 0.081
LW3,0.0388,0.022,1.802,0.072,-0.003 0.081
LW4,-0.0078,0.019,-0.423,0.672,-0.044 0.028

0,1,2,3
Omnibus:,0.005,Durbin-Watson:,2.972
Prob(Omnibus):,0.998,Jarque-Bera (JB):,272.661
Skew:,-0.004,Prob(JB):,6.2e-60
Kurtosis:,1.071,Cond. No.,4.45e+16


In [143]:
result.params

const    0.023605
C1       0.033352
C2       0.007395
C3      -0.007338
C4       0.018574
LW1      0.059626
LW2      0.049777
LW3      0.038823
LW4     -0.007838
RW1      0.034967
RW2      0.020053
RW3      0.051844
RW4     -0.015975
LD1      0.027966
LD2      0.016957
LD3      0.019273
RD1      0.030382
RD2      0.012143
RD3      0.034909
dtype: float64

- regress **win** on the difference in number of players by position and quality per team. Add a constant to the predictors and use **Logit**. The purpose is to deterimine the impact each roster position has on home team success

In [144]:
y = dt['Win']  
X = sm.add_constant(dt[['C1', 'C2', 'C3', 'C4', 'LW1', 'LW2', 'LW3', 'LW4','RW1', 'RW2', 'RW3', 'RW4', 'LD1', 'LD2', 'LD3', 'RD1', 'RD2', 'RD3']] )
result = sm.Logit(y, X).fit()
result.summary()

         Current function value: 0.684143
         Iterations: 35




0,1,2,3
Dep. Variable:,Win,No. Observations:,1758.0
Model:,Logit,Df Residuals:,1741.0
Method:,MLE,Df Model:,16.0
Date:,"Tue, 20 Feb 2018",Pseudo R-squ.:,0.01299
Time:,14:45:56,Log-Likelihood:,-1202.7
converged:,False,LL-Null:,-1218.6
,,LLR p-value:,0.01108

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,-0.0111,3.05e+06,-3.63e-09,1.000,-5.97e+06 5.97e+06
C1,0.0286,7.08e+05,4.05e-08,1.000,-1.39e+06 1.39e+06
C2,-0.0768,7e+05,-1.1e-07,1.000,-1.37e+06 1.37e+06
C3,-0.1370,7.07e+05,-1.94e-07,1.000,-1.39e+06 1.39e+06
C4,-0.0320,7.09e+05,-4.51e-08,1.000,-1.39e+06 1.39e+06
LW1,0.1355,7.08e+05,1.91e-07,1.000,-1.39e+06 1.39e+06
LW2,0.0953,7.07e+05,1.35e-07,1.000,-1.39e+06 1.39e+06
LW3,0.0508,7.12e+05,7.13e-08,1.000,-1.4e+06 1.4e+06
LW4,-0.1386,7.09e+05,-1.95e-07,1.000,-1.39e+06 1.39e+06


#### goal differential

- regress **goal differential** on the difference in number of players by position and quality per team. Add a constant to the predictors and use **OLS**. The purpose is to deterimine the impact each roster position has on home team success

In [145]:
y = dt['GD']  
X = sm.add_constant(dt[['C1', 'C2', 'C3', 'C4', 'LW1', 'LW2', 'LW3', 'LW4','RW1', 'RW2', 'RW3', 'RW4', 'LD1', 'LD2', 'LD3', 'RD1', 'RD2', 'RD3']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,GD,R-squared:,0.02
Model:,OLS,Adj. R-squared:,0.011
Method:,Least Squares,F-statistic:,2.243
Date:,"Tue, 20 Feb 2018",Prob (F-statistic):,0.00322
Time:,14:46:23,Log-Likelihood:,-4060.3
No. Observations:,1758,AIC:,8155.0
Df Residuals:,1741,BIC:,8248.0
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,-0.0165,0.010,-1.688,0.092,-0.036 0.003
C1,0.0361,0.091,0.396,0.692,-0.143 0.215
C2,-0.1450,0.071,-2.053,0.040,-0.284 -0.006
C3,-0.1644,0.097,-1.701,0.089,-0.354 0.025
C4,-0.0119,0.125,-0.096,0.924,-0.256 0.233
LW1,0.1536,0.071,2.156,0.031,0.014 0.293
LW2,0.0948,0.079,1.195,0.232,-0.061 0.251
LW3,0.0312,0.106,0.295,0.768,-0.177 0.239
LW4,-0.1131,0.091,-1.242,0.215,-0.292 0.066

0,1,2,3
Omnibus:,0.497,Durbin-Watson:,3.004
Prob(Omnibus):,0.78,Jarque-Bera (JB):,0.567
Skew:,-0.008,Prob(JB):,0.753
Kurtosis:,2.913,Cond. No.,4.45e+16


- regress **win** on the differential of forwards and defensemen per team. Add a constant to the predictors and use **OLS**.

In [147]:
y = dt['Win']  
X = sm.add_constant(dt[['DC', 'DLW', 'DRW', 'DLD', 'DRD']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,Win,R-squared:,0.009
Model:,OLS,Adj. R-squared:,0.006
Method:,Least Squares,F-statistic:,3.106
Date:,"Tue, 20 Feb 2018",Prob (F-statistic):,0.00852
Time:,14:47:35,Log-Likelihood:,-1268.2
No. Observations:,1758,AIC:,2548.0
Df Residuals:,1752,BIC:,2581.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.5243,0.016,32.121,0.000,0.492 0.556
DC,0.0140,0.008,1.814,0.070,-0.001 0.029
DLW,0.0134,0.007,1.920,0.055,-0.000 0.027
DRW,0.0153,0.007,2.195,0.028,0.002 0.029
DLD,8.88e-05,0.007,0.012,0.990,-0.014 0.014
DRD,0.0039,0.008,0.496,0.620,-0.011 0.019

0,1,2,3
Omnibus:,0.0,Durbin-Watson:,2.991
Prob(Omnibus):,1.0,Jarque-Bera (JB):,282.962
Skew:,-0.001,Prob(JB):,3.59e-62
Kurtosis:,1.035,Cond. No.,3.72


- regress **win** on the differential of forwards and defensemen per team. Add a constant to the predictors and use **Logit**.

In [148]:
y = dt['Win']  
X = sm.add_constant(dt[['DC', 'DLW', 'DRW', 'DLD', 'DRD']] )
result = sm.Logit(y, X).fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.688733
         Iterations 4


0,1,2,3
Dep. Variable:,Win,No. Observations:,1758.0
Model:,Logit,Df Residuals:,1752.0
Method:,MLE,Df Model:,5.0
Date:,"Tue, 20 Feb 2018",Pseudo R-squ.:,0.006368
Time:,14:48:04,Log-Likelihood:,-1210.8
converged:,True,LL-Null:,-1218.6
,,LLR p-value:,0.008357

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,0.0979,0.066,1.489,0.136,-0.031 0.227
DC,0.0568,0.031,1.816,0.069,-0.005 0.118
DLW,0.0542,0.028,1.922,0.055,-0.001 0.109
DRW,0.0617,0.028,2.193,0.028,0.007 0.117
DLD,0.0002,0.030,0.008,0.993,-0.058 0.058
DRD,0.0156,0.031,0.499,0.618,-0.046 0.077


- regress **win** on the differential of forwards per team. Add a constant to the predictors and use **OLS**.

In [149]:
y = dt['Win']  
X = sm.add_constant(dt[['DC', 'DLW', 'DRW']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,Win,R-squared:,0.009
Model:,OLS,Adj. R-squared:,0.007
Method:,Least Squares,F-statistic:,5.096
Date:,"Tue, 20 Feb 2018",Prob (F-statistic):,0.00163
Time:,14:48:21,Log-Likelihood:,-1268.3
No. Observations:,1758,AIC:,2545.0
Df Residuals:,1754,BIC:,2567.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.5215,0.014,38.605,0.000,0.495 0.548
DC,0.0143,0.008,1.874,0.061,-0.001 0.029
DLW,0.0140,0.007,2.039,0.042,0.001 0.028
DRW,0.0158,0.007,2.294,0.022,0.002 0.029

0,1,2,3
Omnibus:,0.0,Durbin-Watson:,2.992
Prob(Omnibus):,1.0,Jarque-Bera (JB):,283.113
Skew:,-0.0,Prob(JB):,3.3299999999999997e-62
Kurtosis:,1.034,Cond. No.,2.51


- regress **win** on the differential of forwards per team. Add a constant to the predictors and use **Logit**.

In [150]:
y = dt['Win']  
X = sm.add_constant(dt[['DC', 'DLW', 'DRD']] )
result = sm.Logit(y, X).fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.690127
         Iterations 4


0,1,2,3
Dep. Variable:,Win,No. Observations:,1758.0
Model:,Logit,Df Residuals:,1754.0
Method:,MLE,Df Model:,3.0
Date:,"Tue, 20 Feb 2018",Pseudo R-squ.:,0.004358
Time:,14:48:44,Log-Likelihood:,-1213.2
converged:,True,LL-Null:,-1218.6
,,LLR p-value:,0.01397

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,0.0813,0.058,1.397,0.162,-0.033 0.195
DC,0.0701,0.030,2.323,0.020,0.011 0.129
DLW,0.0426,0.028,1.541,0.123,-0.012 0.097
DRD,0.0240,0.030,0.793,0.428,-0.035 0.083


- regress **win** on the differential of defensemen per team. Add a constant to the predictors and use **OLS**.

In [151]:
y = dt['Win']  
X = sm.add_constant(dt[['DLD', 'DRD']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,Win,R-squared:,0.001
Model:,OLS,Adj. R-squared:,0.0
Method:,Least Squares,F-statistic:,1.172
Date:,"Tue, 20 Feb 2018",Prob (F-statistic):,0.31
Time:,14:49:00,Log-Likelihood:,-1274.8
No. Observations:,1758,AIC:,2556.0
Df Residuals:,1755,BIC:,2572.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.5149,0.016,32.429,0.000,0.484 0.546
DLD,0.0057,0.007,0.802,0.423,-0.008 0.020
DRD,0.0109,0.008,1.435,0.151,-0.004 0.026

0,1,2,3
Omnibus:,0.0,Durbin-Watson:,3.008
Prob(Omnibus):,1.0,Jarque-Bera (JB):,291.437
Skew:,-0.0,Prob(JB):,5.1900000000000003e-64
Kurtosis:,1.005,Cond. No.,3.0


- regress **win** on the differential of defensemen per team. Add a constant to the predictors and use **Logit**.

In [152]:
y = dt['Win']  
X = sm.add_constant(dt[['DLD', 'DRD']] )
result = sm.Logit(y, X).fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.692480
         Iterations 3


0,1,2,3
Dep. Variable:,Win,No. Observations:,1758.0
Model:,Logit,Df Residuals:,1755.0
Method:,MLE,Df Model:,2.0
Date:,"Tue, 20 Feb 2018",Pseudo R-squ.:,0.000963
Time:,14:49:14,Log-Likelihood:,-1217.4
converged:,True,LL-Null:,-1218.6
,,LLR p-value:,0.3093

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,0.0597,0.064,0.939,0.348,-0.065 0.184
DLD,0.0230,0.029,0.802,0.423,-0.033 0.079
DRD,0.0435,0.030,1.435,0.151,-0.016 0.103


- regress **goal differential** on the differential of forwards and defensemen per team. Add a constant to the predictors and use **OLS**.

In [153]:
y = dt['GD']
X = sm.add_constant(dt[['DC', 'DLW', 'DRW', 'DLD', 'DRD']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,GD,R-squared:,0.014
Model:,OLS,Adj. R-squared:,0.012
Method:,Least Squares,F-statistic:,5.136
Date:,"Tue, 20 Feb 2018",Prob (F-statistic):,0.000111
Time:,14:49:37,Log-Likelihood:,-4065.4
No. Observations:,1758,AIC:,8143.0
Df Residuals:,1752,BIC:,8176.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.1948,0.080,2.431,0.015,0.038 0.352
DC,0.0957,0.038,2.516,0.012,0.021 0.170
DLW,0.0597,0.034,1.741,0.082,-0.008 0.127
DRW,0.0893,0.034,2.610,0.009,0.022 0.156
DLD,0.0241,0.036,0.670,0.503,-0.046 0.095
DRD,0.0467,0.038,1.221,0.222,-0.028 0.122

0,1,2,3
Omnibus:,0.801,Durbin-Watson:,3.012
Prob(Omnibus):,0.67,Jarque-Bera (JB):,0.853
Skew:,-0.004,Prob(JB):,0.653
Kurtosis:,2.892,Cond. No.,3.72


- regress **goal differential** on the differential of forwards per team. Add a constant to the predictors and use **OLS**.

In [154]:
y = dt['GD']  
X = sm.add_constant(dt[['DC', 'DLW', 'DRW']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,GD,R-squared:,0.014
Model:,OLS,Adj. R-squared:,0.012
Method:,Least Squares,F-statistic:,8.012
Date:,"Tue, 20 Feb 2018",Prob (F-statistic):,2.65e-05
Time:,14:49:57,Log-Likelihood:,-4066.3
No. Observations:,1758,AIC:,8141.0
Df Residuals:,1754,BIC:,8162.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.1412,0.066,2.129,0.033,0.011 0.271
DC,0.1022,0.037,2.733,0.006,0.029 0.176
DLW,0.0671,0.034,1.988,0.047,0.001 0.133
DRW,0.0966,0.034,2.865,0.004,0.030 0.163

0,1,2,3
Omnibus:,0.82,Durbin-Watson:,3.013
Prob(Omnibus):,0.664,Jarque-Bera (JB):,0.87
Skew:,-0.003,Prob(JB):,0.647
Kurtosis:,2.891,Cond. No.,2.51


- regress **goal differential** on the differential of defensemen per team. Add a constant to the predictors and use **OLS**.

In [155]:
y = dt['GD']  
X = sm.add_constant(dt[['DLD', 'DRD']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,GD,R-squared:,0.004
Model:,OLS,Adj. R-squared:,0.003
Method:,Least Squares,F-statistic:,3.445
Date:,"Tue, 20 Feb 2018",Prob (F-statistic):,0.0321
Time:,14:50:12,Log-Likelihood:,-4074.8
No. Observations:,1758,AIC:,8156.0
Df Residuals:,1755,BIC:,8172.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.1314,0.078,1.683,0.093,-0.022 0.285
DLD,0.0587,0.035,1.669,0.095,-0.010 0.128
DRD,0.0859,0.037,2.312,0.021,0.013 0.159

0,1,2,3
Omnibus:,1.102,Durbin-Watson:,3.036
Prob(Omnibus):,0.576,Jarque-Bera (JB):,1.124
Skew:,-0.001,Prob(JB):,0.57
Kurtosis:,2.876,Cond. No.,3.0


- regress **goals for ** on the differential of forwards per team. Add a constant to the predictors and use **OLS**.

In [156]:
y = dt['GF']  
X = sm.add_constant(dt[['DC', 'DLW', 'DRW']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,GF,R-squared:,0.024
Model:,OLS,Adj. R-squared:,0.022
Method:,Least Squares,F-statistic:,14.3
Date:,"Tue, 20 Feb 2018",Prob (F-statistic):,3.3e-09
Time:,14:50:30,Log-Likelihood:,-3408.0
No. Observations:,1758,AIC:,6824.0
Df Residuals:,1754,BIC:,6846.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,2.9972,0.046,65.695,0.000,2.908 3.087
DC,0.1156,0.026,4.493,0.000,0.065 0.166
DLW,0.0529,0.023,2.277,0.023,0.007 0.098
DRW,0.0687,0.023,2.963,0.003,0.023 0.114

0,1,2,3
Omnibus:,44.156,Durbin-Watson:,2.075
Prob(Omnibus):,0.0,Jarque-Bera (JB):,46.983
Skew:,0.399,Prob(JB):,6.28e-11
Kurtosis:,3.058,Cond. No.,2.51


- regress **goals for** on the differential of defensemen per team. Add a constant to the predictors and use **OLS**.

In [157]:
y = dt['GF']  
X = sm.add_constant(dt[['DLD', 'DRD']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,GF,R-squared:,0.007
Model:,OLS,Adj. R-squared:,0.006
Method:,Least Squares,F-statistic:,6.413
Date:,"Tue, 20 Feb 2018",Prob (F-statistic):,0.00168
Time:,14:50:45,Log-Likelihood:,-3422.8
No. Observations:,1758,AIC:,6852.0
Df Residuals:,1755,BIC:,6868.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,2.9824,0.054,55.350,0.000,2.877 3.088
DLD,0.0493,0.024,2.031,0.042,0.002 0.097
DRD,0.0844,0.026,3.288,0.001,0.034 0.135

0,1,2,3
Omnibus:,45.288,Durbin-Watson:,2.089
Prob(Omnibus):,0.0,Jarque-Bera (JB):,48.349
Skew:,0.406,Prob(JB):,3.17e-11
Kurtosis:,3.027,Cond. No.,3.0


- regress **goals against ** on the differential of forwards per team. Add a constant to the predictors and use **OLS**.

In [158]:
y = dt['GA']  
X = sm.add_constant(dt[['DC', 'DLW', 'DRW']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,GA,R-squared:,0.001
Model:,OLS,Adj. R-squared:,-0.001
Method:,Least Squares,F-statistic:,0.544
Date:,"Tue, 20 Feb 2018",Prob (F-statistic):,0.652
Time:,14:51:00,Log-Likelihood:,-3428.4
No. Observations:,1758,AIC:,6865.0
Df Residuals:,1754,BIC:,6887.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,2.8559,0.046,61.876,0.000,2.765 2.946
DC,0.0134,0.026,0.513,0.608,-0.038 0.064
DLW,-0.0143,0.023,-0.607,0.544,-0.060 0.032
DRW,-0.0279,0.023,-1.189,0.234,-0.074 0.018

0,1,2,3
Omnibus:,45.768,Durbin-Watson:,2.021
Prob(Omnibus):,0.0,Jarque-Bera (JB):,48.933
Skew:,0.409,Prob(JB):,2.37e-11
Kurtosis:,3.012,Cond. No.,2.51


- regress **goals against** on the differential of defensemen per team. Add a constant to the predictors and use **OLS**.

In [159]:
y = dt['GA']  
X = sm.add_constant(dt[['DLD', 'DRD']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,GA,R-squared:,0.0
Model:,OLS,Adj. R-squared:,-0.001
Method:,Least Squares,F-statistic:,0.07451
Date:,"Tue, 20 Feb 2018",Prob (F-statistic):,0.928
Time:,14:51:13,Log-Likelihood:,-3429.1
No. Observations:,1758,AIC:,6864.0
Df Residuals:,1755,BIC:,6881.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,2.8510,0.054,52.722,0.000,2.745 2.957
DLD,-0.0094,0.024,-0.386,0.700,-0.057 0.038
DRD,-0.0016,0.026,-0.061,0.951,-0.052 0.049

0,1,2,3
Omnibus:,46.057,Durbin-Watson:,2.026
Prob(Omnibus):,0.0,Jarque-Bera (JB):,49.258
Skew:,0.41,Prob(JB):,2.01e-11
Kurtosis:,3.015,Cond. No.,3.0
