# Data

## season_game_level_data

In [514]:
import sys
import os
import pandas as pd
import numpy as np
import datetime, time
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
from pylab import hist, show
import scipy
import zipfile


pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 200)

**data frames used in this notebook:**
- da = pbp
- dg = season_games_data
- dm = play_by_play
- dp = player_rankings
- dw = team_roster_player_rank
- dv = season_team_roster_ranking
- dx = season_game_team_roster
- dz = season_team
- dy = season_game_roster

**for analysis:**
- ds = season_level
- dl = season_game_level
- dt = season_game_team_level

In [515]:
pwd

'/Users/stefanostselios/Desktop/nhl_roster_design-master'

### import play by play data set

In [516]:
da = pd.read_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/pbp_merged.csv')
#da = pd.read_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/pbp_merged.csv')
da = da.drop('Unnamed: 0', axis=1)
da = da.rename(columns={'TeamCode': 'EventTeamCode'})


keep regular season games

In [517]:
da = da[da['GameNumber'] <= 21230]

drop irrelevant data

In [518]:
da = da[da['EventType']!='STOP']
da = da[da['EventType']!='EISTR']
da = da[da['EventType']!='EIEND']

- display goals for each game and drop duplicates.

In [519]:
dh = da[da['EventTeamCode']==da['HTeamCode']]
dh = dh.rename(columns={'EventTeamCode': 'HTeam'})
dh['goal'] = dh.apply(lambda x: 1 if (x['EventType'] == 'GOAL') else 0, axis=1)
dh['HGF'] = dh.groupby(['Season', 'GameNumber', 'HTeam'])['goal'].transform('sum')
dh = dh[['Season', 'GameNumber', 'HGF']]
dh = dh.drop_duplicates(['Season', 'GameNumber'])

In [520]:
dv = da[da['EventTeamCode']==da['VTeamCode']]
dv = dv.rename(columns={'EventTeamCode': 'VTeam'})
dv['goal'] = dv.apply(lambda x: 1 if (x['EventType'] == 'GOAL') else 0, axis=1)
dv['VGF'] = dv.groupby(['Season', 'GameNumber', 'VTeam'])['goal'].transform('sum')
dv = dv[['Season', 'GameNumber', 'VGF']]
dv = dv.drop_duplicates(['Season', 'GameNumber'])

Merge into season-game data

In [521]:
# dg = dp[['Season', 'GameNumber', 'EventTeamCode', 'VTeamCode', 'HTeamCode']]
# dg = dg.drop_duplicates(['Season', 'GameNumber',  'EventTeamCode'])
# dg = dg.rename(columns={'EventTeamCode': 'Team'})
# dg['Opp'] = dg.apply(lambda x: x['HTeamCode'] if x['Team'] == x['HTeamCode'] else x['VTeamCode'], axis=1)
# dg['Designation'] = dg.apply(lambda x: 'home' if x['Team'] == x['HTeamCode'] else 'away', axis=1)
# dg = dg[['Season', 'GameNumber', 'Team', 'Opp', 'Designation']]

In [522]:
dg = da[['Season', 'GameNumber', 'VTeamCode', 'HTeamCode']]
dg = dg.drop_duplicates(['Season', 'GameNumber'])

In [523]:
dg = pd.merge(dg, dh, on=['Season', 'GameNumber'], how='left')
dg = pd.merge(dg, dv, on=['Season', 'GameNumber'], how='left')

- find the goal differential per game with respect to home team.

In [524]:
dg['GD'] = dg['HGF'] - dg['VGF']
dg['WinTeam'] = dg.apply(lambda x: x['HTeamCode'] if x['GD'] > 0 else x['VTeamCode'], axis=1)
dg['LossTeam'] = dg.apply(lambda x: x['HTeamCode'] if x['WinTeam'] != x['HTeamCode'] else x['VTeamCode'], axis=1)

In [525]:
dg.head()

Unnamed: 0,Season,GameNumber,VTeamCode,HTeamCode,HGF,VGF,GD,WinTeam,LossTeam
0,2010,20001,MTL,TOR,3,2,1,TOR,MTL
1,2010,20002,PHI,PIT,2,3,-1,PHI,PIT
2,2010,20003,CAR,MIN,3,4,-1,CAR,MIN
3,2010,20004,CHI,COL,4,3,1,COL,CHI
4,2010,20005,CGY,EDM,4,0,4,EDM,CGY


- display goals againest per team.

In [526]:
dg['VGA'] = dg['HGF']
dg['HGA'] = dg['VGF']

In [527]:
dg = dg[['Season', 'GameNumber', 'VTeamCode', 'HTeamCode', 'VGF', 'HGF', 'GD', 'VGA', 'HGA', 'WinTeam', 'LossTeam']]
dg.head()

Unnamed: 0,Season,GameNumber,VTeamCode,HTeamCode,VGF,HGF,GD,VGA,HGA,WinTeam,LossTeam
0,2010,20001,MTL,TOR,2,3,1,3,2,TOR,MTL
1,2010,20002,PHI,PIT,3,2,-1,2,3,PHI,PIT
2,2010,20003,CAR,MIN,4,3,-1,3,4,CAR,MIN
3,2010,20004,CHI,COL,3,4,1,4,3,COL,CHI
4,2010,20005,CGY,EDM,0,4,4,4,0,EDM,CGY


In [528]:
dg.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/season_game_data.csv', index='False', sep=',')
#dg.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/season_game_data.csv', index='False', sep=',')

## season_level_data

In [529]:
dm = da

events that happened in regulation time only

In [530]:
dm = dm[dm['Period'] <= 3]
dm = dm[dm['Period'] >= 1]

- **reshape the data set from wide to long.**

In [531]:
dm = dm.sort_values(['Season', 'GameNumber', 'Period', 'EventNumber'], ascending=[True, True, True, True])

In [532]:
a = [col for col in dm.columns if 'VPlayer' in col]
b = [col for col in dm.columns if 'HPlayer' in col]
c = [col for col in dm.columns if 'VPosition' in col]
d = [col for col in dm.columns if 'HPosition' in col]
dm = pd.lreshape(dm, {'VPlayer' : a, 'HPlayer' : b, 'VPosition' : c, 'HPosition': d})

In [533]:
dm.shape

(1796745, 24)

In [534]:
dm.columns

Index(['AdvantageType', 'EventDetail', 'EventNumber', 'EventTeamCode',
       'EventTimeFromTwenty', 'EventTimeFromZero', 'EventType', 'GameDate',
       'GameNumber', 'HTeamCode', 'Length', 'PenaltyType', 'Period',
       'PlayerName', 'PlayerNumber', 'Season', 'ShotResult', 'ShotType',
       'VTeamCode', 'Zone', 'VPlayer', 'HPlayer', 'VPosition', 'HPosition'],
      dtype='object')

In [535]:
dm = dm.rename(columns={'PlayerNumber': 'EventPlayerNumber', 'TeamCode': 'EventTeamCode', 'PlayerName': 'EventPlayerName' })
dm = dm[['Season', 'GameNumber', 'GameDate', 'Period', 'AdvantageType', 'Zone', 'EventNumber', 'EventType', 'EventDetail', 'EventTeamCode', 'EventPlayerNumber', 'EventPlayerName', 'EventTimeFromZero', 'EventTimeFromTwenty', 'VTeamCode', 'VPlayer', 'VPosition', 'HTeamCode', 'HPlayer', 'HPosition', 'ShotType', 'ShotResult', 'Length', 'PenaltyType']]
dm = dm.sort_values(['Season', 'GameNumber', 'Period', 'EventNumber'], ascending=[True, True, True, True])

- fill in advantage type with even strength 'EV' and event player number with 'TEAM'

In [536]:
dm['AdvantageType'] = dm['AdvantageType'].fillna('EV')
dm['EventPlayerNumber'] = dm['EventPlayerNumber'].fillna('TEAM')

- save new datast as play by play

In [537]:
dm.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/play_by_play.csv', index='False', sep=',')
#dm.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/play_by_play.csv', index='False', sep=',')

#### create new data set and keep variables: 
- (a) game number.
- (b) visitor team information.
- (c) home team information.

In [538]:
df = dm[['Season', 'GameNumber', 'VTeamCode', 'VPlayer', 'VPosition', 'HTeamCode', 'HPlayer', 'HPosition']]
df = df.sort_values(['Season', 'GameNumber'], ascending=[True, True])
df.head()

Unnamed: 0,Season,GameNumber,VTeamCode,VPlayer,VPosition,HTeamCode,HPlayer,HPosition
0,2010,20001,MTL,11.0,C,TOR,37.0,C
310113,2010,20001,MTL,21.0,R,TOR,9.0,R
620126,2010,20001,MTL,57.0,L,TOR,11.0,L
930061,2010,20001,MTL,26.0,D,TOR,3.0,D
1239931,2010,20001,MTL,75.0,D,TOR,22.0,D


- merge season_game_data (dg) on new dataset

In [539]:
df = pd.merge(df, dg, on=['Season', 'GameNumber', 'VTeamCode', 'HTeamCode'], how='left')
df.head()    

Unnamed: 0,Season,GameNumber,VTeamCode,VPlayer,VPosition,HTeamCode,HPlayer,HPosition,VGF,HGF,GD,VGA,HGA,WinTeam,LossTeam
0,2010,20001,MTL,11.0,C,TOR,37.0,C,2,3,1,3,2,TOR,MTL
1,2010,20001,MTL,21.0,R,TOR,9.0,R,2,3,1,3,2,TOR,MTL
2,2010,20001,MTL,57.0,L,TOR,11.0,L,2,3,1,3,2,TOR,MTL
3,2010,20001,MTL,26.0,D,TOR,3.0,D,2,3,1,3,2,TOR,MTL
4,2010,20001,MTL,75.0,D,TOR,22.0,D,2,3,1,3,2,TOR,MTL


- reshape the data to have home and visitor team observatons under the same coloumns. 

In [540]:
a = [col for col in df.columns if 'Player' in col]
b = [col for col in df.columns if 'Position' in col]
c = [col for col in df.columns if 'TeamCode' in col]
d = [col for col in df.columns if 'GF' in col]
e = [col for col in df.columns if 'GA' in col]
df = pd.lreshape(df, {'PlayerNumber' : a, 'PlayerPosition' : b, 'TeamCode' : c, 'GF' : d, 'GA' : e })
df = df[['Season', 'GameNumber', 'TeamCode', 'PlayerNumber', 'GF', 'GA', 'GD', 'WinTeam', 'LossTeam']]
df.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,GF,GA,GD,WinTeam,LossTeam
0,2010,20001,MTL,11.0,2,3,1,TOR,MTL
1,2010,20001,MTL,21.0,2,3,1,TOR,MTL
2,2010,20001,MTL,57.0,2,3,1,TOR,MTL
3,2010,20001,MTL,26.0,2,3,1,TOR,MTL
4,2010,20001,MTL,75.0,2,3,1,TOR,MTL


### import player position and rankings

In [541]:
dp = pd.read_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/stats_per_player.csv')
#dp = pd.read_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/stats_per_player.csv')
dp = dp.drop('Unnamed: 0', axis=1)
dp = dp.rename(columns={'Position': 'PlayerPosition'})

In [542]:
dp = dp[['Season', 'TeamCode', 'PlayerNumber', 'PlayerName', 'PlayerPosition']]
dp.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,PlayerPosition
0,2010,MTL,11.0,GOMEZ,C
1,2010,TOR,37.0,BRENT,C
2,2010,MTL,14.0,PLEKANEC,C
3,2010,MTL,76.0,SUBBAN,LD
4,2010,TOR,35.0,GIGUERE,G


In [543]:
dp.shape

(1058, 5)

In [544]:
dr = pd.read_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/player_rank_by_goals_assists_points_per_time_on_ice.csv')
#dr = pd.read_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/player_rank_by_goals_assists_points_per_time_on_ice.csv')
dr = dr.drop('Unnamed: 0', axis=1)
dr = dr.rename(columns={'Position': 'PlayerPosition'})

In [545]:
dr = dr[['Season', 'TeamCode', 'PlayerNumber', 'PlayerName', 'PlayerPosition', 'Rank']]
dr = dr.sort_values(['TeamCode'], ascending=[True])
dr.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank
709,2010,ANA,5.0,SBISA,RD,3
503,2010,ANA,28.0,CHIPCHURA,C,4
313,2010,ANA,42.0,SEXTON,LW,3
731,2010,ANA,3.0,LILJA,RD,3
496,2010,ANA,22.0,MARCHANT,C,4


- merge player position and player rankings and drop goaltenders

In [546]:
ds = pd.merge(dp, dr, on=['Season', 'TeamCode', 'PlayerNumber', 'PlayerName', 'PlayerPosition'], how='left')
ds['Rank'] = ds.groupby(['Season', 'TeamCode', 'PlayerName'])['Rank'].apply(lambda x: x.ffill().bfill())
ds.shape

(1058, 6)

In [547]:
ds.isnull().sum()

Season              0
TeamCode            0
PlayerNumber        0
PlayerName          0
PlayerPosition      0
Rank              256
dtype: int64

skaters that have played less than 9 games were not included in the clusters and therefore not ranked. Since the failed to make the roster on a regular basis, forwards are assigned to the 4th line and defensemen to the bottom (3rd) pairing.

In [548]:
ds['Rank'] = ds.apply(lambda x: 1 if ((x['PlayerPosition'] == 'G') & (pd.isnull(x.Rank))) else 3 if ((x['PlayerPosition'] == 'RD') & (pd.isnull(x.Rank))) else 3 if ((x['PlayerPosition'] == 'LD') & (pd.isnull(x.Rank))) else 4 if ((x['PlayerPosition'] == 'LW') & (pd.isnull(x.Rank))) else 4 if ((x['PlayerPosition'] == 'RW') & (pd.isnull(x.Rank))) else 4 if ((x['PlayerPosition'] == 'C') & (pd.isnull(x.Rank))) else x['Rank'], axis=1)
ds = ds.sort_values(['TeamCode'], ascending=[True])
ds.shape

(1058, 6)

In [549]:
ds1 = ds[(ds['PlayerPosition'] == 'RD') & (ds['Rank'] == 4)]
ds1.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank
996,2010,OTT,51.0,SMITH,RD,4.0


- Derek Smith, a defenseman for the Ottawa Senators has a ranking of 4, which is incorrect since we have 3 defensive pairings. For that reason, he is assigned a rank of 3 which represents the bottom defensive pairing

In [550]:
ds['Rank'] = ds.apply(lambda x: 3 if ((x['PlayerPosition'] == 'RD') & (x['TeamCode'] == 'OTT') & (x['PlayerName'] == 'SMITH') & (x['PlayerNumber'] == 51.0)) else x['Rank'], axis=1)

In [551]:
ds1 = ds[(ds['PlayerPosition'] == 'RD') & (ds['Rank'] == 4)]
ds1.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank


- **display each player by team per game. Drop duplicates.**

In [552]:
dw = pd.merge(df, ds, on=['Season', 'TeamCode', 'PlayerNumber'], how='left')
dw = dw[['Season', 'GameNumber', 'TeamCode', 'PlayerNumber', 'PlayerName', 'PlayerPosition', 'Rank', 'GF', 'GA', 'GD', 'WinTeam', 'LossTeam']]
dw.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank,GF,GA,GD,WinTeam,LossTeam
0,2010,20001,MTL,11.0,GOMEZ,C,2.0,2,3,1,TOR,MTL
1,2010,20001,MTL,21.0,GIONTA,RW,2.0,2,3,1,TOR,MTL
2,2010,20001,MTL,57.0,POULIOT,LW,3.0,2,3,1,TOR,MTL
3,2010,20001,MTL,26.0,GORGES,RD,1.0,2,3,1,TOR,MTL
4,2010,20001,MTL,75.0,GILL,RD,3.0,2,3,1,TOR,MTL


- drop observations that have no player name, position nor ranking. Exclude goaltenders from the roster.

In [553]:
dw = dw[pd.notnull(dw['PlayerPosition'])]
dw.isnull().sum()

Season            0
GameNumber        0
TeamCode          0
PlayerNumber      0
PlayerName        0
PlayerPosition    0
Rank              0
GF                0
GA                0
GD                0
WinTeam           0
LossTeam          0
dtype: int64

In [554]:
dw.shape

(3688734, 12)

In [555]:
dw = dw[dw['PlayerPosition'] != 'G']
dw.shape

(3130215, 12)

- create column that displays the position and roster count by team per game. To simplify matters, we categorize skaters into forwards and defensemen.

In [556]:
dw = dw.drop_duplicates(['Season', 'GameNumber', 'TeamCode', 'PlayerNumber'])
dw['RosterCount'] = dw.groupby(['Season', 'GameNumber', 'TeamCode'])['PlayerNumber'].transform('count')
dw['Position'] = dw.apply(lambda x: 'D' if (x['PlayerPosition']=='LD') else 'D' if (x['PlayerPosition']=='RD') else 'F', 1)
dw['PositionCount'] = dw.groupby(['Season', 'GameNumber', 'TeamCode', 'Position'])['PlayerNumber'].transform('count')
dw.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank,GF,GA,GD,WinTeam,LossTeam,RosterCount,Position,PositionCount
0,2010,20001,MTL,11.0,GOMEZ,C,2.0,2,3,1,TOR,MTL,18.0,F,12.0
1,2010,20001,MTL,21.0,GIONTA,RW,2.0,2,3,1,TOR,MTL,18.0,F,12.0
2,2010,20001,MTL,57.0,POULIOT,LW,3.0,2,3,1,TOR,MTL,18.0,F,12.0
3,2010,20001,MTL,26.0,GORGES,RD,1.0,2,3,1,TOR,MTL,18.0,D,6.0
4,2010,20001,MTL,75.0,GILL,RD,3.0,2,3,1,TOR,MTL,18.0,D,6.0


- count the amount of forwards and defensemen by team per game.

In [557]:
dw['FCount'] = dw.apply(lambda x: x['PositionCount'] if x['Position']=='F' else np.NaN, 1)
dw['DCount'] = dw.apply(lambda x: x['PositionCount'] if x['Position']=='D' else np.NaN, 1)
dw['FCount'] = dw.groupby(['Season','GameNumber', 'TeamCode'])['FCount'].apply(lambda x: x.ffill().bfill())
dw['DCount'] = dw.groupby(['Season','GameNumber', 'TeamCode'])['DCount'].apply(lambda x: x.ffill().bfill())
dw.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank,GF,GA,GD,WinTeam,LossTeam,RosterCount,Position,PositionCount,FCount,DCount
0,2010,20001,MTL,11.0,GOMEZ,C,2.0,2,3,1,TOR,MTL,18.0,F,12.0,12.0,6.0
1,2010,20001,MTL,21.0,GIONTA,RW,2.0,2,3,1,TOR,MTL,18.0,F,12.0,12.0,6.0
2,2010,20001,MTL,57.0,POULIOT,LW,3.0,2,3,1,TOR,MTL,18.0,F,12.0,12.0,6.0
3,2010,20001,MTL,26.0,GORGES,RD,1.0,2,3,1,TOR,MTL,18.0,D,6.0,12.0,6.0
4,2010,20001,MTL,75.0,GILL,RD,3.0,2,3,1,TOR,MTL,18.0,D,6.0,12.0,6.0


### keep games that have only 12 F and 6 D per team!!!!

In [558]:
dw = dw.groupby(['Season', 'GameNumber']).filter(lambda x: ((x['FCount'] == 12) & (x['DCount'] == 6)).all())

In [559]:
dw.shape

(31644, 17)

In [560]:
dw.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/roster/team_roster_player_rank_by_goals_assists_points_per_toi.csv', index='False', sep=',')
#dw.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/roster/team_roster_player_rank_by_goals_assists_points_per_toi.csv', index='False', sep=',')

- create a new dataset using team roster player rank

In [561]:
dv = dw

In [562]:
dv = dv.groupby(['Season', 'GameNumber', 'TeamCode', 'Position', 'GF', 'GA', 'RosterCount', 'PositionCount', 'FCount', 'DCount', 'WinTeam', 'LossTeam'], as_index=False)['Rank'].mean()
dv.head(10)

Unnamed: 0,Season,GameNumber,TeamCode,Position,GF,GA,RosterCount,PositionCount,FCount,DCount,WinTeam,LossTeam,Rank
0,2010,20001,MTL,D,2,3,18.0,6.0,12.0,6.0,TOR,MTL,1.666667
1,2010,20001,MTL,F,2,3,18.0,12.0,12.0,6.0,TOR,MTL,2.75
2,2010,20001,TOR,D,3,2,18.0,6.0,12.0,6.0,TOR,MTL,1.666667
3,2010,20001,TOR,F,3,2,18.0,12.0,12.0,6.0,TOR,MTL,2.666667
4,2010,20002,PHI,D,3,2,18.0,6.0,12.0,6.0,PHI,PIT,1.833333
5,2010,20002,PHI,F,3,2,18.0,12.0,12.0,6.0,PHI,PIT,2.25
6,2010,20002,PIT,D,2,3,18.0,6.0,12.0,6.0,PHI,PIT,1.666667
7,2010,20002,PIT,F,2,3,18.0,12.0,12.0,6.0,PHI,PIT,2.25
8,2010,20003,CAR,D,4,3,18.0,6.0,12.0,6.0,CAR,MIN,1.833333
9,2010,20003,CAR,F,4,3,18.0,12.0,12.0,6.0,CAR,MIN,2.5


In [563]:
dv.shape

(3516, 13)

- create columns for team win and team loss. 

In [564]:
dv['TeamWin'] =  dv.apply(lambda x: 1 if x['TeamCode']==x['WinTeam'] else 0, 1)
dv['TeamLos'] =  dv.apply(lambda x: 1 if x['TeamCode']!=x['WinTeam'] else 0, 1)
dv.head()

Unnamed: 0,Season,GameNumber,TeamCode,Position,GF,GA,RosterCount,PositionCount,FCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos
0,2010,20001,MTL,D,2,3,18.0,6.0,12.0,6.0,TOR,MTL,1.666667,0,1
1,2010,20001,MTL,F,2,3,18.0,12.0,12.0,6.0,TOR,MTL,2.75,0,1
2,2010,20001,TOR,D,3,2,18.0,6.0,12.0,6.0,TOR,MTL,1.666667,1,0
3,2010,20001,TOR,F,3,2,18.0,12.0,12.0,6.0,TOR,MTL,2.666667,1,0
4,2010,20002,PHI,D,3,2,18.0,6.0,12.0,6.0,PHI,PIT,1.833333,1,0


- display games played, games won, games loss, goals for and goals against by team for the season.

In [565]:
dv['GP'] = dv.groupby(['Season', 'Position', 'TeamCode'])['GameNumber'].transform('count')
dv['GW'] = dv.groupby(['Season', 'Position', 'WinTeam'])['TeamWin'].transform('sum')
dv['GL'] = dv.groupby(['Season', 'Position', 'LossTeam'])['TeamLos'].transform('sum')
dv['GF'] = dv.groupby(['Season', 'Position', 'TeamCode'])['GF'].transform('sum')
dv['GA'] = dv.groupby(['Season', 'Position', 'TeamCode'])['GA'].transform('sum')
dv.head()

Unnamed: 0,Season,GameNumber,TeamCode,Position,GF,GA,RosterCount,PositionCount,FCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL
0,2010,20001,MTL,D,165,169,18.0,6.0,12.0,6.0,TOR,MTL,1.666667,0,1,60,31,30
1,2010,20001,MTL,F,165,169,18.0,12.0,12.0,6.0,TOR,MTL,2.75,0,1,60,31,30
2,2010,20001,TOR,D,179,202,18.0,6.0,12.0,6.0,TOR,MTL,1.666667,1,0,64,31,30
3,2010,20001,TOR,F,179,202,18.0,12.0,12.0,6.0,TOR,MTL,2.666667,1,0,64,31,30
4,2010,20002,PHI,D,219,188,18.0,6.0,12.0,6.0,PHI,PIT,1.833333,1,0,66,39,30


- create columns with the mean ranking for forward and defenseman by team per game.

In [566]:
dv['Rank_F'] = dv.apply(lambda x: x['Rank'] if x['Position']=='F' else np.NaN, 1)
dv['Rank_D'] = dv.apply(lambda x: x['Rank'] if x['Position']=='D' else np.NaN, 1)
dv['Rank_F'] = dv.groupby(['Season','GameNumber', 'TeamCode'])['Rank_F'].apply(lambda x: x.ffill().bfill())
dv['Rank_D'] = dv.groupby(['Season','GameNumber', 'TeamCode'])['Rank_D'].apply(lambda x: x.ffill().bfill())
dv.head()

Unnamed: 0,Season,GameNumber,TeamCode,Position,GF,GA,RosterCount,PositionCount,FCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL,Rank_F,Rank_D
0,2010,20001,MTL,D,165,169,18.0,6.0,12.0,6.0,TOR,MTL,1.666667,0,1,60,31,30,2.75,1.666667
1,2010,20001,MTL,F,165,169,18.0,12.0,12.0,6.0,TOR,MTL,2.75,0,1,60,31,30,2.75,1.666667
2,2010,20001,TOR,D,179,202,18.0,6.0,12.0,6.0,TOR,MTL,1.666667,1,0,64,31,30,2.666667,1.666667
3,2010,20001,TOR,F,179,202,18.0,12.0,12.0,6.0,TOR,MTL,2.666667,1,0,64,31,30,2.666667,1.666667
4,2010,20002,PHI,D,219,188,18.0,6.0,12.0,6.0,PHI,PIT,1.833333,1,0,66,39,30,2.25,1.833333


- compute the mean per position by team for the season.

In [567]:
dv['Mean_F']= dv.groupby(['Season', 'TeamCode'])['Rank_F'].transform('mean')
dv['Mean_D']= dv.groupby(['Season', 'TeamCode'])['Rank_D'].transform('mean')
dv.head()

Unnamed: 0,Season,GameNumber,TeamCode,Position,GF,GA,RosterCount,PositionCount,FCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL,Rank_F,Rank_D,Mean_F,Mean_D
0,2010,20001,MTL,D,165,169,18.0,6.0,12.0,6.0,TOR,MTL,1.666667,0,1,60,31,30,2.75,1.666667,2.576389,1.808333
1,2010,20001,MTL,F,165,169,18.0,12.0,12.0,6.0,TOR,MTL,2.75,0,1,60,31,30,2.75,1.666667,2.576389,1.808333
2,2010,20001,TOR,D,179,202,18.0,6.0,12.0,6.0,TOR,MTL,1.666667,1,0,64,31,30,2.666667,1.666667,2.606771,1.942708
3,2010,20001,TOR,F,179,202,18.0,12.0,12.0,6.0,TOR,MTL,2.666667,1,0,64,31,30,2.666667,1.666667,2.606771,1.942708
4,2010,20002,PHI,D,219,188,18.0,6.0,12.0,6.0,PHI,PIT,1.833333,1,0,66,39,30,2.25,1.833333,2.22096,1.742424


- display the quantity of wins and losses per team ( roster of 12 forwards and 6 defensemen)

In [568]:
dv['L'] = dv.apply(lambda x: x['GL'] if x['TeamCode']== x['LossTeam'] else (x['GP'] - x['GW']), 1)
dv['W'] = dv.apply(lambda x: x['GW'] if x['TeamCode']== x['WinTeam'] else (x['GP'] - x['GL']), 1)
dv.head()

Unnamed: 0,Season,GameNumber,TeamCode,Position,GF,GA,RosterCount,PositionCount,FCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL,Rank_F,Rank_D,Mean_F,Mean_D,L,W
0,2010,20001,MTL,D,165,169,18.0,6.0,12.0,6.0,TOR,MTL,1.666667,0,1,60,31,30,2.75,1.666667,2.576389,1.808333,30,30
1,2010,20001,MTL,F,165,169,18.0,12.0,12.0,6.0,TOR,MTL,2.75,0,1,60,31,30,2.75,1.666667,2.576389,1.808333,30,30
2,2010,20001,TOR,D,179,202,18.0,6.0,12.0,6.0,TOR,MTL,1.666667,1,0,64,31,30,2.666667,1.666667,2.606771,1.942708,33,31
3,2010,20001,TOR,F,179,202,18.0,12.0,12.0,6.0,TOR,MTL,2.666667,1,0,64,31,30,2.666667,1.666667,2.606771,1.942708,33,31
4,2010,20002,PHI,D,219,188,18.0,6.0,12.0,6.0,PHI,PIT,1.833333,1,0,66,39,30,2.25,1.833333,2.22096,1.742424,27,39


- compute win and loss percent by team. Drop duplicate observations.

In [569]:
dv = dv[['Season', 'TeamCode', 'GP', 'L', 'W', 'GF', 'GA', 'Mean_F', 'Mean_D']]
dv = dv.drop_duplicates(['Season', 'TeamCode'])
dv['WinPc'] = dv['W']/ dv['GP']
dv['LossPc'] = dv['L']/ dv['GP']

dv = dv[['Season', 'TeamCode', 'GP','W', 'L', 'GF', 'GA', 'WinPc', 'LossPc', 'Mean_F', 'Mean_D']]
dv.head()

Unnamed: 0,Season,TeamCode,GP,W,L,GF,GA,WinPc,LossPc,Mean_F,Mean_D
0,2010,MTL,60,30,30,165,169,0.5,0.5,2.576389,1.808333
2,2010,TOR,64,31,33,179,202,0.484375,0.515625,2.606771,1.942708
4,2010,PHI,66,39,27,219,188,0.590909,0.409091,2.22096,1.742424
6,2010,PIT,70,40,30,200,176,0.571429,0.428571,2.515476,1.680952
8,2010,CAR,65,32,33,183,183,0.492308,0.507692,2.34359,1.828205


- rank teams based on win percent, mean forwards and mean defensemen. 

In [570]:
dv['Rank_W'] = dv.groupby(['Season'])['WinPc'].rank(ascending=False)
dv['Rank_F'] = dv.groupby(['Season'])['Mean_F'].rank(ascending=True)
dv['Rank_D'] = dv.groupby(['Season'])['Mean_D'].rank(ascending=True)
dv = dv.sort_values(['Season', 'Rank_W', 'Rank_F', 'Rank_D'], ascending=[True, True, True, True])
dv.head(30)

Unnamed: 0,Season,TeamCode,GP,W,L,GF,GA,WinPc,LossPc,Mean_F,Mean_D,Rank_W,Rank_F,Rank_D
198,2010,VAN,58,40,18,198,137,0.689655,0.310345,2.482759,2.100575,1.0,15.0,24.0
12,2010,CHI,47,28,19,157,127,0.595745,0.404255,2.12234,2.219858,2.0,3.0,27.0
4,2010,PHI,66,39,27,219,188,0.590909,0.409091,2.22096,1.742424,3.0,4.0,11.0
30,2010,DET,61,36,25,197,181,0.590164,0.409836,2.330601,1.980874,4.0,7.0,21.0
28,2010,ANA,60,35,25,169,168,0.583333,0.416667,2.568056,2.366667,5.0,20.0,29.0
58,2010,TB,35,20,15,112,109,0.571429,0.428571,2.078571,1.642857,6.5,1.0,8.0
6,2010,PIT,70,40,30,200,176,0.571429,0.428571,2.515476,1.680952,6.5,18.0,10.0
178,2010,PHX,57,32,25,173,157,0.561404,0.438596,2.121345,1.400585,8.0,2.0,4.0
188,2010,BOS,66,37,29,200,163,0.560606,0.439394,2.232323,1.378788,9.0,5.0,3.0
16,2010,CGY,67,37,30,213,191,0.552239,0.447761,2.365672,1.457711,10.0,10.0,5.0


In [571]:
dv.shape

(30, 14)

In [572]:
#dv.to_csv('season_team_roster_ranking.csv', index='False')

In [573]:
dv.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/roster/season_team_roster_ranking_by_goals_assists_points_per_toi.csv', index='False', sep=',')
#dv.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/roster/season_team_roster_ranking_by_goals_assists_points_per_toi.csv', index='False', sep=',')

## season_game_team_roster_data

- use the team roster player rank dataset (dw) to display the roster quality by team per game

In [574]:
dx = dw

In [575]:
dx = dx[['Season', 'GameNumber', 'TeamCode', 'RosterCount', 'PlayerNumber', 'PlayerPosition', 'Rank']]
dx = dx.rename(columns={'PlayerPosition': 'Position'})
dx.head()

Unnamed: 0,Season,GameNumber,TeamCode,RosterCount,PlayerNumber,Position,Rank
0,2010,20001,MTL,18.0,11.0,C,2.0
1,2010,20001,MTL,18.0,21.0,RW,2.0
2,2010,20001,MTL,18.0,57.0,LW,3.0
3,2010,20001,MTL,18.0,26.0,RD,1.0
4,2010,20001,MTL,18.0,75.0,RD,3.0


- group by season, game number, team and player to count the occurance of each player per game and sum up the observations of players. There should be 19 players per team and 38 per game for the dataset to be correct.

In [576]:
dx['playercount'] = dx.groupby(['Season', 'GameNumber', 'TeamCode', 'PlayerNumber',])['PlayerNumber'].transform('count')

In [577]:
dx['rosterposition'] = dx.groupby(['Season', 'GameNumber', 'TeamCode', 'Position', 'Rank'])['playercount'].transform('sum')

#### pivot table

- the next step is to group players by gamenumber, teamcode, position and rank, to display the quality of players each team has per position. **Pivot table** by player position and rank using roster position values. Game number and team are the indexes. We want to join the levels to generate columns by roster position and rank (10 columns). 


In [578]:
dx = pd.pivot_table(dx, index=['Season', 'GameNumber', 'TeamCode', 'RosterCount'], columns=['Position', 'Rank'], values=['rosterposition'])
dx = dx.reset_index()
dx.columns = ['_'.join(str(s).strip() for s in col if s) for col in dx.columns]
dx.reset_index()
dx = dx.fillna(0)
dx = dx.rename(columns={'rosterposition_C_1.0': 'C1', 'rosterposition_C_2.0': 'C2', 'rosterposition_C_3.0': 'C3', 'rosterposition_C_4.0': 'C4', 'rosterposition_LW_1.0': 'LW1', 'rosterposition_LW_2.0': 'LW2', 'rosterposition_LW_3.0': 'LW3', 'rosterposition_LW_4.0': 'LW4', 'rosterposition_RW_1.0': 'RW1', 'rosterposition_RW_2.0': 'RW2', 'rosterposition_RW_3.0': 'RW3', 'rosterposition_RW_4.0': 'RW4', 'rosterposition_LD_1.0': 'LD1', 'rosterposition_LD_2.0': 'LD2', 'rosterposition_LD_3.0': 'LD3', 'rosterposition_LD_4.0': 'LD4', 'rosterposition_RD_1.0': 'RD1', 'rosterposition_RD_2.0': 'RD2', 'rosterposition_RD_3.0': 'RD3', 'rosterposition_RD_4.0': 'RD4' })
dx.head(10)

Unnamed: 0,Season,GameNumber,TeamCode,RosterCount,C1,C2,C3,C4,LD1,LD2,LD3,LW1,LW2,LW3,LW4,RD1,RD2,RD3,RW1,RW2,RW3,RW4
0,2010,20001,MTL,18.0,0.0,2.0,2.0,1.0,0.0,1.0,0.0,0.0,1.0,2.0,0.0,3.0,1.0,1.0,0.0,2.0,1.0,1.0
1,2010,20001,TOR,18.0,0.0,1.0,2.0,1.0,2.0,1.0,0.0,0.0,2.0,0.0,2.0,1.0,1.0,1.0,2.0,1.0,0.0,1.0
2,2010,20002,PHI,18.0,3.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0,3.0,0.0,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0
3,2010,20002,PIT,18.0,1.0,1.0,3.0,0.0,1.0,1.0,1.0,0.0,2.0,0.0,0.0,2.0,1.0,0.0,1.0,2.0,2.0,0.0
4,2010,20003,CAR,18.0,2.0,0.0,2.0,2.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0
5,2010,20003,MIN,18.0,1.0,2.0,1.0,0.0,1.0,0.0,2.0,1.0,2.0,0.0,1.0,0.0,1.0,2.0,0.0,2.0,1.0,1.0
6,2010,20004,CHI,18.0,2.0,1.0,1.0,0.0,0.0,1.0,2.0,1.0,2.0,1.0,0.0,0.0,1.0,2.0,1.0,1.0,2.0,0.0
7,2010,20004,COL,18.0,2.0,0.0,1.0,0.0,2.0,0.0,2.0,1.0,2.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,3.0,0.0
8,2010,20005,CGY,18.0,0.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0
9,2010,20005,EDM,18.0,0.0,2.0,1.0,1.0,2.0,0.0,0.0,0.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,1.0,0.0


In [579]:
dx.shape

(1758, 22)

In [580]:
#dx.to_csv('season_game_team_roster.csv', index='False', sep=',')

In [581]:
dx.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/roster/season_game_team_roster_by_goals_assists_points_per_toi.csv', index='False', sep=',')
#dx.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/roster/season_game_team_roster_by_goals_assists_points_per_toi.csv', index='False', sep=',')

- create a dataset that will display the mean of forwards and defencemen by season per team

In [582]:
dz = dx

In [583]:
# mean centers ranking per team
dz['MeanC1'] = dz.groupby(['Season', 'TeamCode'])['C1'].transform('mean')
dz['MeanC2'] = dz.groupby(['Season', 'TeamCode'])['C2'].transform('mean')
dz['MeanC3'] = dz.groupby(['Season', 'TeamCode'])['C3'].transform('mean')
dz['MeanC4'] = dz.groupby(['Season', 'TeamCode'])['C4'].transform('mean')

# mean left wing ranking per team
dz['MeanLW1'] = dz.groupby(['Season', 'TeamCode'])['LW1'].transform('mean')
dz['MeanLW2'] = dz.groupby(['Season', 'TeamCode'])['LW2'].transform('mean')
dz['MeanLW3'] = dz.groupby(['Season', 'TeamCode'])['LW3'].transform('mean')
dz['MeanLW4'] = dz.groupby(['Season', 'TeamCode'])['LW4'].transform('mean')

# mean right wing ranking per team
dz['MeanRW1'] = dz.groupby(['Season', 'TeamCode'])['RW1'].transform('mean')
dz['MeanRW2'] = dz.groupby(['Season', 'TeamCode'])['RW2'].transform('mean')
dz['MeanRW3'] = dz.groupby(['Season', 'TeamCode'])['RW3'].transform('mean')
dz['MeanRW4'] = dz.groupby(['Season', 'TeamCode'])['RW4'].transform('mean')

# mean left defense ranking per team
dz['MeanLD1'] = dz.groupby(['Season', 'TeamCode'])['LD1'].transform('mean')
dz['MeanLD2'] = dz.groupby(['Season', 'TeamCode'])['LD2'].transform('mean')
dz['MeanLD3'] = dz.groupby(['Season', 'TeamCode'])['LD3'].transform('mean')

# mean right defense ranking per team
dz['MeanRD1'] = dz.groupby(['Season', 'TeamCode'])['RD1'].transform('mean')
dz['MeanRD2'] = dz.groupby(['Season', 'TeamCode'])['RD2'].transform('mean')
dz['MeanRD3'] = dz.groupby(['Season', 'TeamCode'])['RD3'].transform('mean')

dz.head()

Unnamed: 0,Season,GameNumber,TeamCode,RosterCount,C1,C2,C3,C4,LD1,LD2,LD3,LW1,LW2,LW3,LW4,RD1,RD2,RD3,RW1,RW2,RW3,RW4,MeanC1,MeanC2,MeanC3,MeanC4,MeanLW1,MeanLW2,MeanLW3,MeanLW4,MeanRW1,MeanRW2,MeanRW3,MeanRW4,MeanLD1,MeanLD2,MeanLD3,MeanRD1,MeanRD2,MeanRD3
0,2010,20001,MTL,18.0,0.0,2.0,2.0,1.0,0.0,1.0,0.0,0.0,1.0,2.0,0.0,3.0,1.0,1.0,0.0,2.0,1.0,1.0,0.0,2.5,1.816667,0.516667,0.0,1.466667,1.95,0.05,0.0,2.483333,0.416667,0.8,0.466667,1.916667,0.216667,1.85,0.6,0.95
1,2010,20001,TOR,18.0,0.0,1.0,2.0,1.0,2.0,1.0,0.0,0.0,2.0,0.0,2.0,1.0,1.0,1.0,2.0,1.0,0.0,1.0,0.0,0.984375,2.5625,0.21875,0.0,2.015625,0.578125,2.078125,2.0,0.609375,0.359375,0.59375,1.421875,0.671875,1.171875,1.0,0.828125,0.90625
2,2010,20002,PHI,18.0,3.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0,3.0,0.0,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,2.878788,0.0,0.0,0.924242,0.0,2.621212,0.0,1.439394,1.0,1.333333,1.80303,0.0,0.0,2.984848,0.212121,1.984848,0.590909,0.227273
3,2010,20002,PIT,18.0,1.0,1.0,3.0,0.0,1.0,1.0,1.0,0.0,2.0,0.0,0.0,2.0,1.0,0.0,1.0,2.0,2.0,0.0,0.5,1.628571,2.242857,0.085714,0.0,1.642857,0.514286,1.185714,0.514286,1.957143,1.557143,0.171429,0.957143,1.528571,0.9,1.857143,0.757143,0.0
4,2010,20003,CAR,18.0,2.0,0.0,2.0,2.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,1.984615,0.0,1.169231,0.892308,1.107692,1.738462,0.261538,0.523077,1.0,0.0,2.692308,0.630769,0.4,1.938462,1.061538,1.692308,0.907692,0.0


- drop duplicates by season and team

In [584]:
dz = dz.drop_duplicates(['Season', 'TeamCode'])

In [585]:
dz = dz[['Season', 'TeamCode', 'RosterCount', 'MeanC1', 'MeanC2', 'MeanC3', 'MeanC4', 'MeanLW1', 'MeanLW2', 'MeanLW3', 'MeanLW4', 'MeanRW1', 'MeanRW2', 'MeanRW3', 'MeanRW4', 'MeanLD1', 'MeanLD2', 'MeanLD3', 'MeanRD1', 'MeanRD2', 'MeanRD3']]
dz.head()

Unnamed: 0,Season,TeamCode,RosterCount,MeanC1,MeanC2,MeanC3,MeanC4,MeanLW1,MeanLW2,MeanLW3,MeanLW4,MeanRW1,MeanRW2,MeanRW3,MeanRW4,MeanLD1,MeanLD2,MeanLD3,MeanRD1,MeanRD2,MeanRD3
0,2010,MTL,18.0,0.0,2.5,1.816667,0.516667,0.0,1.466667,1.95,0.05,0.0,2.483333,0.416667,0.8,0.466667,1.916667,0.216667,1.85,0.6,0.95
1,2010,TOR,18.0,0.0,0.984375,2.5625,0.21875,0.0,2.015625,0.578125,2.078125,2.0,0.609375,0.359375,0.59375,1.421875,0.671875,1.171875,1.0,0.828125,0.90625
2,2010,PHI,18.0,2.878788,0.0,0.0,0.924242,0.0,2.621212,0.0,1.439394,1.0,1.333333,1.80303,0.0,0.0,2.984848,0.212121,1.984848,0.590909,0.227273
3,2010,PIT,18.0,0.5,1.628571,2.242857,0.085714,0.0,1.642857,0.514286,1.185714,0.514286,1.957143,1.557143,0.171429,0.957143,1.528571,0.9,1.857143,0.757143,0.0
4,2010,CAR,18.0,1.984615,0.0,1.169231,0.892308,1.107692,1.738462,0.261538,0.523077,1.0,0.0,2.692308,0.630769,0.4,1.938462,1.061538,1.692308,0.907692,0.0


In [586]:
dz.shape

(30, 21)

In [587]:
#dz.to_csv('season_team.csv', index='False', sep=',')

In [588]:
dz.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/roster/season_team_by_goals_assists_points_per_toi.csv', index='False', sep=',')
#dz.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/roster/season_team_by_goals_assists_points_per_toi.csv', index='False', sep=',')

- create an index variable to deterime if a team is considered visitor or home for a given game. The column will be named "A". The 1st observation per game is the visitor team and will be assigned a value of 1. The 2nd and final observation per game, is the home team, so we fill in NaN with a value of 2 (home team)

In [589]:
dy = dx

In [590]:
dy.loc[dy.groupby('GameNumber',as_index=False).head(1).index,'A'] = 1
dy = dy.fillna(2)

In [591]:
dy.head()

Unnamed: 0,Season,GameNumber,TeamCode,RosterCount,C1,C2,C3,C4,LD1,LD2,LD3,LW1,LW2,LW3,LW4,RD1,RD2,RD3,RW1,RW2,RW3,RW4,MeanC1,MeanC2,MeanC3,MeanC4,MeanLW1,MeanLW2,MeanLW3,MeanLW4,MeanRW1,MeanRW2,MeanRW3,MeanRW4,MeanLD1,MeanLD2,MeanLD3,MeanRD1,MeanRD2,MeanRD3,A
0,2010,20001,MTL,18.0,0.0,2.0,2.0,1.0,0.0,1.0,0.0,0.0,1.0,2.0,0.0,3.0,1.0,1.0,0.0,2.0,1.0,1.0,0.0,2.5,1.816667,0.516667,0.0,1.466667,1.95,0.05,0.0,2.483333,0.416667,0.8,0.466667,1.916667,0.216667,1.85,0.6,0.95,1.0
1,2010,20001,TOR,18.0,0.0,1.0,2.0,1.0,2.0,1.0,0.0,0.0,2.0,0.0,2.0,1.0,1.0,1.0,2.0,1.0,0.0,1.0,0.0,0.984375,2.5625,0.21875,0.0,2.015625,0.578125,2.078125,2.0,0.609375,0.359375,0.59375,1.421875,0.671875,1.171875,1.0,0.828125,0.90625,2.0
2,2010,20002,PHI,18.0,3.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0,3.0,0.0,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,2.878788,0.0,0.0,0.924242,0.0,2.621212,0.0,1.439394,1.0,1.333333,1.80303,0.0,0.0,2.984848,0.212121,1.984848,0.590909,0.227273,1.0
3,2010,20002,PIT,18.0,1.0,1.0,3.0,0.0,1.0,1.0,1.0,0.0,2.0,0.0,0.0,2.0,1.0,0.0,1.0,2.0,2.0,0.0,0.5,1.628571,2.242857,0.085714,0.0,1.642857,0.514286,1.185714,0.514286,1.957143,1.557143,0.171429,0.957143,1.528571,0.9,1.857143,0.757143,0.0,2.0
4,2010,20003,CAR,18.0,2.0,0.0,2.0,2.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,1.984615,0.0,1.169231,0.892308,1.107692,1.738462,0.261538,0.523077,1.0,0.0,2.692308,0.630769,0.4,1.938462,1.061538,1.692308,0.907692,0.0,1.0


- **pivot table using game number as index by whether a team is visitor (1) or home (2)**. The table will display the quality of each player per position and team. The next step is to join columns by team and player quality value. We will have for each team 10 columns ( 5 positions x 2 type of player quality). We will rename the columns as following: VC1 shows the amount of elite centers for the visitor team, HC1 displays the amount of elite centers for the home team etc. We rename the columns and sort them based on team, position and quality. 

In [592]:
dy = pd.pivot_table(dy, index=['Season', 'GameNumber'], columns=['A'], values=['C1', 'C2', 'C3', 'C4', 'LW1', 'LW2', 'LW3', 'LW4', 'RW1', 'RW2', 'RW3', 'RW4', 'LD1', 'LD2', 'LD3', 'RD1', 'RD2', 'RD3'])
dy = dy.reset_index()
dy.columns = ['_'.join(str(s).strip() for s in col if s) for col in dy.columns]
dy = dy.reset_index()
dy = dy.rename(columns={'C1_1.0': 'VC1', 'C2_1.0': 'VC2', 'C3_1.0': 'VC3', 'C4_1.0': 'VC4', 'LW1_1.0': 'VLW1', 'LW2_1.0': 'VLW2', 'LW3_1.0': 'VLW3', 'LW4_1.0': 'VLW4', 'RW1_1.0': 'VRW1', 'RW2_1.0': 'VRW2', 'RW3_1.0': 'VRW3', 'RW4_1.0': 'VRW4', 'LD1_1.0': 'VLD1', 'LD2_1.0': 'VLD2', 'LD3_1.0': 'VLD3', 'RD1_1.0': 'VRD1', 'RD2_1.0': 'VRD2', 'RD3_1.0': 'VRD3', 'C1_2.0': 'HC1', 'C2_2.0': 'HC2', 'C3_2.0': 'HC3', 'C4_2.0': 'HC4', 'LW1_2.0': 'HLW1', 'LW2_2.0': 'HLW2', 'LW3_2.0': 'HLW3', 'LW4_2.0': 'HLW4', 'RW1_2.0': 'HRW1', 'RW2_2.0': 'HRW2', 'RW3_2.0': 'HRW3', 'RW4_2.0': 'HRW4', 'LD1_2.0': 'HLD1', 'LD2_2.0': 'HLD2', 'LD3_2.0': 'HLD3', 'RD1_2.0': 'HRD1', 'RD2_2.0': 'HRD2', 'RD3_2.0': 'HRD3'})
dy = dy[['Season', 'GameNumber', 'VC1', 'VC2', 'VC3', 'VC4', 'VLW1', 'VLW2', 'VLW3', 'VLW4', 'VRW1', 'VRW2', 'VRW3', 'VRW4', 'VLD1', 'VLD2', 'VLD3', 'VRD1', 'VRD2', 'VRD3', 'HC1', 'HC2', 'HC3', 'HC4', 'HLW1', 'HLW2', 'HLW3', 'HLW4', 'HRW1', 'HRW2', 'HRW3', 'HRW4', 'HLD1', 'HLD2', 'HLD3', 'HRD1', 'HRD2', 'HRD3']]
dy.sort_values(['Season', 'GameNumber'], ascending=[True, True], inplace=True)
dy.head()

Unnamed: 0,Season,GameNumber,VC1,VC2,VC3,VC4,VLW1,VLW2,VLW3,VLW4,VRW1,VRW2,VRW3,VRW4,VLD1,VLD2,VLD3,VRD1,VRD2,VRD3,HC1,HC2,HC3,HC4,HLW1,HLW2,HLW3,HLW4,HRW1,HRW2,HRW3,HRW4,HLD1,HLD2,HLD3,HRD1,HRD2,HRD3
0,2010,20001,0.0,2.0,2.0,1.0,0.0,1.0,2.0,0.0,0.0,2.0,1.0,1.0,0.0,1.0,0.0,3.0,1.0,1.0,0.0,1.0,2.0,1.0,0.0,2.0,0.0,2.0,2.0,1.0,0.0,1.0,2.0,1.0,0.0,1.0,1.0,1.0
1,2010,20002,3.0,0.0,0.0,1.0,0.0,3.0,0.0,2.0,1.0,1.0,1.0,0.0,0.0,3.0,0.0,2.0,0.0,1.0,1.0,1.0,3.0,0.0,0.0,2.0,0.0,0.0,1.0,2.0,2.0,0.0,1.0,1.0,1.0,2.0,1.0,0.0
2,2010,20003,2.0,0.0,2.0,2.0,1.0,1.0,0.0,1.0,1.0,0.0,2.0,0.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0,2.0,1.0,0.0,1.0,2.0,0.0,1.0,0.0,2.0,1.0,1.0,1.0,0.0,2.0,0.0,1.0,2.0
3,2010,20004,2.0,1.0,1.0,0.0,1.0,2.0,1.0,0.0,1.0,1.0,2.0,0.0,0.0,1.0,2.0,0.0,1.0,2.0,2.0,0.0,1.0,0.0,1.0,2.0,0.0,1.0,1.0,1.0,3.0,0.0,2.0,0.0,2.0,0.0,2.0,0.0
4,2010,20005,0.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,2.0,1.0,1.0,0.0,2.0,1.0,1.0,0.0,2.0,1.0,1.0,1.0,2.0,1.0,0.0,2.0,0.0,0.0,2.0,1.0,1.0


In [593]:
dy.shape

(879, 38)

In [594]:
#dy.to_csv('season_game_roster.csv', index='False', sep=',')

In [595]:
dy.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/roster/season_game_roster_by_goals_assists_points_per_toi.csv', index='False', sep=',')
#dy.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/roster/season_game_roster_by_goals_assists_points_per_toi.csv', index='False', sep=',')

# Roster Analysis

## season_level_analysis

#### $WinPc = \beta_{0} + \beta_{1}MeanC_{1} + \beta_{2}MeanC_{2}+ \beta_{3}MeanC_{3} + \beta_{4}MeanC_{4} + \beta_{5}MeanLW_{1} + \beta_{6}MeanLW_{2}+ \beta_{7}MeanLW_{3} + \beta_{8}MeanLW_{4} + \beta_{9}MeanRW_{1} + \beta_{10}MeanRW_{2}+ \beta_{11}MeanRW_{3} + \beta_{12}MeanRW_{4} + \beta_{13}MeanLD_{1} + \beta_{14}MeanLD_{2}+ \beta_{15}MeanLD_{3} + \beta_{16}MeanRD_{1} + \beta_{17}MeanRD_{2}+ \beta_{18}MeanRD_{3} + e_{s}$

- merge season_team dataset (dz) and season_team_roster_ranking (dv) for roster analysis at the season level. Use **ds** as the merging dataset.

In [596]:
ds = dv.merge(dz, on=['Season', 'TeamCode'], how='left')
ds.head()

Unnamed: 0,Season,TeamCode,GP,W,L,GF,GA,WinPc,LossPc,Mean_F,Mean_D,Rank_W,Rank_F,Rank_D,RosterCount,MeanC1,MeanC2,MeanC3,MeanC4,MeanLW1,MeanLW2,MeanLW3,MeanLW4,MeanRW1,MeanRW2,MeanRW3,MeanRW4,MeanLD1,MeanLD2,MeanLD3,MeanRD1,MeanRD2,MeanRD3
0,2010,VAN,58,40,18,198,137,0.689655,0.310345,2.482759,2.100575,1.0,15.0,24.0,18.0,2.0,0.0,0.896552,0.844828,1.0,0.810345,1.206897,1.155172,0.0,1.827586,1.827586,0.431034,0.793103,0.568966,1.741379,1.137931,0.965517,0.793103
1,2010,CHI,47,28,19,157,127,0.595745,0.404255,2.12234,2.219858,2.0,3.0,27.0,18.0,1.851064,0.787234,0.978723,0.659574,1.0,1.978723,1.0,0.0,0.93617,0.93617,1.787234,0.085106,0.361702,1.808511,0.893617,0.574468,1.0,1.361702
2,2010,PHI,66,39,27,219,188,0.590909,0.409091,2.22096,1.742424,3.0,4.0,11.0,18.0,2.878788,0.0,0.0,0.924242,0.0,2.621212,0.0,1.439394,1.0,1.333333,1.80303,0.0,0.0,2.984848,0.212121,1.984848,0.590909,0.227273
3,2010,DET,61,36,25,197,181,0.590164,0.409836,2.330601,1.980874,4.0,7.0,21.0,18.0,0.606557,1.803279,1.606557,0.180328,0.967213,1.836066,1.704918,0.311475,0.0,1.737705,1.245902,0.0,1.688525,0.918033,0.0,0.0,1.819672,1.57377
4,2010,ANA,60,35,25,169,168,0.583333,0.416667,2.568056,2.366667,5.0,20.0,29.0,18.0,0.8,0.983333,0.0,2.5,1.0,0.95,1.55,0.5,1.9,0.0,0.666667,1.15,0.95,0.2,1.766667,0.0,1.7,1.383333


- display the diffence in quality of forwards (DF) and defensemen (DD) per team.

In [597]:
ds['DC'] = ds['MeanC1'] - ds['MeanC2'] - ds['MeanC3'] - ds['MeanC4']
ds['DLW'] = ds['MeanLW1'] - ds['MeanLW2'] - ds['MeanLW3'] - ds['MeanLW4']
ds['DRW'] = ds['MeanRW1'] - ds['MeanRW2'] - ds['MeanRW3'] - ds['MeanRW4']
ds['DLD'] = ds['MeanLD1'] - ds['MeanLD2'] - ds['MeanLD3'] 
ds['DRD'] = ds['MeanRD1'] - ds['MeanRD2'] - ds['MeanRD3']

- mean goals for and mean goals against per team.

In [598]:
ds['meanGF'] = ds['GF']/ ds['GP']
ds['meanGA'] = ds['GA']/ ds['GP']

In [599]:
ds.shape

(30, 40)

### summary analysis

In [600]:
ds.describe()

Unnamed: 0,Season,GP,W,L,GF,GA,WinPc,LossPc,Mean_F,Mean_D,Rank_W,Rank_F,Rank_D,RosterCount,MeanC1,MeanC2,MeanC3,MeanC4,MeanLW1,MeanLW2,MeanLW3,MeanLW4,MeanRW1,MeanRW2,MeanRW3,MeanRW4,MeanLD1,MeanLD2,MeanLD3,MeanRD1,MeanRD2,MeanRD3,DC,DLW,DRW,DLD,DRD,meanGF,meanGA
count,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
mean,2010.0,58.6,29.3,29.3,167.7,167.7,0.494979,0.505021,2.463376,1.831794,15.5,15.5,15.5,18.0,0.816922,1.2879,1.365414,0.648304,0.720514,1.368929,1.048522,0.776203,0.563698,1.536697,1.335094,0.531803,1.376264,0.763705,0.812857,1.276777,0.939448,0.830949,-2.484697,-2.47314,-2.839895,-0.200298,-0.493621,2.836002,2.859379
std,0.0,12.237872,8.50213,8.183878,40.87251,41.150816,0.104917,0.104917,0.202526,0.313165,8.80145,8.803408,8.803408,0.0,0.832285,0.914813,0.795068,0.702511,0.619995,0.611453,0.639809,0.592124,0.560175,0.999333,0.800284,0.448286,0.806026,0.782058,0.56069,0.82296,0.613775,0.662571,1.659384,1.186146,1.266265,1.610431,1.728652,0.371673,0.335736
min,2010.0,19.0,4.0,15.0,32.0,56.0,0.210526,0.310345,2.078571,1.157143,1.0,1.0,1.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.192982,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-4.833333,-4.671875,-5.52381,-3.19697,-3.393443,1.684211,2.362069
25%,2010.0,58.0,22.5,25.0,158.5,157.0,0.460623,0.438796,2.341106,1.64881,8.25,8.25,8.25,18.0,0.0,0.804711,0.96059,0.185971,0.0,0.984982,0.547516,0.315696,0.0,0.757143,0.751722,0.124813,0.769171,0.028571,0.258007,0.633976,0.593182,0.138346,-3.760116,-3.403896,-3.581738,-1.505788,-1.654819,2.713258,2.569866
50%,2010.0,63.5,32.0,29.5,177.0,172.0,0.520398,0.479602,2.495733,1.818269,15.5,15.5,15.5,18.0,0.720919,1.0,1.485714,0.359649,0.951411,1.416667,1.091694,0.768681,0.640909,1.706694,1.275125,0.427638,1.5,0.62042,0.849794,1.458635,0.936605,0.944231,-2.797807,-2.356061,-3.177381,-0.210938,-0.157143,2.853945,2.840909
75%,2010.0,66.0,35.75,33.0,192.5,191.0,0.561204,0.539377,2.584573,2.038127,22.75,22.75,22.75,18.0,1.548522,1.80022,1.880008,0.916259,1.0,1.843632,1.531731,1.139678,0.925812,2.004051,1.799081,0.799219,1.909199,1.069253,1.090385,1.855357,1.394737,1.305024,-1.941187,-1.503571,-2.183042,0.957867,0.780128,3.056391,3.077518
max,2010.0,70.0,40.0,44.0,219.0,230.0,0.689655,0.789474,2.894737,2.471264,30.0,30.0,30.0,18.0,2.878788,3.710145,2.8,2.835821,2.0,2.621212,2.231884,2.078125,2.0,3.714286,3.366667,1.512821,2.877193,2.984848,1.974359,3.0,2.242424,2.465517,1.954545,0.085714,0.4375,2.649123,2.782609,3.413793,3.666667


### estimate roster model 

- regress **team win percent** on the mean of players by position and quality (predictor variables). Add a constant to the predictors and use **OLS**. The purpose is to deterimine the impact each roster position has on team winning percent.

In [601]:
print ('season level analysis (win percent) by mean roster position')
y = ds['WinPc']  
X = sm.add_constant(ds[['MeanC1', 'MeanC2', 'MeanC3', 'MeanC4','MeanLW1', 'MeanLW2', 'MeanLW3', 'MeanLW4', 'MeanRW1', 'MeanRW2', 'MeanRW3', 'MeanRW4', 'MeanLD1', 'MeanLD2', 'MeanLD3', 'MeanRD1', 'MeanRD2', 'MeanRD3' ]] )
result = sm.OLS(y, X).fit()
print(result.summary())

beginningtex = """\\documentclass{report}
\\usepackage{booktabs}
\\begin{document}"""
endtex = "\end{document}"

f = open('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/latex/roster/goals_assists_points_per_toi/seson_level_analysis.tex', 'w')
f.write(beginningtex)
f.write(result.summary().as_latex())
f.write(endtex)
f.close()

season level analysis (win percent) by mean roster position
                            OLS Regression Results                            
Dep. Variable:                  WinPc   R-squared:                       0.663
Model:                            OLS   Adj. R-squared:                  0.247
Method:                 Least Squares   F-statistic:                     1.595
Date:                Thu, 22 Feb 2018   Prob (F-statistic):              0.200
Time:                        03:50:08   Log-Likelihood:                 41.874
No. Observations:                  30   AIC:                            -49.75
Df Residuals:                      13   BIC:                            -25.93
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
-----------------------------------------------------------------------

- regress **team win percent** on the mean of top forwards. Add a constant to the predictors and use **OLS**. The purpose is to deterimine the impact each roster position has on team winning percent.

In [602]:
y = ds['WinPc']  
X = sm.add_constant(ds[['MeanC1']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,WinPc,R-squared:,0.072
Model:,OLS,Adj. R-squared:,0.038
Method:,Least Squares,F-statistic:,2.159
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.153
Time:,03:50:08,Log-Likelihood:,26.692
No. Observations:,30,AIC:,-49.38
Df Residuals:,28,BIC:,-46.58
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.4674,0.027,17.611,0.000,0.413 0.522
MeanC1,0.0337,0.023,1.469,0.153,-0.013 0.081

0,1,2,3
Omnibus:,6.807,Durbin-Watson:,0.199
Prob(Omnibus):,0.033,Jarque-Bera (JB):,5.509
Skew:,-1.032,Prob(JB):,0.0636
Kurtosis:,3.383,Cond. No.,2.45


- regress **team win percent** on the mean of players by position and quality (predictor variables). Add a constant to the predictors and use **Logit**.  The purpose is to deterimine the impact each roster position has on team winning percent.

In [603]:
y = ds['WinPc']  
X = sm.add_constant(ds[['MeanC1', 'MeanC2', 'MeanC3', 'MeanC4','MeanLW1', 'MeanLW2', 'MeanLW3', 'MeanLW4', 'MeanRW1', 'MeanRW2', 'MeanRW3', 'MeanRW4', 'MeanLD1', 'MeanLD2', 'MeanLD3', 'MeanRD1', 'MeanRD2', 'MeanRD3']] )
result = sm.Logit(y, X).fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.665809
         Iterations 4


0,1,2,3
Dep. Variable:,WinPc,No. Observations:,30.0
Model:,Logit,Df Residuals:,13.0
Method:,MLE,Df Model:,16.0
Date:,"Thu, 22 Feb 2018",Pseudo R-squ.:,0.0393
Time:,03:50:08,Log-Likelihood:,-19.974
converged:,True,LL-Null:,-20.791
,,LLR p-value:,1.0

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,0.0029,2.17e+07,1.36e-10,1.000,-4.25e+07 4.25e+07
MeanC1,0.1377,8.22e+06,1.68e-08,1.000,-1.61e+07 1.61e+07
MeanC2,0.0389,8.28e+06,4.71e-09,1.000,-1.62e+07 1.62e+07
MeanC3,-0.1115,8.14e+06,-1.37e-08,1.000,-1.6e+07 1.6e+07
MeanC4,-0.0243,8.16e+06,-2.98e-09,1.000,-1.6e+07 1.6e+07
MeanLW1,0.2284,8.23e+06,2.78e-08,1.000,-1.61e+07 1.61e+07
MeanLW2,0.0730,8.29e+06,8.8e-09,1.000,-1.63e+07 1.63e+07
MeanLW3,0.2799,8.2e+06,3.41e-08,1.000,-1.61e+07 1.61e+07
MeanLW4,0.0494,8.21e+06,6.02e-09,1.000,-1.61e+07 1.61e+07


- regress **team win percent** on the mean of top forwards. Add a constant to the predictors and use **Logit**. The purpose is to deterimine the impact each roster position has on team winning percent.

In [604]:
y = ds['WinPc']  
X = sm.add_constant(ds[['MeanC1']] )
result = sm.Logit(y, X).fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.690069
         Iterations 3


0,1,2,3
Dep. Variable:,WinPc,No. Observations:,30.0
Model:,Logit,Df Residuals:,28.0
Method:,MLE,Df Model:,1.0
Date:,"Thu, 22 Feb 2018",Pseudo R-squ.:,0.004299
Time:,03:50:08,Log-Likelihood:,-20.702
converged:,True,LL-Null:,-20.791
,,LLR p-value:,0.6724

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,-0.1305,0.517,-0.252,0.801,-1.144 0.883
MeanC1,0.1352,0.448,0.302,0.763,-0.742 1.013


- regress **team win percent** on the difference in the mean quality of forwards (DF). Add a constant to the predictors and use **OLS**. The purpose is to deterimine the impact each roster positi0n has on team win percent.

In [605]:
print ('season level analysis (win percent) by roster position differential')
y = ds['WinPc']  
X = sm.add_constant(ds[['DC', 'DLW', 'DRW', 'DLD', 'DRD']] )
result = sm.OLS(y, X).fit()
print(result.summary())

beginningtex = """\\documentclass{report}
\\usepackage{booktabs}
\\begin{document}"""
endtex = "\end{document}"

f = open('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/latex/roster/goals_assists_points_per_toi/season_level_analysis_with_roster_position_differential.tex', 'w')
f.write(beginningtex)
f.write(result.summary().as_latex())
f.write(endtex)
f.close()

season level analysis (win percent) by roster position differential
                            OLS Regression Results                            
Dep. Variable:                  WinPc   R-squared:                       0.203
Model:                            OLS   Adj. R-squared:                  0.036
Method:                 Least Squares   F-statistic:                     1.219
Date:                Thu, 22 Feb 2018   Prob (F-statistic):              0.330
Time:                        03:50:08   Log-Likelihood:                 28.974
No. Observations:                  30   AIC:                            -45.95
Df Residuals:                      24   BIC:                            -37.54
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
---------------------------------------------------------------

- regress **team win percent** on the difference in the mean quality of forwards (DF). Add a constant to the predictors and use **Logit**. The purpose is to deterimine the impact each roster positi0n has on team win percent.

In [606]:
y = ds['WinPc']  
X = sm.add_constant(ds[['DC', 'DLW', 'DRW']] )
result = sm.Logit(y, X).fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.685698
         Iterations 4


0,1,2,3
Dep. Variable:,WinPc,No. Observations:,30.0
Model:,Logit,Df Residuals:,26.0
Method:,MLE,Df Model:,3.0
Date:,"Thu, 22 Feb 2018",Pseudo R-squ.:,0.01061
Time:,03:50:08,Log-Likelihood:,-20.571
converged:,True,LL-Null:,-20.791
,,LLR p-value:,0.9316

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,0.4853,1.184,0.410,0.682,-1.836 2.806
DC,0.0459,0.239,0.192,0.848,-0.423 0.515
DLW,0.0422,0.331,0.127,0.899,-0.607 0.691
DRW,0.1011,0.300,0.337,0.736,-0.486 0.688


- regress **team win percent** on the difference in the mean quality of defensemen (DD). Add a constant to the predictors and use **OLS**. The purpose is to deterimine the impact each roster positi0n has on team win percent.

In [607]:
y = ds['WinPc']  
X = sm.add_constant(ds[['DLD', 'DRD']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,WinPc,R-squared:,0.033
Model:,OLS,Adj. R-squared:,-0.039
Method:,Least Squares,F-statistic:,0.4598
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.636
Time:,03:50:08,Log-Likelihood:,26.08
No. Observations:,30,AIC:,-46.16
Df Residuals:,27,BIC:,-41.96
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.4917,0.021,23.932,0.000,0.450 0.534
DLD,-0.0118,0.012,-0.957,0.347,-0.037 0.014
DRD,-0.0018,0.012,-0.153,0.880,-0.025 0.022

0,1,2,3
Omnibus:,6.09,Durbin-Watson:,0.152
Prob(Omnibus):,0.048,Jarque-Bera (JB):,4.633
Skew:,-0.934,Prob(JB):,0.0986
Kurtosis:,3.463,Cond. No.,1.94


- regress **team win percent** on the difference in the mean quality of defensemen (DD). Add a constant to the predictors and use **Logit**. The purpose is to deterimine the impact each roster positi0n has on team win percent.

In [608]:
y = ds['WinPc']  
X = sm.add_constant(ds[['DLD', 'DRD']] )
result = sm.Logit(y, X).fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.691666
         Iterations 3


0,1,2,3
Dep. Variable:,WinPc,No. Observations:,30.0
Model:,Logit,Df Residuals:,27.0
Method:,MLE,Df Model:,2.0
Date:,"Thu, 22 Feb 2018",Pseudo R-squ.:,0.001994
Time:,03:50:08,Log-Likelihood:,-20.75
converged:,True,LL-Null:,-20.791
,,LLR p-value:,0.9594

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,-0.0331,0.385,-0.086,0.931,-0.787 0.721
DLD,-0.0474,0.232,-0.204,0.838,-0.502 0.407
DRD,-0.0071,0.216,-0.033,0.974,-0.430 0.416


In [609]:
y = ds['WinPc']  
X = sm.add_constant(ds[['DC', 'DLW', 'DRW', 'DLD', 'DRD']] )
result = sm.Logit(y, X).fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.684595
         Iterations 4


0,1,2,3
Dep. Variable:,WinPc,No. Observations:,30.0
Model:,Logit,Df Residuals:,24.0
Method:,MLE,Df Model:,5.0
Date:,"Thu, 22 Feb 2018",Pseudo R-squ.:,0.0122
Time:,03:50:08,Log-Likelihood:,-20.538
converged:,True,LL-Null:,-20.791
,,LLR p-value:,0.9919

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,0.5110,1.238,0.413,0.680,-1.916 2.938
DC,0.0356,0.262,0.136,0.892,-0.479 0.550
DLW,0.0565,0.358,0.158,0.875,-0.646 0.759
DRW,0.1058,0.313,0.338,0.735,-0.508 0.719
DLD,-0.0372,0.262,-0.142,0.887,-0.551 0.476
DRD,0.0196,0.233,0.084,0.933,-0.437 0.476


#### mean goals regression

- regress **mean goals for** on the mean of players by position and quality (predictor variables). Add a constant to the predictors and use **OLS**. The purpose is to deterimine the impact each roster position has on team winning percent.

In [610]:
y = ds['meanGF']  
X = sm.add_constant(ds[['MeanC1', 'MeanC2', 'MeanC3', 'MeanC4','MeanLW1', 'MeanLW2', 'MeanLW3', 'MeanLW4', 'MeanRW1', 'MeanRW2', 'MeanRW3', 'MeanRW4', 'MeanLD1', 'MeanLD2', 'MeanLD3', 'MeanRD1', 'MeanRD2', 'MeanRD3']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,meanGF,R-squared:,0.886
Model:,OLS,Adj. R-squared:,0.745
Method:,Least Squares,F-statistic:,6.299
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.000892
Time:,03:50:08,Log-Likelihood:,20.172
No. Observations:,30,AIC:,-6.345
Df Residuals:,13,BIC:,17.48
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.1462,0.004,37.575,0.000,0.138 0.155
MeanC1,0.4500,0.122,3.701,0.003,0.187 0.713
MeanC2,0.2535,0.085,2.968,0.011,0.069 0.438
MeanC3,0.0558,0.064,0.876,0.397,-0.082 0.194
MeanC4,0.0941,0.079,1.188,0.256,-0.077 0.265
MeanLW1,0.2458,0.118,2.086,0.057,-0.009 0.500
MeanLW2,0.2134,0.078,2.740,0.017,0.045 0.382
MeanLW3,0.3317,0.082,4.028,0.001,0.154 0.510
MeanLW4,0.1292,0.092,1.402,0.184,-0.070 0.328

0,1,2,3
Omnibus:,9.922,Durbin-Watson:,1.896
Prob(Omnibus):,0.007,Jarque-Bera (JB):,8.529
Skew:,1.086,Prob(JB):,0.0141
Kurtosis:,4.451,Cond. No.,5.36e+16


- regress **mean goals against** on the mean of players by position and quality (predictor variables). Add a constant to the predictors and use **OLS**. The purpose is to deterimine the impact each roster position has on team winning percent.

In [611]:
y = ds['meanGA']  
X = sm.add_constant(ds[['MeanC1', 'MeanC2', 'MeanC3', 'MeanC4','MeanLW1', 'MeanLW2', 'MeanLW3', 'MeanLW4', 'MeanRW1', 'MeanRW2', 'MeanRW3', 'MeanRW4', 'MeanLD1', 'MeanLD2', 'MeanLD3', 'MeanRD1', 'MeanRD2', 'MeanRD3']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,meanGA,R-squared:,0.398
Model:,OLS,Adj. R-squared:,-0.343
Method:,Least Squares,F-statistic:,0.5375
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.88
Time:,03:50:09,Log-Likelihood:,-1.701
No. Observations:,30,AIC:,37.4
Df Residuals:,13,BIC:,61.22
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.1483,0.008,18.388,0.000,0.131 0.166
MeanC1,0.3222,0.252,1.278,0.224,-0.222 0.867
MeanC2,0.2398,0.177,1.354,0.199,-0.143 0.622
MeanC3,0.2980,0.132,2.255,0.042,0.013 0.583
MeanC4,0.1466,0.164,0.893,0.388,-0.208 0.501
MeanLW1,-0.0771,0.244,-0.316,0.757,-0.605 0.451
MeanLW2,0.0804,0.161,0.498,0.627,-0.269 0.429
MeanLW3,-0.0573,0.171,-0.336,0.743,-0.426 0.312
MeanLW4,-0.0302,0.191,-0.158,0.877,-0.443 0.383

0,1,2,3
Omnibus:,1.274,Durbin-Watson:,1.723
Prob(Omnibus):,0.529,Jarque-Bera (JB):,1.037
Skew:,-0.438,Prob(JB):,0.596
Kurtosis:,2.751,Cond. No.,5.36e+16


- regress **mean goals for** on the differential of players by position and quality (predictor variables). Add a constant to the predictors and use **OLS**. The purpose is to deterimine the impact each roster position has on team winning percent.

In [612]:
y = ds['meanGF']  
X = sm.add_constant(ds[['DC', 'DLW', 'DRW', 'DLD', 'DRD']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,meanGF,R-squared:,0.232
Model:,OLS,Adj. R-squared:,0.073
Method:,Least Squares,F-statistic:,1.454
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.242
Time:,03:50:09,Log-Likelihood:,-8.3984
No. Observations:,30,AIC:,28.8
Df Residuals:,24,BIC:,37.2
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,3.2830,0.220,14.912,0.000,2.829 3.737
DC,0.0965,0.047,2.065,0.050,6.65e-05 0.193
DLW,0.0180,0.064,0.283,0.780,-0.114 0.150
DRW,0.0506,0.056,0.908,0.373,-0.064 0.166
DLD,0.0372,0.047,0.796,0.434,-0.059 0.133
DRD,0.0234,0.042,0.564,0.578,-0.062 0.109

0,1,2,3
Omnibus:,6.089,Durbin-Watson:,1.374
Prob(Omnibus):,0.048,Jarque-Bera (JB):,4.352
Skew:,-0.839,Prob(JB):,0.113
Kurtosis:,3.818,Cond. No.,16.9


- regress **mean goals against** on the differential of players by position and quality (predictor variables). Add a constant to the predictors and use **OLS**. The purpose is to deterimine the impact each roster position has on team winning percent.

In [613]:
y = ds['meanGA']  
X = sm.add_constant(ds[['DC', 'DLW', 'DRW', 'DLD', 'DRD']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,meanGA,R-squared:,0.105
Model:,OLS,Adj. R-squared:,-0.082
Method:,Least Squares,F-statistic:,0.5622
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.728
Time:,03:50:09,Log-Likelihood:,-7.6553
No. Observations:,30,AIC:,27.31
Df Residuals:,24,BIC:,35.72
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,2.7569,0.215,12.837,0.000,2.314 3.200
DC,0.0106,0.046,0.232,0.819,-0.083 0.105
DLW,-0.0267,0.062,-0.428,0.672,-0.155 0.102
DRW,-0.0198,0.054,-0.364,0.719,-0.132 0.092
DLD,0.0543,0.046,1.192,0.245,-0.040 0.148
DRD,-0.0354,0.041,-0.873,0.391,-0.119 0.048

0,1,2,3
Omnibus:,1.139,Durbin-Watson:,1.044
Prob(Omnibus):,0.566,Jarque-Bera (JB):,0.871
Skew:,0.048,Prob(JB):,0.647
Kurtosis:,2.171,Cond. No.,16.9


## season_game_level_analysis

#### $HomeWin = \beta_{0} + \beta_{1}DC_{1} + \beta_{2}DC_{2} + \beta_{3}DC_{3} + \beta_{4}DC_{4} + \beta_{5}DLW_{1} + \beta_{6}DLW_{2} + \beta_{7}DLW_{3} + \beta_{8}DLW_{4} + + \beta_{9}DRW_{1} + \beta_{10}DRW_{2} + \beta_{11}DRW_{3} + \beta_{12}DRW_{4} + \beta_{13}DLD_{1} + \beta_{14}DLD_{2} + \beta_{15}DLD_{3} + \beta_{16}DRD_{1}+ \beta_{17}DRD_{2} + \beta_{18}DRD_{3}+ e_{s,g}$

- merge season game data (dg) and season game roster (dy).

In [614]:
dl = dg.merge(dy, on=['Season', 'GameNumber'], how='left')
dl.head()

Unnamed: 0,Season,GameNumber,VTeamCode,HTeamCode,VGF,HGF,GD,VGA,HGA,WinTeam,LossTeam,VC1,VC2,VC3,VC4,VLW1,VLW2,VLW3,VLW4,VRW1,VRW2,VRW3,VRW4,VLD1,VLD2,VLD3,VRD1,VRD2,VRD3,HC1,HC2,HC3,HC4,HLW1,HLW2,HLW3,HLW4,HRW1,HRW2,HRW3,HRW4,HLD1,HLD2,HLD3,HRD1,HRD2,HRD3
0,2010,20001,MTL,TOR,2,3,1,3,2,TOR,MTL,0.0,2.0,2.0,1.0,0.0,1.0,2.0,0.0,0.0,2.0,1.0,1.0,0.0,1.0,0.0,3.0,1.0,1.0,0.0,1.0,2.0,1.0,0.0,2.0,0.0,2.0,2.0,1.0,0.0,1.0,2.0,1.0,0.0,1.0,1.0,1.0
1,2010,20002,PHI,PIT,3,2,-1,2,3,PHI,PIT,3.0,0.0,0.0,1.0,0.0,3.0,0.0,2.0,1.0,1.0,1.0,0.0,0.0,3.0,0.0,2.0,0.0,1.0,1.0,1.0,3.0,0.0,0.0,2.0,0.0,0.0,1.0,2.0,2.0,0.0,1.0,1.0,1.0,2.0,1.0,0.0
2,2010,20003,CAR,MIN,4,3,-1,3,4,CAR,MIN,2.0,0.0,2.0,2.0,1.0,1.0,0.0,1.0,1.0,0.0,2.0,0.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0,2.0,1.0,0.0,1.0,2.0,0.0,1.0,0.0,2.0,1.0,1.0,1.0,0.0,2.0,0.0,1.0,2.0
3,2010,20004,CHI,COL,3,4,1,4,3,COL,CHI,2.0,1.0,1.0,0.0,1.0,2.0,1.0,0.0,1.0,1.0,2.0,0.0,0.0,1.0,2.0,0.0,1.0,2.0,2.0,0.0,1.0,0.0,1.0,2.0,0.0,1.0,1.0,1.0,3.0,0.0,2.0,0.0,2.0,0.0,2.0,0.0
4,2010,20005,CGY,EDM,0,4,4,4,0,EDM,CGY,0.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,2.0,1.0,1.0,0.0,2.0,1.0,1.0,0.0,2.0,1.0,1.0,1.0,2.0,1.0,0.0,2.0,0.0,0.0,2.0,1.0,1.0


- determine if the home or away team won the game.

In [615]:
dl['WinTeam'] = dl.apply(lambda x: 'HOME' if x['GD'] > 0 else 'AWAY', axis=1)

- Calculate the difference between player quality per game for all positions with respect to home team ( Home Team - Visitor Team). There are 5 positions and 2 types of player quality. This will give us a total of 10 differenecs. 

In [616]:
dl.shape

(1230, 47)

- total of forwards and defensemen by team per game.

In [617]:
dl['VF'] = dl['VC1'] + dl['VC2'] + dl['VC3'] + dl['VC4'] + dl['VLW1'] + dl['VLW2'] + dl['VLW3'] + dl['VLW4'] + dl['VRW1'] + dl['VRW2'] + dl['VRW3'] + dl['VRW4']
dl['VD'] = dl['VLD1'] + dl['VLD2'] + dl['VLD3'] + dl['VRD1'] + dl['VRD2'] + dl['VRD3']
dl['HF'] = dl['HC1'] + dl['HC2'] + dl['HC3'] + dl['HC4'] + dl['HLW1'] + dl['HLW2'] + dl['HLW3'] + dl['HLW4'] + dl['HRW1'] + dl['HRW2'] + dl['HRW3'] + dl['HRW4']
dl['HD'] = dl['HLD1'] + dl['HLD2'] + dl['HLD3'] + dl['HRD1'] + dl['HRD2'] + dl['HRD3']

- total of forwards and defensemen per game.

In [618]:
dl['F'] = dl['VF'] + dl['HF']
dl['D'] = dl['VD'] + dl['HD']
dl.head()

Unnamed: 0,Season,GameNumber,VTeamCode,HTeamCode,VGF,HGF,GD,VGA,HGA,WinTeam,LossTeam,VC1,VC2,VC3,VC4,VLW1,VLW2,VLW3,VLW4,VRW1,VRW2,VRW3,VRW4,VLD1,VLD2,VLD3,VRD1,VRD2,VRD3,HC1,HC2,HC3,HC4,HLW1,HLW2,HLW3,HLW4,HRW1,HRW2,HRW3,HRW4,HLD1,HLD2,HLD3,HRD1,HRD2,HRD3,VF,VD,HF,HD,F,D
0,2010,20001,MTL,TOR,2,3,1,3,2,HOME,MTL,0.0,2.0,2.0,1.0,0.0,1.0,2.0,0.0,0.0,2.0,1.0,1.0,0.0,1.0,0.0,3.0,1.0,1.0,0.0,1.0,2.0,1.0,0.0,2.0,0.0,2.0,2.0,1.0,0.0,1.0,2.0,1.0,0.0,1.0,1.0,1.0,12.0,6.0,12.0,6.0,24.0,12.0
1,2010,20002,PHI,PIT,3,2,-1,2,3,AWAY,PIT,3.0,0.0,0.0,1.0,0.0,3.0,0.0,2.0,1.0,1.0,1.0,0.0,0.0,3.0,0.0,2.0,0.0,1.0,1.0,1.0,3.0,0.0,0.0,2.0,0.0,0.0,1.0,2.0,2.0,0.0,1.0,1.0,1.0,2.0,1.0,0.0,12.0,6.0,12.0,6.0,24.0,12.0
2,2010,20003,CAR,MIN,4,3,-1,3,4,AWAY,MIN,2.0,0.0,2.0,2.0,1.0,1.0,0.0,1.0,1.0,0.0,2.0,0.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0,2.0,1.0,0.0,1.0,2.0,0.0,1.0,0.0,2.0,1.0,1.0,1.0,0.0,2.0,0.0,1.0,2.0,12.0,6.0,12.0,6.0,24.0,12.0
3,2010,20004,CHI,COL,3,4,1,4,3,HOME,CHI,2.0,1.0,1.0,0.0,1.0,2.0,1.0,0.0,1.0,1.0,2.0,0.0,0.0,1.0,2.0,0.0,1.0,2.0,2.0,0.0,1.0,0.0,1.0,2.0,0.0,1.0,1.0,1.0,3.0,0.0,2.0,0.0,2.0,0.0,2.0,0.0,12.0,6.0,12.0,6.0,24.0,12.0
4,2010,20005,CGY,EDM,0,4,4,4,0,HOME,CGY,0.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,2.0,1.0,1.0,0.0,2.0,1.0,1.0,0.0,2.0,1.0,1.0,1.0,2.0,1.0,0.0,2.0,0.0,0.0,2.0,1.0,1.0,12.0,6.0,12.0,6.0,24.0,12.0


- **keep games with 12 forwards and 6 defensemen per team.**

In [619]:
dl = dl[((dl['VF'] == 12) & (dl['VD'] == 6) & (dl['HF'] == 12) & (dl['HD'] == 6))]

In [620]:
dl.shape

(879, 53)

In [621]:
dl['VF'].value_counts()

12.0    879
Name: VF, dtype: int64

In [622]:
dl['VD'].value_counts()

6.0    879
Name: VD, dtype: int64

In [623]:
dl['HF'].value_counts()

12.0    879
Name: HF, dtype: int64

In [624]:
dl['HD'].value_counts()

6.0    879
Name: HD, dtype: int64

### summary analysis

In [625]:
dl.describe()

Unnamed: 0,Season,GameNumber,VGF,HGF,GD,VGA,HGA,VC1,VC2,VC3,VC4,VLW1,VLW2,VLW3,VLW4,VRW1,VRW2,VRW3,VRW4,VLD1,VLD2,VLD3,VRD1,VRD2,VRD3,HC1,HC2,HC3,HC4,HLW1,HLW2,HLW3,HLW4,HRW1,HRW2,HRW3,HRW4,HLD1,HLD2,HLD3,HRD1,HRD2,HRD3,VF,VD,HF,HD,F,D
count,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0
mean,2010.0,20613.112628,2.763367,2.960182,0.196815,2.960182,2.763367,0.750853,1.357224,1.458476,0.699659,0.775882,1.386803,1.03868,0.622298,0.535836,1.484642,1.326507,0.56314,1.492605,0.705347,0.812287,1.31058,0.943117,0.736064,0.86917,1.224118,1.277588,0.623436,0.614334,1.425484,1.027304,0.857793,0.606371,1.68942,1.293515,0.491468,1.260523,0.902162,0.811149,1.235495,0.94653,0.844141,12.0,6.0,12.0,6.0,24.0,12.0
std,0.0,349.491101,1.672088,1.727312,2.455284,1.727312,1.672088,0.831085,0.980349,0.900667,0.960916,0.585365,0.703463,0.810442,0.700311,0.619189,1.072297,0.991589,0.663261,0.935081,0.797694,0.774028,0.987514,0.720001,0.749158,0.883894,1.021884,0.889235,0.731925,0.678551,0.721354,0.766129,0.817814,0.611027,1.089509,0.837798,0.618329,0.931198,0.890078,0.756634,0.910739,0.735131,0.860184,0.0,0.0,0.0,0.0,0.0,0.0
min,2010.0,20001.0,0.0,0.0,-8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,6.0,12.0,6.0,24.0,12.0
25%,2010.0,20317.5,2.0,2.0,-1.0,2.0,2.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.5,0.0,0.0,12.0,6.0,12.0,6.0,24.0,12.0
50%,2010.0,20613.0,3.0,3.0,1.0,3.0,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,2.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,12.0,6.0,12.0,6.0,24.0,12.0
75%,2010.0,20913.5,4.0,4.0,2.0,4.0,4.0,1.0,2.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,12.0,6.0,12.0,6.0,24.0,12.0
max,2010.0,21230.0,10.0,9.0,7.0,9.0,10.0,3.0,4.0,3.0,5.0,2.0,3.0,3.0,4.0,2.0,5.0,4.0,3.0,4.0,3.0,4.0,4.0,3.0,3.0,3.0,4.0,3.0,5.0,2.0,3.0,3.0,4.0,2.0,5.0,4.0,3.0,4.0,3.0,4.0,4.0,3.0,3.0,12.0,6.0,12.0,6.0,24.0,12.0


In [626]:
#dl = dl[['Season', 'GameNumber', 'VTeamCode', 'HTeamCode', 'HGF', 'VGF', 'GD','WinTeam', 'VF1', 'VF2', 'VD1', 'VD2', 'HF1', 'HF2', 'HD1', 'HD2']]

- determine if a game was won by the home or visitor team.
- compute the difference in quality of forwards and defensemen between home and visitor team per game (DF1, DF2, DD1, DD2). 

In [627]:
dl['HomeWin'] = dl.apply(lambda x: 1 if x['WinTeam']=='HOME' else 0, axis=1)

# difference in center position per game
dl['DC1'] = dl['HC1'] - dl['VC1']
dl['DC2'] = dl['HC2'] - dl['VC2']
dl['DC3'] = dl['HC3'] - dl['VC3']
dl['DC4'] = dl['HC4'] - dl['VC4']

# difference in left position per game
dl['DLW1'] = dl['HLW1'] - dl['VLW1']
dl['DLW2'] = dl['HLW2'] - dl['VLW2']
dl['DLW3'] = dl['HLW3'] - dl['VLW3']
dl['DLW4'] = dl['HLW4'] - dl['VLW4']

# difference in right wing position per game
dl['DRW1'] = dl['HRW1'] - dl['VRW1']
dl['DRW2'] = dl['HRW2'] - dl['VRW2']
dl['DRW3'] = dl['HRW3'] - dl['VRW3']
dl['DRW4'] = dl['HRW4'] - dl['VRW4']

# difference in left defence position per game
dl['DLD1'] = dl['HLD1'] - dl['VLD1']
dl['DLD2'] = dl['HLD2'] - dl['VLD2']
dl['DLD3'] = dl['HLD3'] - dl['VLD3']

# difference in right defence position per game
dl['DRD1'] = dl['HRD1'] - dl['VRD1']
dl['DRD2'] = dl['HRD2'] - dl['VRD2']
dl['DRD3'] = dl['HRD3'] - dl['VRD3']

In [628]:
dl.groupby(['WinTeam'])['DC1', 'DC2', 'DC3', 'DC4', 'DLW1', 'DLW2', 'DLW3', 'DLW4', 'DRW1', 'DRW2', 'DRW3', 'DRW4', 'DLD1', 'DLD2', 'DLD3', 'DRD1', 'DRD2', 'DRD3', ].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,DC1,DC2,DC3,DC4,DLW1,DLW2,DLW3,DLW4,DRW1,DRW2,DRW3,DRW4,DLD1,DLD2,DLD3,DRD1,DRD2,DRD3
WinTeam,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
AWAY,count,422.0,422.0,422.0,422.0,422.0,422.0,422.0,422.0,422.0,422.0,422.0,422.0,422.0,422.0,422.0,422.0,422.0,422.0
AWAY,mean,0.111374,-0.2109,-0.125592,-0.042654,-0.182464,0.021327,-0.035545,0.253555,0.087678,0.14218,0.023697,-0.042654,-0.253555,0.218009,0.009479,-0.123223,-0.009479,0.158768
AWAY,std,1.296593,1.489655,1.282389,1.21083,0.897278,1.020931,1.14139,0.998657,0.916665,1.563744,1.268814,0.868056,1.302172,1.205631,1.091929,1.329004,1.022272,1.186186
AWAY,min,-3.0,-4.0,-3.0,-4.0,-2.0,-3.0,-3.0,-2.0,-2.0,-4.0,-4.0,-3.0,-3.0,-3.0,-3.0,-4.0,-2.0,-3.0
AWAY,25%,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
AWAY,50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AWAY,75%,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
AWAY,max,3.0,4.0,3.0,4.0,2.0,3.0,3.0,3.0,2.0,5.0,3.0,2.0,3.0,3.0,2.0,3.0,2.0,3.0
HOME,count,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0
HOME,mean,0.124726,-0.061269,-0.231947,-0.107221,-0.142232,0.054705,0.010941,0.218818,0.054705,0.262582,-0.085339,-0.098468,-0.212254,0.177243,-0.010941,-0.030635,0.015317,0.061269


### estimate roster model

- regress **home win** on the difference in number of home and visitor players by position and quality. Add a constant to the predictors and use OLS. The purpose is to deterimine the impact each roster position has on home team success.

In [629]:
print ('season game level analysis (home win) by roster position differential')
y = dl['HomeWin']  
X = sm.add_constant(dl[['DC1', 'DC2', 'DC3', 'DC4', 'DLW1', 'DLW2', 'DLW3', 'DLW4', 'DRW1', 'DRW2', 'DRW3', 'DRW4', 'DLD1', 'DLD2', 'DLD3', 'DRD1', 'DRD2', 'DRD3']] )
result = sm.OLS(y, X).fit()
print(result.summary())

beginningtex = """\\documentclass{report}
\\usepackage{booktabs}
\\begin{document}"""
endtex = "\end{document}"

f = open('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/latex/roster/goals_assists_points_per_toi/season_game_level_analyis.tex', 'w')
f.write(beginningtex)
f.write(result.summary().as_latex())
f.write(endtex)
f.close()

season game level analysis (home win) by roster position differential
                            OLS Regression Results                            
Dep. Variable:                HomeWin   R-squared:                       0.013
Model:                            OLS   Adj. R-squared:                 -0.006
Method:                 Least Squares   F-statistic:                    0.6925
Date:                Thu, 22 Feb 2018   Prob (F-statistic):              0.803
Time:                        03:50:10   Log-Likelihood:                -631.66
No. Observations:                 879   AIC:                             1297.
Df Residuals:                     862   BIC:                             1379.
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
-------------------------------------------------------------

In [630]:
result.params

const    0.516046
DC1      0.025118
DC2      0.020110
DC3     -0.015404
DC4     -0.009731
DLW1     0.003310
DLW2     0.013820
DLW3     0.005891
DLW4     0.007388
DRW1    -0.012274
DRW2     0.003111
DRW3    -0.019424
DRW4    -0.021915
DLD1    -0.002808
DLD2    -0.013050
DLD3     0.004764
DRD1     0.011277
DRD2     0.007905
DRD3    -0.008089
dtype: float64

- By increasing the differential of **elite center** (home team – visitor team) by one unit, home win **increases** by 2.5 games.
- By increasing the differential of **secondary center** (home team – visitor team) by one unit, home win **increases** by 2 games respectfully.

- regress **home win** on the difference in number of elite home and visitor players by position (DC1, DLW1, DRW1, DLD1, DRD1). Add a constant to the predictors and use **OLS**. The purpose is to deterimine the impact each roster position has on home team success.

In [631]:
y = dl['HomeWin']  
X = sm.add_constant(dl[['DC1', 'DC2', 'DC3', 'DC4']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,HomeWin,R-squared:,0.006
Model:,OLS,Adj. R-squared:,0.002
Method:,Least Squares,F-statistic:,1.378
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.24
Time:,03:50:10,Log-Likelihood:,-634.51
No. Observations:,879,AIC:,1279.0
Df Residuals:,874,BIC:,1303.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.5188,0.017,29.710,0.000,0.485 0.553
DC1,0.0264,0.022,1.182,0.237,-0.017 0.070
DC2,0.0305,0.020,1.502,0.133,-0.009 0.070
DC3,-0.0083,0.018,-0.470,0.638,-0.043 0.026
DC4,-0.0073,0.017,-0.436,0.663,-0.040 0.026

0,1,2,3
Omnibus:,0.942,Durbin-Watson:,1.902
Prob(Omnibus):,0.625,Jarque-Bera (JB):,142.927
Skew:,-0.08,Prob(JB):,9.2e-32
Kurtosis:,1.031,Cond. No.,3.55


In [632]:
y = dl['HomeWin']  
X = sm.add_constant(dl[['DLW1', 'DLW2', 'DLW3', 'DLW4']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,HomeWin,R-squared:,0.002
Model:,OLS,Adj. R-squared:,-0.002
Method:,Least Squares,F-statistic:,0.4777
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.752
Time:,03:50:10,Log-Likelihood:,-636.31
No. Observations:,879,AIC:,1283.0
Df Residuals:,874,BIC:,1307.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.5219,0.018,29.748,0.000,0.488 0.556
DLW1,0.0216,0.021,1.042,0.298,-0.019 0.062
DLW2,0.0193,0.019,0.995,0.320,-0.019 0.057
DLW3,0.0173,0.019,0.929,0.353,-0.019 0.054
DLW4,0.0039,0.019,0.204,0.838,-0.033 0.041

0,1,2,3
Omnibus:,0.932,Durbin-Watson:,1.888
Prob(Omnibus):,0.627,Jarque-Bera (JB):,145.229
Skew:,-0.079,Prob(JB):,2.9100000000000003e-32
Kurtosis:,1.015,Cond. No.,2.28


In [633]:
y = dl['HomeWin']  
X = sm.add_constant(dl[['DRW1', 'DRW2', 'DRW3', 'DRW4']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,HomeWin,R-squared:,0.004
Model:,OLS,Adj. R-squared:,-0.0
Method:,Least Squares,F-statistic:,0.9425
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.439
Time:,03:50:10,Log-Likelihood:,-635.38
No. Observations:,879,AIC:,1281.0
Df Residuals:,874,BIC:,1305.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.5173,0.017,30.050,0.000,0.484 0.551
DRW1,-0.0094,0.021,-0.454,0.650,-0.050 0.031
DRW2,0.0037,0.013,0.283,0.777,-0.022 0.030
DRW3,-0.0195,0.015,-1.275,0.203,-0.049 0.011
DRW4,-0.0253,0.020,-1.268,0.205,-0.064 0.014

0,1,2,3
Omnibus:,0.922,Durbin-Watson:,1.883
Prob(Omnibus):,0.631,Jarque-Bera (JB):,144.001
Skew:,-0.079,Prob(JB):,5.38e-32
Kurtosis:,1.023,Cond. No.,2.49


In [634]:
y = dl['HomeWin']  
X = sm.add_constant(dl[['DLD1', 'DLD2', 'DLD3']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,HomeWin,R-squared:,0.0
Model:,OLS,Adj. R-squared:,-0.003
Method:,Least Squares,F-statistic:,0.1428
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.934
Time:,03:50:10,Log-Likelihood:,-637.06
No. Observations:,879,AIC:,1282.0
Df Residuals:,875,BIC:,1301.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.5216,0.017,30.252,0.000,0.488 0.555
DLD1,-0.0001,0.019,-0.007,0.995,-0.038 0.037
DLD2,-0.0089,0.020,-0.455,0.650,-0.047 0.030
DLD3,-0.0064,0.021,-0.312,0.755,-0.047 0.034

0,1,2,3
Omnibus:,0.943,Durbin-Watson:,1.888
Prob(Omnibus):,0.624,Jarque-Bera (JB):,146.215
Skew:,-0.08,Prob(JB):,1.78e-32
Kurtosis:,1.008,Cond. No.,2.79


In [635]:
y = dl['HomeWin']  
X = sm.add_constant(dl[['DRD1', 'DRD2', 'DRD3']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,HomeWin,R-squared:,0.003
Model:,OLS,Adj. R-squared:,-0.0
Method:,Least Squares,F-statistic:,0.904
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.439
Time:,03:50:10,Log-Likelihood:,-635.91
No. Observations:,879,AIC:,1280.0
Df Residuals:,875,BIC:,1299.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.5220,0.017,30.824,0.000,0.489 0.555
DRD1,0.0214,0.021,1.024,0.306,-0.020 0.062
DRD2,0.0228,0.023,1.000,0.317,-0.022 0.067
DRD3,-0.0056,0.019,-0.302,0.763,-0.042 0.031

0,1,2,3
Omnibus:,0.931,Durbin-Watson:,1.894
Prob(Omnibus):,0.628,Jarque-Bera (JB):,144.714
Skew:,-0.079,Prob(JB):,3.77e-32
Kurtosis:,1.019,Cond. No.,3.04


- regress **home win** on the difference in number of elite home and visitor players by position (DC1, DLW1, DRW1, DLD1, DRD1). Add a constant to the predictors and use **Logit**. The purpose is to deterimine the impact each roster position has on home team success.

In [636]:
y = dl['HomeWin']  
X = sm.add_constant(dl[['DC1', 'DLW1', 'DRW1', 'DLD1', 'DRD1']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,HomeWin,R-squared:,0.002
Model:,OLS,Adj. R-squared:,-0.004
Method:,Least Squares,F-statistic:,0.3631
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.874
Time:,03:50:10,Log-Likelihood:,-636.36
No. Observations:,879,AIC:,1285.0
Df Residuals:,873,BIC:,1313.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.5238,0.018,29.891,0.000,0.489 0.558
DC1,0.0045,0.016,0.279,0.780,-0.027 0.036
DLW1,0.0080,0.021,0.385,0.700,-0.033 0.048
DRW1,-0.0064,0.019,-0.340,0.734,-0.043 0.031
DLD1,0.0075,0.014,0.523,0.601,-0.021 0.036
DRD1,0.0127,0.013,0.962,0.337,-0.013 0.039

0,1,2,3
Omnibus:,0.936,Durbin-Watson:,1.892
Prob(Omnibus):,0.626,Jarque-Bera (JB):,145.284
Skew:,-0.079,Prob(JB):,2.83e-32
Kurtosis:,1.015,Cond. No.,2.19


In [637]:
y = dl['HomeWin']  
X = sm.add_constant(dl[['DC1', 'DLW1', 'DRW1', 'DLD1', 'DRD1']] )
result = sm.Logit(y, X).fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.691315
         Iterations 4


0,1,2,3
Dep. Variable:,HomeWin,No. Observations:,879.0
Model:,Logit,Df Residuals:,873.0
Method:,MLE,Df Model:,5.0
Date:,"Thu, 22 Feb 2018",Pseudo R-squ.:,0.0015
Time:,03:50:10,Log-Likelihood:,-607.67
converged:,True,LL-Null:,-608.58
,,LLR p-value:,0.8726

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,0.0955,0.070,1.361,0.173,-0.042 0.233
DC1,0.0182,0.065,0.280,0.779,-0.109 0.146
DLW1,0.0320,0.083,0.387,0.699,-0.130 0.194
DRW1,-0.0257,0.075,-0.341,0.733,-0.174 0.122
DLD1,0.0302,0.058,0.525,0.600,-0.083 0.143
DRD1,0.0508,0.053,0.964,0.335,-0.053 0.154


- regress **home win** on the difference in number of secondary quality home and visitor players by position (DC2, DLW2, DRW2, DLD2, DRD2). Add a constant to the predictors and use **OLS** and **Logit**. The purpose is to deterimine the impact each roster position has on home team success.

In [638]:
y = dl['HomeWin']  
X = sm.add_constant(dl[['DC2', 'DLW2', 'DRW2', 'DLD2', 'DRD2']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,HomeWin,R-squared:,0.004
Model:,OLS,Adj. R-squared:,-0.002
Method:,Least Squares,F-statistic:,0.7103
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.616
Time:,03:50:10,Log-Likelihood:,-635.49
No. Observations:,879,AIC:,1283.0
Df Residuals:,873,BIC:,1312.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.5207,0.017,29.956,0.000,0.487 0.555
DC2,0.0149,0.013,1.135,0.257,-0.011 0.041
DLW2,0.0144,0.018,0.818,0.414,-0.020 0.049
DRW2,0.0087,0.012,0.727,0.468,-0.015 0.032
DLD2,-0.0061,0.015,-0.401,0.688,-0.036 0.024
DRD2,0.0018,0.017,0.105,0.916,-0.032 0.036

0,1,2,3
Omnibus:,0.943,Durbin-Watson:,1.885
Prob(Omnibus):,0.624,Jarque-Bera (JB):,144.134
Skew:,-0.08,Prob(JB):,5.03e-32
Kurtosis:,1.023,Cond. No.,2.16


In [639]:
y = dl['HomeWin']  
X = sm.add_constant(dl[['DC2', 'DLW2', 'DRW2', 'DLD2', 'DRD2']] )
result = sm.Logit(y, X).fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.690325
         Iterations 4


0,1,2,3
Dep. Variable:,HomeWin,No. Observations:,879.0
Model:,Logit,Df Residuals:,873.0
Method:,MLE,Df Model:,5.0
Date:,"Thu, 22 Feb 2018",Pseudo R-squ.:,0.002932
Time:,03:50:10,Log-Likelihood:,-606.8
converged:,True,LL-Null:,-608.58
,,LLR p-value:,0.6131

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,0.0834,0.070,1.196,0.232,-0.053 0.220
DC2,0.0598,0.053,1.137,0.255,-0.043 0.163
DLW2,0.0579,0.071,0.820,0.412,-0.081 0.196
DRW2,0.0351,0.048,0.729,0.466,-0.059 0.129
DLD2,-0.0246,0.061,-0.403,0.687,-0.144 0.095
DRD2,0.0073,0.069,0.106,0.916,-0.128 0.143


- regress **home win** on the difference in number of third quality home and visitor players by position (DC3, DLW3, DRW3, DLD3, DRD3). Add a constant to the predictors and use **OLS** and **Logit**. The purpose is to deterimine the impact each roster position has on home team success.

In [640]:
y = dl['HomeWin']  
X = sm.add_constant(dl[['DC3', 'DLW3', 'DRW3', 'DLD3', 'DRD3']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,HomeWin,R-squared:,0.006
Model:,OLS,Adj. R-squared:,-0.0
Method:,Least Squares,F-statistic:,0.9969
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.418
Time:,03:50:10,Log-Likelihood:,-634.77
No. Observations:,879,AIC:,1282.0
Df Residuals:,873,BIC:,1310.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.5180,0.017,30.259,0.000,0.484 0.552
DC3,-0.0180,0.014,-1.326,0.185,-0.045 0.009
DLW3,0.0054,0.015,0.354,0.723,-0.025 0.035
DRW3,-0.0159,0.013,-1.189,0.235,-0.042 0.010
DLD3,-0.0052,0.015,-0.341,0.734,-0.035 0.025
DRD3,-0.0171,0.014,-1.192,0.233,-0.045 0.011

0,1,2,3
Omnibus:,0.931,Durbin-Watson:,1.896
Prob(Omnibus):,0.628,Jarque-Bera (JB):,143.21
Skew:,-0.079,Prob(JB):,7.990000000000001e-32
Kurtosis:,1.029,Cond. No.,1.43


In [641]:
y = dl['HomeWin']  
X = sm.add_constant(dl[['DC3', 'DLW3', 'DRW3', 'DLD3', 'DRD3']] )
result = sm.Logit(y, X).fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.689505
         Iterations 4


0,1,2,3
Dep. Variable:,HomeWin,No. Observations:,879.0
Model:,Logit,Df Residuals:,873.0
Method:,MLE,Df Model:,5.0
Date:,"Thu, 22 Feb 2018",Pseudo R-squ.:,0.004115
Time:,03:50:10,Log-Likelihood:,-606.07
converged:,True,LL-Null:,-608.58
,,LLR p-value:,0.4148

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,0.0726,0.069,1.057,0.291,-0.062 0.207
DC3,-0.0726,0.055,-1.330,0.184,-0.180 0.034
DLW3,0.0219,0.061,0.356,0.722,-0.098 0.142
DRW3,-0.0644,0.054,-1.193,0.233,-0.170 0.041
DLD3,-0.0210,0.061,-0.343,0.732,-0.141 0.099
DRD3,-0.0690,0.058,-1.196,0.232,-0.182 0.044


- regress **home win** on the difference in number of bottom quality home and visitor forwards (DC4, DLW4, DRW4). Add a constant to the predictors and use **OLS** and **Logit**. The purpose is to deterimine the impact each roster position has on home team success.

In [642]:
y = dl['HomeWin']  
X = sm.add_constant(dl[['DC4', 'DLW4', 'DRW4']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,HomeWin,R-squared:,0.002
Model:,OLS,Adj. R-squared:,-0.002
Method:,Least Squares,F-statistic:,0.5328
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.66
Time:,03:50:10,Log-Likelihood:,-636.47
No. Observations:,879,AIC:,1281.0
Df Residuals:,875,BIC:,1300.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.5200,0.017,29.969,0.000,0.486 0.554
DC4,-0.0097,0.014,-0.701,0.483,-0.037 0.018
DLW4,-0.0085,0.016,-0.530,0.596,-0.040 0.023
DRW4,-0.0161,0.019,-0.850,0.396,-0.053 0.021

0,1,2,3
Omnibus:,0.923,Durbin-Watson:,1.889
Prob(Omnibus):,0.63,Jarque-Bera (JB):,145.437
Skew:,-0.079,Prob(JB):,2.62e-32
Kurtosis:,1.014,Cond. No.,1.44


In [643]:
y = dl['HomeWin']  
X = sm.add_constant(dl[['DC4', 'DLW4', 'DRW4']] )
result = sm.Logit(y, X).fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.691441
         Iterations 3


0,1,2,3
Dep. Variable:,HomeWin,No. Observations:,879.0
Model:,Logit,Df Residuals:,875.0
Method:,MLE,Df Model:,3.0
Date:,"Thu, 22 Feb 2018",Pseudo R-squ.:,0.001318
Time:,03:50:10,Log-Likelihood:,-607.78
converged:,True,LL-Null:,-608.58
,,LLR p-value:,0.6583

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,0.0802,0.069,1.154,0.248,-0.056 0.216
DC4,-0.0391,0.056,-0.703,0.482,-0.148 0.070
DLW4,-0.0340,0.064,-0.531,0.595,-0.159 0.091
DRW4,-0.0648,0.076,-0.851,0.395,-0.214 0.084


#### goal differential 

- regress **goal differential** on the difference in number of home and visitor players by position and quality (DF1, DF2, DD1, DD2). Add a constant to the predictors and use OLS. The purpose is to deterimine the impact each roster position has on goal differential.

In [644]:
y = dl['GD']  
X = sm.add_constant(dl[['DC1', 'DC2', 'DC3', 'DC4', 'DLW1', 'DLW2', 'DLW3', 'DLW4', 'DRW1', 'DRW2', 'DRW3', 'DRW4', 'DLD1', 'DLD2', 'DLD3', 'DRD1', 'DRD2', 'DRD3']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,GD,R-squared:,0.01
Model:,OLS,Adj. R-squared:,-0.008
Method:,Least Squares,F-statistic:,0.5612
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.913
Time:,03:50:10,Log-Likelihood:,-2031.7
No. Observations:,879,AIC:,4097.0
Df Residuals:,862,BIC:,4179.0
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.2324,0.094,2.474,0.014,0.048 0.417
DC1,0.0726,0.119,0.610,0.542,-0.161 0.306
DC2,0.0972,0.085,1.137,0.256,-0.071 0.265
DC3,-0.0527,0.072,-0.732,0.465,-0.194 0.089
DC4,-0.0197,0.080,-0.245,0.807,-0.177 0.138
DLW1,0.0431,0.119,0.362,0.717,-0.191 0.277
DLW2,0.0921,0.089,1.039,0.299,-0.082 0.266
DLW3,-0.0068,0.082,-0.083,0.934,-0.168 0.154
DLW4,-0.0830,0.084,-0.985,0.325,-0.248 0.082

0,1,2,3
Omnibus:,0.54,Durbin-Watson:,1.975
Prob(Omnibus):,0.763,Jarque-Bera (JB):,0.632
Skew:,0.03,Prob(JB):,0.729
Kurtosis:,2.884,Cond. No.,1.11e+16


## season_game_team_level_analysis

#### $Win = \beta_{0} + \beta_{1}C_{1} + \beta_{2}C_{2} + \beta_{3}C_{3} + \beta_{4}C_{4} + \beta_{5}LW_{1} + \beta_{6}LW_{2} + \beta_{7}LW_{3} + \beta_{8}LW_{4} + \beta_{9}RW_{1} + \beta_{10}RW_{2} + \beta_{11}RW_{3} + \beta_{12}RW_{4} + \beta_{13}LD_{1} + \beta_{14}LD_{2} + \beta_{15}LD_{3} + \beta_{16}RD_{1} + \beta_{17}RD_{2} + \beta_{18}RD_{3} + e_{s,g,t}$

- use season game data (dg) and season game team roster (dx) to conduct season game team level analysis (dt).

In [645]:
dg.head()

Unnamed: 0,Season,GameNumber,VTeamCode,HTeamCode,VGF,HGF,GD,VGA,HGA,WinTeam,LossTeam
0,2010,20001,MTL,TOR,2,3,1,3,2,TOR,MTL
1,2010,20002,PHI,PIT,3,2,-1,2,3,PHI,PIT
2,2010,20003,CAR,MIN,4,3,-1,3,4,CAR,MIN
3,2010,20004,CHI,COL,3,4,1,4,3,COL,CHI
4,2010,20005,CGY,EDM,0,4,4,4,0,EDM,CGY


In [646]:
dt = dg.merge(dx, on=['Season', 'GameNumber'], how='left')
dt.head()

Unnamed: 0,Season,GameNumber,VTeamCode,HTeamCode,VGF,HGF,GD,VGA,HGA,WinTeam,LossTeam,TeamCode,RosterCount,C1,C2,C3,C4,LD1,LD2,LD3,LW1,LW2,LW3,LW4,RD1,RD2,RD3,RW1,RW2,RW3,RW4,MeanC1,MeanC2,MeanC3,MeanC4,MeanLW1,MeanLW2,MeanLW3,MeanLW4,MeanRW1,MeanRW2,MeanRW3,MeanRW4,MeanLD1,MeanLD2,MeanLD3,MeanRD1,MeanRD2,MeanRD3,A
0,2010,20001,MTL,TOR,2,3,1,3,2,TOR,MTL,MTL,18.0,0.0,2.0,2.0,1.0,0.0,1.0,0.0,0.0,1.0,2.0,0.0,3.0,1.0,1.0,0.0,2.0,1.0,1.0,0.0,2.5,1.816667,0.516667,0.0,1.466667,1.95,0.05,0.0,2.483333,0.416667,0.8,0.466667,1.916667,0.216667,1.85,0.6,0.95,1.0
1,2010,20001,MTL,TOR,2,3,1,3,2,TOR,MTL,TOR,18.0,0.0,1.0,2.0,1.0,2.0,1.0,0.0,0.0,2.0,0.0,2.0,1.0,1.0,1.0,2.0,1.0,0.0,1.0,0.0,0.984375,2.5625,0.21875,0.0,2.015625,0.578125,2.078125,2.0,0.609375,0.359375,0.59375,1.421875,0.671875,1.171875,1.0,0.828125,0.90625,
2,2010,20002,PHI,PIT,3,2,-1,2,3,PHI,PIT,PHI,18.0,3.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0,3.0,0.0,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,2.878788,0.0,0.0,0.924242,0.0,2.621212,0.0,1.439394,1.0,1.333333,1.80303,0.0,0.0,2.984848,0.212121,1.984848,0.590909,0.227273,1.0
3,2010,20002,PHI,PIT,3,2,-1,2,3,PHI,PIT,PIT,18.0,1.0,1.0,3.0,0.0,1.0,1.0,1.0,0.0,2.0,0.0,0.0,2.0,1.0,0.0,1.0,2.0,2.0,0.0,0.5,1.628571,2.242857,0.085714,0.0,1.642857,0.514286,1.185714,0.514286,1.957143,1.557143,0.171429,0.957143,1.528571,0.9,1.857143,0.757143,0.0,
4,2010,20003,CAR,MIN,4,3,-1,3,4,CAR,MIN,CAR,18.0,2.0,0.0,2.0,2.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,1.984615,0.0,1.169231,0.892308,1.107692,1.738462,0.261538,0.523077,1.0,0.0,2.692308,0.630769,0.4,1.938462,1.061538,1.692308,0.907692,0.0,1.0


In [647]:
dt.shape

(2109, 50)

- Sum up goals for and against by team per game and find the goal differential (GD) per game. Assign a value of 1 to the team that won the game. 

In [648]:
dt['GD'] = dt.apply(lambda x: (x['HGF'] - x['VGF']) if x['HTeamCode']== x['TeamCode'] else (x['VGF'] - x['HGF']), 1)
dt['Win'] = dt.apply(lambda x: 1 if x['WinTeam']== x['TeamCode'] else 0, 1)
dt['GF'] = dt.apply(lambda x: x['HGF'] if x['HTeamCode']== x['TeamCode'] else x['VGF'], 1)
dt['GA'] = dt.apply(lambda x: x['HGF'] if x['HTeamCode']!= x['TeamCode'] else x['VGF'], 1)
dt.head()

Unnamed: 0,Season,GameNumber,VTeamCode,HTeamCode,VGF,HGF,GD,VGA,HGA,WinTeam,LossTeam,TeamCode,RosterCount,C1,C2,C3,C4,LD1,LD2,LD3,LW1,LW2,LW3,LW4,RD1,RD2,RD3,RW1,RW2,RW3,RW4,MeanC1,MeanC2,MeanC3,MeanC4,MeanLW1,MeanLW2,MeanLW3,MeanLW4,MeanRW1,MeanRW2,MeanRW3,MeanRW4,MeanLD1,MeanLD2,MeanLD3,MeanRD1,MeanRD2,MeanRD3,A,Win,GF,GA
0,2010,20001,MTL,TOR,2,3,-1,3,2,TOR,MTL,MTL,18.0,0.0,2.0,2.0,1.0,0.0,1.0,0.0,0.0,1.0,2.0,0.0,3.0,1.0,1.0,0.0,2.0,1.0,1.0,0.0,2.5,1.816667,0.516667,0.0,1.466667,1.95,0.05,0.0,2.483333,0.416667,0.8,0.466667,1.916667,0.216667,1.85,0.6,0.95,1.0,0,2,3
1,2010,20001,MTL,TOR,2,3,1,3,2,TOR,MTL,TOR,18.0,0.0,1.0,2.0,1.0,2.0,1.0,0.0,0.0,2.0,0.0,2.0,1.0,1.0,1.0,2.0,1.0,0.0,1.0,0.0,0.984375,2.5625,0.21875,0.0,2.015625,0.578125,2.078125,2.0,0.609375,0.359375,0.59375,1.421875,0.671875,1.171875,1.0,0.828125,0.90625,,1,3,2
2,2010,20002,PHI,PIT,3,2,1,2,3,PHI,PIT,PHI,18.0,3.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0,3.0,0.0,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,2.878788,0.0,0.0,0.924242,0.0,2.621212,0.0,1.439394,1.0,1.333333,1.80303,0.0,0.0,2.984848,0.212121,1.984848,0.590909,0.227273,1.0,1,3,2
3,2010,20002,PHI,PIT,3,2,-1,2,3,PHI,PIT,PIT,18.0,1.0,1.0,3.0,0.0,1.0,1.0,1.0,0.0,2.0,0.0,0.0,2.0,1.0,0.0,1.0,2.0,2.0,0.0,0.5,1.628571,2.242857,0.085714,0.0,1.642857,0.514286,1.185714,0.514286,1.957143,1.557143,0.171429,0.957143,1.528571,0.9,1.857143,0.757143,0.0,,0,2,3
4,2010,20003,CAR,MIN,4,3,1,3,4,CAR,MIN,CAR,18.0,2.0,0.0,2.0,2.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,1.984615,0.0,1.169231,0.892308,1.107692,1.738462,0.261538,0.523077,1.0,0.0,2.692308,0.630769,0.4,1.938462,1.061538,1.692308,0.907692,0.0,1.0,1,4,3


- total of forwards and defensemen by team per game.

In [649]:
dt['F'] = dt['C1'] + dt['C2'] + dt['C3'] + dt['C4'] + dt['LW1'] + dt['LW2'] + dt['LW3'] + dt['LW4'] + dt['RW1'] + dt['RW2'] + dt['RW3'] + dt['RW4']   
dt['D'] = dt['LD1'] + dt['LD2'] + dt['LD3'] + dt['RD1'] + dt['RD2'] + dt['RD3']

- display the difference of quality per position.

In [650]:
dt['DC'] = dt['C1'] - dt['C2'] - dt['C3'] - dt['C4']
dt['DLW'] = dt['LW1'] - dt['LW2'] - dt['LW3'] - dt['LW4']
dt['DRW'] = dt['RW1'] - dt['RW2'] - dt['RW3'] - dt['RW4']
dt['DLD'] = dt['LD1'] - dt['LD2'] - dt['LD3']
dt['DRD'] = dt['RD1'] - dt['RD2'] - dt['RD3']

In [651]:
dt['F'].value_counts()

12.0    1758
Name: F, dtype: int64

In [652]:
dt['D'].value_counts()

6.0    1758
Name: D, dtype: int64

-**keep only games that have 12 forwards and 6 defensemen.**

In [653]:
dt = dt.groupby(['Season', 'GameNumber']).filter(lambda x: ((x['F'] == 12) & (x['D'] == 6)).all())

In [654]:
dt['F'].value_counts()

12.0    1758
Name: F, dtype: int64

In [655]:
dt['D'].value_counts()

6.0    1758
Name: D, dtype: int64

In [656]:
dt.shape

(1758, 60)

### summary analysis

In [657]:
dt.describe()

Unnamed: 0,Season,GameNumber,VGF,HGF,GD,VGA,HGA,RosterCount,C1,C2,C3,C4,LD1,LD2,LD3,LW1,LW2,LW3,LW4,RD1,RD2,RD3,RW1,RW2,RW3,RW4,MeanC1,MeanC2,MeanC3,MeanC4,MeanLW1,MeanLW2,MeanLW3,MeanLW4,MeanRW1,MeanRW2,MeanRW3,MeanRW4,MeanLD1,MeanLD2,MeanLD3,MeanRD1,MeanRD2,MeanRD3,A,Win,GF,GA,F,D,DC,DLW,DRW,DLD,DRD
count,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,879.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0,1758.0
mean,2010.0,20613.112628,2.763367,2.960182,0.0,2.960182,2.763367,18.0,0.810011,1.290671,1.368032,0.661547,1.376564,0.803754,0.811718,0.695108,1.406143,1.032992,0.740046,1.273038,0.944824,0.790102,0.571104,1.587031,1.310011,0.527304,0.810011,1.290671,1.368032,0.661547,0.695108,1.406143,1.032992,0.740046,0.571104,1.587031,1.310011,0.527304,1.376564,0.803754,0.811718,1.273038,0.944824,0.790102,1.0,0.5,2.861775,2.861775,12.0,6.0,-2.510239,-2.484073,-2.853242,-0.238908,-0.461889
std,0.0,349.39163,1.671612,1.72682,2.462467,1.72682,1.671612,0.0,0.859691,1.003258,0.899277,0.854737,0.940069,0.850623,0.765163,0.638625,0.712525,0.788393,0.770176,0.950374,0.7274,0.808164,0.615957,1.085472,0.917807,0.642008,0.811683,0.92362,0.787099,0.713769,0.584197,0.588747,0.627232,0.548742,0.553084,0.989922,0.761202,0.435106,0.808332,0.778185,0.541839,0.831645,0.622968,0.668371,0.0,0.500142,1.702289,1.702289,0.0,0.0,1.8001,1.331518,1.489507,1.834293,1.938283
min,2010.0,20001.0,0.0,0.0,-8.0,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.192982,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,12.0,6.0,-6.0,-6.0,-7.0,-5.0,-5.0
25%,2010.0,20317.25,2.0,2.0,-2.0,2.0,2.0,18.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.701493,0.954545,0.180328,0.0,0.971429,0.537313,0.311475,0.0,0.8,0.74359,0.116667,0.761194,0.114286,0.253731,0.574468,0.586189,0.090909,1.0,0.0,2.0,2.0,12.0,6.0,-4.0,-3.0,-4.0,-2.0,-2.0
50%,2010.0,20613.0,3.0,3.0,0.0,3.0,3.0,18.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,0.0,0.791045,1.0,1.428571,0.333333,0.948276,1.466667,1.078125,0.614286,0.640909,1.716418,1.304348,0.431034,1.5,0.671875,0.80597,1.402985,0.907692,0.938462,1.0,0.5,3.0,3.0,12.0,6.0,-3.0,-2.0,-3.0,0.0,0.0
75%,2010.0,20913.75,4.0,4.0,2.0,4.0,4.0,18.0,1.0,2.0,2.0,1.0,2.0,1.0,1.0,1.0,2.0,2.0,1.0,2.0,1.0,1.0,1.0,2.0,2.0,1.0,0.965517,1.803279,1.894737,0.924242,1.0,1.846154,1.55,1.104167,0.894737,2.014925,1.80303,0.8,1.914286,1.103448,1.1,1.857143,1.575758,1.272727,1.0,1.0,4.0,4.0,12.0,6.0,-2.0,-2.0,-2.0,1.0,1.0
max,2010.0,21230.0,10.0,9.0,8.0,9.0,10.0,18.0,3.0,4.0,3.0,5.0,4.0,3.0,4.0,2.0,3.0,3.0,4.0,4.0,3.0,3.0,2.0,5.0,4.0,3.0,2.878788,3.710145,2.8,2.835821,2.0,2.621212,2.231884,2.078125,2.0,3.714286,3.366667,1.512821,2.877193,2.984848,1.974359,3.0,2.242424,2.465517,1.0,1.0,10.0,10.0,12.0,6.0,3.0,1.0,2.0,4.0,4.0


In [658]:
dt.groupby(['Win'])['C1', 'C2', 'C3', 'C4', 'LW1', 'LW2', 'LW3', 'LW4', 'RW1', 'RW2', 'RW3', 'RW4', 'LD1', 'LD2', 'LD3', 'RD1', 'RD2', 'RD3'].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,C1,C2,C3,C4,LW1,LW2,LW3,LW4,RW1,RW2,RW3,RW4,LD1,LD2,LD3,RD1,RD2,RD3
Win,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,count,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0
0,mean,0.763367,1.288965,1.407281,0.690557,0.664391,1.411832,0.981797,0.779295,0.534699,1.594994,1.318544,0.564278,1.387941,0.774744,0.830489,1.275313,0.918089,0.813424
0,std,0.835421,0.986586,0.876284,0.876049,0.633171,0.698939,0.796997,0.790881,0.609989,1.083011,0.965703,0.656246,0.941762,0.842628,0.773819,0.955367,0.741007,0.822105
0,min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,25%,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
0,50%,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,2.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
0,75%,1.0,2.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0
0,max,3.0,4.0,3.0,5.0,2.0,3.0,3.0,4.0,2.0,5.0,4.0,3.0,4.0,3.0,4.0,4.0,3.0,3.0
1,count,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0,879.0
1,mean,0.856655,1.292378,1.328783,0.632537,0.725825,1.400455,1.084187,0.700796,0.607509,1.579067,1.301479,0.49033,1.365188,0.832765,0.792947,1.270762,0.971559,0.76678


### estimate roster model

- regress **win** on the difference in number of players by position and quality per team. Add a constant to the predictors and use **OLS**. The purpose is to deterimine the impact each roster position has on home team success.

In [659]:
print ('season game team level analysis (win) by roster position')
y = dt['Win']  
X = sm.add_constant(dt[['C1', 'C2', 'C3', 'C4', 'LW1', 'LW2', 'LW3', 'LW4','RW1', 'RW2', 'RW3', 'RW4', 'LD1', 'LD2', 'LD3', 'RD1', 'RD2', 'RD3']] )
result = sm.OLS(y, X).fit()
print(result.summary())

beginningtex = """\\documentclass{report}
\\usepackage{booktabs}
\\begin{document}"""
endtex = "\end{document}"

f = open('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/latex/roster/goals_assists_points_per_toi/season_game_team_level_analysis.tex', 'w')
f.write(beginningtex)
f.write(result.summary().as_latex())
f.write(endtex)
f.close()

season game team level analysis (win) by roster position
                            OLS Regression Results                            
Dep. Variable:                    Win   R-squared:                       0.026
Model:                            OLS   Adj. R-squared:                  0.017
Method:                 Least Squares   F-statistic:                     2.932
Date:                Thu, 22 Feb 2018   Prob (F-statistic):           8.19e-05
Time:                        03:50:14   Log-Likelihood:                -1252.6
No. Observations:                1758   AIC:                             2539.
Df Residuals:                    1741   BIC:                             2632.
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------------------------------

- regress **win** on the difference in number of players by position and quality per team. Add a constant to the predictors and use **Logit**. The purpose is to deterimine the impact each roster position has on home team success

In [660]:
y = dt['Win']  
X = sm.add_constant(dt[['C1', 'C2', 'C3', 'C4', 'LW1', 'LW2', 'LW3', 'LW4','RW1', 'RW2', 'RW3', 'RW4', 'LD1', 'LD2', 'LD3', 'RD1', 'RD2', 'RD3']] )
result = sm.Logit(y, X).fit()
result.summary()

         Current function value: 0.679835
         Iterations: 35


  return np.sqrt(np.diag(self.cov_params()))
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


0,1,2,3
Dep. Variable:,Win,No. Observations:,1758.0
Model:,Logit,Df Residuals:,1741.0
Method:,MLE,Df Model:,16.0
Date:,"Thu, 22 Feb 2018",Pseudo R-squ.:,0.0192
Time:,03:50:14,Log-Likelihood:,-1195.2
converged:,False,LL-Null:,-1218.6
,,LLR p-value:,7.306e-05

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,0.0068,,,,nan nan
C1,0.1864,,,,nan nan
C2,-0.0357,,,,nan nan
C3,-0.0812,,,,nan nan
C4,-0.1637,,,,nan nan
LW1,0.1256,,,,nan nan
LW2,-0.0511,,,,nan nan
LW3,0.2322,,,,nan nan
LW4,-0.0945,,,,nan nan


#### goal differential

- regress **goal differential** on the difference in number of players by position and quality per team. Add a constant to the predictors and use **OLS**. The purpose is to deterimine the impact each roster position has on home team success

In [661]:
y = dt['GD']  
X = sm.add_constant(dt[['C1', 'C2', 'C3', 'C4', 'LW1', 'LW2', 'LW3', 'LW4','RW1', 'RW2', 'RW3', 'RW4', 'LD1', 'LD2', 'LD3', 'RD1', 'RD2', 'RD3']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,GD,R-squared:,0.029
Model:,OLS,Adj. R-squared:,0.02
Method:,Least Squares,F-statistic:,3.298
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,1.02e-05
Time:,03:50:14,Log-Likelihood:,-4052.0
No. Observations:,1758,AIC:,8138.0
Df Residuals:,1741,BIC:,8231.0
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.0041,0.006,0.724,0.469,-0.007 0.015
C1,0.2570,0.115,2.226,0.026,0.031 0.484
C2,-0.0752,0.084,-0.901,0.368,-0.239 0.089
C3,-0.1200,0.069,-1.749,0.081,-0.255 0.015
C4,-0.1846,0.081,-2.275,0.023,-0.344 -0.025
LW1,0.1375,0.119,1.155,0.248,-0.096 0.371
LW2,-0.0537,0.086,-0.622,0.534,-0.223 0.116
LW3,0.2522,0.082,3.067,0.002,0.091 0.413
LW4,-0.0873,0.080,-1.090,0.276,-0.244 0.070

0,1,2,3
Omnibus:,0.484,Durbin-Watson:,2.996
Prob(Omnibus):,0.785,Jarque-Bera (JB):,0.553
Skew:,-0.026,Prob(JB):,0.759
Kurtosis:,2.931,Cond. No.,3.13e+16


- regress **win** on the differential of forwards and defensemen per team. Add a constant to the predictors and use **OLS**.

In [662]:
y = dt['Win']  
X = sm.add_constant(dt[['DC', 'DLW', 'DRW', 'DLD', 'DRD']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,Win,R-squared:,0.008
Model:,OLS,Adj. R-squared:,0.005
Method:,Least Squares,F-statistic:,2.828
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.015
Time:,03:50:14,Log-Likelihood:,-1268.9
No. Observations:,1758,AIC:,2550.0
Df Residuals:,1752,BIC:,2583.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.6233,0.039,15.889,0.000,0.546 0.700
DC,0.0176,0.007,2.506,0.012,0.004 0.031
DLW,0.0057,0.009,0.615,0.539,-0.012 0.024
DRW,0.0217,0.008,2.620,0.009,0.005 0.038
DLD,0.0024,0.007,0.351,0.726,-0.011 0.016
DRD,0.0052,0.006,0.801,0.423,-0.007 0.018

0,1,2,3
Omnibus:,0.0,Durbin-Watson:,3.0
Prob(Omnibus):,1.0,Jarque-Bera (JB):,283.737
Skew:,0.0,Prob(JB):,2.44e-62
Kurtosis:,1.032,Cond. No.,16.6


- regress **win** on the differential of forwards and defensemen per team. Add a constant to the predictors and use **Logit**.

In [663]:
y = dt['Win']  
X = sm.add_constant(dt[['DC', 'DLW', 'DRW', 'DLD', 'DRD']] )
result = sm.Logit(y, X).fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.689131
         Iterations 4


0,1,2,3
Dep. Variable:,Win,No. Observations:,1758.0
Model:,Logit,Df Residuals:,1752.0
Method:,MLE,Df Model:,5.0
Date:,"Thu, 22 Feb 2018",Pseudo R-squ.:,0.005794
Time:,03:50:14,Log-Likelihood:,-1211.5
converged:,True,LL-Null:,-1218.6
,,LLR p-value:,0.01486

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,0.4962,0.159,3.130,0.002,0.186 0.807
DC,0.0711,0.028,2.498,0.012,0.015 0.127
DLW,0.0228,0.037,0.612,0.540,-0.050 0.096
DRW,0.0872,0.033,2.614,0.009,0.022 0.153
DLD,0.0099,0.028,0.352,0.724,-0.045 0.065
DRD,0.0209,0.026,0.803,0.422,-0.030 0.072


- regress **win** on the differential of forwards per team. Add a constant to the predictors and use **OLS**.

In [664]:
y = dt['Win']  
X = sm.add_constant(dt[['DC', 'DLW', 'DRW']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,Win,R-squared:,0.008
Model:,OLS,Adj. R-squared:,0.006
Method:,Least Squares,F-statistic:,4.487
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.00382
Time:,03:50:14,Log-Likelihood:,-1269.2
No. Observations:,1758,AIC:,2546.0
Df Residuals:,1754,BIC:,2568.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.6148,0.038,16.242,0.000,0.541 0.689
DC,0.0163,0.007,2.442,0.015,0.003 0.029
DLW,0.0067,0.009,0.744,0.457,-0.011 0.024
DRW,0.0201,0.008,2.496,0.013,0.004 0.036

0,1,2,3
Omnibus:,0.0,Durbin-Watson:,3.0
Prob(Omnibus):,1.0,Jarque-Bera (JB):,284.178
Skew:,0.0,Prob(JB):,1.96e-62
Kurtosis:,1.03,Cond. No.,15.9


- regress **win** on the differential of forwards per team. Add a constant to the predictors and use **Logit**.

In [665]:
y = dt['Win']  
X = sm.add_constant(dt[['DC', 'DLW', 'DRD']] )
result = sm.Logit(y, X).fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.691089
         Iterations 4


0,1,2,3
Dep. Variable:,Win,No. Observations:,1758.0
Model:,Logit,Df Residuals:,1754.0
Method:,MLE,Df Model:,3.0
Date:,"Thu, 22 Feb 2018",Pseudo R-squ.:,0.002969
Time:,03:50:14,Log-Likelihood:,-1214.9
converged:,True,LL-Null:,-1218.6
,,LLR p-value:,0.06476

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,0.2186,0.117,1.864,0.062,-0.011 0.448
DC,0.0694,0.027,2.563,0.010,0.016 0.122
DLW,0.0170,0.036,0.468,0.639,-0.054 0.088
DRD,0.0044,0.025,0.177,0.859,-0.044 0.053


- regress **win** on the differential of defensemen per team. Add a constant to the predictors and use **OLS**.

In [666]:
y = dt['Win']  
X = sm.add_constant(dt[['DLD', 'DRD']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,Win,R-squared:,0.0
Model:,OLS,Adj. R-squared:,-0.001
Method:,Least Squares,F-statistic:,0.1363
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.873
Time:,03:50:14,Log-Likelihood:,-1275.8
No. Observations:,1758,AIC:,2558.0
Df Residuals:,1755,BIC:,2574.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.4987,0.012,40.207,0.000,0.474 0.523
DLD,-0.0033,0.007,-0.507,0.612,-0.016 0.010
DRD,-0.0010,0.006,-0.170,0.865,-0.013 0.011

0,1,2,3
Omnibus:,0.0,Durbin-Watson:,3.01
Prob(Omnibus):,1.0,Jarque-Bera (JB):,292.818
Skew:,0.0,Prob(JB):,2.6e-64
Kurtosis:,1.001,Cond. No.,2.13


- regress **win** on the differential of defensemen per team. Add a constant to the predictors and use **Logit**.

In [667]:
y = dt['Win']  
X = sm.add_constant(dt[['DLD', 'DRD']] )
result = sm.Logit(y, X).fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.693070
         Iterations 3


0,1,2,3
Dep. Variable:,Win,No. Observations:,1758.0
Model:,Logit,Df Residuals:,1755.0
Method:,MLE,Df Model:,2.0
Date:,"Thu, 22 Feb 2018",Pseudo R-squ.:,0.0001121
Time:,03:50:14,Log-Likelihood:,-1218.4
converged:,True,LL-Null:,-1218.6
,,LLR p-value:,0.8724

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,-0.0051,0.050,-0.103,0.918,-0.102 0.092
DLD,-0.0133,0.026,-0.508,0.612,-0.064 0.038
DRD,-0.0042,0.025,-0.170,0.865,-0.053 0.044


- regress **goal differential** on the differential of forwards and defensemen per team. Add a constant to the predictors and use **OLS**.

In [668]:
y = dt['GD']
X = sm.add_constant(dt[['DC', 'DLW', 'DRW', 'DLD', 'DRD']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,GD,R-squared:,0.008
Model:,OLS,Adj. R-squared:,0.005
Method:,Least Squares,F-statistic:,2.905
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.0129
Time:,03:50:14,Log-Likelihood:,-4071.0
No. Observations:,1758,AIC:,8154.0
Df Residuals:,1752,BIC:,8187.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.5147,0.193,2.665,0.008,0.136 0.893
DC,0.1177,0.035,3.395,0.001,0.050 0.186
DLW,0.0132,0.045,0.290,0.772,-0.076 0.102
DRW,0.0560,0.041,1.373,0.170,-0.024 0.136
DLD,0.0249,0.034,0.727,0.467,-0.042 0.092
DRD,0.0452,0.032,1.421,0.155,-0.017 0.108

0,1,2,3
Omnibus:,0.909,Durbin-Watson:,3.033
Prob(Omnibus):,0.635,Jarque-Bera (JB):,0.951
Skew:,-0.001,Prob(JB):,0.622
Kurtosis:,2.886,Cond. No.,16.6


- regress **goal differential** on the differential of forwards per team. Add a constant to the predictors and use **OLS**.

In [669]:
y = dt['GD']  
X = sm.add_constant(dt[['DC', 'DLW', 'DRW']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,GD,R-squared:,0.007
Model:,OLS,Adj. R-squared:,0.005
Method:,Least Squares,F-statistic:,4.084
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.00669
Time:,03:50:14,Log-Likelihood:,-4072.1
No. Observations:,1758,AIC:,8152.0
Df Residuals:,1754,BIC:,8174.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.4387,0.186,2.353,0.019,0.073 0.804
DC,0.1046,0.033,3.189,0.001,0.040 0.169
DLW,0.0232,0.045,0.521,0.602,-0.064 0.110
DRW,0.0415,0.040,1.050,0.294,-0.036 0.119

0,1,2,3
Omnibus:,0.662,Durbin-Watson:,3.035
Prob(Omnibus):,0.718,Jarque-Bera (JB):,0.724
Skew:,-0.0,Prob(JB):,0.696
Kurtosis:,2.901,Cond. No.,15.9


- regress **goal differential** on the differential of defensemen per team. Add a constant to the predictors and use **OLS**.

In [670]:
y = dt['GD']  
X = sm.add_constant(dt[['DLD', 'DRD']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,GD,R-squared:,0.0
Model:,OLS,Adj. R-squared:,-0.001
Method:,Least Squares,F-statistic:,0.2559
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.774
Time:,03:50:14,Log-Likelihood:,-4078.0
No. Observations:,1758,AIC:,8162.0
Df Residuals:,1755,BIC:,8178.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.0064,0.061,0.105,0.917,-0.113 0.126
DLD,-0.0097,0.032,-0.300,0.764,-0.073 0.053
DRD,0.0188,0.030,0.619,0.536,-0.041 0.079

0,1,2,3
Omnibus:,1.335,Durbin-Watson:,3.04
Prob(Omnibus):,0.513,Jarque-Bera (JB):,1.327
Skew:,-0.001,Prob(JB):,0.515
Kurtosis:,2.865,Cond. No.,2.13


- regress **goals for ** on the differential of forwards per team. Add a constant to the predictors and use **OLS**.

In [671]:
y = dt['GF']  
X = sm.add_constant(dt[['DC', 'DLW', 'DRW']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,GF,R-squared:,0.01
Model:,OLS,Adj. R-squared:,0.008
Method:,Least Squares,F-statistic:,5.803
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.000605
Time:,03:50:14,Log-Likelihood:,-3420.5
No. Observations:,1758,AIC:,6849.0
Df Residuals:,1754,BIC:,6871.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,3.1716,0.129,24.648,0.000,2.919 3.424
DC,0.0879,0.023,3.885,0.000,0.044 0.132
DLW,-0.0053,0.031,-0.174,0.862,-0.066 0.055
DRW,0.0359,0.027,1.314,0.189,-0.018 0.089

0,1,2,3
Omnibus:,44.41,Durbin-Watson:,2.086
Prob(Omnibus):,0.0,Jarque-Bera (JB):,47.359
Skew:,0.402,Prob(JB):,5.2e-11
Kurtosis:,3.019,Cond. No.,15.9


- regress **goals for** on the differential of defensemen per team. Add a constant to the predictors and use **OLS**.

In [672]:
y = dt['GF']  
X = sm.add_constant(dt[['DLD', 'DRD']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,GF,R-squared:,0.0
Model:,OLS,Adj. R-squared:,-0.001
Method:,Least Squares,F-statistic:,0.04887
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.952
Time:,03:50:14,Log-Likelihood:,-3429.2
No. Observations:,1758,AIC:,6864.0
Df Residuals:,1755,BIC:,6881.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,2.8609,0.042,67.762,0.000,2.778 2.944
DLD,0.0048,0.022,0.216,0.829,-0.039 0.048
DRD,-0.0043,0.021,-0.205,0.838,-0.046 0.037

0,1,2,3
Omnibus:,45.999,Durbin-Watson:,2.092
Prob(Omnibus):,0.0,Jarque-Bera (JB):,49.199
Skew:,0.41,Prob(JB):,2.07e-11
Kurtosis:,3.011,Cond. No.,2.13


- regress **goals against ** on the differential of forwards per team. Add a constant to the predictors and use **OLS**.

In [673]:
y = dt['GA']  
X = sm.add_constant(dt[['DC', 'DLW', 'DRW']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,GA,R-squared:,0.001
Model:,OLS,Adj. R-squared:,-0.001
Method:,Least Squares,F-statistic:,0.529
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.662
Time:,03:50:14,Log-Likelihood:,-3428.4
No. Observations:,1758,AIC:,6865.0
Df Residuals:,1754,BIC:,6887.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,2.7330,0.129,21.144,0.000,2.479 2.986
DC,-0.0166,0.023,-0.732,0.464,-0.061 0.028
DLW,-0.0285,0.031,-0.925,0.355,-0.089 0.032
DRW,-0.0056,0.027,-0.206,0.837,-0.059 0.048

0,1,2,3
Omnibus:,45.45,Durbin-Watson:,2.023
Prob(Omnibus):,0.0,Jarque-Bera (JB):,48.555
Skew:,0.407,Prob(JB):,2.86e-11
Kurtosis:,3.018,Cond. No.,15.9


- regress **goals against** on the differential of defensemen per team. Add a constant to the predictors and use **OLS**.

In [674]:
y = dt['GA']  
X = sm.add_constant(dt[['DLD', 'DRD']] )
result = sm.OLS(y, X).fit()
result.summary()

0,1,2,3
Dep. Variable:,GA,R-squared:,0.001
Model:,OLS,Adj. R-squared:,-0.0
Method:,Least Squares,F-statistic:,0.8911
Date:,"Thu, 22 Feb 2018",Prob (F-statistic):,0.41
Time:,03:50:15,Log-Likelihood:,-3428.3
No. Observations:,1758,AIC:,6863.0
Df Residuals:,1755,BIC:,6879.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,2.8545,0.042,67.643,0.000,2.772 2.937
DLD,0.0145,0.022,0.651,0.515,-0.029 0.058
DRD,-0.0232,0.021,-1.100,0.271,-0.064 0.018

0,1,2,3
Omnibus:,46.021,Durbin-Watson:,2.027
Prob(Omnibus):,0.0,Jarque-Bera (JB):,49.214
Skew:,0.41,Prob(JB):,2.06e-11
Kurtosis:,3.016,Cond. No.,2.13
