In [1]:
import sys
import os
import pandas as pd
import numpy as np
import datetime, time
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
from pylab import hist, show
import scipy
import zipfile


# Notebook display limits: show at most 50 rows and 200 columns when a
# DataFrame is rendered, so the wide play-by-play frames are not truncated.
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 200)

In [2]:
# Show the working directory that the relative CSV paths below resolve against.
# os.getcwd() is used instead of the bare `pwd`, which only works while
# IPython's automagic is enabled and no variable named `pwd` exists.
os.getcwd()

'/Users/stefanostselios/Desktop/nhl_roster_design-master'

## import data set

In [3]:
# Load the merged play-by-play export and discard the stray index column
# ('Unnamed: 0') that the CSV carries.
dm = pd.read_csv('pbp_merged.csv').drop(columns='Unnamed: 0')

- keep only regular season games and drop irrelevant observations. Exclude overtime and shootouts.

In [4]:
# Keep only games numbered up to 21230.
# NOTE(review): 21230 is presumably the last regular-season game number
# (the notebook's stated intent is "regular season only") - confirm.
dm = dm[dm['GameNumber'].le(21230)]

In [5]:
# Drop the event types the analysis treats as irrelevant. Rows with a missing
# EventType are kept, exactly as with the chained `!=` filters.
# NOTE(review): STOP / EISTR / EIEND presumably mark stoppages and
# early-intermission start/end - confirm against the data dictionary.
dm = dm[~dm['EventType'].isin(['STOP', 'EISTR', 'EIEND'])]

In [6]:
# Regulation play only: keep periods 1 through 3 (both bounds inclusive),
# which excludes overtime and shootout rows.
dm = dm[dm['Period'].between(1, 3)]

In [7]:
# Sanity check: (rows, columns) left after the game, event-type and period filters.
dm.shape

(310113, 44)

### reshape the data set from wide to long.

In [8]:
# Put events in chronological order within each game (ascending on every key).
dm = dm.sort_values(by=['Season', 'GameNumber', 'Period', 'EventNumber'])

In [9]:
# Gather the per-slot on-ice columns for each side. The four lists are matched
# positionally by lreshape, so this relies on the slot columns appearing in the
# same order for players and positions.
visitor_player_cols = [name for name in dm.columns if 'VPlayer' in name]
home_player_cols = [name for name in dm.columns if 'HPlayer' in name]
visitor_position_cols = [name for name in dm.columns if 'VPosition' in name]
home_position_cols = [name for name in dm.columns if 'HPosition' in name]

# Stack the slot columns so each event yields one row per on-ice slot.
# lreshape drops rows with a missing value in the stacked columns by default,
# which is why the row count is not an exact multiple of the input.
dm = pd.lreshape(
    dm,
    {
        'VPlayer': visitor_player_cols,
        'HPlayer': home_player_cols,
        'VPosition': visitor_position_cols,
        'HPosition': home_position_cols,
    },
)

In [10]:
# Sanity check: (rows, columns) of the long-format frame produced by lreshape.
dm.shape

(1796745, 24)

In [11]:
# List the columns that remain after the reshape; the rename/reorder step
# below refers to them by name.
dm.columns

Index(['AdvantageType', 'EventDetail', 'EventNumber', 'EventTimeFromTwenty',
       'EventTimeFromZero', 'EventType', 'GameDate', 'GameNumber', 'HTeamCode',
       'Length', 'PenaltyType', 'Period', 'PlayerName', 'PlayerNumber',
       'Season', 'ShotResult', 'ShotType', 'TeamCode', 'VTeamCode', 'Zone',
       'VPlayer', 'HPlayer', 'VPosition', 'HPosition'],
      dtype='object')

In [12]:
# Prefix the event-level player/team fields with "Event" so they cannot be
# confused with the on-ice VPlayer/HPlayer fields, then fix the column order.
dm = dm.rename(
    columns={
        'PlayerNumber': 'EventPlayerNumber',
        'TeamCode': 'EventTeamCode',
        'PlayerName': 'EventPlayerName',
    }
)[[
    'Season', 'GameNumber', 'GameDate', 'Period', 'AdvantageType', 'Zone',
    'EventNumber', 'EventType', 'EventDetail',
    'EventTeamCode', 'EventPlayerNumber', 'EventPlayerName',
    'EventTimeFromZero', 'EventTimeFromTwenty',
    'VTeamCode', 'VPlayer', 'VPosition',
    'HTeamCode', 'HPlayer', 'HPosition',
    'ShotType', 'ShotResult', 'Length', 'PenaltyType',
]]

In [13]:
# lreshape stacks slot by slot, so restore event order within each game
# (ascending on every key).
dm = dm.sort_values(by=['Season', 'GameNumber', 'Period', 'EventNumber'])

In [14]:
# Replace missing values with explicit labels: a missing advantage type becomes
# 'EV' and a missing event player number becomes 'TEAM'.
# NOTE(review): 'EV' presumably means even strength and 'TEAM' a team-level
# event - confirm. EventPlayerNumber holds mixed str/numeric values afterwards.
dm = dm.assign(
    AdvantageType=dm['AdvantageType'].fillna('EV'),
    EventPlayerNumber=dm['EventPlayerNumber'].fillna('TEAM'),
)

In [15]:
# Persist the long-format play-by-play table.
# `index` must be the boolean False: the string 'False' is truthy, so pandas
# wrote the (meaningless, post-sort) row index as an extra unnamed column that
# shows up as 'Unnamed: 0' when the file is read back.
# NOTE(review): any reader that does `.drop('Unnamed: 0', axis=1)` on this file
# must drop that step once the file is regenerated.
dm.to_csv('play_by_play.csv', index=False, sep=',')

### create new data set and keep variables: 
#### - (a) game number.
#### - (b) visitor team information.
#### - (c) home team information.

In [16]:
# Keep only the game identifiers and the on-ice visitor/home player fields,
# ordered by season and game.
df = dm.loc[
    :,
    ['Season', 'GameNumber',
     'VTeamCode', 'VPlayer', 'VPosition',
     'HTeamCode', 'HPlayer', 'HPosition'],
].sort_values(by=['Season', 'GameNumber'])

In [17]:
# Preview the first rows: each row pairs one visitor skater slot with one home slot.
df.head()

Unnamed: 0,Season,GameNumber,VTeamCode,VPlayer,VPosition,HTeamCode,HPlayer,HPosition
0,2010,20001,MTL,11.0,C,TOR,37.0,C
310113,2010,20001,MTL,21.0,R,TOR,9.0,R
620126,2010,20001,MTL,57.0,L,TOR,11.0,L
930061,2010,20001,MTL,26.0,D,TOR,3.0,D
1239931,2010,20001,MTL,75.0,D,TOR,22.0,D


- reshape the data to have home and visitor team observations under the same columns.

In [18]:
# Each list holds the visitor column followed by the home column
# (e.g. VPlayer, HPlayer), following the column order of df.
player_cols = [name for name in df.columns if 'Player' in name]
position_cols = [name for name in df.columns if 'Position' in name]
team_cols = [name for name in df.columns if 'TeamCode' in name]

# Stack visitor rows on top of home rows under shared column names.
df = pd.lreshape(
    df,
    {
        'PlayerNumber': player_cols,
        'PlayerPosition': position_cols,
        'TeamCode': team_cols,
    },
)

In [19]:
# Load the player ranking table and discard the stray index column
# ('Unnamed: 0') that the CSV carries.
dp = pd.read_csv('player_rank_manual.csv').drop(columns='Unnamed: 0')

#### display each player by team per game. Drop duplicates.

In [20]:
# Attach each player's rank to the on-ice rows. The left join keeps every
# on-ice row, so players without a match in dp get missing rank fields.
dw = df.merge(
    dp,
    how='left',
    on=['Season', 'TeamCode', 'PlayerNumber', 'PlayerPosition'],
)

In [21]:
# Skaters only (goalies removed), one row per player per team per game.
dw = (
    dw[dw['PlayerPosition'] != 'G']
    .drop_duplicates(['Season', 'GameNumber', 'TeamCode', 'PlayerNumber'])
)

# Count the distinct skaters seen on the ice for each team in each game.
# assign() builds a new frame rather than writing into a filtered slice.
dw = dw.assign(
    RosterCount=dw.groupby(['Season', 'GameNumber', 'TeamCode'])['PlayerNumber'].transform('count')
)

# Keep only team-games with exactly 18 skaters.
# NOTE(review): 18 is presumably the full dressed skater count (12 F + 6 D);
# team-games with any other count are dropped entirely - confirm this is intended.
dw = dw[dw['RosterCount'] == 18]

# Collapse C/L/R into a single forward class: 'D' stays 'D', anything else is 'F'.
# Vectorized instead of a row-wise apply, so it is also well-defined on an empty frame.
dw = dw.assign(Position=np.where(dw['PlayerPosition'] == 'D', 'D', 'F'))
dw.head()

Unnamed: 0,GameNumber,Season,PlayerNumber,PlayerPosition,TeamCode,Rank,RosterCount,Position
0,20001,2010,11.0,C,MTL,2,18.0,F
1,20001,2010,21.0,R,MTL,2,18.0,F
2,20001,2010,57.0,L,MTL,2,18.0,F
3,20001,2010,26.0,D,MTL,2,18.0,D
4,20001,2010,75.0,D,MTL,2,18.0,D


In [22]:
# Average player Rank for each team's forwards and defencemen in every game
# (missing ranks are ignored by mean).
dw = (
    dw.groupby(['Season', 'GameNumber', 'TeamCode', 'Position'])['Rank']
    .mean()
    .reset_index()
)
dw.head()

Unnamed: 0,Season,GameNumber,TeamCode,Position,Rank
0,2010,20001,MTL,D,1.833333
1,2010,20001,MTL,F,1.916667
2,2010,20001,TOR,D,1.833333
3,2010,20001,TOR,F,1.666667
4,2010,20002,PHI,D,1.666667


In [23]:
# Spread Position into columns: one row per team-game with a Rank value for D and F.
dw = dw.pivot_table(
    index=['Season', 'GameNumber', 'TeamCode'],
    columns=['Position'],
    values=['Rank'],
).reset_index()

# Flatten the two-level column labels: ('Rank', 'D') -> 'Rank_D', while the
# index columns, whose second level is empty, keep their plain names.
dw.columns = [
    '_'.join(str(level).strip() for level in labels if level)
    for labels in dw.columns
]
dw.head()

Unnamed: 0,Season,GameNumber,TeamCode,Rank_D,Rank_F
0,2010,20001,MTL,1.833333,1.916667
1,2010,20001,TOR,1.833333,1.666667
2,2010,20002,PHI,1.666667,1.583333
3,2010,20002,PIT,1.666667,1.75
4,2010,20003,CAR,1.666667,1.75


### Merge game outcomes into dw

In [24]:
# Load game results, discard the stray index column and keep only the game
# keys plus the winning and losing team codes.
ds = pd.read_csv('season_games.csv').drop(columns='Unnamed: 0')[
    ['Season', 'GameNumber', 'WinTeam', 'LossTeam']
]
ds.head()

Unnamed: 0,Season,GameNumber,WinTeam,LossTeam
0,2010,20001,TOR,MTL
1,2010,20002,PHI,PIT
2,2010,20003,CAR,MIN
3,2010,20004,COL,CHI
4,2010,20005,EDM,CGY


In [25]:
# Attach the game result to both team rows of each game. The left join keeps
# every team-game even if the result is missing from ds.
dq = dw.merge(ds, how='left', on=['Season', 'GameNumber'])
dq.head()

Unnamed: 0,Season,GameNumber,TeamCode,Rank_D,Rank_F,WinTeam,LossTeam
0,2010,20001,MTL,1.833333,1.916667,TOR,MTL
1,2010,20001,TOR,1.833333,1.666667,TOR,MTL
2,2010,20002,PHI,1.666667,1.583333,PHI,PIT
3,2010,20002,PIT,1.666667,1.75,PHI,PIT
4,2010,20003,CAR,1.666667,1.75,CAR,MIN


In [26]:
# 1/0 result flags for the row's team, computed column-wise rather than with a
# row-by-row apply. TeamLos is defined as "not the winner", so a row whose
# WinTeam is missing is flagged as a loss, exactly as before.
dq['TeamWin'] = (dq['TeamCode'] == dq['WinTeam']).astype('int64')
dq['TeamLos'] = (dq['TeamCode'] != dq['WinTeam']).astype('int64')
dq.head()

Unnamed: 0,Season,GameNumber,TeamCode,Rank_D,Rank_F,WinTeam,LossTeam,TeamWin,TeamLos
0,2010,20001,MTL,1.833333,1.916667,TOR,MTL,0,1
1,2010,20001,TOR,1.833333,1.666667,TOR,MTL,1,0
2,2010,20002,PHI,1.666667,1.583333,PHI,PIT,1,0
3,2010,20002,PIT,1.666667,1.75,PHI,PIT,0,1
4,2010,20003,CAR,1.666667,1.75,CAR,MIN,1,0


In [27]:
# Season totals broadcast back onto every team-game row.
#   GP - number of retained games for the row's own team.
#   GW - wins of the *game's winning team* (grouped by WinTeam, not TeamCode),
#        so on the loser's row it is the opponent's win total.
#   GL - losses of the *game's losing team* (grouped by LossTeam), so on the
#        winner's row it is the opponent's loss total.
# The W/L step later resolves these to the row's own team.
dq = dq.assign(
    GP=dq.groupby(['Season', 'TeamCode'])['GameNumber'].transform('count'),
    GW=dq.groupby(['Season', 'WinTeam'])['TeamWin'].transform('sum'),
    GL=dq.groupby(['Season', 'LossTeam'])['TeamLos'].transform('sum'),
)
dq.head()

Unnamed: 0,Season,GameNumber,TeamCode,Rank_D,Rank_F,WinTeam,LossTeam,TeamWin,TeamLos,GP,GW,GL
0,2010,20001,MTL,1.833333,1.916667,TOR,MTL,0,1,82,37,38
1,2010,20001,TOR,1.833333,1.666667,TOR,MTL,1,0,82,37,38
2,2010,20002,PHI,1.666667,1.583333,PHI,PIT,1,0,82,47,33
3,2010,20002,PIT,1.666667,1.75,PHI,PIT,0,1,82,47,33
4,2010,20003,CAR,1.666667,1.75,CAR,MIN,1,0,82,40,42


In [28]:
# Season-long average of the per-game forward and defence rank for each team,
# repeated on every one of that team's rows.
dq = dq.assign(
    Mean_F=dq.groupby(['Season', 'TeamCode'])['Rank_F'].transform('mean'),
    Mean_D=dq.groupby(['Season', 'TeamCode'])['Rank_D'].transform('mean'),
)
dq.head()

Unnamed: 0,Season,GameNumber,TeamCode,Rank_D,Rank_F,WinTeam,LossTeam,TeamWin,TeamLos,GP,GW,GL,Mean_F,Mean_D
0,2010,20001,MTL,1.833333,1.916667,TOR,MTL,0,1,82,37,38,1.815133,1.83072
1,2010,20001,TOR,1.833333,1.666667,TOR,MTL,1,0,82,37,38,1.665835,1.883566
2,2010,20002,PHI,1.666667,1.583333,PHI,PIT,1,0,82,47,33,1.59248,1.666667
3,2010,20002,PIT,1.666667,1.75,PHI,PIT,0,1,82,47,33,1.804757,1.711556
4,2010,20003,CAR,1.666667,1.75,CAR,MIN,1,0,82,40,42,1.751016,1.686992


In [29]:
# Resolve the game-level GW/GL totals to the row's own team:
#   - if the row's team lost this game, GL is already its loss total;
#     otherwise its losses are its games played minus its wins (GW).
#   - if the row's team won this game, GW is already its win total;
#     otherwise its wins are its games played minus its losses (GL).
dq['L'] = dq['GL'].where(dq['TeamCode'] == dq['LossTeam'], dq['GP'] - dq['GW'])
dq['W'] = dq['GW'].where(dq['TeamCode'] == dq['WinTeam'], dq['GP'] - dq['GL'])
dq.head()

Unnamed: 0,Season,GameNumber,TeamCode,Rank_D,Rank_F,WinTeam,LossTeam,TeamWin,TeamLos,GP,GW,GL,Mean_F,Mean_D,L,W
0,2010,20001,MTL,1.833333,1.916667,TOR,MTL,0,1,82,37,38,1.815133,1.83072,38,44
1,2010,20001,TOR,1.833333,1.666667,TOR,MTL,1,0,82,37,38,1.665835,1.883566,45,37
2,2010,20002,PHI,1.666667,1.583333,PHI,PIT,1,0,82,47,33,1.59248,1.666667,35,47
3,2010,20002,PIT,1.666667,1.75,PHI,PIT,0,1,82,47,33,1.804757,1.711556,33,49
4,2010,20003,CAR,1.666667,1.75,CAR,MIN,1,0,82,40,42,1.751016,1.686992,42,40


In [30]:
# Collapse to one row per team-season by keeping the first row of each team,
# then express wins and losses as shares of games played.
dx = dq[['Season', 'TeamCode', 'GP', 'L', 'W', 'Mean_F', 'Mean_D']].drop_duplicates(
    ['Season', 'TeamCode']
)
dx = dx.assign(
    WinPc=dx['W'] / dx['GP'],
    LossPc=dx['L'] / dx['GP'],
)
dx = dx[['Season', 'TeamCode', 'GP', 'W', 'L', 'WinPc', 'LossPc', 'Mean_F', 'Mean_D']]
dx.head()

Unnamed: 0,Season,TeamCode,GP,W,L,WinPc,LossPc,Mean_F,Mean_D
0,2010,MTL,82,44,38,0.536585,0.463415,1.815133,1.83072
1,2010,TOR,82,37,45,0.45122,0.54878,1.665835,1.883566
2,2010,PHI,82,47,35,0.573171,0.426829,1.59248,1.666667
3,2010,PIT,82,49,33,0.597561,0.402439,1.804757,1.711556
4,2010,CAR,82,40,42,0.487805,0.512195,1.751016,1.686992


In [31]:
# WinPc, LossPc and the column order are already set where dx is built, so the
# copy-pasted recomputation that used to live here was redundant; only the
# preview remains.
dx.head()

Unnamed: 0,Season,TeamCode,GP,W,L,WinPc,LossPc,Mean_F,Mean_D
0,2010,MTL,82,44,38,0.536585,0.463415,1.815133,1.83072
1,2010,TOR,82,37,45,0.45122,0.54878,1.665835,1.883566
2,2010,PHI,82,47,35,0.573171,0.426829,1.59248,1.666667
3,2010,PIT,82,49,33,0.597561,0.402439,1.804757,1.711556
4,2010,CAR,82,40,42,0.487805,0.512195,1.751016,1.686992


In [32]:
# Within-season rankings (ties share the average rank):
#   Rank_W - 1 is the highest win percentage.
#   Rank_F / Rank_D - 1 is the lowest season mean rank for forwards / defence.
dx = dx.assign(
    Rank_W=dx.groupby('Season')['WinPc'].rank(ascending=False),
    Rank_F=dx.groupby('Season')['Mean_F'].rank(ascending=True),
    Rank_D=dx.groupby('Season')['Mean_D'].rank(ascending=True),
).sort_values(by=['Season', 'Rank_W', 'Rank_F', 'Rank_D'])
dx.head(30)

Unnamed: 0,Season,TeamCode,GP,W,L,WinPc,LossPc,Mean_F,Mean_D,Rank_W,Rank_F,Rank_D
45,2010,VAN,81,53,28,0.654321,0.345679,1.600355,1.8398,1.0,4.0,17.0
3,2010,PIT,82,49,33,0.597561,0.402439,1.804757,1.711556,2.0,24.0,7.0
11,2010,SJ,82,48,34,0.585366,0.414634,1.517276,1.848432,3.5,1.0,21.0
21,2010,WSH,82,48,34,0.585366,0.414634,1.692073,1.73374,3.5,10.0,10.0
17,2010,DET,80,46,34,0.575,0.425,1.713352,1.704762,5.0,14.0,6.0
2,2010,PHI,82,47,35,0.573171,0.426829,1.59248,1.666667,6.5,3.0,1.0
16,2010,ANA,82,47,35,0.573171,0.426829,1.686253,1.839431,6.5,9.0,16.0
35,2010,TB,82,46,36,0.560976,0.439024,1.546748,2.0,9.0,2.0,27.5
22,2010,BOS,82,46,36,0.560976,0.439024,1.680894,1.835366,9.0,8.0,14.0
44,2010,LA,82,46,36,0.560976,0.439024,1.765152,1.681185,9.0,21.0,3.0


**Crosby played 41 games, Malkin 43 and Staal 42 for Pittsburgh**

In [33]:
# Persist the team-season summary.
# `index` must be the boolean False: the string 'False' is truthy, so pandas
# wrote the leftover row index as an extra unnamed column that shows up as
# 'Unnamed: 0' when the file is read back.
# NOTE(review): any reader that does `.drop('Unnamed: 0', axis=1)` on this file
# must drop that step once the file is regenerated.
dx.to_csv('season_teams_roster.csv', index=False)