In [200]:
import sys
import os
import pandas as pd
import numpy as np
import datetime, time
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
from pylab import hist, show
import scipy
import zipfile


# Notebook display limits: up to 50 rows and 200 columns per rendered frame.
for option_name, limit in (('display.max_rows', 50), ('display.max_columns', 200)):
    pd.set_option(option_name, limit)

In [201]:
pwd

'/Users/stefanostselios/Desktop/nhl_roster_design-master'

## import data set

In [202]:
# Load the merged play-by-play file and discard the 'Unnamed: 0' column
# (a saved row index that came along with the CSV).
dm = pd.read_csv('pbp_merged.csv').drop('Unnamed: 0', axis=1)

- keep only regular season games and drop irrelevant observations. Exclude overtime and shootouts.

In [203]:
# Highest GameNumber kept; presumably the last regular-season game
# (the note above says only regular-season games are retained) -- confirm.
LAST_REGULAR_SEASON_GAME = 21230
dm = dm.loc[dm['GameNumber'].le(LAST_REGULAR_SEASON_GAME)]

In [204]:
# Remove the three event types that are not wanted downstream, in one pass.
# Rows with a missing EventType are kept, exactly as with chained `!=` filters.
excluded_events = ['STOP', 'EISTR', 'EIEND']
dm = dm[~dm['EventType'].isin(excluded_events)]

In [205]:
# Keep periods 1 through 3 only (both bounds inclusive).
dm = dm[dm['Period'].between(1, 3)]

In [206]:
dm.shape

(310113, 44)

### reshape the data set from wide to long.

In [207]:
# Order events chronologically within each game (ascending on every key).
event_order = ['Season', 'GameNumber', 'Period', 'EventNumber']
dm = dm.sort_values(by=event_order)

In [208]:
# Stack the per-slot player/position columns into four long columns.
# Each target collects every source column whose name contains the target text,
# in the frame's existing column order.
on_ice_groups = {
    target: [name for name in dm.columns if target in name]
    for target in ('VPlayer', 'HPlayer', 'VPosition', 'HPosition')
}
dm = pd.lreshape(dm, on_ice_groups)

In [209]:
dm.shape

(1796745, 24)

In [210]:
dm.columns

Index(['AdvantageType', 'EventDetail', 'EventNumber', 'EventTimeFromTwenty',
       'EventTimeFromZero', 'EventType', 'GameDate', 'GameNumber', 'HTeamCode',
       'Length', 'PenaltyType', 'Period', 'PlayerName', 'PlayerNumber',
       'Season', 'ShotResult', 'ShotType', 'TeamCode', 'VTeamCode', 'Zone',
       'VPlayer', 'HPlayer', 'VPosition', 'HPosition'],
      dtype='object')

In [211]:
# Prefix the acting player/team columns with 'Event' so they are not confused
# with the on-ice VPlayer/HPlayer columns, then fix the column order.
event_renames = {
    'PlayerNumber': 'EventPlayerNumber',
    'TeamCode': 'EventTeamCode',
    'PlayerName': 'EventPlayerName',
}
column_order = [
    'Season', 'GameNumber', 'GameDate', 'Period', 'AdvantageType', 'Zone',
    'EventNumber', 'EventType', 'EventDetail', 'EventTeamCode',
    'EventPlayerNumber', 'EventPlayerName', 'EventTimeFromZero',
    'EventTimeFromTwenty', 'VTeamCode', 'VPlayer', 'VPosition', 'HTeamCode',
    'HPlayer', 'HPosition', 'ShotType', 'ShotResult', 'Length', 'PenaltyType',
]
dm = dm.rename(columns=event_renames)[column_order]

In [212]:
# Restore chronological event order after the reshape.
dm = dm.sort_values(by=['Season', 'GameNumber', 'Period', 'EventNumber'], ascending=True)

In [213]:
# Replace missing values with explicit labels, one column at a time.
missing_labels = {'AdvantageType': 'EV', 'EventPlayerNumber': 'TEAM'}
for column_name, label in missing_labels.items():
    dm[column_name] = dm[column_name].fillna(label)

In [214]:
# Write the long play-by-play table to CSV.
# The original call passed index='False' -- a non-empty string, hence truthy --
# so the row index has always been written despite reading as "no index".
# A real boolean is passed now and the on-disk format is unchanged.
# NOTE(review): index=False was probably intended, but readers of this file
# presumably drop the resulting 'Unnamed: 0' column (as the read cells in this
# notebook do for their inputs) -- confirm before switching.
dm.to_csv('play_by_play.csv', index=True)

### create new data set and keep variables: 
#### - (a) game number.
#### - (b) visitor team information.
#### - (c) home team information.

In [215]:
# Keep only the game identifiers and the on-ice visitor/home player columns,
# ordered by season and game.
lineup_columns = ['Season', 'GameNumber', 'VTeamCode', 'VPlayer', 'VPosition',
                  'HTeamCode', 'HPlayer', 'HPosition']
df = dm[lineup_columns].sort_values(by=['Season', 'GameNumber'])

In [216]:
df.head()

Unnamed: 0,Season,GameNumber,VTeamCode,VPlayer,VPosition,HTeamCode,HPlayer,HPosition
0,2010,20001,MTL,11.0,C,TOR,37.0,C
310113,2010,20001,MTL,21.0,R,TOR,9.0,R
620126,2010,20001,MTL,57.0,L,TOR,11.0,L
930061,2010,20001,MTL,26.0,D,TOR,3.0,D
1239931,2010,20001,MTL,75.0,D,TOR,22.0,D


- reshape the data to have home and visitor team observations under the same columns.

In [217]:
# Stack the visitor and home columns on top of each other so both teams share
# the same 'Player', 'Position' and 'TeamCode' columns. Each group picks up the
# V* column first and the H* column second, matching the frame's column order.
side_groups = {
    target: [name for name in df.columns if target in name]
    for target in ('Player', 'Position', 'TeamCode')
}
df = pd.lreshape(df, side_groups)

In [218]:
df.head()

Unnamed: 0,GameNumber,Season,Player,Position,TeamCode
0,20001,2010,11.0,C,MTL
1,20001,2010,21.0,R,MTL
2,20001,2010,57.0,L,MTL
3,20001,2010,26.0,D,MTL
4,20001,2010,75.0,D,MTL


#### display each player by team per game. Drop duplicates.

In [219]:
# One row per (season, game, team, player).
# Season is part of the key: every groupby below keys on Season as well, and
# without it rows from different seasons that share GameNumber/TeamCode/Player
# would be collapsed into one. For single-season data the result is unchanged.
roster_key = ['Season', 'GameNumber', 'TeamCode', 'Player']
df = df.drop_duplicates(roster_key)
df = df[['Season', 'GameNumber', 'TeamCode', 'Player', 'Position']]
df = df.rename(columns={'Player': 'PlayerNumber', 'Position': 'PlayerPosition'})
# Ascending on every key (the sort_values default).
df = df.sort_values(['Season', 'GameNumber', 'TeamCode', 'PlayerPosition', 'PlayerNumber'])

In [220]:
df.shape

(46920, 5)

- drop goalies.

In [221]:
# Skaters only: discard rows whose position code is 'G'.
is_skater = df['PlayerPosition'].ne('G')
df = df[is_skater]

- import player rankings.

In [222]:
# Load the player rankings and discard the saved row index column.
dp = pd.read_csv('player_rank_manual.csv').drop('Unnamed: 0', axis=1)

In [223]:
# Attach ranking data to each roster row. The left join keeps every roster row,
# so players without a match in dp get missing values in dp's columns.
rank_keys = ['Season', 'TeamCode', 'PlayerNumber', 'PlayerPosition']
df = df.merge(dp, how='left', on=rank_keys)

- count the number of players by team per game and display roster.

In [224]:
# Number of rows each player has within a given team-game.
player_key = ['Season', 'GameNumber', 'TeamCode', 'PlayerNumber']
df['playercount'] = df.groupby(player_key)['PlayerNumber'].transform('count')

In [225]:
# Roster size: sum of the per-player counts within each team-game.
team_game_key = ['Season', 'GameNumber', 'TeamCode']
df['roster'] = df.groupby(team_game_key)['playercount'].transform('sum')

- create a data frame with player ranking per game by team.

In [226]:
# Per-game roster listing: one row per player with position and rank.
roster_columns = ['Season', 'GameNumber', 'TeamCode', 'roster',
                  'PlayerNumber', 'PlayerPosition', 'Rank']
dr = df[roster_columns]

- reshape data from long to wide.

In [227]:
dr.head()

Unnamed: 0,Season,GameNumber,TeamCode,roster,PlayerNumber,PlayerPosition,Rank
0,2010,20001,MTL,18.0,11.0,C,2
1,2010,20001,MTL,18.0,14.0,C,1
2,2010,20001,MTL,18.0,15.0,C,2
3,2010,20001,MTL,18.0,17.0,C,2
4,2010,20001,MTL,18.0,40.0,C,2


In [228]:
# Write the per-game roster listing to CSV.
# The original call passed index='False' -- a non-empty string, hence truthy --
# so the row index has always been written despite reading as "no index".
# A real boolean is passed now and the on-disk format is unchanged.
# NOTE(review): index=False was probably intended; confirm that no reader
# depends on the resulting 'Unnamed: 0' column before switching.
dr.to_csv('data_teams_roster.csv', index=True)

### Pivot Table

- the next step is to group players by game number, team code and position, to display the quality of players each team has per position. **Pivot table** by player position using rank values (averaged within each position). Season, game number, team and roster size are the indexes. We then join the column levels to generate one flat column per position (Rank_C, Rank_D, Rank_L, Rank_R).


In [229]:
# Pivot to one row per team-game with one column per position. pivot_table's
# default aggregation is the mean, so each cell is the mean Rank at that position.
df = pd.pivot_table(df, index=['Season', 'GameNumber', 'TeamCode', 'roster'], columns=['PlayerPosition'], values=['Rank'])
df = df.reset_index()
# Flatten the two-level column labels: ('Rank', 'C') -> 'Rank_C'; index columns
# have an empty second level and keep their plain name.
df.columns = ['_'.join(str(s).strip() for s in col if s) for col in df.columns]
# (A second, bare `df.reset_index()` used to sit here; its result was discarded,
# so it had no effect and has been removed.)
# NOTE(review): a position with no ranked player in a game becomes 0 here, and
# that 0 is then averaged into 'Rank_F' in the next step -- confirm this is intended.
df = df.fillna(0)

In [230]:
df.head()

Unnamed: 0,Season,GameNumber,TeamCode,roster,Rank_C,Rank_D,Rank_L,Rank_R
0,2010,20001,MTL,18.0,1.857143,1.833333,2.0,2.0
1,2010,20001,TOR,18.0,1.6,1.833333,1.333333,2.0
2,2010,20002,PHI,18.0,1.4,1.666667,1.8,1.5
3,2010,20002,PIT,18.0,1.75,1.666667,1.666667,2.0
4,2010,20003,CAR,18.0,1.6,1.666667,2.0,1.75


- the data set shows the average quality (mean rank) of players per position by team for every single regular season game. We will create the mean of forwards and defencemen per team by game.

In [231]:
# Forward rating per team-game: the row-wise mean of the three forward-position
# columns (an unweighted mean of the per-position means).
forward_columns = ['Rank_C', 'Rank_L', 'Rank_R']
df['Rank_F'] = df[forward_columns].mean(axis=1)

In [232]:
# Dead cell, kept for reference only: 'Rank_F' is already computed above as a
# row-wise mean, and the line below reads a 'rank_f' column that does not appear
# among df's columns in this notebook. Candidate for deletion.
#df['Rank_F'] = df.groupby(['Season', 'GameNumber', 'TeamCode'])['rank_f'].transform('mean')

In [245]:
df.head()

Unnamed: 0,Season,GameNumber,TeamCode,roster,Rank_C,Rank_D,Rank_L,Rank_R,Rank_F
0,2010,20001,MTL,18.0,1.857143,1.833333,2.0,2.0,1.952381
1,2010,20001,TOR,18.0,1.6,1.833333,1.333333,2.0,1.644444
2,2010,20002,PHI,18.0,1.4,1.666667,1.8,1.5,1.566667
3,2010,20002,PIT,18.0,1.75,1.666667,1.666667,2.0,1.805556
4,2010,20003,CAR,18.0,1.6,1.666667,2.0,1.75,1.783333


In [234]:
df.columns

Index(['Season', 'GameNumber', 'TeamCode', 'roster', 'Rank_C', 'Rank_D',
       'Rank_L', 'Rank_R', 'Rank_F'],
      dtype='object')

- create a data set with the mean values of forwards and defencemen by team per game.

In [235]:
# Per-game forward and defence ratings by team (position-level columns dropped).
# Note: this rebinds `dm`, which earlier held the play-by-play table.
game_mean_columns = ['Season', 'GameNumber', 'TeamCode', 'roster', 'Rank_F', 'Rank_D']
dm = df[game_mean_columns]

In [236]:
dm.head()

Unnamed: 0,Season,GameNumber,TeamCode,roster,Rank_F,Rank_D
0,2010,20001,MTL,18.0,1.952381,1.833333
1,2010,20001,TOR,18.0,1.644444,1.833333
2,2010,20002,PHI,18.0,1.566667,1.666667
3,2010,20002,PIT,18.0,1.805556,1.666667
4,2010,20003,CAR,18.0,1.783333,1.666667


- mean forwards and defensemen per team for the whole season.

In [237]:
# Work on an explicit copy: dm was produced by column-selecting df, and adding
# columns to such a selection is what raised SettingWithCopyWarning.
dm = dm.copy()
team_season = ['Season', 'TeamCode']
# Season-long averages of the per-game ratings, broadcast onto every game row.
dm['F_Mean'] = dm.groupby(team_season)['Rank_F'].transform('mean')
dm['D_Mean'] = dm.groupby(team_season)['Rank_D'].transform('mean')
# Collapse to one row per team-season. drop_duplicates keeps each team's first
# row, so the surviving 'roster' value is the one from that row, not a season aggregate.
dm = dm.drop_duplicates(team_season)
# Ascending on every key (the sort_values default).
dm = dm.sort_values(['Season', 'F_Mean', 'D_Mean'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [246]:
dm.head()

Unnamed: 0,Season,TeamCode,roster,F_Mean,D_Mean
35,2010,TB,18.0,1.51778,2.0
18,2010,BUF,18.0,1.529326,1.840012
25,2010,NYR,18.0,1.571138,2.0
45,2010,VAN,18.0,1.576055,1.839721
2,2010,PHI,18.0,1.59939,1.666667


In [242]:
# Keep only the team-season identifiers, roster size and the two season means.
season_columns = ['Season', 'TeamCode', 'roster', 'F_Mean', 'D_Mean']
dm = dm[season_columns]

In [243]:
dm.head()

Unnamed: 0,Season,TeamCode,roster,F_Mean,D_Mean
35,2010,TB,18.0,1.51778,2.0
18,2010,BUF,18.0,1.529326,1.840012
25,2010,NYR,18.0,1.571138,2.0
45,2010,VAN,18.0,1.576055,1.839721
2,2010,PHI,18.0,1.59939,1.666667


In [244]:
# Write the team-season means to CSV.
# The original call passed index='False' -- a non-empty string, hence truthy --
# so the row index has always been written despite reading as "no index".
# A real boolean is passed now and the on-disk format is unchanged.
# NOTE(review): index=False was probably intended; confirm that no reader
# depends on the resulting 'Unnamed: 0' column before switching.
dm.to_csv('season_teams_roster_mean.csv', index=True)

### import team winning percent data frame

In [247]:
# Load the team season results and discard the saved row index column.
dt = pd.read_csv('season_teams.csv').drop('Unnamed: 0', axis=1)

In [248]:
# Add the roster means to each team-season row; the left join keeps every row
# of dt even when dm has no match.
dt = dt.merge(dm, how='left', on=['Season', 'TeamCode'])

In [249]:
dt.head()

Unnamed: 0,Season,TeamCode,GP,GW,GL,WinPc,LossPc,Rank,roster,F_Mean,D_Mean
0,2010,VAN,82,54,28,0.658537,0.341463,1.0,18.0,1.576055,1.839721
1,2010,PIT,82,49,33,0.597561,0.402439,2.0,18.0,1.829123,1.711556
2,2010,SJ,82,48,34,0.585366,0.414634,3.5,18.0,1.608895,1.848432
3,2010,WSH,82,48,34,0.585366,0.414634,3.5,18.0,1.67559,1.73374
4,2010,ANA,82,47,35,0.573171,0.426829,6.0,18.0,1.741996,1.839431


In [250]:
# Write the merged team results and roster means to CSV.
# The original call passed index='False' -- a non-empty string, hence truthy --
# so the row index has always been written despite reading as "no index".
# A real boolean is passed now and the on-disk format is unchanged.
# NOTE(review): index=False was probably intended; confirm that no reader
# depends on the resulting 'Unnamed: 0' column before switching.
dt.to_csv('data_season_teams_wins_roster.csv', index=True)