## 2016 play by play data

In [5]:
import sys
import os
import pandas as pd
import numpy as np
import datetime, time
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.cm as cm
import statsmodels.api as sm
from statsmodels.formula.api import ols
from pylab import hist, show
import scipy
import zipfile
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist
import seaborn as sn
from sklearn.metrics import silhouette_samples, silhouette_score


sys.setrecursionlimit(100000)
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 200)

### import 2016 nhl data set

In [122]:
da = pd.read_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/data/nhl_pbp20162017.csv')
#da = pd.read_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/data/nhl_pbp20162017.csv')
da = da.drop('Unnamed: 0', axis=1)

- rename columns and create a season variable

In [123]:
da['Season'] = 2016
da = da.rename(columns={ 'Game_Id' : 'GameNumber', 'Ev_Zone': 'EventZone', 'Time_Elapsed': 'Time', 'Ev_Team': 'EventTeamCode', 'Description': 'EventDetail', 'Away_Team': 'ATeamCode', 'Home_Team':'HTeamCode', 'p1_name': 'EventPlayer1','p2_name': 'EventPlayer2', 'p3_name': 'EventPlayer3', 'awayPlayer1': 'APlayer1', 'awayPlayer2': 'APlayer2', 'awayPlayer3': 'APlayer3', 'awayPlayer4': 'APlayer4', 'awayPlayer5': 'APlayer5', 'awayPlayer6': 'APlayer6', 'homePlayer1': 'HPlayer1', 'homePlayer2': 'HPlayer2', 'homePlayer3': 'HPlayer3','homePlayer4': 'HPlayer4', 'homePlayer5': 'HPlayer5','homePlayer6': 'HPlayer6', 'Away_Score': 'AGoals', 'Home_Score': 'HGoals', 'Away_Goalie': 'AGoalie', 'Home_Goalie': 'HGoalie', 'Away_Players': 'PlayersA', 'Home_Players': 'PlayersH' })
da = da[['Season', 'GameNumber', 'Date', 'Period', 'Time', 'Strength', 'Event', 'EventDetail', 'EventTeamCode', 'EventPlayer1', 'EventPlayer2', 'EventPlayer3', 'ATeamCode', 'APlayer1', 'APlayer2', 'APlayer3', 'APlayer4', 'APlayer5', 'APlayer6', 'AGoalie', 'PlayersA', 'HTeamCode', 'HPlayer1', 'HPlayer2', 'HPlayer3', 'HPlayer4', 'HPlayer5', 'HPlayer6', 'HGoalie', 'PlayersH']]
da = da.sort_values(['Season', 'GameNumber', 'Period', 'Time'], ascending=[True, True, True, True])

In [124]:
da.head()

Unnamed: 0,Season,GameNumber,Date,Period,Time,Strength,Event,EventDetail,EventTeamCode,EventPlayer1,EventPlayer2,EventPlayer3,ATeamCode,APlayer1,APlayer2,APlayer3,APlayer4,APlayer5,APlayer6,AGoalie,PlayersA,HTeamCode,HPlayer1,HPlayer2,HPlayer3,HPlayer4,HPlayer5,HPlayer6,HGoalie,PlayersH
0,2016,20001,2016-10-12,1,0:00,5x5,PSTR,Period Start- Local time: 7:18 EDT,,,,,TOR,NAZEM KADRI,LEO KOMAROV,MILAN MICHALEK,MORGAN RIELLY,MARTIN MARINCIN,FREDERIK ANDERSEN,FREDERIK ANDERSEN,6,OTT,ZACK SMITH,DERICK BRASSARD,BOBBY RYAN,DION PHANEUF,CODY CECI,CRAIG ANDERSON,CRAIG ANDERSON,6
1,2016,20001,2016-10-12,1,0:00,5x5,FAC,TOR won Neu. Zone - TOR #43 KADRI vs OTT #19 B...,TOR,NAZEM KADRI,DERICK BRASSARD,,TOR,NAZEM KADRI,LEO KOMAROV,MILAN MICHALEK,MORGAN RIELLY,MARTIN MARINCIN,FREDERIK ANDERSEN,FREDERIK ANDERSEN,6,OTT,ZACK SMITH,DERICK BRASSARD,BOBBY RYAN,DION PHANEUF,CODY CECI,CRAIG ANDERSON,CRAIG ANDERSON,6
2,2016,20001,2016-10-12,1,0:05,5x5,STOP,ICING,,,,,TOR,NAZEM KADRI,LEO KOMAROV,MILAN MICHALEK,MORGAN RIELLY,MARTIN MARINCIN,FREDERIK ANDERSEN,FREDERIK ANDERSEN,6,OTT,ZACK SMITH,DERICK BRASSARD,BOBBY RYAN,DION PHANEUF,CODY CECI,CRAIG ANDERSON,CRAIG ANDERSON,6
3,2016,20001,2016-10-12,1,0:05,5x5,FAC,TOR won Def. Zone - TOR #43 KADRI vs OTT #19 B...,TOR,NAZEM KADRI,DERICK BRASSARD,,TOR,NAZEM KADRI,LEO KOMAROV,MILAN MICHALEK,MORGAN RIELLY,MARTIN MARINCIN,FREDERIK ANDERSEN,FREDERIK ANDERSEN,6,OTT,ZACK SMITH,DERICK BRASSARD,BOBBY RYAN,DION PHANEUF,CODY CECI,CRAIG ANDERSON,CRAIG ANDERSON,6
4,2016,20001,2016-10-12,1,0:43,5x5,TAKE,"TOR TAKEAWAY - #16 MARNER, Off. Zone",TOR,MITCHELL MARNER,,,TOR,MITCHELL MARNER,TYLER BOZAK,JAMES VAN RIEMSDYK,CONNOR CARRICK,JAKE GARDINER,FREDERIK ANDERSEN,FREDERIK ANDERSEN,6,OTT,KYLE TURRIS,MIKE HOFFMAN,MARK STONE,MARC METHOT,CODY CECI,CRAIG ANDERSON,CRAIG ANDERSON,6


- keep only regular season games and exclude irrelevant on-ice events

In [9]:
da = da[da['GameNumber'] <= 21230]
da = da[da['Period'] <= 4]
da = da[da['Period'] >= 1]
da = da[da['Event']!='STOP']
da = da[da['Event']!='EISTR']
da = da[da['Event']!='EIEND']
da = da[da['Event']!='PSTR']
da = da[da['Event']!='PEND']
da = da[da['Event']!='SOC']
da = da[da['Event']!='GEND']

- create an event number variable that will count the number of events per game. 
- create an advantage type variable for even strength, power play and short handed situations.

In [10]:
da['EventNumber'] = da.groupby(['Season', 'GameNumber']).cumcount()+1

In [11]:
da['AdvantageType'] = da.apply(lambda x: 'EV' if ((x['PlayersA'] == x['PlayersH']) & (x['PlayersA'] != 1) & (x['PlayersH'] != 1)) else 'PP' if ((x['PlayersA'] > x['PlayersH']) & (x['EventTeamCode'] == x['ATeamCode'])) else 'PP' if ((x['PlayersA'] < x['PlayersH']) & (x['EventTeamCode'] == x['HTeamCode'])) else 'SH' if ((x['PlayersA'] < x['PlayersH']) & (x['EventTeamCode'] == x['ATeamCode'])) else 'SH' if ((x['PlayersA'] > x['PlayersH']) & (x['EventTeamCode'] == x['HTeamCode'])) else 'PP' if ((x['PlayersA'] == 1 ) & (x['PlayersH'] == 1)) else np.nan, axis=1) 

In [12]:
da['AdvantageType'].value_counts()

EV    266764
PP     32659
SH     14169
Name: AdvantageType, dtype: int64

- reshape data from wide to long for away team players and home team players

In [13]:
db = da.copy()
a = [col for col in db.columns if 'APlayer' in col]
b = [col for col in db.columns if 'HPlayer' in col]
db = pd.lreshape(db, {'APlayer' : a, 'HPlayer' : b })
db.head()

Unnamed: 0,AGoalie,ATeamCode,AdvantageType,Date,Event,EventDetail,EventNumber,EventPlayer1,EventPlayer2,EventPlayer3,EventTeamCode,GameNumber,HGoalie,HTeamCode,Period,PlayersA,PlayersH,Season,Strength,Time,APlayer,HPlayer
0,FREDERIK ANDERSEN,TOR,EV,2016-10-12,FAC,TOR won Neu. Zone - TOR #43 KADRI vs OTT #19 B...,1,NAZEM KADRI,DERICK BRASSARD,,TOR,20001,CRAIG ANDERSON,OTT,1,6,6,2016,5x5,0:00,NAZEM KADRI,ZACK SMITH
1,FREDERIK ANDERSEN,TOR,EV,2016-10-12,FAC,TOR won Def. Zone - TOR #43 KADRI vs OTT #19 B...,2,NAZEM KADRI,DERICK BRASSARD,,TOR,20001,CRAIG ANDERSON,OTT,1,6,6,2016,5x5,0:05,NAZEM KADRI,ZACK SMITH
2,FREDERIK ANDERSEN,TOR,EV,2016-10-12,TAKE,"TOR TAKEAWAY - #16 MARNER, Off. Zone",3,MITCHELL MARNER,,,TOR,20001,CRAIG ANDERSON,OTT,1,6,6,2016,5x5,0:43,MITCHELL MARNER,KYLE TURRIS
3,FREDERIK ANDERSEN,TOR,EV,2016-10-12,BLOCK,"TOR #52 MARINCIN BLOCKED BY OTT #44 PAGEAU, W...",4,JEAN-GABRIEL PAGEAU,MARTIN MARINCIN,,TOR,20001,CRAIG ANDERSON,OTT,1,6,6,2016,5x5,10:05,PETER HOLLAND,TOM PYATT
4,FREDERIK ANDERSEN,TOR,EV,2016-10-12,FAC,OTT won Off. Zone - TOR #24 HOLLAND vs OTT #15...,5,ZACK SMITH,PETER HOLLAND,,OTT,20001,CRAIG ANDERSON,OTT,1,6,6,2016,5x5,10:11,PETER HOLLAND,ZACK SMITH


In [14]:
db = db[['Season', 'GameNumber', 'Date', 'Period', 'Time', 'AdvantageType', 'Strength', 'EventNumber', 'Event', 'EventDetail', 'EventTeamCode', 'EventPlayer1', 'EventPlayer2', 'EventPlayer3', 'ATeamCode', 'APlayer', 'AGoalie', 'PlayersA', 'HTeamCode', 'HPlayer', 'HGoalie', 'PlayersH']]
db = db.sort_values(['Season', 'GameNumber', 'Period', 'Time'], ascending=[True, True, True, True])

- reshape data from wide to long for away and home team code and player respectfully.

In [15]:
dc = db.copy()
dc = dc.rename(columns={'EventTeamCode': 'EventTeam', 'EventPlayer1': 'EventP1', 'EventPlayer2': 'EventP2', 'EventPlayer3': 'EventP3', 'PlayersA':'PA', 'PlayersH': 'PH'})

In [16]:
a = [col for col in dc.columns if 'Code' in col]
b = [col for col in dc.columns if 'Player' in col]
c = [col for col in dc.columns if 'Goalie' in col]
d = [col for col in dc.columns if 'Position' in col]
dc = pd.lreshape(dc, {'TeamCode' : a, 'PlayerName' : b, 'Goalie': c})

In [17]:
dc = dc.rename(columns={'EventTeam': 'EventTeamCode', 'EventP1': 'EventPlayer1', 'EventP2': 'EventPlayer2', 'EventP3': 'EventPlayer3', 'PA':'PlayersA', 'PH': 'PlayersH'})

In [18]:
dc.shape

(3607019, 19)

### games played

- calculate the games each player participated in for the duration of the 2011 regular season.

In [19]:
s = dc.copy()
s = s[['Season', 'GameNumber', 'TeamCode', 'PlayerName']]
s = s.sort_values(['Season', 'GameNumber', 'TeamCode'], ascending=[True, True, True])
s = s.drop_duplicates(['Season', 'GameNumber', 'TeamCode', 'PlayerName'])

In [20]:
s['GP'] = s.groupby(['Season', 'TeamCode','PlayerName'])['GameNumber'].transform('count')
s = s.drop_duplicates(['Season', 'TeamCode', 'PlayerName'])
s = s[['Season', 'TeamCode', 'PlayerName', 'GP']]
s.head()

Unnamed: 0,Season,TeamCode,PlayerName,GP
1803381,2016,OTT,ZACK SMITH,74
1803382,2016,OTT,DERICK BRASSARD,81
1803383,2016,OTT,BOBBY RYAN,62
1803384,2016,OTT,DION PHANEUF,81
1803385,2016,OTT,CODY CECI,79


In [21]:
s.isnull().sum()

Season        0
TeamCode      0
PlayerName    0
GP            0
dtype: int64

In [22]:
s.shape

(1050, 4)

### shots for and against

- calculate the shots against each player faced througout the duration of a season. 
- create shot for and shot against variables. If event team code is indiferrent to team code assign a value of 1. If not, assign a value of 0. 
- **goals are a result of a shot and therefore need to be added to a player's shot total**. Display total shots per player for a season.

In [23]:
sa = dc.copy()
sa = sa[sa['Event'].isin(['SHOT','GOAL'])]

In [24]:
sa['ShotF'] = sa.apply(lambda x: 1 if x['EventTeamCode'] == x['TeamCode'] else 0, axis=1)
sa['ShotsF'] = sa.groupby(['Season', 'TeamCode', 'PlayerName'])['ShotF'].transform('sum')

sa['ShotA'] = sa.apply(lambda x: 1 if x['EventTeamCode'] != x['TeamCode'] else 0, axis=1)
sa['ShotsA'] = sa.groupby(['Season', 'TeamCode', 'PlayerName'])['ShotA'].transform('sum')

- create shot for and shot against variables for even strength situations only. If event team code is indiferrent to team code assign a value of 1. If not, assign a value of 0. 
- **goals are a result of a shot and therefore need to be added to a player's shot total**. Display total shots per player for a season.

In [25]:
sa['EVShotF'] = sa.apply(lambda x: 1 if ((x['AdvantageType'] == 'EV') & (x['EventTeamCode'] == x['TeamCode'])) else 1 if ((x['AdvantageType'] == 'SH') & (x['EventTeamCode'] == x['TeamCode'])) else 0, axis=1)
sa['EVShotsF'] = sa.groupby(['Season', 'TeamCode', 'PlayerName'])['EVShotF'].transform('sum')

sa['EVShotA'] = sa.apply(lambda x: 1 if ((x['AdvantageType'] == 'EV') & (x['EventTeamCode'] != x['TeamCode'])) else 1 if ((x['AdvantageType'] == 'SH') & (x['EventTeamCode'] != x['TeamCode'])) else 0, axis=1)
sa['EVShotsA'] = sa.groupby(['Season', 'TeamCode', 'PlayerName'])['EVShotA'].transform('sum')

- calculate the shot differential for all shots and even strength shots.

In [26]:
sa['DShots'] = sa['ShotsF'] - sa['ShotsA']
sa['EVDShots'] = sa['EVShotsF'] - sa['EVShotsA']

In [27]:
sa = sa [['Season', 'TeamCode', 'PlayerName', 'ShotsF', 'ShotsA', 'DShots', 'EVShotsF', 'EVShotsA', 'EVDShots' ]]
sa = sa.drop_duplicates(['Season', 'TeamCode', 'PlayerName'])
sa.head()

Unnamed: 0,Season,TeamCode,PlayerName,ShotsF,ShotsA,DShots,EVShotsF,EVShotsA,EVDShots
30,2016,TOR,PETER HOLLAND,39,50,-11,38,43,-5
31,2016,TOR,CONNOR BROWN,627,717,-90,549,570,-21
32,2016,TOR,MATT MARTIN,306,353,-47,305,332,-27
33,2016,TOR,JAKE GARDINER,1002,832,170,821,810,11
34,2016,TOR,MARTIN MARINCIN,198,276,-78,198,213,-15


In [28]:
sa.isnull().sum()

Season        0
TeamCode      0
PlayerName    0
ShotsF        0
ShotsA        0
DShots        0
EVShotsF      0
EVShotsA      0
EVDShots      0
dtype: int64

In [29]:
sa.shape

(1049, 9)

### goals for and against

- create goals for and goals against variable. If event team code is the same as team code assign a value of 1 for goals for and a value of 0 for goals against. If event team code is different to team code, assign a value of 0 for golas for and a value of 1 for goals against. Display total goals for and against per player for a season.

In [30]:
gfa = dc.copy()
gfa = gfa[gfa['Event'] == 'GOAL']

In [31]:
gfa['EVGoalF'] = gfa.apply(lambda x: 1 if ((x['AdvantageType'] == 'EV') & (x['EventTeamCode'] == x['TeamCode'])) else 1 if ((x['AdvantageType'] == 'SH') & (x['EventTeamCode'] == x['TeamCode'])) else 0, axis=1)
gfa['EVGoalA'] = gfa.apply(lambda x: 1 if ((x['AdvantageType'] == 'EV') & (x['EventTeamCode'] != x['TeamCode'])) else 1 if ((x['AdvantageType'] == 'SH') & (x['EventTeamCode'] != x['TeamCode'])) else 0, axis=1)

In [32]:
gfa['GoalF'] = gfa.apply(lambda x: 1 if (x['EventTeamCode'] == x['TeamCode']) else 0, axis=1)
gfa['GoalA'] = gfa.apply(lambda x: 1 if (x['EventTeamCode'] != x['TeamCode']) else 0, axis=1)

In [33]:
gfa['EVGoalsF'] = gfa.groupby(['Season', 'TeamCode', 'PlayerName'])['EVGoalF'].transform('sum')
gfa['EVGoalsA'] = gfa.groupby(['Season', 'TeamCode', 'PlayerName'])['EVGoalA'].transform('sum')
gfa['GoalsF'] = gfa.groupby(['Season', 'TeamCode', 'PlayerName'])['GoalF'].transform('sum')
gfa['GoalsA'] = gfa.groupby(['Season', 'TeamCode', 'PlayerName'])['GoalA'].transform('sum')

In [34]:
gfa = gfa[['Season', 'GameNumber', 'TeamCode', 'PlayerName' , 'GoalsF', 'GoalsA', 'EVGoalsF', 'EVGoalsA']]
gfa = gfa.drop_duplicates(['Season', 'GameNumber', 'TeamCode', 'PlayerName'])
gfa['Plus/Minus'] = gfa['EVGoalsF'] - gfa['EVGoalsA']
gfa = gfa.drop_duplicates(['Season', 'TeamCode', 'PlayerName'])
gfa = gfa[['Season', 'TeamCode', 'PlayerName' , 'GoalsF', 'GoalsA', 'EVGoalsF', 'EVGoalsA', 'Plus/Minus']]
gfa.head()

Unnamed: 0,Season,TeamCode,PlayerName,GoalsF,GoalsA,EVGoalsF,EVGoalsA,Plus/Minus
42,2016,TOR,PETER HOLLAND,1,4,1,3,-2
44,2016,TOR,CONNOR BROWN,66,58,52,45,7
46,2016,TOR,MATT MARTIN,20,22,20,20,0
48,2016,TOR,JAKE GARDINER,106,55,80,53,27
50,2016,TOR,MARTIN MARINCIN,21,22,21,18,3


In [35]:
gfa.shape

(1018, 8)

### create dataframes for all on-ice event types

#### goals dataframe (dg)

In [36]:
dg = dc.copy()
dg = dg[dg['Event'] == 'GOAL']

In [37]:
dg = dg[['Season', 'GameNumber', 'EventNumber', 'Event', 'EventTeamCode', 'EventPlayer1']]
dg = dg.rename(columns={'EventPlayer1': 'PlayerName', 'EventTeamCode':'TeamCode'})

- create a goal variable. If event type is a goal assign a value of 1. If not, assign a value of 0. Display total goals per player for a season.

In [38]:
dg['Goal'] = dg.apply(lambda x: 1 if (x['Event'] == 'GOAL') else 0, axis=1)
dg = dg.drop_duplicates(['Season', 'GameNumber', 'EventNumber', 'TeamCode', 'PlayerName'])
dg['Goals'] = dg.groupby(['Season','TeamCode', 'PlayerName'])['Goal'].transform('sum')

- keep one observation per player by game and drop duplicates. This purpose of this step is to calculate the quantity of games per player for a season. 

In [39]:
dg = dg.drop_duplicates(['Season', 'GameNumber', 'TeamCode', 'PlayerName'])
dg = dg.drop_duplicates(['Season', 'TeamCode', 'PlayerName'])
dg = dg[['Season', 'TeamCode', 'PlayerName', 'Goals']]
dg.head()

Unnamed: 0,Season,TeamCode,PlayerName,Goals
42,2016,OTT,BOBBY RYAN,13
96,2016,OTT,ERIK KARLSSON,17
149,2016,TOR,AUSTON MATTHEWS,40
826,2016,OTT,DERICK BRASSARD,14
1556,2016,OTT,KYLE TURRIS,27


In [40]:
dg.isnull().sum()

Season        0
TeamCode      0
PlayerName    0
Goals         0
dtype: int64

In [41]:
dg.shape

(721, 4)

#### assists dataframe (dast)

In [42]:
dast = dc.copy()
dast = dast[dast['Event'] == 'GOAL']

In [43]:
dast = dast[['Season', 'GameNumber', 'EventNumber', 'Event', 'EventTeamCode', 'EventPlayer2', 'EventPlayer3']]
dast = dast.rename(columns={'EventPlayer2': 'Assist1Name', 'EventPlayer3': 'Assist2Name', 'EventTeamCode':'TeamCode'})

- reshape assist players 1 and 2 to create one column of assist players

In [44]:
a = [col for col in dast.columns if 'Name' in col]
dast = pd.lreshape(dast, {'PlayerName' : a})

- create an assist variable. If event type is a goal assign a value of 1. If not, assign a value of 0. Display total goals per player for a season.

In [45]:
dast['Assist'] = dast.apply(lambda x: 1 if (x['Event'] == 'GOAL') else 0, axis=1)
dast = dast.drop_duplicates(['Season', 'GameNumber', 'EventNumber', 'TeamCode', 'PlayerName'])
dast['Assists'] = dast.groupby(['Season','TeamCode', 'PlayerName'])['Assist'].transform('sum')

- keep one observation per player by game and drop duplicates. This purpose of this step is to calculate the quantity of games per player for a season. 

In [46]:
dast = dast.drop_duplicates(['Season', 'GameNumber', 'TeamCode', 'PlayerName'])
dast = dast.drop_duplicates(['Season', 'TeamCode', 'PlayerName'])
dast = dast[['Season', 'TeamCode', 'PlayerName', 'Assists']]
dast.head()

Unnamed: 0,Season,TeamCode,PlayerName,Assists
0,2016,OTT,ERIK KARLSSON,54
6,2016,OTT,DERICK BRASSARD,25
12,2016,TOR,ZACH HYMAN,18
18,2016,TOR,WILLIAM NYLANDER,39
24,2016,TOR,MORGAN RIELLY,21


In [47]:
dast.isnull().sum()

Season        0
TeamCode      0
PlayerName    0
Assists       0
dtype: int64

In [48]:
dast.shape

(817, 4)

#### shot dataframe (ds)

In [49]:
ds = dc.copy()
ds = ds[ds['Event'] == 'SHOT']

In [50]:
ds = ds[['Season', 'GameNumber', 'EventNumber', 'Event', 'EventTeamCode', 'EventPlayer1']]
ds = ds.rename(columns={'EventPlayer1': 'PlayerName', 'EventTeamCode':'TeamCode'})
ds.head()

Unnamed: 0,Season,GameNumber,EventNumber,Event,TeamCode,PlayerName
30,2016,20001,6,SHOT,OTT,ERIK KARLSSON
31,2016,20001,6,SHOT,OTT,ERIK KARLSSON
32,2016,20001,6,SHOT,OTT,ERIK KARLSSON
33,2016,20001,6,SHOT,OTT,ERIK KARLSSON
34,2016,20001,6,SHOT,OTT,ERIK KARLSSON


- create a shot variable. If event type is a goal assign a value of 1. If not, assign a value of 0. Display total shots per player for a season.

In [51]:
ds['Shot'] = ds.apply(lambda x: 1 if (x['Event'] == 'SHOT') else 0, axis=1)
ds = ds.drop_duplicates(['Season', 'GameNumber', 'EventNumber', 'TeamCode', 'PlayerName'])
ds['Shots'] = ds.groupby(['Season','TeamCode', 'PlayerName'])['Shot'].transform('sum')

- drop duplicates to keep one observation per player for the season.

In [52]:
ds = ds.drop_duplicates(['Season', 'GameNumber', 'TeamCode', 'PlayerName'])
ds = ds.drop_duplicates(['Season', 'TeamCode', 'PlayerName'])
ds = ds[['Season', 'TeamCode', 'PlayerName', 'Shots']]
ds.head()

Unnamed: 0,Season,TeamCode,PlayerName,Shots
30,2016,OTT,ERIK KARLSSON,202
66,2016,OTT,TOM PYATT,86
90,2016,OTT,BOBBY RYAN,98
190,2016,TOR,WILLIAM NYLANDER,183
205,2016,TOR,MITCHELL MARNER,156


In [53]:
ds.isnull().sum()

Season        0
TeamCode      0
PlayerName    0
Shots         0
dtype: int64

In [54]:
ds.shape

(933, 4)

#### block dataframe (db)

In [55]:
dbl = dc.copy()
dbl = dbl[dbl['Event'] == 'BLOCK']

In [56]:
dbl.shape

(401028, 19)

In [57]:
dbl['EventPlayer2'].isnull().sum()

0

In [58]:
dbl['EventPlayer3'].isnull().sum()

401028

In [59]:
dbl['BlockTeam'] = dbl.EventDetail.str.split(' ', expand = True)[6]
dbl['BlockTeamCode'] = dbl.BlockTeam.str.split(' ', expand = True)[0]

In [60]:
dbl = dbl[['Season', 'GameNumber', 'EventNumber', 'Event', 'EventPlayer1', 'BlockTeamCode']]
dbl = dbl.rename(columns={'EventPlayer1': 'PlayerName', 'BlockTeamCode':'TeamCode'})

- create a block variable. If event type is a block assign a value of 1. If not, assign a value of 0. Display total shots per player for a season.

In [61]:
dbl['Block'] = dbl.apply(lambda x: 1 if (x['Event'] == 'BLOCK') else 0, axis=1)
dbl = dbl.drop_duplicates(['Season', 'GameNumber', 'EventNumber', 'TeamCode', 'PlayerName'])
dbl['Blocks'] = dbl.groupby(['Season','TeamCode', 'PlayerName'])['Block'].transform('sum')

- drop duplicates to keep one observation per player for the season.

In [62]:
dbl = dbl.drop_duplicates(['Season', 'GameNumber', 'TeamCode', 'PlayerName'])
dbl = dbl.drop_duplicates(['Season', 'TeamCode', 'PlayerName'])
dbl = dbl[['Season', 'TeamCode', 'PlayerName', 'Blocks']]
dbl.head()

Unnamed: 0,Season,TeamCode,PlayerName,Blocks
18,2016,OTT,JEAN-GABRIEL PAGEAU,79
78,2016,OTT,MARC METHOT,74
131,2016,OTT,CODY CECI,159
173,2016,OTT,DION PHANEUF,154
200,2016,TOR,MITCHELL MARNER,39


In [63]:
dbl.isnull().sum()

Season        0
TeamCode      0
PlayerName    0
Blocks        0
dtype: int64

In [64]:
dbl.shape

(1112, 4)

#### hit dataframe (dh)

In [65]:
dhit = dc.copy()
dhit = dhit[dhit['Event'] == 'HIT']

In [66]:
dhit.shape

(632019, 19)

In [67]:
dhit['EventPlayer2'].isnull().sum()

12

In [68]:
dhit['EventPlayer3'].isnull().sum()

632019

In [69]:
dhit = dhit[['Season', 'GameNumber', 'EventNumber', 'Event', 'EventTeamCode', 'EventPlayer1']]
dhit = dhit.rename(columns={'EventPlayer1': 'PlayerName', 'EventTeamCode':'TeamCode'})

- create a hit variable. If event type is a hit assign a value of 1. If not, assign a value of 0. Display total shots per player for a season.

In [70]:
dhit['Hit'] = dhit.apply(lambda x: 1 if (x['Event'] == 'HIT') else 0, axis=1)
dhit = dhit.drop_duplicates(['Season', 'GameNumber', 'EventNumber', 'TeamCode', 'PlayerName'])
dhit['Hits'] = dhit.groupby(['Season','TeamCode', 'PlayerName'])['Hit'].transform('sum')

- drop duplicates to keep one observation per player for the season.

In [71]:
dhit = dhit.drop_duplicates(['Season', 'GameNumber', 'TeamCode', 'PlayerName'])
dhit = dhit.drop_duplicates(['Season', 'TeamCode', 'PlayerName'])
dhit = dhit[['Season', 'TeamCode', 'PlayerName', 'Hits']]
dhit.head()

Unnamed: 0,Season,TeamCode,PlayerName,Hits
54,2016,OTT,MARK BOROWIECKI,364.0
108,2016,TOR,MATT MARTIN,300.0
137,2016,TOR,LEO KOMAROV,231.0
161,2016,OTT,ZACK SMITH,140.0
298,2016,TOR,JAKE GARDINER,69.0


In [72]:
dhit.isnull().sum()

Season        0
TeamCode      0
PlayerName    1
Hits          1
dtype: int64

In [73]:
dhit.shape

(945, 4)

#### penalty dataframe (dpen)

In [74]:
dp = dc.copy()
dp = dp[dp['Event'] == 'PENL']

- use event detail to find the duration of a given penalty and to assign it to the proper player. Major penalty is 5 minutes, so a value of 5 is assigned for every event that had a major penalty.

In [75]:
dp['Pen'] = dp.EventDetail.str.split(')', expand = True)[0]
dp['Penal'] = dp.Pen.str.split('(', expand = True)[1]
dp['Penalt'] = dp.Penal.str.split(' ', expand = True)[0]
dp['Penalty'] = dp.apply(lambda x: 5 if x['Penalt'] == 'maj' else x['Penalt'], axis=1)
dp['Penalty'] = dp['Penalty'].convert_objects(convert_numeric=True)
dp = dp[['Season', 'GameNumber', 'Period', 'Time', 'EventNumber', 'Event', 'EventDetail', 'EventTeamCode', 'EventPlayer1', 'EventPlayer2', 'EventPlayer3', 'TeamCode', 'PlayerName', 'Penalty']]



- use event detail to find penalty team code and penalty player name.

In [76]:
dp['PenaltyTeamCode'] = dp.EventDetail.str.split('\\s', expand = True)[0]
dp['PN'] = dp.EventDetail.str.split('\\s', expand = True)[1]
dp['PNumber'] = dp.PN.str.split('#', expand = True)[1]
dp['PenaltyPlayerNumber'] = dp.PNumber.str.split(' ', expand = True)[0]
dp['PenaltyPlayerLName'] = dp.EventDetail.str.split('\\s', expand = True)[2]
#dp['PenaltyPlayerLName'] = dp.PPlLN.str.split(' ', expand = True)[0]
dp = dp[['Season', 'GameNumber', 'Period', 'Time', 'EventNumber', 'Event', 'EventDetail', 'EventTeamCode', 'EventPlayer1', 'EventPlayer2', 'EventPlayer3', 'TeamCode', 'PlayerName', 'Penalty', 'PenaltyTeamCode', 'PenaltyPlayerNumber', 'PenaltyPlayerLName']]

- seperate player name into first and last. The purpose is to connect the correct player name to penalty player name.

In [77]:
dp['PlayerFName'] = dp.PlayerName.str.split('\\s', expand = True)[0]
dp['PlayerLName'] = dp.PlayerName.str.split('\\s', expand = True)[1]
dp['PenaltyPlayerFName'] = dp.apply(lambda x: x['PlayerFName'] if ((x['PenaltyTeamCode'] == x['TeamCode']) & (x['PenaltyPlayerNumber'] == x['PenaltyPlayerNumber']) & (x['PlayerLName'] == x['PenaltyPlayerLName'])) else np.nan, axis=1)
dp['PenaltyPlayerFName'] = dp.groupby(['Season','PenaltyTeamCode', 'PenaltyPlayerNumber', 'PenaltyPlayerLName'])['PenaltyPlayerFName'].apply(lambda x: x.ffill().bfill())

In [78]:
dp['PenaltyName'] = dp.apply(lambda x: x['PlayerName'] if ((x['PenaltyTeamCode'] == x['TeamCode']) & (x['PenaltyPlayerNumber'] == x['PenaltyPlayerNumber']) & (x['PlayerLName'] == x['PenaltyPlayerLName']) & (x['PlayerFName'] == x['PenaltyPlayerFName'])) else np.nan, axis=1)
dp['PenaltyName'] = dp.groupby(['Season','PenaltyTeamCode', 'PenaltyPlayerLName', 'PenaltyPlayerNumber', 'PenaltyPlayerLName'])['PenaltyName'].apply(lambda x: x.ffill().bfill())


- drop duplicates per game and event number. Calculate the total number of penalties for each player.

In [79]:
dp = dp.drop_duplicates(['Season', 'GameNumber', 'EventNumber', 'PenaltyTeamCode', 'PenaltyPlayerNumber', 'PenaltyName'])
dp['Penalties'] = dp.groupby(['Season', 'PenaltyTeamCode', 'PenaltyPlayerNumber', 'PenaltyName'])['Penalty'].transform('sum')
dp['Penalties'] = dp.groupby(['Season','PenaltyTeamCode', 'PenaltyPlayerNumber', 'PlayerName'])['Penalties'].apply(lambda x: x.ffill().bfill())
#dp.head()

- drop duplicates to keep one observation per player for the season.

In [80]:
dp = dp[['Season', 'PenaltyTeamCode', 'PenaltyName', 'Penalties']]
dp = dp.drop_duplicates(['Season', 'PenaltyTeamCode', 'PenaltyName'])
dp = dp.rename(columns={'PenaltyTeamCode':'TeamCode', 'PenaltyName':'PlayerName'})
dp = dp[['Season', 'TeamCode', 'PlayerName', 'Penalties']]
dp.head()

Unnamed: 0,Season,TeamCode,PlayerName,Penalties
114,2016,OTT,MARK BOROWIECKI,152.0
115,2016,TOR,MATT MARTIN,123.0
179,2016,OTT,ZACK SMITH,61.0
252,2016,TOR,CONNOR BROWN,10.0
624,2016,OTT,DION PHANEUF,98.0


- keep observations that are not null. Since a team can be allocated a penalty, those observations are excluded from the data.

In [81]:
dp['PlayerName'].isnull().sum()

30

In [82]:
dp = dp[dp['PlayerName'].notnull()]

In [83]:
dp.isnull().sum()

Season        0
TeamCode      0
PlayerName    0
Penalties     0
dtype: int64

In [84]:
dp.shape

(856, 4)

####  miss dataframe (dmiss)

In [85]:
dmiss = dc.copy()
dmiss = dmiss[dmiss['Event'] == 'MISS']

In [86]:
dmiss.shape

(331116, 19)

In [87]:
dmiss['EventPlayer2'].isnull().sum()

331116

In [88]:
dmiss['EventPlayer3'].isnull().sum()

331116

In [89]:
dmiss = dmiss[['Season', 'GameNumber', 'EventNumber', 'Event', 'EventTeamCode', 'EventPlayer1']]
dmiss = dmiss.rename(columns={'EventPlayer1': 'PlayerName', 'EventTeamCode':'TeamCode'})

- create a miss variable. If event type is a miss assign a value of 1. If not, assign a value of 0. Display total shots per player for a season.

In [90]:
dmiss['Miss'] = dmiss.apply(lambda x: 1 if (x['Event'] == 'MISS') else 0, axis=1)
dmiss = dmiss.drop_duplicates(['Season', 'GameNumber', 'EventNumber', 'TeamCode', 'PlayerName'])
dmiss['Misses'] = dmiss.groupby(['Season','TeamCode', 'PlayerName'])['Miss'].transform('sum')

- drop duplicates to keep one observation per player for the season.

In [91]:
dmiss = dmiss.drop_duplicates(['Season', 'GameNumber', 'TeamCode', 'PlayerName'])
dmiss = dmiss.drop_duplicates(['Season', 'TeamCode', 'PlayerName'])
dmiss = dmiss[['Season', 'TeamCode', 'PlayerName', 'Misses']]
dmiss.head()

Unnamed: 0,Season,TeamCode,PlayerName,Misses
84,2016,TOR,MITCHELL MARNER,80.0
337,2016,OTT,DION PHANEUF,56.0
397,2016,TOR,LEO KOMAROV,38.0
409,2016,OTT,MIKE HOFFMAN,88.0
499,2016,TOR,WILLIAM NYLANDER,81.0


In [92]:
dmiss.isnull().sum()

Season        0
TeamCode      0
PlayerName    1
Misses        1
dtype: int64

In [93]:
dmiss.shape

(896, 4)

####  takeaway dataframe (dtake)

In [94]:
dtake = dc.copy()
dtake = dtake[dtake['Event'] == 'TAKE']

In [95]:
dtake.shape

(191913, 19)

In [96]:
dtake['EventPlayer2'].isnull().sum()

191913

In [97]:
dtake['EventPlayer3'].isnull().sum()

191913

In [98]:
dtake = dtake[['Season', 'GameNumber', 'EventNumber', 'Event', 'EventTeamCode', 'EventPlayer1']]
dtake = dtake.rename(columns={'EventPlayer1': 'PlayerName', 'EventTeamCode':'TeamCode'})

- create a takeaway variable. If event type is a takeaway assign a value of 1. If not, assign a value of 0. Display total shots per player for a season.

In [99]:
dtake['Takeaway'] = dtake.apply(lambda x: 1 if (x['Event'] == 'TAKE') else 0, axis=1)
dtake = dtake.drop_duplicates(['Season', 'GameNumber', 'EventNumber', 'TeamCode', 'PlayerName'])
dtake['Takeaways'] = dtake.groupby(['Season','TeamCode', 'PlayerName'])['Takeaway'].transform('sum')

- drop duplicates to keep one observation per player for the season.

In [100]:
dtake = dtake.drop_duplicates(['Season', 'GameNumber', 'TeamCode', 'PlayerName'])
dtake = dtake.drop_duplicates(['Season', 'TeamCode', 'PlayerName'])
dtake = dtake[['Season', 'TeamCode', 'PlayerName', 'Takeaways']]
dtake.head()

Unnamed: 0,Season,TeamCode,PlayerName,Takeaways
12,2016,TOR,MITCHELL MARNER,67
143,2016,TOR,AUSTON MATTHEWS,76
451,2016,TOR,NIKITA ZAITSEV,33
655,2016,OTT,CHRIS KELLY,18
682,2016,OTT,CHRIS WIDEMAN,14


In [101]:
dtake.isnull().sum()

Season        0
TeamCode      0
PlayerName    0
Takeaways     0
dtype: int64

In [102]:
dtake.shape

(857, 4)

####  giveaway dataframe (dgive)

In [103]:
dgive = dc.copy()
dgive = dgive[dgive['Event'] == 'GIVE']

In [104]:
dgive.shape

(253531, 19)

In [105]:
dgive['EventPlayer2'].isnull().sum()

253531

In [106]:
dgive['EventPlayer3'].isnull().sum()

253531

In [107]:
dgive = dgive[['Season', 'GameNumber', 'EventNumber', 'Event', 'EventTeamCode', 'EventPlayer1']]
dgive = dgive.rename(columns={'EventPlayer1': 'PlayerName', 'EventTeamCode':'TeamCode'})

- create a giveaway variable. If event type is a giveaway assign a value of 1. If not, assign a value of 0. Display total shots per player for a season.

In [108]:
dgive['Giveaway'] = dgive.apply(lambda x: 1 if (x['Event'] == 'GIVE') else 0, axis=1)
dgive = dgive.drop_duplicates(['Season', 'GameNumber', 'EventNumber', 'TeamCode', 'PlayerName'])
dgive['Giveaways'] = dgive.groupby(['Season','TeamCode', 'PlayerName'])['Giveaway'].transform('sum')

- drop duplicates to keep one observation per player for the season.

In [109]:
dgive = dgive.drop_duplicates(['Season', 'GameNumber', 'TeamCode', 'PlayerName'])
dgive = dgive.drop_duplicates(['Season', 'TeamCode', 'PlayerName'])
dgive = dgive[['Season', 'TeamCode', 'PlayerName', 'Giveaways']]
dgive.head()

Unnamed: 0,Season,TeamCode,PlayerName,Giveaways
167,2016,OTT,DERICK BRASSARD,37
234,2016,OTT,JEAN-GABRIEL PAGEAU,27
283,2016,TOR,MORGAN RIELLY,76
293,2016,OTT,KYLE TURRIS,57
361,2016,TOR,CONNOR BROWN,25


In [110]:
dgive.isnull().sum()

Season        0
TeamCode      0
PlayerName    0
Giveaways     0
dtype: int64

In [111]:
dgive.shape

(940, 4)

### import 2016 player shifts

In [112]:
dshift = pd.read_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/data/nhl_shifts20162017.csv')
#dshift = pd.read_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/data/nhl_shifts20162017.csv')
dshift = dshift.drop('Unnamed: 0', axis=1)

- rename variables, keep only regular season games and create a season variable.

In [113]:
dshift['Season'] = 2016
dshift= dshift.rename(columns={ 'Game_Id' : 'GameNumber', 'Team': 'TeamCode', 'Player': 'PlayerName' })
dshift = dshift[dshift['GameNumber'] <= 21230]
dshift = dshift[['Season', 'Date', 'GameNumber', 'Period', 'TeamCode', 'PlayerName', 'Player_Id', 'Start', 'End', 'Duration']]
dshift.head()

Unnamed: 0,Season,Date,GameNumber,Period,TeamCode,PlayerName,Player_Id,Start,End,Duration
0,2016,2016-10-12,20001,1,TOR,MARTIN MARINCIN,8475716,0.0,24.0,24.0
1,2016,2016-10-12,20001,1,TOR,MILAN MICHALEK,8470599,0.0,24.0,24.0
2,2016,2016-10-12,20001,1,TOR,MORGAN RIELLY,8476853,0.0,24.0,24.0
3,2016,2016-10-12,20001,1,TOR,NAZEM KADRI,8475172,0.0,24.0,24.0
4,2016,2016-10-12,20001,1,OTT,BOBBY RYAN,8471676,0.0,32.0,32.0


- calcuate the seconds an individual played per game. Convert seconds to minutes and drop duplicates per season, game number, team and player.

In [114]:
dshift['Seconds'] = dshift.groupby(['Season', 'GameNumber', 'TeamCode', 'PlayerName'])['Duration'].transform('sum')
dshift['Minutes'] = dshift['Seconds']/60
dshift['Minutes'] = dshift['Minutes'].round(2)

In [115]:
dshift = dshift.drop_duplicates(['Season', 'GameNumber', 'TeamCode', 'PlayerName'])

In [116]:
dshift.head()

Unnamed: 0,Season,Date,GameNumber,Period,TeamCode,PlayerName,Player_Id,Start,End,Duration,Seconds,Minutes
0,2016,2016-10-12,20001,1,TOR,MARTIN MARINCIN,8475716,0.0,24.0,24.0,1200.0,20.0
1,2016,2016-10-12,20001,1,TOR,MILAN MICHALEK,8470599,0.0,24.0,24.0,761.0,12.68
2,2016,2016-10-12,20001,1,TOR,MORGAN RIELLY,8476853,0.0,24.0,24.0,1545.0,25.75
3,2016,2016-10-12,20001,1,TOR,NAZEM KADRI,8475172,0.0,24.0,24.0,1037.0,17.28
4,2016,2016-10-12,20001,1,OTT,BOBBY RYAN,8471676,0.0,32.0,32.0,870.0,14.5


- calcuate the seconds an individual played for the whole regular season. Convert seconds to minutes and drop duplicates per season, game number, team and player.

In [117]:
dshift['TSeconds'] = dshift.groupby(['Season', 'TeamCode', 'PlayerName'])['Seconds'].transform('sum')
dshift['TOI'] = dshift['TSeconds']/60
dshift['TOI'] = dshift['TOI'].round(0)
dshift.head()

Unnamed: 0,Season,Date,GameNumber,Period,TeamCode,PlayerName,Player_Id,Start,End,Duration,Seconds,Minutes,TSeconds,TOI
0,2016,2016-10-12,20001,1,TOR,MARTIN MARINCIN,8475716,0.0,24.0,24.0,1200.0,20.0,27071.0,451.0
1,2016,2016-10-12,20001,1,TOR,MILAN MICHALEK,8470599,0.0,24.0,24.0,761.0,12.68,4281.0,71.0
2,2016,2016-10-12,20001,1,TOR,MORGAN RIELLY,8476853,0.0,24.0,24.0,1545.0,25.75,101089.0,1685.0
3,2016,2016-10-12,20001,1,TOR,NAZEM KADRI,8475172,0.0,24.0,24.0,1037.0,17.28,81570.0,1360.0
4,2016,2016-10-12,20001,1,OTT,BOBBY RYAN,8471676,0.0,32.0,32.0,870.0,14.5,57801.0,963.0


- keep only observation per player for the season

In [118]:
dshift = dshift.drop_duplicates(['Season', 'TeamCode', 'PlayerName'])
dshift = dshift[['Season', 'TeamCode', 'PlayerName', 'TOI']]
dshift.head()

Unnamed: 0,Season,TeamCode,PlayerName,TOI
0,2016,TOR,MARTIN MARINCIN,451.0
1,2016,TOR,MILAN MICHALEK,71.0
2,2016,TOR,MORGAN RIELLY,1685.0
3,2016,TOR,NAZEM KADRI,1360.0
4,2016,OTT,BOBBY RYAN,963.0


In [119]:
dshift.isnull().sum()

Season        0
TeamCode      0
PlayerName    0
TOI           0
dtype: int64

In [120]:
dshift.shape

(1050, 4)

### import 2016 player positions

In [151]:
dpos = pd.read_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/data/nhl_positions20162017.csv')
#dpos = pd.read_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/data/nhl_positions20162017.csv')

In [152]:
dpos.head()

Unnamed: 0,Last Name,First Name,Position,Team,FName,LName,PlayerName
0,Abbott,Spencer,LW,CHI,SPENCER,ABBOTT,SPENCER ABBOTT
1,Abdelkader,Justin,LW/RW,DET,JUSTIN,ABDELKADER,JUSTIN ABDELKADER
2,Aberg,Pontus,LW,NSH,PONTUS,ABERG,PONTUS ABERG
3,Acciari,Noel,C,BOS,NOEL,ACCIARI,NOEL ACCIARI
4,Agostino,Kenny,LW,STL,KENNY,AGOSTINO,KENNY AGOSTINO


In [153]:
dpos['Season'] = 2016
dpos= dpos.rename(columns={ 'Team': 'TeamCode'})
dpos = dpos[['Season', 'TeamCode', 'PlayerName', 'Position']]
dpos.head()

Unnamed: 0,Season,TeamCode,PlayerName,Position
0,2016,CHI,SPENCER ABBOTT,LW
1,2016,DET,JUSTIN ABDELKADER,LW/RW
2,2016,NSH,PONTUS ABERG,LW
3,2016,BOS,NOEL ACCIARI,C
4,2016,STL,KENNY AGOSTINO,LW


In [154]:
dpos.dtypes

Season         int64
TeamCode      object
PlayerName    object
Position      object
dtype: object

In [155]:
dpos['Position'] = dpos.apply(lambda x: 'C' if x['Position'] == 'C/LW' else 'C' if x['Position'] == 'C/RW' else 'C' if x['Position'] == 'C/RW/LW' else 'C' if x['Position'] == 'C/LW/RW' else 'W' if x['Position'] == 'RW' else 'W' if x['Position'] == 'LW' else 'W' if x['Position'] == 'LW/C' else 'W' if x['Position'] == 'RW/C' else 'W' if x['Position'] == 'RW/LW' else 'W' if x['Position'] == 'LW/RW' else 'W' if x['Position'] == 'RW/LW/C' else 'W' if x['Position'] == 'RW/C/LW' else 'W' if x['Position'] == 'RW/C/LW' else 'W' if x['Position'] == 'LW/D' else 'W' if x['Position'] == 'LW/C/RW' else 'W' if x['Position'] == 'LW/RW/C' else 'D' if x['Position'] == 'D/LW' else 'D' if x['Position'] == 'D/RW' else 'C' if x['Position'] == 'C/D' else 'W' if x['Position'] == 'RW/D' else 'C' if x['Position'] == 'C/LW/C' else x['Position'] , axis=1)
dpos['Position'].value_counts()

W    329
D    299
C    260
G     95
Name: Position, dtype: int64

In [156]:
dpos['TeamCode'] = dpos.apply(lambda x: 'L.A' if x['TeamCode'] == 'LAK' else 'N.J' if x['TeamCode'] == 'NJD' else 'S.J' if x['TeamCode'] == 'SJS' else 'T.B' if x['TeamCode'] == 'TBL' else x['TeamCode'] , axis=1)
dpos.head()

Unnamed: 0,Season,TeamCode,PlayerName,Position
0,2016,CHI,SPENCER ABBOTT,W
1,2016,DET,JUSTIN ABDELKADER,W
2,2016,NSH,PONTUS ABERG,W
3,2016,BOS,NOEL ACCIARI,C
4,2016,STL,KENNY AGOSTINO,W


In [157]:
dpos.shape

(984, 4)

### merge data frames

- merge games played, goals and assists.

In [158]:
dd = pd.merge(s, dpos, on=['Season', 'TeamCode', 'PlayerName'], how='left').merge(dg,on=['Season', 'TeamCode', 'PlayerName'], how = 'left').merge(dast,on=['Season', 'TeamCode', 'PlayerName'], how = 'left')

- forward and backward fill players that have been traded during the season. Fill in positions for three players.

In [159]:
dd['Position'] = dd.groupby(['PlayerName'])['Position'].apply(lambda x: x.ffill().bfill())
dd.head()

Unnamed: 0,Season,TeamCode,PlayerName,GP,Position,Goals,Assists
0,2016,OTT,ZACK SMITH,74,C,16.0,16.0
1,2016,OTT,DERICK BRASSARD,81,C,14.0,25.0
2,2016,OTT,BOBBY RYAN,62,W,13.0,12.0
3,2016,OTT,DION PHANEUF,81,D,9.0,21.0
4,2016,OTT,CODY CECI,79,D,2.0,15.0


In [160]:
dd.isnull().sum()

Season          0
TeamCode        0
PlayerName      0
GP              0
Position        0
Goals         329
Assists       233
dtype: int64

In [161]:
fix = dd.copy()
fix = fix[fix['Position'].isnull()]
fix.head(20)

Unnamed: 0,Season,TeamCode,PlayerName,GP,Position,Goals,Assists


- fill in goals and assists with 0 for players who had no points.

In [162]:
dd['Goals'] = dd['Goals'].fillna(0)
dd['Assists'] = dd['Assists'].fillna(0)
dd['Points'] = dd['Goals'] + dd['Assists']
dd.head()

Unnamed: 0,Season,TeamCode,PlayerName,GP,Position,Goals,Assists,Points
0,2016,OTT,ZACK SMITH,74,C,16.0,16.0,32.0
1,2016,OTT,DERICK BRASSARD,81,C,14.0,25.0,39.0
2,2016,OTT,BOBBY RYAN,62,W,13.0,12.0,25.0
3,2016,OTT,DION PHANEUF,81,D,9.0,21.0,30.0
4,2016,OTT,CODY CECI,79,D,2.0,15.0,17.0


In [163]:
dd.isnull().sum()

Season        0
TeamCode      0
PlayerName    0
GP            0
Position      0
Goals         0
Assists       0
Points        0
dtype: int64

In [164]:
de = pd.merge(dd, gfa,on=['Season', 'TeamCode', 'PlayerName'], how = 'left').merge(ds,on=['Season', 'TeamCode', 'PlayerName'], how = 'left').merge(sa,on=['Season', 'TeamCode', 'PlayerName'], how = 'left').merge(dbl,on=['Season', 'TeamCode', 'PlayerName'], how = 'left').merge(dhit,on=['Season', 'TeamCode', 'PlayerName'], how = 'left').merge(dp,on=['Season', 'TeamCode', 'PlayerName'], how = 'left').merge(dtake,on=['Season', 'TeamCode', 'PlayerName'], how = 'left').merge(dgive,on=['Season', 'TeamCode', 'PlayerName'], how = 'left').merge(dmiss,on=['Season', 'TeamCode', 'PlayerName'], how = 'left').merge(dshift,on=['Season', 'TeamCode', 'PlayerName'], how = 'left')
de.head()

Unnamed: 0,Season,TeamCode,PlayerName,GP,Position,Goals,Assists,Points,GoalsF,GoalsA,EVGoalsF,EVGoalsA,Plus/Minus,Shots,ShotsF,ShotsA,DShots,EVShotsF,EVShotsA,EVDShots,Blocks,Hits,Penalties,Takeaways,Giveaways,Misses,TOI
0,2016,OTT,ZACK SMITH,74,C,16.0,16.0,32.0,45.0,54.0,40.0,34.0,6.0,121.0,615.0,631.0,-16.0,540.0,491.0,49.0,38.0,140.0,61.0,31.0,35.0,56.0,1212.0
1,2016,OTT,DERICK BRASSARD,81,C,14.0,25.0,39.0,74.0,46.0,56.0,45.0,11.0,179.0,803.0,576.0,227.0,658.0,570.0,88.0,24.0,92.0,24.0,37.0,37.0,70.0,1408.0
2,2016,OTT,BOBBY RYAN,62,W,13.0,12.0,25.0,49.0,37.0,34.0,37.0,-3.0,98.0,517.0,442.0,75.0,401.0,442.0,-41.0,34.0,108.0,24.0,27.0,27.0,49.0,963.0
3,2016,OTT,DION PHANEUF,81,D,9.0,21.0,30.0,81.0,89.0,59.0,63.0,-4.0,147.0,906.0,981.0,-75.0,726.0,801.0,-75.0,154.0,132.0,98.0,16.0,54.0,56.0,1866.0
4,2016,OTT,CODY CECI,79,D,2.0,15.0,17.0,58.0,90.0,54.0,66.0,-12.0,141.0,758.0,1011.0,-253.0,706.0,800.0,-94.0,159.0,111.0,20.0,22.0,74.0,54.0,1833.0


- fill in with 0 for all nan values since not all players have been on the ice for a goal for or a goal against. 

In [165]:
de['GoalsF'] = de['GoalsF'].fillna(0)
de['GoalsA'] = de['GoalsA'].fillna(0)
de['EVGoalsF'] = de['EVGoalsF'].fillna(0)
de['EVGoalsA'] = de['EVGoalsA'].fillna(0)
de['Plus/Minus'] = de['Plus/Minus'].fillna(0)
de['Shots'] = de['Shots'].fillna(0)
de['Hits'] = de['Hits'].fillna(0)
de['Blocks'] = de['Blocks'].fillna(0)
de['Penalties'] = de['Penalties'].fillna(0)
de['Giveaways'] = de['Giveaways'].fillna(0)
de['Takeaways'] = de['Takeaways'].fillna(0)
de['Misses'] = de['Misses'].fillna(0)

In [166]:
de.isnull().sum()

Season        0
TeamCode      0
PlayerName    0
GP            0
Position      0
Goals         0
Assists       0
Points        0
GoalsF        0
GoalsA        0
EVGoalsF      0
EVGoalsA      0
Plus/Minus    0
Shots         0
ShotsF        1
ShotsA        1
DShots        1
EVShotsF      1
EVShotsA      1
EVDShots      1
Blocks        0
Hits          0
Penalties     0
Takeaways     0
Giveaways     0
Misses        0
TOI           0
dtype: int64

### total player stats for the season

- players that have been traded need to be evaluated for their season performance. Group by sesaon and player name to calcute the sum of each stat. 

In [167]:
de['TGP'] = de.groupby(['Season', 'PlayerName'])['GP'].transform('sum')
de['TGoals'] = de.groupby(['Season', 'PlayerName'])['Goals'].transform('sum')
de['TAssists'] = de.groupby(['Season', 'PlayerName'])['Assists'].transform('sum')
de['TPoints'] = de.groupby(['Season', 'PlayerName'])['Points'].transform('sum')
de['TGoalsF'] = de.groupby(['Season', 'PlayerName'])['GoalsF'].transform('sum')
de['TGoalsA'] = de.groupby(['Season', 'PlayerName'])['GoalsA'].transform('sum')
de['TEVGoalsF'] = de.groupby(['Season', 'PlayerName'])['EVGoalsF'].transform('sum')
de['TEVGoalsA'] = de.groupby(['Season', 'PlayerName'])['EVGoalsA'].transform('sum')
de['TPlus/Minus'] = de.groupby(['Season', 'PlayerName'])['Plus/Minus'].transform('sum')
de['TShots'] = de.groupby(['Season', 'PlayerName'])['Shots'].transform('sum')
de['TShotsF'] = de.groupby(['Season', 'PlayerName'])['ShotsF'].transform('sum')
de['TShotsA'] = de.groupby(['Season', 'PlayerName'])['ShotsA'].transform('sum')
de['TDShots'] = de.groupby(['Season', 'PlayerName'])['DShots'].transform('sum')
de['TEVShotsF'] = de.groupby(['Season', 'PlayerName'])['EVShotsF'].transform('sum')
de['TEVShotsA'] = de.groupby(['Season', 'PlayerName'])['EVShotsA'].transform('sum')
de['TEVDShots'] = de.groupby(['Season', 'PlayerName'])['EVDShots'].transform('sum')
de['TBlocks'] = de.groupby(['Season', 'PlayerName'])['Blocks'].transform('sum')
de['THits'] = de.groupby(['Season', 'PlayerName'])['Hits'].transform('sum')
de['TPenalties'] = de.groupby(['Season', 'PlayerName'])['Penalties'].transform('sum')
de['TTakeaways'] = de.groupby(['Season', 'PlayerName'])['Takeaways'].transform('sum')
de['TGiveaways'] = de.groupby(['Season', 'PlayerName'])['Giveaways'].transform('sum')
de['TMisses'] = de.groupby(['Season', 'PlayerName'])['Misses'].transform('sum')
de['TTOI'] = de.groupby(['Season', 'PlayerName'])['TOI'].transform('sum')

- divide all stats with total time on ice to scale the data for cluster analysis

In [168]:
de['TOIGoals'] = de['TGoals'] / de['TTOI']
de['TOIAssists'] = de['TAssists'] / de['TTOI']
de['TOIPoints'] = de['TPoints'] / de['TTOI']
de['TOIGoalsF'] = de['TGoalsF'] / de['TTOI']
de['TOIGoalsA'] = de['TGoalsA'] / de['TTOI']
de['TOIEVGoalsF'] = de['TEVGoalsF'] / de['TTOI']
de['TOIEVGoalsA'] = de['TEVGoalsA'] / de['TTOI']
de['TOIPlus/Minus'] = de['TPlus/Minus'] / de['TTOI']
de['TOIShots'] = de['TShots'] / de['TTOI']
de['TOIShotsF'] = de['TShotsF'] / de['TTOI']
de['TOIShotsA'] = de['TShotsA'] / de['TTOI']
de['TOIDShots'] = de['TDShots'] / de['TTOI']
de['TOIEVShotsF'] = de['TEVShotsF'] / de['TTOI']
de['TOIEVShotsA'] = de['TEVShotsA'] / de['TTOI']
de['TOIEVDShots'] = de['TEVDShots'] / de['TTOI']
de['TOIBlocks'] = de['TBlocks'] / de['TTOI']
de['TOIHits'] = de['THits'] / de['TTOI']
de['TOIPenalties'] = de['TPenalties'] / de['TTOI']
de['TOITakeaways'] = de['TTakeaways'] / de['TTOI']
de['TOIGiveaways'] = de['TGiveaways'] / de['TTOI']
de['TOIMisses'] = de['TMisses'] / de['TTOI']

- keep season total variables and scaled variables

In [169]:
de = de[['Season', 'TeamCode', 'PlayerName', 'Position', 'TGP', 'TGoals', 'TAssists', 'TPoints', 'TGoalsF', 'TGoalsA', 'TEVGoalsF', 'TEVGoalsA', 'TPlus/Minus', 'TShots', 'TShotsF', 'TShotsA', 'TDShots', 'TEVShotsF', 'TEVShotsA', 'TEVDShots', 'TBlocks', 'THits', 'TPenalties', 'TTakeaways', 'TGiveaways', 'TMisses', 'TTOI', 'TOIGoals', 'TOIAssists', 'TOIPoints', 'TOIGoalsF', 'TOIGoalsA', 'TOIEVGoalsF', 'TOIEVGoalsA', 'TOIPlus/Minus', 'TOIShots', 'TOIShotsF', 'TOIShotsA', 'TOIDShots', 'TOIEVShotsF', 'TOIEVShotsA', 'TOIEVDShots', 'TOIBlocks', 'TOIHits', 'TOIPenalties', 'TOITakeaways', 'TOIGiveaways', 'TOIMisses']]
de.head()

Unnamed: 0,Season,TeamCode,PlayerName,Position,TGP,TGoals,TAssists,TPoints,TGoalsF,TGoalsA,TEVGoalsF,TEVGoalsA,TPlus/Minus,TShots,TShotsF,TShotsA,TDShots,TEVShotsF,TEVShotsA,TEVDShots,TBlocks,THits,TPenalties,TTakeaways,TGiveaways,TMisses,TTOI,TOIGoals,TOIAssists,TOIPoints,TOIGoalsF,TOIGoalsA,TOIEVGoalsF,TOIEVGoalsA,TOIPlus/Minus,TOIShots,TOIShotsF,TOIShotsA,TOIDShots,TOIEVShotsF,TOIEVShotsA,TOIEVDShots,TOIBlocks,TOIHits,TOIPenalties,TOITakeaways,TOIGiveaways,TOIMisses
0,2016,OTT,ZACK SMITH,C,74,16.0,16.0,32.0,45.0,54.0,40.0,34.0,6.0,121.0,615.0,631.0,-16.0,540.0,491.0,49.0,38.0,140.0,61.0,31.0,35.0,56.0,1212.0,0.013201,0.013201,0.026403,0.037129,0.044554,0.033003,0.028053,0.00495,0.099835,0.507426,0.520627,-0.013201,0.445545,0.405116,0.040429,0.031353,0.115512,0.05033,0.025578,0.028878,0.046205
1,2016,OTT,DERICK BRASSARD,C,81,14.0,25.0,39.0,74.0,46.0,56.0,45.0,11.0,179.0,803.0,576.0,227.0,658.0,570.0,88.0,24.0,92.0,24.0,37.0,37.0,70.0,1408.0,0.009943,0.017756,0.027699,0.052557,0.03267,0.039773,0.03196,0.007812,0.127131,0.570312,0.409091,0.161222,0.46733,0.40483,0.0625,0.017045,0.065341,0.017045,0.026278,0.026278,0.049716
2,2016,OTT,BOBBY RYAN,W,62,13.0,12.0,25.0,49.0,37.0,34.0,37.0,-3.0,98.0,517.0,442.0,75.0,401.0,442.0,-41.0,34.0,108.0,24.0,27.0,27.0,49.0,963.0,0.013499,0.012461,0.025961,0.050883,0.038422,0.035306,0.038422,-0.003115,0.101765,0.536864,0.458982,0.077882,0.416407,0.458982,-0.042575,0.035306,0.11215,0.024922,0.028037,0.028037,0.050883
3,2016,OTT,DION PHANEUF,D,81,9.0,21.0,30.0,81.0,89.0,59.0,63.0,-4.0,147.0,906.0,981.0,-75.0,726.0,801.0,-75.0,154.0,132.0,98.0,16.0,54.0,56.0,1866.0,0.004823,0.011254,0.016077,0.043408,0.047696,0.031618,0.033762,-0.002144,0.078778,0.485531,0.525723,-0.040193,0.389068,0.42926,-0.040193,0.082529,0.07074,0.052519,0.008574,0.028939,0.030011
4,2016,OTT,CODY CECI,D,79,2.0,15.0,17.0,58.0,90.0,54.0,66.0,-12.0,141.0,758.0,1011.0,-253.0,706.0,800.0,-94.0,159.0,111.0,20.0,22.0,74.0,54.0,1833.0,0.001091,0.008183,0.009274,0.031642,0.0491,0.02946,0.036007,-0.006547,0.076923,0.41353,0.551555,-0.138025,0.385161,0.436443,-0.051282,0.086743,0.060556,0.010911,0.012002,0.040371,0.02946


In [170]:
de.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/stats/2016_player_stats.csv', index='False', sep=',')
#de = pd.read_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/stats/2016_player_stats.csv', index='False', sep=',')