# Player Evaluation

## points per game

In [680]:
import sys
import os
import pandas as pd
import numpy as np
import datetime, time
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
from pylab import hist, show
import scipy
import zipfile

# Interpreter setting: allow up to 10000 nested Python calls.
sys.setrecursionlimit(10000)

# pandas display limits used when frames are rendered in the notebook.
for option_name, option_value in (('display.max_rows', 50), ('display.max_columns', 200)):
    pd.set_option(option_name, option_value)

In [681]:
# IPython automagic (%pwd): show the kernel's current working directory.
pwd

'/Users/stefanostselios/Desktop/nhl_roster_design-master'

### import play by play dataframe

In [682]:
# Load the merged play-by-play export, discard the leftover 'Unnamed: 0'
# column and expose the event's team under the name 'EventTeamCode'.
pbp_csv = '/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/pbp_merged.csv'
# alternate location: '/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/pbp_merged.csv'
da = (
    pd.read_csv(pbp_csv)
      .drop('Unnamed: 0', axis=1)
      .rename(columns={'TeamCode': 'EventTeamCode'})
)

- keep regular season games and events that happened in regulation and overtime (exclude shootouts). Drop rows that have 'NaN' in event number column and irrelevant data.

In [683]:
# Build one row mask instead of filtering step by step:
#   * game numbers up to 21230 (the notebook's regular-season cut-off),
#   * periods 1 through 4 (regulation and overtime, no shootout),
#   * none of the excluded event types,
#   * an event number must be present.
excluded_event_types = ['STOP', 'EISTR', 'EIEND', 'FIGHT']
keep_rows = (
    (da['GameNumber'] <= 21230)
    & da['Period'].between(1, 4)
    & ~da['EventType'].isin(excluded_event_types)
    & da['EventNumber'].notna()
)
da = da[keep_rows]

- fill in NaN values for advantage type with the use of backfill. If Player name is 'TEAM' assign a value of 0.

In [684]:
# Back-fill missing advantage types inside each (season, game, period) so a
# row with no value takes the next recorded one from the same period.
# GroupBy.bfill() keeps the frame's own index, so the result aligns when it is
# assigned back; the groupby(...).apply(lambda x: x.bfill()) spelling prepends
# the group keys to the index on pandas >= 2.0 and then fails on assignment.
da['AdvantageType'] = da.groupby(['Season', 'GameNumber', 'Period'])['AdvantageType'].bfill()

# Events credited to the placeholder player 'TEAM' get player number 0; every
# other row keeps its number (vectorised instead of a row-wise apply).
da['PlayerNumber'] = da['PlayerNumber'].mask(da['PlayerName'] == 'TEAM', 0)

- use event detail for goals information to identify the players that assisted on a given goal. Split event detail data into assist player 1 and assist player 2, to display primary and secondary assist. 

In [685]:
# The piece of EventDetail after the first ':' is treated as the assist list.
da['Assist'] = da['EventDetail'].str.split(':', expand=True)[1]

# Slot 1 is the primary assist and slot 2 the secondary one; both entries are
# parsed with the same sequence of splits.
for slot in (1, 2):
    entry_col = 'Assist{}'.format(slot)
    after_hash_col = 'A{}a'.format(slot)
    before_paren_col = 'A{}b'.format(slot)

    # ';' separates the two assist entries
    da[entry_col] = da['Assist'].str.split(';', expand=True)[slot - 1]
    # keep what follows the '#' marker ...
    da[after_hash_col] = da[entry_col].str.split('#', expand=True)[1]
    # ... and what precedes the first '('
    da[before_paren_col] = da[after_hash_col].str.split('(', expand=True)[0]
    # number: text before the first space; name: piece after the first run of digits
    da['Assist{}PlayerNumber'.format(slot)] = da[before_paren_col].str.split(' ', expand=True)[0]
    da['Assist{}PlayerName'.format(slot)] = da[before_paren_col].str.split('\\d+', expand=True)[1]

- drop irrelevant split assist columns generated.

In [686]:
# Keep only the columns used downstream; the intermediate assist-parsing
# columns are left out. Column order: event fields, on-ice slots (visitor 1-6
# then home 1-6, player followed by position), game/event context, assists.
event_cols = ['Season', 'GameNumber', 'EventNumber', 'Period', 'AdvantageType',
              'EventTimeFromZero', 'EventTimeFromTwenty', 'EventType', 'EventDetail']
on_ice_cols = [side + field + str(slot)
               for side in ('V', 'H')
               for slot in range(1, 7)
               for field in ('Player', 'Position')]
context_cols = ['GameDate', 'VTeamCode', 'HTeamCode', 'EventTeamCode', 'PlayerNumber', 'PlayerName',
                'ShotType', 'ShotResult', 'Zone', 'Length', 'PenaltyType']
assist_cols = ['Assist1PlayerNumber', 'Assist1PlayerName', 'Assist2PlayerNumber', 'Assist2PlayerName']
da = da[event_cols + on_ice_cols + context_cols + assist_cols]

In [687]:
# (rows, columns) of the trimmed event table.
da.shape

(314600, 48)

- If event type is **not a goal,** in the assist columns display **NaN.**

In [688]:
# Assist fields are only kept on goal events; on every other event type they
# are replaced with NaN.
is_goal_event = da['EventType'] == 'GOAL'
for assist_col in ['Assist1PlayerNumber', 'Assist1PlayerName', 'Assist2PlayerNumber', 'Assist2PlayerName']:
    da[assist_col] = da[assist_col].where(is_goal_event)

### create dataframes for all on-ice event types

#### goal dataframe (db)

In [689]:
# Working frame for the goal tallies: a renamed copy of the event table
# ('EventTeamCode' -> 'TeamCode'), ordered by season, game, period and event.
db = (
    da.rename(columns={'EventTeamCode': 'TeamCode'})
      .sort_values(['Season', 'GameNumber', 'Period', 'EventNumber'])
)

- create a goal variable. If event type is a goal assign a value of 1. If not, assign a value of 0. Display total goals per player for a season.

In [690]:
# Flag goal events (1 for 'GOAL', 0 otherwise) and broadcast each player's
# season total onto every row of that player.
db['Goal'] = (db['EventType'] == 'GOAL').astype(np.int64)
db['Goals'] = db.groupby(['Season', 'TeamCode', 'PlayerNumber', 'PlayerName'])['Goal'].transform('sum')
# transform('sum') already writes the same total on every row of a group, so
# the former groupby(...).apply(ffill/bfill) pass changed nothing. It is
# dropped because that apply form returns a group-keyed index on
# pandas >= 2.0 and then fails when assigned back to the frame.

- keep one observation per player by game and drop duplicates. The purpose of this step is to calculate the number of games played per player for a season. 

In [691]:
player_season_keys = ['Season', 'TeamCode', 'PlayerNumber', 'PlayerName']

# One row per player per game ...
db = db.drop_duplicates(['Season', 'GameNumber'] + player_season_keys[1:])
# ... so counting the remaining rows gives games played. Note that a game is
# only counted when the player is credited with at least one event row in it.
db['GP'] = db.groupby(player_season_keys)['GameNumber'].transform('count')
# Collapse to one row per player-season.
db = db.drop_duplicates(player_season_keys)

- reshape visitor and home team, players from wide to long. Create 2 columns for visitor (VPlayer, VPosition) and 2 columns for home (HPlayer, HPosition) team.

In [692]:
# Stack the numbered on-ice slot columns into four long columns
# (VPlayer, HPlayer, VPosition, HPosition).
visitor_player_cols = db.filter(like='VPlayer').columns.tolist()
home_player_cols = db.filter(like='HPlayer').columns.tolist()
visitor_position_cols = db.filter(like='VPosition').columns.tolist()
home_position_cols = db.filter(like='HPosition').columns.tolist()
db = pd.lreshape(
    db,
    {
        'VPlayer': visitor_player_cols,
        'HPlayer': home_player_cols,
        'VPosition': visitor_position_cols,
        'HPosition': home_position_cols,
    },
)

In [693]:
#db['PlayerPosition'] = db.apply(lambda x: x['VPosition'] if ((x['TeamCode'] == x['VTeamCode']) & (x['PlayerNumber'] == x['VPlayer'])) else (x['HPosition'] if ((x['TeamCode'] == x['HTeamCode']) & (x['PlayerNumber'] == x['HPlayer'])) else np.nan), axis=1)
#db['PlayerPosition'] = db.groupby(['Season', 'TeamCode', 'PlayerNumber', 'PlayerName'])['PlayerPosition'].apply(lambda x: x.ffill().bfill())

In [694]:
# Remove the 'TEAM' placeholder player and rows whose team code is 'Def' or 'Neu'.
is_real_player = db['PlayerName'] != 'TEAM'
is_real_team = ~db['TeamCode'].isin(['Def', 'Neu'])
db = db[is_real_player & is_real_team]

In [695]:
# One row per player-season, reduced to the identifying keys plus GP and Goals.
goal_keys = ['Season', 'TeamCode', 'PlayerNumber', 'PlayerName']
db = db.drop_duplicates(goal_keys)[goal_keys + ['GP', 'Goals']]
db.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,GP,Goals
0,2010,MTL,11.0,GOMEZ,80,7
1,2010,TOR,37.0,BRENT,79,8
2,2010,MTL,14.0,PLEKANEC,77,22
3,2010,MTL,76.0,SUBBAN,77,14
4,2010,TOR,35.0,GIGUERE,17,0


In [696]:
# Number of missing values in each column of the goal table.
db.isnull().sum()

Season          0
TeamCode        0
PlayerNumber    0
PlayerName      0
GP              0
Goals           0
dtype: int64

In [697]:
# (rows, columns) of the goal table; one row per player-season.
db.shape

(1058, 6)

#### assist dataframe (dc)

- This dataset contains only the events whose type is a goal.

In [698]:
# Goal events only, limited to the columns needed for the assist tallies.
# The assist columns are renamed so the slot number comes last
# (AssistPlayerNumber1/2, AssistPlayerName1/2) and the event team becomes
# 'TeamCode'. The frame is sorted once; the original cell ran the identical
# sort_values call twice.
assist_source_cols = ['Season', 'GameNumber', 'EventNumber', 'Period', 'AdvantageType', 'EventType',
                      'EventDetail', 'GameDate', 'EventTeamCode', 'Assist1PlayerNumber',
                      'Assist1PlayerName', 'Assist2PlayerNumber', 'Assist2PlayerName']
dc = (
    da.loc[da['EventType'] == 'GOAL', assist_source_cols]
      .rename(columns={'Assist1PlayerNumber': 'AssistPlayerNumber1',
                       'Assist1PlayerName': 'AssistPlayerName1',
                       'Assist2PlayerNumber': 'AssistPlayerNumber2',
                       'Assist2PlayerName': 'AssistPlayerName2',
                       'EventTeamCode': 'TeamCode'})
      .sort_values(['Season', 'GameNumber', 'Period', 'EventNumber'])
)

- reshape data from wide to long based on assist variables to display both primary and secondary assists in one column.

In [699]:
# Stack primary and secondary assists into one number column and one name
# column, then restore event order.
assist_number_cols = dc.filter(like='AssistPlayerNumber').columns.tolist()
assist_name_cols = dc.filter(like='AssistPlayerName').columns.tolist()
dc = pd.lreshape(dc, {'AssistPlayerNumber': assist_number_cols, 'AssistPlayerName': assist_name_cols})
dc = (
    dc[['Season', 'GameNumber', 'GameDate', 'EventNumber', 'Period', 'AdvantageType', 'EventType',
        'EventDetail', 'TeamCode', 'AssistPlayerNumber', 'AssistPlayerName']]
    .sort_values(['Season', 'GameNumber', 'Period', 'EventNumber'])
)

- create assist variables. Assign a value of 1 if an event was a goal; 0 otherwise. Calculate the total assists each player registered for a season.

In [700]:
# Each stacked row counts as one assist when its event is a goal; the season
# total per (season, team, number, name) is broadcast onto every row.
dc['Assist'] = (dc['EventType'] == 'GOAL').astype(np.int64)
assist_groups = dc.groupby(['Season', 'TeamCode', 'AssistPlayerNumber', 'AssistPlayerName'])
dc['Assists'] = assist_groups['Assist'].transform('sum')

- keep one observation per player for the season and drop duplicates.

In [701]:
# Reduce to one row per (season, team, player number) with the assist total,
# and cast the number to float and the total to int.
# NOTE(review): totals were summed per number AND name, but the name is dropped
# here and only the first row per number is kept. If two names shared a number
# on one team in a season, only the first-listed total survives. Confirm this
# is acceptable.
dc = (
    dc.rename(columns={'AssistPlayerNumber': 'PlayerNumber', 'AssistPlayerName': 'PlayerName'})
      [['Season', 'TeamCode', 'PlayerNumber', 'Assists']]
      .drop_duplicates(['Season', 'TeamCode', 'PlayerNumber'])
      .astype({'PlayerNumber': np.float64, 'Assists': np.int64})
)
dc.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,Assists
0,2010,TOR,3.0,22
1,2010,TOR,42.0,17
6287,2010,TOR,8.0,9
2,2010,TOR,41.0,27
6288,2010,TOR,84.0,29


In [702]:
# (rows, columns) of the assist table.
dc.shape

(837, 4)

#### shot dataframe (ds)

In [703]:
# Working frame for the shot tallies: a renamed copy of the event table
# ('EventTeamCode' -> 'TeamCode'), ordered by season, game, period and event.
ds = (
    da.rename(columns={'EventTeamCode': 'TeamCode'})
      .sort_values(['Season', 'GameNumber', 'Period', 'EventNumber'])
)

- create a shot variable. If event type is a shot assign a value of 1. If not, assign a value of 0. **Goals are a result of a shot and therefore need to be added to a player's shot total**. Display total shots per player for a season.

In [704]:
# A row counts as a shot when its event type is 'SHOT' or 'GOAL' (a goal is
# also a shot); the season total per player is broadcast onto every row.
ds['Shot'] = ds['EventType'].isin(['SHOT', 'GOAL']).astype(np.int64)
ds['Shots'] = ds.groupby(['Season', 'TeamCode', 'PlayerNumber', 'PlayerName'])['Shot'].transform('sum')
# transform('sum') already writes the same total on every row of a group, so
# the former groupby(...).apply(ffill/bfill) pass changed nothing. It is
# dropped because that apply form returns a group-keyed index on
# pandas >= 2.0 and then fails when assigned back to the frame.

- drop duplicates to keep one observation per player for the season.


In [705]:
# One row per player-season, reduced to the identifying keys plus Shots.
shot_keys = ['Season', 'TeamCode', 'PlayerNumber', 'PlayerName']
ds = ds.drop_duplicates(shot_keys)[shot_keys + ['Shots']]
ds.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,Shots
0,2010,MTL,11.0,GOMEZ,156
1,2010,TOR,37.0,BRENT,60
2,2010,MTL,14.0,PLEKANEC,227
3,2010,MTL,76.0,SUBBAN,197
4,2010,TOR,35.0,GIGUERE,0


#### block dataframe (dl)

In [706]:
# Working frame for the block tallies: a renamed copy of the event table
# ('EventTeamCode' -> 'TeamCode'), ordered by season, game, period and event.
dl = (
    da.rename(columns={'EventTeamCode': 'TeamCode'})
      .sort_values(['Season', 'GameNumber', 'Period', 'EventNumber'])
)

- create a block variable. If event type is a block assign a value of 1. If not, assign a value of 0. Display total blocks per player for a season.

In [707]:
# Flag block events (1 for 'BLOCK', 0 otherwise) and broadcast each player's
# season total onto every row of that player.
dl['Block'] = (dl['EventType'] == 'BLOCK').astype(np.int64)
dl['Blocks'] = dl.groupby(['Season', 'TeamCode', 'PlayerNumber', 'PlayerName'])['Block'].transform('sum')
# transform('sum') already writes the same total on every row of a group, so
# the former groupby(...).apply(ffill/bfill) pass changed nothing. It is
# dropped because that apply form returns a group-keyed index on
# pandas >= 2.0 and then fails when assigned back to the frame.

- drop duplicates to keep one observation per player for the season.

In [708]:
# One row per player-season, reduced to the identifying keys plus Blocks.
block_keys = ['Season', 'TeamCode', 'PlayerNumber', 'PlayerName']
dl = dl.drop_duplicates(block_keys)[block_keys + ['Blocks']]
dl.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,Blocks
0,2010,MTL,11.0,GOMEZ,24
1,2010,TOR,37.0,BRENT,58
2,2010,MTL,14.0,PLEKANEC,45
3,2010,MTL,76.0,SUBBAN,106
4,2010,TOR,35.0,GIGUERE,0


#### miss dataframe (dm)

In [709]:
# Working frame for the missed-shot tallies: a renamed copy of the event table
# ('EventTeamCode' -> 'TeamCode'), ordered by season, game, period and event.
dm = (
    da.rename(columns={'EventTeamCode': 'TeamCode'})
      .sort_values(['Season', 'GameNumber', 'Period', 'EventNumber'])
)

- create a miss variable. If event type is a miss assign a value of 1. If not, assign a value of 0. Display total misses per player for a season.

In [710]:
# Flag missed-shot events (1 for 'MISS', 0 otherwise) and broadcast each
# player's season total onto every row of that player.
dm['Miss'] = (dm['EventType'] == 'MISS').astype(np.int64)
dm['Misses'] = dm.groupby(['Season', 'TeamCode', 'PlayerNumber', 'PlayerName'])['Miss'].transform('sum')
# transform('sum') already writes the same total on every row of a group, so
# the former groupby(...).apply(ffill/bfill) pass changed nothing. It is
# dropped because that apply form returns a group-keyed index on
# pandas >= 2.0 and then fails when assigned back to the frame.

- drop duplicates to keep one observation per player for the season.

In [711]:
# One row per player-season, reduced to the identifying keys plus Misses.
miss_keys = ['Season', 'TeamCode', 'PlayerNumber', 'PlayerName']
dm = dm.drop_duplicates(miss_keys)[miss_keys + ['Misses']]
dm.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,Misses
0,2010,MTL,11.0,GOMEZ,57
1,2010,TOR,37.0,BRENT,28
2,2010,MTL,14.0,PLEKANEC,79
3,2010,MTL,76.0,SUBBAN,78
4,2010,TOR,35.0,GIGUERE,0


####  hit dataframe (dh)

In [712]:
# Working frame for the hit tallies: a renamed copy of the event table
# ('EventTeamCode' -> 'TeamCode'), ordered by season, game, period and event.
dh = (
    da.rename(columns={'EventTeamCode': 'TeamCode'})
      .sort_values(['Season', 'GameNumber', 'Period', 'EventNumber'])
)

- create a hit variable. If event type is a hit assign a value of 1. If not, assign a value of 0. Display total hits per player for a season.

In [713]:
# Flag hit events (1 for 'HIT', 0 otherwise) and broadcast each player's
# season total onto every row of that player.
dh['Hit'] = (dh['EventType'] == 'HIT').astype(np.int64)
dh['Hits'] = dh.groupby(['Season', 'TeamCode', 'PlayerNumber', 'PlayerName'])['Hit'].transform('sum')
# transform('sum') already writes the same total on every row of a group, so
# the former groupby(...).apply(ffill/bfill) pass changed nothing. It is
# dropped because that apply form returns a group-keyed index on
# pandas >= 2.0 and then fails when assigned back to the frame.

- drop duplicates to keep one observation per player for the season.

In [714]:
# One row per player-season, reduced to the identifying keys plus Hits.
hit_keys = ['Season', 'TeamCode', 'PlayerNumber', 'PlayerName']
dh = dh.drop_duplicates(hit_keys)[hit_keys + ['Hits']]
dh.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,Hits
0,2010,MTL,11.0,GOMEZ,32
1,2010,TOR,37.0,BRENT,104
2,2010,MTL,14.0,PLEKANEC,53
3,2010,MTL,76.0,SUBBAN,110
4,2010,TOR,35.0,GIGUERE,0


#### penalty dataframe (dp)

In [715]:
# Working frame for the penalty tallies: a renamed copy of the event table
# ('EventTeamCode' -> 'TeamCode'), ordered by season, game, period and event.
dp = (
    da.rename(columns={'EventTeamCode': 'TeamCode'})
      .sort_values(['Season', 'GameNumber', 'Period', 'EventNumber'])
)

- create a penalty variable. If event type is a penalty assign a value of 1. If not, assign a value of 0. Display total penalties per player for a season.

In [716]:
# Flag penalty events (1 for 'PENL', 0 otherwise) and broadcast each player's
# season total onto every row of that player.
dp['Penalty'] = (dp['EventType'] == 'PENL').astype(np.int64)
dp['Penalties'] = dp.groupby(['Season', 'TeamCode', 'PlayerNumber', 'PlayerName'])['Penalty'].transform('sum')
# transform('sum') already writes the same total on every row of a group, so
# the former groupby(...).apply(ffill/bfill) pass changed nothing. It is
# dropped because that apply form returns a group-keyed index on
# pandas >= 2.0 and then fails when assigned back to the frame.

- drop duplicates to keep one observation per player for the season.

In [717]:
# One row per player-season, reduced to the identifying keys plus Penalties.
penalty_keys = ['Season', 'TeamCode', 'PlayerNumber', 'PlayerName']
dp = dp.drop_duplicates(penalty_keys)[penalty_keys + ['Penalties']]
dp.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,Penalties
0,2010,MTL,11.0,GOMEZ,19
1,2010,TOR,37.0,BRENT,14
2,2010,MTL,14.0,PLEKANEC,28
3,2010,MTL,76.0,SUBBAN,48
4,2010,TOR,35.0,GIGUERE,2


####  faceoff dataframe (df)

In [718]:
# Working frame for the faceoff tallies: a renamed copy of the event table
# ('EventTeamCode' -> 'TeamCode'), ordered by season, game, period and event.
df = (
    da.rename(columns={'EventTeamCode': 'TeamCode'})
      .sort_values(['Season', 'GameNumber', 'Period', 'EventNumber'])
)

- create a faceoff variable. If event type is a faceoff assign a value of 1. If not, assign a value of 0. Display total faceoffs per player for a season.

In [719]:
# Flag faceoff events (1 for 'FAC', 0 otherwise) and broadcast each player's
# season total onto every row of that player.
df['Faceoff'] = (df['EventType'] == 'FAC').astype(np.int64)
df['Faceoffs'] = df.groupby(['Season', 'TeamCode', 'PlayerNumber', 'PlayerName'])['Faceoff'].transform('sum')
# transform('sum') already writes the same total on every row of a group, so
# the former groupby(...).apply(ffill/bfill) pass changed nothing. It is
# dropped because that apply form returns a group-keyed index on
# pandas >= 2.0 and then fails when assigned back to the frame.

- drop duplicates to keep one observation per player for the season.

In [720]:
# One row per player-season, reduced to the identifying keys plus Faceoffs.
faceoff_keys = ['Season', 'TeamCode', 'PlayerNumber', 'PlayerName']
df = df.drop_duplicates(faceoff_keys)[faceoff_keys + ['Faceoffs']]
df.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,Faceoffs
0,2010,MTL,11.0,GOMEZ,575
1,2010,TOR,37.0,BRENT,410
2,2010,MTL,14.0,PLEKANEC,785
3,2010,MTL,76.0,SUBBAN,0
4,2010,TOR,35.0,GIGUERE,0


####  giveaway dataframe (di)

In [721]:
# Working frame for the giveaway tallies: a renamed copy of the event table
# ('EventTeamCode' -> 'TeamCode'), ordered by season, game, period and event.
di = (
    da.rename(columns={'EventTeamCode': 'TeamCode'})
      .sort_values(['Season', 'GameNumber', 'Period', 'EventNumber'])
)

- create a giveaway variable. If event type is a giveaway assign a value of 1. If not, assign a value of 0. Display total giveaways per player for a season.

In [722]:
# Flag giveaway events (1 for 'GIVE', 0 otherwise) and broadcast each player's
# season total onto every row of that player.
di['Giveaway'] = (di['EventType'] == 'GIVE').astype(np.int64)
di['Giveaways'] = di.groupby(['Season', 'TeamCode', 'PlayerNumber', 'PlayerName'])['Giveaway'].transform('sum')
# The original third statement wrote a groupby(...).apply(ffill/bfill) result
# into a misspelled column 'Giveaways ' (trailing space). That only added a
# stray duplicate column, and the apply form fails on assignment under
# pandas >= 2.0. transform('sum') already fills every row of a group, so the
# statement is removed.

- drop duplicates to keep one observation per player for the season.

In [723]:
# One row per player-season, reduced to the identifying keys plus Giveaways.
giveaway_keys = ['Season', 'TeamCode', 'PlayerNumber', 'PlayerName']
di = di.drop_duplicates(giveaway_keys)[giveaway_keys + ['Giveaways']]
di.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,Giveaways
0,2010,MTL,11.0,GOMEZ,28
1,2010,TOR,37.0,BRENT,20
2,2010,MTL,14.0,PLEKANEC,37
3,2010,MTL,76.0,SUBBAN,56
4,2010,TOR,35.0,GIGUERE,33


####  takeaway dataframe (dt)

In [724]:
# Working frame for the takeaway tallies: a renamed copy of the event table
# ('EventTeamCode' -> 'TeamCode'), ordered by season, game, period and event.
dt = (
    da.rename(columns={'EventTeamCode': 'TeamCode'})
      .sort_values(['Season', 'GameNumber', 'Period', 'EventNumber'])
)

- create a takeaway variable. If event type is a takeaway  assign a value of 1. If not, assign a value of 0. Display total takeaways per player for a season.

In [725]:
# Flag takeaway events (1 for 'TAKE', 0 otherwise) and broadcast each player's
# season total onto every row of that player.
dt['Takeaway'] = (dt['EventType'] == 'TAKE').astype(np.int64)
dt['Takeaways'] = dt.groupby(['Season', 'TeamCode', 'PlayerNumber', 'PlayerName'])['Takeaway'].transform('sum')
# transform('sum') already writes the same total on every row of a group, so
# the former groupby(...).apply(ffill/bfill) pass changed nothing. It is
# dropped because that apply form returns a group-keyed index on
# pandas >= 2.0 and then fails when assigned back to the frame.

- drop duplicates to keep one observation per player for the season.

In [726]:
# One row per player-season, reduced to the identifying keys plus Takeaways.
takeaway_keys = ['Season', 'TeamCode', 'PlayerNumber', 'PlayerName']
dt = dt.drop_duplicates(takeaway_keys)[takeaway_keys + ['Takeaways']]
dt.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,Takeaways
0,2010,MTL,11.0,GOMEZ,56
1,2010,TOR,37.0,BRENT,23
2,2010,MTL,14.0,PLEKANEC,43
3,2010,MTL,76.0,SUBBAN,37
4,2010,TOR,35.0,GIGUERE,0


## merge dataframes 

- merge goals and assists together to calculate points per player.

In [727]:
# Attach assist totals to the goal table. The join is a left join on season,
# team and player number, so every goal-table row is kept and players with no
# assist row get NaN in 'Assists'.
dd = db.merge(dc, how='left', on=['Season', 'TeamCode', 'PlayerNumber'])
dd.head(10)

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,GP,Goals,Assists
0,2010,MTL,11.0,GOMEZ,80,7,31.0
1,2010,TOR,37.0,BRENT,79,8,12.0
2,2010,MTL,14.0,PLEKANEC,77,22,35.0
3,2010,MTL,76.0,SUBBAN,77,14,24.0
4,2010,TOR,35.0,GIGUERE,17,0,
5,2010,TOR,2.0,SCHENN,82,5,17.0
6,2010,MTL,81.0,ELLER,77,7,9.0
7,2010,MTL,46.0,KOSTITSYN,81,20,25.0
8,2010,TOR,32.0,VERSTEEG,53,14,21.0
9,2010,MTL,52.0,DARCHE,56,11,14.0


check if there are any 'NaN' values in all columns. Fill 'NaN' values with zero.

In [728]:
# Missing values per column after the left merge; players without a matching
# assist row show up as NaN in 'Assists'.
dd.isnull().sum()

Season            0
TeamCode          0
PlayerNumber      0
PlayerName        0
GP                0
Goals             0
Assists         183
dtype: int64

In [729]:
# Players with no assist row are given zero assists.
dd = dd.fillna({'Assists': 0})

- calculate points per player for the season.

In [730]:
# Store assists as integers, then points = goals + assists.
dd = dd.astype({'Assists': np.int64})
dd['Points'] = dd['Goals'].add(dd['Assists'])
dd.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,GP,Goals,Assists,Points
0,2010,MTL,11.0,GOMEZ,80,7,31,38
1,2010,TOR,37.0,BRENT,79,8,12,20
2,2010,MTL,14.0,PLEKANEC,77,22,35,57
3,2010,MTL,76.0,SUBBAN,77,14,24,38
4,2010,TOR,35.0,GIGUERE,17,0,0,0


- **merge all on-ice dataframes on season, team code, player number and player name** 

In [731]:
# Fold every per-player stat table into the points table, one merge at a time
# (default inner join) on the four identifying keys. The order of the tuple
# fixes the order of the appended stat columns.
stat_merge_keys = ['Season', 'TeamCode', 'PlayerNumber', 'PlayerName']
dk = dd
for stat_table in (ds, dl, dh, df, dm, dp, di, dt):
    dk = dk.merge(stat_table, on=stat_merge_keys)
dk.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,GP,Goals,Assists,Points,Shots,Blocks,Hits,Faceoffs,Misses,Penalties,Giveaways,Takeaways
0,2010,MTL,11.0,GOMEZ,80,7,31,38,156,24,32,575,57,19,28,56
1,2010,TOR,37.0,BRENT,79,8,12,20,60,58,104,410,28,14,20,23
2,2010,MTL,14.0,PLEKANEC,77,22,35,57,227,45,53,785,79,28,37,43
3,2010,MTL,76.0,SUBBAN,77,14,24,38,197,106,110,0,78,48,56,37
4,2010,TOR,35.0,GIGUERE,17,0,0,0,0,0,0,0,0,2,33,0


In [732]:
# (rows, columns) of the combined per-player stat table.
dk.shape

(1058, 16)

### import player shift data and keep only regular season games.

In [733]:
# Load the player shift table.
shift_csv = '/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/data/t_player_shift_o.csv'
# alternate location: '/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/data/t_t_player_shift_o.csv'
dv = pd.read_csv(shift_csv)

- keep only regular season games. Display the duration of each shift per player. Calculate the time on ice by player for each game and transform seconds into minutes. Drop duplicates by season, gamenumber, teamcode and playernumber.

In [734]:
shift_game_keys = ['season', 'gamenumber', 'teamcode', 'playernumber']

# Same game-number cut-off (21230) as applied to the play-by-play table.
dv = dv[dv['gamenumber'] <= 21230]
# Shift length, in the units of starttime/endtime (divided by 60 below to get minutes).
dv = dv.assign(shift=dv['endtime'] - dv['starttime'])
# Total per player per game, broadcast onto each of that player's shift rows.
dv = dv.assign(seconds=dv.groupby(shift_game_keys)['shift'].transform('sum'))
# Minutes, rounded to two decimals.
dv = dv.assign(minutes=(dv['seconds'] / 60).round(2))
# Keep one row per player per game.
dv = dv.drop_duplicates(shift_game_keys)

- calculate the mean time on ice per player for the season.

In [735]:
# Season totals per player: TOI is the sum of the per-game minutes and ATOI
# their mean, each rounded to two decimals and broadcast onto every game row.
season_minutes = dv.groupby(['season', 'teamcode', 'playernumber'])['minutes']
dv['TOI'] = season_minutes.transform('sum').round(2)
dv['ATOI'] = season_minutes.transform('mean').round(2)
dv.head()

Unnamed: 0,season,gamenumber,teamcode,playernumber,period,starttime,endtime,shift,seconds,minutes,TOI,ATOI
0,2010,20001,MTL,6,1,43,95,52,1222,20.37,1135.35,19.24
9,2010,20001,MTL,11,1,0,35,35,1121,18.68,1485.07,18.56
16,2010,20001,MTL,14,1,35,95,60,1129,18.82,1559.15,20.25
24,2010,20001,MTL,15,1,95,135,40,640,10.67,916.71,12.73
32,2010,20001,MTL,17,1,135,186,51,882,14.7,98.37,9.84


In [736]:
# Rename the shift-table keys to match the stat table, keep one row per
# player-season and only the columns needed for the merge.
toi_columns = ['Season', 'TeamCode', 'PlayerNumber', 'TOI', 'ATOI']
dv_renamed = dv.rename(columns={'season': 'Season', 'gamenumber': 'GameNumber',
                                'teamcode': 'TeamCode', 'playernumber': 'PlayerNumber'})
dv = dv_renamed.drop_duplicates(['Season', 'TeamCode', 'PlayerNumber'])[toi_columns]

In [737]:
# Preview the per-player season time-on-ice table.
dv.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,TOI,ATOI
0,2010,MTL,6,1135.35,19.24
9,2010,MTL,11,1485.07,18.56
16,2010,MTL,14,1559.15,20.25
24,2010,MTL,15,916.71,12.73
32,2010,MTL,17,98.37,9.84


In [738]:
# (rows, columns) of the time-on-ice table.
dv.shape

(1028, 5)

- merge time on ice onto dk dataframe

In [739]:
# Attach TOI/ATOI to the stat table. The join is a left join on season, team
# and player number, so players without shift data keep their row and get NaN
# for TOI and ATOI.
dw = dk.merge(dv, how='left', on=['Season', 'TeamCode', 'PlayerNumber'])
dw.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,GP,Goals,Assists,Points,Shots,Blocks,Hits,Faceoffs,Misses,Penalties,Giveaways,Takeaways,TOI,ATOI
0,2010,MTL,11.0,GOMEZ,80,7,31,38,156,24,32,575,57,19,28,56,1485.07,18.56
1,2010,TOR,37.0,BRENT,79,8,12,20,60,58,104,410,28,14,20,23,920.46,11.65
2,2010,MTL,14.0,PLEKANEC,77,22,35,57,227,45,53,785,79,28,37,43,1559.15,20.25
3,2010,MTL,76.0,SUBBAN,77,14,24,38,197,106,110,0,78,48,56,37,1714.69,22.27
4,2010,TOR,35.0,GIGUERE,17,0,0,0,0,0,0,0,0,2,33,0,1633.02,49.49


In [740]:
# (rows, columns) of the stat table with time on ice attached.
dw.shape

(1058, 18)

In [741]:
# Per-minute rates: each season total divided by the player's total minutes on
# ice, stored in a column prefixed with 'M'. Rows with a missing TOI give NaN.
rate_stats = ['Goals', 'Assists', 'Points', 'Shots', 'Blocks', 'Hits',
              'Faceoffs', 'Misses', 'Penalties', 'Giveaways', 'Takeaways']
for stat_name in rate_stats:
    dw['M' + stat_name] = dw[stat_name] / dw['TOI']


In [742]:
# Rank players by points per minute, breaking ties on goals and then assists
# per minute, all in descending order.
dw = dw.sort_values(by=['MPoints', 'MGoals', 'MAssists'], ascending=False)
dw.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,GP,Goals,Assists,Points,Shots,Blocks,Hits,Faceoffs,Misses,Penalties,Giveaways,Takeaways,TOI,ATOI,MGoals,MAssists,MPoints,MShots,MBlocks,MHits,MFaceoffs,MMisses,MPenalties,MGiveaways,MTakeaways
872,2010,WSH,24.0,WILLSIE,1,0,1,1,0,2,0,0,0,0,0,0,6.25,6.25,0.0,0.16,0.16,0.0,0.32,0.0,0.0,0.0,0.0,0.0,0.0
674,2010,COL,7.0,VANDERGULIK,6,1,2,3,12,3,10,0,4,1,2,2,40.51,6.75,0.024685,0.049371,0.074056,0.296223,0.074056,0.246853,0.0,0.098741,0.024685,0.049371,0.049371
37,2010,PIT,87.0,CROSBY,41,32,34,66,161,23,31,546,58,13,29,15,898.61,21.92,0.035611,0.037836,0.073447,0.179166,0.025595,0.034498,0.607605,0.064544,0.014467,0.032272,0.016692
544,2010,VAN,22.0,SEDIN,81,41,62,103,265,12,13,4,92,16,48,36,1521.32,18.55,0.02695,0.040754,0.067704,0.174191,0.007888,0.008545,0.002629,0.060474,0.010517,0.031552,0.023664
278,2010,ANA,8.0,SELANNE,73,31,49,80,213,20,15,97,64,18,45,37,1309.33,17.94,0.023676,0.037424,0.0611,0.162679,0.015275,0.011456,0.074084,0.04888,0.013747,0.034369,0.028259


In [743]:
# Write the per-minute stat table to CSV.
# NOTE(review): index='False' is a non-empty string, not the boolean False, so
# it is truthy and the row index is still written as an unnamed first column.
# If the index is not wanted, pass index=False, but first check whether any
# downstream reader expects (and drops) that 'Unnamed: 0' column.
dw.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/stats_per_time_on_ice.csv', index='False', sep=',')
#dv = pd.read_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/stats_per_time_on_ice.csv', index='False', sep=',')