# Player Evaluation

In [1]:
import sys
import os
import pandas as pd
import numpy as np
import datetime, time
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
from pylab import hist, show
import scipy
import zipfile

sys.setrecursionlimit(10000)
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 200)

In [2]:
pwd

'/Users/stefanostselios/Desktop/nhl_roster_design-master'

### import play by play dataframe

In [3]:
da = pd.read_csv('pbp_merged.csv')
da = da.drop('Unnamed: 0', axis=1)
da = da.rename(columns={'TeamCode': 'EventTeamCode'})

- keep regular season games and events that happened in regulation and overtime (exclude shootouts). Drop irrelevant data.

In [4]:
da = da[da['GameNumber'] <= 21230]
da = da[da['Period'] <= 4]
da = da[da['Period'] >= 1]
da = da[da['EventType']!='STOP']
da = da[da['EventType']!='EISTR']
da = da[da['EventType']!='EIEND']

- use event detail for goals information to identify the players that assisted on a given goal. Split event detail data into assist player 1 and assist player 2, to display primary and secondary assist. 

In [5]:
#number of assist player 1 - need to split by '#'
da['Assist1'] = da.EventDetail.str.split(' ', expand = True)[8]
da['AssistPlayer1Number'] = da.Assist1.str.split('#', expand = True)[1]

#name of assist player 1 - need to split by '('
da['Assist1Last'] = da.EventDetail.str.split(' ', expand = True)[9]
da['AssistPlayer1Name'] = da.Assist1Last.str.split('(', expand = True)[0]

#number of assist player 2 - need to split by '#'
da['Assist2'] = da.EventDetail.str.split(' ', expand = True)[10]
da['AssistPlayer2Number'] = da.Assist2.str.split('#', expand = True)[1]

#name of assist player 2 - need to split by '('
da['Assist2Last'] = da.EventDetail.str.split(' ', expand = True)[11]
da['AssistPlayer2Name'] = da.Assist2Last.str.split('(', expand = True)[0]

- drop 'Assist1', 'Assist1Last', 'Assist2' and 'Assist2Last'

In [6]:
da = da [['Season', 'GameNumber', 'EventNumber', 'Period', 'AdvantageType', 'EventTimeFromZero', 'EventTimeFromTwenty', 'EventType', 'EventDetail', 'VPlayer1', 'VPosition1', 'VPlayer2', 'VPosition2', 'VPlayer3', 'VPosition3', 'VPlayer4', 'VPosition4', 'VPlayer5', 'VPosition5', 'VPlayer6', 'VPosition6', 'HPlayer1', 'HPosition1', 'HPlayer2', 'HPosition2', 'HPlayer3', 'HPosition3', 'HPlayer4', 'HPosition4', 'HPlayer5', 'HPosition5', 'HPlayer6', 'HPosition6', 'GameDate', 'VTeamCode', 'HTeamCode', 'EventTeamCode', 'PlayerNumber', 'PlayerName','ShotType', 'ShotResult', 'Zone', 'Length', 'PenaltyType', 'AssistPlayer1Number', 'AssistPlayer1Name', 'AssistPlayer2Number', 'AssistPlayer2Name']]

In [7]:
da.shape

(314600, 48)

- If event type is **not a goal,** in the assist columns display **NaN.**

In [8]:
da['AssistPlayer1Number'] = da.apply(lambda x: x['AssistPlayer1Number'] if (x['EventType'] == 'GOAL') else np.nan, axis=1)
da['AssistPlayer1Name'] = da.apply(lambda x: x['AssistPlayer1Name'] if (x['EventType'] == 'GOAL') else np.nan, axis=1)
da['AssistPlayer2Number'] = da.apply(lambda x: x['AssistPlayer2Number'] if (x['EventType'] == 'GOAL') else np.nan, axis=1)
da['AssistPlayer2Name'] = da.apply(lambda x: x['AssistPlayer2Name'] if (x['EventType'] == 'GOAL') else np.nan, axis=1)

### create a dataframe for goals

In [9]:
db = da
db = db.rename(columns={'EventTeamCode': 'TeamCode' })
db = db.sort_values(['Season', 'GameNumber', 'Period', 'EventNumber'], ascending=[True, True, True, True])

- create a goal variable. If event type is a goal assign a value of 1. If not, assign a value of 0. Display total goals per player for a season.

- display total goals per player for a season.

In [10]:
db['Goal'] = db.apply(lambda x: 1 if (x['EventType'] == 'GOAL') else 0, axis=1)
db['Goals'] = db.groupby(['Season','TeamCode', 'PlayerNumber', 'PlayerName'])['Goal'].transform('sum')
db['Goals'] = db.groupby(['Season','TeamCode', 'PlayerNumber', 'PlayerName'])['Goals'].apply(lambda x: x.ffill().bfill())

- keep one observation per player by game and drop duplicates. This purpose of this step is to calculate the quantity of games per player for a season. 

In [11]:
db = db.drop_duplicates(['Season', 'GameNumber', 'TeamCode', 'PlayerNumber', 'PlayerName'])
db['GP'] = db.groupby(['Season', 'TeamCode', 'PlayerNumber','PlayerName'])['GameNumber'].transform('count')
db = db.drop_duplicates(['Season', 'TeamCode', 'PlayerNumber', 'PlayerName'])

- reshape visitor and home team, players from wide to long. Create 2 columns for visitor (VPlayer, VPosition) and 2 columns for home (HPlayer, HPosition) team.

In [12]:
a = [col for col in db.columns if 'VPlayer' in col]
b = [col for col in db.columns if 'HPlayer' in col]
c = [col for col in db.columns if 'VPosition' in col]
d = [col for col in db.columns if 'HPosition' in col]
db = pd.lreshape(db, {'VPlayer' : a, 'HPlayer' : b, 'VPosition' : c, 'HPosition': d})

In [13]:
db['PlayerPosition'] = db.apply(lambda x: x['VPosition'] if ((x['TeamCode'] == x['VTeamCode']) & (x['PlayerNumber'] == x['VPlayer'])) else (x['HPosition'] if ((x['TeamCode'] == x['HTeamCode']) & (x['PlayerNumber'] == x['HPlayer'])) else np.nan), axis=1)
db['PlayerPosition'] = db.groupby(['Season', 'TeamCode', 'PlayerNumber', 'PlayerName'])['PlayerPosition'].apply(lambda x: x.ffill().bfill())

In [14]:
db = db[['Season', 'TeamCode', 'PlayerNumber', 'PlayerName', 'PlayerPosition', 'GP', 'Goals']]
db.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,PlayerPosition,GP,Goals
0,2010,MTL,11.0,GOMEZ,C,80.0,7.0
1,2010,TOR,37.0,BRENT,C,79.0,8.0
2,2010,MTL,14.0,PLEKANEC,C,77.0,22.0
3,2010,MTL,76.0,SUBBAN,D,77.0,14.0
4,2010,TOR,35.0,GIGUERE,G,17.0,0.0


### create a dataframe for assists

- This dataset contains only event type that were goals.

In [15]:
dc = da[da['EventType'] == 'GOAL']
dc = dc[['Season', 'GameNumber', 'EventNumber', 'Period', 'AdvantageType', 'EventType', 'EventDetail', 'GameDate', 'EventTeamCode', 'AssistPlayer1Number', 'AssistPlayer1Name', 'AssistPlayer2Number', 'AssistPlayer2Name']]
dc = dc.sort_values(['Season', 'GameNumber', 'Period', 'EventNumber'], ascending=[True, True, True, True])
dc = dc.rename(columns={'AssistPlayer1Number': 'AssistPlayerNumber1', 'AssistPlayer1Name': 'AssistPlayerName1', 'AssistPlayer2Number': 'AssistPlayerNumber2', 'AssistPlayer2Name': 'AssistPlayerName2', 'EventTeamCode': 'TeamCode' })
dc = dc.sort_values(['Season', 'GameNumber', 'Period', 'EventNumber'], ascending=[True, True, True, True])

- reshape data from wide to long based on assist variables to display both primary and secondary assists in one column.

In [16]:
a = [col for col in dc.columns if 'AssistPlayerNumber' in col]
b = [col for col in dc.columns if 'AssistPlayerName' in col]
dc = pd.lreshape(dc, {'AssistPlayerNumber' : a, 'AssistPlayerName' : b})
dc = dc[['Season', 'GameNumber', 'GameDate', 'EventNumber', 'Period', 'AdvantageType', 'EventType', 'EventDetail', 'TeamCode', 'AssistPlayerNumber', 'AssistPlayerName']]
dc = dc.sort_values(['Season', 'GameNumber', 'Period', 'EventNumber'], ascending=[True, True, True, True])

- create assist variables. Assign a value of 1 if an event was a goal; 0 otherwise. Calculate the total assists each player resgistered for a season.

In [17]:
dc['Assist'] = dc.apply(lambda x: 1 if (x['EventType'] == 'GOAL') else 0, axis=1)
dc['Assists'] = dc.groupby(['Season', 'TeamCode', 'AssistPlayerNumber', 'AssistPlayerName'])['Assist'].transform('sum')

- keep one observation per player for the season and drop duplicates.

In [18]:
dc = dc.rename(columns={'AssistPlayerNumber': 'PlayerNumber', 'AssistPlayerName': 'PlayerName' })
dc = dc[['Season', 'TeamCode', 'PlayerNumber', 'PlayerName', 'Assists']]
dc = dc.drop_duplicates(['Season', 'TeamCode', 'PlayerNumber', 'PlayerName'])
dc.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,Assists
0,2010,TOR,3,PHANEUF,22
1,2010,TOR,42,BOZAK,17
6232,2010,TOR,8,KOMISAREK,9
2,2010,TOR,41,KULEMIN,27
6233,2010,TOR,84,GRABOVSKI,29


### merge assist dataframe (dc) onto goal dataframe (db).

In [19]:
db = pd.merge(db, dc, on=['Season', 'TeamCode', 'PlayerNumber', 'PlayerName'], how='outer')
db = db.sort_values(['Season', 'TeamCode', 'PlayerNumber', 'PlayerName'], ascending=[True, True, True, True])
db.head(10)

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,PlayerPosition,GP,Goals,Assists
4377,2010,ANA,1.0,HILLER,,6.0,0.0,
4378,2010,ANA,1.0,HILLER,,6.0,0.0,
4379,2010,ANA,1.0,HILLER,,6.0,0.0,
4380,2010,ANA,1.0,HILLER,,6.0,0.0,
4381,2010,ANA,1.0,HILLER,,6.0,0.0,
7082,2010,ANA,1.0,HILLER,,,,1.0
3754,2010,ANA,3.0,LILJA,D,52.0,1.0,
3755,2010,ANA,3.0,LILJA,D,52.0,1.0,
3756,2010,ANA,3.0,LILJA,D,52.0,1.0,
3757,2010,ANA,3.0,LILJA,D,52.0,1.0,


- group by team, player to forward fill and back fill NaN values

In [20]:
db['PlayerPosition'] = db.groupby(['Season', 'TeamCode', 'PlayerNumber', 'PlayerName'])['PlayerPosition'].apply(lambda x: x.ffill().bfill())
db['GP'] = db.groupby(['Season','TeamCode', 'PlayerNumber', 'PlayerName'])['GP'].apply(lambda x: x.ffill().bfill())
db['Goals'] = db.groupby(['Season','TeamCode', 'PlayerNumber', 'PlayerName'])['Goals'].apply(lambda x: x.ffill().bfill())
db['Assists'] = db.groupby(['Season','TeamCode', 'PlayerNumber', 'PlayerName'])['Assists'].apply(lambda x: x.ffill().bfill())

- keep one observation per player for the season and drop duplicates.

In [21]:
db = db.drop_duplicates(['Season', 'TeamCode', 'PlayerNumber', 'PlayerName', 'PlayerPosition'])

- assign 0 to NaN values for goals and assists.

In [22]:
db['Assists'].fillna(0, inplace=True)
db['Goals'].fillna(0, inplace=True)

- display total points by player for the season. Calculate their production per game in terms of points, goals and assists. Sort dataframe by points per game. 

In [23]:
db['Points'] = db['Goals'] + db['Assists']

In [24]:
db['GoalsPerGame'] = db['Goals'] / db['GP']
db['AssistsPerGame'] = db['Assists'] / db['GP']
db['PointsPerGame'] = db['Points'] / db['GP']

exclude goaltenders from the dataframe.

In [25]:
db = db[db['PlayerPosition'] != 'G']
db = db.sort_values(['PointsPerGame', 'GoalsPerGame', 'AssistsPerGame'], ascending=[False, False, False])
db.head(10)

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,PlayerPosition,GP,Goals,Assists,Points,GoalsPerGame,AssistsPerGame,PointsPerGame
223,2010,PIT,87.0,CROSBY,C,41.0,32.0,34.0,66.0,0.780488,0.829268,1.609756
3209,2010,VAN,22.0,SEDIN,L,81.0,41.0,62.0,103.0,0.506173,0.765432,1.271605
1523,2010,ANA,10.0,PERRY,R,82.0,50.0,47.0,97.0,0.609756,0.573171,1.182927
3203,2010,VAN,33.0,SEDIN,C,82.0,19.0,75.0,94.0,0.231707,0.914634,1.146341
1622,2010,ANA,15.0,GETZLAF,C,67.0,19.0,57.0,76.0,0.283582,0.850746,1.134328
1634,2010,ANA,8.0,SELANNE,R,73.0,31.0,49.0,80.0,0.424658,0.671233,1.09589
2046,2010,WSH,8.0,OVECHKIN,L,79.0,32.0,53.0,85.0,0.405063,0.670886,1.075949
1305,2010,DAL,91.0,RICHARDS,C,72.0,28.0,49.0,77.0,0.388889,0.680556,1.069444
1517,2010,DET,13.0,DATSYUK,C,56.0,23.0,36.0,59.0,0.410714,0.642857,1.053571
977,2010,CGY,12.0,IGINLA,R,82.0,43.0,43.0,86.0,0.52439,0.52439,1.04878


In [26]:
db.to_csv('points_goals_assists_per_game.csv', index='False', sep=',')

In [27]:
#dz = db[db['TeamCode'] == 'ANA']

In [28]:
#dz['TotalGoals'] = dz.groupby(['Season', 'TeamCode', 'PlayerName', 'PlayerPosition'])['Goals'].transform('count')

In [29]:
#dz.head(50)