In [77]:
##### How to design an efficient roster in the National Hockey League #####

##### modules #####

import sys, os
import pandas as pd
import numpy as np
import datetime, time
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
from pylab import hist, show
import scipy


##### import play by play data #####

# Location shared by every CSV file read in this script.
currPath = 'file:///Users/stefanostselios/Dropbox/steve'
playbyplaydata = 't_play_by_play_o.csv'

dm = pd.read_csv('/'.join([currPath, playbyplaydata]))


##### drop events listed as "stop of play" (icing, tv stoppage, goalie stoppage) #####

dm = dm.loc[dm['EventType'] != 'STOP']


##### sort rows by season, gamenumber, period, eventnumber (all ascending) #####

dm = dm.sort_values(['Season', 'GameNumber', 'Period', 'EventNumber'])


##### rearrange columns in data set: event description first, then visitor and home skaters #####

dm = dm[[
    'Season', 'GameNumber', 'EventNumber', 'Period', 'AdvantageType',
    'EventTimeFromZero', 'EventTimeFromTwenty', 'EventType', 'EventDetail',
    'VPlayer1', 'VPlayer1Position',
    'VPlayer2', 'VPlayer2Position',
    'VPlayer3', 'VPlayer3Position',
    'VPlayer4', 'VPlayer4Position',
    'VPlayer5', 'VPlayer5Position',
    'VPlayer6', 'VPlayer6Position',
    'HPlayer1', 'HPlayer1Position',
    'HPlayer2', 'HPlayer2Position',
    'HPlayer3', 'HPlayer3Position',
    'HPlayer4', 'HPlayer4Position',
    'HPlayer5', 'HPlayer5Position',
    'HPlayer6', 'HPlayer6Position',
]]



##### import giveaway detail data #####
# Read the file, order it by season, gamenumber, eventnumber, and give the
# player/team columns the generic names shared with the takeaway file.

giveaway = 't_play_by_play_giveaway_detail_o.csv'

dgiveaway = (
    pd.read_csv('/'.join([currPath, giveaway]))
    .sort_values(['Season', 'GameNumber', 'EventNumber'])
    .rename(columns={
        'GivePlayerNumber': 'PlayerNumber',
        'GivePlayerLName': 'PlayerName',
        'GiveTeamCode': 'TeamCode',
    })
)


##### import takeaway detail data #####
# Same treatment as the giveaway file: read, sort, rename.

takeaway = 't_play_by_play_takeaway_detail_o.csv'

dtakeaway = (
    pd.read_csv('/'.join([currPath, takeaway]))
    .sort_values(['Season', 'GameNumber', 'EventNumber'])
    .rename(columns={
        'TakePlayerNumber': 'PlayerNumber',
        'TakePlayerLName': 'PlayerName',
        'TakeTeamCode': 'TeamCode',
    })
)


##### merge giveaways and takeaways to dturn, sorted by season, gamenumber, eventnumber #####
# Outer merge: rows that appear in only one of the two files are kept.

dturn = pd.merge(
    dgiveaway,
    dtakeaway,
    how='outer',
    on=['Season', 'GameNumber', 'EventNumber', 'TeamCode', 'PlayerName', 'PlayerNumber', 'Zone'],
).sort_values(['Season', 'GameNumber', 'EventNumber'])



##### import goal detail data #####
# Read, order by season, gamenumber, eventnumber, and rename the player column.

goal = 't_play_by_play_goal_detail_o.csv'

dgoal = (
    pd.read_csv('/'.join([currPath, goal]))
    .sort_values(['Season', 'GameNumber', 'EventNumber'])
    .rename(columns={'PlayerLName': 'PlayerName'})
)


##### import shot data #####

shot = 't_play_by_play_shot_detail_o.csv'

dshot = (
    pd.read_csv('/'.join([currPath, shot]))
    .sort_values(['Season', 'GameNumber', 'EventNumber'])
    .rename(columns={'PlayerLName': 'PlayerName'})
)


##### merge goal and shot file to new: da (sorted by season, gamenumber, eventnumber) #####

da = pd.merge(
    dgoal,
    dshot,
    how='outer',
    on=['Season', 'GameNumber', 'EventNumber', 'TeamCode', 'PlayerName', 'PlayerNumber', 'Zone', 'ShotType', 'Length'],
).sort_values(['Season', 'GameNumber', 'EventNumber'])


##### import miss detail data #####

miss = 't_play_by_play_miss_detail_o.csv'

dmiss = (
    pd.read_csv('/'.join([currPath, miss]))
    .sort_values(['Season', 'GameNumber', 'EventNumber'])
    .rename(columns={'PlayerLName': 'PlayerName'})
)


##### merge miss file to da (shot and goal) and create new dataset: db #####

db = pd.merge(
    da,
    dmiss,
    how='outer',
    on=['Season', 'GameNumber', 'EventNumber', 'TeamCode', 'PlayerName', 'PlayerNumber', 'Zone', 'ShotType', 'Length'],
).sort_values(['Season', 'GameNumber', 'EventNumber'])


##### import block detail data #####
# The blocking player's columns take the generic player/team names used as
# merge keys below; the shooter's name is kept as ShotPlayerName.

block = 't_play_by_play_block_detail_o.csv'

dblock = (
    pd.read_csv('/'.join([currPath, block]))
    .sort_values(['Season', 'GameNumber', 'EventNumber'])
    .rename(columns={
        'ShotPlayerLName': 'ShotPlayerName',
        'BlockPlayerLName': 'PlayerName',
        'BlockPlayerNumber': 'PlayerNumber',
        'BlockTeamCode': 'TeamCode',
    })
)


##### merge block file to db (shot, goal and miss) and create new dataset: dc #####
# 'Length' is not one of the merge keys here, unlike the two merges above.

dc = pd.merge(
    db,
    dblock,
    how='outer',
    on=['Season', 'GameNumber', 'EventNumber', 'TeamCode', 'PlayerName', 'PlayerNumber', 'Zone', 'ShotType'],
).sort_values(['Season', 'GameNumber', 'EventNumber'])




##### import faceoff detail data #####
# Read, order by season, gamenumber, eventnumber, and shorten the name columns.

faceoff = 't_play_by_play_faceoff_detail_o.csv'

dfaceoff = (
    pd.read_csv('/'.join([currPath, faceoff]))
    .sort_values(['Season', 'GameNumber', 'EventNumber'])
    .rename(columns={'VPlayerLName': 'VPlayerName', 'HPlayerLName': 'HPlayerName'})
)


##### import hit detail data #####

hit = 't_play_by_play_hit_detail_o.csv'

dhit = (
    pd.read_csv('/'.join([currPath, hit]))
    .sort_values(['Season', 'GameNumber', 'EventNumber'])
    .rename(columns={'HitterPlayerLName': 'HitterPlayerName', 'HitteePlayerLName': 'HitteePlayerName'})
)


##### merge faceoff and hit file by Season, GameNumber, EventNumber, Zone and create dataset: dd #####

dd = pd.merge(
    dfaceoff,
    dhit,
    how='outer',
    on=['Season', 'GameNumber', 'EventNumber', 'Zone'],
).sort_values(['Season', 'GameNumber', 'EventNumber'])


##### import penalty detail data #####

penalty = 't_play_by_play_penalty_detail_o.csv'

dpenalty = (
    pd.read_csv('/'.join([currPath, penalty]))
    .sort_values(['Season', 'GameNumber', 'EventNumber'])
    .rename(columns={'PenaltyPlayerLName': 'PenaltyPlayerName', 'DrawnByPlayerLName': 'DrawnByPlayerName'})
)


##### merge penalty to dd (faceoff and hit) by Season, GameNumber, EventNumber, Zone and create dataset: de #####

de = pd.merge(
    dd,
    dpenalty,
    how='outer',
    on=['Season', 'GameNumber', 'EventNumber', 'Zone'],
).sort_values(['Season', 'GameNumber', 'EventNumber'])



##### merge dc (shot, goal, miss, block) and dturn to one dataset: df #####
# Outer merge on the shared event/player keys, then order by
# season, gamenumber, eventnumber.

df = pd.merge(
    dc,
    dturn,
    how='outer',
    on=['Season', 'GameNumber', 'EventNumber', 'TeamCode', 'PlayerName', 'PlayerNumber', 'Zone'],
).sort_values(['Season', 'GameNumber', 'EventNumber'])


##### merge de (penalty, hit, faceoff) and df to one dataset: dg #####
# Only the event keys and Zone are used to match rows here.

dg = pd.merge(
    df,
    de,
    how='outer',
    on=['Season', 'GameNumber', 'EventNumber', 'Zone'],
).sort_values(['Season', 'GameNumber', 'EventNumber'])


################################ dg includes all on-ice events ##########################################


##### import game detail data, sorted by season, gamenumber #####

gameplay = 't_game_detail_o.csv'

dplay = pd.read_csv('/'.join([currPath, gameplay])).sort_values(['Season', 'GameNumber'])


##### merge dplay file on dg (all on-ice events) and create new dataset: dh #####
# Rows are matched on the game identifiers plus both team codes.

dh = pd.merge(
    dg,
    dplay,
    how='outer',
    on=['Season', 'GameNumber', 'VTeamCode', 'HTeamCode'],
).sort_values(['Season', 'GameNumber', 'EventNumber'])


##### merge dh on dm (play by play) and create dn, sorted by season, gamenumber, eventnumber #####

dn = pd.merge(
    dm,
    dh,
    how='outer',
    on=['Season', 'GameNumber', 'EventNumber'],
).sort_values(['Season', 'GameNumber', 'EventNumber'])



############################# keep first two games of the season #############################################
# Early cut so the remaining steps operate on a small frame.

dn = dn[dn.GameNumber <= 20002]


##### drop column GameDate #####

dn = dn.drop('GameDate', axis=1)


##### create list for PP and SH observations #####

value_list = ['PP', 'SH']


##### drop non-even strength situations #####
# Rows whose AdvantageType is missing are kept (isin() is False for NaN), the
# same rows the two separate "!=" filters kept. .copy() makes dn an independent
# frame so the column assignments below do not write into a slice.

dn = dn[~dn.AdvantageType.isin(value_list)].copy()


##### fill in NaN values with Even strength situations #####

dn['AdvantageType'] = dn['AdvantageType'].fillna('EV')


##### seconds between each event and the previous remaining event #####
# The difference is taken within each Season/GameNumber/Period group, so the
# first event of a period is never compared with the last event of the previous
# period or game; it gets NaN and is removed by the <= 20 filter below.
# abs() is retained from the original computation.

dn['TimeBetweenEvent'] = (
    dn.groupby(['Season', 'GameNumber', 'Period'])['EventTimeFromZero']
    .diff()
    .abs()
)


##### keep only games numbered <= 20001 #####
# NOTE(review): the original heading here said "first two games of the season",
# but the threshold is 20001 while the cut at the top uses 20002 — confirm
# which one is intended.

dn = dn[dn.GameNumber <= 20001]


##### keep only events that are <= 20 seconds after the previous event #####

dn = dn[dn.TimeBetweenEvent <= 20]

dn

#shape = dn.shape
#print(shape)



Unnamed: 0,Season,GameNumber,EventNumber,Period,AdvantageType,EventTimeFromZero,EventTimeFromTwenty,EventType,EventDetail,VPlayer1,...,HitteePlayerNumber,HitteePlayerName,PenaltyTeamCode,PenaltyPlayerNumber,PenaltyPlayerName,PenaltyType,DrawnByTeamCode,DrawnByPlayerNumber,DrawnByPlayerName,TimeBetweenEvent
1,2010,20001,3,1,EV,15,1185,HIT,"TOR #37 BRENT HIT MTL #26 GORGES, Off. Zone",11,...,26.0,GORGES,,,,,,,,15.0
3,2010,20001,5,1,EV,57,1143,HIT,"MTL #76 SUBBAN HIT TOR #15 KABERLE, Neu. Zone",14,...,15.0,KABERLE,,,,,,,,11.0
4,2010,20001,6,1,EV,69,1131,GIVE,"TOR&nbsp;GIVEAWAY - #35 GIGUERE, Def. Zone",14,...,,,,,,,,,,12.0
5,2010,20001,7,1,EV,73,1127,BLOCK,"MTL #76 SUBBAN BLOCKED BY TOR #2 SCHENN, Wris...",14,...,,,,,,,,,,4.0
6,2010,20001,8,1,EV,86,1114,SHOT,"MTL ONGOAL - #81 ELLER, Wrist, Off. Zone, 11 ft.",14,...,,,,,,,,,,13.0
7,2010,20001,9,1,EV,91,1109,SHOT,"MTL ONGOAL - #46 KOSTITSYN, Snap, Off. Zone, 8...",14,...,,,,,,,,,,5.0
8,2010,20001,10,1,EV,95,1105,BLOCK,"MTL #76 SUBBAN BLOCKED BY TOR #32 VERSTEEG, S...",14,...,,,,,,,,,,4.0
9,2010,20001,11,1,EV,102,1098,MISS,"MTL #76 SUBBAN, Slap, Wide of Net, Off. Zone, ...",15,...,,,,,,,,,,7.0
10,2010,20001,12,1,EV,104,1096,GIVE,"MTL&nbsp;GIVEAWAY - #52 DARCHE, Neu. Zone",15,...,,,,,,,,,,2.0
11,2010,20001,13,1,EV,108,1092,BLOCK,"TOR #16 MACARTHUR BLOCKED BY MTL #26 GORGES, ...",15,...,,,,,,,,,,4.0
