## player quality

###  import modules

In [1]:
import sys
import os
import pandas as pd
import numpy as np
import datetime, time
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
from pylab import hist, show
import scipy

### import data 

In [2]:
# Read the reshaped event table and discard the 'Unnamed: 0' column
# (presumably the row index that was saved into the CSV upstream -- confirm).
dm = pd.read_csv('out_data/reshape.csv').drop(columns='Unnamed: 0')

In [3]:
# Display the column labels of the freshly loaded frame.
dm.columns

Index(['AdvantageType', 'EventNumber', 'EventType', 'GameNumber', 'HTeamCode',
       'Period', 'PlayerNumber', 'Season', 'TeamCode', 'VTeamCode', 'Zone',
       'endtime', 'starttime', 'VPlayer', 'HPlayer', 'VPosition', 'HPosition'],
      dtype='object')

- sort values of data frame and organize the order of the columns

In [4]:
# Sort rows by season, game, period and event number (ascending is the
# sort_values default for every key), then fix the column order.
dm = dm.sort_values(by=['Season', 'GameNumber', 'Period', 'EventNumber'])
dm = dm[[
    'Season', 'GameNumber', 'Period', 'AdvantageType',
    'EventNumber', 'EventType', 'Zone',
    'VTeamCode', 'VPlayer', 'VPosition',
    'HTeamCode', 'HPlayer', 'HPosition',
    'TeamCode', 'PlayerNumber',
    'starttime', 'endtime',
]]

- assign the home and away team codes based on the on-ice event, then use forward and backward fill to propagate each team code to every row of the same game

In [5]:
# Where the event's player number equals the row's HPlayer, take the event's
# TeamCode as the home team code; all other rows are set to NaN for now.
dm['HTeamCode'] = np.where(
    dm['PlayerNumber'].eq(dm['HPlayer']),
    dm['TeamCode'],
    np.nan,
)

In [6]:
# Spread the known home team code over every row of the same game: forward
# fill first, then back fill the rows that precede the first known value.
# A game is identified by (Season, GameNumber) -- the keys this notebook also
# uses for sorting and for the per-game aggregates -- so a game number that is
# reused in another season never inherits a team code from a different game.
dm['HTeamCode'] = dm.groupby(['Season', 'GameNumber'])['HTeamCode'].ffill()
dm['HTeamCode'] = dm.groupby(['Season', 'GameNumber'])['HTeamCode'].bfill()

In [7]:
# Where the event's player number equals the row's VPlayer, take the event's
# TeamCode as the visiting team code; all other rows are set to NaN for now.
dm['VTeamCode'] = np.where(
    dm['PlayerNumber'].eq(dm['VPlayer']),
    dm['TeamCode'],
    np.nan,
)

In [8]:
# Spread the known visiting team code over every row of the same game: forward
# fill first, then back fill the rows that precede the first known value.
# A game is identified by (Season, GameNumber) -- the keys this notebook also
# uses for sorting and for the per-game aggregates -- so a game number that is
# reused in another season never inherits a team code from a different game.
dm['VTeamCode'] = dm.groupby(['Season', 'GameNumber'])['VTeamCode'].ffill()
dm['VTeamCode'] = dm.groupby(['Season', 'GameNumber'])['VTeamCode'].bfill()

- each period starts off with a faceoff in the neutral zone. Fill in 'NaN' with 'N' (neutral).

In [9]:
# Replace missing Zone values with 'N'; no other column is touched.
dm = dm.fillna({'Zone': 'N'})

In [10]:
# Fill missing event team codes with the game's visiting team code.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on dm['TeamCode'] is chained assignment, which emits a FutureWarning on
# pandas 2.x and leaves dm unchanged once Copy-on-Write is enabled.
# Games are keyed by (Season, GameNumber), matching the per-game aggregates
# used elsewhere in this notebook.
dm['TeamCode'] = dm['TeamCode'].fillna(
    dm.groupby(['Season', 'GameNumber'])['VTeamCode'].ffill()
)

- use **numpy** to fill in the player position and player number, depending on whether the player involved in the on-ice event played for the home or away team.

In [11]:
# Use VPosition when the event belongs to the visiting team AND the event's
# player number matches VPlayer; every other row takes HPosition.
dm['PlayerPosition'] = np.where(
    dm['TeamCode'].eq(dm['VTeamCode']) & dm['PlayerNumber'].eq(dm['VPlayer']),
    dm['VPosition'],
    dm['HPosition'],
)

In [12]:
# Overwrite PlayerNumber: use VPlayer when the event belongs to the visiting
# team AND the PlayerPosition computed in the previous cell equals VPosition;
# every other row takes HPlayer. This must run after PlayerPosition is set.
dm['PlayerNumber'] = np.where(
    dm['TeamCode'].eq(dm['VTeamCode']) & dm['PlayerPosition'].eq(dm['VPosition']),
    dm['VPlayer'],
    dm['HPlayer'],
)

- Assign a value of 0 to both variables **'starttime'** and **'endtime'**, since the first faceoff of each period has no time duration. 

In [13]:
# Replace missing start/end times with 0.
# The filled Series is assigned back to its column: fillna(..., inplace=True)
# on a column selection is chained assignment, which emits a FutureWarning on
# pandas 2.x and leaves dm unchanged once Copy-on-Write is enabled.
dm['starttime'] = dm['starttime'].fillna(0)
dm['endtime'] = dm['endtime'].fillna(0)

In [14]:
# Count the remaining missing values in each column.
dm.isna().sum()

Season            0
GameNumber        0
Period            0
AdvantageType     0
EventNumber       0
EventType         0
Zone              0
VTeamCode         0
VPlayer           0
VPosition         0
HTeamCode         0
HPlayer           0
HPosition         0
TeamCode          0
PlayerNumber      0
starttime         0
endtime           0
PlayerPosition    0
dtype: int64

- determine the length of each shift.

In [15]:
# Row-wise duration: endtime minus starttime.
dm['shift'] = dm['endtime'].sub(dm['starttime'])

- estimate the time on ice (TOI) per player per game.

In [16]:
# Sum 'shift' within each (season, game, team, player number, position) group
# and broadcast that total back onto every row of the group.
dm['TOI'] = (
    dm.groupby(by=['Season', 'GameNumber', 'TeamCode', 'PlayerNumber', 'PlayerPosition'])['shift']
    .transform('sum')
)

- count the number of events each player participated in per game.

In [17]:
# Count the non-null EventNumber entries within each (season, game, team,
# player number, position) group and broadcast the count onto every row.
dm['eventspergame'] = (
    dm.groupby(by=['Season', 'GameNumber', 'TeamCode', 'PlayerNumber', 'PlayerPosition'])['EventNumber']
    .transform('count')
)

- keep only one observation showing the total time on ice (TOI) for each player on each team per game.

In [18]:
# Put the columns into their final order, with the derived columns
# (PlayerPosition, shift, eventspergame, TOI) slotted into place.
dm = dm[[
    'Season', 'GameNumber', 'Period', 'AdvantageType',
    'EventNumber', 'EventType', 'Zone',
    'VTeamCode', 'VPlayer', 'VPosition',
    'HTeamCode', 'HPlayer', 'HPosition',
    'TeamCode', 'PlayerNumber', 'PlayerPosition',
    'starttime', 'endtime', 'shift', 'eventspergame', 'TOI',
]]

In [19]:
# Keep the first row for each player on each team in each game.
# 'Season' is part of the key because the per-game aggregates (TOI,
# eventspergame) are computed per (Season, GameNumber, ...); without it, rows
# sharing a game number, team and player number across seasons would be
# collapsed into a single observation.
dm = dm.drop_duplicates(subset=['Season', 'GameNumber', 'TeamCode', 'PlayerNumber'])

In [20]:
# Write the per-player, per-game table to disk (comma-separated, the default).
# The previous call passed index='False' -- a non-empty string, which is truthy
# -- so the row index has always been written as the first, unnamed column.
# The flag is now an explicit boolean with the same effect, so the file layout
# is unchanged.
# NOTE(review): to stop writing the index, switch to index=False, but only
# together with any reader that drops 'Unnamed: 0' the way this notebook does
# for reshape.csv.
dm.to_csv('out_data/player_quality.csv', index=True)

#### roster position follows.