## reshape dataset from wide to long

###  import modules

In [1]:
import sys
import os
import pandas as pd
import numpy as np
import datetime, time
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
from pylab import hist, show
import scipy

### import data frame

In [2]:
# Load the merged play-by-play table and discard the 'Unnamed: 0' column
# that read_csv creates from the unnamed first column of the file.
dm = pd.read_csv('out_data/pbpmerge.csv').drop(columns='Unnamed: 0')

In [3]:
# (rows, columns) of the wide frame right after loading.
dm.shape

(256616, 44)

### roster position

Play-by-play data reports each on-ice event along with all 12 players who were on the ice during that event, and the outcome of the event. There are 6 players for the visitor team and 6 for the home team. Positions 1, 2 and 3 are the forward positions, 4 and 5 are the defense positions and 6 is the goaltender position. Each position is categorized below.

#### a) for visitor team:

In [4]:
# NOTE(review): these visitor position labels are disabled, yet the reshape
# cell further down gathers every column containing 'VPosition' — presumably
# the loaded CSV already carries those columns; confirm, then either delete
# this cell or re-enable it. Slot 6 has no label here.
#dm['VPosition1'] = 'C'
#dm['VPosition2'] = 'RW'
#dm['VPosition3'] = 'LW'
#dm['VPosition4'] = 'RD'
#dm['VPosition5'] = 'LD'

#### b) for home team: 

In [5]:
# NOTE(review): these home position labels are disabled, yet the reshape
# cell further down gathers every column containing 'HPosition' — presumably
# the loaded CSV already carries those columns; confirm, then either delete
# this cell or re-enable it. Slot 6 has no label here.
#dm['HPosition1'] = 'C'
#dm['HPosition2'] = 'RW'
#dm['HPosition3'] = 'LW'
#dm['HPosition4'] = 'RD'
#dm['HPosition5'] = 'LD'

- Once each roster position has been determined, the next step is to reshape the data set from wide to long. Instead of having 2 columns for each roster position (24 total), all players will be listed in 4 columns: 2 columns for the visitor team (**'VPlayer'** & **'VPosition'**) and 2 columns for the home team (**'HPlayer'** & **'HPosition'**).

In [6]:
# Order events by season, game, period and event number (all ascending,
# which is the sort_values default).
chronological_keys = ['Season', 'GameNumber', 'Period', 'EventNumber']
dm = dm.sort_values(by=chronological_keys)

In [7]:
# Collect the wide per-slot columns for each of the four long-format targets.
# The lists keep dm's column order, so slot k of every list must describe the
# same roster slot for players and positions to pair up correctly.
a, b, c, d = (
    [name for name in dm.columns if token in name]
    for token in ('VPlayer', 'HPlayer', 'VPosition', 'HPosition')
)

# Stack the slots into one row per (event, roster slot). lreshape defaults to
# dropna=True, so a stacked row is discarded when any of its four values is NaN.
dm = pd.lreshape(dm, dict(VPlayer=a, HPlayer=b, VPosition=c, HPosition=d))

In [8]:
# Column labels after the reshape: the untouched event columns followed by
# the four stacked columns (VPlayer, HPlayer, VPosition, HPosition).
dm.columns

Index(['AdvantageType', 'EventDetail', 'EventNumber', 'EventTimeFromTwenty',
       'EventTimeFromZero', 'EventType', 'GameDate', 'GameNumber', 'HTeamCode',
       'Length', 'PenaltyType', 'Period', 'PlayerName', 'PlayerNumber',
       'Season', 'ShotResult', 'ShotType', 'TeamCode', 'VTeamCode', 'Zone',
       'VPlayer', 'HPlayer', 'VPosition', 'HPosition'],
      dtype='object')

In [9]:
# (rows, columns) of the long frame produced by the reshape.
dm.shape

(1531666, 24)

In [11]:
# Give the event-level identifier columns an explicit 'Event' prefix, then
# put the columns into a fixed presentation order.
event_column_names = {
    'PlayerNumber': 'EventPlayerNumber',
    'TeamCode': 'EventTeamCode',
    'PlayerName': 'EventPlayerName',
}
column_order = [
    'Season', 'GameNumber', 'GameDate', 'Period', 'AdvantageType', 'Zone',
    'EventNumber', 'EventType', 'EventDetail',
    'EventTeamCode', 'EventPlayerNumber', 'EventPlayerName',
    'EventTimeFromZero', 'EventTimeFromTwenty',
    'VTeamCode', 'VPlayer', 'VPosition',
    'HTeamCode', 'HPlayer', 'HPosition',
    'ShotType', 'ShotResult', 'Length', 'PenaltyType',
]
dm = dm.rename(columns=event_column_names)[column_order]

In [12]:
# Re-sort so that all rows belonging to one event sit together again
# (ascending on every key, which is the sort_values default).
chronological_keys = ['Season', 'GameNumber', 'Period', 'EventNumber']
dm = dm.sort_values(by=chronological_keys)

In [13]:
# (rows, columns) after renaming, reordering and sorting.
dm.shape

(1531666, 24)

In [14]:
# Persist the reshaped long-format frame.
# `index` must be the boolean False: the previous value was the string
# 'False', which is truthy, so pandas wrote the row index as an extra
# unnamed first column.
# NOTE(review): the file no longer has that leading column — any reader
# that drops 'Unnamed: 0' from reshape.csv must stop doing so.
dm.to_csv('out_data/reshape.csv', index=False, sep=',')

### fill in missing data (NaN)

- display which columns are missing data and the number of NaN values in each.

In [15]:
# Per-column count of missing values.
dm.isna().sum()

Season                       0
GameNumber                   0
GameDate                     0
Period                       0
AdvantageType                0
Zone                         0
EventNumber                  0
EventType                    0
EventDetail                  0
EventTeamCode                0
EventPlayerNumber         1093
EventPlayerName              0
EventTimeFromZero            0
EventTimeFromTwenty          0
VTeamCode                    0
VPlayer                      0
VPosition                    0
HTeamCode                    0
HPlayer                      0
HPosition                    0
ShotType                874565
ShotResult             1394600
Length                 1042103
PenaltyType            1473385
dtype: int64

- use pandas **fillna** to fill in the player number when a team gets a penalty for having too many players on the ice. In this case, 'TEAM' is assigned as the player number.

In [16]:
# Replace missing event player numbers with the literal placeholder 'TEAM'.
TEAM_PLACEHOLDER = 'TEAM'
event_player_number = dm['EventPlayerNumber']
dm['EventPlayerNumber'] = event_player_number.fillna(value=TEAM_PLACEHOLDER)

- save play by play data set

In [17]:
# Persist the final play-by-play frame.
# `index` must be the boolean False: the previous value was the string
# 'False', which is truthy, so pandas wrote the row index as an extra
# unnamed first column.
# NOTE(review): the file no longer has that leading column — any reader
# that drops 'Unnamed: 0' from play_by_play.csv must stop doing so.
dm.to_csv('out_data/play_by_play.csv', index=False, sep=',')

#### player quality follows

In [18]:
# (rows, columns) of the frame that was just saved.
dm.shape

(1531666, 24)

In [19]:
# Per-column count of missing values after the EventPlayerNumber fill.
dm.isna().sum()

Season                       0
GameNumber                   0
GameDate                     0
Period                       0
AdvantageType                0
Zone                         0
EventNumber                  0
EventType                    0
EventDetail                  0
EventTeamCode                0
EventPlayerNumber            0
EventPlayerName              0
EventTimeFromZero            0
EventTimeFromTwenty          0
VTeamCode                    0
VPlayer                      0
VPosition                    0
HTeamCode                    0
HPlayer                      0
HPosition                    0
ShotType                874565
ShotResult             1394600
Length                 1042103
PenaltyType            1473385
dtype: int64