### game_team_roster_quality

In [3]:
import sys
import os
import pandas as pd
import numpy as np
import datetime, time
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
from pylab import hist, show
import scipy
import zipfile


# Notebook-wide display limits: show at most 50 rows and 200 columns when a
# DataFrame is rendered as cell output.
pd.options.display.max_rows = 50
pd.options.display.max_columns = 200


In [4]:
# Show the kernel's current working directory; the relative CSV paths used in
# this notebook are resolved against it. Plain Python is used instead of the bare
# `pwd` automagic, which is not valid Python and stops working if a variable
# named `pwd` is ever defined.
os.getcwd()

'/Users/stefanostselios/Desktop/nhl_roster_design-master'

In [5]:
# Load the player-level roster/rank table. The first CSV column is used as the
# row index rather than as a data column.
dm = pd.read_csv(
    'team_roster_player_rank.csv',
    index_col=0,
)

In [6]:
# Keep only the columns used by the rest of the notebook, in a fixed order.
# .copy() gives an independent frame so that the column assignments in the
# following cells do not operate on a slice of the loaded table
# (avoids pandas' SettingWithCopyWarning).
# The original leading `dm.head()` was dropped: only the last expression of a
# cell is rendered, so it had no effect.
dm = dm[['Season', 'GameNumber', 'TeamCode', 'RosterCount', 'PlayerNumber', 'Position', 'Rank']].copy()
dm.head()

Unnamed: 0,Season,GameNumber,TeamCode,RosterCount,PlayerNumber,Position,Rank
0,2010,20001,MTL,18.0,11.0,F,2
1,2010,20001,MTL,18.0,21.0,F,1
2,2010,20001,MTL,18.0,57.0,F,2
3,2010,20001,MTL,18.0,26.0,D,2
4,2010,20001,MTL,18.0,75.0,D,2


- Group by season, game number, team and player to count the occurrences of each player per game, then sum these counts over the players. There should be 19 players per team and 38 per game for the dataset to be correct.

In [90]:
# For every row, store how many rows share the same season, game, team and
# player number (non-null PlayerNumber values are counted).
dm['playercount'] = (
    dm.groupby(by=['Season', 'GameNumber', 'TeamCode', 'PlayerNumber'])['PlayerNumber']
    .transform('count')
)

In [91]:
# For every row, store the total of `playercount` over all rows that share the
# same season, game, team, position and rank.
dm['rosterposition'] = (
    dm.groupby(by=['Season', 'GameNumber', 'TeamCode', 'Position', 'Rank'])['playercount']
    .transform('sum')
)

### pivot table

- The next step is to group players by game number, team code, position and rank, to display the quality of players each team has per position. **Pivot table** by player position and rank using the roster position values. Season, game number, team and roster count are the indexes. We then join the column levels to generate one column per roster position and rank (4 columns: D1, D2, F1, F2).


In [92]:
# Reshape to one row per (Season, GameNumber, TeamCode, RosterCount) with one
# column per (Position, Rank) pair holding the `rosterposition` value.
# pivot_table aggregates with its default (mean); `rosterposition` was built
# with transform('sum') over the same team/game/position/rank keys, so it is
# constant within each pivot cell and the mean returns that same value.
dm = pd.pivot_table(
    dm,
    index=['Season', 'GameNumber', 'TeamCode', 'RosterCount'],
    columns=['Position', 'Rank'],
    values=['rosterposition'],
)
dm = dm.reset_index()

# Flatten the three-level column labels into single strings such as
# 'rosterposition_F_1'; empty levels (the former index columns) are skipped.
dm.columns = ['_'.join(str(s).strip() for s in col if s) for col in dm.columns]

# The original cell also called `dm.reset_index()` here without assigning the
# result; that statement had no effect and has been removed.

# A missing position/rank combination for a team-game becomes 0.
dm = dm.fillna(0)
dm = dm.rename(columns={'rosterposition_F_1': 'F1', 'rosterposition_F_2': 'F2', 'rosterposition_D_1': 'D1', 'rosterposition_D_2': 'D2'})
dm.head(10)

Unnamed: 0,Season,GameNumber,TeamCode,RosterCount,D1,D2,F1,F2
0,2010,20001,MTL,18.0,1.0,5.0,2.0,10.0
1,2010,20001,TOR,18.0,1.0,5.0,2.0,10.0
2,2010,20002,PHI,18.0,2.0,4.0,5.0,7.0
3,2010,20002,PIT,18.0,3.0,3.0,5.0,7.0
4,2010,20003,CAR,18.0,1.0,5.0,3.0,9.0
5,2010,20003,MIN,18.0,1.0,5.0,2.0,10.0
6,2010,20004,CHI,18.0,2.0,4.0,4.0,8.0
7,2010,20004,COL,18.0,1.0,5.0,2.0,10.0
8,2010,20005,CGY,18.0,1.0,5.0,3.0,9.0
9,2010,20005,EDM,18.0,1.0,5.0,0.0,12.0


In [93]:
# Save the per-game, per-team table.
# The original passed index='False' (a non-empty string, hence truthy), so the
# row index has always been written as the first CSV column. That behaviour is
# kept and made explicit. Switch to index=False only after confirming that no
# reader of this file relies on index_col=0.
dm.to_csv('season_game_team_roster.csv', index=True, sep=',')

In [94]:
# Display (rows, columns) of the per-game, per-team table as a quick size check.
dm.shape

(2030, 8)

In [95]:
# Work on an independent copy for the season-level averages. A plain `dz = dm`
# only creates a second name for the same DataFrame, so the Mean* columns added
# to `dz` in the next cell would also appear in `dm`.
dz = dm.copy()

In [96]:
# Add, for each count column, the average of that column over all rows of the
# same (Season, TeamCode); the average is repeated on every row of the group.
# Columns are created in the order MeanF1, MeanF2, MeanD1, MeanD2.
for count_col in ('F1', 'F2', 'D1', 'D2'):
    dz['Mean' + count_col] = dz.groupby(['Season', 'TeamCode'])[count_col].transform('mean')
dz.head(10)

Unnamed: 0,Season,GameNumber,TeamCode,RosterCount,D1,D2,F1,F2,MeanF1,MeanF2,MeanD1,MeanD2
0,2010,20001,MTL,18.0,1.0,5.0,2.0,10.0,3.191176,8.808824,1.044118,4.955882
1,2010,20001,TOR,18.0,1.0,5.0,2.0,10.0,1.985714,10.014286,0.671429,5.328571
2,2010,20002,PHI,18.0,2.0,4.0,5.0,7.0,4.888889,7.111111,2.569444,3.430556
3,2010,20002,PIT,18.0,3.0,3.0,5.0,7.0,4.295775,7.704225,2.633803,3.366197
4,2010,20003,CAR,18.0,1.0,5.0,3.0,9.0,2.986842,9.013158,1.171053,4.828947
5,2010,20003,MIN,18.0,1.0,5.0,2.0,10.0,2.544118,9.455882,0.970588,5.029412
6,2010,20004,CHI,18.0,2.0,4.0,4.0,8.0,3.788462,8.211538,2.0,4.0
7,2010,20004,COL,18.0,1.0,5.0,2.0,10.0,1.90625,10.09375,0.953125,5.046875
8,2010,20005,CGY,18.0,1.0,5.0,3.0,9.0,2.930556,9.069444,1.0,5.0
9,2010,20005,EDM,18.0,1.0,5.0,0.0,12.0,0.0,12.0,0.470588,5.529412


In [97]:
# Collapse to one row per (Season, TeamCode), keeping the first row of each pair.
# The Mean* columns are identical on every row of a pair, so nothing is lost for
# them; the other columns keep whatever value that first row had.
dz = dz.drop_duplicates(subset=['Season', 'TeamCode'], keep='first')

In [98]:
# Keep the team-season identifiers, RosterCount and the four season averages,
# then preview the first five rows.
dz = dz.loc[:, ['Season', 'TeamCode', 'RosterCount', 'MeanF1', 'MeanF2', 'MeanD1', 'MeanD2']]
dz.head(5)

Unnamed: 0,Season,TeamCode,RosterCount,MeanF1,MeanF2,MeanD1,MeanD2
0,2010,MTL,18.0,3.191176,8.808824,1.044118,4.955882
1,2010,TOR,18.0,1.985714,10.014286,0.671429,5.328571
2,2010,PHI,18.0,4.888889,7.111111,2.569444,3.430556
3,2010,PIT,18.0,4.295775,7.704225,2.633803,3.366197
4,2010,CAR,18.0,2.986842,9.013158,1.171053,4.828947


In [99]:
# Display (rows, columns) of the one-row-per-(Season, TeamCode) table.
dz.shape

(30, 7)

In [100]:
# Save the team-season averages.
# The original passed index='False' (a non-empty string, hence truthy), so the
# row index has always been written as the first CSV column. That behaviour is
# kept and made explicit. Switch to index=False only after confirming that no
# reader of this file relies on index_col=0.
dz.to_csv('season_team.csv', index=True, sep=',')

- Create an indicator variable to determine whether a team is the visitor or the home team for a given game. The column is named "A". The first observation per game is the visitor team and is assigned a value of 1. The second and final observation per game is the home team, so every remaining observation is assigned a value of 2 (home team).

In [101]:
# Flag every row as visitor (1.0) or home (2.0): the first row of each game is
# the visitor, every other row of that game is the home team.
#
# Changes from the original cell:
#   * games are identified by (Season, GameNumber) rather than GameNumber alone,
#     so a game number that recurs in another season still gets its own visitor;
#   * 'A' is written directly instead of calling fillna(2) on the whole frame,
#     which would also have overwritten a NaN in any other column.
# The values stay floats so the flattened column names produced by the next
# pivot remain 'F1_1.0', 'F1_2.0', ...
#
# NOTE(review): the earlier pivot_table returns rows sorted by its index keys
# (Season, GameNumber, TeamCode, RosterCount), so "first row of a game" is the
# alphabetically first TeamCode at this point - confirm this really is the
# visiting team.
is_first_row_of_game = dm.groupby(['Season', 'GameNumber']).cumcount() == 0
dm['A'] = np.where(is_first_row_of_game, 1.0, 2.0)

- **Pivot table using season and game number as index, by whether a team is visitor (1) or home (2)**. The table will display the quality of the players per position and team. The next step is to join the column levels by team and player quality value. We will have 4 columns for each team (2 positions x 2 types of player quality). We will rename the columns as follows: VF1 shows the number of elite forwards for the visitor team, HF1 displays the number of elite forwards for the home team, etc. We rename the columns and order them by team, position and quality.

In [102]:
# Reshape to one row per (Season, GameNumber), spreading the four count columns
# across the visitor/home flag 'A'. pivot_table aggregates with its default
# (mean), so if several rows of a game carry the same flag their counts are
# averaged rather than reported as an error.
dm = pd.pivot_table(
    dm,
    index=['Season', 'GameNumber'],
    columns=['A'],
    values=['F1', 'F2', 'D1', 'D2'],
)
dm = dm.reset_index()

# Flatten the two-level column labels into strings such as 'F1_1.0'
# (count column + flag value); empty levels are skipped.
dm.columns = ['_'.join(str(s).strip() for s in col if s) for col in dm.columns]

# The original called reset_index() a second time here; it only added an 'index'
# column that the column selection below discarded, so it has been removed.

# Prefix V = flag 1.0 (visitor), H = flag 2.0 (home).
dm = dm.rename(columns={'F1_1.0': 'VF1', 'F2_1.0': 'VF2', 'D1_1.0': 'VD1', 'D2_1.0': 'VD2', 'F1_2.0': 'HF1', 'F2_2.0': 'HF2', 'D1_2.0': 'HD1', 'D2_2.0': 'HD2'})
dm = dm[['Season', 'GameNumber', 'VF1', 'VF2', 'VD1', 'VD2', 'HF1', 'HF2', 'HD1', 'HD2']]

# Assign the sorted result instead of sorting in place, so the column-sliced
# frame above is never mutated.
dm = dm.sort_values(['Season', 'GameNumber'], ascending=[True, True])
dm.head()

Unnamed: 0,Season,GameNumber,VF1,VF2,VD1,VD2,HF1,HF2,HD1,HD2
0,2010,20001,2.0,10.0,1.0,5.0,2.0,10.0,1.0,5.0
1,2010,20002,5.0,7.0,2.0,4.0,5.0,7.0,3.0,3.0
2,2010,20003,3.0,9.0,1.0,5.0,2.0,10.0,1.0,5.0
3,2010,20004,4.0,8.0,2.0,4.0,2.0,10.0,1.0,5.0
4,2010,20005,3.0,9.0,1.0,5.0,0.0,12.0,1.0,5.0


In [103]:
# Save the one-row-per-game table.
# The original passed index='False' (a non-empty string, hence truthy), so the
# row index has always been written as the first CSV column. That behaviour is
# kept and made explicit. Switch to index=False only after confirming that no
# reader of this file relies on index_col=0.
dm.to_csv('season_game_roster.csv', index=True, sep=',')

In [104]:
# Display (rows, columns) of the one-row-per-(Season, GameNumber) table.
dm.shape

(1015, 10)