# Data

## season_game_level_data

In [1]:
import sys
import os
import pandas as pd
import numpy as np
import datetime, time
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
from statsmodels.formula.api import ols
from statsmodels.iolib.summary2 import summary_col
from pylab import hist, show
import scipy
import zipfile


pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 200)

**data frames used in this notebook:**
- da = pbp
- dg = season_games_data
- dm = play_by_play
- dp = player_rankings
- dw = team_roster_player_rank
- dv = season_team_roster_ranking
- dx = season_game_team_roster
- dz = season_team
- dy = season_game_roster

**for analysis:**
- ds = season_level
- dl = season_game_level
- dt = season_game_team_level

In [2]:
pwd

'/Users/stefanostselios/Desktop/nhl_roster_design-master'

### import play by play data set

In [3]:
da = pd.read_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/pbp_merged.csv')
#da = pd.read_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/pbp_merged.csv')
da = da.drop('Unnamed: 0', axis=1)
da = da.rename(columns={'TeamCode': 'EventTeamCode'})


keep regular season games

In [4]:
da = da[da['GameNumber'] <= 21230]

drop irrelevant data

In [5]:
da = da[da['EventType']!='STOP']
da = da[da['EventType']!='EISTR']
da = da[da['EventType']!='EIEND']

- display goals for each game and drop duplicates.

In [6]:
dh = da[da['EventTeamCode']==da['HTeamCode']]
dh = dh.rename(columns={'EventTeamCode': 'HTeam'})
dh['goal'] = dh.apply(lambda x: 1 if (x['EventType'] == 'GOAL') else 0, axis=1)
dh['HGF'] = dh.groupby(['Season', 'GameNumber', 'HTeam'])['goal'].transform('sum')
dh = dh[['Season', 'GameNumber', 'HGF']]
dh = dh.drop_duplicates(['Season', 'GameNumber'])

In [7]:
dv = da[da['EventTeamCode']==da['VTeamCode']]
dv = dv.rename(columns={'EventTeamCode': 'VTeam'})
dv['goal'] = dv.apply(lambda x: 1 if (x['EventType'] == 'GOAL') else 0, axis=1)
dv['VGF'] = dv.groupby(['Season', 'GameNumber', 'VTeam'])['goal'].transform('sum')
dv = dv[['Season', 'GameNumber', 'VGF']]
dv = dv.drop_duplicates(['Season', 'GameNumber'])

Merge into season-game data

In [8]:
dg = da[['Season', 'GameNumber', 'VTeamCode', 'HTeamCode']]
dg = dg.drop_duplicates(['Season', 'GameNumber'])

In [9]:
dg = pd.merge(dg, dh, on=['Season', 'GameNumber'], how='left')
dg = pd.merge(dg, dv, on=['Season', 'GameNumber'], how='left')

- find the goal differential per game with respect to home team.

In [10]:
dg['GD'] = dg['HGF'] - dg['VGF']
dg['WinTeam'] = dg.apply(lambda x: x['HTeamCode'] if x['GD'] > 0 else x['VTeamCode'], axis=1)
dg['LossTeam'] = dg.apply(lambda x: x['HTeamCode'] if x['WinTeam'] != x['HTeamCode'] else x['VTeamCode'], axis=1)

In [11]:
dg.head()

Unnamed: 0,Season,GameNumber,VTeamCode,HTeamCode,HGF,VGF,GD,WinTeam,LossTeam
0,2010,20001,MTL,TOR,3,2,1,TOR,MTL
1,2010,20002,PHI,PIT,2,3,-1,PHI,PIT
2,2010,20003,CAR,MIN,3,4,-1,CAR,MIN
3,2010,20004,CHI,COL,4,3,1,COL,CHI
4,2010,20005,CGY,EDM,4,0,4,EDM,CGY


- display goals againest per team.

In [12]:
dg['VGA'] = dg['HGF']
dg['HGA'] = dg['VGF']

In [13]:
dg = dg[['Season', 'GameNumber', 'VTeamCode', 'HTeamCode', 'VGF', 'HGF', 'GD', 'VGA', 'HGA', 'WinTeam', 'LossTeam']]
dg.head()

Unnamed: 0,Season,GameNumber,VTeamCode,HTeamCode,VGF,HGF,GD,VGA,HGA,WinTeam,LossTeam
0,2010,20001,MTL,TOR,2,3,1,3,2,TOR,MTL
1,2010,20002,PHI,PIT,3,2,-1,2,3,PHI,PIT
2,2010,20003,CAR,MIN,4,3,-1,3,4,CAR,MIN
3,2010,20004,CHI,COL,3,4,1,4,3,COL,CHI
4,2010,20005,CGY,EDM,0,4,4,4,0,EDM,CGY


In [14]:
dg.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/season_game_data.csv', index='False', sep=',')
#dg.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/season_game_data.csv', index='False', sep=',')

## season_level_data

In [15]:
dm = da

events that happened in regulation time only

In [16]:
dm = dm[dm['Period'] <= 3]
dm = dm[dm['Period'] >= 1]

- **reshape the data set from wide to long.**

In [17]:
dm = dm.sort_values(['Season', 'GameNumber', 'Period', 'EventNumber'], ascending=[True, True, True, True])

In [18]:
a = [col for col in dm.columns if 'VPlayer' in col]
b = [col for col in dm.columns if 'HPlayer' in col]
c = [col for col in dm.columns if 'VPosition' in col]
d = [col for col in dm.columns if 'HPosition' in col]
dm = pd.lreshape(dm, {'VPlayer' : a, 'HPlayer' : b, 'VPosition' : c, 'HPosition': d})

In [19]:
dm.shape

(1796745, 24)

In [20]:
dm.columns

Index(['AdvantageType', 'EventDetail', 'EventNumber', 'EventTeamCode',
       'EventTimeFromTwenty', 'EventTimeFromZero', 'EventType', 'GameDate',
       'GameNumber', 'HTeamCode', 'Length', 'PenaltyType', 'Period',
       'PlayerName', 'PlayerNumber', 'Season', 'ShotResult', 'ShotType',
       'VTeamCode', 'Zone', 'VPlayer', 'HPlayer', 'VPosition', 'HPosition'],
      dtype='object')

In [21]:
dm = dm.rename(columns={'PlayerNumber': 'EventPlayerNumber', 'TeamCode': 'EventTeamCode', 'PlayerName': 'EventPlayerName' })
dm = dm[['Season', 'GameNumber', 'GameDate', 'Period', 'AdvantageType', 'Zone', 'EventNumber', 'EventType', 'EventDetail', 'EventTeamCode', 'EventPlayerNumber', 'EventPlayerName', 'EventTimeFromZero', 'EventTimeFromTwenty', 'VTeamCode', 'VPlayer', 'VPosition', 'HTeamCode', 'HPlayer', 'HPosition', 'ShotType', 'ShotResult', 'Length', 'PenaltyType']]
dm = dm.sort_values(['Season', 'GameNumber', 'Period', 'EventNumber'], ascending=[True, True, True, True])

- fill in advantage type with even strength 'EV' and event player number with 'TEAM'

In [22]:
dm['AdvantageType'] = dm['AdvantageType'].fillna('EV')
dm['EventPlayerNumber'] = dm['EventPlayerNumber'].fillna('TEAM')

- save new datast as play by play

In [23]:
dm.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/play_by_play.csv', index='False', sep=',')
#dm.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/play_by_play.csv', index='False', sep=',')

#### create new data set and keep variables: 
- (a) game number.
- (b) visitor team information.
- (c) home team information.

In [24]:
df = dm[['Season', 'GameNumber', 'VTeamCode', 'VPlayer', 'VPosition', 'HTeamCode', 'HPlayer', 'HPosition']]
df = df.sort_values(['Season', 'GameNumber'], ascending=[True, True])
df.head()

Unnamed: 0,Season,GameNumber,VTeamCode,VPlayer,VPosition,HTeamCode,HPlayer,HPosition
0,2010,20001,MTL,11.0,C,TOR,37.0,C
310113,2010,20001,MTL,21.0,R,TOR,9.0,R
620126,2010,20001,MTL,57.0,L,TOR,11.0,L
930061,2010,20001,MTL,26.0,D,TOR,3.0,D
1239931,2010,20001,MTL,75.0,D,TOR,22.0,D


- merge season_game_data (dg) on new dataset

In [25]:
df = pd.merge(df, dg, on=['Season', 'GameNumber', 'VTeamCode', 'HTeamCode'], how='left')
df.head()    

Unnamed: 0,Season,GameNumber,VTeamCode,VPlayer,VPosition,HTeamCode,HPlayer,HPosition,VGF,HGF,GD,VGA,HGA,WinTeam,LossTeam
0,2010,20001,MTL,11.0,C,TOR,37.0,C,2,3,1,3,2,TOR,MTL
1,2010,20001,MTL,21.0,R,TOR,9.0,R,2,3,1,3,2,TOR,MTL
2,2010,20001,MTL,57.0,L,TOR,11.0,L,2,3,1,3,2,TOR,MTL
3,2010,20001,MTL,26.0,D,TOR,3.0,D,2,3,1,3,2,TOR,MTL
4,2010,20001,MTL,75.0,D,TOR,22.0,D,2,3,1,3,2,TOR,MTL


- reshape the data to have home and visitor team observatons under the same coloumns. 

In [26]:
a = [col for col in df.columns if 'Player' in col]
b = [col for col in df.columns if 'Position' in col]
c = [col for col in df.columns if 'TeamCode' in col]
d = [col for col in df.columns if 'GF' in col]
e = [col for col in df.columns if 'GA' in col]
df = pd.lreshape(df, {'PlayerNumber' : a, 'PlayerPosition' : b, 'TeamCode' : c, 'GF' : d, 'GA' : e })
df = df[['Season', 'GameNumber', 'TeamCode', 'PlayerNumber', 'GF', 'GA', 'GD', 'WinTeam', 'LossTeam']]
df.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,GF,GA,GD,WinTeam,LossTeam
0,2010,20001,MTL,11.0,2,3,1,TOR,MTL
1,2010,20001,MTL,21.0,2,3,1,TOR,MTL
2,2010,20001,MTL,57.0,2,3,1,TOR,MTL
3,2010,20001,MTL,26.0,2,3,1,TOR,MTL
4,2010,20001,MTL,75.0,2,3,1,TOR,MTL


### import player position and rankings

In [27]:
dp = pd.read_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/stats_per_player_centers_wingers_defensemen.csv')
#dp = pd.read_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/stats_per_player_centers_wingers_defensemen.csv')
dp = dp.drop('Unnamed: 0', axis=1)
dp = dp.rename(columns={'Position': 'PlayerPosition'})

In [28]:
dp = dp[['Season', 'TeamCode', 'PlayerNumber', 'PlayerName', 'PlayerPosition']]
dp.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,PlayerPosition
0,2010,MTL,11.0,GOMEZ,C
1,2010,TOR,37.0,BRENT,C
2,2010,MTL,14.0,PLEKANEC,C
3,2010,MTL,76.0,SUBBAN,D
4,2010,TOR,35.0,GIGUERE,G


In [29]:
dp.shape

(1058, 5)

In [30]:
dr = pd.read_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/player_rank_for_centers_wingers_defensemen.csv')
#dr = pd.read_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/player_rank_for_centers_wingers_defensemen.csv')
dr = dr.drop('Unnamed: 0', axis=1)
dr = dr.rename(columns={'Position': 'PlayerPosition'})

In [31]:
dr = dr[['Season', 'TeamCode', 'PlayerNumber', 'PlayerName', 'PlayerPosition', 'Rank']]
dr = dr.sort_values(['TeamCode'], ascending=[True])
dr.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank
397,2010,ANA,39.0,BELESKEY,W,3
659,2010,ANA,5.0,SBISA,D,3
137,2010,ANA,22.0,MARCHANT,C,4
135,2010,ANA,20.0,CARTER,C,4
715,2010,ANA,3.0,LILJA,D,3


- merge player position and player rankings and drop goaltenders

In [32]:
ds = pd.merge(dp, dr, on=['Season', 'TeamCode', 'PlayerNumber', 'PlayerName', 'PlayerPosition'], how='left')
ds['Rank'] = ds.groupby(['Season', 'TeamCode', 'PlayerName'])['Rank'].apply(lambda x: x.ffill().bfill())
ds.shape

(1058, 6)

In [33]:
ds.isnull().sum()

Season              0
TeamCode            0
PlayerNumber        0
PlayerName          0
PlayerPosition      0
Rank              256
dtype: int64

skaters that have played less than 9 games were not included in the clusters and therefore not ranked. Since the failed to make the roster on a regular basis, forwards are assigned to the 4th line and defensemen to the bottom (3rd) pairing.

In [34]:
ds['Rank'] = ds.apply(lambda x: 1 if ((x['PlayerPosition'] == 'G') & (pd.isnull(x.Rank))) else 3 if ((x['PlayerPosition'] == 'D') & (pd.isnull(x.Rank))) else 3 if ((x['PlayerPosition'] == 'D') & (pd.isnull(x.Rank))) else 4 if ((x['PlayerPosition'] == 'W') & (pd.isnull(x.Rank))) else 4 if ((x['PlayerPosition'] == 'W') & (pd.isnull(x.Rank))) else 4 if ((x['PlayerPosition'] == 'C') & (pd.isnull(x.Rank))) else x['Rank'], axis=1)
ds = ds.sort_values(['TeamCode'], ascending=[True])
ds.shape

(1058, 6)

In [35]:
ds1 = ds[(ds['PlayerPosition'] == 'D') & (ds['Rank'] == 4)]
ds1.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank
996,2010,OTT,51.0,SMITH,D,4.0


- Derek Smith, a defenseman for the Ottawa Senators has a ranking of 4, which is incorrect since we have 3 defensive pairings. For that reason, he is assigned a rank of 3 which represents the bottom defensive pairing

In [36]:
ds['Rank'] = ds.apply(lambda x: 3 if ((x['PlayerPosition'] == 'D') & (x['TeamCode'] == 'OTT') & (x['PlayerName'] == 'SMITH') & (x['PlayerNumber'] == 51.0)) else x['Rank'], axis=1)

In [37]:
ds1 = ds[(ds['PlayerPosition'] == 'D') & (ds['Rank'] == 4)]
ds1.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank


- **display each player by team per game. Drop duplicates.**

In [38]:
dw = pd.merge(df, ds, on=['Season', 'TeamCode', 'PlayerNumber'], how='left')
dw = dw[['Season', 'GameNumber', 'TeamCode', 'PlayerNumber', 'PlayerName', 'PlayerPosition', 'Rank', 'GF', 'GA', 'GD', 'WinTeam', 'LossTeam']]
dw.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank,GF,GA,GD,WinTeam,LossTeam
0,2010,20001,MTL,11.0,GOMEZ,C,3.0,2,3,1,TOR,MTL
1,2010,20001,MTL,21.0,GIONTA,W,2.0,2,3,1,TOR,MTL
2,2010,20001,MTL,57.0,POULIOT,W,2.0,2,3,1,TOR,MTL
3,2010,20001,MTL,26.0,GORGES,D,3.0,2,3,1,TOR,MTL
4,2010,20001,MTL,75.0,GILL,D,3.0,2,3,1,TOR,MTL


- drop observations that have no player name, position nor ranking. Exclude goaltenders from the roster.

In [39]:
dw = dw[pd.notnull(dw['PlayerPosition'])]
dw.isnull().sum()

Season            0
GameNumber        0
TeamCode          0
PlayerNumber      0
PlayerName        0
PlayerPosition    0
Rank              0
GF                0
GA                0
GD                0
WinTeam           0
LossTeam          0
dtype: int64

In [40]:
dw.shape

(3688734, 12)

In [41]:
dw = dw[dw['PlayerPosition'] != 'G']
dw.shape

(3130215, 12)

- create column that displays the position and roster count by team per game. To simplify matters, we categorize skaters into forwards and defensemen.

In [42]:
dw = dw.drop_duplicates(['Season', 'GameNumber', 'TeamCode', 'PlayerNumber'])
dw['RosterCount'] = dw.groupby(['Season', 'GameNumber', 'TeamCode'])['PlayerNumber'].transform('count')
dw['PositionCount'] = dw.groupby(['Season', 'GameNumber', 'TeamCode', 'PlayerPosition'])['PlayerNumber'].transform('count')
dw.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank,GF,GA,GD,WinTeam,LossTeam,RosterCount,PositionCount
0,2010,20001,MTL,11.0,GOMEZ,C,3.0,2,3,1,TOR,MTL,18.0,5.0
1,2010,20001,MTL,21.0,GIONTA,W,2.0,2,3,1,TOR,MTL,18.0,7.0
2,2010,20001,MTL,57.0,POULIOT,W,2.0,2,3,1,TOR,MTL,18.0,7.0
3,2010,20001,MTL,26.0,GORGES,D,3.0,2,3,1,TOR,MTL,18.0,6.0
4,2010,20001,MTL,75.0,GILL,D,3.0,2,3,1,TOR,MTL,18.0,6.0


- count the amount of forwards and defensemen by team per game.

In [43]:
dw['CCount'] = dw.apply(lambda x: x['PositionCount'] if x['PlayerPosition']=='C' else np.NaN, 1)
dw['WCount'] = dw.apply(lambda x: x['PositionCount'] if x['PlayerPosition']=='W' else np.NaN, 1)
dw['DCount'] = dw.apply(lambda x: x['PositionCount'] if x['PlayerPosition']=='D' else np.NaN, 1)
dw['CCount'] = dw.groupby(['Season','GameNumber', 'TeamCode'])['CCount'].apply(lambda x: x.ffill().bfill())
dw['WCount'] = dw.groupby(['Season','GameNumber', 'TeamCode'])['WCount'].apply(lambda x: x.ffill().bfill())
dw['DCount'] = dw.groupby(['Season','GameNumber', 'TeamCode'])['DCount'].apply(lambda x: x.ffill().bfill())
dw.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank,GF,GA,GD,WinTeam,LossTeam,RosterCount,PositionCount,CCount,WCount,DCount
0,2010,20001,MTL,11.0,GOMEZ,C,3.0,2,3,1,TOR,MTL,18.0,5.0,5.0,7.0,6.0
1,2010,20001,MTL,21.0,GIONTA,W,2.0,2,3,1,TOR,MTL,18.0,7.0,5.0,7.0,6.0
2,2010,20001,MTL,57.0,POULIOT,W,2.0,2,3,1,TOR,MTL,18.0,7.0,5.0,7.0,6.0
3,2010,20001,MTL,26.0,GORGES,D,3.0,2,3,1,TOR,MTL,18.0,6.0,5.0,7.0,6.0
4,2010,20001,MTL,75.0,GILL,D,3.0,2,3,1,TOR,MTL,18.0,6.0,5.0,7.0,6.0


In [44]:
dw1 = dw.drop_duplicates(['Season', 'GameNumber', 'TeamCode'])
dw1 = dw1.sort_values(['Season', 'GameNumber'], ascending=[True, True])

In [45]:
dw.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank,GF,GA,GD,WinTeam,LossTeam,RosterCount,PositionCount,CCount,WCount,DCount
0,2010,20001,MTL,11.0,GOMEZ,C,3.0,2,3,1,TOR,MTL,18.0,5.0,5.0,7.0,6.0
1,2010,20001,MTL,21.0,GIONTA,W,2.0,2,3,1,TOR,MTL,18.0,7.0,5.0,7.0,6.0
2,2010,20001,MTL,57.0,POULIOT,W,2.0,2,3,1,TOR,MTL,18.0,7.0,5.0,7.0,6.0
3,2010,20001,MTL,26.0,GORGES,D,3.0,2,3,1,TOR,MTL,18.0,6.0,5.0,7.0,6.0
4,2010,20001,MTL,75.0,GILL,D,3.0,2,3,1,TOR,MTL,18.0,6.0,5.0,7.0,6.0


In [46]:
dw1.shape

(2460, 17)

In [47]:
dw1.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/team_roster_player_rank_centers_wingers_defensemen.csv', index='False', sep=',')
#dw1.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/team_roster_player_rank_centers_wingers_defensemen.csv', index='False', sep=',')

### full regular season stats

In [48]:
dw2 = dw.copy()
dw2 = dw2.groupby(['Season', 'GameNumber', 'TeamCode', 'PlayerPosition', 'GF', 'GA', 'RosterCount', 'PositionCount', 'CCount', 'WCount', 'DCount', 'WinTeam', 'LossTeam'], as_index=False)['Rank'].mean()
dw2.shape

(7380, 14)

- create columns for team win and team loss. 

In [49]:
dw2['TeamWin'] =  dw2.apply(lambda x: 1 if x['TeamCode']==x['WinTeam'] else 0, 1)
dw2['TeamLos'] =  dw2.apply(lambda x: 1 if x['TeamCode']==x['LossTeam'] else 0, 1)

- display games played, games won, games loss, goals for and goals against by team for the season.

In [50]:
dw2['GP'] = dw2.groupby(['Season', 'PlayerPosition', 'TeamCode'])['GameNumber'].transform('count')
dw2['GW'] = dw2.groupby(['Season', 'PlayerPosition', 'TeamCode'])['TeamWin'].transform('sum')
dw2['GL'] = dw2.groupby(['Season', 'PlayerPosition', 'TeamCode'])['TeamLos'].transform('sum')
dw2['GF'] = dw2.groupby(['Season', 'PlayerPosition', 'TeamCode'])['GF'].transform('sum')
dw2['GA'] = dw2.groupby(['Season', 'PlayerPosition', 'TeamCode'])['GA'].transform('sum')
dw2.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerPosition,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL
0,2010,20001,MTL,C,220,213,18.0,5.0,5.0,7.0,6.0,TOR,MTL,3.2,0,1,82,44,38
1,2010,20001,MTL,D,220,213,18.0,6.0,5.0,7.0,6.0,TOR,MTL,2.166667,0,1,82,44,38
2,2010,20001,MTL,W,220,213,18.0,7.0,5.0,7.0,6.0,TOR,MTL,2.857143,0,1,82,44,38
3,2010,20001,TOR,C,225,259,18.0,4.0,4.0,8.0,6.0,TOR,MTL,3.0,1,0,82,37,45
4,2010,20001,TOR,D,225,259,18.0,6.0,4.0,8.0,6.0,TOR,MTL,2.0,1,0,82,37,45


- create columns with the mean ranking for forward and defenseman by team per game.

In [51]:
dw2['RankC'] = dw2.apply(lambda x: x['Rank'] if x['PlayerPosition']=='C' else np.NaN, 1)
dw2['RankW'] = dw2.apply(lambda x: x['Rank'] if x['PlayerPosition']=='W' else np.NaN, 1)
dw2['RankD'] = dw2.apply(lambda x: x['Rank'] if x['PlayerPosition']=='D' else np.NaN, 1)
dw2['RankC'] = dw2.groupby(['Season','GameNumber', 'TeamCode'])['RankC'].apply(lambda x: x.ffill().bfill())
dw2['RankW'] = dw2.groupby(['Season','GameNumber', 'TeamCode'])['RankW'].apply(lambda x: x.ffill().bfill())
dw2['RankD'] = dw2.groupby(['Season','GameNumber', 'TeamCode'])['RankD'].apply(lambda x: x.ffill().bfill())
dw2.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerPosition,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL,RankC,RankW,RankD
0,2010,20001,MTL,C,220,213,18.0,5.0,5.0,7.0,6.0,TOR,MTL,3.2,0,1,82,44,38,3.2,2.857143,2.166667
1,2010,20001,MTL,D,220,213,18.0,6.0,5.0,7.0,6.0,TOR,MTL,2.166667,0,1,82,44,38,3.2,2.857143,2.166667
2,2010,20001,MTL,W,220,213,18.0,7.0,5.0,7.0,6.0,TOR,MTL,2.857143,0,1,82,44,38,3.2,2.857143,2.166667
3,2010,20001,TOR,C,225,259,18.0,4.0,4.0,8.0,6.0,TOR,MTL,3.0,1,0,82,37,45,3.0,2.5,2.0
4,2010,20001,TOR,D,225,259,18.0,6.0,4.0,8.0,6.0,TOR,MTL,2.0,1,0,82,37,45,3.0,2.5,2.0


- compute the mean per position by team for the season.

In [52]:
dw2['MeanC']= dw2.groupby(['Season', 'TeamCode'])['RankC'].transform('mean')
dw2['MeanW']= dw2.groupby(['Season', 'TeamCode'])['RankW'].transform('mean')
dw2['MeanD']= dw2.groupby(['Season', 'TeamCode'])['RankD'].transform('mean')
dw2.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerPosition,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL,RankC,RankW,RankD,MeanC,MeanW,MeanD
0,2010,20001,MTL,C,220,213,18.0,5.0,5.0,7.0,6.0,TOR,MTL,3.2,0,1,82,44,38,3.2,2.857143,2.166667,3.088618,2.622677,2.226481
1,2010,20001,MTL,D,220,213,18.0,6.0,5.0,7.0,6.0,TOR,MTL,2.166667,0,1,82,44,38,3.2,2.857143,2.166667,3.088618,2.622677,2.226481
2,2010,20001,MTL,W,220,213,18.0,7.0,5.0,7.0,6.0,TOR,MTL,2.857143,0,1,82,44,38,3.2,2.857143,2.166667,3.088618,2.622677,2.226481
3,2010,20001,TOR,C,225,259,18.0,4.0,4.0,8.0,6.0,TOR,MTL,3.0,1,0,82,37,45,3.0,2.5,2.0,2.775407,2.464334,2.224158
4,2010,20001,TOR,D,225,259,18.0,6.0,4.0,8.0,6.0,TOR,MTL,2.0,1,0,82,37,45,3.0,2.5,2.0,2.775407,2.464334,2.224158


- display the quantity of wins and losses per team for the whole season

In [53]:
dw2['L'] = dw2.apply(lambda x: x['GL'] if x['TeamCode']== x['LossTeam'] else (x['GP'] - x['GW']), 1)
dw2['W'] = dw2.apply(lambda x: x['GW'] if x['TeamCode']== x['WinTeam'] else (x['GP'] - x['GL']), 1)
dw2.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerPosition,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL,RankC,RankW,RankD,MeanC,MeanW,MeanD,L,W
0,2010,20001,MTL,C,220,213,18.0,5.0,5.0,7.0,6.0,TOR,MTL,3.2,0,1,82,44,38,3.2,2.857143,2.166667,3.088618,2.622677,2.226481,38,44
1,2010,20001,MTL,D,220,213,18.0,6.0,5.0,7.0,6.0,TOR,MTL,2.166667,0,1,82,44,38,3.2,2.857143,2.166667,3.088618,2.622677,2.226481,38,44
2,2010,20001,MTL,W,220,213,18.0,7.0,5.0,7.0,6.0,TOR,MTL,2.857143,0,1,82,44,38,3.2,2.857143,2.166667,3.088618,2.622677,2.226481,38,44
3,2010,20001,TOR,C,225,259,18.0,4.0,4.0,8.0,6.0,TOR,MTL,3.0,1,0,82,37,45,3.0,2.5,2.0,2.775407,2.464334,2.224158,45,37
4,2010,20001,TOR,D,225,259,18.0,6.0,4.0,8.0,6.0,TOR,MTL,2.0,1,0,82,37,45,3.0,2.5,2.0,2.775407,2.464334,2.224158,45,37


- compute win and loss percent by team. Drop duplicate observations.

In [54]:
dw2 = dw2[['Season', 'TeamCode', 'GP', 'L', 'W', 'GF', 'GA', 'MeanC', 'MeanW', 'MeanD']]
dw2 = dw2.drop_duplicates(['Season', 'TeamCode'])
dw2['WinPc'] = dw2['W']/ dw2['GP']
dw2['LossPc'] = dw2['L']/ dw2['GP']

dw2 = dw2[['Season', 'TeamCode', 'GP','W', 'L', 'GF', 'GA', 'WinPc', 'LossPc', 'MeanC', 'MeanW', 'MeanD']]
dw2.head()

Unnamed: 0,Season,TeamCode,GP,W,L,GF,GA,WinPc,LossPc,MeanC,MeanW,MeanD
0,2010,MTL,82,44,38,220,213,0.536585,0.463415,3.088618,2.622677,2.226481
3,2010,TOR,82,37,45,225,259,0.45122,0.54878,2.775407,2.464334,2.224158
6,2010,PHI,82,47,35,266,233,0.573171,0.426829,1.72561,2.655488,1.735772
9,2010,PIT,82,49,33,244,203,0.597561,0.402439,2.844309,2.652996,2.022358
12,2010,CAR,82,40,42,239,242,0.487805,0.512195,2.159553,2.423393,1.796748


- rank teams based on win percent, mean centres, wingers and defensemen. 

In [55]:
dw2['RankWin'] = dw2.groupby(['Season'])['WinPc'].rank(ascending=False)
dw2['RankC'] = dw2.groupby(['Season'])['MeanC'].rank(ascending=True)
dw2['RankW'] = dw2.groupby(['Season'])['MeanW'].rank(ascending=True)
dw2['RankD'] = dw2.groupby(['Season'])['MeanD'].rank(ascending=True)
dw2 = dw2.sort_values(['Season', 'RankWin', 'RankC', 'RankW', 'RankD'], ascending=[True, True, True, True, True])
dw2.head(30)

Unnamed: 0,Season,TeamCode,GP,W,L,GF,GA,WinPc,LossPc,MeanC,MeanW,MeanD,RankWin,RankC,RankW,RankD
135,2010,VAN,82,54,28,268,190,0.658537,0.341463,2.104878,2.591947,2.228804,1.0,4.0,14.0,17.0
9,2010,PIT,82,49,33,244,203,0.597561,0.402439,2.844309,2.652996,2.022358,2.0,23.0,20.0,8.0
33,2010,SJ,82,48,34,253,216,0.585366,0.414634,1.943699,2.310414,2.382259,3.5,2.0,5.0,22.0
63,2010,WSH,82,48,34,230,203,0.585366,0.414634,2.694106,2.546104,2.521196,3.5,18.0,12.0,27.0
6,2010,PHI,82,47,35,266,233,0.573171,0.426829,1.72561,2.655488,1.735772,6.0,1.0,21.0,1.0
51,2010,DET,82,47,35,263,241,0.573171,0.426829,2.401016,2.196477,1.813298,6.0,9.0,3.0,3.0
48,2010,ANA,82,47,35,241,237,0.573171,0.426829,2.857724,2.399632,2.10482,6.0,25.0,8.0,11.0
66,2010,BOS,82,46,36,250,200,0.560976,0.439024,2.436789,2.105038,1.989837,9.0,10.0,1.0,7.0
132,2010,LA,82,46,36,227,207,0.560976,0.439024,2.635976,2.587882,2.076074,9.0,15.0,13.0,10.0
105,2010,TB,82,46,36,252,246,0.560976,0.439024,2.640854,2.357869,2.506678,9.0,16.0,6.0,26.0


In [56]:
dw2.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/full_season_team_roster_ranking_centers_wingers_defensemen.csv', index='False', sep=',')
#dw2.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/full_season_team_roster_ranking_centers_wingers_defensemen.csv', index='False', sep=',')

### keep games that have 12 forwards and 6 defensemen per team!!

In [57]:
dwfd = dw.copy()
dwfd.shape

(44262, 17)

In [58]:
dwfd = dwfd.groupby(['Season', 'GameNumber']).filter(lambda x: ((x['RosterCount'] == 18) & (x['DCount'] == 6)).all())
dwfd.shape

(31644, 17)

In [59]:
dw3 = dwfd.copy()
dw3 = dw3.groupby(['Season', 'GameNumber', 'TeamCode', 'PlayerPosition', 'GF', 'GA', 'RosterCount', 'PositionCount', 'CCount', 'WCount', 'DCount', 'WinTeam', 'LossTeam'], as_index=False)['Rank'].mean()
dw3.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerPosition,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank
0,2010,20001,MTL,C,2,3,18.0,5.0,5.0,7.0,6.0,TOR,MTL,3.2
1,2010,20001,MTL,D,2,3,18.0,6.0,5.0,7.0,6.0,TOR,MTL,2.166667
2,2010,20001,MTL,W,2,3,18.0,7.0,5.0,7.0,6.0,TOR,MTL,2.857143
3,2010,20001,TOR,C,3,2,18.0,4.0,4.0,8.0,6.0,TOR,MTL,3.0
4,2010,20001,TOR,D,3,2,18.0,6.0,4.0,8.0,6.0,TOR,MTL,2.0


In [60]:
dw3.shape

(5274, 14)

- create columns for team win and team loss. 

In [61]:
dw3['TeamWin'] =  dw3.apply(lambda x: 1 if x['TeamCode']==x['WinTeam'] else 0, 1)
dw3['TeamLos'] =  dw3.apply(lambda x: 1 if x['TeamCode']==x['LossTeam'] else 0, 1)

- display games played, games won, games loss, goals for and goals against by team for the season.

In [62]:
dw3['GP'] = dw3.groupby(['Season', 'PlayerPosition', 'TeamCode'])['GameNumber'].transform('count')
dw3['GW'] = dw3.groupby(['Season', 'PlayerPosition', 'TeamCode'])['TeamWin'].transform('sum')
dw3['GL'] = dw3.groupby(['Season', 'PlayerPosition', 'TeamCode'])['TeamLos'].transform('sum')
dw3['GF'] = dw3.groupby(['Season', 'PlayerPosition', 'TeamCode'])['GF'].transform('sum')
dw3['GA'] = dw3.groupby(['Season', 'PlayerPosition', 'TeamCode'])['GA'].transform('sum')

- create columns with the mean ranking for forward and defenseman by team per game.

In [63]:
dw3['RankC'] = dw3.apply(lambda x: x['Rank'] if x['PlayerPosition']=='C' else np.NaN, 1)
dw3['RankW'] = dw3.apply(lambda x: x['Rank'] if x['PlayerPosition']=='W' else np.NaN, 1)
dw3['RankD'] = dw3.apply(lambda x: x['Rank'] if x['PlayerPosition']=='D' else np.NaN, 1)
dw3['RankC'] = dw3.groupby(['Season','GameNumber', 'TeamCode'])['RankC'].apply(lambda x: x.ffill().bfill())
dw3['RankW'] = dw3.groupby(['Season','GameNumber', 'TeamCode'])['RankW'].apply(lambda x: x.ffill().bfill())
dw3['RankD'] = dw3.groupby(['Season','GameNumber', 'TeamCode'])['RankD'].apply(lambda x: x.ffill().bfill())

- compute the mean per position by team for the season.

In [64]:
dw3['MeanC']= dw3.groupby(['Season', 'TeamCode'])['RankC'].transform('mean')
dw3['MeanW']= dw3.groupby(['Season', 'TeamCode'])['RankW'].transform('mean')
dw3['MeanD']= dw3.groupby(['Season', 'TeamCode'])['RankD'].transform('mean')

- display the quantity of wins and losses per team ( roster of 12 forwards and 6 defensemen)

In [65]:
dw3['L'] = dw3.apply(lambda x: x['GL'] if x['TeamCode']== x['LossTeam'] else (x['GP'] - x['GW']), 1)
dw3['W'] = dw3.apply(lambda x: x['GW'] if x['TeamCode']== x['WinTeam'] else (x['GP'] - x['GL']), 1)

- compute win and loss percent by team. Drop duplicate observations.

In [66]:
dw3 = dw3[['Season', 'TeamCode', 'GP', 'L', 'W', 'GF', 'GA', 'MeanC', 'MeanW', 'MeanD']]
dw3 = dw3.drop_duplicates(['Season', 'TeamCode'])
dw3['WinPc'] = dw3['W']/ dw3['GP']
dw3['LossPc'] = dw3['L']/ dw3['GP']

dw3 = dw3[['Season', 'TeamCode', 'GP','W', 'L', 'GF', 'GA', 'WinPc', 'LossPc', 'MeanC', 'MeanW', 'MeanD']]

- rank teams based on win percent, mean forwards and mean defensemen. 

In [67]:
dw3['RankWin'] = dw3.groupby(['Season'])['WinPc'].rank(ascending=False)
dw3['RankC'] = dw3.groupby(['Season'])['MeanC'].rank(ascending=True)
dw3['RankW'] = dw3.groupby(['Season'])['MeanW'].rank(ascending=True)
dw3['RankD'] = dw3.groupby(['Season'])['MeanD'].rank(ascending=True)
dw3 = dw3.sort_values(['Season', 'RankWin', 'RankC', 'RankW', 'RankD'], ascending=[True, True, True, True, True])
dw3.head(30)

Unnamed: 0,Season,TeamCode,GP,W,L,GF,GA,WinPc,LossPc,MeanC,MeanW,MeanD,RankWin,RankC,RankW,RankD
297,2010,VAN,58,40,18,198,137,0.689655,0.310345,2.123851,2.599651,2.221264,1.0,4.0,14.0,17.0
18,2010,CHI,47,28,19,157,127,0.595745,0.404255,2.097872,2.072568,2.148936,2.0,3.0,1.0,12.0
6,2010,PHI,66,39,27,219,188,0.590909,0.409091,1.719697,2.653409,1.739899,3.0,1.0,21.0,1.0
45,2010,DET,61,36,25,197,181,0.590164,0.409836,2.445628,2.210122,1.806011,4.0,10.0,3.0,3.0
42,2010,ANA,60,35,25,169,168,0.583333,0.416667,2.8625,2.410946,2.091667,5.0,25.0,7.0,11.0
87,2010,TB,35,20,15,112,109,0.571429,0.428571,2.717619,2.377211,2.485714,6.5,18.0,6.0,25.0
9,2010,PIT,70,40,30,200,176,0.571429,0.428571,2.847857,2.653316,2.019048,6.5,23.0,20.0,8.0
267,2010,PHX,57,32,25,173,157,0.561404,0.438596,2.769006,2.222466,2.181287,8.0,21.0,4.0,13.0
282,2010,BOS,66,37,29,200,163,0.560606,0.439394,2.439899,2.104287,1.967172,9.0,9.0,2.0,6.0
24,2010,CGY,67,37,30,213,191,0.552239,0.447761,2.601493,2.366945,1.91791,10.0,14.0,5.0,5.0


### keep games that have 4 C, 8 W  and 6 D per team!!!!

In [68]:
du = dw.groupby(['Season', 'GameNumber']).filter(lambda x: ((x['CCount'] == 4) & (x['WCount'] == 8) & (x['DCount'] == 6)).all())

In [69]:
du.shape

(10116, 17)

- create a new dataset using team roster player rank

In [70]:
dv = du.copy()

In [71]:
dv = dv.groupby(['Season', 'GameNumber', 'TeamCode', 'PlayerPosition', 'GF', 'GA', 'RosterCount', 'PositionCount', 'CCount', 'WCount', 'DCount', 'WinTeam', 'LossTeam'], as_index=False)['Rank'].mean()
dv.head(10)

Unnamed: 0,Season,GameNumber,TeamCode,PlayerPosition,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank
0,2010,20005,CGY,C,0,4,18.0,4.0,4.0,8.0,6.0,EDM,CGY,2.75
1,2010,20005,CGY,D,0,4,18.0,6.0,4.0,8.0,6.0,EDM,CGY,2.333333
2,2010,20005,CGY,W,0,4,18.0,8.0,4.0,8.0,6.0,EDM,CGY,2.5
3,2010,20005,EDM,C,4,0,18.0,4.0,4.0,8.0,6.0,EDM,CGY,3.0
4,2010,20005,EDM,D,4,0,18.0,6.0,4.0,8.0,6.0,EDM,CGY,2.5
5,2010,20005,EDM,W,4,0,18.0,8.0,4.0,8.0,6.0,EDM,CGY,2.625
6,2010,20006,CBJ,C,2,3,18.0,4.0,4.0,8.0,6.0,SJ,CBJ,2.75
7,2010,20006,CBJ,D,2,3,18.0,6.0,4.0,8.0,6.0,SJ,CBJ,2.333333
8,2010,20006,CBJ,W,2,3,18.0,8.0,4.0,8.0,6.0,SJ,CBJ,2.75
9,2010,20006,SJ,C,3,2,18.0,4.0,4.0,8.0,6.0,SJ,CBJ,2.0


In [72]:
dv.shape

(1686, 14)

- create columns for team win and team loss. 

In [73]:
dv['TeamWin'] =  dv.apply(lambda x: 1 if x['TeamCode']==x['WinTeam'] else 0, 1)
dv['TeamLos'] =  dv.apply(lambda x: 1 if x['TeamCode']==x['LossTeam'] else 0, 1)
dv.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerPosition,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos
0,2010,20005,CGY,C,0,4,18.0,4.0,4.0,8.0,6.0,EDM,CGY,2.75,0,1
1,2010,20005,CGY,D,0,4,18.0,6.0,4.0,8.0,6.0,EDM,CGY,2.333333,0,1
2,2010,20005,CGY,W,0,4,18.0,8.0,4.0,8.0,6.0,EDM,CGY,2.5,0,1
3,2010,20005,EDM,C,4,0,18.0,4.0,4.0,8.0,6.0,EDM,CGY,3.0,1,0
4,2010,20005,EDM,D,4,0,18.0,6.0,4.0,8.0,6.0,EDM,CGY,2.5,1,0


- display games played, games won, games loss, goals for and goals against by team for the season.

In [74]:
dv['GP'] = dv.groupby(['Season', 'PlayerPosition', 'TeamCode'])['GameNumber'].transform('count')
dv['GW'] = dv.groupby(['Season', 'PlayerPosition', 'TeamCode'])['TeamWin'].transform('sum')
dv['GL'] = dv.groupby(['Season', 'PlayerPosition', 'TeamCode'])['TeamLos'].transform('sum')
dv['GF'] = dv.groupby(['Season', 'PlayerPosition', 'TeamCode'])['GF'].transform('sum')
dv['GA'] = dv.groupby(['Season', 'PlayerPosition', 'TeamCode'])['GA'].transform('sum')
dv.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerPosition,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL
0,2010,20005,CGY,C,75,75,18.0,4.0,4.0,8.0,6.0,EDM,CGY,2.75,0,1,23,11,12
1,2010,20005,CGY,D,75,75,18.0,6.0,4.0,8.0,6.0,EDM,CGY,2.333333,0,1,23,11,12
2,2010,20005,CGY,W,75,75,18.0,8.0,4.0,8.0,6.0,EDM,CGY,2.5,0,1,23,11,12
3,2010,20005,EDM,C,67,88,18.0,4.0,4.0,8.0,6.0,EDM,CGY,3.0,1,0,26,9,17
4,2010,20005,EDM,D,67,88,18.0,6.0,4.0,8.0,6.0,EDM,CGY,2.5,1,0,26,9,17


- create columns with the mean ranking for forward and defenseman by team per game.

In [75]:
dv['RankC'] = dv.apply(lambda x: x['Rank'] if x['PlayerPosition']=='C' else np.NaN, 1)
dv['RankW'] = dv.apply(lambda x: x['Rank'] if x['PlayerPosition']=='W' else np.NaN, 1)
dv['RankD'] = dv.apply(lambda x: x['Rank'] if x['PlayerPosition']=='D' else np.NaN, 1)
dv['RankC'] = dv.groupby(['Season','GameNumber', 'TeamCode'])['RankC'].apply(lambda x: x.ffill().bfill())
dv['RankW'] = dv.groupby(['Season','GameNumber', 'TeamCode'])['RankW'].apply(lambda x: x.ffill().bfill())
dv['RankD'] = dv.groupby(['Season','GameNumber', 'TeamCode'])['RankD'].apply(lambda x: x.ffill().bfill())
dv.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerPosition,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL,RankC,RankW,RankD
0,2010,20005,CGY,C,75,75,18.0,4.0,4.0,8.0,6.0,EDM,CGY,2.75,0,1,23,11,12,2.75,2.5,2.333333
1,2010,20005,CGY,D,75,75,18.0,6.0,4.0,8.0,6.0,EDM,CGY,2.333333,0,1,23,11,12,2.75,2.5,2.333333
2,2010,20005,CGY,W,75,75,18.0,8.0,4.0,8.0,6.0,EDM,CGY,2.5,0,1,23,11,12,2.75,2.5,2.333333
3,2010,20005,EDM,C,67,88,18.0,4.0,4.0,8.0,6.0,EDM,CGY,3.0,1,0,26,9,17,3.0,2.625,2.5
4,2010,20005,EDM,D,67,88,18.0,6.0,4.0,8.0,6.0,EDM,CGY,2.5,1,0,26,9,17,3.0,2.625,2.5


- compute the mean per position by team for the season.

In [76]:
dv['MeanC']= dv.groupby(['Season', 'TeamCode'])['RankC'].transform('mean')
dv['MeanW']= dv.groupby(['Season', 'TeamCode'])['RankW'].transform('mean')
dv['MeanD']= dv.groupby(['Season', 'TeamCode'])['RankD'].transform('mean')
dv.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerPosition,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL,RankC,RankW,RankD,MeanC,MeanW,MeanD
0,2010,20005,CGY,C,75,75,18.0,4.0,4.0,8.0,6.0,EDM,CGY,2.75,0,1,23,11,12,2.75,2.5,2.333333,2.608696,2.336957,1.891304
1,2010,20005,CGY,D,75,75,18.0,6.0,4.0,8.0,6.0,EDM,CGY,2.333333,0,1,23,11,12,2.75,2.5,2.333333,2.608696,2.336957,1.891304
2,2010,20005,CGY,W,75,75,18.0,8.0,4.0,8.0,6.0,EDM,CGY,2.5,0,1,23,11,12,2.75,2.5,2.333333,2.608696,2.336957,1.891304
3,2010,20005,EDM,C,67,88,18.0,4.0,4.0,8.0,6.0,EDM,CGY,3.0,1,0,26,9,17,3.0,2.625,2.5,3.086538,2.716346,2.564103
4,2010,20005,EDM,D,67,88,18.0,6.0,4.0,8.0,6.0,EDM,CGY,2.5,1,0,26,9,17,3.0,2.625,2.5,3.086538,2.716346,2.564103


- display the quantity of wins and losses per team ( roster of 12 forwards and 6 defensemen)

In [77]:
dv['L'] = dv.apply(lambda x: x['GL'] if x['TeamCode']== x['LossTeam'] else (x['GP'] - x['GW']), 1)
dv['W'] = dv.apply(lambda x: x['GW'] if x['TeamCode']== x['WinTeam'] else (x['GP'] - x['GL']), 1)
dv.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerPosition,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL,RankC,RankW,RankD,MeanC,MeanW,MeanD,L,W
0,2010,20005,CGY,C,75,75,18.0,4.0,4.0,8.0,6.0,EDM,CGY,2.75,0,1,23,11,12,2.75,2.5,2.333333,2.608696,2.336957,1.891304,12,11
1,2010,20005,CGY,D,75,75,18.0,6.0,4.0,8.0,6.0,EDM,CGY,2.333333,0,1,23,11,12,2.75,2.5,2.333333,2.608696,2.336957,1.891304,12,11
2,2010,20005,CGY,W,75,75,18.0,8.0,4.0,8.0,6.0,EDM,CGY,2.5,0,1,23,11,12,2.75,2.5,2.333333,2.608696,2.336957,1.891304,12,11
3,2010,20005,EDM,C,67,88,18.0,4.0,4.0,8.0,6.0,EDM,CGY,3.0,1,0,26,9,17,3.0,2.625,2.5,3.086538,2.716346,2.564103,17,9
4,2010,20005,EDM,D,67,88,18.0,6.0,4.0,8.0,6.0,EDM,CGY,2.5,1,0,26,9,17,3.0,2.625,2.5,3.086538,2.716346,2.564103,17,9


- compute win and loss percent by team. Drop duplicate observations.

In [78]:
dv = dv[['Season', 'TeamCode', 'GP', 'L', 'W', 'GF', 'GA', 'MeanC', 'MeanW', 'MeanD']]
dv = dv.drop_duplicates(['Season', 'TeamCode'])
dv['WinPc'] = dv['W']/ dv['GP']
dv['LossPc'] = dv['L']/ dv['GP']

dv = dv[['Season', 'TeamCode', 'GP','W', 'L', 'GF', 'GA', 'WinPc', 'LossPc', 'MeanC', 'MeanW', 'MeanD']]
dv.head()

Unnamed: 0,Season,TeamCode,GP,W,L,GF,GA,WinPc,LossPc,MeanC,MeanW,MeanD
0,2010,CGY,23,11,12,75,75,0.478261,0.521739,2.608696,2.336957,1.891304
3,2010,EDM,26,9,17,67,88,0.346154,0.653846,3.086538,2.716346,2.564103
6,2010,CBJ,25,9,16,69,86,0.36,0.64,2.85,2.64,2.16
9,2010,SJ,17,10,7,61,47,0.588235,0.411765,2.044118,2.404412,2.362745
12,2010,ANA,10,3,7,18,32,0.3,0.7,2.825,2.4875,2.1


- rank teams based on win percent, mean forwards and mean defensemen. 

In [79]:
dv['RankWin'] = dv.groupby(['Season'])['WinPc'].rank(ascending=False)
dv['RankC'] = dv.groupby(['Season'])['MeanC'].rank(ascending=True)
dv['RankW'] = dv.groupby(['Season'])['MeanW'].rank(ascending=True)
dv['RankD'] = dv.groupby(['Season'])['MeanD'].rank(ascending=True)
dv = dv.sort_values(['Season', 'RankWin', 'RankC', 'RankW', 'RankD'], ascending=[True, True, True, True, True])
dv.head(30)

Unnamed: 0,Season,TeamCode,GP,W,L,GF,GA,WinPc,LossPc,MeanC,MeanW,MeanD,RankWin,RankC,RankW,RankD
39,2010,NSH,16,13,3,55,29,0.8125,0.1875,3.0,2.726562,2.0,1.0,26.0,23.0,7.0
105,2010,VAN,26,21,5,87,54,0.807692,0.192308,2.288462,2.576923,2.211538,2.0,6.0,14.0,15.0
15,2010,DET,25,18,7,93,63,0.72,0.28,2.37,2.205,1.78,3.0,10.0,4.0,2.0
45,2010,LA,25,17,8,68,54,0.68,0.32,2.59,2.6,2.086667,4.0,14.0,15.0,11.0
18,2010,BUF,18,12,6,62,52,0.666667,0.333333,2.722222,2.256944,2.037037,5.0,17.0,5.0,8.0
51,2010,PHI,31,19,12,102,82,0.612903,0.387097,1.774194,2.616935,1.725806,6.0,1.0,18.0,1.0
66,2010,CHI,20,12,8,68,54,0.6,0.4,1.9875,2.1375,2.158333,7.5,2.0,3.0,13.0
573,2010,OTT,10,6,4,22,22,0.6,0.4,3.15,3.075,2.4,7.5,30.0,29.0,22.0
21,2010,NYR,22,13,9,72,56,0.590909,0.409091,2.5,2.517045,2.409091,9.0,12.0,12.0,23.0
9,2010,SJ,17,10,7,61,47,0.588235,0.411765,2.044118,2.404412,2.362745,10.0,3.0,7.0,20.0


In [80]:
dv.shape

(30, 16)

In [81]:
dv.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/season_team_roster_ranking_centers_wingers_defensemen.csv', index='False', sep=',')
#dv.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/season_team_roster_ranking_centers_wingers_defensemen.csv', index='False', sep=',')

## season_game_team_roster_data

- use the team roster player rank dataset (dw) to display the roster quality by team per game

In [102]:
dx1 = du.copy()

In [103]:
dx2 = dv.copy()
dx2 = dx2[['Season', 'TeamCode', 'MeanC', 'MeanW', 'MeanD']]

In [104]:
dx = pd.merge(dx1, dx2, on=['Season', 'TeamCode'], how='left')
dx.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank,GF,GA,GD,WinTeam,LossTeam,RosterCount,PositionCount,CCount,WCount,DCount,MeanC,MeanW,MeanD
0,2010,20005,CGY,13.0,JOKINEN,C,2.0,0,4,4,EDM,CGY,18.0,4.0,4.0,8.0,6.0,2.608696,2.336957,1.891304
1,2010,20005,CGY,12.0,IGINLA,W,1.0,0,4,4,EDM,CGY,18.0,8.0,4.0,8.0,6.0,2.608696,2.336957,1.891304
2,2010,20005,CGY,40.0,TANGUAY,W,1.0,0,4,4,EDM,CGY,18.0,8.0,4.0,8.0,6.0,2.608696,2.336957,1.891304
3,2010,20005,CGY,3.0,CARSON,D,3.0,0,4,4,EDM,CGY,18.0,6.0,4.0,8.0,6.0,2.608696,2.336957,1.891304
4,2010,20005,CGY,28.0,REGEHR,D,2.0,0,4,4,EDM,CGY,18.0,6.0,4.0,8.0,6.0,2.608696,2.336957,1.891304


In [105]:
dx = dx[['Season', 'GameNumber', 'TeamCode', 'RosterCount', 'PlayerNumber', 'PlayerPosition', 'Rank', 'MeanC', 'MeanW', 'MeanD']]
dx['RankC'] = dx.apply(lambda x: x['Rank'] if x['PlayerPosition']=='C' else np.NaN, 1)
dx['RankW'] = dx.apply(lambda x: x['Rank'] if x['PlayerPosition']=='W' else np.NaN, 1)
dx['RankD'] = dx.apply(lambda x: x['Rank'] if x['PlayerPosition']=='D' else np.NaN, 1)
dx['RankC'] = dx.groupby(['Season','GameNumber', 'TeamCode'])['RankC'].apply(lambda x: x.ffill().bfill())
dx['RankW'] = dx.groupby(['Season','GameNumber', 'TeamCode'])['RankW'].apply(lambda x: x.ffill().bfill())
dx['RankD'] = dx.groupby(['Season','GameNumber', 'TeamCode'])['RankD'].apply(lambda x: x.ffill().bfill())
dx['GMeanC']= dx.groupby(['Season', 'TeamCode'])['RankC'].transform('mean')
dx['GMeanW']= dx.groupby(['Season', 'GameNumber', 'TeamCode'])['RankW'].transform('mean')
dx['GMeanD']= dx.groupby(['Season', 'GameNumber', 'TeamCode'])['RankD'].transform('mean')
dx = dx[['Season', 'GameNumber', 'TeamCode', 'RosterCount', 'PlayerNumber', 'PlayerPosition', 'Rank', 'MeanC', 'MeanW', 'MeanD', 'GMeanC', 'GMeanW', 'GMeanD']]
dx = dx.rename(columns={'PlayerPosition': 'Position'})
dx.head()

Unnamed: 0,Season,GameNumber,TeamCode,RosterCount,PlayerNumber,Position,Rank,MeanC,MeanW,MeanD,GMeanC,GMeanW,GMeanD
0,2010,20005,CGY,18.0,13.0,C,2.0,2.608696,2.336957,1.891304,2.589372,2.222222,2.611111
1,2010,20005,CGY,18.0,12.0,W,1.0,2.608696,2.336957,1.891304,2.589372,2.222222,2.611111
2,2010,20005,CGY,18.0,40.0,W,1.0,2.608696,2.336957,1.891304,2.589372,2.222222,2.611111
3,2010,20005,CGY,18.0,3.0,D,3.0,2.608696,2.336957,1.891304,2.589372,2.222222,2.611111
4,2010,20005,CGY,18.0,28.0,D,2.0,2.608696,2.336957,1.891304,2.589372,2.222222,2.611111


- group by season, game number, team and player to count the occurance of each player per game and sum up the observations of players. There should be 18 players per team and 36 per game for the dataset to be correct.

In [106]:
dx['playercount'] = dx.groupby(['Season', 'GameNumber', 'TeamCode', 'PlayerNumber',])['PlayerNumber'].transform('count')

In [107]:
dx['rosterposition'] = dx.groupby(['Season', 'GameNumber', 'TeamCode', 'Position', 'Rank'])['playercount'].transform('sum')

#### pivot table

- the next step is to group players by gamenumber, teamcode, position and rank, to display the quality of players each team has per position. **Pivot table** by player position and rank using roster position values. Game number and team are the indexes. We want to join the levels to generate columns by roster position and rank. 

In [108]:
dx = pd.pivot_table(dx, index=['Season', 'GameNumber', 'TeamCode', 'RosterCount', 'MeanC', 'MeanW', 'MeanD', 'GMeanC', 'GMeanW', 'GMeanD'], columns=['Position', 'Rank'], values=['rosterposition'])
dx = dx.reset_index()
dx.columns = ['_'.join(str(s).strip() for s in col if s) for col in dx.columns]
dx.reset_index()
dx = dx.fillna(0)
dx = dx.rename(columns={'rosterposition_C_1.0': 'C1', 'rosterposition_C_2.0': 'C2', 'rosterposition_C_3.0': 'C3', 'rosterposition_C_4.0': 'C4', 'rosterposition_W_1.0': 'W1', 'rosterposition_W_2.0': 'W2', 'rosterposition_W_3.0': 'W3', 'rosterposition_W_4.0': 'W4', 'rosterposition_D_1.0': 'D1', 'rosterposition_D_2.0': 'D2', 'rosterposition_D_3.0': 'D3' })
dx.head(10)

Unnamed: 0,Season,GameNumber,TeamCode,RosterCount,MeanC,MeanW,MeanD,GMeanC,GMeanW,GMeanD,C1,C2,C3,C4,D1,D2,D3,W1,W2,W3,W4
0,2010,20005,CGY,18.0,2.608696,2.336957,1.891304,2.589372,2.222222,2.611111,0.0,2.0,1.0,1.0,1.0,2.0,3.0,2.0,2.0,2.0,2.0
1,2010,20005,EDM,18.0,3.086538,2.716346,2.564103,3.019231,2.277778,2.333333,0.0,1.0,2.0,1.0,0.0,3.0,3.0,0.0,5.0,1.0,2.0
2,2010,20006,CBJ,18.0,2.85,2.64,2.16,2.951111,2.777778,2.277778,0.0,2.0,1.0,1.0,0.0,4.0,2.0,1.0,2.0,3.0,2.0
3,2010,20006,SJ,18.0,2.044118,2.404412,2.362745,1.993464,2.333333,2.111111,2.0,1.0,0.0,1.0,1.0,2.0,3.0,3.0,1.0,1.0,3.0
4,2010,20009,ANA,18.0,2.825,2.4875,2.1,2.327778,2.111111,2.222222,1.0,1.0,0.0,2.0,2.0,1.0,3.0,3.0,1.0,2.0,2.0
5,2010,20009,DET,18.0,2.37,2.205,1.78,2.157778,2.111111,1.722222,1.0,2.0,1.0,0.0,3.0,2.0,1.0,2.0,3.0,3.0,0.0
6,2010,20013,BUF,18.0,2.722222,2.256944,2.037037,2.626543,2.222222,2.333333,0.0,2.0,2.0,0.0,2.0,2.0,2.0,2.0,3.0,1.0,2.0
7,2010,20013,NYR,18.0,2.5,2.517045,2.409091,2.510101,2.555556,2.444444,0.0,2.0,2.0,0.0,0.0,4.0,2.0,0.0,3.0,4.0,1.0
8,2010,20015,DAL,18.0,2.298387,2.729839,2.575269,2.213262,2.388889,2.722222,2.0,0.0,1.0,1.0,0.0,2.0,4.0,2.0,2.0,0.0,4.0
9,2010,20015,NYI,18.0,2.54,2.875,2.253333,2.44,2.833333,2.055556,1.0,1.0,1.0,1.0,1.0,3.0,2.0,0.0,3.0,1.0,4.0


In [109]:
dx.shape

(562, 21)

In [110]:
dx.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/season_game_team_roster_centers_wingers_defensemen.csv', index='False', sep=',')
#dx.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/season_game_team_roster_centers_wingers_defensemen.csv', index='False', sep=',')

- create a dataset that will display the mean of forwards and defencemen by season per team

In [111]:
dz = dx.copy()

In [112]:
# mean centers ranking per team
dz['MeanC1'] = dz.groupby(['Season', 'TeamCode'])['C1'].transform('mean')
dz['MeanC2'] = dz.groupby(['Season', 'TeamCode'])['C2'].transform('mean')
dz['MeanC3'] = dz.groupby(['Season', 'TeamCode'])['C3'].transform('mean')
dz['MeanC4'] = dz.groupby(['Season', 'TeamCode'])['C4'].transform('mean')

# mean wing ranking per team
dz['MeanW1'] = dz.groupby(['Season', 'TeamCode'])['W1'].transform('mean')
dz['MeanW2'] = dz.groupby(['Season', 'TeamCode'])['W2'].transform('mean')
dz['MeanW3'] = dz.groupby(['Season', 'TeamCode'])['W3'].transform('mean')
dz['MeanW4'] = dz.groupby(['Season', 'TeamCode'])['W4'].transform('mean')


# mean defense ranking per team
dz['MeanD1'] = dz.groupby(['Season', 'TeamCode'])['D1'].transform('mean')
dz['MeanD2'] = dz.groupby(['Season', 'TeamCode'])['D2'].transform('mean')
dz['MeanD3'] = dz.groupby(['Season', 'TeamCode'])['D3'].transform('mean')

dz.head()

Unnamed: 0,Season,GameNumber,TeamCode,RosterCount,MeanC,MeanW,MeanD,GMeanC,GMeanW,GMeanD,C1,C2,C3,C4,D1,D2,D3,W1,W2,W3,W4,MeanC1,MeanC2,MeanC3,MeanC4,MeanW1,MeanW2,MeanW3,MeanW4,MeanD1,MeanD2,MeanD3
0,2010,20005,CGY,18.0,2.608696,2.336957,1.891304,2.589372,2.222222,2.611111,0.0,2.0,1.0,1.0,1.0,2.0,3.0,2.0,2.0,2.0,2.0,0.0,1.869565,1.826087,0.304348,2.0,2.73913,1.826087,1.434783,1.913043,2.826087,1.26087
1,2010,20005,EDM,18.0,3.086538,2.716346,2.564103,3.019231,2.277778,2.333333,0.0,1.0,2.0,1.0,0.0,3.0,3.0,0.0,5.0,1.0,2.0,0.0,0.961538,1.730769,1.307692,0.0,4.615385,1.038462,2.346154,0.0,2.615385,3.384615
2,2010,20006,CBJ,18.0,2.85,2.64,2.16,2.951111,2.777778,2.277778,0.0,2.0,1.0,1.0,0.0,4.0,2.0,1.0,2.0,3.0,2.0,0.0,1.88,0.84,1.28,0.84,2.92,2.52,1.72,0.0,5.04,0.96
3,2010,20006,SJ,18.0,2.044118,2.404412,2.362745,1.993464,2.333333,2.111111,2.0,1.0,0.0,1.0,1.0,2.0,3.0,3.0,1.0,1.0,3.0,1.941176,1.0,0.0,1.058824,2.941176,0.764706,2.411765,1.882353,1.0,1.823529,3.176471
4,2010,20009,ANA,18.0,2.825,2.4875,2.1,2.327778,2.111111,2.222222,1.0,1.0,0.0,2.0,2.0,1.0,3.0,3.0,1.0,2.0,2.0,0.9,1.0,0.0,2.1,2.7,1.0,2.0,2.3,1.9,1.6,2.5


- drop duplicates by season and team

In [113]:
dz = dz.drop_duplicates(['Season', 'TeamCode'])
dz = dz[['Season', 'TeamCode', 'RosterCount', 'MeanC', 'MeanW', 'MeanD', 'GMeanC', 'GMeanW', 'GMeanD', 'MeanC1', 'MeanC2', 'MeanC3', 'MeanC4', 'MeanW1', 'MeanW2', 'MeanW3', 'MeanW4', 'MeanD1', 'MeanD2', 'MeanD3']]
dz.head()

Unnamed: 0,Season,TeamCode,RosterCount,MeanC,MeanW,MeanD,GMeanC,GMeanW,GMeanD,MeanC1,MeanC2,MeanC3,MeanC4,MeanW1,MeanW2,MeanW3,MeanW4,MeanD1,MeanD2,MeanD3
0,2010,CGY,18.0,2.608696,2.336957,1.891304,2.589372,2.222222,2.611111,0.0,1.869565,1.826087,0.304348,2.0,2.73913,1.826087,1.434783,1.913043,2.826087,1.26087
1,2010,EDM,18.0,3.086538,2.716346,2.564103,3.019231,2.277778,2.333333,0.0,0.961538,1.730769,1.307692,0.0,4.615385,1.038462,2.346154,0.0,2.615385,3.384615
2,2010,CBJ,18.0,2.85,2.64,2.16,2.951111,2.777778,2.277778,0.0,1.88,0.84,1.28,0.84,2.92,2.52,1.72,0.0,5.04,0.96
3,2010,SJ,18.0,2.044118,2.404412,2.362745,1.993464,2.333333,2.111111,1.941176,1.0,0.0,1.058824,2.941176,0.764706,2.411765,1.882353,1.0,1.823529,3.176471
4,2010,ANA,18.0,2.825,2.4875,2.1,2.327778,2.111111,2.222222,0.9,1.0,0.0,2.1,2.7,1.0,2.0,2.3,1.9,1.6,2.5


In [114]:
dz.shape

(30, 20)

In [115]:
#dz.to_csv('season_team.csv', index='False', sep=',')

In [116]:
dz.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/season_team_centers_wingers_defensemen.csv', index='False', sep=',')
#dz.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/season_team_centers_wingers_defensemen.csv', index='False', sep=',')

## game level data

- create an index variable to deterime if a team is considered visitor or home for a given game. The column will be named "A". The 1st observation per game is the visitor team and will be assigned a value of 1. The 2nd and final observation per game, is the home team, so we fill in NaN with a value of 2 (home team)

In [117]:
dy = dx.copy()

In [118]:
dy.loc[dy.groupby('GameNumber',as_index=False).head(1).index,'A'] = 1
dy = dy.fillna(2)

In [119]:
dy.head()

Unnamed: 0,Season,GameNumber,TeamCode,RosterCount,MeanC,MeanW,MeanD,GMeanC,GMeanW,GMeanD,C1,C2,C3,C4,D1,D2,D3,W1,W2,W3,W4,A
0,2010,20005,CGY,18.0,2.608696,2.336957,1.891304,2.589372,2.222222,2.611111,0.0,2.0,1.0,1.0,1.0,2.0,3.0,2.0,2.0,2.0,2.0,1.0
1,2010,20005,EDM,18.0,3.086538,2.716346,2.564103,3.019231,2.277778,2.333333,0.0,1.0,2.0,1.0,0.0,3.0,3.0,0.0,5.0,1.0,2.0,2.0
2,2010,20006,CBJ,18.0,2.85,2.64,2.16,2.951111,2.777778,2.277778,0.0,2.0,1.0,1.0,0.0,4.0,2.0,1.0,2.0,3.0,2.0,1.0
3,2010,20006,SJ,18.0,2.044118,2.404412,2.362745,1.993464,2.333333,2.111111,2.0,1.0,0.0,1.0,1.0,2.0,3.0,3.0,1.0,1.0,3.0,2.0
4,2010,20009,ANA,18.0,2.825,2.4875,2.1,2.327778,2.111111,2.222222,1.0,1.0,0.0,2.0,2.0,1.0,3.0,3.0,1.0,2.0,2.0,1.0


- **pivot table using game number as index by whether a team is visitor (1) or home (2)**. The table will display the quality of each player per position and team. The next step is to join columns by team and player quality value. We will have for each team 10 columns ( 5 positions x 2 type of player quality). We will rename the columns as following: VC1 shows the amount of elite centers for the visitor team, HC1 displays the amount of elite centers for the home team etc. We rename the columns and sort them based on team, position and quality. 

In [120]:
dy = pd.pivot_table(dy, index=['Season', 'GameNumber'], columns=['A'], values=['C1', 'C2', 'C3', 'C4', 'W1', 'W2', 'W3', 'W4', 'D1', 'D2', 'D3'])
dy = dy.reset_index()
dy.columns = ['_'.join(str(s).strip() for s in col if s) for col in dy.columns]
dy = dy.reset_index()
dy = dy.rename(columns={'C1_1.0': 'VC1', 'C2_1.0': 'VC2', 'C3_1.0': 'VC3', 'C4_1.0': 'VC4', 'W1_1.0': 'VW1', 'W2_1.0': 'VW2', 'W3_1.0': 'VW3', 'W4_1.0': 'VW4', 'D1_1.0': 'VD1', 'D2_1.0': 'VD2', 'D3_1.0': 'VD3', 'C1_2.0': 'HC1', 'C2_2.0': 'HC2', 'C3_2.0': 'HC3', 'C4_2.0': 'HC4', 'W1_2.0': 'HW1', 'W2_2.0': 'HW2', 'W3_2.0': 'HW3', 'W4_2.0': 'HW4', 'D1_2.0': 'HD1', 'D2_2.0': 'HD2', 'D3_2.0': 'HD3'})
dy = dy[['Season', 'GameNumber', 'VC1', 'VC2', 'VC3', 'VC4', 'VW1', 'VW2', 'VW3', 'VW4', 'VD1', 'VD2', 'VD3', 'HC1', 'HC2', 'HC3', 'HC4', 'HW1', 'HW2', 'HW3', 'HW4', 'HD1', 'HD2', 'HD3']]
dy.sort_values(['Season', 'GameNumber'], ascending=[True, True], inplace=True)
dy.head()

Unnamed: 0,Season,GameNumber,VC1,VC2,VC3,VC4,VW1,VW2,VW3,VW4,VD1,VD2,VD3,HC1,HC2,HC3,HC4,HW1,HW2,HW3,HW4,HD1,HD2,HD3
0,2010,20005,0.0,2.0,1.0,1.0,2.0,2.0,2.0,2.0,1.0,2.0,3.0,0.0,1.0,2.0,1.0,0.0,5.0,1.0,2.0,0.0,3.0,3.0
1,2010,20006,0.0,2.0,1.0,1.0,1.0,2.0,3.0,2.0,0.0,4.0,2.0,2.0,1.0,0.0,1.0,3.0,1.0,1.0,3.0,1.0,2.0,3.0
2,2010,20009,1.0,1.0,0.0,2.0,3.0,1.0,2.0,2.0,2.0,1.0,3.0,1.0,2.0,1.0,0.0,2.0,3.0,3.0,0.0,3.0,2.0,1.0
3,2010,20013,0.0,2.0,2.0,0.0,2.0,3.0,1.0,2.0,2.0,2.0,2.0,0.0,2.0,2.0,0.0,0.0,3.0,4.0,1.0,0.0,4.0,2.0
4,2010,20015,2.0,0.0,1.0,1.0,2.0,2.0,0.0,4.0,0.0,2.0,4.0,1.0,1.0,1.0,1.0,0.0,3.0,1.0,4.0,1.0,3.0,2.0


In [121]:
dy.shape

(281, 24)

In [122]:
#dy.to_csv('season_game_roster.csv', index='False', sep=',')

In [123]:
dy.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/season_game_roster_center_winger_defensemen.csv', index='False', sep=',')
#dy.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/season_game_roster_center_winger_defensemen.csv', index='False', sep=',')

# Roster Analysis

## season_level_analysis

#### $WinPc = \beta_{0} + \beta_{1}MeanC_{1} + \beta_{2}MeanC_{2}+ \beta_{3}MeanC_{3} + \beta_{4}MeanC_{4} + \beta_{5}MeanW_{1} + \beta_{6}MeanW_{2}+ \beta_{7}MeanW_{3} + \beta_{8}MeanW_{4} + \beta_{9}MeanD_{1} + \beta_{10}MeanD_{2}+ \beta_{11}MeanD_{3} + e_{s}$

#### games with 4 centres, 8 wingers and 6 defensemen

- merge season_team dataset (dz) and season_team_roster_ranking (dv) for roster analysis at the season level. Use **ds** as the merging dataset.

In [141]:
ds = pd.merge(dv, dz, on=['Season', 'TeamCode', 'MeanC', 'MeanW', 'MeanD'], how='left')

In [142]:
ds['LMeanC1'] = ds['MeanC1'].mean()
ds['LMeanC2'] = ds['MeanC2'].mean()
ds['LMeanC3'] = ds['MeanC3'].mean()
ds['LMeanC4'] = ds['MeanC4'].mean()

ds['LMeanW1'] = ds['MeanW1'].mean()
ds['LMeanW2'] = ds['MeanW2'].mean()
ds['LMeanW3'] = ds['MeanW3'].mean()
ds['LMeanW4'] = ds['MeanW4'].mean()

ds['LMeanD1'] = ds['MeanD1'].mean()
ds['LMeanD2'] = ds['MeanD2'].mean()
ds['LMeanD3'] = ds['MeanD3'].mean()

ds.sort_values(['WinPc'], ascending=[False], inplace=True)

#ds['c1'] = ds.apply(lambda x: 1 if x['MeanC1'] > x['Mean_C'] else 0, 1)
#ds.sort_values(['c1'], ascending=[False], inplace=True)
#ds.head(30)

- mean goals for and mean goals against per team.

In [143]:
#ds['meanGF'] = ds['GF']/ ds['GP']
#ds['meanGA'] = ds['GA']/ ds['GP']
#ds['mGD'] = ds.meanGF - ds.meanGA

In [144]:
ds['TC'] = ds['MeanC1'] + ds['MeanC2'] + ds['MeanC3'] + ds['MeanC4']
ds['TW'] = ds['MeanW1'] + ds['MeanW2'] + ds['MeanW3'] + ds['MeanW4']
ds['TD'] = ds['MeanD1'] + ds['MeanD2'] + ds['MeanD3']

ds['meanc1'] = (ds['MeanC1'] * 4) / ds['TC']
ds['meanc2'] = (ds['MeanC2'] * 4) / ds['TC']
ds['meanc3'] = (ds['MeanC3'] * 4) / ds['TC']
ds['meanc4'] = (ds['MeanC4'] * 4) / ds['TC']

ds['meanw1'] = (ds['MeanW1'] * 8) / ds['TW']
ds['meanw2'] = (ds['MeanW2'] * 8) / ds['TW']
ds['meanw3'] = (ds['MeanW3'] * 8) / ds['TW']
ds['meanw4'] = (ds['MeanW4'] * 8) / ds['TW']

ds['meand1'] = (ds['MeanD1'] * 6) / ds['TW']
ds['meand2'] = (ds['MeanD2'] * 6) / ds['TW']
ds['meand3'] = (ds['MeanD3'] * 6) / ds['TW']

ds['lmeanc1'] = ds['meanc1'].mean()
ds['lmeanc2'] = ds['meanc2'].mean()
ds['lmeanc3'] = ds['meanc3'].mean()
ds['lmeanc4'] = ds['meanc4'].mean()

ds['lmeanw1'] = ds['meanw1'].mean()
ds['lmeanw2'] = ds['meanw2'].mean()
ds['lmeanw3'] = ds['meanw3'].mean()
ds['lmeanw4'] = ds['meanw4'].mean()

ds['lmeand1'] = ds['meand1'].mean()
ds['lmeand2'] = ds['meand2'].mean()
ds['lmeand3'] = ds['meand3'].mean()

ds['dc1'] = ds['meanc1'] - ds['lmeanc1']
ds['dc2'] = ds['meanc2'] - ds['lmeanc2']
ds['dc3'] = ds['meanc3'] - ds['lmeanc3']
ds['dc4'] = ds['meanc4'] - ds['lmeanc4']

ds['dw1'] = ds['meanw1'] - ds['lmeanw1']
ds['dw2'] = ds['meanw2'] - ds['lmeanw2']
ds['dw3'] = ds['meanw3'] - ds['lmeanw3']
ds['dw4'] = ds['meanw4'] - ds['lmeanw4']

ds['dd1'] = ds['meand1'] - ds['lmeand1']
ds['dd2'] = ds['meand2'] - ds['lmeand2']
ds['dd3'] = ds['meand3'] - ds['lmeand3']

- porpotion of position per team. 

In [145]:
ds['nmeanc1'] = ((ds['MeanC1'] * 4) / ds['TC']) / 4
ds['nmeanc2'] = ((ds['MeanC2'] * 4) / ds['TC']) / 4
ds['nmeanc3'] = ((ds['MeanC3'] * 4) / ds['TC']) / 4
ds['nmeanc4'] = ((ds['MeanC4'] * 4) / ds['TC']) / 4

ds['nmeanw1'] = ((ds['MeanW1'] * 8) / ds['TW']) / 8
ds['nmeanw2'] = ((ds['MeanW2'] * 8) / ds['TW']) / 8
ds['nmeanw3'] = ((ds['MeanW3'] * 8) / ds['TW']) / 8
ds['nmeanw4'] = ((ds['MeanW4'] * 8) / ds['TW']) / 8

ds['nmeand1'] = ((ds['MeanD1'] * 6) / ds['TW']) / 6
ds['nmeand2'] = ((ds['MeanD2'] * 6) / ds['TW']) / 6
ds['nmeand3'] = ((ds['MeanD3'] * 6) / ds['TW']) / 6

ds['nlmeanc1'] = ds['nmeanc1'].mean()
ds['nlmeanc2'] = ds['nmeanc2'].mean()
ds['nlmeanc3'] = ds['nmeanc3'].mean()
ds['nlmeanc4'] = ds['nmeanc4'].mean()

ds['nlmeanw1'] = ds['nmeanw1'].mean()
ds['nlmeanw2'] = ds['nmeanw2'].mean()
ds['nlmeanw3'] = ds['nmeanw3'].mean()
ds['nlmeanw4'] = ds['nmeanw4'].mean()

ds['nlmeand1'] = ds['nmeand1'].mean()
ds['nlmeand2'] = ds['nmeand2'].mean()
ds['nlmeand3'] = ds['nmeand3'].mean()

ds['ndc1'] = ds['nmeanc1'] - ds['nlmeanc1']
ds['ndc2'] = ds['nmeanc2'] - ds['nlmeanc2']
ds['ndc3'] = ds['nmeanc3'] - ds['nlmeanc3']
ds['ndc4'] = ds['nmeanc4'] - ds['nlmeanc4']

ds['ndw1'] = ds['nmeanw1'] - ds['nlmeanw1']
ds['ndw2'] = ds['nmeanw2'] - ds['nlmeanw2']
ds['ndw3'] = ds['nmeanw3'] - ds['nlmeanw3']
ds['ndw4'] = ds['nmeanw4'] - ds['nlmeanw4']

ds['ndd1'] = ds['nmeand1'] - ds['nlmeand1']
ds['ndd2'] = ds['nmeand2'] - ds['nlmeand2']
ds['ndd3'] = ds['nmeand3'] - ds['nlmeand3']

In [146]:
ds.shape

(30, 111)

### summary analysis

In [147]:
ds.describe()

Unnamed: 0,Season,GP,W,L,GF,GA,WinPc,LossPc,MeanC,MeanW,MeanD,RankWin,RankC,RankW,RankD,RosterCount,GMeanC,GMeanW,GMeanD,MeanC1,MeanC2,MeanC3,MeanC4,MeanW1,MeanW2,MeanW3,MeanW4,MeanD1,MeanD2,MeanD3,LMeanC1,LMeanC2,LMeanC3,LMeanC4,LMeanW1,LMeanW2,LMeanW3,LMeanW4,LMeanD1,LMeanD2,LMeanD3,TC,TW,TD,meanc1,meanc2,meanc3,meanc4,meanw1,meanw2,meanw3,meanw4,meand1,meand2,meand3,lmeanc1,lmeanc2,lmeanc3,lmeanc4,lmeanw1,lmeanw2,lmeanw3,lmeanw4,lmeand1,lmeand2,lmeand3,dc1,dc2,dc3,dc4,dw1,dw2,dw3,dw4,dd1,dd2,dd3,nmeanc1,nmeanc2,nmeanc3,nmeanc4,nmeanw1,nmeanw2,nmeanw3,nmeanw4,nmeand1,nmeand2,nmeand3,nlmeanc1,nlmeanc2,nlmeanc3,nlmeanc4,nlmeanw1,nlmeanw2,nlmeanw3,nlmeanw4,nlmeand1,nlmeand2,nlmeand3,ndc1,ndc2,ndc3,ndc4,ndw1,ndw2,ndw3,ndw4,ndd1,ndd2,ndd3
count,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
mean,2010.0,18.733333,9.366667,9.366667,53.133333,53.133333,0.480995,0.519005,2.597801,2.571201,2.217145,15.5,15.5,15.5,15.5,18.0,2.482937,2.474074,2.194444,0.743861,0.982708,1.411799,0.861632,1.200142,2.896141,2.037686,1.866032,1.151202,2.394728,2.454069,0.7438609,0.9827076,1.411799,0.8616324,1.200142,2.896141,2.037686,1.866032,1.151202,2.394728,2.454069,4.0,8.0,6.0,0.743861,0.982708,1.411799,0.861632,1.200142,2.896141,2.037686,1.866032,0.863402,1.796046,1.840552,0.7438609,0.9827076,1.411799,0.8616324,1.200142,2.896141,2.037686,1.866032,0.8634016,1.796046,1.840552,-1.480297e-17,1.332268e-16,-2.664535e-16,-2.5905200000000003e-17,5.1810410000000004e-17,-4.736952e-16,-4.588922e-16,-2.072416e-16,-1.036208e-16,-2.664535e-16,8.881784000000001e-17,0.185965,0.245677,0.35295,0.215408,0.150018,0.362018,0.254711,0.233254,0.1439,0.299341,0.306759,0.1859652,0.2456769,0.35295,0.2154081,0.1500177,0.3620176,0.2547107,0.2332539,0.1439003,0.2993411,0.3067587,-3.700743e-18,3.3306690000000003e-17,-6.661338000000001e-17,-6.476301e-18,6.476301e-18,-5.921189000000001e-17,-5.736152000000001e-17,-2.5905200000000003e-17,0.0,5.2735590000000006e-17,-3.8857810000000004e-17
std,0.0,7.750121,5.327116,4.810071,26.654537,25.429732,0.182702,0.182702,0.366324,0.255158,0.266215,8.80047,8.799491,8.802429,8.802429,0.0,0.380791,0.344862,0.292793,0.830099,0.73355,0.837815,0.614357,0.90406,1.137152,1.006628,0.996438,0.798621,1.004437,1.068793,1.129203e-16,3.387608e-16,0.0,2.258405e-16,2.258405e-16,4.51681e-16,1.355043e-15,4.51681e-16,6.775215e-16,9.033621e-16,1.806724e-15,8.246530000000001e-17,1.649306e-16,2.332471e-16,0.830099,0.73355,0.837815,0.614357,0.90406,1.137152,1.006628,0.996438,0.598965,0.753328,0.801595,1.129203e-16,3.387608e-16,0.0,2.258405e-16,2.258405e-16,4.51681e-16,1.355043e-15,4.51681e-16,4.51681e-16,9.033621e-16,6.775215e-16,0.8300992,0.73355,0.8378155,0.6143567,0.9040599,1.137152,1.006628,0.9964381,0.5989654,0.7533279,0.801595,0.207525,0.183387,0.209454,0.153589,0.113007,0.142144,0.125829,0.124555,0.099828,0.125555,0.133599,2.823006e-17,8.469019e-17,0.0,5.646013000000001e-17,2.823006e-17,5.646013000000001e-17,1.693804e-16,5.646013000000001e-17,8.469019e-17,1.129203e-16,2.258405e-16,0.2075248,0.1833875,0.2094539,0.1535892,0.1130075,0.1421439,0.1258285,0.1245548,0.099828,0.1255547,0.1335992
min,2010.0,4.0,0.0,2.0,7.0,10.0,0.0,0.1875,1.774194,2.125,1.725806,1.0,1.0,1.5,1.0,18.0,1.543011,1.888889,1.555556,0.0,0.0,0.0,0.0,0.0,0.764706,0.0,0.0,0.0,0.454545,0.354839,0.7438609,0.9827076,1.411799,0.8616324,1.200142,2.896141,2.037686,1.866032,1.151202,2.394728,2.454069,4.0,8.0,6.0,0.0,0.0,0.0,0.0,0.0,0.764706,0.0,0.0,0.0,0.340909,0.266129,0.7438609,0.9827076,1.411799,0.8616324,1.200142,2.896141,2.037686,1.866032,0.8634016,1.796046,1.840552,-0.7438609,-0.9827076,-1.411799,-0.8616324,-1.200142,-2.131435,-2.037686,-1.866032,-0.8634016,-1.455137,-1.574423,0.0,0.0,0.0,0.0,0.0,0.095588,0.0,0.0,0.0,0.056818,0.044355,0.1859652,0.2456769,0.35295,0.2154081,0.1500177,0.3620176,0.2547107,0.2332539,0.1439003,0.2993411,0.3067587,-0.1859652,-0.2456769,-0.3529498,-0.2154081,-0.1500177,-0.2664293,-0.2547107,-0.2332539,-0.1439,-0.2425229,-0.2624038
25%,2010.0,13.25,6.0,6.25,31.0,33.25,0.387311,0.402273,2.308929,2.419922,2.038194,7.875,8.25,8.25,8.25,18.0,2.191336,2.222222,2.0,0.0,0.5375,0.954545,0.263587,0.81,2.0,1.309384,1.237981,0.455,1.7625,1.552222,0.7438609,0.9827076,1.411799,0.8616324,1.200142,2.896141,2.037686,1.866032,1.151202,2.394728,2.454069,4.0,8.0,6.0,0.0,0.5375,0.954545,0.263587,0.81,2.0,1.309384,1.237981,0.34125,1.321875,1.164167,0.7438609,0.9827076,1.411799,0.8616324,1.200142,2.896141,2.037686,1.866032,0.8634016,1.796046,1.840552,-0.7438609,-0.4452076,-0.4572536,-0.5980454,-0.390142,-0.8961406,-0.7283017,-0.6280508,-0.5221516,-0.4741713,-0.6763854,0.0,0.134375,0.238636,0.065897,0.10125,0.25,0.163673,0.154748,0.056875,0.220312,0.194028,0.1859652,0.2456769,0.35295,0.2154081,0.1500177,0.3620176,0.2547107,0.2332539,0.1439003,0.2993411,0.3067587,-0.1859652,-0.1113019,-0.1143134,-0.1495114,-0.04876775,-0.1120176,-0.09103771,-0.07850635,-0.087025,-0.07902855,-0.1127309
50%,2010.0,17.0,9.0,8.0,58.0,49.0,0.481066,0.518934,2.645257,2.60303,2.232436,15.5,15.5,15.5,15.5,18.0,2.492828,2.388889,2.222222,0.744242,1.0,1.163333,1.0,1.0,2.94,1.867589,1.986631,1.136364,2.136364,2.477273,0.7438609,0.9827076,1.411799,0.8616324,1.200142,2.896141,2.037686,1.866032,1.151202,2.394728,2.454069,4.0,8.0,6.0,0.744242,1.0,1.163333,1.0,1.0,2.94,1.867589,1.986631,0.852273,1.602273,1.857955,0.7438609,0.9827076,1.411799,0.8616324,1.200142,2.896141,2.037686,1.866032,0.8634016,1.796046,1.840552,0.0003814808,0.01729236,-0.2484657,0.1383676,-0.200142,0.04385935,-0.1700969,0.1205995,-0.01112886,-0.1937736,0.01740244,0.186061,0.25,0.290833,0.25,0.125,0.3675,0.233449,0.248329,0.142045,0.267045,0.309659,0.1859652,0.2456769,0.35295,0.2154081,0.1500177,0.3620176,0.2547107,0.2332539,0.1439003,0.2993411,0.3067587,9.537021e-05,0.004323091,-0.06211643,0.03459191,-0.02501775,0.005482419,-0.02126211,0.01507493,-0.001855,-0.0322956,0.002900406
75%,2010.0,25.0,12.75,12.0,74.25,72.0,0.597727,0.612689,2.847727,2.726172,2.406818,22.75,22.75,22.75,22.75,18.0,2.751225,2.777778,2.375,1.0,1.604701,1.982143,1.145385,1.886667,3.598958,2.971591,2.359266,1.909783,3.05,3.2875,0.7438609,0.9827076,1.411799,0.8616324,1.200142,2.896141,2.037686,1.866032,1.151202,2.394728,2.454069,4.0,8.0,6.0,1.0,1.604701,1.982143,1.145385,1.886667,3.598958,2.971591,2.359266,1.432337,2.2875,2.465625,0.7438609,0.9827076,1.411799,0.8616324,1.200142,2.896141,2.037686,1.866032,0.8634016,1.796046,1.840552,0.2561391,0.6219932,0.5703438,0.2837523,0.6865247,0.7028177,0.9339051,0.4932342,0.5689354,0.4914537,0.6250729,0.25,0.401175,0.495536,0.286346,0.235833,0.44987,0.371449,0.294908,0.238723,0.38125,0.410937,0.1859652,0.2456769,0.35295,0.2154081,0.1500177,0.3620176,0.2547107,0.2332539,0.1439003,0.2993411,0.3067587,0.06403476,0.1554983,0.142586,0.07093806,0.08581559,0.08785221,0.1167381,0.06165427,0.094823,0.08190895,0.1041788
max,2010.0,33.0,21.0,22.0,102.0,106.0,0.8125,1.0,3.15,3.086538,2.666667,30.0,30.0,30.0,29.5,18.0,3.041667,3.111111,2.777778,2.967742,2.666667,3.411765,2.6,3.0,5.0,4.0,3.846154,2.76,5.04,4.363636,0.7438609,0.9827076,1.411799,0.8616324,1.200142,2.896141,2.037686,1.866032,1.151202,2.394728,2.454069,4.0,8.0,6.0,2.967742,2.666667,3.411765,2.6,3.0,5.0,4.0,3.846154,2.07,3.78,3.272727,0.7438609,0.9827076,1.411799,0.8616324,1.200142,2.896141,2.037686,1.866032,0.8634016,1.796046,1.840552,2.223881,1.683959,1.999966,1.738368,1.799858,2.103859,1.962314,1.980122,1.206598,1.983954,1.432175,0.741935,0.666667,0.852941,0.65,0.375,0.625,0.5,0.480769,0.345,0.63,0.545455,0.1859652,0.2456769,0.35295,0.2154081,0.1500177,0.3620176,0.2547107,0.2332539,0.1439003,0.2993411,0.3067587,0.5559702,0.4209898,0.4999914,0.4345919,0.2249823,0.2629824,0.2452893,0.2475153,0.2011,0.3306589,0.2386959


### estimate roster model 

- regress **team win percent** on the mean of players by position and quality (predictor variables). Add a constant to the predictors and use **OLS**. The purpose is to deterimine the impact each roster position has on team winning percent.

In [167]:
y = ds['WinPc']   

#X1 = sm.add_constant(ds[['MeanC1', 'MeanC2', 'MeanC3', 'MeanC4', 'MeanW1', 'MeanW2', 'MeanW3', 'MeanW4', 'MeanD1', 'MeanD2', 'MeanD3']] )
#X2 = sm.add_constant(ds[['DC1', 'DC2', 'DC3', 'DC4', 'DW1', 'DW2', 'DW3', 'DW4', 'DD1', 'DD2', 'DD3']] )

X3 = sm.add_constant(ds[['meanc1', 'meanc2', 'meanc3', 'meanc4', 'meanw1', 'meanw2', 'meanw3', 'meanw4', 'meand1', 'meand2', 'meand3']] )
X4 = sm.add_constant(ds[['dc1', 'dc2', 'dc3', 'dc4', 'dw1', 'dw2', 'dw3', 'dw4', 'dd1', 'dd2', 'dd3']] )

X5 = sm.add_constant(ds[['nmeanc1', 'nmeanc2', 'nmeanc3', 'nmeanc4', 'nmeanw1', 'nmeanw2', 'nmeanw3', 'nmeanw4', 'nmeand1', 'nmeand2', 'nmeand3']] )
X6 = sm.add_constant(ds[['ndc1', 'ndc2', 'ndc3', 'ndc4', 'ndw1', 'ndw2', 'ndw3', 'ndw4', 'ndd1', 'ndd2', 'ndd3']] )

#X7 = sm.add_constant(dr2[['DC1', 'DC2', 'DC3', 'DC4']] )
#X8 = sm.add_constant(dr2[['DW1', 'DW2', 'DW3', 'DW4']] )
#X9 = sm.add_constant(dr2[['DD1', 'DD2', 'DD3']] )

#m1 = sm.OLS(y, X1).fit()
#m2 = sm.OLS(y, X2).fit()
m3 = sm.OLS(y, X3).fit()
m4 = sm.OLS(y, X4).fit()
m5 = sm.OLS(y, X5).fit()
m6 = sm.OLS(y, X6).fit()


#m1.summary()
#m2.summary()
#m3.summary()
m4.summary()
#m5.summary()
#m6.summary()

0,1,2,3
Dep. Variable:,WinPc,R-squared:,0.466
Model:,OLS,Adj. R-squared:,0.262
Method:,Least Squares,F-statistic:,2.287
Date:,"Wed, 07 Mar 2018",Prob (F-statistic):,0.0616
Time:,17:36:03,Log-Likelihood:,18.337
No. Observations:,30,AIC:,-18.67
Df Residuals:,21,BIC:,-6.064
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.4810,0.029,16.786,0.000,0.421 0.541
dc1,0.0107,0.036,0.302,0.766,-0.063 0.085
dc2,-0.0919,0.033,-2.745,0.012,-0.161 -0.022
dc3,0.0240,0.030,0.799,0.433,-0.039 0.087
dc4,0.0572,0.045,1.270,0.218,-0.036 0.151
dw1,0.0128,0.029,0.437,0.667,-0.048 0.074
dw2,0.0344,0.024,1.414,0.172,-0.016 0.085
dw3,0.0233,0.026,0.907,0.375,-0.030 0.077
dw4,-0.0705,0.030,-2.363,0.028,-0.133 -0.008

0,1,2,3
Omnibus:,1.947,Durbin-Watson:,0.859
Prob(Omnibus):,0.378,Jarque-Bera (JB):,1.497
Skew:,-0.361,Prob(JB):,0.473
Kurtosis:,2.178,Cond. No.,1.55e+16


## season_game_team_level_analysis

#### $Win = \beta_{0} + \beta_{1}C_{1} + \beta_{2}C_{2} + \beta_{3}C_{3} + \beta_{4}C_{4} + \beta_{5}W_{1} + \beta_{6}W_{2} + \beta_{7}W_{3} + \beta_{8}W_{4} + \beta_{9}D_{1} + \beta_{10}D_{2} + \beta_{11}D_{3} + e_{s,g,t}$

- use season game data (dg) and season game team roster (dx) to conduct season game team level analysis (dt).

In [149]:
dg.head()

Unnamed: 0,Season,GameNumber,VTeamCode,HTeamCode,VGF,HGF,GD,VGA,HGA,WinTeam,LossTeam
0,2010,20001,MTL,TOR,2,3,1,3,2,TOR,MTL
1,2010,20002,PHI,PIT,3,2,-1,2,3,PHI,PIT
2,2010,20003,CAR,MIN,4,3,-1,3,4,CAR,MIN
3,2010,20004,CHI,COL,3,4,1,4,3,COL,CHI
4,2010,20005,CGY,EDM,0,4,4,4,0,EDM,CGY


In [150]:
dt = dx.merge(dg, on=['Season', 'GameNumber'], how='left')
dt.head()

Unnamed: 0,Season,GameNumber,TeamCode,RosterCount,MeanC,MeanW,MeanD,GMeanC,GMeanW,GMeanD,C1,C2,C3,C4,D1,D2,D3,W1,W2,W3,W4,VTeamCode,HTeamCode,VGF,HGF,GD,VGA,HGA,WinTeam,LossTeam
0,2010,20005,CGY,18.0,2.608696,2.336957,1.891304,2.589372,2.222222,2.611111,0.0,2.0,1.0,1.0,1.0,2.0,3.0,2.0,2.0,2.0,2.0,CGY,EDM,0,4,4,4,0,EDM,CGY
1,2010,20005,EDM,18.0,3.086538,2.716346,2.564103,3.019231,2.277778,2.333333,0.0,1.0,2.0,1.0,0.0,3.0,3.0,0.0,5.0,1.0,2.0,CGY,EDM,0,4,4,4,0,EDM,CGY
2,2010,20006,CBJ,18.0,2.85,2.64,2.16,2.951111,2.777778,2.277778,0.0,2.0,1.0,1.0,0.0,4.0,2.0,1.0,2.0,3.0,2.0,SJ,CBJ,3,2,-1,2,3,SJ,CBJ
3,2010,20006,SJ,18.0,2.044118,2.404412,2.362745,1.993464,2.333333,2.111111,2.0,1.0,0.0,1.0,1.0,2.0,3.0,3.0,1.0,1.0,3.0,SJ,CBJ,3,2,-1,2,3,SJ,CBJ
4,2010,20009,ANA,18.0,2.825,2.4875,2.1,2.327778,2.111111,2.222222,1.0,1.0,0.0,2.0,2.0,1.0,3.0,3.0,1.0,2.0,2.0,ANA,DET,0,4,4,4,0,DET,ANA


In [151]:
dt.shape

(562, 30)

- Sum up goals for and against by team per game and find the goal differential (GD) per game. Assign a value of 1 to the team that won the game. 

In [152]:
dt['GD'] = dt.apply(lambda x: (x['HGF'] - x['VGF']) if x['HTeamCode']== x['TeamCode'] else (x['VGF'] - x['HGF']), 1)
dt['Win'] = dt.apply(lambda x: 1 if x['WinTeam']== x['TeamCode'] else 0, 1)
dt['GF'] = dt.apply(lambda x: x['HGF'] if x['HTeamCode']== x['TeamCode'] else x['VGF'], 1)
dt['GA'] = dt.apply(lambda x: x['HGF'] if x['HTeamCode']!= x['TeamCode'] else x['VGF'], 1)
dt.head()

Unnamed: 0,Season,GameNumber,TeamCode,RosterCount,MeanC,MeanW,MeanD,GMeanC,GMeanW,GMeanD,C1,C2,C3,C4,D1,D2,D3,W1,W2,W3,W4,VTeamCode,HTeamCode,VGF,HGF,GD,VGA,HGA,WinTeam,LossTeam,Win,GF,GA
0,2010,20005,CGY,18.0,2.608696,2.336957,1.891304,2.589372,2.222222,2.611111,0.0,2.0,1.0,1.0,1.0,2.0,3.0,2.0,2.0,2.0,2.0,CGY,EDM,0,4,-4,4,0,EDM,CGY,0,0,4
1,2010,20005,EDM,18.0,3.086538,2.716346,2.564103,3.019231,2.277778,2.333333,0.0,1.0,2.0,1.0,0.0,3.0,3.0,0.0,5.0,1.0,2.0,CGY,EDM,0,4,4,4,0,EDM,CGY,1,4,0
2,2010,20006,CBJ,18.0,2.85,2.64,2.16,2.951111,2.777778,2.277778,0.0,2.0,1.0,1.0,0.0,4.0,2.0,1.0,2.0,3.0,2.0,SJ,CBJ,3,2,-1,2,3,SJ,CBJ,0,2,3
3,2010,20006,SJ,18.0,2.044118,2.404412,2.362745,1.993464,2.333333,2.111111,2.0,1.0,0.0,1.0,1.0,2.0,3.0,3.0,1.0,1.0,3.0,SJ,CBJ,3,2,1,2,3,SJ,CBJ,1,3,2
4,2010,20009,ANA,18.0,2.825,2.4875,2.1,2.327778,2.111111,2.222222,1.0,1.0,0.0,2.0,2.0,1.0,3.0,3.0,1.0,2.0,2.0,ANA,DET,0,4,-4,4,0,DET,ANA,0,0,4


In [153]:
dt.shape

(562, 33)

In [155]:
# difference in center position per game
dt['c1'] = dt['C1']/4
dt['c2'] = dt['C2']/4
dt['c3'] = dt['C3']/4
dt['c4'] = dt['C4']/4

# difference in wing position per game
dt['w1'] = dt['W1']/8
dt['w2'] = dt['W2']/8
dt['w3'] = dt['W3']/8
dt['w4'] = dt['W4']/8

# difference in defence position per game
dt['d1'] = dt['D1']/6
dt['d2'] = dt['D2']/6
dt['d3'] = dt['D3']/6

### summary analysis

In [156]:
dt.describe()

Unnamed: 0,Season,GameNumber,RosterCount,MeanC,MeanW,MeanD,GMeanC,GMeanW,GMeanD,C1,C2,C3,C4,D1,D2,D3,W1,W2,W3,W4,VGF,HGF,GD,VGA,HGA,Win,GF,GA,c1,c2,c3,c4,w1,w2,w3,w4,d1,d2,d3
count,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0
mean,2010.0,20630.074733,18.0,2.523132,2.576512,2.218268,2.415283,2.435647,2.174674,0.875445,0.964413,1.352313,0.807829,1.153025,2.384342,2.462633,1.16548,2.8879,2.115658,1.830961,2.711744,2.960854,0.0,2.960854,2.711744,0.5,2.836299,2.836299,0.218861,0.241103,0.338078,0.201957,0.145685,0.360988,0.264457,0.22887,0.192171,0.39739,0.410439
std,0.0,364.33307,0.0,0.365254,0.227204,0.266035,0.391773,0.317465,0.351468,0.896896,0.756774,0.857225,0.674524,0.890472,1.160077,1.213204,0.864852,1.210425,1.204183,1.181362,1.703149,1.73726,2.480584,1.73726,1.703149,0.500445,1.7248,1.7248,0.224224,0.189193,0.214306,0.168631,0.108106,0.151303,0.150523,0.14767,0.148412,0.193346,0.202201
min,2010.0,20005.0,18.0,1.774194,2.125,1.725806,1.543011,1.666667,1.388889,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2010.0,20322.0,18.0,2.288462,2.429688,2.037037,2.148148,2.222222,1.944444,0.0,0.0,1.0,0.0,0.0,2.0,2.0,1.0,2.0,1.0,1.0,1.0,2.0,-2.0,2.0,1.0,0.0,2.0,2.0,0.0,0.0,0.25,0.0,0.125,0.25,0.125,0.125,0.0,0.333333,0.333333
50%,2010.0,20612.0,18.0,2.54,2.606061,2.253333,2.44,2.388889,2.222222,1.0,1.0,1.0,1.0,1.0,2.0,3.0,1.0,3.0,2.0,2.0,3.0,3.0,0.0,3.0,3.0,0.5,3.0,3.0,0.25,0.25,0.25,0.25,0.125,0.375,0.25,0.25,0.166667,0.333333,0.5
75%,2010.0,20974.0,18.0,2.825,2.725,2.439394,2.739316,2.666667,2.444444,1.0,2.0,2.0,1.0,2.0,3.0,3.0,2.0,4.0,3.0,3.0,4.0,4.0,2.0,4.0,4.0,1.0,4.0,4.0,0.25,0.5,0.5,0.25,0.25,0.5,0.375,0.375,0.333333,0.5,0.5
max,2010.0,21230.0,18.0,3.15,3.086538,2.666667,3.041667,3.388889,2.944444,3.0,3.0,4.0,3.0,3.0,6.0,5.0,3.0,6.0,5.0,5.0,9.0,9.0,8.0,9.0,9.0,1.0,9.0,9.0,0.75,0.75,1.0,0.75,0.375,0.75,0.625,0.625,0.5,1.0,0.833333


In [157]:
dt.groupby(['Win'])['C1', 'C2', 'C3', 'C4', 'W1', 'W2', 'W3', 'W4', 'D1', 'D2', 'D3'].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,C1,C2,C3,C4,W1,W2,W3,W4,D1,D2,D3
Win,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,count,281.0,281.0,281.0,281.0,281.0,281.0,281.0,281.0,281.0,281.0,281.0
0,mean,0.772242,1.039146,1.370107,0.818505,1.124555,2.839858,1.982206,2.053381,1.05694,2.409253,2.533808
0,std,0.848157,0.761974,0.831326,0.68594,0.875542,1.241821,1.175511,1.204455,0.872371,1.239147,1.261984
0,min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,25%,0.0,0.0,1.0,0.0,0.0,2.0,1.0,1.0,0.0,2.0,2.0
0,50%,1.0,1.0,1.0,1.0,1.0,3.0,2.0,2.0,1.0,2.0,3.0
0,75%,1.0,2.0,2.0,1.0,2.0,4.0,3.0,3.0,2.0,3.0,4.0
0,max,3.0,3.0,4.0,3.0,3.0,6.0,5.0,5.0,3.0,6.0,5.0
1,count,281.0,281.0,281.0,281.0,281.0,281.0,281.0,281.0,281.0,281.0,281.0
1,mean,0.978648,0.88968,1.33452,0.797153,1.206406,2.935943,2.24911,1.608541,1.24911,2.359431,2.391459


In [158]:
_var = ['C1', 'C2', 'C3', 'C4', 'W1', 'W2', 'W3', 'W4', 'D1', 'D2', 'D3']
d1 = dt[dt.Win==1][_var].mean()
d2 = dt[dt.Win==0][_var].mean()
df = d1/d2
df

C1    1.267281
C2    0.856164
C3    0.974026
C4    0.973913
W1    1.072785
W2    1.033835
W3    1.134650
W4    0.783362
D1    1.181818
D2    0.979321
D3    0.943820
dtype: float64

### estimate roster model

- regress **win** on the difference in number of players by position and quality per team. Add a constant to the predictors and use **OLS**. The purpose is to deterimine the impact each roster position has on home team success.

In [159]:
dt['mC1'] = dt.C1 - 1
dt['mC2'] = dt.C2 - 1
dt['mC3'] = dt.C3 - 1
dt['mC4'] = dt.C4 - 1

dt['mW1'] = dt.W1 - 2
dt['mW2'] = dt.W2 - 2
dt['mW3'] = dt.W3 - 2
dt['mW4'] = dt.W4 - 2

dt['mD1'] = dt.D1 - 2
dt['mD2'] = dt.D2 - 2
dt['mD3'] = dt.D3 - 2

_m1 = smf.ols('Win ~ mC1 + mC2 + mC3 + mC4', data=dt).fit()
_m2 = smf.ols('Win ~ mW1 + mW2 + mW3 + mW4', data=dt).fit()
_m3 = smf.ols('Win ~ mD1 + mD2 + mD3', data=dt).fit()
_m4 = smf.ols('Win ~ mC1 + mC2 + mC3 +  mW1 + mW2 + mW3 + mW4', data=dt).fit()

_m5 = smf.ols('Win ~ mC1 + mC2 + mC3 + mC4 + mW1 + mW2 + mW3 + mW4 + mD1 + mD2 + mD3', data=dt).fit()
_m6 = smf.logit('Win ~ mC1 + mC2 + mC3 + mC4 +  mW1 + mW2 + mW3 + mW4 + mD1 + mD2 + mD3', data=dt).fit()


summary_col([_m1, _m2, _m3, _m4, _m5, _m6], stars=True)


         Current function value: 0.664490
         Iterations: 35


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  res.ix[:, 0][idx] = res.ix[:, 0][idx] + '*'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  res.ix[:, 0][idx] = res.ix[:, 0][idx] + '*'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  res.ix[:, 0][idx] = res.ix[:, 0][idx] + '*'
  return np.sqrt(np.diag(self.cov_params()))
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


0,1,2,3,4,5,6
,Win I,Win II,Win III,Win IIII,Win IIIII,Win IIIIII
Intercept,0.4966***,0.4956***,0.5512***,0.4848***,0.4878***,-0.0546
,(0.0229),(0.0296),(0.0290),(0.0304),(0.0375),(0.1574)
mC1,0.0529***,,,0.0161,0.0343,0.1425
,(0.0199),,,(0.0433),(0.0214),(4880158.0427)
mC2,-0.0370,,,-0.0814*,-0.0620***,-0.2634
,(0.0225),,,(0.0419),(0.0229),(4880158.0427)
mC3,0.0103,,,-0.0110,0.0078,0.0344
,(0.0199),,,(0.0357),(0.0202),(4880158.0427)
mC4,-0.0262,,,,0.0199,0.0865


In [160]:
dt['RC1'] = dt.C1 - dt.C4
dt['RC2'] = dt.C2 - dt.C4
dt['RC3'] = dt.C3 - dt.C4

dt['RW1'] = dt.W1 - dt.C4
dt['RW2'] = dt.W2 - dt.C4
dt['RW3'] = dt.W3 - dt.C4
dt['RW4'] = dt.W4 - dt.C4

dt.head()

Unnamed: 0,Season,GameNumber,TeamCode,RosterCount,MeanC,MeanW,MeanD,GMeanC,GMeanW,GMeanD,C1,C2,C3,C4,D1,D2,D3,W1,W2,W3,W4,VTeamCode,HTeamCode,VGF,HGF,GD,VGA,HGA,WinTeam,LossTeam,Win,GF,GA,c1,c2,c3,c4,w1,w2,w3,w4,d1,d2,d3,mC1,mC2,mC3,mC4,mW1,mW2,mW3,mW4,mD1,mD2,mD3,RC1,RC2,RC3,RW1,RW2,RW3,RW4
0,2010,20005,CGY,18.0,2.608696,2.336957,1.891304,2.589372,2.222222,2.611111,0.0,2.0,1.0,1.0,1.0,2.0,3.0,2.0,2.0,2.0,2.0,CGY,EDM,0,4,-4,4,0,EDM,CGY,0,0,4,0.0,0.5,0.25,0.25,0.25,0.25,0.25,0.25,0.166667,0.333333,0.5,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,1.0,-1.0,1.0,0.0,1.0,1.0,1.0,1.0
1,2010,20005,EDM,18.0,3.086538,2.716346,2.564103,3.019231,2.277778,2.333333,0.0,1.0,2.0,1.0,0.0,3.0,3.0,0.0,5.0,1.0,2.0,CGY,EDM,0,4,4,4,0,EDM,CGY,1,4,0,0.0,0.25,0.5,0.25,0.0,0.625,0.125,0.25,0.0,0.5,0.5,-1.0,0.0,1.0,0.0,-2.0,3.0,-1.0,0.0,-2.0,1.0,1.0,-1.0,0.0,1.0,-1.0,4.0,0.0,1.0
2,2010,20006,CBJ,18.0,2.85,2.64,2.16,2.951111,2.777778,2.277778,0.0,2.0,1.0,1.0,0.0,4.0,2.0,1.0,2.0,3.0,2.0,SJ,CBJ,3,2,-1,2,3,SJ,CBJ,0,2,3,0.0,0.5,0.25,0.25,0.125,0.25,0.375,0.25,0.0,0.666667,0.333333,-1.0,1.0,0.0,0.0,-1.0,0.0,1.0,0.0,-2.0,2.0,0.0,-1.0,1.0,0.0,0.0,1.0,2.0,1.0
3,2010,20006,SJ,18.0,2.044118,2.404412,2.362745,1.993464,2.333333,2.111111,2.0,1.0,0.0,1.0,1.0,2.0,3.0,3.0,1.0,1.0,3.0,SJ,CBJ,3,2,1,2,3,SJ,CBJ,1,3,2,0.5,0.25,0.0,0.25,0.375,0.125,0.125,0.375,0.166667,0.333333,0.5,1.0,0.0,-1.0,0.0,1.0,-1.0,-1.0,1.0,-1.0,0.0,1.0,1.0,0.0,-1.0,2.0,0.0,0.0,2.0
4,2010,20009,ANA,18.0,2.825,2.4875,2.1,2.327778,2.111111,2.222222,1.0,1.0,0.0,2.0,2.0,1.0,3.0,3.0,1.0,2.0,2.0,ANA,DET,0,4,-4,4,0,DET,ANA,0,0,4,0.25,0.25,0.0,0.5,0.375,0.125,0.25,0.25,0.333333,0.166667,0.5,0.0,0.0,-1.0,1.0,1.0,-1.0,0.0,0.0,0.0,-1.0,1.0,-1.0,-1.0,-2.0,1.0,-1.0,0.0,0.0


In [161]:
y = dt['Win']  

X1 = sm.add_constant(dt[['RC1', 'RC2', 'RC3']])


m1 = sm.OLS(y, X1).fit()
m1.summary()

0,1,2,3
Dep. Variable:,Win,R-squared:,0.017
Model:,OLS,Adj. R-squared:,0.011
Method:,Least Squares,F-statistic:,3.149
Date:,"Wed, 07 Mar 2018",Prob (F-statistic):,0.0247
Time:,17:32:17,Log-Likelihood:,-403.18
No. Observations:,562,AIC:,814.4
Df Residuals:,558,BIC:,831.7
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.4966,0.023,21.728,0.000,0.452 0.542
RC1,0.0529,0.020,2.660,0.008,0.014 0.092
RC2,-0.0370,0.022,-1.647,0.100,-0.081 0.007
RC3,0.0103,0.020,0.516,0.606,-0.029 0.049

0,1,2,3
Omnibus:,0.0,Durbin-Watson:,2.986
Prob(Omnibus):,1.0,Jarque-Bera (JB):,87.559
Skew:,0.001,Prob(JB):,9.7e-20
Kurtosis:,1.066,Cond. No.,2.41


In [163]:
y = dt['Win']  
X1 = sm.add_constant(dt[['C1', 'C2', 'C3' , 'W1','W2', 'W3', 'D1', 'D2']] )
X2 = sm.add_constant(dt[['c1', 'c2', 'c3', 'w1', 'w2', 'w3', 'd1', 'd2']] )

#m1 = sm.OLS(y, X1).fit()
m2 = sm.OLS(y, X2).fit()
m2.summary()

0,1,2,3
Dep. Variable:,Win,R-squared:,0.056
Model:,OLS,Adj. R-squared:,0.042
Method:,Least Squares,F-statistic:,4.087
Date:,"Wed, 07 Mar 2018",Prob (F-statistic):,9.57e-05
Time:,17:32:17,Log-Likelihood:,-391.75
No. Observations:,562,AIC:,801.5
Df Residuals:,553,BIC:,840.5
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.0013,0.144,0.009,0.993,-0.281 0.284
c1,0.0574,0.178,0.323,0.747,-0.292 0.407
c2,-0.3278,0.168,-1.947,0.052,-0.659 0.003
c3,-0.0484,0.145,-0.333,0.739,-0.334 0.237
w1,0.7928,0.275,2.884,0.004,0.253 1.333
w2,0.8281,0.218,3.806,0.000,0.401 1.255
w3,0.6442,0.171,3.764,0.000,0.308 0.980
d1,0.0156,0.170,0.092,0.927,-0.318 0.349
d2,-0.0157,0.119,-0.133,0.894,-0.249 0.217

0,1,2,3
Omnibus:,0.034,Durbin-Watson:,2.927
Prob(Omnibus):,0.983,Jarque-Bera (JB):,74.426
Skew:,-0.019,Prob(JB):,6.9e-17
Kurtosis:,1.218,Cond. No.,22.5
