# Data

## season_game_level_data

In [1]:
import sys
import os
import pandas as pd
import numpy as np
import datetime, time
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
from statsmodels.formula.api import ols
from statsmodels.iolib.summary2 import summary_col
from pylab import hist, show
import scipy
import zipfile


pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 200)

**data frames used in this notebook:**
- da = pbp
- dg = season_games_data
- dm = play_by_play
- dp = player_rankings
- dw = team_roster_player_rank
- dv = season_team_roster_ranking
- dx = season_game_team_roster
- dz = season_team
- dy = season_game_roster

**for analysis:**
- ds = season_level
- dl = season_game_level
- dt = season_game_team_level

In [2]:
pwd

'/Users/stefanostselios/Desktop/nhl_roster_design-master'

### import play by play data set

In [3]:
# Load the merged play-by-play export, drop the unnamed first column
# (presumably an index saved with the CSV), and rename TeamCode so it is clear
# the code refers to the team credited with the event.
# Same file under the other user's path:
#da = pd.read_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/pbp_merged.csv')
da = (
    pd.read_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/pbp_merged.csv')
    .drop('Unnamed: 0', axis=1)
    .rename(columns={'TeamCode': 'EventTeamCode'})
)


keep regular season games

In [4]:
# Regular-season games are those with a game number of at most 21230.
da = da.loc[da['GameNumber'].le(21230)]

drop irrelevant data

In [5]:
# Discard the three event types that are not used in the analysis.
da = da[~da['EventType'].isin(['STOP', 'EISTR', 'EIEND'])]

- display goals for each game and drop duplicates.

In [6]:
# Home goals per game: keep events credited to the home team, flag the GOAL
# events, total them per Season/GameNumber/HTeam and keep one row per game.
dh = da[da['EventTeamCode'] == da['HTeamCode']].rename(columns={'EventTeamCode': 'HTeam'})
dh['goal'] = (dh['EventType'] == 'GOAL').astype(int)
dh['HGF'] = dh.groupby(['Season', 'GameNumber', 'HTeam'])['goal'].transform('sum')
dh = dh[['Season', 'GameNumber', 'HGF']].drop_duplicates(['Season', 'GameNumber'])

In [7]:
# Visitor goals per game, built the same way as the home-goal table: events
# credited to the visiting team, GOAL events flagged and summed per game.
dv = da[da['EventTeamCode'] == da['VTeamCode']].rename(columns={'EventTeamCode': 'VTeam'})
dv['goal'] = (dv['EventType'] == 'GOAL').astype(int)
dv['VGF'] = dv.groupby(['Season', 'GameNumber', 'VTeam'])['goal'].transform('sum')
dv = dv[['Season', 'GameNumber', 'VGF']].drop_duplicates(['Season', 'GameNumber'])

Merge into season-game data

In [8]:
# One row per game carrying the visiting and home team codes.
dg = (
    da[['Season', 'GameNumber', 'VTeamCode', 'HTeamCode']]
    .drop_duplicates(subset=['Season', 'GameNumber'])
)

In [9]:
# Attach home goals (HGF) and visitor goals (VGF) to each game; left joins keep
# every game row of dg even when a goal table has no match.
dg = (
    dg.merge(dh, on=['Season', 'GameNumber'], how='left')
      .merge(dv, on=['Season', 'GameNumber'], how='left')
)

- find the goal differential per game with respect to home team.

In [10]:
# Goal differential from the home team's point of view.
dg['GD'] = dg['HGF'].sub(dg['VGF'])
# The home team is the winner only when GD is strictly positive.
# NOTE(review): a game with GD == 0 (or a missing GD) is therefore credited to
# the visiting team -- confirm this is intended for games level on counted goals.
dg['WinTeam'] = np.where(dg['GD'] > 0, dg['HTeamCode'], dg['VTeamCode'])
# The loser is whichever of the two teams is not the winner.
dg['LossTeam'] = np.where(dg['WinTeam'] != dg['HTeamCode'], dg['HTeamCode'], dg['VTeamCode'])

In [11]:
dg.head()

Unnamed: 0,Season,GameNumber,VTeamCode,HTeamCode,HGF,VGF,GD,WinTeam,LossTeam
0,2010,20001,MTL,TOR,3,2,1,TOR,MTL
1,2010,20002,PHI,PIT,2,3,-1,PHI,PIT
2,2010,20003,CAR,MIN,3,4,-1,CAR,MIN
3,2010,20004,CHI,COL,4,3,1,COL,CHI
4,2010,20005,CGY,EDM,4,0,4,EDM,CGY


- display goals against per team.

In [12]:
# Goals against are the opponent's goals for: visitor GA = home GF and vice versa.
dg['VGA'], dg['HGA'] = dg['HGF'], dg['VGF']

In [13]:
# Put the game-level columns in their final order before saving.
dg = dg.loc[:, [
    'Season', 'GameNumber', 'VTeamCode', 'HTeamCode',
    'VGF', 'HGF', 'GD', 'VGA', 'HGA', 'WinTeam', 'LossTeam',
]]
dg.head()

Unnamed: 0,Season,GameNumber,VTeamCode,HTeamCode,VGF,HGF,GD,VGA,HGA,WinTeam,LossTeam
0,2010,20001,MTL,TOR,2,3,1,3,2,TOR,MTL
1,2010,20002,PHI,PIT,3,2,-1,2,3,PHI,PIT
2,2010,20003,CAR,MIN,4,3,-1,3,4,CAR,MIN
3,2010,20004,CHI,COL,3,4,1,4,3,COL,CHI
4,2010,20005,CGY,EDM,0,4,4,4,0,EDM,CGY


In [14]:
# Save the game-level table (dg) as season_game_data.csv.
# NOTE(review): index='False' is a non-empty string, not the boolean False; it
# is presumably treated as truthy, so the row index is likely still written as
# an unnamed first column -- confirm, and check downstream readers before changing.
dg.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/season_game_data.csv', index='False', sep=',')
# Same output under the other user's path:
#dg.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/season_game_data.csv', index='False', sep=',')

## season_level_data

In [15]:
dm = da.copy()

events that happened in regulation time only

In [16]:
# Regulation time only: periods 1 through 3 (both bounds inclusive).
dm = dm[dm['Period'].between(1, 3)]

- **reshape the data set from wide to long.**

In [17]:
# Chronological order within each game (ascending is the default for every key).
dm = dm.sort_values(['Season', 'GameNumber', 'Period', 'EventNumber'])

In [18]:
# Collect, in column order, every column whose name contains 'VPlayer',
# 'HPlayer', 'VPosition' or 'HPosition' (presumably one column per on-ice
# slot -- confirm against the input file).
a = [col for col in dm.columns if 'VPlayer' in col]
b = [col for col in dm.columns if 'HPlayer' in col]
c = [col for col in dm.columns if 'VPosition' in col]
d = [col for col in dm.columns if 'HPosition' in col]
# Wide -> long: each list is stacked into a single column with the given name,
# so every event row is repeated once per stacked source column. The lists are
# paired positionally, so they must share the same length and ordering.
dm = pd.lreshape(dm, {'VPlayer' : a, 'HPlayer' : b, 'VPosition' : c, 'HPosition': d})

In [19]:
dm.shape

(1796745, 24)

In [20]:
# Give the event-level fields explicit "Event" names, keep the columns used for
# the play-by-play file in a fixed order, and restore chronological order.
dm = (
    dm.rename(columns={'PlayerNumber': 'EventPlayerNumber', 'TeamCode': 'EventTeamCode', 'PlayerName': 'EventPlayerName' })
      .loc[:, ['Season', 'GameNumber', 'GameDate', 'Period', 'AdvantageType', 'Zone',
               'EventNumber', 'EventType', 'EventDetail', 'EventTeamCode',
               'EventPlayerNumber', 'EventPlayerName', 'EventTimeFromZero',
               'EventTimeFromTwenty', 'VTeamCode', 'VPlayer', 'VPosition',
               'HTeamCode', 'HPlayer', 'HPosition', 'ShotType', 'ShotResult',
               'Length', 'PenaltyType']]
      .sort_values(['Season', 'GameNumber', 'Period', 'EventNumber'])
)

- fill in advantage type with even strength 'EV' and event player number with 'TEAM'

In [21]:
# Missing advantage type means even strength ('EV'); a missing event player
# number is replaced with the placeholder 'TEAM'.
dm = dm.fillna({'AdvantageType': 'EV', 'EventPlayerNumber': 'TEAM'})

- save new dataset as play by play

In [22]:
# Save the long-format play-by-play table (dm) as play_by_play.csv.
# NOTE(review): index='False' is a string, not the boolean False, so the row
# index is likely still written as an unnamed first column -- confirm.
dm.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/play_by_play.csv', index='False', sep=',')
# Same output under the other user's path:
#dm.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/play_by_play.csv', index='False', sep=',')

#### create new data set and keep variables: 
- (a) game number.
- (b) visitor team information.
- (c) home team information.

In [23]:
# Roster view of the play-by-play data: game keys plus the visitor and home
# team / player / position columns, ordered by season and game.
df = (
    dm[['Season', 'GameNumber', 'VTeamCode', 'VPlayer', 'VPosition', 'HTeamCode', 'HPlayer', 'HPosition']]
    .sort_values(['Season', 'GameNumber'])
)
df.head()

Unnamed: 0,Season,GameNumber,VTeamCode,VPlayer,VPosition,HTeamCode,HPlayer,HPosition
0,2010,20001,MTL,11.0,C,TOR,37.0,C
310113,2010,20001,MTL,21.0,R,TOR,9.0,R
620126,2010,20001,MTL,57.0,L,TOR,11.0,L
930061,2010,20001,MTL,26.0,D,TOR,3.0,D
1239931,2010,20001,MTL,75.0,D,TOR,22.0,D


- merge season_game_data (dg) on new dataset

In [24]:
# Bring the game-level results (goals, differential, winner/loser) onto every
# roster row; the left join keeps all roster rows.
df = df.merge(dg, on=['Season', 'GameNumber', 'VTeamCode', 'HTeamCode'], how='left')
df.head()

Unnamed: 0,Season,GameNumber,VTeamCode,VPlayer,VPosition,HTeamCode,HPlayer,HPosition,VGF,HGF,GD,VGA,HGA,WinTeam,LossTeam
0,2010,20001,MTL,11.0,C,TOR,37.0,C,2,3,1,3,2,TOR,MTL
1,2010,20001,MTL,21.0,R,TOR,9.0,R,2,3,1,3,2,TOR,MTL
2,2010,20001,MTL,57.0,L,TOR,11.0,L,2,3,1,3,2,TOR,MTL
3,2010,20001,MTL,26.0,D,TOR,3.0,D,2,3,1,3,2,TOR,MTL
4,2010,20001,MTL,75.0,D,TOR,22.0,D,2,3,1,3,2,TOR,MTL


- reshape the data to have home and visitor team observations under the same columns.

In [25]:
# Build the column groups to stack (a-e are rebound here; earlier cells used the
# same names for other lists). Each list picks columns by substring in column
# order, so with the current layout the visitor column comes before the home
# column in every group -- the groups are paired positionally by lreshape.
a = [col for col in df.columns if 'Player' in col]
b = [col for col in df.columns if 'Position' in col]
c = [col for col in df.columns if 'TeamCode' in col]
d = [col for col in df.columns if 'GF' in col]
e = [col for col in df.columns if 'GA' in col]
# Stack visitor and home values under shared column names, giving one row per
# team-side observation. GD, WinTeam and LossTeam are not stacked, so GD stays
# expressed from the home team's perspective on both teams' rows.
df = pd.lreshape(df, {'PlayerNumber' : a, 'PlayerPosition' : b, 'TeamCode' : c, 'GF' : d, 'GA' : e })
# PlayerPosition from the play-by-play data is not kept in this selection.
df = df[['Season', 'GameNumber', 'TeamCode', 'PlayerNumber', 'GF', 'GA', 'GD', 'WinTeam', 'LossTeam']]
df.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,GF,GA,GD,WinTeam,LossTeam
0,2010,20001,MTL,11.0,2,3,1,TOR,MTL
1,2010,20001,MTL,21.0,2,3,1,TOR,MTL
2,2010,20001,MTL,57.0,2,3,1,TOR,MTL
3,2010,20001,MTL,26.0,2,3,1,TOR,MTL
4,2010,20001,MTL,75.0,2,3,1,TOR,MTL


### import player position and rankings

In [26]:
# Load per-player statistics with positions, drop the unnamed first column
# (presumably a saved index) and rename Position to PlayerPosition.
# Same file under the other user's path:
#dp = pd.read_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/clusters/stats_per_player_nhl_positions.csv')
dp = (
    pd.read_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/clusters/stats_per_player_nhl_positions.csv')
    .drop('Unnamed: 0', axis=1)
    .rename(columns={'Position': 'PlayerPosition'})
)

In [27]:
# Keep only the identifying columns and the position of each player.
dp = dp.loc[:, ['Season', 'TeamCode', 'PlayerNumber', 'PlayerName', 'PlayerPosition']]
dp.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,PlayerPosition
0,2010,MTL,11.0,GOMEZ,C
1,2010,MTL,21.0,GIONTA,W
2,2010,MTL,57.0,POULIOT,W
3,2010,MTL,26.0,GORGES,D
4,2010,MTL,75.0,GILL,D


In [31]:
dp.shape

(1071, 5)

In [29]:
# Load the player rankings, drop the unnamed first column (presumably a saved
# index) and rename Position to PlayerPosition.
# Same file under the other user's path:
#dr = pd.read_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/clusters/player_rank_nhl_positions.csv')
dr = (
    pd.read_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/clusters/player_rank_nhl_positions.csv')
    .drop('Unnamed: 0', axis=1)
    .rename(columns={'Position': 'PlayerPosition'})
)

In [30]:
# Keep the identifying columns plus Rank, ordered by team code.
dr = (
    dr[['Season', 'TeamCode', 'PlayerNumber', 'PlayerName', 'PlayerPosition', 'Rank']]
    .sort_values('TeamCode')
)
dr.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank
696,2010,ANA,42,SEXTON,W,3
481,2010,ANA,9,RYAN,W,1
232,2010,ANA,4,FOWLER,D,1
235,2010,ANA,54,FOWLER,D,1
475,2010,ANA,8,SELANNE,W,1


- merge player position and player rankings and drop goaltenders

In [32]:
# Attach each player's Rank to the position table; the left join keeps every
# player in dp, leaving Rank missing where dr has no matching row.
ds = pd.merge(dp, dr, on=['Season', 'TeamCode', 'PlayerNumber', 'PlayerName', 'PlayerPosition'], how='left')
# Within each Season/TeamCode/PlayerName group, copy a known Rank onto rows of
# the same group that are missing it (forward fill, then backward fill).
# transform() is used instead of apply() because it always returns a result
# aligned to ds's own index; on pandas >= 2.0 apply() prepends the group keys
# to the index, and the assignment back into ds then fails.
ds['Rank'] = (
    ds.groupby(['Season', 'TeamCode', 'PlayerName'])['Rank']
      .transform(lambda ranks: ranks.ffill().bfill())
)
ds.shape

(1071, 6)

In [33]:
ds.isnull().sum()

Season              0
TeamCode            0
PlayerNumber        0
PlayerName          0
PlayerPosition      0
Rank              317
dtype: int64

Skaters who played fewer than 9 games were not included in the clusters and are therefore not ranked. Since they failed to make the roster on a regular basis, forwards are assigned to the 4th line and defensemen to the bottom (3rd) pairing.

In [34]:
# Players without a Rank get a default that depends on position:
# goaltenders 1, defensemen 3, wingers and centres 4. Existing ranks are kept,
# and a missing rank with any other position stays missing.
ds['Rank'] = ds['Rank'].fillna(ds['PlayerPosition'].map({'G': 1, 'D': 3, 'W': 4, 'C': 4}))
ds = ds.sort_values('TeamCode')
ds.shape

(1071, 6)

In [35]:
# Look for defensemen carrying a rank of 4.
ds1 = ds.loc[ds['PlayerPosition'].eq('D') & ds['Rank'].eq(4)]
ds1.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank
1008,2010,OTT,51.0,SMITH,D,4.0


- Derek Smith, a defenseman for the Ottawa Senators has a ranking of 4, which is incorrect since we have 3 defensive pairings. For that reason, he is assigned a rank of 3 which represents the bottom defensive pairing

In [36]:
# Manual correction: the OTT defenseman SMITH (#51) is moved to rank 3;
# every other row keeps its rank.
ds.loc[
    (ds['PlayerPosition'] == 'D')
    & (ds['TeamCode'] == 'OTT')
    & (ds['PlayerName'] == 'SMITH')
    & (ds['PlayerNumber'] == 51.0),
    'Rank'
] = 3

In [37]:
# Re-run the check: list any defensemen still carrying a rank of 4.
ds1 = ds.loc[ds['PlayerPosition'].eq('D') & ds['Rank'].eq(4)]
ds1.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank


- **display each player by team per game. Drop duplicates.**

In [38]:
# Attach name, position and rank to every roster row by season, team and
# sweater number, then arrange the columns for the roster table.
dw = df.merge(ds, on=['Season', 'TeamCode', 'PlayerNumber'], how='left')
dw = dw.loc[:, ['Season', 'GameNumber', 'TeamCode', 'PlayerNumber', 'PlayerName',
                'PlayerPosition', 'Rank', 'GF', 'GA', 'GD', 'WinTeam', 'LossTeam']]
dw.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank,GF,GA,GD,WinTeam,LossTeam
0,2010,20001,MTL,11.0,GOMEZ,C,2.0,2,3,1,TOR,MTL
1,2010,20001,MTL,21.0,GIONTA,W,1.0,2,3,1,TOR,MTL
2,2010,20001,MTL,57.0,POULIOT,W,2.0,2,3,1,TOR,MTL
3,2010,20001,MTL,26.0,GORGES,D,2.0,2,3,1,TOR,MTL
4,2010,20001,MTL,75.0,GILL,D,2.0,2,3,1,TOR,MTL


- drop observations that have no player name, position nor ranking. Exclude goaltenders from the roster.

In [39]:
# Rows with no position found no match in the player table; drop them and
# verify that no missing values remain.
dw = dw[dw['PlayerPosition'].notnull()]
dw.isnull().sum()

Season            0
GameNumber        0
TeamCode          0
PlayerNumber      0
PlayerName        0
PlayerPosition    0
Rank              0
GF                0
GA                0
GD                0
WinTeam           0
LossTeam          0
dtype: int64

In [40]:
dw.shape

(3703346, 12)

In [41]:
# Goaltenders ('G') are excluded from the roster.
dw = dw[dw['PlayerPosition'].ne('G')]
dw.shape

(3130236, 12)

- create column that displays the position and roster count by team per game. To simplify matters, we categorize skaters into forwards and defensemen.

In [42]:
# One row per player per team-game.
dw = dw.drop_duplicates(subset=['Season', 'GameNumber', 'TeamCode', 'PlayerNumber'])
# Players dressed by the team in the game.
dw['RosterCount'] = (
    dw.groupby(['Season', 'GameNumber', 'TeamCode'])['PlayerNumber']
      .transform('count')
)
# Players at the same position on the team in the game.
dw['PositionCount'] = (
    dw.groupby(['Season', 'GameNumber', 'TeamCode', 'PlayerPosition'])['PlayerNumber']
      .transform('count')
)
dw.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank,GF,GA,GD,WinTeam,LossTeam,RosterCount,PositionCount
0,2010,20001,MTL,11.0,GOMEZ,C,2.0,2,3,1,TOR,MTL,18.0,7.0
1,2010,20001,MTL,21.0,GIONTA,W,1.0,2,3,1,TOR,MTL,18.0,5.0
2,2010,20001,MTL,57.0,POULIOT,W,2.0,2,3,1,TOR,MTL,18.0,5.0
3,2010,20001,MTL,26.0,GORGES,D,2.0,2,3,1,TOR,MTL,18.0,6.0
4,2010,20001,MTL,75.0,GILL,D,2.0,2,3,1,TOR,MTL,18.0,6.0


- count the amount of forwards and defensemen by team per game.

In [43]:
# Give every row of a team-game the number of centres (CCount), wingers
# (WCount) and defensemen (DCount) that team dressed in that game.
for _pos in ('C', 'W', 'D'):
    _col = _pos + 'Count'
    # PositionCount on rows of this position, missing elsewhere. Series.where
    # replaces the row-wise apply and avoids the np.NaN alias, which was removed
    # in NumPy 2.0.
    dw[_col] = dw['PositionCount'].where(dw['PlayerPosition'] == _pos)
    # Spread the value to the team's other rows in the same game. transform()
    # keeps the result aligned to dw's index; groupby.apply() prepends the
    # group keys on pandas >= 2.0 and the assignment then fails. A team with no
    # player at this position in a game keeps a missing count.
    dw[_col] = (
        dw.groupby(['Season', 'GameNumber', 'TeamCode'])[_col]
          .transform(lambda counts: counts.ffill().bfill())
    )
dw.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank,GF,GA,GD,WinTeam,LossTeam,RosterCount,PositionCount,CCount,WCount,DCount
0,2010,20001,MTL,11.0,GOMEZ,C,2.0,2,3,1,TOR,MTL,18.0,7.0,7.0,5.0,6.0
1,2010,20001,MTL,21.0,GIONTA,W,1.0,2,3,1,TOR,MTL,18.0,5.0,7.0,5.0,6.0
2,2010,20001,MTL,57.0,POULIOT,W,2.0,2,3,1,TOR,MTL,18.0,5.0,7.0,5.0,6.0
3,2010,20001,MTL,26.0,GORGES,D,2.0,2,3,1,TOR,MTL,18.0,6.0,7.0,5.0,6.0
4,2010,20001,MTL,75.0,GILL,D,2.0,2,3,1,TOR,MTL,18.0,6.0,7.0,5.0,6.0


In [44]:
# Save the player-level roster table (dw) for the full season.
# NOTE(review): index='False' is a string, not the boolean False, so the row
# index is likely still written as an unnamed first column -- confirm.
dw.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/full_season_team_game_player_rank_nhl_positions.csv', index='False', sep=',')
# Same output under the other user's path:
#dw.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/full_season_team_game_player_rank_nhl_positions.csv', index='False', sep=',')

In [45]:
# One row per team per game (the first player row of each), ordered by game.
dw1 = (
    dw.drop_duplicates(subset=['Season', 'GameNumber', 'TeamCode'])
      .sort_values(['Season', 'GameNumber'])
)

In [46]:
dw.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank,GF,GA,GD,WinTeam,LossTeam,RosterCount,PositionCount,CCount,WCount,DCount
0,2010,20001,MTL,11.0,GOMEZ,C,2.0,2,3,1,TOR,MTL,18.0,7.0,7.0,5.0,6.0
1,2010,20001,MTL,21.0,GIONTA,W,1.0,2,3,1,TOR,MTL,18.0,5.0,7.0,5.0,6.0
2,2010,20001,MTL,57.0,POULIOT,W,2.0,2,3,1,TOR,MTL,18.0,5.0,7.0,5.0,6.0
3,2010,20001,MTL,26.0,GORGES,D,2.0,2,3,1,TOR,MTL,18.0,6.0,7.0,5.0,6.0
4,2010,20001,MTL,75.0,GILL,D,2.0,2,3,1,TOR,MTL,18.0,6.0,7.0,5.0,6.0


In [47]:
dw1.shape

(2460, 17)

In [48]:
dw1.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank,GF,GA,GD,WinTeam,LossTeam,RosterCount,PositionCount,CCount,WCount,DCount
0,2010,20001,MTL,11.0,GOMEZ,C,2.0,2,3,1,TOR,MTL,18.0,7.0,7.0,5.0,6.0
1852892,2010,20001,TOR,37.0,BRENT,C,3.0,3,2,1,TOR,MTL,18.0,5.0,5.0,7.0,6.0
1611,2010,20002,PHI,17.0,CARTER,C,1.0,3,2,-1,PHI,PIT,18.0,5.0,5.0,7.0,6.0
1854537,2010,20002,PIT,71.0,MALKIN,C,1.0,2,3,-1,PHI,PIT,18.0,8.0,8.0,4.0,6.0
3224,2010,20003,CAR,53.0,SKINNER,C,1.0,4,3,-1,CAR,MIN,18.0,6.0,6.0,6.0,6.0


In [49]:
# Save the one-row-per-team-game table (dw1).
# NOTE(review): index='False' is a string, not the boolean False, so the row
# index is likely still written as an unnamed first column -- confirm.
dw1.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/team_roster_player_rank_nhl_positions.csv', index='False', sep=',')
# Same output under the other user's path:
#dw1.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/team_roster_player_rank_nhl_positions.csv', index='False', sep=',')

In [50]:
dw1.shape

(2460, 17)

### full regular season stats

In [51]:
# Mean player Rank per team, game and position. The remaining grouping columns
# are constant within a team-game-position and are carried along as keys.
dw2 = dw.groupby(
    ['Season', 'GameNumber', 'TeamCode', 'PlayerPosition', 'GF', 'GA',
     'RosterCount', 'PositionCount', 'CCount', 'WCount', 'DCount',
     'WinTeam', 'LossTeam'],
    as_index=False,
)['Rank'].mean()
dw2.shape

(7380, 14)

- create columns for team win and team loss. 

In [52]:
# 1/0 indicators: did this row's team win / lose the game?
dw2['TeamWin'] = (dw2['TeamCode'] == dw2['WinTeam']).astype(int)
dw2['TeamLos'] = (dw2['TeamCode'] == dw2['LossTeam']).astype(int)

- display games played, games won, games lost, goals for and goals against by team for the season.

In [53]:
# Season totals per team, computed within each position so that every game is
# counted once per position row: games played, wins, losses, and goals for /
# against (GF and GA are overwritten with their season sums).
for _target, _source, _how in [
    ('GP', 'GameNumber', 'count'),
    ('GW', 'TeamWin', 'sum'),
    ('GL', 'TeamLos', 'sum'),
    ('GF', 'GF', 'sum'),
    ('GA', 'GA', 'sum'),
]:
    dw2[_target] = dw2.groupby(['Season', 'PlayerPosition', 'TeamCode'])[_source].transform(_how)
dw2.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerPosition,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL
0,2010,20001,MTL,C,220,213,18.0,7.0,7.0,5.0,6.0,TOR,MTL,2.857143,0,1,82,44,38
1,2010,20001,MTL,D,220,213,18.0,6.0,7.0,5.0,6.0,TOR,MTL,1.666667,0,1,82,44,38
2,2010,20001,MTL,W,220,213,18.0,5.0,7.0,5.0,6.0,TOR,MTL,1.6,0,1,82,44,38
3,2010,20001,TOR,C,225,259,18.0,5.0,5.0,7.0,6.0,TOR,MTL,2.6,1,0,82,37,45
4,2010,20001,TOR,D,225,259,18.0,6.0,5.0,7.0,6.0,TOR,MTL,2.0,1,0,82,37,45


- create columns with the mean ranking for forward and defenseman by team per game.

In [54]:
# Put the mean rank of centres (RankC), wingers (RankW) and defensemen (RankD)
# on every row of the same team-game.
for _pos in ('C', 'W', 'D'):
    _col = 'Rank' + _pos
    # Mean rank on the row of this position, missing on the other rows.
    # Series.where replaces the row-wise apply and avoids the np.NaN alias,
    # which was removed in NumPy 2.0.
    dw2[_col] = dw2['Rank'].where(dw2['PlayerPosition'] == _pos)
    # Spread the value across the team-game. transform() keeps the result
    # aligned to dw2's index; groupby.apply() prepends the group keys on
    # pandas >= 2.0 and the assignment then fails.
    dw2[_col] = (
        dw2.groupby(['Season', 'GameNumber', 'TeamCode'])[_col]
           .transform(lambda ranks: ranks.ffill().bfill())
    )
dw2.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerPosition,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL,RankC,RankW,RankD
0,2010,20001,MTL,C,220,213,18.0,7.0,7.0,5.0,6.0,TOR,MTL,2.857143,0,1,82,44,38,2.857143,1.6,1.666667
1,2010,20001,MTL,D,220,213,18.0,6.0,7.0,5.0,6.0,TOR,MTL,1.666667,0,1,82,44,38,2.857143,1.6,1.666667
2,2010,20001,MTL,W,220,213,18.0,5.0,7.0,5.0,6.0,TOR,MTL,1.6,0,1,82,44,38,2.857143,1.6,1.666667
3,2010,20001,TOR,C,225,259,18.0,5.0,5.0,7.0,6.0,TOR,MTL,2.6,1,0,82,37,45,2.6,2.571429,2.0
4,2010,20001,TOR,D,225,259,18.0,6.0,5.0,7.0,6.0,TOR,MTL,2.0,1,0,82,37,45,2.6,2.571429,2.0


- compute the mean per position by team for the season.

In [55]:
# Season-level mean of the per-game position ranks for each team.
for _pos in ('C', 'W', 'D'):
    dw2['Mean' + _pos] = dw2.groupby(['Season', 'TeamCode'])['Rank' + _pos].transform('mean')
dw2.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerPosition,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL,RankC,RankW,RankD,MeanC,MeanW,MeanD
0,2010,20001,MTL,C,220,213,18.0,7.0,7.0,5.0,6.0,TOR,MTL,2.857143,0,1,82,44,38,2.857143,1.6,1.666667,2.587921,1.601278,1.569396
1,2010,20001,MTL,D,220,213,18.0,6.0,7.0,5.0,6.0,TOR,MTL,1.666667,0,1,82,44,38,2.857143,1.6,1.666667,2.587921,1.601278,1.569396
2,2010,20001,MTL,W,220,213,18.0,5.0,7.0,5.0,6.0,TOR,MTL,1.6,0,1,82,44,38,2.857143,1.6,1.666667,2.587921,1.601278,1.569396
3,2010,20001,TOR,C,225,259,18.0,5.0,5.0,7.0,6.0,TOR,MTL,2.6,1,0,82,37,45,2.6,2.571429,2.0,2.311992,2.504007,2.174797
4,2010,20001,TOR,D,225,259,18.0,6.0,5.0,7.0,6.0,TOR,MTL,2.0,1,0,82,37,45,2.6,2.571429,2.0,2.311992,2.504007,2.174797


- display the quantity of wins and losses per team for the whole season

In [56]:
# Season losses: GL on rows where the team lost this game, otherwise GP - GW.
dw2['L'] = np.where(dw2['TeamCode'] == dw2['LossTeam'], dw2['GL'], dw2['GP'] - dw2['GW'])
# Season wins: GW on rows where the team won this game, otherwise GP - GL.
dw2['W'] = np.where(dw2['TeamCode'] == dw2['WinTeam'], dw2['GW'], dw2['GP'] - dw2['GL'])
dw2.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerPosition,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL,RankC,RankW,RankD,MeanC,MeanW,MeanD,L,W
0,2010,20001,MTL,C,220,213,18.0,7.0,7.0,5.0,6.0,TOR,MTL,2.857143,0,1,82,44,38,2.857143,1.6,1.666667,2.587921,1.601278,1.569396,38,44
1,2010,20001,MTL,D,220,213,18.0,6.0,7.0,5.0,6.0,TOR,MTL,1.666667,0,1,82,44,38,2.857143,1.6,1.666667,2.587921,1.601278,1.569396,38,44
2,2010,20001,MTL,W,220,213,18.0,5.0,7.0,5.0,6.0,TOR,MTL,1.6,0,1,82,44,38,2.857143,1.6,1.666667,2.587921,1.601278,1.569396,38,44
3,2010,20001,TOR,C,225,259,18.0,5.0,5.0,7.0,6.0,TOR,MTL,2.6,1,0,82,37,45,2.6,2.571429,2.0,2.311992,2.504007,2.174797,45,37
4,2010,20001,TOR,D,225,259,18.0,6.0,5.0,7.0,6.0,TOR,MTL,2.0,1,0,82,37,45,2.6,2.571429,2.0,2.311992,2.504007,2.174797,45,37


- compute win and loss percent by team. Drop duplicate observations.

In [57]:
# Collapse to one row per team-season, then express wins and losses as a share
# of games played.
dw2 = (
    dw2[['Season', 'TeamCode', 'GP', 'L', 'W', 'GF', 'GA', 'MeanC', 'MeanW', 'MeanD']]
    .drop_duplicates(subset=['Season', 'TeamCode'])
)
dw2['WinPc'] = dw2['W'].div(dw2['GP'])
dw2['LossPc'] = dw2['L'].div(dw2['GP'])

dw2 = dw2.loc[:, ['Season', 'TeamCode', 'GP', 'W', 'L', 'GF', 'GA',
                  'WinPc', 'LossPc', 'MeanC', 'MeanW', 'MeanD']]
dw2.head()

Unnamed: 0,Season,TeamCode,GP,W,L,GF,GA,WinPc,LossPc,MeanC,MeanW,MeanD
0,2010,MTL,82,44,38,220,213,0.536585,0.463415,2.587921,1.601278,1.569396
3,2010,TOR,82,37,45,225,259,0.45122,0.54878,2.311992,2.504007,2.174797
6,2010,PHI,82,47,35,266,233,0.573171,0.426829,1.803659,2.09669,1.449187
9,2010,PIT,82,49,33,244,203,0.597561,0.402439,2.228978,2.705981,1.733508
12,2010,CAR,82,40,42,239,242,0.487805,0.512195,1.95813,2.245427,1.768293


- rank teams based on win percent, mean centres, wingers and defensemen. 

In [58]:
# Rank teams within each season: highest win percentage gets rank 1, and the
# lowest mean rank at each position gets rank 1.
dw2['RankWin'] = dw2.groupby('Season')['WinPc'].rank(ascending=False)
for _pos in ('C', 'W', 'D'):
    dw2['Rank' + _pos] = dw2.groupby('Season')['Mean' + _pos].rank()
dw2 = dw2.sort_values(['Season', 'RankWin', 'RankC', 'RankW', 'RankD'])
dw2.head(30)

Unnamed: 0,Season,TeamCode,GP,W,L,GF,GA,WinPc,LossPc,MeanC,MeanW,MeanD,RankWin,RankC,RankW,RankD
135,2010,VAN,82,54,28,268,190,0.658537,0.341463,2.121951,2.164915,1.692741,1.0,11.0,12.0,8.0
9,2010,PIT,82,49,33,244,203,0.597561,0.402439,2.228978,2.705981,1.733508,2.0,14.0,30.0,12.0
33,2010,SJ,82,48,34,253,216,0.585366,0.414634,1.608116,2.257433,1.704472,3.5,1.0,18.0,10.0
63,2010,WSH,82,48,34,230,203,0.585366,0.414634,2.563444,1.893815,1.85453,3.5,25.0,5.0,20.0
51,2010,DET,82,47,35,263,241,0.573171,0.426829,1.757549,2.159814,1.427991,6.0,3.0,11.0,1.0
6,2010,PHI,82,47,35,266,233,0.573171,0.426829,1.803659,2.09669,1.449187,6.0,4.0,9.0,2.0
48,2010,ANA,82,47,35,241,237,0.573171,0.426829,2.612805,2.255851,2.094657,6.0,27.0,17.0,26.0
66,2010,BOS,82,46,36,250,200,0.560976,0.439024,1.730778,1.80633,1.926829,9.0,2.0,4.0,23.0
105,2010,TB,82,46,36,252,246,0.560976,0.439024,2.041057,1.934437,1.725319,9.0,9.0,6.0,11.0
132,2010,LA,82,46,36,227,207,0.560976,0.439024,2.345122,2.567122,1.763357,9.0,18.0,29.0,15.0


In [59]:
# Save the full-season team ranking table (dw2).
# NOTE(review): index='False' is a string, not the boolean False, so the row
# index is likely still written as an unnamed first column -- confirm.
dw2.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/full_season_team_roster_ranking_nhl_positions.csv', index='False', sep=',')
# Same output under the other user's path:
#dw2.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/full_season_team_roster_ranking_nhl_positions.csv', index='False', sep=',')

### keep games that have 12 forwards and 6 defensemen per team!!

In [60]:
dwfd = dw.copy()
dwfd.shape

(44263, 17)

In [61]:
# Keep a game only if every one of its rows -- i.e. both teams -- has
# RosterCount == 18 and DCount == 6; groupby(...).filter drops the whole
# Season/GameNumber group when any row fails the test.
dwfd = dwfd.groupby(['Season', 'GameNumber']).filter(lambda x: ((x['RosterCount'] == 18) & (x['DCount'] == 6)).all())
dwfd.shape

(32940, 17)

In [62]:
# Save the player-level rows of the games kept by the 18-skater / 6-defensemen filter.
# NOTE(review): index='False' is a string, not the boolean False, so the row
# index is likely still written as an unnamed first column -- confirm.
dwfd.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/12f_6d_game_team_player_ranking_nhl_positions.csv', index='False', sep=',')
# Same output under the other user's path:
#dwfd.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/12f_6d_game_team_player_ranking_nhl_positions.csv', index='False', sep=',')

In [63]:
# Mean player Rank per team, game and position for the filtered games; the
# other grouping columns are carried along as keys.
dw3 = dwfd.groupby(
    ['Season', 'GameNumber', 'TeamCode', 'PlayerPosition', 'GF', 'GA',
     'RosterCount', 'PositionCount', 'CCount', 'WCount', 'DCount',
     'WinTeam', 'LossTeam'],
    as_index=False,
)['Rank'].mean()
dw3.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerPosition,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank
0,2010,20001,MTL,C,2,3,18.0,7.0,7.0,5.0,6.0,TOR,MTL,2.857143
1,2010,20001,MTL,D,2,3,18.0,6.0,7.0,5.0,6.0,TOR,MTL,1.666667
2,2010,20001,MTL,W,2,3,18.0,5.0,7.0,5.0,6.0,TOR,MTL,1.6
3,2010,20001,TOR,C,3,2,18.0,5.0,5.0,7.0,6.0,TOR,MTL,2.6
4,2010,20001,TOR,D,3,2,18.0,6.0,5.0,7.0,6.0,TOR,MTL,2.0


In [64]:
dw3.shape

(5490, 14)

- create columns for team win and team loss. 

In [65]:
# 1/0 indicators: did this row's team win / lose the game?
dw3['TeamWin'] = (dw3['TeamCode'] == dw3['WinTeam']).astype(int)
dw3['TeamLos'] = (dw3['TeamCode'] == dw3['LossTeam']).astype(int)

- display games played, games won, games lost, goals for and goals against by team for the season.

In [66]:
# Totals per team over the filtered games, computed within each position:
# games played, wins, losses, and goals for / against (GF and GA are
# overwritten with their sums).
for _target, _source, _how in [
    ('GP', 'GameNumber', 'count'),
    ('GW', 'TeamWin', 'sum'),
    ('GL', 'TeamLos', 'sum'),
    ('GF', 'GF', 'sum'),
    ('GA', 'GA', 'sum'),
]:
    dw3[_target] = dw3.groupby(['Season', 'PlayerPosition', 'TeamCode'])[_source].transform(_how)

- create columns with the mean ranking for forward and defenseman by team per game.

In [67]:
# Put the mean rank of centres (RankC), wingers (RankW) and defensemen (RankD)
# on every row of the same team-game.
for _pos in ('C', 'W', 'D'):
    _col = 'Rank' + _pos
    # Mean rank on the row of this position, missing on the other rows.
    # Series.where replaces the row-wise apply and avoids the np.NaN alias,
    # which was removed in NumPy 2.0.
    dw3[_col] = dw3['Rank'].where(dw3['PlayerPosition'] == _pos)
    # Spread the value across the team-game. transform() keeps the result
    # aligned to dw3's index; groupby.apply() prepends the group keys on
    # pandas >= 2.0 and the assignment then fails.
    dw3[_col] = (
        dw3.groupby(['Season', 'GameNumber', 'TeamCode'])[_col]
           .transform(lambda ranks: ranks.ffill().bfill())
    )

- compute the mean per position by team for the season.

In [68]:
# Mean of the per-game position ranks for each team over the filtered games.
for _pos in ('C', 'W', 'D'):
    dw3['Mean' + _pos] = dw3.groupby(['Season', 'TeamCode'])['Rank' + _pos].transform('mean')

- display the quantity of wins and losses per team ( roster of 12 forwards and 6 defensemen)

In [69]:
# Losses: GL on rows where the team lost this game, otherwise GP - GW.
dw3['L'] = np.where(dw3['TeamCode'] == dw3['LossTeam'], dw3['GL'], dw3['GP'] - dw3['GW'])
# Wins: GW on rows where the team won this game, otherwise GP - GL.
dw3['W'] = np.where(dw3['TeamCode'] == dw3['WinTeam'], dw3['GW'], dw3['GP'] - dw3['GL'])

- compute win and loss percent by team. Drop duplicate observations.

In [70]:
# Collapse to one row per team-season, then express wins and losses as a share
# of the games played.
dw3 = (
    dw3[['Season', 'TeamCode', 'GP', 'L', 'W', 'GF', 'GA', 'MeanC', 'MeanW', 'MeanD']]
    .drop_duplicates(subset=['Season', 'TeamCode'])
)
dw3['WinPc'] = dw3['W'].div(dw3['GP'])
dw3['LossPc'] = dw3['L'].div(dw3['GP'])

dw3 = dw3.loc[:, ['Season', 'TeamCode', 'GP', 'W', 'L', 'GF', 'GA',
                  'WinPc', 'LossPc', 'MeanC', 'MeanW', 'MeanD']]

- rank teams based on win percent, mean forwards and mean defensemen. 

In [71]:
# Rank teams within each season: highest win percentage gets rank 1, and the
# lowest mean rank at each position gets rank 1.
dw3['RankWin'] = dw3.groupby('Season')['WinPc'].rank(ascending=False)
for _pos in ('C', 'W', 'D'):
    dw3['Rank' + _pos] = dw3.groupby('Season')['Mean' + _pos].rank()
dw3 = dw3.sort_values(['Season', 'RankWin', 'RankC', 'RankW', 'RankD'])
dw3.head(30)

Unnamed: 0,Season,TeamCode,GP,W,L,GF,GA,WinPc,LossPc,MeanC,MeanW,MeanD,RankWin,RankC,RankW,RankD
111,2010,VAN,42,28,14,136,99,0.666667,0.333333,2.186508,2.133551,1.646825,1.0,12.0,11.0,5.0
33,2010,SJ,57,36,21,179,144,0.631579,0.368421,1.608271,2.246867,1.716374,2.0,1.0,17.0,10.0
18,2010,CHI,46,28,18,161,123,0.608696,0.391304,2.217391,2.172403,1.507246,3.0,13.0,13.0,3.0
6,2010,PHI,65,38,27,214,186,0.584615,0.415385,1.804615,2.1,1.458974,4.0,4.0,8.0,2.0
108,2010,LA,67,39,28,190,170,0.58209,0.41791,2.338806,2.568378,1.766169,5.0,18.0,29.0,17.0
63,2010,PHX,54,31,23,173,154,0.574074,0.425926,2.08642,2.267901,1.768519,6.0,10.0,18.0,18.0
45,2010,DET,65,37,28,206,195,0.569231,0.430769,1.764908,2.171209,1.428205,7.5,3.0,12.0,1.0
42,2010,ANA,65,37,28,186,183,0.569231,0.430769,2.614872,2.269377,2.079487,7.5,28.0,19.0,26.0
9,2010,PIT,69,39,30,203,178,0.565217,0.434783,2.229365,2.727019,1.73913,9.0,14.0,30.0,15.0
60,2010,BOS,73,41,32,215,175,0.561644,0.438356,1.737247,1.815851,1.924658,10.0,2.0,4.0,22.0


In [110]:
# Save the team ranking table built from the filtered games (dw3).
# NOTE(review): index='False' is a string, not the boolean False, so the row
# index is likely still written as an unnamed first column -- confirm.
dw3.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/season_roster_ranking_12f_6d_nhl_positions.csv', index='False', sep=',')
# Same output under the other user's path:
#dw3.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/season_roster_ranking_12f_6d_nhl_positions.csv', index='False', sep=',')

### keep games that have 4 C, 8 W  and 6 D per team!!!!

In [73]:
# Keep a game only if every one of its rows -- i.e. both teams -- has
# CCount == 4, WCount == 8 and DCount == 6; the whole Season/GameNumber group
# is dropped when any row fails the test.
du = dw.groupby(['Season', 'GameNumber']).filter(lambda x: ((x['CCount'] == 4) & (x['WCount'] == 8) & (x['DCount'] == 6)).all())

In [74]:
du.shape

(1620, 17)

In [108]:
# Save the player-level rows of the games kept by the 4 C / 8 W / 6 D filter.
# NOTE(review): index='False' is a string, not the boolean False, so the row
# index is likely still written as an unnamed first column -- confirm.
du.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/4c_8w_6d_game_team_player_ranking_nhl_positions.csv', index='False', sep=',')
# Same output under the other user's path:
#du.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/4c_8w_6d_game_team_player_ranking_nhl_positions.csv', index='False', sep=',')

- create a new dataset using team roster player rank

In [76]:
# Start dv from an independent copy of du so du itself is left untouched.
# NOTE(review): the name dv is also used for a different frame near the top of the notebook.
dv = du.copy()

In [77]:
# Collapse to one row per season / game / team / position, averaging the player
# ranks in each group. The remaining game-level fields are part of the grouping
# key, so they are carried into the result unchanged.
_game_team_position_keys = [
    'Season', 'GameNumber', 'TeamCode', 'PlayerPosition',
    'GF', 'GA',
    'RosterCount', 'PositionCount', 'CCount', 'WCount', 'DCount',
    'WinTeam', 'LossTeam',
]
dv = dv.groupby(_game_team_position_keys, as_index=False)['Rank'].mean()
dv.head(10)

Unnamed: 0,Season,GameNumber,TeamCode,PlayerPosition,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank
0,2010,20023,LA,C,1,1,18.0,4.0,4.0,8.0,6.0,LA,VAN,2.5
1,2010,20023,LA,D,1,1,18.0,6.0,4.0,8.0,6.0,LA,VAN,1.833333
2,2010,20023,LA,W,1,1,18.0,8.0,4.0,8.0,6.0,LA,VAN,2.625
3,2010,20023,VAN,C,1,1,18.0,4.0,4.0,8.0,6.0,LA,VAN,2.25
4,2010,20023,VAN,D,1,1,18.0,6.0,4.0,8.0,6.0,LA,VAN,1.666667
5,2010,20023,VAN,W,1,1,18.0,8.0,4.0,8.0,6.0,LA,VAN,2.5
6,2010,20025,CGY,C,3,1,18.0,4.0,4.0,8.0,6.0,CGY,LA,2.25
7,2010,20025,CGY,D,3,1,18.0,6.0,4.0,8.0,6.0,CGY,LA,1.666667
8,2010,20025,CGY,W,3,1,18.0,8.0,4.0,8.0,6.0,CGY,LA,2.125
9,2010,20025,LA,C,1,3,18.0,4.0,4.0,8.0,6.0,CGY,LA,2.5


In [78]:
dv.shape

(270, 14)

- create columns for team win and team loss. 

In [79]:
# Flag, per row, whether the row's team is the game's winner / loser (1 = yes, 0 = no).
# A column-wise comparison replaces the row-by-row apply(axis=1): it yields the
# same 0/1 integers without a Python call per row, and it does not rely on how
# DataFrame.apply treats a frame with no rows.
dv['TeamWin'] = (dv['TeamCode'] == dv['WinTeam']).astype('int64')
dv['TeamLos'] = (dv['TeamCode'] == dv['LossTeam']).astype('int64')
dv.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerPosition,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos
0,2010,20023,LA,C,1,1,18.0,4.0,4.0,8.0,6.0,LA,VAN,2.5,1,0
1,2010,20023,LA,D,1,1,18.0,6.0,4.0,8.0,6.0,LA,VAN,1.833333,1,0
2,2010,20023,LA,W,1,1,18.0,8.0,4.0,8.0,6.0,LA,VAN,2.625,1,0
3,2010,20023,VAN,C,1,1,18.0,4.0,4.0,8.0,6.0,LA,VAN,2.25,0,1
4,2010,20023,VAN,D,1,1,18.0,6.0,4.0,8.0,6.0,LA,VAN,1.666667,0,1


- compute games played, games won, games lost, goals for and goals against by team for the season.

In [80]:
# Season totals per team, computed within each position so that every row of a
# (Season, PlayerPosition, TeamCode) group carries the same totals.
# GF and GA are overwritten with their group sums, so this cell is not
# idempotent: running it a second time would sum the already-summed values.
_by_team_position = dv.groupby(['Season', 'PlayerPosition', 'TeamCode'])
for _target, _source, _how in [
    ('GP', 'GameNumber', 'count'),  # rows (games) in the group
    ('GW', 'TeamWin', 'sum'),       # games flagged as wins
    ('GL', 'TeamLos', 'sum'),       # games flagged as losses
    ('GF', 'GF', 'sum'),            # goals for, summed over the group
    ('GA', 'GA', 'sum'),            # goals against, summed over the group
]:
    dv[_target] = _by_team_position[_source].transform(_how)
dv.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerPosition,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL
0,2010,20023,LA,C,16,22,18.0,4.0,4.0,8.0,6.0,LA,VAN,2.5,1,0,9,4,5
1,2010,20023,LA,D,16,22,18.0,6.0,4.0,8.0,6.0,LA,VAN,1.833333,1,0,9,4,5
2,2010,20023,LA,W,16,22,18.0,8.0,4.0,8.0,6.0,LA,VAN,2.625,1,0,9,4,5
3,2010,20023,VAN,C,17,9,18.0,4.0,4.0,8.0,6.0,LA,VAN,2.25,0,1,5,3,2
4,2010,20023,VAN,D,17,9,18.0,6.0,4.0,8.0,6.0,LA,VAN,1.666667,0,1,5,3,2


- create columns with the mean ranking for centres, wingers and defensemen by team per game.

In [81]:
# Spread each position's mean rank onto every row of the same team-game.
# Step 1: RankC / RankW / RankD hold the row's Rank only on rows of that
#         position and NaN elsewhere (Series.where fills the non-matching rows).
# Step 2: within each (Season, GameNumber, TeamCode) group, forward- then
#         back-fill so all rows of the group carry the value.
# Two portability fixes over the previous version:
#   * np.NaN is no longer referenced (the alias was removed in NumPy 2.0);
#   * groupby.transform replaces groupby.apply, because transform always returns
#     a result aligned to dv's own index, whereas apply on pandas >= 2.0 prepends
#     the group keys to the index and the column assignment then fails.
_team_game_keys = ['Season', 'GameNumber', 'TeamCode']
for _pos in ('C', 'W', 'D'):
    _col = 'Rank' + _pos
    dv[_col] = dv['Rank'].where(dv['PlayerPosition'] == _pos)
    dv[_col] = dv.groupby(_team_game_keys)[_col].transform(lambda s: s.ffill().bfill())
dv.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerPosition,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL,RankC,RankW,RankD
0,2010,20023,LA,C,16,22,18.0,4.0,4.0,8.0,6.0,LA,VAN,2.5,1,0,9,4,5,2.5,2.625,1.833333
1,2010,20023,LA,D,16,22,18.0,6.0,4.0,8.0,6.0,LA,VAN,1.833333,1,0,9,4,5,2.5,2.625,1.833333
2,2010,20023,LA,W,16,22,18.0,8.0,4.0,8.0,6.0,LA,VAN,2.625,1,0,9,4,5,2.5,2.625,1.833333
3,2010,20023,VAN,C,17,9,18.0,4.0,4.0,8.0,6.0,LA,VAN,2.25,0,1,5,3,2,2.25,2.5,1.666667
4,2010,20023,VAN,D,17,9,18.0,6.0,4.0,8.0,6.0,LA,VAN,1.666667,0,1,5,3,2,2.25,2.5,1.666667


- compute the mean per position by team for the season.

In [82]:
# Season-level average of the per-game position ranks for each team; the result
# is broadcast back onto every row of the (Season, TeamCode) group.
_by_team = dv.groupby(['Season', 'TeamCode'])
for _pos in ('C', 'W', 'D'):
    dv['Mean' + _pos] = _by_team['Rank' + _pos].transform('mean')
dv.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerPosition,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL,RankC,RankW,RankD,MeanC,MeanW,MeanD
0,2010,20023,LA,C,16,22,18.0,4.0,4.0,8.0,6.0,LA,VAN,2.5,1,0,9,4,5,2.5,2.625,1.833333,2.333333,2.583333,1.777778
1,2010,20023,LA,D,16,22,18.0,6.0,4.0,8.0,6.0,LA,VAN,1.833333,1,0,9,4,5,2.5,2.625,1.833333,2.333333,2.583333,1.777778
2,2010,20023,LA,W,16,22,18.0,8.0,4.0,8.0,6.0,LA,VAN,2.625,1,0,9,4,5,2.5,2.625,1.833333,2.333333,2.583333,1.777778
3,2010,20023,VAN,C,17,9,18.0,4.0,4.0,8.0,6.0,LA,VAN,2.25,0,1,5,3,2,2.25,2.5,1.666667,2.25,2.2,1.633333
4,2010,20023,VAN,D,17,9,18.0,6.0,4.0,8.0,6.0,LA,VAN,1.666667,0,1,5,3,2,2.25,2.5,1.666667,2.25,2.2,1.633333


- display the number of wins and losses per team (rosters of 4 centers, 8 wingers and 6 defensemen)

In [83]:
# Season loss / win counts per row.
#   L: GL on rows where the team is the game's LossTeam, otherwise GP - GW.
#   W: GW on rows where the team is the game's WinTeam,  otherwise GP - GL.
# Series.where keeps the left-hand value where the condition holds and takes the
# second argument elsewhere; it replaces the row-by-row apply(axis=1) with the
# same integer results.
# NOTE(review): the two branches give the same number whenever GW + GL == GP for
# the team - confirm that always holds for this data.
_is_loser = dv['TeamCode'] == dv['LossTeam']
_is_winner = dv['TeamCode'] == dv['WinTeam']
dv['L'] = dv['GL'].where(_is_loser, dv['GP'] - dv['GW'])
dv['W'] = dv['GW'].where(_is_winner, dv['GP'] - dv['GL'])
dv.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerPosition,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL,RankC,RankW,RankD,MeanC,MeanW,MeanD,L,W
0,2010,20023,LA,C,16,22,18.0,4.0,4.0,8.0,6.0,LA,VAN,2.5,1,0,9,4,5,2.5,2.625,1.833333,2.333333,2.583333,1.777778,5,4
1,2010,20023,LA,D,16,22,18.0,6.0,4.0,8.0,6.0,LA,VAN,1.833333,1,0,9,4,5,2.5,2.625,1.833333,2.333333,2.583333,1.777778,5,4
2,2010,20023,LA,W,16,22,18.0,8.0,4.0,8.0,6.0,LA,VAN,2.625,1,0,9,4,5,2.5,2.625,1.833333,2.333333,2.583333,1.777778,5,4
3,2010,20023,VAN,C,17,9,18.0,4.0,4.0,8.0,6.0,LA,VAN,2.25,0,1,5,3,2,2.25,2.5,1.666667,2.25,2.2,1.633333,2,3
4,2010,20023,VAN,D,17,9,18.0,6.0,4.0,8.0,6.0,LA,VAN,1.666667,0,1,5,3,2,2.25,2.5,1.666667,2.25,2.2,1.633333,2,3


- compute win and loss percent by team. Drop duplicate observations.

In [84]:
# Reduce to one row per (Season, TeamCode): keep the first row of each team-season,
# derive the win / loss shares of games played, and keep the season-level columns
# in their final display order.
dv = (
    dv.drop_duplicates(['Season', 'TeamCode'])
      .assign(
          WinPc=lambda team: team['W'] / team['GP'],
          LossPc=lambda team: team['L'] / team['GP'],
      )
      [['Season', 'TeamCode', 'GP', 'W', 'L', 'GF', 'GA',
        'WinPc', 'LossPc', 'MeanC', 'MeanW', 'MeanD']]
)
dv.head()

Unnamed: 0,Season,TeamCode,GP,W,L,GF,GA,WinPc,LossPc,MeanC,MeanW,MeanD
0,2010,LA,9,4,5,16,22,0.444444,0.555556,2.333333,2.583333,1.777778
3,2010,VAN,5,3,2,17,9,0.6,0.4,2.25,2.2,1.633333
6,2010,CGY,7,2,5,22,25,0.285714,0.714286,1.821429,1.696429,1.690476
12,2010,NYI,7,0,7,15,29,0.0,1.0,2.428571,2.678571,1.857143
15,2010,WSH,3,2,1,8,10,0.666667,0.333333,2.583333,1.791667,1.833333


- rank teams based on win percent, mean centres, mean wingers and mean defensemen. 

In [85]:
# Rank teams within each season.
# RankWin: highest WinPc gets rank 1 (descending). RankC / RankW / RankD: lowest
# mean position rank gets rank 1 (ascending). Ties receive pandas' default
# averaged rank, which is why values such as 7.5 appear in the output.
for _rank_col, _source_col, _ascending in [
    ('RankWin', 'WinPc', False),
    ('RankC', 'MeanC', True),
    ('RankW', 'MeanW', True),
    ('RankD', 'MeanD', True),
]:
    dv[_rank_col] = dv.groupby(['Season'])[_source_col].rank(ascending=_ascending)

# Order by season, then by each rank in turn (all ascending, the default).
dv = dv.sort_values(['Season', 'RankWin', 'RankC', 'RankW', 'RankD'])
dv.head(30)

Unnamed: 0,Season,TeamCode,GP,W,L,GF,GA,WinPc,LossPc,MeanC,MeanW,MeanD,RankWin,RankC,RankW,RankD
114,2010,EDM,2,2,0,6,4,1.0,0.0,2.75,2.4375,2.416667,1.0,17.0,13.0,17.0
18,2010,ANA,8,7,1,26,17,0.875,0.125,2.34375,2.375,2.0,2.0,10.0,11.0,14.5
120,2010,CAR,6,5,1,21,18,0.833333,0.166667,1.583333,2.354167,1.888889,3.0,1.0,10.0,13.0
153,2010,OTT,5,4,1,21,9,0.8,0.2,2.55,2.725,2.133333,4.0,15.0,17.0,16.0
27,2010,ATL,8,6,2,31,25,0.75,0.25,2.3125,2.328125,1.708333,5.0,8.0,9.0,6.0
15,2010,WSH,3,2,1,8,10,0.666667,0.333333,2.583333,1.791667,1.833333,6.0,16.0,2.0,9.5
3,2010,VAN,5,3,2,17,9,0.6,0.4,2.25,2.2,1.633333,7.5,6.5,5.0,4.0
177,2010,CHI,5,3,2,18,11,0.6,0.4,2.45,2.025,1.5,7.5,13.0,3.0,1.0
0,2010,LA,9,4,5,16,22,0.444444,0.555556,2.333333,2.583333,1.777778,9.0,9.0,15.0,8.0
84,2010,STL,7,3,4,16,20,0.428571,0.571429,2.071429,2.232143,1.619048,10.0,5.0,6.0,3.0


In [86]:
# (rows, columns) of the ranked team-season table.
dv.shape

(17, 16)

In [109]:
# Write the ranked team-season table (dv) to CSV.
# index must be the boolean False: the previous index='False' was a non-empty
# string, which is truthy, so the row index was still written as an extra
# unnamed first column.
# NOTE(review): readers of this file that drop an 'Unnamed: 0' column need adjusting.
dv.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/season_team_roster_ranking_4C_8W_6D_nhl_positions.csv', index=False, sep=',')
#dv.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/season_team_roster_ranking_4C_8W_6D_nhl_positions.csv', index=False, sep=',')
