# Data

## season_game_level_data

In [1]:
import sys
import os
import pandas as pd
import numpy as np
import datetime, time
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
from statsmodels.formula.api import ols
from statsmodels.iolib.summary2 import summary_col
from pylab import hist, show
import scipy
import zipfile


pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 200)

**data frames used in this notebook:**
- da = pbp
- dg = season_games_data
- dm = play_by_play
- dp = player_rankings
- dw = team_roster_player_rank
- dv = season_team_roster_ranking
- dx = season_game_team_roster
- dz = season_team
- dy = season_game_roster

**for analysis:**
- ds = season_level
- dl = season_game_level
- dt = season_game_team_level

In [2]:
pwd

'/Users/stefanostselios/Desktop/nhl_roster_design-master'

### import play by play data set

In [3]:
da = pd.read_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/pbp_merged.csv')
#da = pd.read_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/pbp_merged.csv')
da = da.drop('Unnamed: 0', axis=1)
da = da.rename(columns={'TeamCode': 'EventTeamCode'})


keep regular season games

In [4]:
da = da[da['GameNumber'] <= 21230]

drop irrelevant data

In [5]:
da = da[da['EventType']!='STOP']
da = da[da['EventType']!='EISTR']
da = da[da['EventType']!='EIEND']

- display goals for each game and drop duplicates.

In [6]:
dh = da[da['EventTeamCode']==da['HTeamCode']]
dh = dh.rename(columns={'EventTeamCode': 'HTeam'})
dh['goal'] = dh.apply(lambda x: 1 if (x['EventType'] == 'GOAL') else 0, axis=1)
dh['HGF'] = dh.groupby(['Season', 'GameNumber', 'HTeam'])['goal'].transform('sum')
dh = dh[['Season', 'GameNumber', 'HGF']]
dh = dh.drop_duplicates(['Season', 'GameNumber'])

In [7]:
dv = da[da['EventTeamCode']==da['VTeamCode']]
dv = dv.rename(columns={'EventTeamCode': 'VTeam'})
dv['goal'] = dv.apply(lambda x: 1 if (x['EventType'] == 'GOAL') else 0, axis=1)
dv['VGF'] = dv.groupby(['Season', 'GameNumber', 'VTeam'])['goal'].transform('sum')
dv = dv[['Season', 'GameNumber', 'VGF']]
dv = dv.drop_duplicates(['Season', 'GameNumber'])

Merge into season-game data

In [8]:
dg = da[['Season', 'GameNumber', 'VTeamCode', 'HTeamCode']]
dg = dg.drop_duplicates(['Season', 'GameNumber'])

In [9]:
dg = pd.merge(dg, dh, on=['Season', 'GameNumber'], how='left')
dg = pd.merge(dg, dv, on=['Season', 'GameNumber'], how='left')

- find the goal differential per game with respect to home team.

In [10]:
dg['GD'] = dg['HGF'] - dg['VGF']
dg['WinTeam'] = dg.apply(lambda x: x['HTeamCode'] if x['GD'] > 0 else x['VTeamCode'], axis=1)
dg['LossTeam'] = dg.apply(lambda x: x['HTeamCode'] if x['WinTeam'] != x['HTeamCode'] else x['VTeamCode'], axis=1)

In [11]:
dg.head()

Unnamed: 0,Season,GameNumber,VTeamCode,HTeamCode,HGF,VGF,GD,WinTeam,LossTeam
0,2010,20001,MTL,TOR,3,2,1,TOR,MTL
1,2010,20002,PHI,PIT,2,3,-1,PHI,PIT
2,2010,20003,CAR,MIN,3,4,-1,CAR,MIN
3,2010,20004,CHI,COL,4,3,1,COL,CHI
4,2010,20005,CGY,EDM,4,0,4,EDM,CGY


- display goals againest per team.

In [12]:
dg['VGA'] = dg['HGF']
dg['HGA'] = dg['VGF']

In [13]:
dg = dg[['Season', 'GameNumber', 'VTeamCode', 'HTeamCode', 'VGF', 'HGF', 'GD', 'VGA', 'HGA', 'WinTeam', 'LossTeam']]
dg.head()

Unnamed: 0,Season,GameNumber,VTeamCode,HTeamCode,VGF,HGF,GD,VGA,HGA,WinTeam,LossTeam
0,2010,20001,MTL,TOR,2,3,1,3,2,TOR,MTL
1,2010,20002,PHI,PIT,3,2,-1,2,3,PHI,PIT
2,2010,20003,CAR,MIN,4,3,-1,3,4,CAR,MIN
3,2010,20004,CHI,COL,3,4,1,4,3,COL,CHI
4,2010,20005,CGY,EDM,0,4,4,4,0,EDM,CGY


In [26]:
dg.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/season_game_data.csv', index='False', sep=',')
#dg.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/season_game_data.csv', index='False', sep=',')

## season_level_data

In [27]:
dm = da.copy()

events that happened in regulation time only

In [28]:
dm = dm[dm['Period'] <= 3]
dm = dm[dm['Period'] >= 1]

- **reshape the data set from wide to long.**

In [29]:
dm = dm.sort_values(['Season', 'GameNumber', 'Period', 'EventNumber'], ascending=[True, True, True, True])

In [30]:
a = [col for col in dm.columns if 'VPlayer' in col]
b = [col for col in dm.columns if 'HPlayer' in col]
c = [col for col in dm.columns if 'VPosition' in col]
d = [col for col in dm.columns if 'HPosition' in col]
dm = pd.lreshape(dm, {'VPlayer' : a, 'HPlayer' : b, 'VPosition' : c, 'HPosition': d})

In [31]:
dm.shape

(1796745, 24)

In [32]:
dm.columns

Index(['AdvantageType', 'EventDetail', 'EventNumber', 'EventTeamCode',
       'EventTimeFromTwenty', 'EventTimeFromZero', 'EventType', 'GameDate',
       'GameNumber', 'HTeamCode', 'Length', 'PenaltyType', 'Period',
       'PlayerName', 'PlayerNumber', 'Season', 'ShotResult', 'ShotType',
       'VTeamCode', 'Zone', 'VPlayer', 'HPlayer', 'VPosition', 'HPosition'],
      dtype='object')

In [33]:
dm = dm.rename(columns={'PlayerNumber': 'EventPlayerNumber', 'TeamCode': 'EventTeamCode', 'PlayerName': 'EventPlayerName' })
dm = dm[['Season', 'GameNumber', 'GameDate', 'Period', 'AdvantageType', 'Zone', 'EventNumber', 'EventType', 'EventDetail', 'EventTeamCode', 'EventPlayerNumber', 'EventPlayerName', 'EventTimeFromZero', 'EventTimeFromTwenty', 'VTeamCode', 'VPlayer', 'VPosition', 'HTeamCode', 'HPlayer', 'HPosition', 'ShotType', 'ShotResult', 'Length', 'PenaltyType']]
dm = dm.sort_values(['Season', 'GameNumber', 'Period', 'EventNumber'], ascending=[True, True, True, True])

- fill in advantage type with even strength 'EV' and event player number with 'TEAM'

In [34]:
dm['AdvantageType'] = dm['AdvantageType'].fillna('EV')
dm['EventPlayerNumber'] = dm['EventPlayerNumber'].fillna('TEAM')

- save new datast as play by play

In [35]:
dm.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/play_by_play.csv', index='False', sep=',')
#dm.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/play_by_play.csv', index='False', sep=',')

#### create new data set and keep variables: 
- (a) game number.
- (b) visitor team information.
- (c) home team information.

In [36]:
df = dm.copy()
df = df[['Season', 'GameNumber', 'VTeamCode', 'VPlayer', 'VPosition', 'HTeamCode', 'HPlayer', 'HPosition']]
df = df.sort_values(['Season', 'GameNumber'], ascending=[True, True])
df.head()

Unnamed: 0,Season,GameNumber,VTeamCode,VPlayer,VPosition,HTeamCode,HPlayer,HPosition
0,2010,20001,MTL,11.0,C,TOR,37.0,C
310113,2010,20001,MTL,21.0,R,TOR,9.0,R
620126,2010,20001,MTL,57.0,L,TOR,11.0,L
930061,2010,20001,MTL,26.0,D,TOR,3.0,D
1239931,2010,20001,MTL,75.0,D,TOR,22.0,D


- merge season_game_data (dg) on new dataset

In [37]:
df = pd.merge(df, dg, on=['Season', 'GameNumber', 'VTeamCode', 'HTeamCode'], how='left')
df.head()    

Unnamed: 0,Season,GameNumber,VTeamCode,VPlayer,VPosition,HTeamCode,HPlayer,HPosition,VGF,HGF,GD,VGA,HGA,WinTeam,LossTeam
0,2010,20001,MTL,11.0,C,TOR,37.0,C,2,3,1,3,2,TOR,MTL
1,2010,20001,MTL,21.0,R,TOR,9.0,R,2,3,1,3,2,TOR,MTL
2,2010,20001,MTL,57.0,L,TOR,11.0,L,2,3,1,3,2,TOR,MTL
3,2010,20001,MTL,26.0,D,TOR,3.0,D,2,3,1,3,2,TOR,MTL
4,2010,20001,MTL,75.0,D,TOR,22.0,D,2,3,1,3,2,TOR,MTL


- reshape the data to have home and visitor team observatons under the same coloumns. 

In [38]:
a = [col for col in df.columns if 'Player' in col]
b = [col for col in df.columns if 'Position' in col]
c = [col for col in df.columns if 'TeamCode' in col]
d = [col for col in df.columns if 'GF' in col]
e = [col for col in df.columns if 'GA' in col]
df = pd.lreshape(df, {'PlayerNumber' : a, 'PlayerPosition' : b, 'TeamCode' : c, 'GF' : d, 'GA' : e })
df = df[['Season', 'GameNumber', 'TeamCode', 'PlayerNumber', 'GF', 'GA', 'GD', 'WinTeam', 'LossTeam']]
df.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,GF,GA,GD,WinTeam,LossTeam
0,2010,20001,MTL,11.0,2,3,1,TOR,MTL
1,2010,20001,MTL,21.0,2,3,1,TOR,MTL
2,2010,20001,MTL,57.0,2,3,1,TOR,MTL
3,2010,20001,MTL,26.0,2,3,1,TOR,MTL
4,2010,20001,MTL,75.0,2,3,1,TOR,MTL


### import player position and rankings

In [42]:
dp = pd.read_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/clusters/stats_per_player_centers_wingers_defensemen.csv')
#dp = pd.read_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/clusters/stats_per_player_centers_wingers_defensemen.csv')
dp = dp.drop('Unnamed: 0', axis=1)
dp = dp.rename(columns={'Position': 'PlayerPosition'})

In [43]:
dp = dp[['Season', 'TeamCode', 'PlayerNumber', 'PlayerName', 'PlayerPosition']]
dp.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,PlayerPosition
0,2010,MTL,11.0,GOMEZ,C
1,2010,TOR,37.0,BRENT,C
2,2010,MTL,14.0,PLEKANEC,C
3,2010,MTL,76.0,SUBBAN,D
4,2010,TOR,35.0,GIGUERE,G


In [44]:
dp.shape

(1058, 5)

In [45]:
dr = pd.read_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/clusters/player_rank_for_centers_wingers_defensemen.csv')
#dr = pd.read_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/clusters/player_rank_for_centers_wingers_defensemen.csv')
dr = dr.drop('Unnamed: 0', axis=1)
dr = dr.rename(columns={'Position': 'PlayerPosition'})

In [46]:
dr = dr[['Season', 'TeamCode', 'PlayerNumber', 'PlayerName', 'PlayerPosition', 'Rank']]
dr = dr.sort_values(['TeamCode'], ascending=[True])
dr.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank
397,2010,ANA,23.0,BEAUCHEMIN,D,3
129,2010,ANA,20.0,CARTER,C,3
159,2010,ANA,22.0,MARCHANT,C,4
161,2010,ANA,28.0,CHIPCHURA,C,4
164,2010,ANA,12.0,GREEN,C,4


- merge player position and player rankings and drop goaltenders

In [47]:
ds = pd.merge(dp, dr, on=['Season', 'TeamCode', 'PlayerNumber', 'PlayerName', 'PlayerPosition'], how='left')
ds['Rank'] = ds.groupby(['Season', 'TeamCode', 'PlayerName'])['Rank'].apply(lambda x: x.ffill().bfill())
ds.shape

(1058, 6)

In [48]:
ds.isnull().sum()

Season              0
TeamCode            0
PlayerNumber        0
PlayerName          0
PlayerPosition      0
Rank              256
dtype: int64

skaters that have played less than 9 games were not included in the clusters and therefore not ranked. Since the failed to make the roster on a regular basis, forwards are assigned to the 4th line and defensemen to the bottom (3rd) pairing.

In [49]:
ds['Rank'] = ds.apply(lambda x: 1 if ((x['PlayerPosition'] == 'G') & (pd.isnull(x.Rank))) else 3 if ((x['PlayerPosition'] == 'D') & (pd.isnull(x.Rank))) else 4 if ((x['PlayerPosition'] == 'W') & (pd.isnull(x.Rank))) else 4 if ((x['PlayerPosition'] == 'C') & (pd.isnull(x.Rank))) else x['Rank'], axis=1)
ds = ds.sort_values(['TeamCode'], ascending=[True])
ds.shape

(1058, 6)

In [50]:
ds1 = ds[(ds['PlayerPosition'] == 'D') & (ds['Rank'] == 4)]
ds1.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank
996,2010,OTT,51.0,SMITH,D,4.0


- Derek Smith, a defenseman for the Ottawa Senators has a ranking of 4, which is incorrect since we have 3 defensive pairings. For that reason, he is assigned a rank of 3 which represents the bottom defensive pairing

In [51]:
ds['Rank'] = ds.apply(lambda x: 3 if ((x['PlayerPosition'] == 'D') & (x['TeamCode'] == 'OTT') & (x['PlayerName'] == 'SMITH') & (x['PlayerNumber'] == 51.0)) else x['Rank'], axis=1)

In [52]:
ds1 = ds[(ds['PlayerPosition'] == 'D') & (ds['Rank'] == 4)]
ds1.head()

Unnamed: 0,Season,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank


- **display each player by team per game. Drop duplicates.**

In [53]:
dw = pd.merge(df, ds, on=['Season', 'TeamCode', 'PlayerNumber'], how='left')
dw = dw[['Season', 'GameNumber', 'TeamCode', 'PlayerNumber', 'PlayerName', 'PlayerPosition', 'Rank', 'GF', 'GA', 'GD', 'WinTeam', 'LossTeam']]
dw.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank,GF,GA,GD,WinTeam,LossTeam
0,2010,20001,MTL,11.0,GOMEZ,C,2.0,2,3,1,TOR,MTL
1,2010,20001,MTL,21.0,GIONTA,W,2.0,2,3,1,TOR,MTL
2,2010,20001,MTL,57.0,POULIOT,W,2.0,2,3,1,TOR,MTL
3,2010,20001,MTL,26.0,GORGES,D,3.0,2,3,1,TOR,MTL
4,2010,20001,MTL,75.0,GILL,D,3.0,2,3,1,TOR,MTL


- drop observations that have no player name, position nor ranking. Exclude goaltenders from the roster.

In [54]:
dw = dw[pd.notnull(dw['PlayerPosition'])]
dw.isnull().sum()

Season            0
GameNumber        0
TeamCode          0
PlayerNumber      0
PlayerName        0
PlayerPosition    0
Rank              0
GF                0
GA                0
GD                0
WinTeam           0
LossTeam          0
dtype: int64

In [55]:
dw.shape

(3688734, 12)

In [56]:
dw = dw[dw['PlayerPosition'] != 'G']
dw.shape

(3130215, 12)

- create column that displays the position and roster count by team per game. To simplify matters, we categorize skaters into forwards and defensemen.

In [57]:
dw = dw.drop_duplicates(['Season', 'GameNumber', 'TeamCode', 'PlayerNumber'])
dw['RosterCount'] = dw.groupby(['Season', 'GameNumber', 'TeamCode'])['PlayerNumber'].transform('count')
dw['PositionCount'] = dw.groupby(['Season', 'GameNumber', 'TeamCode', 'PlayerPosition'])['PlayerNumber'].transform('count')
dw.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank,GF,GA,GD,WinTeam,LossTeam,RosterCount,PositionCount
0,2010,20001,MTL,11.0,GOMEZ,C,2.0,2,3,1,TOR,MTL,18.0,5.0
1,2010,20001,MTL,21.0,GIONTA,W,2.0,2,3,1,TOR,MTL,18.0,7.0
2,2010,20001,MTL,57.0,POULIOT,W,2.0,2,3,1,TOR,MTL,18.0,7.0
3,2010,20001,MTL,26.0,GORGES,D,3.0,2,3,1,TOR,MTL,18.0,6.0
4,2010,20001,MTL,75.0,GILL,D,3.0,2,3,1,TOR,MTL,18.0,6.0


- count the amount of forwards and defensemen by team per game.

In [58]:
dw['CCount'] = dw.apply(lambda x: x['PositionCount'] if x['PlayerPosition']=='C' else np.NaN, 1)
dw['WCount'] = dw.apply(lambda x: x['PositionCount'] if x['PlayerPosition']=='W' else np.NaN, 1)
dw['DCount'] = dw.apply(lambda x: x['PositionCount'] if x['PlayerPosition']=='D' else np.NaN, 1)
dw['CCount'] = dw.groupby(['Season','GameNumber', 'TeamCode'])['CCount'].apply(lambda x: x.ffill().bfill())
dw['WCount'] = dw.groupby(['Season','GameNumber', 'TeamCode'])['WCount'].apply(lambda x: x.ffill().bfill())
dw['DCount'] = dw.groupby(['Season','GameNumber', 'TeamCode'])['DCount'].apply(lambda x: x.ffill().bfill())
dw.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank,GF,GA,GD,WinTeam,LossTeam,RosterCount,PositionCount,CCount,WCount,DCount
0,2010,20001,MTL,11.0,GOMEZ,C,2.0,2,3,1,TOR,MTL,18.0,5.0,5.0,7.0,6.0
1,2010,20001,MTL,21.0,GIONTA,W,2.0,2,3,1,TOR,MTL,18.0,7.0,5.0,7.0,6.0
2,2010,20001,MTL,57.0,POULIOT,W,2.0,2,3,1,TOR,MTL,18.0,7.0,5.0,7.0,6.0
3,2010,20001,MTL,26.0,GORGES,D,3.0,2,3,1,TOR,MTL,18.0,6.0,5.0,7.0,6.0
4,2010,20001,MTL,75.0,GILL,D,3.0,2,3,1,TOR,MTL,18.0,6.0,5.0,7.0,6.0


In [59]:
dw1 = dw.drop_duplicates(['Season', 'GameNumber', 'TeamCode'])
dw1 = dw1.sort_values(['Season', 'GameNumber'], ascending=[True, True])

In [60]:
dw.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank,GF,GA,GD,WinTeam,LossTeam,RosterCount,PositionCount,CCount,WCount,DCount
0,2010,20001,MTL,11.0,GOMEZ,C,2.0,2,3,1,TOR,MTL,18.0,5.0,5.0,7.0,6.0
1,2010,20001,MTL,21.0,GIONTA,W,2.0,2,3,1,TOR,MTL,18.0,7.0,5.0,7.0,6.0
2,2010,20001,MTL,57.0,POULIOT,W,2.0,2,3,1,TOR,MTL,18.0,7.0,5.0,7.0,6.0
3,2010,20001,MTL,26.0,GORGES,D,3.0,2,3,1,TOR,MTL,18.0,6.0,5.0,7.0,6.0
4,2010,20001,MTL,75.0,GILL,D,3.0,2,3,1,TOR,MTL,18.0,6.0,5.0,7.0,6.0


In [61]:
dw1.shape

(2460, 17)

In [62]:
dw1.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/team_roster_player_rank_centers_wingers_defensemen.csv', index='False', sep=',')
#dw1.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/team_roster_player_rank_centers_wingers_defensemen.csv', index='False', sep=',')

### full regular season stats

In [63]:
dw2 = dw.copy()
dw2 = dw2.groupby(['Season', 'GameNumber', 'TeamCode', 'PlayerPosition', 'GF', 'GA', 'RosterCount', 'PositionCount', 'CCount', 'WCount', 'DCount', 'WinTeam', 'LossTeam'], as_index=False)['Rank'].mean()
dw2.shape

(7380, 14)

- create columns for team win and team loss. 

In [64]:
dw2['TeamWin'] =  dw2.apply(lambda x: 1 if x['TeamCode']==x['WinTeam'] else 0, 1)
dw2['TeamLos'] =  dw2.apply(lambda x: 1 if x['TeamCode']==x['LossTeam'] else 0, 1)

- display games played, games won, games loss, goals for and goals against by team for the season.

In [65]:
dw2['GP'] = dw2.groupby(['Season', 'PlayerPosition', 'TeamCode'])['GameNumber'].transform('count')
dw2['GW'] = dw2.groupby(['Season', 'PlayerPosition', 'TeamCode'])['TeamWin'].transform('sum')
dw2['GL'] = dw2.groupby(['Season', 'PlayerPosition', 'TeamCode'])['TeamLos'].transform('sum')
dw2['GF'] = dw2.groupby(['Season', 'PlayerPosition', 'TeamCode'])['GF'].transform('sum')
dw2['GA'] = dw2.groupby(['Season', 'PlayerPosition', 'TeamCode'])['GA'].transform('sum')
dw2.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerPosition,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL
0,2010,20001,MTL,C,220,213,18.0,5.0,5.0,7.0,6.0,TOR,MTL,2.8,0,1,82,44,38
1,2010,20001,MTL,D,220,213,18.0,6.0,5.0,7.0,6.0,TOR,MTL,2.166667,0,1,82,44,38
2,2010,20001,MTL,W,220,213,18.0,7.0,5.0,7.0,6.0,TOR,MTL,2.857143,0,1,82,44,38
3,2010,20001,TOR,C,225,259,18.0,4.0,4.0,8.0,6.0,TOR,MTL,3.0,1,0,82,37,45
4,2010,20001,TOR,D,225,259,18.0,6.0,4.0,8.0,6.0,TOR,MTL,2.166667,1,0,82,37,45


- create columns with the mean ranking for forward and defenseman by team per game.

In [66]:
dw2['RankC'] = dw2.apply(lambda x: x['Rank'] if x['PlayerPosition']=='C' else np.NaN, 1)
dw2['RankW'] = dw2.apply(lambda x: x['Rank'] if x['PlayerPosition']=='W' else np.NaN, 1)
dw2['RankD'] = dw2.apply(lambda x: x['Rank'] if x['PlayerPosition']=='D' else np.NaN, 1)
dw2['RankC'] = dw2.groupby(['Season','GameNumber', 'TeamCode'])['RankC'].apply(lambda x: x.ffill().bfill())
dw2['RankW'] = dw2.groupby(['Season','GameNumber', 'TeamCode'])['RankW'].apply(lambda x: x.ffill().bfill())
dw2['RankD'] = dw2.groupby(['Season','GameNumber', 'TeamCode'])['RankD'].apply(lambda x: x.ffill().bfill())
dw2.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerPosition,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL,RankC,RankW,RankD
0,2010,20001,MTL,C,220,213,18.0,5.0,5.0,7.0,6.0,TOR,MTL,2.8,0,1,82,44,38,2.8,2.857143,2.166667
1,2010,20001,MTL,D,220,213,18.0,6.0,5.0,7.0,6.0,TOR,MTL,2.166667,0,1,82,44,38,2.8,2.857143,2.166667
2,2010,20001,MTL,W,220,213,18.0,7.0,5.0,7.0,6.0,TOR,MTL,2.857143,0,1,82,44,38,2.8,2.857143,2.166667
3,2010,20001,TOR,C,225,259,18.0,4.0,4.0,8.0,6.0,TOR,MTL,3.0,1,0,82,37,45,3.0,2.625,2.166667
4,2010,20001,TOR,D,225,259,18.0,6.0,4.0,8.0,6.0,TOR,MTL,2.166667,1,0,82,37,45,3.0,2.625,2.166667


- compute the mean per position by team for the season.

In [67]:
dw2['MeanC']= dw2.groupby(['Season', 'TeamCode'])['RankC'].transform('mean')
dw2['MeanW']= dw2.groupby(['Season', 'TeamCode'])['RankW'].transform('mean')
dw2['MeanD']= dw2.groupby(['Season', 'TeamCode'])['RankD'].transform('mean')
dw2.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerPosition,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL,RankC,RankW,RankD,MeanC,MeanW,MeanD
0,2010,20001,MTL,C,220,213,18.0,5.0,5.0,7.0,6.0,TOR,MTL,2.8,0,1,82,44,38,2.8,2.857143,2.166667,2.600407,2.509945,2.226481
1,2010,20001,MTL,D,220,213,18.0,6.0,5.0,7.0,6.0,TOR,MTL,2.166667,0,1,82,44,38,2.8,2.857143,2.166667,2.600407,2.509945,2.226481
2,2010,20001,MTL,W,220,213,18.0,7.0,5.0,7.0,6.0,TOR,MTL,2.857143,0,1,82,44,38,2.8,2.857143,2.166667,2.600407,2.509945,2.226481
3,2010,20001,TOR,C,225,259,18.0,4.0,4.0,8.0,6.0,TOR,MTL,3.0,1,0,82,37,45,3.0,2.625,2.166667,2.765244,2.586793,2.332462
4,2010,20001,TOR,D,225,259,18.0,6.0,4.0,8.0,6.0,TOR,MTL,2.166667,1,0,82,37,45,3.0,2.625,2.166667,2.765244,2.586793,2.332462


- display the quantity of wins and losses per team for the whole season

In [68]:
dw2['L'] = dw2.apply(lambda x: x['GL'] if x['TeamCode']== x['LossTeam'] else (x['GP'] - x['GW']), 1)
dw2['W'] = dw2.apply(lambda x: x['GW'] if x['TeamCode']== x['WinTeam'] else (x['GP'] - x['GL']), 1)
dw2.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerPosition,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL,RankC,RankW,RankD,MeanC,MeanW,MeanD,L,W
0,2010,20001,MTL,C,220,213,18.0,5.0,5.0,7.0,6.0,TOR,MTL,2.8,0,1,82,44,38,2.8,2.857143,2.166667,2.600407,2.509945,2.226481,38,44
1,2010,20001,MTL,D,220,213,18.0,6.0,5.0,7.0,6.0,TOR,MTL,2.166667,0,1,82,44,38,2.8,2.857143,2.166667,2.600407,2.509945,2.226481,38,44
2,2010,20001,MTL,W,220,213,18.0,7.0,5.0,7.0,6.0,TOR,MTL,2.857143,0,1,82,44,38,2.8,2.857143,2.166667,2.600407,2.509945,2.226481,38,44
3,2010,20001,TOR,C,225,259,18.0,4.0,4.0,8.0,6.0,TOR,MTL,3.0,1,0,82,37,45,3.0,2.625,2.166667,2.765244,2.586793,2.332462,45,37
4,2010,20001,TOR,D,225,259,18.0,6.0,4.0,8.0,6.0,TOR,MTL,2.166667,1,0,82,37,45,3.0,2.625,2.166667,2.765244,2.586793,2.332462,45,37


- compute win and loss percent by team. Drop duplicate observations.

In [69]:
dw2 = dw2[['Season', 'TeamCode', 'GP', 'L', 'W', 'GF', 'GA', 'MeanC', 'MeanW', 'MeanD']]
dw2 = dw2.drop_duplicates(['Season', 'TeamCode'])
dw2['WinPc'] = dw2['W']/ dw2['GP']
dw2['LossPc'] = dw2['L']/ dw2['GP']

dw2 = dw2[['Season', 'TeamCode', 'GP','W', 'L', 'GF', 'GA', 'WinPc', 'LossPc', 'MeanC', 'MeanW', 'MeanD']]
dw2.head()

Unnamed: 0,Season,TeamCode,GP,W,L,GF,GA,WinPc,LossPc,MeanC,MeanW,MeanD
0,2010,MTL,82,44,38,220,213,0.536585,0.463415,2.600407,2.509945,2.226481
3,2010,TOR,82,37,45,225,259,0.45122,0.54878,2.765244,2.586793,2.332462
6,2010,PHI,82,47,35,266,233,0.573171,0.426829,1.72561,2.327575,1.735772
9,2010,PIT,82,49,33,244,203,0.597561,0.402439,2.79126,2.77727,2.022358
12,2010,CAR,82,40,42,239,242,0.487805,0.512195,1.728659,2.502778,1.963415


- rank teams based on win percent, mean centres, wingers and defensemen. 

In [70]:
dw2['RankWin'] = dw2.groupby(['Season'])['WinPc'].rank(ascending=False)
dw2['RankC'] = dw2.groupby(['Season'])['MeanC'].rank(ascending=True)
dw2['RankW'] = dw2.groupby(['Season'])['MeanW'].rank(ascending=True)
dw2['RankD'] = dw2.groupby(['Season'])['MeanD'].rank(ascending=True)
dw2 = dw2.sort_values(['Season', 'RankWin', 'RankC', 'RankW', 'RankD'], ascending=[True, True, True, True, True])
dw2.head(30)

Unnamed: 0,Season,TeamCode,GP,W,L,GF,GA,WinPc,LossPc,MeanC,MeanW,MeanD,RankWin,RankC,RankW,RankD
135,2010,VAN,82,54,28,268,190,0.658537,0.341463,1.801626,2.37079,2.228804,1.0,4.0,9.0,13.0
9,2010,PIT,82,49,33,244,203,0.597561,0.402439,2.79126,2.77727,2.022358,2.0,25.0,23.0,8.0
33,2010,SJ,82,48,34,253,216,0.585366,0.414634,1.583537,2.310414,2.382259,3.5,1.0,6.0,18.0
63,2010,WSH,82,48,34,230,203,0.585366,0.414634,2.379065,2.595901,2.521196,3.5,17.0,18.0,26.0
6,2010,PHI,82,47,35,266,233,0.573171,0.426829,1.72561,2.327575,1.735772,6.0,2.0,7.0,1.0
51,2010,DET,82,47,35,263,241,0.573171,0.426829,2.021748,2.196477,1.813298,6.0,8.0,4.0,2.0
48,2010,ANA,82,47,35,241,237,0.573171,0.426829,2.985163,2.515607,2.223577,6.0,30.0,13.0,11.0
105,2010,TB,82,46,36,252,246,0.560976,0.439024,2.131504,2.360772,2.506678,9.0,11.0,8.0,24.0
66,2010,BOS,82,46,36,250,200,0.560976,0.439024,2.146748,1.7214,1.989837,9.0,13.0,1.0,6.0
132,2010,LA,82,46,36,227,207,0.560976,0.439024,2.63252,2.578736,2.24158,9.0,21.0,16.0,15.0


In [71]:
dw2.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/full_season_team_roster_ranking_centers_wingers_defensemen.csv', index='False', sep=',')
#dw2.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/full_season_team_roster_ranking_centers_wingers_defensemen.csv', index='False', sep=',')

### keep games that have 12 forwards and 6 defensemen per team!!

In [72]:
dwfd = dw.copy()
dwfd.shape

(44262, 17)

In [73]:
dwfd = dwfd.groupby(['Season', 'GameNumber']).filter(lambda x: ((x['RosterCount'] == 18) & (x['DCount'] == 6)).all())
dwfd.shape

(31644, 17)

In [74]:
dw3 = dwfd.copy()
dw3 = dw3.groupby(['Season', 'GameNumber', 'TeamCode', 'PlayerPosition', 'GF', 'GA', 'RosterCount', 'PositionCount', 'CCount', 'WCount', 'DCount', 'WinTeam', 'LossTeam'], as_index=False)['Rank'].mean()
dw3.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerPosition,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank
0,2010,20001,MTL,C,2,3,18.0,5.0,5.0,7.0,6.0,TOR,MTL,2.8
1,2010,20001,MTL,D,2,3,18.0,6.0,5.0,7.0,6.0,TOR,MTL,2.166667
2,2010,20001,MTL,W,2,3,18.0,7.0,5.0,7.0,6.0,TOR,MTL,2.857143
3,2010,20001,TOR,C,3,2,18.0,4.0,4.0,8.0,6.0,TOR,MTL,3.0
4,2010,20001,TOR,D,3,2,18.0,6.0,4.0,8.0,6.0,TOR,MTL,2.166667


In [75]:
dw3.shape

(5274, 14)

- create columns for team win and team loss. 

In [76]:
dw3['TeamWin'] =  dw3.apply(lambda x: 1 if x['TeamCode']==x['WinTeam'] else 0, 1)
dw3['TeamLos'] =  dw3.apply(lambda x: 1 if x['TeamCode']==x['LossTeam'] else 0, 1)

- display games played, games won, games loss, goals for and goals against by team for the season.

In [77]:
dw3['GP'] = dw3.groupby(['Season', 'PlayerPosition', 'TeamCode'])['GameNumber'].transform('count')
dw3['GW'] = dw3.groupby(['Season', 'PlayerPosition', 'TeamCode'])['TeamWin'].transform('sum')
dw3['GL'] = dw3.groupby(['Season', 'PlayerPosition', 'TeamCode'])['TeamLos'].transform('sum')
dw3['GF'] = dw3.groupby(['Season', 'PlayerPosition', 'TeamCode'])['GF'].transform('sum')
dw3['GA'] = dw3.groupby(['Season', 'PlayerPosition', 'TeamCode'])['GA'].transform('sum')

- create columns with the mean ranking for forward and defenseman by team per game.

In [78]:
dw3['RankC'] = dw3.apply(lambda x: x['Rank'] if x['PlayerPosition']=='C' else np.NaN, 1)
dw3['RankW'] = dw3.apply(lambda x: x['Rank'] if x['PlayerPosition']=='W' else np.NaN, 1)
dw3['RankD'] = dw3.apply(lambda x: x['Rank'] if x['PlayerPosition']=='D' else np.NaN, 1)
dw3['RankC'] = dw3.groupby(['Season','GameNumber', 'TeamCode'])['RankC'].apply(lambda x: x.ffill().bfill())
dw3['RankW'] = dw3.groupby(['Season','GameNumber', 'TeamCode'])['RankW'].apply(lambda x: x.ffill().bfill())
dw3['RankD'] = dw3.groupby(['Season','GameNumber', 'TeamCode'])['RankD'].apply(lambda x: x.ffill().bfill())

- compute the mean per position by team for the season.

In [79]:
dw3['MeanC']= dw3.groupby(['Season', 'TeamCode'])['RankC'].transform('mean')
dw3['MeanW']= dw3.groupby(['Season', 'TeamCode'])['RankW'].transform('mean')
dw3['MeanD']= dw3.groupby(['Season', 'TeamCode'])['RankD'].transform('mean')

- display the quantity of wins and losses per team ( roster of 12 forwards and 6 defensemen)

In [80]:
dw3['L'] = dw3.apply(lambda x: x['GL'] if x['TeamCode']== x['LossTeam'] else (x['GP'] - x['GW']), 1)
dw3['W'] = dw3.apply(lambda x: x['GW'] if x['TeamCode']== x['WinTeam'] else (x['GP'] - x['GL']), 1)

- compute win and loss percent by team. Drop duplicate observations.

In [81]:
dw3 = dw3[['Season', 'TeamCode', 'GP', 'L', 'W', 'GF', 'GA', 'MeanC', 'MeanW', 'MeanD']]
dw3 = dw3.drop_duplicates(['Season', 'TeamCode'])
dw3['WinPc'] = dw3['W']/ dw3['GP']
dw3['LossPc'] = dw3['L']/ dw3['GP']

dw3 = dw3[['Season', 'TeamCode', 'GP','W', 'L', 'GF', 'GA', 'WinPc', 'LossPc', 'MeanC', 'MeanW', 'MeanD']]

- rank teams based on win percent, mean forwards and mean defensemen. 

In [82]:
dw3['RankWin'] = dw3.groupby(['Season'])['WinPc'].rank(ascending=False)
dw3['RankC'] = dw3.groupby(['Season'])['MeanC'].rank(ascending=True)
dw3['RankW'] = dw3.groupby(['Season'])['MeanW'].rank(ascending=True)
dw3['RankD'] = dw3.groupby(['Season'])['MeanD'].rank(ascending=True)
dw3 = dw3.sort_values(['Season', 'RankWin', 'RankC', 'RankW', 'RankD'], ascending=[True, True, True, True, True])
dw3.head(30)

Unnamed: 0,Season,TeamCode,GP,W,L,GF,GA,WinPc,LossPc,MeanC,MeanW,MeanD,RankWin,RankC,RankW,RankD
297,2010,VAN,58,40,18,198,137,0.689655,0.310345,1.825862,2.377292,2.221264,1.0,4.0,9.0,15.0
18,2010,CHI,47,28,19,157,127,0.595745,0.404255,2.097872,1.87272,2.148936,2.0,9.0,2.0,10.0
6,2010,PHI,66,39,27,219,188,0.590909,0.409091,1.719697,2.323653,1.739899,3.0,1.0,6.0,1.0
45,2010,DET,61,36,25,197,181,0.590164,0.409836,2.064754,2.210122,1.806011,4.0,8.0,4.0,2.0
42,2010,ANA,60,35,25,169,168,0.583333,0.416667,3.0075,2.537434,2.208333,5.0,30.0,13.0,12.0
87,2010,TB,35,20,15,112,109,0.571429,0.428571,2.182381,2.361395,2.485714,6.5,13.0,7.0,24.0
9,2010,PIT,70,40,30,200,176,0.571429,0.428571,2.797857,2.779337,2.019048,6.5,26.0,24.0,8.0
267,2010,PHX,57,32,25,173,157,0.561404,0.438596,2.769006,2.273538,2.0,8.0,23.0,5.0,7.0
282,2010,BOS,66,37,29,200,163,0.560606,0.439394,2.15101,1.717833,1.967172,9.0,12.0,1.0,5.0
24,2010,CGY,67,37,30,213,191,0.552239,0.447761,1.874876,2.366945,1.91791,10.0,7.0,8.0,3.0


In [141]:
dw3.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/season_roster_ranking_forwards_defensemen.csv', index='False', sep=',')
#dw3.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/season_roster_ranking_forwards_defensemen.csv', index='False', sep=',')

### keep games that have 4 C, 8 W  and 6 D per team!!!!

In [83]:
du = dw.groupby(['Season', 'GameNumber']).filter(lambda x: ((x['CCount'] == 4) & (x['WCount'] == 8) & (x['DCount'] == 6)).all())

In [84]:
du.shape

(10116, 17)

- create a new dataset using team roster player rank

In [85]:
dv = du.copy()

In [86]:
dv = dv.groupby(['Season', 'GameNumber', 'TeamCode', 'PlayerPosition', 'GF', 'GA', 'RosterCount', 'PositionCount', 'CCount', 'WCount', 'DCount', 'WinTeam', 'LossTeam'], as_index=False)['Rank'].mean()
dv.head(10)

Unnamed: 0,Season,GameNumber,TeamCode,PlayerPosition,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank
0,2010,20005,CGY,C,0,4,18.0,4.0,4.0,8.0,6.0,EDM,CGY,2.25
1,2010,20005,CGY,D,0,4,18.0,6.0,4.0,8.0,6.0,EDM,CGY,2.333333
2,2010,20005,CGY,W,0,4,18.0,8.0,4.0,8.0,6.0,EDM,CGY,2.5
3,2010,20005,EDM,C,4,0,18.0,4.0,4.0,8.0,6.0,EDM,CGY,2.75
4,2010,20005,EDM,D,4,0,18.0,6.0,4.0,8.0,6.0,EDM,CGY,2.5
5,2010,20005,EDM,W,4,0,18.0,8.0,4.0,8.0,6.0,EDM,CGY,2.5
6,2010,20006,CBJ,C,2,3,18.0,4.0,4.0,8.0,6.0,SJ,CBJ,2.0
7,2010,20006,CBJ,D,2,3,18.0,6.0,4.0,8.0,6.0,SJ,CBJ,2.666667
8,2010,20006,CBJ,W,2,3,18.0,8.0,4.0,8.0,6.0,SJ,CBJ,2.625
9,2010,20006,SJ,C,3,2,18.0,4.0,4.0,8.0,6.0,SJ,CBJ,1.75


In [87]:
dv.shape

(1686, 14)

- create columns for team win and team loss. 

In [88]:
dv['TeamWin'] =  dv.apply(lambda x: 1 if x['TeamCode']==x['WinTeam'] else 0, 1)
dv['TeamLos'] =  dv.apply(lambda x: 1 if x['TeamCode']==x['LossTeam'] else 0, 1)
dv.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerPosition,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos
0,2010,20005,CGY,C,0,4,18.0,4.0,4.0,8.0,6.0,EDM,CGY,2.25,0,1
1,2010,20005,CGY,D,0,4,18.0,6.0,4.0,8.0,6.0,EDM,CGY,2.333333,0,1
2,2010,20005,CGY,W,0,4,18.0,8.0,4.0,8.0,6.0,EDM,CGY,2.5,0,1
3,2010,20005,EDM,C,4,0,18.0,4.0,4.0,8.0,6.0,EDM,CGY,2.75,1,0
4,2010,20005,EDM,D,4,0,18.0,6.0,4.0,8.0,6.0,EDM,CGY,2.5,1,0


- display games played, games won, games loss, goals for and goals against by team for the season.

In [89]:
dv['GP'] = dv.groupby(['Season', 'PlayerPosition', 'TeamCode'])['GameNumber'].transform('count')
dv['GW'] = dv.groupby(['Season', 'PlayerPosition', 'TeamCode'])['TeamWin'].transform('sum')
dv['GL'] = dv.groupby(['Season', 'PlayerPosition', 'TeamCode'])['TeamLos'].transform('sum')
dv['GF'] = dv.groupby(['Season', 'PlayerPosition', 'TeamCode'])['GF'].transform('sum')
dv['GA'] = dv.groupby(['Season', 'PlayerPosition', 'TeamCode'])['GA'].transform('sum')
dv.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerPosition,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL
0,2010,20005,CGY,C,75,75,18.0,4.0,4.0,8.0,6.0,EDM,CGY,2.25,0,1,23,11,12
1,2010,20005,CGY,D,75,75,18.0,6.0,4.0,8.0,6.0,EDM,CGY,2.333333,0,1,23,11,12
2,2010,20005,CGY,W,75,75,18.0,8.0,4.0,8.0,6.0,EDM,CGY,2.5,0,1,23,11,12
3,2010,20005,EDM,C,67,88,18.0,4.0,4.0,8.0,6.0,EDM,CGY,2.75,1,0,26,9,17
4,2010,20005,EDM,D,67,88,18.0,6.0,4.0,8.0,6.0,EDM,CGY,2.5,1,0,26,9,17


- create columns with the mean ranking for forward and defenseman by team per game.

In [90]:
dv['RankC'] = dv.apply(lambda x: x['Rank'] if x['PlayerPosition']=='C' else np.NaN, 1)
dv['RankW'] = dv.apply(lambda x: x['Rank'] if x['PlayerPosition']=='W' else np.NaN, 1)
dv['RankD'] = dv.apply(lambda x: x['Rank'] if x['PlayerPosition']=='D' else np.NaN, 1)
dv['RankC'] = dv.groupby(['Season','GameNumber', 'TeamCode'])['RankC'].apply(lambda x: x.ffill().bfill())
dv['RankW'] = dv.groupby(['Season','GameNumber', 'TeamCode'])['RankW'].apply(lambda x: x.ffill().bfill())
dv['RankD'] = dv.groupby(['Season','GameNumber', 'TeamCode'])['RankD'].apply(lambda x: x.ffill().bfill())
dv.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerPosition,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL,RankC,RankW,RankD
0,2010,20005,CGY,C,75,75,18.0,4.0,4.0,8.0,6.0,EDM,CGY,2.25,0,1,23,11,12,2.25,2.5,2.333333
1,2010,20005,CGY,D,75,75,18.0,6.0,4.0,8.0,6.0,EDM,CGY,2.333333,0,1,23,11,12,2.25,2.5,2.333333
2,2010,20005,CGY,W,75,75,18.0,8.0,4.0,8.0,6.0,EDM,CGY,2.5,0,1,23,11,12,2.25,2.5,2.333333
3,2010,20005,EDM,C,67,88,18.0,4.0,4.0,8.0,6.0,EDM,CGY,2.75,1,0,26,9,17,2.75,2.5,2.5
4,2010,20005,EDM,D,67,88,18.0,6.0,4.0,8.0,6.0,EDM,CGY,2.5,1,0,26,9,17,2.75,2.5,2.5


- compute the mean per position by team for the season.

In [91]:
dv['MeanC']= dv.groupby(['Season', 'TeamCode'])['RankC'].transform('mean')
dv['MeanW']= dv.groupby(['Season', 'TeamCode'])['RankW'].transform('mean')
dv['MeanD']= dv.groupby(['Season', 'TeamCode'])['RankD'].transform('mean')
dv.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerPosition,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL,RankC,RankW,RankD,MeanC,MeanW,MeanD
0,2010,20005,CGY,C,75,75,18.0,4.0,4.0,8.0,6.0,EDM,CGY,2.25,0,1,23,11,12,2.25,2.5,2.333333,1.923913,2.336957,1.891304
1,2010,20005,CGY,D,75,75,18.0,6.0,4.0,8.0,6.0,EDM,CGY,2.333333,0,1,23,11,12,2.25,2.5,2.333333,1.923913,2.336957,1.891304
2,2010,20005,CGY,W,75,75,18.0,8.0,4.0,8.0,6.0,EDM,CGY,2.5,0,1,23,11,12,2.25,2.5,2.333333,1.923913,2.336957,1.891304
3,2010,20005,EDM,C,67,88,18.0,4.0,4.0,8.0,6.0,EDM,CGY,2.75,1,0,26,9,17,2.75,2.5,2.5,2.903846,2.711538,2.564103
4,2010,20005,EDM,D,67,88,18.0,6.0,4.0,8.0,6.0,EDM,CGY,2.5,1,0,26,9,17,2.75,2.5,2.5,2.903846,2.711538,2.564103


- display the quantity of wins and losses per team ( roster of 12 forwards and 6 defensemen)

In [92]:
dv['L'] = dv.apply(lambda x: x['GL'] if x['TeamCode']== x['LossTeam'] else (x['GP'] - x['GW']), 1)
dv['W'] = dv.apply(lambda x: x['GW'] if x['TeamCode']== x['WinTeam'] else (x['GP'] - x['GL']), 1)
dv.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerPosition,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL,RankC,RankW,RankD,MeanC,MeanW,MeanD,L,W
0,2010,20005,CGY,C,75,75,18.0,4.0,4.0,8.0,6.0,EDM,CGY,2.25,0,1,23,11,12,2.25,2.5,2.333333,1.923913,2.336957,1.891304,12,11
1,2010,20005,CGY,D,75,75,18.0,6.0,4.0,8.0,6.0,EDM,CGY,2.333333,0,1,23,11,12,2.25,2.5,2.333333,1.923913,2.336957,1.891304,12,11
2,2010,20005,CGY,W,75,75,18.0,8.0,4.0,8.0,6.0,EDM,CGY,2.5,0,1,23,11,12,2.25,2.5,2.333333,1.923913,2.336957,1.891304,12,11
3,2010,20005,EDM,C,67,88,18.0,4.0,4.0,8.0,6.0,EDM,CGY,2.75,1,0,26,9,17,2.75,2.5,2.5,2.903846,2.711538,2.564103,17,9
4,2010,20005,EDM,D,67,88,18.0,6.0,4.0,8.0,6.0,EDM,CGY,2.5,1,0,26,9,17,2.75,2.5,2.5,2.903846,2.711538,2.564103,17,9


- compute win and loss percent by team. Drop duplicate observations.

In [93]:
dv = dv[['Season', 'TeamCode', 'GP', 'L', 'W', 'GF', 'GA', 'MeanC', 'MeanW', 'MeanD']]
dv = dv.drop_duplicates(['Season', 'TeamCode'])
dv['WinPc'] = dv['W']/ dv['GP']
dv['LossPc'] = dv['L']/ dv['GP']

dv = dv[['Season', 'TeamCode', 'GP','W', 'L', 'GF', 'GA', 'WinPc', 'LossPc', 'MeanC', 'MeanW', 'MeanD']]
dv.head()

Unnamed: 0,Season,TeamCode,GP,W,L,GF,GA,WinPc,LossPc,MeanC,MeanW,MeanD
0,2010,CGY,23,11,12,75,75,0.478261,0.521739,1.923913,2.336957,1.891304
3,2010,EDM,26,9,17,67,88,0.346154,0.653846,2.903846,2.711538,2.564103
6,2010,CBJ,25,9,16,69,86,0.36,0.64,2.1,2.66,2.406667
9,2010,SJ,17,10,7,61,47,0.588235,0.411765,1.764706,2.404412,2.362745
12,2010,ANA,10,3,7,18,32,0.3,0.7,2.975,2.5875,2.233333


- rank teams based on win percent, mean forwards and mean defensemen. 

In [94]:
dv['RankWin'] = dv.groupby(['Season'])['WinPc'].rank(ascending=False)
dv['RankC'] = dv.groupby(['Season'])['MeanC'].rank(ascending=True)
dv['RankW'] = dv.groupby(['Season'])['MeanW'].rank(ascending=True)
dv['RankD'] = dv.groupby(['Season'])['MeanD'].rank(ascending=True)
dv = dv.sort_values(['Season', 'RankWin', 'RankC', 'RankW', 'RankD'], ascending=[True, True, True, True, True])
dv.head(30)

Unnamed: 0,Season,TeamCode,GP,W,L,GF,GA,WinPc,LossPc,MeanC,MeanW,MeanD,RankWin,RankC,RankW,RankD
39,2010,NSH,16,13,3,55,29,0.8125,0.1875,2.78125,2.546875,2.0,1.0,23.0,14.0,5.5
105,2010,VAN,26,21,5,87,54,0.807692,0.192308,2.009615,2.350962,2.211538,2.0,9.0,8.0,12.0
15,2010,DET,25,18,7,93,63,0.72,0.28,2.0,2.205,1.78,3.0,8.0,5.0,2.0
45,2010,LA,25,17,8,68,54,0.68,0.32,2.59,2.595,2.253333,4.0,20.0,17.0,14.0
18,2010,BUF,18,12,6,62,52,0.666667,0.333333,2.944444,1.923611,2.037037,5.0,27.0,3.0,8.0
51,2010,PHI,31,19,12,102,82,0.612903,0.387097,1.774194,2.294355,1.725806,6.0,4.0,6.0,1.0
66,2010,CHI,20,12,8,68,54,0.6,0.4,1.9875,1.91875,2.158333,7.5,7.0,2.0,10.0
573,2010,OTT,10,6,4,22,22,0.6,0.4,2.725,2.85,2.566667,7.5,22.0,23.0,26.0
21,2010,NYR,22,13,9,72,56,0.590909,0.409091,1.75,2.471591,2.409091,9.0,2.0,11.0,21.0
9,2010,SJ,17,10,7,61,47,0.588235,0.411765,1.764706,2.404412,2.362745,10.0,3.0,10.0,16.0


In [95]:
dv.shape

(30, 16)

In [96]:
dv.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/season_team_roster_ranking_centers_wingers_defensemen.csv', index='False', sep=',')
#dv.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/season_team_roster_ranking_centers_wingers_defensemen.csv', index='False', sep=',')

## season_game_team_roster_data

- use the team roster player rank dataset (dw) to display the roster quality by team per game

In [97]:
dx1 = du.copy()

In [98]:
dx2 = dv.copy()
dx2 = dx2[['Season', 'TeamCode', 'MeanC', 'MeanW', 'MeanD']]

In [99]:
dx = pd.merge(dx1, dx2, on=['Season', 'TeamCode'], how='left')
dx.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerNumber,PlayerName,PlayerPosition,Rank,GF,GA,GD,WinTeam,LossTeam,RosterCount,PositionCount,CCount,WCount,DCount,MeanC,MeanW,MeanD
0,2010,20005,CGY,13.0,JOKINEN,C,1.0,0,4,4,EDM,CGY,18.0,4.0,4.0,8.0,6.0,1.923913,2.336957,1.891304
1,2010,20005,CGY,12.0,IGINLA,W,1.0,0,4,4,EDM,CGY,18.0,8.0,4.0,8.0,6.0,1.923913,2.336957,1.891304
2,2010,20005,CGY,40.0,TANGUAY,W,1.0,0,4,4,EDM,CGY,18.0,8.0,4.0,8.0,6.0,1.923913,2.336957,1.891304
3,2010,20005,CGY,3.0,CARSON,D,3.0,0,4,4,EDM,CGY,18.0,6.0,4.0,8.0,6.0,1.923913,2.336957,1.891304
4,2010,20005,CGY,28.0,REGEHR,D,2.0,0,4,4,EDM,CGY,18.0,6.0,4.0,8.0,6.0,1.923913,2.336957,1.891304


In [100]:
dx = dx[['Season', 'GameNumber', 'TeamCode', 'RosterCount', 'PlayerNumber', 'PlayerPosition', 'Rank', 'MeanC', 'MeanW', 'MeanD']]
dx['RankC'] = dx.apply(lambda x: x['Rank'] if x['PlayerPosition']=='C' else np.NaN, 1)
dx['RankW'] = dx.apply(lambda x: x['Rank'] if x['PlayerPosition']=='W' else np.NaN, 1)
dx['RankD'] = dx.apply(lambda x: x['Rank'] if x['PlayerPosition']=='D' else np.NaN, 1)
dx['RankC'] = dx.groupby(['Season','GameNumber', 'TeamCode'])['RankC'].apply(lambda x: x.ffill().bfill())
dx['RankW'] = dx.groupby(['Season','GameNumber', 'TeamCode'])['RankW'].apply(lambda x: x.ffill().bfill())
dx['RankD'] = dx.groupby(['Season','GameNumber', 'TeamCode'])['RankD'].apply(lambda x: x.ffill().bfill())
dx['GMeanC']= dx.groupby(['Season', 'TeamCode'])['RankC'].transform('mean')
dx['GMeanW']= dx.groupby(['Season', 'GameNumber', 'TeamCode'])['RankW'].transform('mean')
dx['GMeanD']= dx.groupby(['Season', 'GameNumber', 'TeamCode'])['RankD'].transform('mean')
dx = dx[['Season', 'GameNumber', 'TeamCode', 'RosterCount', 'PlayerNumber', 'PlayerPosition', 'Rank', 'MeanC', 'MeanW', 'MeanD', 'GMeanC', 'GMeanW', 'GMeanD']]
dx = dx.rename(columns={'PlayerPosition': 'Position'})
dx.head()

Unnamed: 0,Season,GameNumber,TeamCode,RosterCount,PlayerNumber,Position,Rank,MeanC,MeanW,MeanD,GMeanC,GMeanW,GMeanD
0,2010,20005,CGY,18.0,13.0,C,1.0,1.923913,2.336957,1.891304,1.855072,2.222222,2.611111
1,2010,20005,CGY,18.0,12.0,W,1.0,1.923913,2.336957,1.891304,1.855072,2.222222,2.611111
2,2010,20005,CGY,18.0,40.0,W,1.0,1.923913,2.336957,1.891304,1.855072,2.222222,2.611111
3,2010,20005,CGY,18.0,3.0,D,3.0,1.923913,2.336957,1.891304,1.855072,2.222222,2.611111
4,2010,20005,CGY,18.0,28.0,D,2.0,1.923913,2.336957,1.891304,1.855072,2.222222,2.611111


- group by season, game number, team and player to count the occurance of each player per game and sum up the observations of players. There should be 18 players per team and 36 per game for the dataset to be correct.

In [101]:
dx['playercount'] = dx.groupby(['Season', 'GameNumber', 'TeamCode', 'PlayerNumber',])['PlayerNumber'].transform('count')

In [102]:
dx['rosterposition'] = dx.groupby(['Season', 'GameNumber', 'TeamCode', 'Position', 'Rank'])['playercount'].transform('sum')

#### pivot table

- the next step is to group players by gamenumber, teamcode, position and rank, to display the quality of players each team has per position. **Pivot table** by player position and rank using roster position values. Game number and team are the indexes. We want to join the levels to generate columns by roster position and rank. 

In [103]:
dx = pd.pivot_table(dx, index=['Season', 'GameNumber', 'TeamCode', 'RosterCount', 'MeanC', 'MeanW', 'MeanD', 'GMeanC', 'GMeanW', 'GMeanD'], columns=['Position', 'Rank'], values=['rosterposition'])
dx = dx.reset_index()
dx.columns = ['_'.join(str(s).strip() for s in col if s) for col in dx.columns]
dx.reset_index()
dx = dx.fillna(0)
dx = dx.rename(columns={'rosterposition_C_1.0': 'C1', 'rosterposition_C_2.0': 'C2', 'rosterposition_C_3.0': 'C3', 'rosterposition_C_4.0': 'C4', 'rosterposition_W_1.0': 'W1', 'rosterposition_W_2.0': 'W2', 'rosterposition_W_3.0': 'W3', 'rosterposition_W_4.0': 'W4', 'rosterposition_D_1.0': 'D1', 'rosterposition_D_2.0': 'D2', 'rosterposition_D_3.0': 'D3' })
dx.head(10)

Unnamed: 0,Season,GameNumber,TeamCode,RosterCount,MeanC,MeanW,MeanD,GMeanC,GMeanW,GMeanD,C1,C2,C3,C4,D1,D2,D3,W1,W2,W3,W4
0,2010,20005,CGY,18.0,1.923913,2.336957,1.891304,1.855072,2.222222,2.611111,1.0,2.0,0.0,1.0,1.0,2.0,3.0,2.0,2.0,2.0,2.0
1,2010,20005,EDM,18.0,2.903846,2.711538,2.564103,2.816239,2.111111,2.333333,0.0,2.0,1.0,1.0,0.0,3.0,3.0,1.0,4.0,1.0,2.0
2,2010,20006,CBJ,18.0,2.1,2.66,2.406667,2.304444,2.722222,2.555556,2.0,1.0,0.0,1.0,0.0,2.0,4.0,1.0,3.0,2.0,2.0
3,2010,20006,SJ,18.0,1.764706,2.404412,2.362745,1.669935,2.333333,2.111111,3.0,0.0,0.0,1.0,1.0,2.0,3.0,3.0,1.0,1.0,3.0
4,2010,20009,ANA,18.0,2.975,2.5875,2.233333,2.627778,2.222222,2.444444,1.0,0.0,1.0,2.0,2.0,0.0,4.0,3.0,1.0,1.0,3.0
5,2010,20009,DET,18.0,2.0,2.205,1.78,1.877778,2.111111,1.722222,1.0,3.0,0.0,0.0,3.0,2.0,1.0,2.0,3.0,3.0,0.0
6,2010,20013,BUF,18.0,2.944444,1.923611,2.037037,2.907407,2.111111,2.333333,1.0,0.0,3.0,0.0,2.0,2.0,2.0,4.0,1.0,1.0,2.0
7,2010,20013,NYR,18.0,1.75,2.471591,2.409091,1.815657,2.444444,2.444444,2.0,1.0,1.0,0.0,0.0,4.0,2.0,1.0,2.0,4.0,1.0
8,2010,20015,DAL,18.0,2.298387,2.858871,2.575269,2.213262,2.5,2.722222,2.0,0.0,1.0,1.0,0.0,2.0,4.0,2.0,1.0,1.0,4.0
9,2010,20015,NYI,18.0,2.25,2.875,2.393333,2.186667,2.833333,2.166667,1.0,2.0,0.0,1.0,1.0,2.0,3.0,0.0,3.0,1.0,4.0


In [104]:
dx.shape

(562, 21)

In [105]:
dx.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/season_game_team_roster_centers_wingers_defensemen.csv', index='False', sep=',')
#dx.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/season_game_team_roster_centers_wingers_defensemen.csv', index='False', sep=',')

- create a dataset that will display the mean of forwards and defencemen by season per team

- drop duplicates by season and team

In [109]:
dz = dx.copy()
dz = dz.drop_duplicates(['Season', 'TeamCode'])
dz.head()

Unnamed: 0,Season,GameNumber,TeamCode,RosterCount,MeanC,MeanW,MeanD,GMeanC,GMeanW,GMeanD,C1,C2,C3,C4,D1,D2,D3,W1,W2,W3,W4
0,2010,20005,CGY,18.0,1.923913,2.336957,1.891304,1.855072,2.222222,2.611111,1.0,2.0,0.0,1.0,1.0,2.0,3.0,2.0,2.0,2.0,2.0
1,2010,20005,EDM,18.0,2.903846,2.711538,2.564103,2.816239,2.111111,2.333333,0.0,2.0,1.0,1.0,0.0,3.0,3.0,1.0,4.0,1.0,2.0
2,2010,20006,CBJ,18.0,2.1,2.66,2.406667,2.304444,2.722222,2.555556,2.0,1.0,0.0,1.0,0.0,2.0,4.0,1.0,3.0,2.0,2.0
3,2010,20006,SJ,18.0,1.764706,2.404412,2.362745,1.669935,2.333333,2.111111,3.0,0.0,0.0,1.0,1.0,2.0,3.0,3.0,1.0,1.0,3.0
4,2010,20009,ANA,18.0,2.975,2.5875,2.233333,2.627778,2.222222,2.444444,1.0,0.0,1.0,2.0,2.0,0.0,4.0,3.0,1.0,1.0,3.0


In [110]:
dz.shape

(30, 21)

In [398]:
#dz.to_csv('season_team.csv', index='False', sep=',')

In [111]:
dz.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/season_team_centers_wingers_defensemen.csv', index='False', sep=',')
#dz.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/season_team_centers_wingers_defensemen.csv', index='False', sep=',')

## game level data

- create an index variable to deterime if a team is considered visitor or home for a given game. The column will be named "A". The 1st observation per game is the visitor team and will be assigned a value of 1. The 2nd and final observation per game, is the home team, so we fill in NaN with a value of 2 (home team)

In [112]:
dy = dx.copy()

In [113]:
dy.loc[dy.groupby('GameNumber',as_index=False).head(1).index,'A'] = 1
dy = dy.fillna(2)

In [114]:
dy.head()

Unnamed: 0,Season,GameNumber,TeamCode,RosterCount,MeanC,MeanW,MeanD,GMeanC,GMeanW,GMeanD,C1,C2,C3,C4,D1,D2,D3,W1,W2,W3,W4,A
0,2010,20005,CGY,18.0,1.923913,2.336957,1.891304,1.855072,2.222222,2.611111,1.0,2.0,0.0,1.0,1.0,2.0,3.0,2.0,2.0,2.0,2.0,1.0
1,2010,20005,EDM,18.0,2.903846,2.711538,2.564103,2.816239,2.111111,2.333333,0.0,2.0,1.0,1.0,0.0,3.0,3.0,1.0,4.0,1.0,2.0,2.0
2,2010,20006,CBJ,18.0,2.1,2.66,2.406667,2.304444,2.722222,2.555556,2.0,1.0,0.0,1.0,0.0,2.0,4.0,1.0,3.0,2.0,2.0,1.0
3,2010,20006,SJ,18.0,1.764706,2.404412,2.362745,1.669935,2.333333,2.111111,3.0,0.0,0.0,1.0,1.0,2.0,3.0,3.0,1.0,1.0,3.0,2.0
4,2010,20009,ANA,18.0,2.975,2.5875,2.233333,2.627778,2.222222,2.444444,1.0,0.0,1.0,2.0,2.0,0.0,4.0,3.0,1.0,1.0,3.0,1.0


- **pivot table using game number as index by whether a team is visitor (1) or home (2)**. The table will display the quality of each player per position and team. The next step is to join columns by team and player quality value. We will have for each team 10 columns ( 5 positions x 2 type of player quality). We will rename the columns as following: VC1 shows the amount of elite centers for the visitor team, HC1 displays the amount of elite centers for the home team etc. We rename the columns and sort them based on team, position and quality. 

In [115]:
dy = pd.pivot_table(dy, index=['Season', 'GameNumber'], columns=['A'], values=['C1', 'C2', 'C3', 'C4', 'W1', 'W2', 'W3', 'W4', 'D1', 'D2', 'D3'])
dy = dy.reset_index()
dy.columns = ['_'.join(str(s).strip() for s in col if s) for col in dy.columns]
dy = dy.reset_index()
dy = dy.rename(columns={'C1_1.0': 'VC1', 'C2_1.0': 'VC2', 'C3_1.0': 'VC3', 'C4_1.0': 'VC4', 'W1_1.0': 'VW1', 'W2_1.0': 'VW2', 'W3_1.0': 'VW3', 'W4_1.0': 'VW4', 'D1_1.0': 'VD1', 'D2_1.0': 'VD2', 'D3_1.0': 'VD3', 'C1_2.0': 'HC1', 'C2_2.0': 'HC2', 'C3_2.0': 'HC3', 'C4_2.0': 'HC4', 'W1_2.0': 'HW1', 'W2_2.0': 'HW2', 'W3_2.0': 'HW3', 'W4_2.0': 'HW4', 'D1_2.0': 'HD1', 'D2_2.0': 'HD2', 'D3_2.0': 'HD3'})
dy = dy[['Season', 'GameNumber', 'VC1', 'VC2', 'VC3', 'VC4', 'VW1', 'VW2', 'VW3', 'VW4', 'VD1', 'VD2', 'VD3', 'HC1', 'HC2', 'HC3', 'HC4', 'HW1', 'HW2', 'HW3', 'HW4', 'HD1', 'HD2', 'HD3']]
dy.sort_values(['Season', 'GameNumber'], ascending=[True, True], inplace=True)
dy.head()

Unnamed: 0,Season,GameNumber,VC1,VC2,VC3,VC4,VW1,VW2,VW3,VW4,VD1,VD2,VD3,HC1,HC2,HC3,HC4,HW1,HW2,HW3,HW4,HD1,HD2,HD3
0,2010,20005,1.0,2.0,0.0,1.0,2.0,2.0,2.0,2.0,1.0,2.0,3.0,0.0,2.0,1.0,1.0,1.0,4.0,1.0,2.0,0.0,3.0,3.0
1,2010,20006,2.0,1.0,0.0,1.0,1.0,3.0,2.0,2.0,0.0,2.0,4.0,3.0,0.0,0.0,1.0,3.0,1.0,1.0,3.0,1.0,2.0,3.0
2,2010,20009,1.0,0.0,1.0,2.0,3.0,1.0,1.0,3.0,2.0,0.0,4.0,1.0,3.0,0.0,0.0,2.0,3.0,3.0,0.0,3.0,2.0,1.0
3,2010,20013,1.0,0.0,3.0,0.0,4.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,0.0,1.0,2.0,4.0,1.0,0.0,4.0,2.0
4,2010,20015,2.0,0.0,1.0,1.0,2.0,1.0,1.0,4.0,0.0,2.0,4.0,1.0,2.0,0.0,1.0,0.0,3.0,1.0,4.0,1.0,2.0,3.0


In [116]:
dy.shape

(281, 24)

In [405]:
#dy.to_csv('season_game_roster.csv', index='False', sep=',')

In [117]:
dy.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/season_game_roster_center_winger_defensemen.csv', index='False', sep=',')
#dy.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/season_game_roster_center_winger_defensemen.csv', index='False', sep=',')

# Roster Analysis

## season_level_analysis

#### $WinPc = \beta_{0} + \beta_{1}MeanC_{1} + \beta_{2}MeanC_{2}+ \beta_{3}MeanC_{3} + \beta_{4}MeanC_{4} + \beta_{5}MeanW_{1} + \beta_{6}MeanW_{2}+ \beta_{7}MeanW_{3} + \beta_{8}MeanW_{4} + \beta_{9}MeanD_{1} + \beta_{10}MeanD_{2}+ \beta_{11}MeanD_{3} + e_{s}$

#### games with 4 centres, 8 wingers and 6 defensemen

- merge season_team dataset (dz) and season_team_roster_ranking (dv) for roster analysis at the season level. Use **ds** as the merging dataset.

In [140]:
ds = pd.merge(dv, dz, on=['Season', 'TeamCode', 'MeanC', 'MeanW', 'MeanD'], how='left')
ds.sort_values(['MeanC'], ascending=[True], inplace=True)
ds.head(30)

Unnamed: 0,Season,TeamCode,GP,W,L,GF,GA,WinPc,LossPc,MeanC,MeanW,MeanD,RankWin,RankC,RankW,RankD,GameNumber,RosterCount,GMeanC,GMeanW,GMeanD,C1,C2,C3,C4,D1,D2,D3,W1,W2,W3,W4
18,2010,CAR,16,7,9,39,41,0.4375,0.5625,1.71875,2.5,2.020833,19.0,1.0,13.0,7.0,20309,18.0,1.677083,2.277778,2.0,2.0,1.0,1.0,0.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
8,2010,NYR,22,13,9,72,56,0.590909,0.409091,1.75,2.471591,2.409091,9.0,2.0,11.0,21.0,20013,18.0,1.815657,2.444444,2.444444,2.0,1.0,1.0,0.0,0.0,4.0,2.0,1.0,2.0,4.0,1.0
9,2010,SJ,17,10,7,61,47,0.588235,0.411765,1.764706,2.404412,2.362745,10.0,3.0,10.0,16.0,20006,18.0,1.669935,2.333333,2.111111,3.0,0.0,0.0,1.0,1.0,2.0,3.0,3.0,1.0,1.0,3.0
5,2010,PHI,31,19,12,102,82,0.612903,0.387097,1.774194,2.294355,1.725806,6.0,4.0,6.0,1.0,20031,18.0,1.543011,2.944444,1.555556,3.0,0.0,0.0,1.0,2.0,4.0,0.0,3.0,2.0,0.0,3.0
27,2010,COL,30,8,22,77,106,0.266667,0.733333,1.875,2.945833,2.455556,28.0,5.0,29.0,22.0,20031,18.0,1.727778,3.111111,2.888889,2.0,1.0,1.0,0.0,1.0,0.0,5.0,0.0,3.0,3.0,2.0
15,2010,CGY,23,11,12,75,75,0.478261,0.521739,1.923913,2.336957,1.891304,16.0,6.0,7.0,3.0,20005,18.0,1.855072,2.222222,2.611111,1.0,2.0,0.0,1.0,1.0,2.0,3.0,2.0,2.0,2.0,2.0
6,2010,CHI,20,12,8,68,54,0.6,0.4,1.9875,1.91875,2.158333,7.5,7.0,2.0,10.0,20040,18.0,1.875,1.888889,2.444444,2.0,1.0,1.0,0.0,2.0,0.0,4.0,2.0,5.0,1.0,0.0
2,2010,DET,25,18,7,93,63,0.72,0.28,2.0,2.205,1.78,3.0,8.0,5.0,2.0,20009,18.0,1.877778,2.111111,1.722222,1.0,3.0,0.0,0.0,3.0,2.0,1.0,2.0,3.0,3.0,0.0
1,2010,VAN,26,21,5,87,54,0.807692,0.192308,2.009615,2.350962,2.211538,2.0,9.0,8.0,12.0,20076,18.0,1.814103,2.5,1.944444,2.0,1.0,0.0,1.0,2.0,1.0,3.0,2.0,1.0,3.0,2.0
19,2010,BOS,14,6,8,31,36,0.428571,0.571429,2.053571,1.75,1.988095,20.5,10.0,1.0,4.0,20129,18.0,1.948413,1.666667,2.0,2.0,0.0,2.0,0.0,1.0,3.0,2.0,4.0,2.0,2.0,0.0


In [119]:
#ds['c1'] = ds.apply(lambda x: 1 if x['MeanC1'] > x['Mean_C'] else 0, 1)
#ds.sort_values(['c1'], ascending=[False], inplace=True)

- A balanced roster will have one player ranked in each centre position (C1, C2, C3, C4), two wingers ranked on every line (LW1, RW1, LW2,RW2, etc) and two defensemen ranked in all three defensive pairings. The ranking of a balanced roster is 2.5 [(1+2+3+4)/4] centres, 2.5 [(1+1+2+2+3+3+4+4)/8] for wingers and 2 [(1+1+2+2+3+3)/6] for defensemen. Since players are ranked from 1 to 4 for forwards and 1 to 3 for defensemen, 1 being the highest ranked. A team is considered to have an above average team when the **mean of each forward position is smaller than 2.5 and the mean of defensive pairings is smaller than 2**. 

In [136]:
ds['C'] = 2.5 - ds['MeanC']
ds['W'] = 2.5 - ds['MeanW']
ds['D'] = 2 - ds['MeanD']

In [137]:
ds.shape

(30, 34)

### summary analysis

In [138]:
ds.describe()

Unnamed: 0,Season,GP,W,L,GF,GA,WinPc,LossPc,MeanC,MeanW,MeanD,RankWin,RankC,RankW,RankD,GameNumber,RosterCount,GMeanC,GMeanW,GMeanD,C1,C2,C3,C4,D1,D2,D3,W1,W2,W3,W4,C,D
count,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
mean,2010.0,18.733333,-0.041531,9.366667,53.133333,53.133333,0.480995,0.519005,2.355433,2.541531,2.273266,15.5,15.5,15.5,15.5,20119.633333,18.0,2.261073,2.440741,2.261111,1.266667,1.0,1.066667,0.666667,1.166667,1.966667,2.866667,1.766667,2.333333,1.766667,2.133333,0.144567,-0.273266
std,0.0,7.750121,0.33592,4.810071,26.654537,25.429732,0.182702,0.182702,0.42722,0.33592,0.264274,8.80047,8.802429,8.802429,8.802429,196.311414,0.0,0.429319,0.402983,0.306781,0.868345,1.082781,0.944433,0.606478,0.833908,1.129032,1.136642,1.165106,1.24106,1.250747,1.074255,0.42722,0.264274
min,2010.0,4.0,-0.721154,2.0,7.0,10.0,0.0,0.1875,1.71875,1.75,1.725806,1.0,1.0,1.0,1.0,20005.0,18.0,1.543011,1.666667,1.555556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.5,-0.690476
25%,2010.0,13.25,-0.315385,6.25,31.0,33.25,0.387311,0.402273,2.002404,2.363221,2.040033,7.875,8.25,8.25,8.25,20013.5,18.0,1.875694,2.138889,2.055556,1.0,0.0,0.0,0.0,1.0,1.25,2.0,1.0,2.0,1.0,2.0,-0.267188,-0.480556
50%,2010.0,17.0,-0.071094,8.0,58.0,49.0,0.481066,0.518934,2.319648,2.571094,2.31016,15.5,15.5,15.5,15.5,20031.0,18.0,2.217742,2.444444,2.222222,1.0,1.0,1.0,1.0,1.0,2.0,3.0,2.0,2.0,1.0,2.0,0.180352,-0.31016
75%,2010.0,25.0,0.136779,12.0,74.25,72.0,0.597727,0.612689,2.767187,2.815385,2.480556,22.75,22.75,22.75,22.75,20117.75,18.0,2.616944,2.708333,2.444444,2.0,1.75,1.75,1.0,2.0,2.75,4.0,2.75,3.0,2.75,3.0,0.497596,-0.040033
max,2010.0,33.0,0.75,22.0,102.0,106.0,0.8125,1.0,3.0,3.221154,2.690476,30.0,29.5,30.0,30.0,20741.0,18.0,2.952381,3.166667,2.888889,3.0,4.0,3.0,2.0,3.0,4.0,5.0,4.0,5.0,5.0,4.0,0.78125,0.274194


### estimate roster model 

- regress **team win percent** on the mean of players by position and quality (predictor variables). Add a constant to the predictors and use **OLS**. The purpose is to deterimine the impact each roster position has on team winning percent.

In [139]:
y = ds['WinPc']   
X1 = sm.add_constant(ds[['C', 'W', 'D']] )
m1 = sm.OLS(y, X1).fit()
m1.summary()

0,1,2,3
Dep. Variable:,WinPc,R-squared:,0.257
Model:,OLS,Adj. R-squared:,0.172
Method:,Least Squares,F-statistic:,3.003
Date:,"Fri, 09 Mar 2018",Prob (F-statistic):,0.0486
Time:,23:02:58,Log-Likelihood:,13.4
No. Observations:,30,AIC:,-18.8
Df Residuals:,26,BIC:,-13.2
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.5157,0.052,9.884,0.000,0.408 0.623
C,-0.0338,0.075,-0.454,0.654,-0.187 0.119
W,0.2418,0.121,2.006,0.055,-0.006 0.490
D,0.0725,0.155,0.469,0.643,-0.245 0.390

0,1,2,3
Omnibus:,0.669,Durbin-Watson:,0.681
Prob(Omnibus):,0.716,Jarque-Bera (JB):,0.286
Skew:,-0.239,Prob(JB):,0.867
Kurtosis:,3.013,Cond. No.,6.32


## season_game_team_level_analysis

#### $Win = \beta_{0} + \beta_{1}C_{1} + \beta_{2}C_{2} + \beta_{3}C_{3} + \beta_{4}C_{4} + \beta_{5}W_{1} + \beta_{6}W_{2} + \beta_{7}W_{3} + \beta_{8}W_{4} + \beta_{9}D_{1} + \beta_{10}D_{2} + \beta_{11}D_{3} + e_{s,g,t}$

- use season game data (dg) and season game team roster (dx) to conduct season game team level analysis (dt).

In [415]:
dg.head()

Unnamed: 0,Season,GameNumber,VTeamCode,HTeamCode,VGF,HGF,GD,VGA,HGA,WinTeam,LossTeam
0,2010,20001,MTL,TOR,2,3,1,3,2,TOR,MTL
1,2010,20002,PHI,PIT,3,2,-1,2,3,PHI,PIT
2,2010,20003,CAR,MIN,4,3,-1,3,4,CAR,MIN
3,2010,20004,CHI,COL,3,4,1,4,3,COL,CHI
4,2010,20005,CGY,EDM,0,4,4,4,0,EDM,CGY


In [416]:
dt = dx.merge(dg, on=['Season', 'GameNumber'], how='left')
dt.head()

Unnamed: 0,Season,GameNumber,TeamCode,RosterCount,MeanC,MeanW,MeanD,GMeanC,GMeanW,GMeanD,C1,C2,C3,C4,D1,D2,D3,W1,W2,W3,W4,VTeamCode,HTeamCode,VGF,HGF,GD,VGA,HGA,WinTeam,LossTeam
0,2010,20005,CGY,18.0,1.923913,2.097826,2.057971,1.855072,2.277778,2.666667,1.0,2.0,0.0,1.0,0.0,3.0,3.0,3.0,2.0,0.0,3.0,CGY,EDM,0,4,4,4,0,EDM,CGY
1,2010,20005,EDM,18.0,2.653846,2.605769,2.141026,2.634615,2.0,2.0,0.0,2.0,2.0,0.0,0.0,6.0,0.0,1.0,4.0,1.0,2.0,CGY,EDM,0,4,4,4,0,EDM,CGY
2,2010,20006,CBJ,18.0,2.14,2.595,2.373333,2.191111,2.888889,2.333333,1.0,2.0,1.0,0.0,0.0,3.0,3.0,2.0,2.0,1.0,3.0,SJ,CBJ,3,2,-1,2,3,SJ,CBJ
3,2010,20006,SJ,18.0,1.794118,2.470588,2.137255,1.676471,2.277778,1.833333,3.0,0.0,0.0,1.0,1.0,4.0,1.0,3.0,2.0,0.0,3.0,SJ,CBJ,3,2,-1,2,3,SJ,CBJ
4,2010,20009,ANA,18.0,2.6,2.7375,1.933333,2.188889,2.388889,2.111111,1.0,1.0,1.0,1.0,2.0,1.0,3.0,3.0,1.0,0.0,4.0,ANA,DET,0,4,4,4,0,DET,ANA


In [417]:
dt.shape

(562, 30)

- Sum up goals for and against by team per game and find the goal differential (GD) per game. Assign a value of 1 to the team that won the game. 

In [418]:
dt['GD'] = dt.apply(lambda x: (x['HGF'] - x['VGF']) if x['HTeamCode']== x['TeamCode'] else (x['VGF'] - x['HGF']), 1)
dt['Win'] = dt.apply(lambda x: 1 if x['WinTeam']== x['TeamCode'] else 0, 1)
dt['GF'] = dt.apply(lambda x: x['HGF'] if x['HTeamCode']== x['TeamCode'] else x['VGF'], 1)
dt['GA'] = dt.apply(lambda x: x['HGF'] if x['HTeamCode']!= x['TeamCode'] else x['VGF'], 1)
dt.head()

Unnamed: 0,Season,GameNumber,TeamCode,RosterCount,MeanC,MeanW,MeanD,GMeanC,GMeanW,GMeanD,C1,C2,C3,C4,D1,D2,D3,W1,W2,W3,W4,VTeamCode,HTeamCode,VGF,HGF,GD,VGA,HGA,WinTeam,LossTeam,Win,GF,GA
0,2010,20005,CGY,18.0,1.923913,2.097826,2.057971,1.855072,2.277778,2.666667,1.0,2.0,0.0,1.0,0.0,3.0,3.0,3.0,2.0,0.0,3.0,CGY,EDM,0,4,-4,4,0,EDM,CGY,0,0,4
1,2010,20005,EDM,18.0,2.653846,2.605769,2.141026,2.634615,2.0,2.0,0.0,2.0,2.0,0.0,0.0,6.0,0.0,1.0,4.0,1.0,2.0,CGY,EDM,0,4,4,4,0,EDM,CGY,1,4,0
2,2010,20006,CBJ,18.0,2.14,2.595,2.373333,2.191111,2.888889,2.333333,1.0,2.0,1.0,0.0,0.0,3.0,3.0,2.0,2.0,1.0,3.0,SJ,CBJ,3,2,-1,2,3,SJ,CBJ,0,2,3
3,2010,20006,SJ,18.0,1.794118,2.470588,2.137255,1.676471,2.277778,1.833333,3.0,0.0,0.0,1.0,1.0,4.0,1.0,3.0,2.0,0.0,3.0,SJ,CBJ,3,2,1,2,3,SJ,CBJ,1,3,2
4,2010,20009,ANA,18.0,2.6,2.7375,1.933333,2.188889,2.388889,2.111111,1.0,1.0,1.0,1.0,2.0,1.0,3.0,3.0,1.0,0.0,4.0,ANA,DET,0,4,-4,4,0,DET,ANA,0,0,4


In [419]:
dt.shape

(562, 33)

In [420]:
# difference in center position per game
dt['c1'] = dt['C1']/4
dt['c2'] = dt['C2']/4
dt['c3'] = dt['C3']/4
dt['c4'] = dt['C4']/4

# difference in wing position per game
dt['w1'] = dt['W1']/8
dt['w2'] = dt['W2']/8
dt['w3'] = dt['W3']/8
dt['w4'] = dt['W4']/8

# difference in defence position per game
dt['d1'] = dt['D1']/6
dt['d2'] = dt['D2']/6
dt['d3'] = dt['D3']/6

### summary analysis

In [421]:
dt.describe()

Unnamed: 0,Season,GameNumber,RosterCount,MeanC,MeanW,MeanD,GMeanC,GMeanW,GMeanD,C1,C2,C3,C4,D1,D2,D3,W1,W2,W3,W4,VGF,HGF,GD,VGA,HGA,Win,GF,GA,c1,c2,c3,c4,w1,w2,w3,w4,d1,d2,d3
count,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0
mean,2010.0,20630.074733,18.0,2.211299,2.430827,2.119217,2.110815,2.292902,2.100732,1.323843,1.05694,1.069395,0.549822,1.048043,3.188612,1.763345,1.829181,3.094306,0.877224,2.199288,2.711744,2.960854,0.0,2.960854,2.711744,0.5,2.836299,2.836299,0.330961,0.264235,0.267349,0.137456,0.228648,0.386788,0.109653,0.274911,0.174674,0.531435,0.293891
std,0.0,364.33307,0.0,0.445056,0.303678,0.170496,0.426052,0.396166,0.263991,0.939394,1.030876,0.746105,0.676977,0.829173,1.139164,0.924537,1.115122,1.411692,0.810502,1.324175,1.703149,1.73726,2.480584,1.73726,1.703149,0.500445,1.7248,1.7248,0.234848,0.257719,0.186526,0.169244,0.13939,0.176461,0.101313,0.165522,0.138195,0.189861,0.154089
min,2010.0,20005.0,18.0,1.524194,1.7625,1.677083,1.369176,1.333333,1.444444,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2010.0,20322.0,18.0,1.910714,2.242188,2.04,1.855072,2.0,1.944444,1.0,0.0,1.0,0.0,0.0,2.0,1.0,1.0,2.0,0.0,1.0,1.0,2.0,-2.0,2.0,1.0,0.0,2.0,2.0,0.25,0.0,0.25,0.0,0.125,0.25,0.0,0.125,0.0,0.333333,0.166667
50%,2010.0,20612.0,18.0,2.14,2.459677,2.119048,2.100427,2.277778,2.055556,1.0,1.0,1.0,0.0,1.0,3.0,2.0,2.0,3.0,1.0,2.0,3.0,3.0,0.0,3.0,3.0,0.5,3.0,3.0,0.25,0.25,0.25,0.0,0.25,0.375,0.125,0.25,0.166667,0.5,0.333333
75%,2010.0,20974.0,18.0,2.59,2.595,2.21875,2.452991,2.555556,2.277778,2.0,2.0,2.0,1.0,2.0,4.0,2.0,3.0,4.0,1.0,3.0,4.0,4.0,2.0,4.0,4.0,1.0,4.0,4.0,0.5,0.5,0.5,0.25,0.375,0.5,0.125,0.375,0.333333,0.666667,0.333333
max,2010.0,21230.0,18.0,3.208333,3.089286,2.547619,3.108025,3.444444,2.888889,3.0,4.0,3.0,3.0,3.0,6.0,4.0,4.0,6.0,3.0,5.0,9.0,9.0,8.0,9.0,9.0,1.0,9.0,9.0,0.75,1.0,0.75,0.75,0.5,0.75,0.375,0.625,0.5,1.0,0.666667


In [422]:
dt.groupby(['Win'])['C1', 'C2', 'C3', 'C4', 'W1', 'W2', 'W3', 'W4', 'D1', 'D2', 'D3'].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,C1,C2,C3,C4,W1,W2,W3,W4,D1,D2,D3
Win,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,count,281.0,281.0,281.0,281.0,281.0,281.0,281.0,281.0,281.0,281.0,281.0
0,mean,1.295374,1.096085,1.074733,0.533808,1.740214,3.003559,0.843416,2.412811,0.960854,3.213523,1.825623
0,std,0.915119,0.971754,0.725629,0.670631,1.095503,1.402799,0.744289,1.317619,0.794106,1.154477,0.99005
0,min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,25%,1.0,0.0,1.0,0.0,1.0,2.0,0.0,1.0,0.0,2.0,1.0
0,50%,1.0,1.0,1.0,0.0,2.0,3.0,1.0,2.0,1.0,3.0,2.0
0,75%,2.0,2.0,2.0,1.0,3.0,4.0,1.0,3.0,2.0,4.0,3.0
0,max,3.0,3.0,3.0,3.0,4.0,6.0,3.0,5.0,3.0,6.0,4.0
1,count,281.0,281.0,281.0,281.0,281.0,281.0,281.0,281.0,281.0,281.0,281.0
1,mean,1.352313,1.017794,1.064057,0.565836,1.918149,3.185053,0.911032,1.985765,1.135231,3.163701,1.701068


In [423]:
_var = ['C1', 'C2', 'C3', 'C4', 'W1', 'W2', 'W3', 'W4', 'D1', 'D2', 'D3']
d1 = dt[dt.Win==1][_var].mean()
d2 = dt[dt.Win==0][_var].mean()
df = d1/d2
df

C1    1.043956
C2    0.928571
C3    0.990066
C4    1.060000
W1    1.102249
W2    1.060427
W3    1.080169
W4    0.823009
D1    1.181481
D2    0.984496
D3    0.931774
dtype: float64

### estimate roster model

- regress **win** on the difference in number of players by position and quality per team. Add a constant to the predictors and use **OLS**. The purpose is to deterimine the impact each roster position has on home team success.

In [424]:
dt['mC1'] = dt.C1 - 1
dt['mC2'] = dt.C2 - 1
dt['mC3'] = dt.C3 - 1
dt['mC4'] = dt.C4 - 1

dt['mW1'] = dt.W1 - 2
dt['mW2'] = dt.W2 - 2
dt['mW3'] = dt.W3 - 2
dt['mW4'] = dt.W4 - 2

dt['mD1'] = dt.D1 - 2
dt['mD2'] = dt.D2 - 2
dt['mD3'] = dt.D3 - 2

_m1 = smf.ols('Win ~ mC1 + mC2 + mC3 + mC4', data=dt).fit()
_m2 = smf.ols('Win ~ mW1 + mW2 + mW3 + mW4', data=dt).fit()
_m3 = smf.ols('Win ~ mD1 + mD2 + mD3', data=dt).fit()
_m4 = smf.ols('Win ~ mC1 + mC2 + mC3 +  mW1 + mW2 + mW3 + mW4', data=dt).fit()

_m5 = smf.ols('Win ~ mC1 + mC2 + mC3 + mC4 + mW1 + mW2 + mW3 + mW4 + mD1 + mD2 + mD3', data=dt).fit()
_m6 = smf.logit('Win ~ mC1 + mC2 + mC3 + mC4 +  mW1 + mW2 + mW3 + mW4 + mD1 + mD2 + mD3', data=dt).fit()


summary_col([_m1, _m2, _m3, _m4, _m5, _m6], stars=True)


         Current function value: 0.673577
         Iterations: 35


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  res.ix[:, 0][idx] = res.ix[:, 0][idx] + '*'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  res.ix[:, 0][idx] = res.ix[:, 0][idx] + '*'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  res.ix[:, 0][idx] = res.ix[:, 0][idx] + '*'
  return np.sqrt(np.diag(self.cov_params()))
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


0,1,2,3,4,5,6
,Win I,Win II,Win III,Win IIII,Win IIIII,Win IIIIII
Intercept,0.5041***,0.4969***,0.5491***,0.5231***,0.5669***,0.2803
,(0.0258),(0.0391),(0.0333),(0.0423),(0.0564),(0.2328)
mC1,0.0085,,,-0.0499,-0.0058,-0.0262
,(0.0177),,,(0.0379),(0.0198),(nan)
mC2,-0.0135,,,-0.0531,-0.0200,-0.0848
,(0.0158),,,(0.0333),(0.0172),(nan)
mC3,-0.0074,,,-0.0481,-0.0033,-0.0102
,(0.0219),,,(0.0407),(0.0226),(nan)
mC4,0.0123,,,,0.0290,0.1212


In [425]:
dt['RC1'] = dt.C1 - dt.C4
dt['RC2'] = dt.C2 - dt.C4
dt['RC3'] = dt.C3 - dt.C4

dt['RW1'] = dt.W1 - dt.C4
dt['RW2'] = dt.W2 - dt.C4
dt['RW3'] = dt.W3 - dt.C4
dt['RW4'] = dt.W4 - dt.C4

dt.head()

Unnamed: 0,Season,GameNumber,TeamCode,RosterCount,MeanC,MeanW,MeanD,GMeanC,GMeanW,GMeanD,C1,C2,C3,C4,D1,D2,D3,W1,W2,W3,W4,VTeamCode,HTeamCode,VGF,HGF,GD,VGA,HGA,WinTeam,LossTeam,Win,GF,GA,c1,c2,c3,c4,w1,w2,w3,w4,d1,d2,d3,mC1,mC2,mC3,mC4,mW1,mW2,mW3,mW4,mD1,mD2,mD3,RC1,RC2,RC3,RW1,RW2,RW3,RW4
0,2010,20005,CGY,18.0,1.923913,2.097826,2.057971,1.855072,2.277778,2.666667,1.0,2.0,0.0,1.0,0.0,3.0,3.0,3.0,2.0,0.0,3.0,CGY,EDM,0,4,-4,4,0,EDM,CGY,0,0,4,0.25,0.5,0.0,0.25,0.375,0.25,0.0,0.375,0.0,0.5,0.5,0.0,1.0,-1.0,0.0,1.0,0.0,-2.0,1.0,-2.0,1.0,1.0,0.0,1.0,-1.0,2.0,1.0,-1.0,2.0
1,2010,20005,EDM,18.0,2.653846,2.605769,2.141026,2.634615,2.0,2.0,0.0,2.0,2.0,0.0,0.0,6.0,0.0,1.0,4.0,1.0,2.0,CGY,EDM,0,4,4,4,0,EDM,CGY,1,4,0,0.0,0.5,0.5,0.0,0.125,0.5,0.125,0.25,0.0,1.0,0.0,-1.0,1.0,1.0,-1.0,-1.0,2.0,-1.0,0.0,-2.0,4.0,-2.0,0.0,2.0,2.0,1.0,4.0,1.0,2.0
2,2010,20006,CBJ,18.0,2.14,2.595,2.373333,2.191111,2.888889,2.333333,1.0,2.0,1.0,0.0,0.0,3.0,3.0,2.0,2.0,1.0,3.0,SJ,CBJ,3,2,-1,2,3,SJ,CBJ,0,2,3,0.25,0.5,0.25,0.0,0.25,0.25,0.125,0.375,0.0,0.5,0.5,0.0,1.0,0.0,-1.0,0.0,0.0,-1.0,1.0,-2.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0,1.0,3.0
3,2010,20006,SJ,18.0,1.794118,2.470588,2.137255,1.676471,2.277778,1.833333,3.0,0.0,0.0,1.0,1.0,4.0,1.0,3.0,2.0,0.0,3.0,SJ,CBJ,3,2,1,2,3,SJ,CBJ,1,3,2,0.75,0.0,0.0,0.25,0.375,0.25,0.0,0.375,0.166667,0.666667,0.166667,2.0,-1.0,-1.0,0.0,1.0,0.0,-2.0,1.0,-1.0,2.0,-1.0,2.0,-1.0,-1.0,2.0,1.0,-1.0,2.0
4,2010,20009,ANA,18.0,2.6,2.7375,1.933333,2.188889,2.388889,2.111111,1.0,1.0,1.0,1.0,2.0,1.0,3.0,3.0,1.0,0.0,4.0,ANA,DET,0,4,-4,4,0,DET,ANA,0,0,4,0.25,0.25,0.25,0.25,0.375,0.125,0.0,0.5,0.333333,0.166667,0.5,0.0,0.0,0.0,0.0,1.0,-1.0,-2.0,2.0,0.0,-1.0,1.0,0.0,0.0,0.0,2.0,0.0,-1.0,3.0


In [426]:
y = dt['Win']  

X1 = sm.add_constant(dt[['RC1', 'RC2', 'RC3']])


m1 = sm.OLS(y, X1).fit()
m1.summary()

0,1,2,3
Dep. Variable:,Win,R-squared:,0.002
Model:,OLS,Adj. R-squared:,-0.003
Method:,Least Squares,F-statistic:,0.3751
Date:,"Wed, 07 Mar 2018",Prob (F-statistic):,0.771
Time:,20:42:04,Log-Likelihood:,-407.33
No. Observations:,562,AIC:,822.7
Df Residuals:,558,BIC:,840.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.5041,0.026,19.573,0.000,0.453 0.555
RC1,0.0085,0.018,0.482,0.630,-0.026 0.043
RC2,-0.0135,0.016,-0.852,0.395,-0.045 0.018
RC3,-0.0074,0.022,-0.337,0.736,-0.050 0.036

0,1,2,3
Omnibus:,0.0,Durbin-Watson:,2.994
Prob(Omnibus):,1.0,Jarque-Bera (JB):,92.912
Skew:,0.001,Prob(JB):,6.670000000000001e-21
Kurtosis:,1.008,Cond. No.,2.56


In [427]:
y = dt['Win']  
X1 = sm.add_constant(dt[['C1', 'C2', 'C3' , 'W1','W2', 'W3', 'D1', 'D2']] )
X2 = sm.add_constant(dt[['c1', 'c2', 'c3', 'w1', 'w2', 'w3', 'd1', 'd2']] )

#m1 = sm.OLS(y, X1).fit()
m2 = sm.OLS(y, X2).fit()
m2.summary()

0,1,2,3
Dep. Variable:,Win,R-squared:,0.038
Model:,OLS,Adj. R-squared:,0.024
Method:,Least Squares,F-statistic:,2.755
Date:,"Wed, 07 Mar 2018",Prob (F-statistic):,0.00544
Time:,20:42:04,Log-Likelihood:,-396.91
No. Observations:,562,AIC:,811.8
Df Residuals:,553,BIC:,850.8
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.2155,0.162,1.332,0.183,-0.102 0.533
c1,-0.1394,0.158,-0.881,0.379,-0.450 0.171
c2,-0.1960,0.136,-1.445,0.149,-0.462 0.070
c3,-0.1292,0.169,-0.767,0.444,-0.460 0.202
w1,0.6973,0.225,3.106,0.002,0.256 1.138
w2,0.4408,0.164,2.687,0.007,0.119 0.763
w3,0.4018,0.227,1.770,0.077,-0.044 0.848
d1,0.2485,0.213,1.166,0.244,-0.170 0.667
d2,-0.0008,0.150,-0.005,0.996,-0.295 0.293

0,1,2,3
Omnibus:,0.0,Durbin-Watson:,2.943
Prob(Omnibus):,1.0,Jarque-Bera (JB):,79.971
Skew:,0.0,Prob(JB):,4.31e-18
Kurtosis:,1.152,Cond. No.,20.9
