In [54]:
import sys
import os
import pandas as pd
import numpy as np
import datetime, time
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
from statsmodels.formula.api import ols
from statsmodels.iolib.summary2 import summary_col
from pylab import hist, show
import scipy
import zipfile


# Notebook-wide display limits: at most 50 rows and 200 columns are rendered per frame.
for option_name, option_value in (('display.max_rows', 50), ('display.max_columns', 200)):
    pd.set_option(option_name, option_value)

### import play by play data set

In [55]:
# Read the 2010-2017 play-by-play extract and discard its 'Unnamed: 0' column
# (presumably a row index saved by an earlier export — confirm).
# Path on the other collaborator's machine:
#da = pd.read_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/2010_2017_pbp.csv')
pbp_csv = '/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/2010_2017_pbp.csv'
da = pd.read_csv(pbp_csv).drop('Unnamed: 0', axis=1)

- exclude irrelevant on-ice events

In [56]:
# Event types treated as irrelevant for the analysis; every row carrying one of them is removed.
# Rows with a missing EventType are kept, exactly as with a chain of != comparisons.
excluded_event_types = ['STOP', 'EISTR', 'EIEND', 'PSTR', 'PEND', 'SOC', 'GEND']
da = da[~da['EventType'].isin(excluded_event_types)]

- create an event number variable that will count the number of events per game. 
- create an advantage type variable for even strength, power play and short handed situations.

In [57]:
# Number the remaining events 1, 2, 3, ... within each (Season, GameNumber), in current row order.
events_by_game = da.groupby(['Season', 'GameNumber'])
da['EventNumber'] = events_by_game.cumcount() + 1

In [58]:
# Classify every event by manpower situation from the acting team's point of view:
#   'EV' - both sides have the same number of players (and that number is not 1)
#   'PP' - the acting team has more players, or both sides have exactly 1 player
#   'SH' - the acting team has fewer players
#   NaN  - none of the above (e.g. missing player counts or acting team)
# Vectorised replacement for a row-wise apply: same outcome per row, but it does not
# call a Python lambda for every event and it also works on an empty frame.
players_v = da['PlayersV']
players_h = da['PlayersH']

# Plain boolean arrays (.values) so the masks are applied positionally.
event_by_visitor = (da['EventTeamCode'] == da['VTeamCode']).values
event_by_home = (da['EventTeamCode'] == da['HTeamCode']).values
visitor_has_more = (players_v > players_h).values
home_has_more = (players_v < players_h).values

even_strength = ((players_v == players_h) & (players_v != 1) & (players_h != 1)).values
one_player_each = ((players_v == 1) & (players_h == 1)).values
power_play = (visitor_has_more & event_by_visitor) | (home_has_more & event_by_home)
short_handed = (home_has_more & event_by_visitor) | (visitor_has_more & event_by_home)

# Assign from lowest to highest priority so that later writes win:
# 'EV' is checked first in the original chain, and 'PP' is checked before 'SH'.
advantage_type = np.full(len(da), np.nan, dtype=object)
advantage_type[short_handed] = 'SH'
advantage_type[power_play | one_player_each] = 'PP'
advantage_type[even_strength] = 'EV'
da['AdvantageType'] = advantage_type


- display goals for each game and drop duplicates.

In [59]:
# Build one row per game with the goals scored by the home (HGF) and visiting (VGF) team.
# Only GOAL events are used, so a game without any GOAL event yields no row in dg.
# Filtering first and copying the result avoids duplicating the whole event table and
# gives an independent frame to add columns to.
goal_events = da[da['EventType'] == 'GOAL'].copy()

# 1/0 flags for which side scored (vectorised; no row-wise apply).
goal_events['hgoal'] = (goal_events['EventTeamCode'] == goal_events['HTeamCode']).astype(int)
goal_events['vgoal'] = (goal_events['EventTeamCode'] == goal_events['VTeamCode']).astype(int)

# Per-game totals, repeated on every goal row of that game.
goal_events['HGF'] = goal_events.groupby(['Season', 'GameNumber', 'HTeamCode'])['hgoal'].transform('sum')
goal_events['VGF'] = goal_events.groupby(['Season', 'GameNumber', 'VTeamCode'])['vgoal'].transform('sum')

# Keep the first goal row of each game and only the game-level columns.
dg = goal_events.drop_duplicates(['Season', 'GameNumber'])
dg = dg[['Season', 'GameNumber', 'VTeamCode', 'HTeamCode', 'VGF', 'HGF']].copy()

- find the goal differential per game with respect to home team.

In [60]:
# Goal differential from the home team's point of view.
dg['GD'] = dg['HGF'] - dg['VGF']

# The home team is the winner only when GD > 0; every other case is credited to the visitor.
# NOTE(review): this includes GD == 0 — confirm that equal goal counts cannot occur in dg,
# otherwise such games are recorded as visitor wins.
home_won = dg['GD'] > 0
dg['WinTeam'] = np.where(home_won, dg['HTeamCode'], dg['VTeamCode'])

# The loser is whichever of the two teams is not the winner.
dg['LossTeam'] = np.where(dg['WinTeam'] != dg['HTeamCode'], dg['HTeamCode'], dg['VTeamCode'])

- display goals against per team.

In [61]:
# Goals against for one side are the goals for of the other side.
# Column creation order (VGA, then HGA) is kept because later cells pick columns by position.
for against_col, opponent_for_col in (('VGA', 'HGF'), ('HGA', 'VGF')):
    dg[against_col] = dg[opponent_for_col]

In [62]:
# Order the game table by season (ascending) and preview the first rows.
dg = dg.sort_values(by='Season', ascending=True)
dg.head()

Unnamed: 0,Season,GameNumber,VTeamCode,HTeamCode,VGF,HGF,GD,WinTeam,LossTeam,VGA,HGA
34,2010,20001,MTL,TOR,2,3,1,TOR,MTL,3,2
254870,2010,20825,NYR,ATL,2,3,1,ATL,NYR,3,2
254594,2010,20824,COL,CBJ,1,3,2,CBJ,COL,3,1
254186,2010,20823,PIT,NYI,3,9,6,NYI,PIT,9,3
254069,2010,20822,S.J,N.J,1,2,1,N.J,S.J,2,1


In [63]:
# Write the per-game goal table to disk.
# The original call passed index='False': a non-empty string is truthy, so the row index
# was always written as the first (unnamed) CSV column. index=True states that behaviour
# explicitly and leaves the file layout unchanged.
# NOTE(review): switch to index=False only after checking that no reader of this file
# relies on the extra 'Unnamed: 0' column.
dg.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/2010_2017_season_game_data.csv', index=True, sep=',')
#dg.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/2010_2017_season_game_data.csv', index=True, sep=',')

## season_level_data

In [64]:
# Independent working copy of the event table for the season-level pipeline.
dm = da.copy(deep=True)

- events that happened in regulation time only

In [65]:
# Keep regulation time only: periods 1 through 3, both bounds included.
in_regulation = dm['Period'].between(1, 3)
dm = dm[in_regulation]

- **reshape the data set from wide to long.**

In [66]:
# Chronological order: season, game, period, then event number (all ascending).
event_order = ['Season', 'GameNumber', 'Period', 'EventNumber']
dm = dm.sort_values(by=event_order)

In [67]:
# Collect every visitor and home player column (matched by name fragment) and stack
# them into the two long columns 'VPlayer' and 'HPlayer'.
# The two lists are paired positionally by pd.lreshape, so they must be equally long.
visitor_player_cols = [name for name in dm.columns if 'VPlayer' in name]
home_player_cols = [name for name in dm.columns if 'HPlayer' in name]
dm = pd.lreshape(dm, {'VPlayer' : visitor_player_cols, 'HPlayer' : home_player_cols})

In [68]:
# Display the (rows, columns) size of the reshaped event table.
dm.shape

(13810032, 22)

In [69]:
# Write the long-format play-by-play table to disk.
# The original call passed index='False': a non-empty string is truthy, so the row index
# was always written as the first (unnamed) CSV column. index=True states that behaviour
# explicitly and leaves the file layout unchanged.
# NOTE(review): switch to index=False only after checking that no reader of this file
# relies on the extra 'Unnamed: 0' column.
dm.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/2010_2017_play_by_play.csv', index=True, sep=',')
#dm.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/2010_2017_play_by_play.csv', index=True, sep=',')

#### create new data set and keep variables: 
- (a) game number.
- (b) visitor team information.
- (c) home team information.

In [70]:
# Keep only the game identifiers plus the visitor/home team and player columns,
# ordered by season and game.
game_player_cols = ['Season', 'GameNumber', 'VTeamCode', 'VPlayer', 'HTeamCode', 'HPlayer']
df = dm[game_player_cols].sort_values(by=['Season', 'GameNumber'])
df.head()

Unnamed: 0,Season,GameNumber,VTeamCode,VPlayer,HTeamCode,HPlayer
0,2010,20001,MTL,SCOTT GOMEZ,TOR,TIM BRENT
1,2010,20001,MTL,SCOTT GOMEZ,TOR,TIM BRENT
2,2010,20001,MTL,TOMAS PLEKANEC,TOR,TYLER BOZAK
3,2010,20001,MTL,TOMAS PLEKANEC,TOR,TYLER BOZAK
4,2010,20001,MTL,TOMAS PLEKANEC,TOR,TYLER BOZAK


- merge season_game_data (dg) on new dataset

In [71]:
# Attach the per-game results (dg) to every player row; rows without a match in dg are kept.
game_keys = ['Season', 'GameNumber', 'VTeamCode', 'HTeamCode']
df = df.merge(dg, how='left', on=game_keys)
df.head()

Unnamed: 0,Season,GameNumber,VTeamCode,VPlayer,HTeamCode,HPlayer,VGF,HGF,GD,WinTeam,LossTeam,VGA,HGA
0,2010,20001,MTL,SCOTT GOMEZ,TOR,TIM BRENT,2,3,1,TOR,MTL,3,2
1,2010,20001,MTL,SCOTT GOMEZ,TOR,TIM BRENT,2,3,1,TOR,MTL,3,2
2,2010,20001,MTL,TOMAS PLEKANEC,TOR,TYLER BOZAK,2,3,1,TOR,MTL,3,2
3,2010,20001,MTL,TOMAS PLEKANEC,TOR,TYLER BOZAK,2,3,1,TOR,MTL,3,2
4,2010,20001,MTL,TOMAS PLEKANEC,TOR,TYLER BOZAK,2,3,1,TOR,MTL,3,2


- reshape the data to have home and visitor team observations under the same columns.

In [72]:
# Stack visitor and home observations under shared columns, visitor rows first.
# The visitor/home pairs are spelled out explicitly: pd.lreshape pairs the lists by
# position, so discovering columns by substring would silently mis-assign values if the
# column order ever changed or another column name happened to contain 'GF' or 'GA'.
stacked_columns = {
    'PlayerName': ['VPlayer', 'HPlayer'],
    'TeamCode': ['VTeamCode', 'HTeamCode'],
    'GF': ['VGF', 'HGF'],
    'GA': ['VGA', 'HGA'],
}
df = pd.lreshape(df, stacked_columns)

# After stacking, the goal differential is recomputed from each row's own team.
df['GD'] = df['GF'] - df['GA']
df = df[['Season', 'GameNumber', 'TeamCode', 'PlayerName', 'GF', 'GA', 'GD', 'WinTeam', 'LossTeam']]
df.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerName,GF,GA,GD,WinTeam,LossTeam
0,2010,20001,MTL,SCOTT GOMEZ,2,3,-1,TOR,MTL
1,2010,20001,MTL,SCOTT GOMEZ,2,3,-1,TOR,MTL
2,2010,20001,MTL,TOMAS PLEKANEC,2,3,-1,TOR,MTL
3,2010,20001,MTL,TOMAS PLEKANEC,2,3,-1,TOR,MTL
4,2010,20001,MTL,TOMAS PLEKANEC,2,3,-1,TOR,MTL


### import player position and rankings

In [73]:
# Read the player-position table and discard its 'Unnamed: 0' column
# (presumably a row index saved by an earlier export — confirm).
# Path on the other collaborator's machine:
#dp = pd.read_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/2010_2017_player_position.csv')
position_csv = '/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/2010_2017_player_position.csv'
dp = pd.read_csv(position_csv).drop('Unnamed: 0', axis=1)

In [74]:
# Read the player-rank table and discard its 'Unnamed: 0' column
# (presumably a row index saved by an earlier export — confirm).
# Path on the other collaborator's machine:
#dr = pd.read_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/2010_2017_player_rank.csv')
rank_csv = '/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/2010_2017_player_rank.csv'
dr = pd.read_csv(rank_csv).drop('Unnamed: 0', axis=1)

- merge player position and player rankings

In [75]:
# Add each player's rank to the position table; players without a rank record stay in
# the result with a missing Rank.
position_rank_keys = ['Season', 'TeamCode', 'PlayerName', 'Position']
ds = dp.merge(dr, how='left', on=position_rank_keys)
ds.shape

(8394, 5)

In [76]:
# Missing values per column after the merge.
ds.isna().sum()

Season           0
TeamCode         0
PlayerName       0
Position         0
Rank          1821
dtype: int64

Players that were not included in the clusters, and are therefore unranked, receive a default rank: forwards are assigned to the 4th line, defensemen to the bottom (3rd) pairing and goaltenders to backup.

In [77]:
# Default rank given to players whose Rank is missing, keyed by position.
# Existing ranks are left untouched; a missing rank for any other position stays missing.
# NOTE(review): goaltenders default to 3 here — confirm this is the intended "backup" rank.
default_rank_by_position = {'G': 3, 'D': 3, 'W': 4, 'C': 4}

# Vectorised replacement for a row-wise apply: look up the default per row, then use it
# only where Rank is missing.
ds['Rank'] = ds['Rank'].fillna(ds['Position'].map(default_rank_by_position))
ds.shape

(8394, 5)

In [78]:
# Missing values per column once default ranks have been filled in.
ds.isna().sum()

Season        0
TeamCode      0
PlayerName    0
Position      0
Rank          0
dtype: int64

- **display each player by team per game. Drop duplicates.**

In [79]:
# Attach position and rank to every player row; unmatched players keep missing values.
player_keys = ['Season', 'TeamCode', 'PlayerName']
player_game_cols = ['Season', 'GameNumber', 'TeamCode', 'PlayerName', 'Position', 'Rank', 'GF', 'GA', 'GD', 'WinTeam', 'LossTeam']
dw = df.merge(ds, how='left', on=player_keys)[player_game_cols]
dw.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerName,Position,Rank,GF,GA,GD,WinTeam,LossTeam
0,2010,20001,MTL,SCOTT GOMEZ,C,2.0,2,3,-1,TOR,MTL
1,2010,20001,MTL,SCOTT GOMEZ,C,2.0,2,3,-1,TOR,MTL
2,2010,20001,MTL,TOMAS PLEKANEC,C,1.0,2,3,-1,TOR,MTL
3,2010,20001,MTL,TOMAS PLEKANEC,C,1.0,2,3,-1,TOR,MTL
4,2010,20001,MTL,TOMAS PLEKANEC,C,1.0,2,3,-1,TOR,MTL


In [80]:
# Missing values per column after attaching position and rank.
dw.isna().sum()

Season          0
GameNumber      0
TeamCode        0
PlayerName      0
Position      835
Rank          835
GF              0
GA              0
GD              0
WinTeam         0
LossTeam        0
dtype: int64

In [81]:
# Which players have no rank after the merge, and on how many rows?
rank_missing = dw['Rank'].isnull()
w = dw[rank_missing].copy()
w['PlayerName'].value_counts()

DANIEL O'REGAN    835
Name: PlayerName, dtype: int64

In [82]:
# Rows still lacking a position/rank are filled with position 'C' and rank 4.
for column, filler in (('Position', 'C'), ('Rank', 4)):
    dw[column] = dw[column].fillna(filler)

In [83]:
# Missing values per column after the manual fill.
dw.isna().sum()

Season        0
GameNumber    0
TeamCode      0
PlayerName    0
Position      0
Rank          0
GF            0
GA            0
GD            0
WinTeam       0
LossTeam      0
dtype: int64

- create column that displays the position and roster count by team per game. To simplify matters, we categorize skaters into forwards and defensemen.

In [84]:
# One row per player, team and game, then count players per team-game (RosterCount)
# and per team-game-position (PositionCount).
team_game_keys = ['Season', 'GameNumber', 'TeamCode']
dw = dw.drop_duplicates(team_game_keys + ['PlayerName'])
dw['RosterCount'] = dw.groupby(team_game_keys)['PlayerName'].transform('count')
dw['PositionCount'] = dw.groupby(team_game_keys + ['Position'])['PlayerName'].transform('count')
dw.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerName,Position,Rank,GF,GA,GD,WinTeam,LossTeam,RosterCount,PositionCount
0,2010,20001,MTL,SCOTT GOMEZ,C,2.0,2,3,-1,TOR,MTL,19,7
2,2010,20001,MTL,TOMAS PLEKANEC,C,1.0,2,3,-1,TOR,MTL,19,7
9,2010,20001,MTL,JEFF HALPERN,C,2.0,2,3,-1,TOR,MTL,19,7
13,2010,20001,MTL,DUSTIN BOYD,C,4.0,2,3,-1,TOR,MTL,19,7
95,2010,20001,MTL,TOM PYATT,C,3.0,2,3,-1,TOR,MTL,19,7


- count the number of centres, wingers, defensemen and goaltenders by team per game.

In [85]:
# Spread the per-position player counts across every row of the same team-game, giving
# the columns CCount, WCount, DCount and GCount (created in that order).
#
# Step 1 keeps PositionCount only on rows of the given position (NaN elsewhere).
# Step 2 broadcasts the first non-missing value of each team-game to all of its rows.
# PositionCount is constant for a given position within a team-game, so this yields the
# same values as forward- then backward-filling inside the group; a team-game without
# that position stays NaN. Using where/transform avoids the row-wise apply and keeps the
# result aligned with dw's own index regardless of how groupby.apply labels its output.
team_game = ['Season', 'GameNumber', 'TeamCode']
for position in ('C', 'W', 'D', 'G'):
    count_col = position + 'Count'
    dw[count_col] = dw['PositionCount'].where(dw['Position'] == position)
    dw[count_col] = dw.groupby(team_game)[count_col].transform('first')
dw.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerName,Position,Rank,GF,GA,GD,WinTeam,LossTeam,RosterCount,PositionCount,CCount,WCount,DCount,GCount
0,2010,20001,MTL,SCOTT GOMEZ,C,2.0,2,3,-1,TOR,MTL,19,7,7.0,5.0,6.0,1.0
2,2010,20001,MTL,TOMAS PLEKANEC,C,1.0,2,3,-1,TOR,MTL,19,7,7.0,5.0,6.0,1.0
9,2010,20001,MTL,JEFF HALPERN,C,2.0,2,3,-1,TOR,MTL,19,7,7.0,5.0,6.0,1.0
13,2010,20001,MTL,DUSTIN BOYD,C,4.0,2,3,-1,TOR,MTL,19,7,7.0,5.0,6.0,1.0
95,2010,20001,MTL,TOM PYATT,C,3.0,2,3,-1,TOR,MTL,19,7,7.0,5.0,6.0,1.0


In [86]:
# Write the per-player, per-game roster table to disk.
# The original call passed index='False': a non-empty string is truthy, so the row index
# was always written as the first (unnamed) CSV column. index=True states that behaviour
# explicitly and leaves the file layout unchanged.
# NOTE(review): switch to index=False only after checking that no reader of this file
# relies on the extra 'Unnamed: 0' column.
dw.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/2010_2017_season_team_game_player.csv', index=True, sep=',')
#dw.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/2010_2017_season_team_game_player.csv', index=True, sep=',')

In [87]:
# Reduce to the first row of every team-game and order by season and game.
dw1 = (
    dw.drop_duplicates(['Season', 'GameNumber', 'TeamCode'])
      .sort_values(by=['Season', 'GameNumber'])
)

In [88]:
# Display the (rows, columns) size of the one-row-per-team-game table.
dw1.shape

(18742, 17)

In [89]:
# Preview the first rows of the one-row-per-team-game table.
dw1.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerName,Position,Rank,GF,GA,GD,WinTeam,LossTeam,RosterCount,PositionCount,CCount,WCount,DCount,GCount
0,2010,20001,MTL,SCOTT GOMEZ,C,2.0,2,3,-1,TOR,MTL,19,7,7.0,5.0,6.0,1.0
13810032,2010,20001,TOR,TIM BRENT,C,3.0,3,2,1,TOR,MTL,19,4,4.0,8.0,6.0,1.0
1544,2010,20002,PHI,JEFF CARTER,C,1.0,3,2,1,PHI,PIT,19,4,4.0,8.0,6.0,1.0
13811576,2010,20002,PIT,EVGENI MALKIN,C,1.0,2,3,-1,PHI,PIT,19,7,7.0,5.0,6.0,1.0
3157,2010,20003,CAR,JEFF SKINNER,C,1.0,4,3,1,CAR,MIN,19,6,6.0,6.0,6.0,1.0


In [90]:
# Write the one-row-per-team-game table to disk.
# The original call passed index='False': a non-empty string is truthy, so the row index
# was always written as the first (unnamed) CSV column. index=True states that behaviour
# explicitly and leaves the file layout unchanged.
# NOTE(review): switch to index=False only after checking that no reader of this file
# relies on the extra 'Unnamed: 0' column.
dw1.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/2010_2017_game_team_roster_player.csv', index=True, sep=',')
#dw1.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/2010_2017_game_team_roster_player.csv', index=True, sep=',')

### full regular season stats

In [91]:
# Collapse players to one row per team, game and position holding the mean player Rank.
# The remaining keys are listed so that they are carried into the result; GCount is not
# among them and is therefore absent from dw2.
# NOTE(review): groupby skips rows whose key is missing by default, so a team-game with a
# NaN CCount/WCount/DCount may drop out here — confirm this is acceptable.
team_game_position_keys = [
    'Season', 'GameNumber', 'TeamCode', 'Position', 'GF', 'GA', 'RosterCount',
    'PositionCount', 'CCount', 'WCount', 'DCount', 'WinTeam', 'LossTeam',
]
dw2 = dw.groupby(team_game_position_keys, as_index=False)['Rank'].mean()
dw2.shape

(74928, 14)

- create columns for team win and team loss. 

In [92]:
# 1/0 flags telling whether the row's team won or lost that game.
# Column-wise comparison replaces the row-wise apply (same values, no per-row Python call).
dw2['TeamWin'] = (dw2['TeamCode'] == dw2['WinTeam']).astype(int)
dw2['TeamLos'] = (dw2['TeamCode'] == dw2['LossTeam']).astype(int)

- display games played, games won, games lost, goals for and goals against by team for the season.

In [93]:
# Season totals per (Season, Position, TeamCode), broadcast back onto every row.
# Each entry is (target column, source column, aggregation).
# NOTE: GF and GA are overwritten with their season sums, so running this cell twice on
# the same dw2 would sum the sums.
season_position_team = ['Season', 'Position', 'TeamCode']
season_aggregates = (
    ('GP', 'GameNumber', 'count'),
    ('GW', 'TeamWin', 'sum'),
    ('GL', 'TeamLos', 'sum'),
    ('GF', 'GF', 'sum'),
    ('GA', 'GA', 'sum'),
)
for target_col, source_col, how in season_aggregates:
    dw2[target_col] = dw2.groupby(season_position_team)[source_col].transform(how)
dw2.head()

Unnamed: 0,Season,GameNumber,TeamCode,Position,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL
0,2010,20001,MTL,C,220,213,19,7,7.0,5.0,6.0,TOR,MTL,2.571429,0,1,82,44,38
1,2010,20001,MTL,D,220,213,19,6,7.0,5.0,6.0,TOR,MTL,2.0,0,1,82,44,38
2,2010,20001,MTL,G,220,213,19,1,7.0,5.0,6.0,TOR,MTL,1.0,0,1,82,44,38
3,2010,20001,MTL,W,220,213,19,5,7.0,5.0,6.0,TOR,MTL,2.0,0,1,82,44,38
4,2010,20001,TOR,C,225,259,19,4,4.0,8.0,6.0,TOR,MTL,2.5,1,0,82,37,45


- create columns with the mean ranking of centres, wingers, defensemen and goaltenders by team per game.

In [94]:
# Spread the mean rank of each position across every row of the same team-game, giving
# the columns RankC, RankW, RankD and RankG (created in that order).
#
# Step 1 keeps Rank only on the row of the given position (NaN elsewhere).
# Step 2 broadcasts the first non-missing value of each team-game to all of its rows.
# dw2 holds one row per team, game and position, so this yields the same values as
# forward- then backward-filling inside the group; a team-game without that position
# stays NaN. Using where/transform avoids the row-wise apply and keeps the result
# aligned with dw2's own index regardless of how groupby.apply labels its output.
team_game = ['Season', 'GameNumber', 'TeamCode']
for position in ('C', 'W', 'D', 'G'):
    rank_col = 'Rank' + position
    dw2[rank_col] = dw2['Rank'].where(dw2['Position'] == position)
    dw2[rank_col] = dw2.groupby(team_game)[rank_col].transform('first')
dw2.head()

Unnamed: 0,Season,GameNumber,TeamCode,Position,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL,RankC,RankW,RankD,RankG
0,2010,20001,MTL,C,220,213,19,7,7.0,5.0,6.0,TOR,MTL,2.571429,0,1,82,44,38,2.571429,2.0,2.0,1.0
1,2010,20001,MTL,D,220,213,19,6,7.0,5.0,6.0,TOR,MTL,2.0,0,1,82,44,38,2.571429,2.0,2.0,1.0
2,2010,20001,MTL,G,220,213,19,1,7.0,5.0,6.0,TOR,MTL,1.0,0,1,82,44,38,2.571429,2.0,2.0,1.0
3,2010,20001,MTL,W,220,213,19,5,7.0,5.0,6.0,TOR,MTL,2.0,0,1,82,44,38,2.571429,2.0,2.0,1.0
4,2010,20001,TOR,C,225,259,19,4,4.0,8.0,6.0,TOR,MTL,2.5,1,0,82,37,45,2.5,2.25,2.0,3.0


- compute the mean per position by team for the season.

In [95]:
# Season-level mean of each per-game position rank, per (Season, TeamCode),
# written to MeanC, MeanW, MeanD and MeanG.
for position in ('C', 'W', 'D', 'G'):
    dw2['Mean' + position] = dw2.groupby(['Season', 'TeamCode'])['Rank' + position].transform('mean')
dw2.head()

Unnamed: 0,Season,GameNumber,TeamCode,Position,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL,RankC,RankW,RankD,RankG,MeanC,MeanW,MeanD,MeanG
0,2010,20001,MTL,C,220,213,19,7,7.0,5.0,6.0,TOR,MTL,2.571429,0,1,82,44,38,2.571429,2.0,2.0,1.0,2.193148,1.85842,1.82259,1.158537
1,2010,20001,MTL,D,220,213,19,6,7.0,5.0,6.0,TOR,MTL,2.0,0,1,82,44,38,2.571429,2.0,2.0,1.0,2.193148,1.85842,1.82259,1.158537
2,2010,20001,MTL,G,220,213,19,1,7.0,5.0,6.0,TOR,MTL,1.0,0,1,82,44,38,2.571429,2.0,2.0,1.0,2.193148,1.85842,1.82259,1.158537
3,2010,20001,MTL,W,220,213,19,5,7.0,5.0,6.0,TOR,MTL,2.0,0,1,82,44,38,2.571429,2.0,2.0,1.0,2.193148,1.85842,1.82259,1.158537
4,2010,20001,TOR,C,225,259,19,4,4.0,8.0,6.0,TOR,MTL,2.5,1,0,82,37,45,2.5,2.25,2.0,3.0,2.163211,2.307903,2.256678,2.182927


- display the quantity of wins and losses per team for the whole season

In [96]:
# Season losses (L) and wins (W) per row.
# On a row where the team lost that game, L is taken from GL, otherwise it is derived as
# GP - GW; W is built the same way from GW and GP - GL.
# np.where replaces the row-wise apply: same values, evaluated column-wise.
dw2['L'] = np.where(dw2['TeamCode'] == dw2['LossTeam'], dw2['GL'], dw2['GP'] - dw2['GW'])
dw2['W'] = np.where(dw2['TeamCode'] == dw2['WinTeam'], dw2['GW'], dw2['GP'] - dw2['GL'])
dw2.head()

Unnamed: 0,Season,GameNumber,TeamCode,Position,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL,RankC,RankW,RankD,RankG,MeanC,MeanW,MeanD,MeanG,L,W
0,2010,20001,MTL,C,220,213,19,7,7.0,5.0,6.0,TOR,MTL,2.571429,0,1,82,44,38,2.571429,2.0,2.0,1.0,2.193148,1.85842,1.82259,1.158537,38,44
1,2010,20001,MTL,D,220,213,19,6,7.0,5.0,6.0,TOR,MTL,2.0,0,1,82,44,38,2.571429,2.0,2.0,1.0,2.193148,1.85842,1.82259,1.158537,38,44
2,2010,20001,MTL,G,220,213,19,1,7.0,5.0,6.0,TOR,MTL,1.0,0,1,82,44,38,2.571429,2.0,2.0,1.0,2.193148,1.85842,1.82259,1.158537,38,44
3,2010,20001,MTL,W,220,213,19,5,7.0,5.0,6.0,TOR,MTL,2.0,0,1,82,44,38,2.571429,2.0,2.0,1.0,2.193148,1.85842,1.82259,1.158537,38,44
4,2010,20001,TOR,C,225,259,19,4,4.0,8.0,6.0,TOR,MTL,2.5,1,0,82,37,45,2.5,2.25,2.0,3.0,2.163211,2.307903,2.256678,2.182927,45,37


- compute win and loss percent by team. Drop duplicate observations.

In [97]:
# Reduce to one row per team and season (the first row encountered), then express wins
# and losses as a share of games played.
season_team_cols = ['Season', 'TeamCode', 'GP', 'L', 'W', 'GF', 'GA', 'MeanC', 'MeanW', 'MeanD', 'MeanG']
dw2 = dw2[season_team_cols].drop_duplicates(['Season', 'TeamCode'])
for share_col, total_col in (('WinPc', 'W'), ('LossPc', 'L')):
    dw2[share_col] = dw2[total_col] / dw2['GP']

# Final column order of the season table.
dw2 = dw2[['Season', 'TeamCode', 'GP','W', 'L', 'GF', 'GA', 'WinPc', 'LossPc', 'MeanC', 'MeanW', 'MeanD', 'MeanG']]
dw2.head()

Unnamed: 0,Season,TeamCode,GP,W,L,GF,GA,WinPc,LossPc,MeanC,MeanW,MeanD,MeanG
0,2010,MTL,82,44,38,220,213,0.536585,0.463415,2.193148,1.85842,1.82259,1.158537
4,2010,TOR,82,37,45,225,259,0.45122,0.54878,2.163211,2.307903,2.256678,2.182927
8,2010,PHI,82,47,35,266,233,0.573171,0.426829,1.994919,1.757283,1.947154,2.012195
12,2010,PIT,82,49,33,244,203,0.597561,0.402439,2.024129,2.275203,1.926713,1.756098
16,2010,CAR,82,40,42,239,242,0.487805,0.512195,2.072997,2.473229,1.847561,1.243902


- rank teams based on win percent and on the mean rank of centres, wingers, defensemen and goaltenders.

In [98]:
# Rank teams within each season: the highest win percentage gets rank 1, and for each
# position the lowest mean rank gets rank 1.
dw2['RankWin'] = dw2.groupby(['Season'])['WinPc'].rank(ascending=False)
for position in ('C', 'W', 'D', 'G'):
    dw2['Rank' + position] = dw2.groupby(['Season'])['Mean' + position].rank(ascending=True)

# Order by season, then by the win ranking, then by the position rankings.
ranking_order = ['Season', 'RankWin', 'RankC', 'RankW', 'RankD', 'RankG']
dw2 = dw2.sort_values(by=ranking_order)
dw2.head(10)

Unnamed: 0,Season,TeamCode,GP,W,L,GF,GA,WinPc,LossPc,MeanC,MeanW,MeanD,MeanG,RankWin,RankC,RankW,RankD,RankG
180,2010,VAN,82,54,28,268,192,0.658537,0.341463,2.121951,1.851287,1.909117,1.0,1.0,14.0,5.0,10.0,1.0
12,2010,PIT,82,49,33,244,203,0.597561,0.402439,2.024129,2.275203,1.926713,1.756098,2.0,9.0,19.0,14.0,12.0
44,2010,S.J,82,48,34,253,216,0.585366,0.414634,1.650523,1.9527,1.917538,1.573171,3.5,2.0,11.0,12.0,10.0
84,2010,WSH,82,48,34,230,203,0.585366,0.414634,2.416638,1.962152,1.900407,1.536585,3.5,26.0,12.0,9.0,9.0
68,2010,DET,82,47,35,263,241,0.573171,0.426829,1.688618,1.920369,1.66115,2.134146,6.0,3.0,9.0,1.0,21.0
8,2010,PHI,82,47,35,266,233,0.573171,0.426829,1.994919,1.757283,1.947154,2.012195,6.0,7.0,3.0,17.0,16.0
64,2010,ANA,82,47,35,241,237,0.573171,0.426829,2.454268,2.119599,2.171893,1.695122,6.0,28.0,15.0,24.0,11.0
88,2010,BOS,82,46,36,250,200,0.560976,0.439024,1.553833,1.685017,1.912602,1.329268,9.0,1.0,2.0,11.0,7.0
140,2010,T.B,82,46,36,252,246,0.560976,0.439024,2.113531,1.662805,2.023229,2.359756,9.0,13.0,1.0,19.0,26.0
176,2010,L.A,82,46,36,229,207,0.560976,0.439024,2.26626,2.017567,1.809233,2.0,9.0,23.0,14.0,5.0,13.5


In [99]:
# Write the season-level team ranking table to disk.
# The original call passed index='False': a non-empty string is truthy, so the row index
# was always written as the first (unnamed) CSV column. index=True states that behaviour
# explicitly and leaves the file layout unchanged.
# NOTE(review): switch to index=False only after checking that no reader of this file
# relies on the extra 'Unnamed: 0' column.
dw2.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/2010_2017_season_team_roster_rank.csv', index=True, sep=',')
#dw2.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/2010_2017_season_team_roster_rank.csv', index=True, sep=',')

### keep games that have 12 forwards and 6 defensemen per team!!

In [100]:
# Independent working copy of the per-player roster table for the 12-forward / 6-defenseman filter.
du = dw.copy(deep=True)
du.shape

(357527, 17)

In [101]:
# Forwards per team-game = centres + wingers (missing if either count is missing).
du['FCount'] = du['CCount'].add(du['WCount'])
du.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerName,Position,Rank,GF,GA,GD,WinTeam,LossTeam,RosterCount,PositionCount,CCount,WCount,DCount,GCount,FCount
0,2010,20001,MTL,SCOTT GOMEZ,C,2.0,2,3,-1,TOR,MTL,19,7,7.0,5.0,6.0,1.0,12.0
2,2010,20001,MTL,TOMAS PLEKANEC,C,1.0,2,3,-1,TOR,MTL,19,7,7.0,5.0,6.0,1.0,12.0
9,2010,20001,MTL,JEFF HALPERN,C,2.0,2,3,-1,TOR,MTL,19,7,7.0,5.0,6.0,1.0,12.0
13,2010,20001,MTL,DUSTIN BOYD,C,4.0,2,3,-1,TOR,MTL,19,7,7.0,5.0,6.0,1.0,12.0
95,2010,20001,MTL,TOM PYATT,C,3.0,2,3,-1,TOR,MTL,19,7,7.0,5.0,6.0,1.0,12.0


In [102]:
def _all_rows_12f_6d(game_rows):
    """Return True when every row of the game has FCount == 12 and DCount == 6."""
    twelve_forwards = game_rows['FCount'] == 12
    six_defensemen = game_rows['DCount'] == 6
    return (twelve_forwards & six_defensemen).all()


# Keep a game only if all of its rows (i.e. both teams) satisfy the roster shape.
du = du.groupby(['Season', 'GameNumber']).filter(_all_rows_12f_6d)
du.shape

(317702, 18)

In [103]:
# Write the games that passed the 12-forward / 6-defenseman filter to disk.
# The original call passed index='False': a non-empty string is truthy, so the row index
# was always written as the first (unnamed) CSV column. index=True states that behaviour
# explicitly and leaves the file layout unchanged.
# NOTE(review): switch to index=False only after checking that no reader of this file
# relies on the extra 'Unnamed: 0' column.
du.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/2010_2017_12f_6d_g_game_team.csv', index=True, sep=',')
#du.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/2010_2017_12f_6d_g_game_team.csv', index=True, sep=',')

### keep games that have 4 C, 8 W and 6D per team

In [104]:
# Independent working copy of the per-player roster table for the 4C / 8W / 6D filter.
dy = dw.copy(deep=True)
dy.shape

(357527, 17)

In [105]:
def _all_rows_4c_8w_6d(game_rows):
    """Return True when every row of the game has CCount == 4, WCount == 8 and DCount == 6."""
    four_centres = game_rows['CCount'] == 4
    eight_wingers = game_rows['WCount'] == 8
    six_defensemen = game_rows['DCount'] == 6
    return (four_centres & eight_wingers & six_defensemen).all()


# Keep a game only if all of its rows (i.e. both teams) satisfy the roster shape.
dy = dy.groupby(['Season', 'GameNumber']).filter(_all_rows_4c_8w_6d)
dy.shape

(15794, 17)

In [106]:
# Write the games that passed the 4C / 8W / 6D filter to disk.
# The original call passed index='False': a non-empty string is truthy, so the row index
# was always written as the first (unnamed) CSV column. index=True states that behaviour
# explicitly and leaves the file layout unchanged.
# NOTE(review): switch to index=False only after checking that no reader of this file
# relies on the extra 'Unnamed: 0' column.
dy.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/2010_2017_4c_8w_6d_game_team.csv', index=True, sep=',')
#dy.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/2010_2017_4c_8w_6d_game_team.csv', index=True, sep=',')