In [1]:
import sys
import os
import pandas as pd
import numpy as np
import datetime, time
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
from statsmodels.formula.api import ols
from statsmodels.iolib.summary2 import summary_col
from pylab import hist, show
import scipy
import zipfile


pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 200)

### import play by play data set

In [91]:
da = pd.read_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/2011_2017_pbp.csv')
#da.pd.read_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/2011_2017_pbp.csv')
da = da.drop('Unnamed: 0', axis=1)

- exclude irrelevant on-ice events

In [92]:
da = da[da['EventType']!='STOP']
da = da[da['EventType']!='EISTR']
da = da[da['EventType']!='EIEND']
da = da[da['EventType']!='PSTR']
da = da[da['EventType']!='PEND']
da = da[da['EventType']!='SOC']
da = da[da['EventType']!='GEND']

- create an event number variable that will count the number of events per game. 
- create an advantage type variable for even strength, power play and short handed situations.

In [93]:
da['EventNumber'] = da.groupby(['Season', 'GameNumber']).cumcount()+1

In [94]:
da['AdvantageType'] = da.apply(lambda x: 'EV' if ((x['PlayersV'] == x['PlayersH']) & (x['PlayersV'] != 1) & (x['PlayersH'] != 1)) else 'PP' if ((x['PlayersV'] > x['PlayersH']) & (x['EventTeamCode'] == x['VTeamCode'])) else 'PP' if ((x['PlayersV'] < x['PlayersH']) & (x['EventTeamCode'] == x['HTeamCode'])) else 'SH' if ((x['PlayersV'] < x['PlayersH']) & (x['EventTeamCode'] == x['VTeamCode'])) else 'SH' if ((x['PlayersV'] > x['PlayersH']) & (x['EventTeamCode'] == x['HTeamCode'])) else 'PP' if ((x['PlayersV'] == 1 ) & (x['PlayersH'] == 1)) else np.nan, axis=1)


- display goals for each game and drop duplicates.

In [114]:
dg = da.copy()
dg = dg[dg['EventType'] == 'GOAL']
dg['hgoal'] = dg.apply(lambda x: 1 if x['EventTeamCode'] == x['HTeamCode'] else 0, axis=1)
dg['vgoal'] = dg.apply(lambda x: 1 if x['EventTeamCode'] == x['VTeamCode'] else 0, axis=1)
dg['HGF'] = dg.groupby(['Season', 'GameNumber', 'HTeamCode'])['hgoal'].transform('sum')
dg['VGF'] = dg.groupby(['Season', 'GameNumber', 'VTeamCode'])['vgoal'].transform('sum')
dg = dg.drop_duplicates(['Season', 'GameNumber'])
dg = dg[['Season', 'GameNumber', 'VTeamCode', 'HTeamCode', 'VGF', 'HGF']]

- find the goal differential per game with respect to home team.

In [115]:
dg['GD'] = dg['HGF'] - dg['VGF']
dg['WinTeam'] = dg.apply(lambda x: x['HTeamCode'] if x['GD'] > 0 else x['VTeamCode'], axis=1)
dg['LossTeam'] = dg.apply(lambda x: x['HTeamCode'] if x['WinTeam'] != x['HTeamCode'] else x['VTeamCode'], axis=1)

- display goals againest per team.

In [116]:
dg['VGA'] = dg['HGF']
dg['HGA'] = dg['VGF']

In [117]:
dg = dg.sort_values(['Season'], ascending=[True])
dg.head()

Unnamed: 0,Season,GameNumber,VTeamCode,HTeamCode,VGF,HGF,GD,WinTeam,LossTeam,VGA,HGA
43,2011,20001,PHI,BOS,2,1,-1,PHI,BOS,1,2
252331,2011,20826,VAN,CGY,3,4,1,CGY,VAN,4,3
251980,2011,20825,CHI,PHX,0,3,3,PHX,CHI,3,0
251755,2011,20824,CBJ,MIN,3,1,-2,CBJ,MIN,1,3
251436,2011,20823,COL,STL,2,3,1,STL,COL,3,2


In [118]:
dg.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/2011_2017_season_game_data.csv', index='False', sep=',')
#dg.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/2011_2017_season_game_data.csv', index='False', sep=',')

## season_level_data

In [119]:
dm = da.copy()

- events that happened in regulation time only

In [120]:
dm = dm[dm['Period'] <= 3]
dm = dm[dm['Period'] >= 1]

- **reshape the data set from wide to long.**

In [121]:
dm = dm.sort_values(['Season', 'GameNumber', 'Period', 'EventNumber'], ascending=[True, True, True, True])

In [122]:
a = [col for col in dm.columns if 'VPlayer' in col]
b = [col for col in dm.columns if 'HPlayer' in col]
dm = pd.lreshape(dm, {'VPlayer' : a, 'HPlayer' : b})

In [123]:
dm.shape

(12012909, 22)

In [124]:
dm.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/2011_2017_play_by_play.csv', index='False', sep=',')
#dm.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/2011_2017_play_by_play.csv', index='False', sep=',')

#### create new data set and keep variables: 
- (a) game number.
- (b) visitor team information.
- (c) home team information.

In [125]:
df = dm.copy()
df = df[['Season', 'GameNumber', 'VTeamCode', 'VPlayer', 'HTeamCode', 'HPlayer']]
df = df.sort_values(['Season', 'GameNumber'], ascending=[True, True])
df.head()

Unnamed: 0,Season,GameNumber,VTeamCode,VPlayer,HTeamCode,HPlayer
0,2011,20001,PHI,DANNY BRIERE,BOS,DAVID KREJCI
1,2011,20001,PHI,DANNY BRIERE,BOS,DAVID KREJCI
2,2011,20001,PHI,DANNY BRIERE,BOS,DAVID KREJCI
3,2011,20001,PHI,DANNY BRIERE,BOS,DAVID KREJCI
4,2011,20001,PHI,DANNY BRIERE,BOS,PATRICE BERGERON


- merge season_game_data (dg) on new dataset

In [126]:
df = pd.merge(df, dg, on=['Season', 'GameNumber', 'VTeamCode', 'HTeamCode'], how='left')
df.head()    

Unnamed: 0,Season,GameNumber,VTeamCode,VPlayer,HTeamCode,HPlayer,VGF,HGF,GD,WinTeam,LossTeam,VGA,HGA
0,2011,20001,PHI,DANNY BRIERE,BOS,DAVID KREJCI,2,1,-1,PHI,BOS,1,2
1,2011,20001,PHI,DANNY BRIERE,BOS,DAVID KREJCI,2,1,-1,PHI,BOS,1,2
2,2011,20001,PHI,DANNY BRIERE,BOS,DAVID KREJCI,2,1,-1,PHI,BOS,1,2
3,2011,20001,PHI,DANNY BRIERE,BOS,DAVID KREJCI,2,1,-1,PHI,BOS,1,2
4,2011,20001,PHI,DANNY BRIERE,BOS,PATRICE BERGERON,2,1,-1,PHI,BOS,1,2


- reshape the data to have home and visitor team observatons under the same coloumns. 

In [127]:
a = [col for col in df.columns if 'Player' in col]
b = [col for col in df.columns if 'TeamCode' in col]
c = [col for col in df.columns if 'GF' in col]
d = [col for col in df.columns if 'GA' in col]
df = pd.lreshape(df, {'PlayerName' : a, 'TeamCode' : b, 'GF' : c, 'GA' : d })
df = df[['Season', 'GameNumber', 'TeamCode', 'PlayerName', 'GF', 'GA', 'GD', 'WinTeam', 'LossTeam']]
df.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerName,GF,GA,GD,WinTeam,LossTeam
0,2011,20001,PHI,DANNY BRIERE,2,1,-1,PHI,BOS
1,2011,20001,PHI,DANNY BRIERE,2,1,-1,PHI,BOS
2,2011,20001,PHI,DANNY BRIERE,2,1,-1,PHI,BOS
3,2011,20001,PHI,DANNY BRIERE,2,1,-1,PHI,BOS
4,2011,20001,PHI,DANNY BRIERE,2,1,-1,PHI,BOS


### import player position and rankings

In [134]:
dp = pd.read_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/2011_2017_player_position.csv')
#dp = pd.read_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/2011_2017_player_position.csv')
dp = dp.drop('Unnamed: 0', axis=1)

In [136]:
dr = pd.read_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/2011_2017_player_rank.csv')
#dr = pd.read_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/2011_2017_player_rank.csv')
dr = dr.drop('Unnamed: 0', axis=1)

- merge player position and player rankings

In [137]:
ds = pd.merge(dp, dr, on=['Season', 'TeamCode', 'PlayerName', 'Position'], how='left')
ds.shape

(7336, 5)

In [138]:
ds.isnull().sum()

Season           0
TeamCode         0
PlayerName       0
Position         0
Rank          1581
dtype: int64

skaters that were not included in the clusters and therefore not ranked, forwards are assigned to the 4th line, defensemen to the bottom (3rd) pairing and goaltenders to backup. 

In [139]:
ds['Rank'] = ds.apply(lambda x: 3 if ((x['Position'] == 'G') & (pd.isnull(x.Rank))) else 3 if ((x['Position'] == 'D') & (pd.isnull(x.Rank))) else 4 if ((x['Position'] == 'W') & (pd.isnull(x.Rank))) else 4 if ((x['Position'] == 'C') & (pd.isnull(x.Rank))) else x['Rank'], axis=1)
#ds = ds.sort_values(['TeamCode'], ascending=[True])
ds.shape

(7336, 5)

In [140]:
ds.isnull().sum()

Season        0
TeamCode      0
PlayerName    0
Position      0
Rank          0
dtype: int64

- **display each player by team per game. Drop duplicates.**

In [141]:
dw = pd.merge(df, ds, on=['Season', 'TeamCode', 'PlayerName'], how='left')
dw = dw[['Season', 'GameNumber', 'TeamCode', 'PlayerName', 'Position', 'Rank', 'GF', 'GA', 'GD', 'WinTeam', 'LossTeam']]
dw.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerName,Position,Rank,GF,GA,GD,WinTeam,LossTeam
0,2011,20001,PHI,DANNY BRIERE,W,1.0,2,1,-1,PHI,BOS
1,2011,20001,PHI,DANNY BRIERE,W,1.0,2,1,-1,PHI,BOS
2,2011,20001,PHI,DANNY BRIERE,W,1.0,2,1,-1,PHI,BOS
3,2011,20001,PHI,DANNY BRIERE,W,1.0,2,1,-1,PHI,BOS
4,2011,20001,PHI,DANNY BRIERE,W,1.0,2,1,-1,PHI,BOS


In [148]:
dw.isnull().sum()

Season          0
GameNumber      0
TeamCode        0
PlayerName      0
Position      835
Rank          835
GF              0
GA              0
GD              0
WinTeam         0
LossTeam        0
dtype: int64

In [150]:
w = dw.copy()
w = w[w['Rank'].isnull()]
w['PlayerName'].value_counts()

DANIEL O'REGAN    835
Name: PlayerName, dtype: int64

In [151]:
dw['Position'] = dw['Position'].fillna('C')
dw['Rank'] = dw['Rank'].fillna(4)

In [152]:
dw.isnull().sum()

Season        0
GameNumber    0
TeamCode      0
PlayerName    0
Position      0
Rank          0
GF            0
GA            0
GD            0
WinTeam       0
LossTeam      0
dtype: int64

- create column that displays the position and roster count by team per game. To simplify matters, we categorize skaters into forwards and defensemen.

In [153]:
dw = dw.drop_duplicates(['Season', 'GameNumber', 'TeamCode', 'PlayerName'])
dw['RosterCount'] = dw.groupby(['Season', 'GameNumber', 'TeamCode'])['PlayerName'].transform('count')
dw['PositionCount'] = dw.groupby(['Season', 'GameNumber', 'TeamCode', 'Position'])['PlayerName'].transform('count')
dw.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


Unnamed: 0,Season,GameNumber,TeamCode,PlayerName,Position,Rank,GF,GA,GD,WinTeam,LossTeam,RosterCount,PositionCount
0,2011,20001,PHI,DANNY BRIERE,W,1.0,2,1,-1,PHI,BOS,19,9
5,2011,20001,PHI,CLAUDE GIROUX,W,1.0,2,1,-1,PHI,BOS,19,9
8,2011,20001,PHI,SEAN COUTURIER,C,2.0,2,1,-1,PHI,BOS,19,3
9,2011,20001,PHI,MAX TALBOT,C,2.0,2,1,-1,PHI,BOS,19,3
23,2011,20001,PHI,ZAC RINALDO,C,4.0,2,1,-1,PHI,BOS,19,3


- count the amount of forwards and defensemen by team per game.

In [154]:
dw['CCount'] = dw.apply(lambda x: x['PositionCount'] if x['Position']=='C' else np.NaN, 1)
dw['WCount'] = dw.apply(lambda x: x['PositionCount'] if x['Position']=='W' else np.NaN, 1)
dw['DCount'] = dw.apply(lambda x: x['PositionCount'] if x['Position']=='D' else np.NaN, 1)
dw['GCount'] = dw.apply(lambda x: x['PositionCount'] if x['Position']=='G' else np.NaN, 1)

dw['CCount'] = dw.groupby(['Season','GameNumber', 'TeamCode'])['CCount'].apply(lambda x: x.ffill().bfill())
dw['WCount'] = dw.groupby(['Season','GameNumber', 'TeamCode'])['WCount'].apply(lambda x: x.ffill().bfill())
dw['DCount'] = dw.groupby(['Season','GameNumber', 'TeamCode'])['DCount'].apply(lambda x: x.ffill().bfill())
dw['GCount'] = dw.groupby(['Season','GameNumber', 'TeamCode'])['GCount'].apply(lambda x: x.ffill().bfill())
dw.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: ht

Unnamed: 0,Season,GameNumber,TeamCode,PlayerName,Position,Rank,GF,GA,GD,WinTeam,LossTeam,RosterCount,PositionCount,CCount,WCount,DCount,GCount
0,2011,20001,PHI,DANNY BRIERE,W,1.0,2,1,-1,PHI,BOS,19,9,3.0,9.0,6.0,1.0
5,2011,20001,PHI,CLAUDE GIROUX,W,1.0,2,1,-1,PHI,BOS,19,9,3.0,9.0,6.0,1.0
8,2011,20001,PHI,SEAN COUTURIER,C,2.0,2,1,-1,PHI,BOS,19,3,3.0,9.0,6.0,1.0
9,2011,20001,PHI,MAX TALBOT,C,2.0,2,1,-1,PHI,BOS,19,3,3.0,9.0,6.0,1.0
23,2011,20001,PHI,ZAC RINALDO,C,4.0,2,1,-1,PHI,BOS,19,3,3.0,9.0,6.0,1.0


In [155]:
dw.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/2011_2017_season_team_game_player.csv', index='False', sep=',')
#dw.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/2011_2017_season_team_game_player.csv', index='False', sep=',')

In [156]:
dw1 = dw.drop_duplicates(['Season', 'GameNumber', 'TeamCode'])
dw1 = dw1.sort_values(['Season', 'GameNumber'], ascending=[True, True])

In [157]:
dw1.shape

(16282, 17)

In [158]:
dw1.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerName,Position,Rank,GF,GA,GD,WinTeam,LossTeam,RosterCount,PositionCount,CCount,WCount,DCount,GCount
0,2011,20001,PHI,DANNY BRIERE,W,1.0,2,1,-1,PHI,BOS,19,9,3.0,9.0,6.0,1.0
12012909,2011,20001,BOS,DAVID KREJCI,C,1.0,1,2,-1,PHI,BOS,19,6,6.0,6.0,6.0,1.0
1274,2011,20002,MTL,SCOTT GOMEZ,C,3.0,0,2,2,TOR,MTL,19,5,5.0,6.0,7.0,1.0
12014183,2011,20002,TOR,MIKHAIL GRABOVSKI,C,1.0,2,0,2,TOR,MTL,19,5,5.0,7.0,6.0,1.0
2640,2011,20003,PIT,JORDAN STAAL,C,1.0,5,3,-2,PIT,VAN,19,5,5.0,7.0,6.0,1.0


In [159]:
dw1.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/2011_2017_game_team_roster_player.csv', index='False', sep=',')
#dw1.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/2011_2017_game_team_roster_player.csv', index='False', sep=',')

### full regular season stats

In [160]:
dw2 = dw.copy()
dw2 = dw2.groupby(['Season', 'GameNumber', 'TeamCode', 'Position', 'GF', 'GA', 'RosterCount', 'PositionCount', 'CCount', 'WCount', 'DCount', 'WinTeam', 'LossTeam'], as_index=False)['Rank'].mean()
dw2.shape

(65088, 14)

- create columns for team win and team loss. 

In [161]:
dw2['TeamWin'] =  dw2.apply(lambda x: 1 if x['TeamCode']==x['WinTeam'] else 0, 1)
dw2['TeamLos'] =  dw2.apply(lambda x: 1 if x['TeamCode']==x['LossTeam'] else 0, 1)

- display games played, games won, games loss, goals for and goals against by team for the season.

In [162]:
dw2['GP'] = dw2.groupby(['Season', 'Position', 'TeamCode'])['GameNumber'].transform('count')
dw2['GW'] = dw2.groupby(['Season', 'Position', 'TeamCode'])['TeamWin'].transform('sum')
dw2['GL'] = dw2.groupby(['Season', 'Position', 'TeamCode'])['TeamLos'].transform('sum')
dw2['GF'] = dw2.groupby(['Season', 'Position', 'TeamCode'])['GF'].transform('sum')
dw2['GA'] = dw2.groupby(['Season', 'Position', 'TeamCode'])['GA'].transform('sum')
dw2.head()

Unnamed: 0,Season,GameNumber,TeamCode,Position,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL
0,2011,20001,BOS,C,279,211,19,6,6.0,6.0,6.0,PHI,BOS,1.5,0,1,82,49,33
1,2011,20001,BOS,D,279,211,19,6,6.0,6.0,6.0,PHI,BOS,1.666667,0,1,82,49,33
2,2011,20001,BOS,G,279,211,19,1,6.0,6.0,6.0,PHI,BOS,2.0,0,1,82,49,33
3,2011,20001,BOS,W,279,211,19,6,6.0,6.0,6.0,PHI,BOS,2.0,0,1,82,49,33
4,2011,20001,PHI,C,273,242,19,3,3.0,9.0,6.0,PHI,BOS,2.666667,1,0,82,47,35


- create columns with the mean ranking for forward and defenseman by team per game.

In [163]:
dw2['RankC'] = dw2.apply(lambda x: x['Rank'] if x['Position']=='C' else np.NaN, 1)
dw2['RankW'] = dw2.apply(lambda x: x['Rank'] if x['Position']=='W' else np.NaN, 1)
dw2['RankD'] = dw2.apply(lambda x: x['Rank'] if x['Position']=='D' else np.NaN, 1)
dw2['RankG'] = dw2.apply(lambda x: x['Rank'] if x['Position']=='G' else np.NaN, 1)

dw2['RankC'] = dw2.groupby(['Season','GameNumber', 'TeamCode'])['RankC'].apply(lambda x: x.ffill().bfill())
dw2['RankW'] = dw2.groupby(['Season','GameNumber', 'TeamCode'])['RankW'].apply(lambda x: x.ffill().bfill())
dw2['RankD'] = dw2.groupby(['Season','GameNumber', 'TeamCode'])['RankD'].apply(lambda x: x.ffill().bfill())
dw2['RankG'] = dw2.groupby(['Season','GameNumber', 'TeamCode'])['RankG'].apply(lambda x: x.ffill().bfill())
dw2.head()

Unnamed: 0,Season,GameNumber,TeamCode,Position,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL,RankC,RankW,RankD,RankG
0,2011,20001,BOS,C,279,211,19,6,6.0,6.0,6.0,PHI,BOS,1.5,0,1,82,49,33,1.5,2.0,1.666667,2.0
1,2011,20001,BOS,D,279,211,19,6,6.0,6.0,6.0,PHI,BOS,1.666667,0,1,82,49,33,1.5,2.0,1.666667,2.0
2,2011,20001,BOS,G,279,211,19,1,6.0,6.0,6.0,PHI,BOS,2.0,0,1,82,49,33,1.5,2.0,1.666667,2.0
3,2011,20001,BOS,W,279,211,19,6,6.0,6.0,6.0,PHI,BOS,2.0,0,1,82,49,33,1.5,2.0,1.666667,2.0
4,2011,20001,PHI,C,273,242,19,3,3.0,9.0,6.0,PHI,BOS,2.666667,1,0,82,47,35,2.666667,1.222222,1.5,2.0


- compute the mean per position by team for the season.

In [164]:
dw2['MeanC']= dw2.groupby(['Season', 'TeamCode'])['RankC'].transform('mean')
dw2['MeanW']= dw2.groupby(['Season', 'TeamCode'])['RankW'].transform('mean')
dw2['MeanD']= dw2.groupby(['Season', 'TeamCode'])['RankD'].transform('mean')
dw2['MeanG']= dw2.groupby(['Season', 'TeamCode'])['RankG'].transform('mean')
dw2.head()

Unnamed: 0,Season,GameNumber,TeamCode,Position,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL,RankC,RankW,RankD,RankG,MeanC,MeanW,MeanD,MeanG
0,2011,20001,BOS,C,279,211,19,6,6.0,6.0,6.0,PHI,BOS,1.5,0,1,82,49,33,1.5,2.0,1.666667,2.0,1.635889,1.92619,1.743612,1.804878
1,2011,20001,BOS,D,279,211,19,6,6.0,6.0,6.0,PHI,BOS,1.666667,0,1,82,49,33,1.5,2.0,1.666667,2.0,1.635889,1.92619,1.743612,1.804878
2,2011,20001,BOS,G,279,211,19,1,6.0,6.0,6.0,PHI,BOS,2.0,0,1,82,49,33,1.5,2.0,1.666667,2.0,1.635889,1.92619,1.743612,1.804878
3,2011,20001,BOS,W,279,211,19,6,6.0,6.0,6.0,PHI,BOS,2.0,0,1,82,49,33,1.5,2.0,1.666667,2.0,1.635889,1.92619,1.743612,1.804878
4,2011,20001,PHI,C,273,242,19,3,3.0,9.0,6.0,PHI,BOS,2.666667,1,0,82,47,35,2.666667,1.222222,1.5,2.0,2.502033,1.32439,1.827236,2.310976


- display the quantity of wins and losses per team for the whole season

In [165]:
dw2['L'] = dw2.apply(lambda x: x['GL'] if x['TeamCode']== x['LossTeam'] else (x['GP'] - x['GW']), 1)
dw2['W'] = dw2.apply(lambda x: x['GW'] if x['TeamCode']== x['WinTeam'] else (x['GP'] - x['GL']), 1)
dw2.head()

Unnamed: 0,Season,GameNumber,TeamCode,Position,GF,GA,RosterCount,PositionCount,CCount,WCount,DCount,WinTeam,LossTeam,Rank,TeamWin,TeamLos,GP,GW,GL,RankC,RankW,RankD,RankG,MeanC,MeanW,MeanD,MeanG,L,W
0,2011,20001,BOS,C,279,211,19,6,6.0,6.0,6.0,PHI,BOS,1.5,0,1,82,49,33,1.5,2.0,1.666667,2.0,1.635889,1.92619,1.743612,1.804878,33,49
1,2011,20001,BOS,D,279,211,19,6,6.0,6.0,6.0,PHI,BOS,1.666667,0,1,82,49,33,1.5,2.0,1.666667,2.0,1.635889,1.92619,1.743612,1.804878,33,49
2,2011,20001,BOS,G,279,211,19,1,6.0,6.0,6.0,PHI,BOS,2.0,0,1,82,49,33,1.5,2.0,1.666667,2.0,1.635889,1.92619,1.743612,1.804878,33,49
3,2011,20001,BOS,W,279,211,19,6,6.0,6.0,6.0,PHI,BOS,2.0,0,1,82,49,33,1.5,2.0,1.666667,2.0,1.635889,1.92619,1.743612,1.804878,33,49
4,2011,20001,PHI,C,273,242,19,3,3.0,9.0,6.0,PHI,BOS,2.666667,1,0,82,47,35,2.666667,1.222222,1.5,2.0,2.502033,1.32439,1.827236,2.310976,35,47


- compute win and loss percent by team. Drop duplicate observations.

In [166]:
dw2 = dw2[['Season', 'TeamCode', 'GP', 'L', 'W', 'GF', 'GA', 'MeanC', 'MeanW', 'MeanD', 'MeanG']]
dw2 = dw2.drop_duplicates(['Season', 'TeamCode'])
dw2['WinPc'] = dw2['W']/ dw2['GP']
dw2['LossPc'] = dw2['L']/ dw2['GP']

dw2 = dw2[['Season', 'TeamCode', 'GP','W', 'L', 'GF', 'GA', 'WinPc', 'LossPc', 'MeanC', 'MeanW', 'MeanD', 'MeanG']]
dw2.head()

Unnamed: 0,Season,TeamCode,GP,W,L,GF,GA,WinPc,LossPc,MeanC,MeanW,MeanD,MeanG
0,2011,BOS,82,49,33,279,211,0.597561,0.402439,1.635889,1.92619,1.743612,1.804878
4,2011,PHI,82,47,35,273,242,0.573171,0.426829,2.502033,1.32439,1.827236,2.310976
8,2011,MTL,82,31,51,220,235,0.378049,0.621951,2.094803,2.127584,2.011324,2.0
12,2011,TOR,82,35,47,235,268,0.426829,0.573171,2.744715,2.013357,1.720383,3.0
16,2011,PIT,82,51,31,292,229,0.621951,0.378049,1.815447,1.721453,2.071138,2.219512


- rank teams based on win percent, mean centres, wingers and defensemen. 

In [172]:
dw2['RankWin'] = dw2.groupby(['Season'])['WinPc'].rank(ascending=False)
dw2['RankC'] = dw2.groupby(['Season'])['MeanC'].rank(ascending=True)
dw2['RankW'] = dw2.groupby(['Season'])['MeanW'].rank(ascending=True)
dw2['RankD'] = dw2.groupby(['Season'])['MeanD'].rank(ascending=True)
dw2['RankG'] = dw2.groupby(['Season'])['MeanG'].rank(ascending=True)
dw2 = dw2.sort_values(['Season', 'RankWin', 'RankC', 'RankW', 'RankD', 'RankD'], ascending=[True, True, True, True, True, True])
dw2.head()

Unnamed: 0,Season,TeamCode,GP,W,L,GF,GA,WinPc,LossPc,MeanC,MeanW,MeanD,MeanG,RankWin,RankC,RankW,RankD,RankG
16,2011,PIT,82,51,31,292,229,0.621951,0.378049,1.815447,1.721453,2.071138,2.219512,2.0,5.0,2.0,24.0,24.0
20,2011,VAN,82,51,31,259,211,0.621951,0.378049,2.354878,1.869774,1.58856,1.634146,2.0,19.0,6.0,3.0,5.0
68,2011,NYR,82,51,31,231,192,0.621951,0.378049,2.44788,2.159553,1.908537,1.5,2.0,25.0,20.0,19.0,4.0
0,2011,BOS,82,49,33,279,211,0.597561,0.402439,1.635889,1.92619,1.743612,1.804878,4.5,4.0,8.0,10.0,6.0
116,2011,STL,82,49,33,215,169,0.597561,0.402439,2.331707,2.101239,1.776423,1.0,4.5,18.0,16.0,11.0,1.0


In [173]:
dw2.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/2011_2017_season_team_roster_rank.csv', index='False', sep=',')
#dw2.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/2011_2017_season_team_roster_rank.csv', index='False', sep=',')

### keep games that have 12 forwards and 6 defensemen per team!!

In [174]:
du = dw.copy()
du.shape

(310611, 17)

In [175]:
du['FCount'] = du['CCount']+ du['WCount']
du.head()

Unnamed: 0,Season,GameNumber,TeamCode,PlayerName,Position,Rank,GF,GA,GD,WinTeam,LossTeam,RosterCount,PositionCount,CCount,WCount,DCount,GCount,FCount
0,2011,20001,PHI,DANNY BRIERE,W,1.0,2,1,-1,PHI,BOS,19,9,3.0,9.0,6.0,1.0,12.0
5,2011,20001,PHI,CLAUDE GIROUX,W,1.0,2,1,-1,PHI,BOS,19,9,3.0,9.0,6.0,1.0,12.0
8,2011,20001,PHI,SEAN COUTURIER,C,2.0,2,1,-1,PHI,BOS,19,3,3.0,9.0,6.0,1.0,12.0
9,2011,20001,PHI,MAX TALBOT,C,2.0,2,1,-1,PHI,BOS,19,3,3.0,9.0,6.0,1.0,12.0
23,2011,20001,PHI,ZAC RINALDO,C,4.0,2,1,-1,PHI,BOS,19,3,3.0,9.0,6.0,1.0,12.0


In [176]:
du = du.groupby(['Season', 'GameNumber']).filter(lambda x: ((x['FCount'] == 12) & (x['DCount'] == 6)).all())
du.shape

(277752, 18)

In [177]:
du.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/2011_2017_12f_6d_g_game_team.csv', index='False', sep=',')
#du.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/2011_2017_12f_6d_g_game_team.csv', index='False', sep=',')

### keep games that have 4 C, 8 W, 6 D and 1G per team!!!!

In [178]:
dy = dw.copy()
dy.shape

(310611, 17)

In [179]:
dy = dy.groupby(['Season', 'GameNumber']).filter(lambda x: ((x['CCount'] == 4) & (x['WCount'] == 8) & (x['DCount'] == 6) & (x['GCount'] == 1)).all())
dy.shape

(10982, 17)

In [180]:
dy.to_csv('/Users/stefanostselios/Brock University/Kevin Mongeon - StephanosShare/out/data/2011_2017_4c_8w_6d_1g_game_team.csv', index='False', sep=',')
#dy.to_csv('/Users/kevinmongeon/Brock University/Steve Tselios - StephanosShare/out/data/2011_2017_4c_8w_6d_1g_game_team.csv', index='False', sep=',')