In [402]:
import sys
import os
import pandas as pd
import numpy as np
import datetime, time
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
from pylab import hist, show
import scipy
import zipfile


# Notebook display limits: render at most 50 rows and 200 columns of a frame.
for option_name, option_value in (
    ('display.max_rows', 50),
    ('display.max_columns', 200),
):
    pd.set_option(option_name, option_value)

In [403]:
# Show the working directory that the relative CSV paths in this notebook
# resolve against. Plain Python is used instead of the bare `pwd` automagic,
# which only works inside IPython with automagic enabled and no variable
# named `pwd` in scope.
os.getcwd()

'/Users/stefanostselios/Desktop/nhl_roster_design-master'

### import data and keep regular season games.

In [404]:
# Load the game-level table, discard the 'Unnamed: 0' column (presumably an
# index that was written into the CSV -- confirm), and keep only rows whose
# GameNumber is at most 21230, the cut-off used here for regular-season games.
dm = (
    pd.read_csv('season_games.csv')
    .drop(columns='Unnamed: 0')
    .loc[lambda games: games['GameNumber'] <= 21230]
)

In [406]:
# The incoming 'WinTeam' column (compared against 'HOME' further down) is
# renamed to 'Win' so that the name 'WinTeam' can be reused for a team code.
dm = dm.rename({'WinTeam': 'Win'}, axis='columns')

- create win and loss team columns.

In [407]:
# Winner's team code: the home code where Win == 'HOME', otherwise the
# visitor code. Series.where keeps the home code where the condition holds and
# substitutes the visitor code elsewhere; this is vectorised, unlike the
# previous row-by-row apply, and also works on an empty frame.
dm['WinTeam'] = dm['HTeamCode'].where(dm['Win'] == 'HOME', dm['VTeamCode'])

In [408]:
# Loser's team code: the visitor code where Win == 'HOME', otherwise the home
# code (the mirror image of WinTeam). Vectorised with Series.where instead of
# a row-by-row apply.
dm['LossTeam'] = dm['VTeamCode'].where(dm['Win'] == 'HOME', dm['HTeamCode'])

In [409]:
# Narrow the game table to the identifier and outcome columns used below.
outcome_columns = [
    'Season', 'GameNumber', 'VTeamCode', 'HTeamCode',
    'Win', 'WinTeam', 'LossTeam',
]
df = dm.loc[:, outcome_columns]

- reshape data to have home and visitor team in one column.

In [410]:
# Stack every column whose name contains 'Code' (the visitor and home team
# codes) into a single 'TeamCode' column, so each game contributes one row per
# participating team. The other columns are carried along unchanged.
team_code_columns = list(filter(lambda name: 'Code' in name, df.columns))
df = pd.lreshape(df, groups={'TeamCode': team_code_columns})

In [411]:
# Fix the column order, then order the rows by season and game number.
df = (
    df[['Season', 'GameNumber', 'TeamCode', 'WinTeam', 'LossTeam', 'Win']]
    .sort_values(by=['Season', 'GameNumber'], ascending=True)
)

- calculate the games played and won by each team. Drop duplicates.

In [412]:
# GP (games played): for every row, the number of non-null GameNumber entries
# sharing that row's Season and TeamCode.
games_per_team = df.groupby(['Season', 'TeamCode'])['GameNumber']
df['GP'] = games_per_team.transform('count')

In [414]:
# Collapse back to one row per game (the first row seen for each
# Season/GameNumber pair) so that wins are not double-counted below.
# The GP value on the surviving row is the one computed for that row's TeamCode.
df = df.drop_duplicates(subset=['Season', 'GameNumber'], keep='first')

In [416]:
# GW (games won): for every row, the number of non-null GameNumber entries
# sharing that row's Season and WinTeam.
wins_per_team = df.groupby(['Season', 'WinTeam'])['GameNumber']
df['GW'] = wins_per_team.transform('count')

In [419]:
# Keep a single row (the first encountered) for each Season/WinTeam pair.
df = df.drop_duplicates(subset=['Season', 'WinTeam'], keep='first')

In [423]:
# Retain only the season, the two team columns and the game counts.
totals_columns = ['Season', 'WinTeam', 'LossTeam', 'GP', 'GW']
df = df.loc[:, totals_columns]

- reshape data and drop duplicates by teamcode.

In [425]:
# Stack the columns whose name contains 'Team' (WinTeam, LossTeam) into one
# 'TeamCode' column. lreshape concatenates the listed columns in order, so all
# WinTeam rows come first, followed by all LossTeam rows; every stacked row
# carries the GP/GW values of the row it came from.
team_columns = [name for name in df.columns if 'Team' in name]
df = pd.lreshape(df, {'TeamCode': team_columns})
df = df[['Season', 'TeamCode', 'GP', 'GW']]
# De-duplicate BEFORE sorting: keep='first' then deterministically selects the
# WinTeam-derived row (whose GW belongs to that team) whenever one exists.
# Sorting first made this depend on the multi-column sort preserving the input
# order of equal keys, which sort_values does not promise.
df = df.drop_duplicates(['Season', 'TeamCode'], keep='first')
df = df.sort_values(['Season', 'TeamCode'], ascending=[True, True])
# NOTE(review): a team with no wins in a season appears only as a LossTeam and
# would inherit the GW of the team that beat it. GP is likewise the value that
# survived the earlier per-game de-duplication, not necessarily this team's own
# count. Verify this is acceptable for the data set.

In [426]:
# GL (games lost): games played minus games won.
df['GL'] = df['GP'].sub(df['GW'])

In [427]:
# WinPc: share of played games that were won (a fraction, not a percentage).
df['WinPc'] = df['GW'].div(df['GP'])

In [428]:
# LossPc: share of played games that were lost (a fraction, not a percentage).
df['LossPc'] = df['GL'].div(df['GP'])

### rank teams based on their winning percentage

In [438]:
# Rank teams within each season by WinPc, best first. Tied teams share the
# average of the positions they occupy (hence values such as 3.5).
win_pct_by_season = df.groupby('Season')['WinPc']
df['Rank'] = win_pct_by_season.rank(method='average', ascending=False)

In [439]:
# Present the standings season by season, best-ranked team first.
df = df.sort_values(by=['Season', 'Rank'], ascending=True)

In [440]:
# Sanity check: (number of rows, number of columns) of the standings table.
df.shape

(30, 8)

In [441]:
# Preview the first 30 rows of the ranked standings.
df.head(n=30)

Unnamed: 0,Season,TeamCode,GP,GW,GL,WinPc,LossPc,Rank
24,2010,VAN,82,54,28,0.658537,0.341463,1.0
21,2010,PIT,82,49,33,0.597561,0.402439,2.0
5,2010,SJ,82,48,34,0.585366,0.414634,3.5
13,2010,WSH,82,48,34,0.585366,0.414634,3.5
26,2010,ANA,82,47,35,0.573171,0.426829,6.0
7,2010,DET,82,47,35,0.573171,0.426829,6.0
1,2010,PHI,82,47,35,0.573171,0.426829,6.0
19,2010,BOS,82,46,36,0.560976,0.439024,9.0
18,2010,LA,82,46,36,0.560976,0.439024,9.0
14,2010,TB,82,46,36,0.560976,0.439024,9.0


In [442]:
# Persist the standings. `index` must be the boolean False: the previous
# string 'False' is truthy, so the row index was still written and came back
# as an 'Unnamed: 0' column on read.
# NOTE(review): any reader of season_teams.csv that drops 'Unnamed: 0' must be
# updated, since that column is no longer produced -- verify consumers.
df.to_csv('season_teams.csv', index=False)