In [162]:
import pandas
import requests 
from bs4 import BeautifulSoup
import json
import numpy as np

import pickle


# Derive some stats and stuff

The NHL data doesn't quite have everything, so we need to derive a few simple statistics from it (notably, save percentage).

In [3]:
# Load the scraped game rows. Some seasons appear to have been scraped more
# than once, so keep a single row per (gameId, teamAbbrev, game_type).
game_data_df = pandas.read_csv('/Users/greg/Desktop/nhl_game_data.csv').drop_duplicates(
    subset=['gameId', 'teamAbbrev', 'game_type']
)

In [4]:
# Share of shots against that did not end up as goals against.
game_data_df['save_percentage'] = 1.0 - game_data_df['goalsAgainst'].mul(1.0).div(game_data_df['shotsAgainst'])

# Indicator columns, stored as 1.0 / 0.0 rather than booleans.
game_data_df['home_game'] = game_data_df['gameLocationCode'].eq('H').mul(1.0)
# NOTE(review): this flags every row with exactly 2 points; whether that means
# "regulation" wins only depends on how the source awards points - confirm.
game_data_df['regulation_win'] = game_data_df['points'].eq(2).mul(1.0)

# Ratios relative to the row's gamesPlayed value.
game_data_df['win_percentage'] = game_data_df['wins'].mul(1.0).div(game_data_df['gamesPlayed'].mul(1.0))
game_data_df['standing_points_per_game'] = game_data_df['points'].mul(1.0).div(game_data_df['gamesPlayed'].mul(1.0))

# Compute rolling statistics

In [5]:
# Convert gameDate to datetimes and order the rows by it. The expanding and
# rolling statistics computed further down depend on this row order.
game_data_df = (
    game_data_df
    .assign(gameDate=pandas.to_datetime(game_data_df['gameDate']))
    .sort_values(by='gameDate')
)

In [6]:
def convert_stat(df, metric, rolling_game_windows):
    """Add lagged expanding and rolling means of ``metric`` to ``df``.

    Every statistic is computed separately for each ``(season_id, teamAbbrev)``
    group, following the current row order of ``df``, and is then shifted down
    by one row inside the group. A row therefore only reflects earlier rows of
    its own group, and the first row of each group is NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``season_id``, ``teamAbbrev`` and ``metric`` columns. It is
        modified in place (new columns are added).
    metric : str
        Name of the column to summarise.
    rolling_game_windows : iterable of int
        Window lengths, in rows, for the rolling means.

    Returns
    -------
    tuple (pandas.DataFrame, list of str)
        The same ``df`` object, and the names of the added columns:
        ``<metric>_expanding_mean`` followed by one
        ``<metric>_rolling_mean_<window>`` per requested window.
    """
    group_keys = ['season_id', 'teamAbbrev']

    def lagged(summarise):
        # transform() returns a Series aligned to df's own index, so the
        # assignment below cannot be thrown off by group keys being added to
        # the index (which groupby(...).apply may do).
        return df.groupby(group_keys)[metric].transform(
            lambda values: summarise(values).shift(1)
        )

    # pandas.expanding_mean / pandas.rolling_mean were removed from pandas;
    # the .expanding() / .rolling() accessors are their replacements.
    expanding_col = metric + '_expanding_mean'
    df[expanding_col] = lagged(lambda values: values.expanding().mean())
    added_stats = [expanding_col]

    for window_size in rolling_game_windows:
        col_name = metric + '_rolling_mean_' + str(window_size)
        # min_periods=1 yields a mean as soon as one observation is available,
        # which is what the previous min_periods=0 call produced for a mean.
        df[col_name] = lagged(
            lambda values: values.rolling(window_size, min_periods=1).mean()
        )
        added_stats.append(col_name)

    return df, added_stats

In [7]:
# Columns that get lagged expanding / rolling means via convert_stat.
metrics = [
    'shotsFor',
    'shotsAgainst',
    'save_percentage',
    'faceoffWinPctg',
    'goalsFor',
    'goalsAgainst',
    'ppOpportunities',
    'shNumTimes',
    'ppPctg',
    'points',
]
# Rolling window lengths passed to convert_stat.
window_sizes = [5, 10, 25]

# Collect the name of every generated column across all metrics.
team_stats = []
for metric_name in metrics:
    game_data_df, generated_columns = convert_stat(game_data_df, metric_name, window_sizes)
    team_stats += generated_columns
    
# Index the generated stats by (teamAbbrev, gameId) so each row can look up
# the values of the team named in its opponentTeamAbbrev column for that game.
opponent_data = game_data_df.groupby(['teamAbbrev', 'gameId'])[team_stats].min()

# Names the joined-in columns receive through the '_opponent' suffix below.
opponent_stats = [stat_name + '_opponent' for stat_name in team_stats]

game_data_df = game_data_df.join(
    opponent_data,
    on=['opponentTeamAbbrev', 'gameId'],
    rsuffix='_opponent',
)

# Team abbreviation concatenated with the season id, for the row's team and
# for its opponent.
season_id_text = game_data_df['season_id'].astype(str)
game_data_df['team_season_id'] = game_data_df['teamAbbrev'] + season_id_text
game_data_df['team_season_id_opponent'] = game_data_df['opponentTeamAbbrev'] + season_id_text

# Write the enriched game table out for the later cells to reload.
game_data_df.to_csv('/Users/greg/Desktop/nhl_game_data_converted.csv', index=False, encoding='utf-8')

import pickle

# Persist the generated column-name lists next to the notebook.
# open() is used instead of file(), which only exists in Python 2; pickle
# data is binary, hence 'wb'; the with-blocks close the handles once the dump
# is finished.
with open('team_stats.txt', 'wb') as team_stats_file:
    pickle.dump(team_stats, team_stats_file)
with open('opponent_stats.txt', 'wb') as opponent_stats_file:
    pickle.dump(opponent_stats, opponent_stats_file)

In [3]:
def create_playoff_prediction_data(regular_season_data, playoff_data, team_stats):
    """Attach season-end regular-season stats to playoff rows.

    For each ``(teamAbbrev, season_id)`` group of ``regular_season_data`` the
    ``groupby(...).last()`` value of every column in ``team_stats`` plus
    ``team_season_id`` is taken. Those values are joined onto a copy of the
    playoff rows twice: once keyed on the row's team, and once keyed on its
    opponent, in which case overlapping column names get an ``_opponent``
    suffix.

    Parameters
    ----------
    regular_season_data : pandas.DataFrame
        Needs ``teamAbbrev``, ``season_id``, ``team_season_id`` and every
        column named in ``team_stats``.
    playoff_data : pandas.DataFrame
        Needs ``teamAbbrev``, ``opponentTeamAbbrev``, ``season_id``, ``wins``
        and ``home_game``; other columns are dropped from the result.
    team_stats : list of str
        Stat columns to carry over.

    Returns
    -------
    pandas.DataFrame
        One row per input playoff row, with the team's and the opponent's
        season-end values appended.
    """
    own_keys = ['teamAbbrev', 'season_id']
    opponent_keys = ['opponentTeamAbbrev', 'season_id']
    carried_columns = team_stats + ['team_season_id']
    kept_columns = ['teamAbbrev', 'opponentTeamAbbrev', 'season_id', 'wins', 'home_game']

    season_end_values = regular_season_data.groupby(own_keys)[carried_columns].last()

    # Work on a copy so the caller's playoff frame is left untouched.
    playoff_rows = playoff_data[kept_columns].copy()
    with_own_stats = playoff_rows.join(season_end_values, on=own_keys)
    return with_own_stats.join(season_end_values, on=opponent_keys, rsuffix='_opponent')

In [125]:
# Reload the converted table, discard rows containing any missing value, and
# keep only the first row for each (game_type, gameId) pair.
data = (
    pandas.read_csv('/Users/greg/Desktop/nhl_game_data_converted.csv')
    .dropna()
    .drop_duplicates(subset=['game_type', 'gameId'])
)

In [160]:
# Reload the column-name lists saved earlier in the notebook.
# open() replaces the Python-2-only file() builtin, 'rb' matches pickle's
# binary format, and the with-blocks close the handles after loading.
# NOTE: pickle.load can execute arbitrary code - only load files that this
# notebook wrote itself.
with open('team_stats.txt', 'rb') as team_stats_file:
    team_stats = pickle.load(team_stats_file)
with open('opponent_stats.txt', 'rb') as opponent_stats_file:
    opponent_stats = pickle.load(opponent_stats_file)

# Split the rows by game_type.
regular_season_data = data.loc[data['game_type'] == 'regular']
_playoff_data = data.loc[data['game_type'] == 'playoff']

# Playoff rows enriched with each side's season-end regular-season stats.
playoff_data = create_playoff_prediction_data(
    regular_season_data,
    _playoff_data,
    team_stats,
)

In [None]:
# Write both modelling tables to the working directory, without the index.
for frame_to_save, csv_name in (
    (regular_season_data, 'regular_season_data.csv'),
    (playoff_data, 'playoff_data.csv'),
):
    frame_to_save.to_csv(csv_name, index=False)

# Generate playoff schedule

In [135]:
# groupby(...).last() values of every team stat (plus team_season_id) for each
# (teamAbbrev, season_id) group of the regular-season rows.
season_end_team_stats = (
    regular_season_data
    .groupby(['teamAbbrev', 'season_id'])[team_stats + ['team_season_id']]
    .last()
)

# Same two joins that create_playoff_prediction_data performs: the season-end
# values are attached once for the row's team and once for its opponent.
temp_playoff_data = playoff_data[['teamAbbrev', 'opponentTeamAbbrev', 'season_id', 'wins', 'home_game']].copy()
playoff_games = (
    temp_playoff_data
    .join(season_end_team_stats, on=['teamAbbrev', 'season_id'])
    .join(season_end_team_stats, on=['opponentTeamAbbrev', 'season_id'], rsuffix='_opponent')
)

# Season-end rows restricted to season_id 20152016.
current_season_data = season_end_team_stats.reset_index().query('season_id == 20152016')

In [136]:
def make_schedule(home_ice_team, away_ice_team):
    """Build the seven game records of one series.

    The first four records list ``home_ice_team`` as ``home_team``; the last
    three list ``away_ice_team`` as ``home_team``. Every record carries the
    same ``series_name``, ``"<home_ice_team>_<away_ice_team>"``.

    Returns
    -------
    list of dict
        Seven separate dicts with keys ``home_team``, ``away_team`` and
        ``series_name``.
    """
    series_name = home_ice_team + '_' + away_ice_team

    def game(host, visitor):
        # A fresh dict per game, so records never share state.
        return {'home_team': host, 'away_team': visitor, 'series_name': series_name}

    schedule = []
    for _ in range(4):
        schedule.append(game(home_ice_team, away_ice_team))
    for _ in range(3):
        schedule.append(game(away_ice_team, home_ice_team))
    return schedule

In [143]:
# First-round series as (home_ice_team, away_ice_team) pairs, in the order
# they are appended to the schedule.
first_round_series = [
    ('PIT', 'NYR'),
    ('WSH', 'PHI'),
    ('FLA', 'NYI'),
    ('TBL', 'DET'),
    ('DAL', 'MIN'),
    ('STL', 'CHI'),
    ('ANA', 'NSH'),
    ('LAK', 'SJS'),
]

playoff_schedule = []
for series_home_ice, series_away_ice in first_round_series:
    playoff_schedule.extend(make_schedule(series_home_ice, series_away_ice))



temp_team_stats = team_stats + ['team_season_id']

# Current-season values keyed by team abbreviation, used for both joins below.
current_team_lookup = current_season_data.set_index('teamAbbrev')[temp_team_stats]

# One row per scheduled game: the home team's values come first, then the
# away team's values with an '_opponent' suffix. Each row is written from the
# home team's point of view, so home_game is always 1.
schedule_df = (
    pandas.DataFrame(playoff_schedule)
    .join(current_team_lookup, on='home_team')
    .join(current_team_lookup, on='away_team', rsuffix='_opponent')
    .assign(home_game=1)
)

schedule_df.to_csv('round_1_schedule.csv', index=False)