In [86]:
# First let's bring in a few imports. Since we need to scrape some web data I'm going to bring in the
# urllib and json libraries
import urllib3
import json
import itertools

# And the standard data science data manipulation imports
import pandas as pd
import numpy as np

# The goal in this case is to predict the winner or loser of a given match using logistic regression. For
# the time being I am going to take out ties. Concepturally our feature set will include three things, the
# performance of the two teams (home and away) based on their statistics this season so far, the salary cap
# for the teams which is an indicator of the value of players (one would think teams which pay more will
# have stronger players earning that money), and the standings of the teams (again, home and away) from the
# previous season.

# Take note of the temporal nature of some of this data -- as we gain more information about the season our
# machine learning model should be able to better predict the immediate future. Also, we use last season's
# stats to help incorporate prior knowledge, but we could look back at several seasons and this would effect
# the model accuracy

In [87]:
# First step is that I want to write a function to retrieve data from the wonderful NHL APIs which
# are available directly from the NHL. In this case we're going to build a model for the 2017-2018
# season. This season had 1,271 games in it

def get_game_data(gameid=1):
    '''Retrieves individual game data and creates a DataFrame of the home and away teams and their
    score results. This function assumes we are interested in the season 2017 and that we want
    only regular season games.

    :param gameid: The game number to retrieve data from.
    '''
    # We can pull down the JSON data directly from the NHL API
    game_url=f'https://statsapi.web.nhl.com/api/v1/game/201702{str(gameid).zfill(4)}/feed/live'
    http = urllib3.PoolManager()
    r = http.request('GET', game_url)
    data=json.loads(r.data)

    # The JSON data is pretty rich. For this analysis we want to get information on scoring which
    # is in the goals JSON object
    results=data['liveData']['plays']['currentPlay']['about']['goals']

    # We also need to get information on the home and away team names. This isn't useful for our
    # model per se, since we want to predict just whether the home team or the away team will win,
    # but we need this in order to connect to our other data sources
    teams={'home_team': data['gameData']['teams']['home']['name'], 'away_team': data['gameData']['teams']['away']['name']}

    # And we'll include when the game happened
    time={'time': data['metaData']['timeStamp']}

    # Now we can just bring these three dictionaries together. This might be unfamiliar syntax, 
    # it's called dictionary unpacking, but it just breaks each dictionary up and creates a new
    # dictionary which combines them all. In the end we want to work with pandas DataFrame objects
    # so that's what we can return to the caller (indexed by the time of the game)
    row={**results,**teams,**time}
    return pd.DataFrame(row, index=[row["time"]])

# Commented out for Coursera, you can uncomment the code below if you are running locally.
# Now let's just call this function for every game in the season, 1 through 1,271
# game_results=pd.concat( [get_game_data(x) for x in range(1,1271)] )

# And now that we've pulled this down, I'm going to save it for offline use
# game_results.to_csv("assets/game_results.csv")

# And let's take a look at that DataFrame
# game_results.head()

In [88]:
# If you are on coursera, you'll want to load the datafile
game_results=pd.read_csv("assets/game_results.csv",index_col=0)
game_results

Unnamed: 0,away,home,home_team,away_team,time
20171006_173713,7,2,Winnipeg Jets,Toronto Maple Leafs,20171006_173713
20171008_080732,5,4,Pittsburgh Penguins,St. Louis Blues,20171008_080732
20171008_080734,0,3,Edmonton Oilers,Calgary Flames,20171008_080734
20171008_160734,5,3,San Jose Sharks,Philadelphia Flyers,20171008_160734
20171009_153739,3,4,Boston Bruins,Nashville Predators,20171009_153739
...,...,...,...,...,...
20180409_215908,2,5,Colorado Avalanche,St. Louis Blues,20180409_215908
20180410_202907,3,0,Arizona Coyotes,Anaheim Ducks,20180410_202907
20180410_145907,1,7,Calgary Flames,Vegas Golden Knights,20180410_145907
20180411_035911,2,2,Edmonton Oilers,Vancouver Canucks,20180411_035911


In [91]:
game_results[(game_results['home_team'] == 'Vegas Golden Knights') | (game_results['away_team'] == 'Vegas Golden Knights')]

Unnamed: 0,away,home,home_team,away_team,time
20171010_223738,2,1,Dallas Stars,Vegas Golden Knights,20171010_223738
20171011_133742,2,1,Arizona Coyotes,Vegas Golden Knights,20171011_133742
20171014_220740,2,5,Vegas Golden Knights,Arizona Coyotes,20171014_220740
20171017_163741,6,3,Vegas Golden Knights,Detroit Red Wings,20171017_163741
20171019_213731,1,3,Vegas Golden Knights,Boston Bruins,20171019_213731
...,...,...,...,...,...
20180402_151552,3,4,Vegas Golden Knights,St. Louis Blues,20180402_151552
20180404_181600,2,3,Vegas Golden Knights,San Jose Sharks,20180404_181600
20180407_063426,4,4,Vancouver Canucks,Vegas Golden Knights,20180407_063426
20180409_212933,3,4,Edmonton Oilers,Vegas Golden Knights,20180409_212933


In [92]:
# So this includes our game-by-game breakdown of the season. We need to add a new column which
# indicates which team won, either home or away. We can do this by setting the default winner to
# away then looking at the game scores and flipping it to home where appropriate.
game_results["outcome_categorical"]="away"
game_results.loc[ (game_results["away"]<game_results["home"]), "outcome_categorical"]="home"
game_results.head()

Unnamed: 0,away,home,home_team,away_team,time,outcome_categorical
20171006_173713,7,2,Winnipeg Jets,Toronto Maple Leafs,20171006_173713,away
20171008_080732,5,4,Pittsburgh Penguins,St. Louis Blues,20171008_080732,away
20171008_080734,0,3,Edmonton Oilers,Calgary Flames,20171008_080734,home
20171008_160734,5,3,San Jose Sharks,Philadelphia Flyers,20171008_160734,away
20171009_153739,3,4,Boston Bruins,Nashville Predators,20171009_153739,home


In [None]:
# Commented out for Coursera, you can uncomment the code below if you are running locally.

# Now, let's bring in salary information. I'm going to pull this down from a website called
# cap friendly. This website does not have an API, so we need to scrape it. Thankfully,
# pandas has a function which aims to turn HTML tables into DataFrames for us automatically
# called read_html(). The result of this function is a list of DataFrames, and I've manually
# inspected this to see that there is only one which has all of our cap information.
# salary=pd.read_html("https://www.capfriendly.com/archive/2017")[0]

# Now this website has pretty values of dollars, but we just want these as numeric values,
# so I'm going to change our column of interest (the final cap hit) to be stripped of
# commas and dollar signs
# salary["FINAL CAP HIT"]=salary["FINAL CAP HIT"].str.replace(',', '').str.replace('$', '').astype(int)

# Let's store this data to a file too
# salary.to_csv("assets/salary.csv",index=False)
# salary.head()

In [93]:
# If you are on coursera, you'll want to load the datafile
salary=pd.read_csv("assets/salary.csv")
salary.head()

Unnamed: 0,TEAM,FINAL CAP HIT,LTIR USED,FINAL CAP SPACE
0,San Jose Sharks,73286670,"$455,625","$168,955"
1,Philadelphia Flyers,73546558,"$598,173","$51,615"
2,Calgary Flames,73824956,"$824,956",$0
3,Arizona Coyotes,75286038,"$2,286,038",$0
4,Anaheim Ducks,76957176,"$3,957,176",$0


In [94]:
# The dirty secret of data science and analytics is that most of the work is in obtaining
# and cleaning data. It's good to build in some checks to see that all of the teams in our
# salary data are actually in the game data we had. We can do this through a set difference.
set( game_results["home_team"].unique() ) - set( salary["TEAM"] )

{'Montréal Canadiens', 'Vegas Golden Knights'}

In [95]:
# Ok, so there are two problem teams, the Canadiens and the Golden Knights. Now, as a die hard
# Canadian who is also a strong Oilers fan I don't have a problem dropping the Canadiens from
# our analysis completely, but it turns out my wife (a french Canadian) disagrees with this
# so instead let's rename the team in our salary data
salary["TEAM"]=salary["TEAM"].replace("Montreal Canadiens","Montréal Canadiens")

# And I'm going to promote the team column to the index of the dataframe, and just get rid
# of the columns we are not going to use
salary=salary.set_index("TEAM")
salary=salary["FINAL CAP HIT"]

# The Golden Knights represent another important problem -- they didn't exist in the league
# in the 2016 season, so they didn't have salary cap information. This is going to be a
# problem when looking at their stats from the previous season too. I'm going to fill in their
# data as missing using the numpy NaN values, but it turns out we'll have to address this
# again later.
salary.loc['Vegas Golden Knights']= 0
salary.head()

TEAM
San Jose Sharks        73286670
Philadelphia Flyers    73546558
Calgary Flames         73824956
Arizona Coyotes        75286038
Anaheim Ducks          76957176
Name: FINAL CAP HIT, dtype: int64

In [96]:
salary

TEAM
San Jose Sharks          73286670
Philadelphia Flyers      73546558
Calgary Flames           73824956
Arizona Coyotes          75286038
Anaheim Ducks            76957176
Pittsburgh Penguins      77649912
Detroit Red Wings        78458260
Chicago Blackhawks       72989072
Washington Capitals      72967796
Toronto Maple Leafs      72821399
Minnesota Wild           72702142
Montréal Canadiens       72579037
Los Angeles Kings        72501361
Colorado Avalanche       72090269
Vancouver Canucks        71908430
St. Louis Blues          71891679
New York Islanders       71770356
New York Rangers         71752556
Buffalo Sabres           71677209
Tampa Bay Lightning      71604944
Columbus Blue Jackets    71408979
Dallas Stars             70101967
Boston Bruins            69972306
Nashville Predators      69429301
Ottawa Senators          69252320
Edmonton Oilers          68534176
Winnipeg Jets            66630635
Florida Panthers         64075585
New Jersey Devils        64006212
Carolina 

In [97]:
game_results = game_results[(game_results['home_team'] != 'Vegas Golden Knights') & (game_results['away_team'] != 'Vegas Golden Knights')].reset_index(drop=True)
game_results

Unnamed: 0,away,home,home_team,away_team,time,outcome_categorical
0,7,2,Winnipeg Jets,Toronto Maple Leafs,20171006_173713,away
1,5,4,Pittsburgh Penguins,St. Louis Blues,20171008_080732,away
2,0,3,Edmonton Oilers,Calgary Flames,20171008_080734,home
3,5,3,San Jose Sharks,Philadelphia Flyers,20171008_160734,away
4,3,4,Boston Bruins,Nashville Predators,20171009_153739,home
...,...,...,...,...,...,...
1183,2,4,Nashville Predators,Columbus Blue Jackets,20180409_142902,home
1184,2,5,Colorado Avalanche,St. Louis Blues,20180409_215908,home
1185,3,0,Arizona Coyotes,Anaheim Ducks,20180410_202907,away
1186,2,2,Edmonton Oilers,Vancouver Canucks,20180411_035911,away


In [98]:
# Great, we have two data sources down and ready for analysis, now we need to get some prior
# information about teams from the previous season. This will be useful for our model when
# we want to make early predictions and don't have the current season data.

# The NHL API has another great place to get standings for a whole season, so we'll use that
def team_standings(season="20162017"):
    '''Pull down the standings for teams in a single season.
    :param season: The season code (e.g. 20162017 for the 2016-2017 season)
    '''
    # Pull down the JSON data from the API directly
    game_url=f"https://statsapi.web.nhl.com/api/v1/standings?season={season}"
    http = urllib3.PoolManager()
    r = http.request('GET', game_url)
    data=json.loads(r.data)

    # In this case the JSON data has a record element for divisions and then lists the team 
    # records inside of that, so we need to do a nested iteration
    df_standings=pd.DataFrame()
    for record in data["records"]:
        for team_record in record["teamRecords"]:

            # We have to decide which standings we want to incorporate. Do we want just the
            # rank of the team from last season? The number of games they won? The number of
            # goals scored? This is where your knowledge of the sport can come in to add
            # context and value. I'm going to just include everything - for now - but this
            # is usually a poor choice in practice.

            # Since this is a JSON structure, and we want to turn it into a DataFrame, we can
            # use the handy json_normalize() function in pandas to "flatten" the JSON. And
            # we can just add that DataFrame to the bottom of our df_standings
            df_standings=df_standings.append(pd.json_normalize(team_record))
    return df_standings

# Commented out for Coursera, you can uncomment the code below if you are running locally.
# previous_season_standings=team_standings()

# Let's save this for offline use
# previous_season_standings.to_csv("assets/previous_season_standings.csv",index=False)
# previous_season_standings.head()

In [99]:
# If you are on coursera, you'll want to load the datafile
previous_season_standings=pd.read_csv("assets/previous_season_standings.csv")
previous_season_standings.head()

Unnamed: 0,goalsAgainst,goalsScored,points,divisionRank,divisionL10Rank,divisionRoadRank,divisionHomeRank,conferenceRank,conferenceL10Rank,conferenceRoadRank,...,team.id,team.name,team.link,leagueRecord.wins,leagueRecord.losses,leagueRecord.ot,leagueRecord.type,streak.streakType,streak.streakNumber,streak.streakCode
0,182,263,118,1,1,2,1,1,2,2,...,15,Washington Capitals,/api/v1/teams/15,55,19,8,league,losses,1,L1
1,234,282,111,2,5,4,2,2,10,8,...,5,Pittsburgh Penguins,/api/v1/teams/5,50,21,11,league,losses,2,L2
2,195,249,108,3,7,3,3,3,15,3,...,29,Columbus Blue Jackets,/api/v1/teams/29,50,24,8,league,wins,1,W1
3,220,256,102,4,6,1,7,5,12,1,...,3,New York Rangers,/api/v1/teams/3,48,28,6,league,wins,1,W1
4,242,241,94,5,2,5,6,9,3,10,...,2,New York Islanders,/api/v1/teams/2,41,29,12,league,wins,6,W6


In [100]:
# Ok, we have our three sources of data for features. First, we have an game by game breakdown
# of teams and scores for this season in game_results, and our target column (the one we want
# to predict) is outcome_categorical. We also have the salary information in the salary Series,
# and we have last year's data in previous_season_standings. What we are missing, however, is
# and cummulative knowledge about how the teams are performing in the season of interest. Our
# game_results dataframe only has who won and the game time, but it doesn't tell use what the
# stats are for each team thus far in the season. Of course, we would expect the stats for the
# team this season to have the highest predictive power for an upcoming game, so we need to build
# this cummulative DataFrame.

# Let's create a new DataFrame with won and lost columns, and initialize it with the teams in
# our game_results data and set the initial values to 0. We can incremend this as we gain new
# evidence of game performance
df_cum=pd.DataFrame()
df_cum.loc['won', list(game_results["home_team"].unique()) ]=0
df_cum.loc['lost', list(game_results["home_team"].unique()) ]=0

# I'm going to use a bit more advanced pandas here in the form of a multi-index on columns by
# calling unstack() and then adding a time row. This is just an entry for those default 0's
# and we'll get rid of it after we have built the cumulative DataFrame
df_cum=df_cum.unstack()
df_cum=pd.DataFrame(df_cum,columns=['time']).T
df_cum

Unnamed: 0_level_0,Winnipeg Jets,Winnipeg Jets,Pittsburgh Penguins,Pittsburgh Penguins,Edmonton Oilers,Edmonton Oilers,San Jose Sharks,San Jose Sharks,Boston Bruins,Boston Bruins,...,Dallas Stars,Dallas Stars,Colorado Avalanche,Colorado Avalanche,Arizona Coyotes,Arizona Coyotes,Philadelphia Flyers,Philadelphia Flyers,Minnesota Wild,Minnesota Wild
Unnamed: 0_level_1,won,lost,won,lost,won,lost,won,lost,won,lost,...,won,lost,won,lost,won,lost,won,lost,won,lost
time,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [101]:
# Now we just need to iterate through all of the results in our game_results and calculate
# the cummulative wins and losses as appropriate. Pandas provides a nice way to do this
# using the iterrows() function
for idx,row in game_results.iterrows():
    # Identifying the winner and loser is pretty easy - remember we got rid of ties!
    if row["away"]>row["home"]:
        winner=row["away_team"]
        loser=row["home_team"]
    elif row["away"]< row["home"]:
        winner=row["home_team"]
        loser=row["away_team"]

    # Now we just update the entry in our cumulative DataFrame. The syntax here might be
    # a bit surprising because we have a multi-index on columns.
    df_cum.loc[idx, (winner,"won")]=df_cum[(winner,"won")].max()+1
    df_cum.loc[idx, (loser,"lost")]=df_cum[(loser,"lost")].max()+1

# Let's see what we have
df_cum

Unnamed: 0_level_0,Winnipeg Jets,Winnipeg Jets,Pittsburgh Penguins,Pittsburgh Penguins,Edmonton Oilers,Edmonton Oilers,San Jose Sharks,San Jose Sharks,Boston Bruins,Boston Bruins,...,Dallas Stars,Dallas Stars,Colorado Avalanche,Colorado Avalanche,Arizona Coyotes,Arizona Coyotes,Philadelphia Flyers,Philadelphia Flyers,Minnesota Wild,Minnesota Wild
Unnamed: 0_level_1,won,lost,won,lost,won,lost,won,lost,won,lost,...,won,lost,won,lost,won,lost,won,lost,won,lost
time,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,,1.0,,,,,,,,,...,,,,,,,,,,
1,,,,1.0,,,,,,,...,,,,,,,,,,
2,,,,,1.0,,,,,,...,,,,,,,,,,
3,,,,,,,,1.0,,,...,,,,,,,1.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1183,,,,,,,,,,,...,,,,,,,,,,
1184,,,,,,,,,,,...,,,46.0,,,,,,,
1185,,,,,,,,,,,...,,,,,,44.0,,,,
1186,,,,,,,,,,,...,,,,,,45.0,,,,


In [102]:
# Great, now let's propogate our scores forward in time, e.g. everyone gets a 0 until they
# play their first game, and we can do this with the ffilna() function. We'll also drop that
# first row of data (which we called time) since it's no longer needed.
df_cum=df_cum.fillna(method='ffill').drop(index="time")
df_cum.head()

Unnamed: 0_level_0,Winnipeg Jets,Winnipeg Jets,Pittsburgh Penguins,Pittsburgh Penguins,Edmonton Oilers,Edmonton Oilers,San Jose Sharks,San Jose Sharks,Boston Bruins,Boston Bruins,...,Dallas Stars,Dallas Stars,Colorado Avalanche,Colorado Avalanche,Arizona Coyotes,Arizona Coyotes,Philadelphia Flyers,Philadelphia Flyers,Minnesota Wild,Minnesota Wild
Unnamed: 0_level_1,won,lost,won,lost,won,lost,won,lost,won,lost,...,won,lost,won,lost,won,lost,won,lost,won,lost
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [103]:
# Ok, I'm getting excited, we're almost at the good part! Now we just need to turn these
# three different data objects into a feature vector for prediction! Let's write another
# function, and we can have this function operate on a single row of game_results data,
# and pull from the other DataFrames to create a feature vector. 
def create_features(row):
    '''Operates on a single row of data from game_results, and interacts with global
    dataframes salary, previous_season_standings, and df_cum to generate a feature
    vector for that row.
    :param row: A single row in game_results
    :param return: A feature vector as a pandas Series object
    '''
    # Inside of this function let's store our features in a dictionary
    features={}

    # We can start by looking up the number of games the home and away teams have lost thus
    # far in the season
    features["away_won"]=df_cum.loc[row.name,(row["away_team"],"won")]
    features["away_lost"]=df_cum.loc[row.name,(row["away_team"],"lost")]
    features["home_won"]=df_cum.loc[row.name,(row["home_team"],"won")]
    features["home_lost"]=df_cum.loc[row.name,(row["home_team"],"lost")]

    # We have to adjust this to ensure that we're not leaking the results of this match!
    if row["outcome_categorical"]=="home":
        features["home_won"]=features["home_won"]-1
        features["away_lost"]=features["away_lost"]-1
    else:
        features["home_lost"]=features["home_lost"]-1
        features["away_won"]=features["away_won"]-1


    # Let's add in the salary cap information from last year
    features["away_cap"]=salary[row["away_team"]]
    features["home_cap"]=salary[row["home_team"]]

    # Let's get the previous season standings for each team too, and add an indicator
    # to each standing whether it was for the home or away team
    home_last_season=previous_season_standings.query(f"`team.name`=='{row['home_team']}'").add_prefix("home_last_season_")
    away_last_season=previous_season_standings.query(f"`team.name`=='{row['away_team']}'").add_prefix("away_last_season_")

    # Remember those Vegas Golden Knights? They didn't exist in the previous season, so
    # our code to convert the values to a dictionary won't work. We need to be robust to
    # this case, so let's just create an empty dictionary for teams which have no previous
    # season
    if len(home_last_season)>0:
        home_last_season=home_last_season.iloc[0].to_dict()
    else:
        home_last_season={}
    if len(away_last_season)>0:
        away_last_season=away_last_season.iloc[0].to_dict()
    else:
        away_last_season={}

    # Now we can leverage dictionary unpacking, returning all of the items including the
    # data from the game_results (which has our target variable) as a new Series
    return pd.Series({**features, **home_last_season, **away_last_season, **row})

# Let's generate these game results and put them into a new DataFrame called observations
observations=game_results.apply(create_features, axis='columns')
observations.head()

Unnamed: 0,away_won,away_lost,home_won,home_lost,away_cap,home_cap,home_last_season_goalsAgainst,home_last_season_goalsScored,home_last_season_points,home_last_season_divisionRank,...,away_last_season_leagueRecord.type,away_last_season_streak.streakType,away_last_season_streak.streakNumber,away_last_season_streak.streakCode,away,home,home_team,away_team,time,outcome_categorical
0,0.0,0.0,0.0,0.0,72821399,66630635,256,249,87,5,...,league,losses,1,L1,7,2,Winnipeg Jets,Toronto Maple Leafs,20171006_173713,away
1,0.0,0.0,0.0,0.0,71891679,77649912,234,282,111,2,...,league,wins,3,W3,5,4,Pittsburgh Penguins,St. Louis Blues,20171008_080732,away
2,0.0,0.0,0.0,0.0,73824956,68534176,212,247,103,2,...,league,losses,1,L1,0,3,Edmonton Oilers,Calgary Flames,20171008_080734,home
3,0.0,0.0,0.0,0.0,73546558,73286670,201,221,99,3,...,league,ot,1,OT1,5,3,San Jose Sharks,Philadelphia Flyers,20171008_160734,away
4,0.0,0.0,0.0,0.0,69429301,69972306,212,234,95,3,...,league,losses,1,L1,3,4,Boston Bruins,Nashville Predators,20171009_153739,home


In [104]:
# Ok, almost done with the data cleaning, now we have to go through and decide which
# columns we want to include in our model. There are a few obvious ones, for instance
# if we include the away and home scores in our model we should get a perfect
# prediction since our outcome target is completely based on this information. So lets
# get rid of those.
observations=observations.drop(["away","home"],axis='columns')

# We're also going to get rid of team name too
observations=observations.drop(["away_team","home_team"],axis='columns')

# In this example I'm just going to do a simple logistic regression, so some of the
# non-numeric data like "clinchIndicator" can't be used without converting this to dummy
# variables. Converting to dummy, or indicator, features is a reasonable choice, but
# I'm going to aim for simplicty for this demonstration instead, and get rid of these
# columns too
observations=observations.drop(['away_last_season_clinchIndicator','away_last_season_lastUpdated',
    'away_last_season_leagueRecord.type','away_last_season_streak.streakCode',
    'away_last_season_streak.streakType','away_last_season_team.link','away_last_season_team.name',
    'home_last_season_clinchIndicator','home_last_season_lastUpdated',
    'home_last_season_leagueRecord.type','home_last_season_streak.streakCode',
    'home_last_season_streak.streakType','home_last_season_team.link','home_last_season_team.name'],
    axis='columns')

# I made some pretty arbitrary and questionable choices here. For instance, I got rid of the
# semantic information about streaks (e.g. whether they were winning or losing streaks), but I
# left the numeric value for the length of the streak! This is not meaningful information
# for our model anymore, as a team which had a great 5 game winning streak last year and one
# which had a horrible 5 game losing streak will look the same.

# In the end, the best model will be one which has clean and thoughtful data coming into it
# where the data is indicative of future data, and we'll talk a bit about the principle of
# parsimony in a future lecture.

# One last bit of cleaning - let's get rid of the time column, it's just noise which we already
# have captured in the index and won't be useful for our modeling approach
observations=observations.drop("time",axis="columns")
observations

Unnamed: 0,away_won,away_lost,home_won,home_lost,away_cap,home_cap,home_last_season_goalsAgainst,home_last_season_goalsScored,home_last_season_points,home_last_season_divisionRank,...,away_last_season_pointsPercentage,away_last_season_ppDivisionRank,away_last_season_ppConferenceRank,away_last_season_ppLeagueRank,away_last_season_team.id,away_last_season_leagueRecord.wins,away_last_season_leagueRecord.losses,away_last_season_leagueRecord.ot,away_last_season_streak.streakNumber,outcome_categorical
0,0.0,0.0,0.0,0.0,72821399,66630635,256,249,87,5,...,0.579268,4,8,14,10,40,27,15,1,away
1,0.0,0.0,0.0,0.0,71891679,77649912,234,282,111,2,...,0.603659,3,6,11,19,46,29,7,3,away
2,0.0,0.0,0.0,0.0,73824956,68534176,212,247,103,2,...,0.573171,4,8,17,20,45,33,4,1,home
3,0.0,0.0,0.0,0.0,73546558,73286670,201,221,99,3,...,0.536585,6,11,19,4,39,33,10,1,away
4,0.0,0.0,0.0,0.0,69429301,69972306,212,234,95,3,...,0.573171,4,7,15,18,41,29,12,1,home
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1183,43.0,33.0,47.0,23.0,71408979,69429301,224,240,94,4,...,0.658537,3,3,4,29,50,24,8,1,home
1184,44.0,39.0,45.0,39.0,71891679,72090269,278,166,48,7,...,0.603659,3,6,11,19,46,29,7,3,home
1185,43.0,30.0,30.0,43.0,76957176,75286038,260,197,70,6,...,0.640244,1,3,6,24,46,23,13,4,away
1186,33.0,47.0,31.0,46.0,71908430,68534176,212,247,103,2,...,0.420732,7,13,29,23,30,43,9,8,away


In [105]:
# With our observations cleaned we now face another choice: what to do with 
# missing values. Missing values are a signal of their own, but most machine learning
# techniques want an explicit indication of what missing means. Strategies for numeric
# values, which we have here, are usually aggregation functions based on the rest
# of the data we have. For the Vegas Golden Knights, for instance, what should
# we be expecting their performance from the last year was like? The average performance
# of other teams? Worse than the bottom performing team, since the Golden Knights are
# new? Again, this is a place you need to bring your understanding of the domain to
# the process and set reasonable values. In this case, I'm going to fill all missing
# values with the mean() value from other observations.

# But, a moment of caution! Remember that we are working through the whole machine
# learning pipeline here. So we are planning to build a model on some training data, and
# then determine how well it works on some held out test data. We want to split up
# our sets before we start replacing imputed data, so that we don't "leak" information
# from our test set.

# Let's first make sure all of our columns are numeric
for col in observations.columns:
    if col != 'outcome_categorical':
        observations[col]=pd.to_numeric(observations[col])

# And let's save this list of observations for use in the future
observations.to_csv("assets/observations.csv")

# Let's put the first 800 observations in our training data.
training_df=observations[0:799]
testing_df=observations[800:]

# And now lets impute the missing data for each set independently
training_df=training_df.fillna(training_df.mean())
testing_df=testing_df.fillna(testing_df.mean())

In [106]:
# Now let's go on to building a logistic model. You've seen this before, a regression technique
# applied to categorical data, and we're going to use the sklearn LogisticRegression
# class to build our model
from sklearn.linear_model import LogisticRegression

# Building the classifier is straight forward, we just create a new instance of the LogisticRegression
# class then call the fit() method passing in our features we wish to train on and our labels
# which we want to predict.

# For this first model we'll pass in all of the observations and all columns except for the 
# target which is the outcome_categorical column.
features=training_df.drop('outcome_categorical', axis='columns')
target=training_df['outcome_categorical']

clf=LogisticRegression()
reg=clf.fit(features,target)

# Now let's print out the R squared value of this model on the same data
reg.score(features, target)

0.49937421777221525

In [107]:
# So, that's a pretty bad model. Let's see how well it works on our test data.
# Specifically, let's take a look at the accuracy, or the number of correct 
# predictions we can make. We can import an accuracy_score helper function 
# from sklearn.metrics
from sklearn.metrics import accuracy_score

# Now form a variable which has the correct labels and one which has the predictions
labels=testing_df['outcome_categorical']
predictions=reg.predict(testing_df.drop('outcome_categorical', axis='columns'))

# And let's take a look at our results
print( f"score {accuracy_score(labels,predictions)}")

score 0.4536082474226804


In [85]:
testing_df

Unnamed: 0,away_cap,away_last_season_conferenceHomeRank,away_last_season_conferenceL10Rank,away_last_season_conferenceRank,away_last_season_conferenceRoadRank,away_last_season_divisionHomeRank,away_last_season_divisionL10Rank,away_last_season_divisionRank,away_last_season_divisionRoadRank,away_last_season_gamesPlayed,...,home_last_season_ppConferenceRank,home_last_season_ppDivisionRank,home_last_season_ppLeagueRank,home_last_season_row,home_last_season_streak.streakNumber,home_last_season_team.id,home_last_season_wildCardRank,home_lost,home_won,outcome_categorical
20180207_212103,72702142,4.000000,5.000000,2.000000,2.000000,2.000000,3.000000,2.000000,2.000000,82.0,...,11.0,6.0,24.0,33.0,1.0,25.0,5.0,22.0,27.0,home
20180207_084101,72989072,2.000000,9.000000,1.000000,1.000000,1.000000,6.000000,1.000000,1.000000,82.0,...,8.0,4.0,17.0,41.0,1.0,20.0,1.0,21.0,26.0,home
20180207_222103,71604944,9.000000,1.000000,10.000000,9.000000,3.000000,1.000000,5.000000,5.000000,82.0,...,13.0,7.0,29.0,26.0,8.0,23.0,7.0,29.0,22.0,away
20180207_181101,75286038,13.000000,10.000000,12.000000,12.000000,7.000000,4.000000,6.000000,6.000000,82.0,...,10.0,5.0,22.0,37.0,1.0,26.0,4.0,28.0,26.0,home
20180208_215102,0,8.052632,8.085526,7.945175,7.890351,4.263158,4.287281,4.212719,4.182018,82.0,...,1.0,1.0,1.0,53.0,1.0,15.0,0.0,21.0,32.0,away
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20180409_215908,71891679,7.000000,4.000000,5.000000,4.000000,4.000000,2.000000,3.000000,3.000000,82.0,...,14.0,7.0,30.0,21.0,1.0,21.0,8.0,41.0,45.0,home
20180410_202907,76957176,1.000000,1.000000,3.000000,7.000000,1.000000,1.000000,1.000000,4.000000,82.0,...,12.0,6.0,27.0,24.0,1.0,53.0,6.0,47.0,31.0,away
20180410_145907,0,8.052632,8.085526,7.945175,7.890351,4.263158,4.287281,4.212719,4.182018,82.0,...,8.0,4.0,17.0,41.0,1.0,20.0,1.0,42.0,39.0,home
20180411_035911,71908430,12.000000,14.000000,13.000000,13.000000,6.000000,7.000000,7.000000,7.000000,82.0,...,4.0,2.0,7.0,43.0,3.0,22.0,0.0,47.0,34.0,away
