In [1]:
# Import the necessary libraries

import numpy as np
import csv

In [2]:
# Construct a mapping between different data formats. Not clean, but works!

# Every spelling of a team name seen in the data files, grouped by the
# three-character team code it maps to. Order matters only for readability.
_TEAM_ALIASES = (
    ("ANA", ("Anaheim", "Anaheim Ducks")),
    ("ARI", ("Arizona", "Phoenix", "Phoenix Coyotes")),
    ("BOS", ("Boston", "Boston Bruins")),
    ("BUF", ("Buffalo", "Buffalo Sabres")),
    ("CAR", ("Carolina", "Carolina Hurricanes")),
    ("CBJ", ("Columbus", "Columbus Blue Jackets")),
    ("CGY", ("Calgary", "Calgary Flames")),
    ("CHI", ("Chicago", "Chicago Blackhawks")),
    ("COL", ("Colorado", "Colorado Avalanche")),
    ("DAL", ("Dallas", "Dallas Stars")),
    ("DET", ("Detroit", "Detroit Red Wings")),
    ("EDM", ("Edmonton", "Edmonton Oilers")),
    ("FLA", ("Florida", "Florida Panthers")),
    ("L.A", ("LosAngeles", "Los Angeles", "Los Angeles Kings")),
    ("MIN", ("Minnesota", "Minnesota Wild")),
    ("MTL", ("Montreal", "Montreal Canadiens")),
    ("N.J", ("NewJersey", "New Jersey", "New Jersey Devils")),
    ("NSH", ("Nashville", "Nashville Predators")),
    ("NYI", ("NYIslanders", "NY Islanders", "New York Islanders")),
    ("NYR", ("NYRangers", "NY Rangers", "New York Rangers")),
    ("OTT", ("Ottawa", "Ottawa Senators")),
    ("PHI", ("Philadelphia", "Philadelphia Flyers")),
    ("PIT", ("Pittsburgh", "Pittsburgh Penguins")),
    ("S.J", ("SanJose", "San Jose", "San Jose Sharks")),
    ("STL", ("St.Louis", "St. Louis", "St. Louis Blues")),
    ("T.B", ("TampaBay", "Tampa Bay", "Tampa Bay Lightning")),
    ("TOR", ("Toronto", "Toronto Maple Leafs")),
    ("VAN", ("Vancouver", "Vancouver Canucks")),
    ("WPG", ("Winnipeg", "Winnipeg Jets")),
    ("WSH", ("Washington", "Washington Capitals")),
)

# Flat lookup: any known spelling -> team code.
name_cities_mapping = {
    alias: abbreviation
    for abbreviation, aliases in _TEAM_ALIASES
    for alias in aliases
}

In [3]:
# Load the features from the csv file. 
# Simple features found in team_stats.csv
# Advanced features found in team_stats_2017-12-03.csv and team_stats_2017-12-04.csv
# Difference between 03 and 04 (ignore the date) is that 03 is 5on5 games, and 04 is all games.
features = np.genfromtxt('team_stats_2017-12-03.csv', delimiter=',', skip_header=1, dtype=("|S10", "|S10", float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float ) )

In [4]:
# Utility to convert dates between formats used in the various data files.
def convert_date(date):
    """Turn a 'YYYY-MM-DD' string into the integer YYYYMMDD.

    The string is sliced by position, so the year must occupy characters 0-3,
    the month characters 5-6 and the day everything from character 8 on.
    """
    year = int(date[0:4])
    month = int(date[5:7])
    day = int(date[8:])
    return year * 10000 + month * 100 + day

In [5]:
# Find the index of the season that a given date falls in (-1 if it is after every season boundary).
def find_season(seasons, date):
    """Return the index of the first entry in `seasons` that is greater than `date`.

    `seasons` holds one exclusive upper bound per season. If `date` is not
    below any of them, -1 is returned.
    """
    matches = (idx for idx, season_end in enumerate(seasons) if date < season_end)
    return next(matches, -1)

In [6]:
# Get the name of each feature from the file (first line).
# The whole file is parsed as strings with the header kept (skip_header=0), and
# only row 0 - the header - is retained. The names are stored exactly as they
# appear in the file; the listing printed when the keep-lists are built shows
# them still wrapped in double quotes.
feature_names = np.genfromtxt('team_stats_2017-12-03.csv', dtype='str', delimiter=',', skip_header=0)[0]

In [7]:
# Advanced Feature List (Overall Play)
# Advanced Feature List (Overall Play): every column past the first four whose
# header does not contain a '%' sign.
adv_keep = [
    col for col in range(feature_names.shape[0])
    if col > 3 and '%' not in feature_names[col]
]

# Show which columns were selected.
for col in adv_keep:
    print(col, feature_names[col])

# Simple Feature List (Just Shots)
simple_keep = [
    4, 5, 6, 8, 9,
    10, 11, 12, 14, 15,
    16, 17, 18, 20, 21,
    28, 29, 30, 32, 33,
]

4 "CF"
5 "CA"
6 "C+/-"
8 "CF/60"
9 "CA/60"
10 "GF"
11 "GA"
12 "G+/-"
14 "GF/60"
15 "GA/60"
16 "xGF"
17 "xGA"
18 "xG+/-"
20 "xGF/60"
21 "xGA/60"
22 "PENT"
23 "PEND"
24 "P+/-"
27 "PDO"
28 "FF"
29 "FA"
30 "F+/-"
32 "FF/60"
33 "FA/60"
34 "SF"
35 "SA"
36 "S+/-"
38 "SF/60"
39 "SA/60"
40 "PENT/60"
41 "PEND/60"
48 "xPDO"
49 "dPDO"
50 "OZS"
51 "DZS"
52 "NZS"
56 "ZSR"
58 "FOW"
59 "FOL"
60 "GVA"
61 "TKA"
62 "GVA/60"
63 "TKA/60"
64 "HF"
65 "HA"
66 "H+/-"
67 "HF/60"
68 "HA/60"


In [8]:
# Load the data into a "mapping" array (part 1 of creating the feature set)

season_names = ["12-13", "13-14", "14-15", "15-16", "16-17"]
# Exclusive upper date bound (YYYYMMDD) of each season, parallel to season_names.
seasons = [20130501, 20140501, 20150501, 20160501, 20170501]

# Layout of both mappings: Season -> Team -> entry, where entry holds
#   'total' : element-wise sum of the kept stats over the games seen so far
#   'count' : number of games seen so far
#   <date>  : [games seen before <date>, average of the kept stats before <date>]
# A team's first game of a season gets no <date> key because there is nothing
# to average yet.
mapping_adv = {season_name: {} for season_name in season_names}
mapping_simp = {season_name: {} for season_name in season_names}


def _record_game(team_entries, team, date, stats):
    """Store the running average before `date`, then fold `stats` into the totals."""
    if team not in team_entries:
        team_entries[team] = {'total': np.zeros(len(stats)), 'count': 0}
    entry = team_entries[team]

    # Snapshot the average *before* adding this game, so that the value stored
    # under a date never includes that date's own game.
    if entry['count'] != 0:
        entry[date] = [entry['count'], entry['total'] / entry['count']]

    entry['count'] += 1
    entry['total'] += stats


# NOTE(review): the running averages are only "up to date" if the rows of
# `features` are in chronological order for each team - confirm against the file.
skipped_rows = 0
for i in range(features.shape[0]):
    row = features[i]
    name = str(row[0], 'utf-8').replace('"', '').strip()
    date = convert_date(str(row[1], 'utf-8').strip())

    season = find_season(seasons, date)
    if season < 0:
        # find_season returns -1 for dates on or after the last boundary.
        # Indexing season_names with -1 would silently file such rows under
        # the final season, so they are skipped instead.
        skipped_rows += 1
        continue
    season_name = season_names[season]

    # Convert the record once; both feature sets are slices of the same row.
    row_values = np.asarray(list(row))
    adv_stats = row_values[adv_keep].astype(float)
    simple_stats = row_values[simple_keep].astype(float)

    _record_game(mapping_adv[season_name], name, date, adv_stats)
    _record_game(mapping_simp[season_name], name, date, simple_stats)

if skipped_rows:
    print("Skipped", skipped_rows, "rows dated on or after", seasons[-1])
    
    

In [9]:
# Convert the format of dates used in the odds file.

def convert_odds_date(entry, year):
    """Convert an odds-file date entry into the integer YYYYMMDD.

    Parameters
    ----------
    entry : str
        Date without a year. A four-character entry is read as MMDD and is
        placed in `year`; any other length is read as MDD (single-digit month)
        and is placed in the following calendar year.
    year : int or str
        Calendar year in which the season starts.

    Returns
    -------
    int
        The date encoded as YYYYMMDD.
    """
    # Surrounding whitespace would otherwise change the length test below.
    entry = str(entry).strip()
    # Convert once so that both branches accept a string year; the original
    # `int(year+1)` raised TypeError for strings while `int(year)` did not.
    start_year = int(year)

    if len(entry) == 4:
        return start_year * 10000 + int(entry[0:2]) * 100 + int(entry[2:])
    return (start_year + 1) * 10000 + int(entry[0]) * 100 + int(entry[1:])

In [10]:
# Was used when trying out different values of hyperparameter "K"
# which relates to the last number of games that are averaged (instead of
# looking at the entire season). Varying K doesn't impact performance, so
# this function is no longer used.

# DEPRECATED
def get_last_N_games_average(games, season, team, current, N, lower_bound):
    """Deprecated stub: print a warning and return None.

    This function used to average the stats of a team's last `N` games before
    the date `current` (giving up once the search passed `lower_bound`). The
    hyperparameter turned out not to affect performance, so the computation
    was disabled. The old body sat after the `return` and could never run (it
    also referenced an undefined name), so it has been removed.

    All parameters are accepted only to keep the signature stable for any
    remaining callers; none of them is used.
    """
    print("Warning: get_last_N_games_average is deprecated")
    return None # keep (deprecated)

# May 1st (YYYYMMDD) of 2012 through 2016, one lower bound per season.
lower_bounds = [start_year * 10000 + 501 for start_year in range(2012, 2017)]

In [12]:
# Reading the odds file, keep track of wins/losses for teams up to that point

def load_wins_losses(txtfile, year, mapping, season):
    """Append each team's pre-game win percentage to its stored feature vector.

    The odds file is read as CSV: one header row, then two rows per game
    (visitor first, home second). Column 0 is the date entry, column 3 the
    team name and column 7 the final score.

    For every game where both teams already have an entry for the game date in
    `mapping[season]`, the team's win percentage over the games counted so far
    (0 for the first counted game) is appended to the feature vector stored at
    `mapping[season][team][date][1]`. The team's 'wins'/'losses' counters are
    then updated with the result of that game. Games that are skipped do not
    update the counters.

    Parameters
    ----------
    txtfile : str
        Path of the odds CSV file.
    year : int or str
        Calendar year in which the season starts (passed to convert_odds_date).
    mapping : dict
        Season -> Team -> entry dictionary; modified in place.
    season : str
        Key of the season inside `mapping`.

    Notes
    -----
    The function is not idempotent: calling it twice for the same mapping and
    season appends a second win-percentage column.
    """
    with open(txtfile, 'r') as f:
        rows = list(csv.reader(f.read().splitlines()))
    rows = rows[1:]  # drop the header row

    teams = mapping[season]

    # Stop at len(rows) - 1 so that a trailing unpaired row cannot trigger an
    # IndexError when the second row of the pair is read.
    for i in range(0, len(rows) - 1, 2):
        visitor_row = rows[i]
        home_row = rows[i + 1]

        # Games without a final score are ignored.
        if visitor_row[7].strip() == '' or home_row[7].strip() == '':
            continue

        date = convert_odds_date(visitor_row[0], year)
        team1 = name_cities_mapping[visitor_row[3].strip()] # Visitor
        team2 = name_cities_mapping[home_row[3].strip()] # Home
        score1 = int(visitor_row[7])
        score2 = int(home_row[7])

        # Both teams need a stored running average for this date.
        if date not in teams[team2] or date not in teams[team1]:
            continue

        # Record the win percentage *before* this game for visitor, then home.
        for team in (team1, team2):
            record = teams[team]
            if 'wins' not in record:
                record['wins'] = 0
                record['losses'] = 0
            played = record['wins'] + record['losses']
            pct = record['wins'] * 1. / played if played else 0
            record[date][1] = np.append(record[date][1], pct)

        # The home team wins only on a strictly higher score.
        if score2 > score1:
            teams[team2]['wins'] += 1
            teams[team1]['losses'] += 1
        else:
            teams[team1]['wins'] += 1
            teams[team2]['losses'] += 1
    
# (odds file, calendar year the season starts in), in the order of season_names.
_wins_losses_files = [
    ("nhl_odds_2012-13.csv", 2012),
    ("nhl_odds_2013-14.csv", 2013),
    ("nhl_odds_2014-15.csv", 2014),
    ("nhl_odds_2015-16.csv", 2015),
    ("nhl_odds_2016-17.csv", 2016),
]

# Advanced mapping first, then the simple one - every season for each.
for _target_mapping in (mapping_adv, mapping_simp):
    for _season_idx, (_odds_path, _start_year) in enumerate(_wins_losses_files):
        load_wins_losses(_odds_path, _start_year, _target_mapping, season_names[_season_idx])

In [14]:
# Construct the feature set. Initially, we tried out various types of feature sets (code is still here, just commented).
# Those included looking at the last 5 games, the last 3, the last 10, the last 20, the last 40,
# just those games, or a combination of those with the overall season.
# We also tried representing the features as a difference between two teams or just in vector form.

# Vector form gave the model more flexibility/was better on the dev set, so we stuck with that.

# Feature rows, one per game: home-team vector followed by visitor-team vector.
simple_features, advanced_features = [], []

# Abandoned feature-set variants (no longer built):
#   simple_features_diff
#   features_justlast5_diff, features_justlast5_vec
#   features_alsolast5_diff, features_alsolast5_vec
#   features_alsolast3_vec, features_alsolast10_vec, features_alsolast20_vec
#   features_alsolast30_vec, features_alsolast40_vec

# Per-game outputs aligned with simple_features:
#   labels_single - 1 when the home team scored more, else 0
#   labels_multi  - home score minus visitor score
#   N             - [home, visitor] games counted before the game date
labels_single_simp, labels_multi_simp, N_simp = [], [], []

# Same three lists, aligned with advanced_features.
labels_single_adv, labels_multi_adv, N_adv = [], [], []

def load_odds_file(txtfile, year, mapping, season, lower_bound, labelss, labelsm, Narr, featuresarr):
    """Build one feature row and its labels for every usable game in an odds file.

    The odds file is read as CSV: one header row, then two rows per game
    (visitor first, home second). Column 0 is the date entry, column 3 the
    team name and column 7 the final score. A game is used only when both
    scores are present and both teams have an entry for the game date in
    `mapping[season]`.

    Parameters
    ----------
    txtfile : str
        Path of the odds CSV file.
    year : int or str
        Calendar year in which the season starts (passed to convert_odds_date).
    mapping : dict
        Season -> Team -> Date -> [games counted, feature vector].
    season : str
        Key of the season inside `mapping`.
    lower_bound : int
        Unused. It was only needed by the abandoned "last N games" feature
        sets and is kept so existing calls keep working.
    labelss : list
        Receives 1 when the home team scored more than the visitor, else 0.
    labelsm : list
        Receives the home score minus the visitor score.
    Narr : list
        Receives [home games counted, visitor games counted] for the game date.
    featuresarr : list
        Receives the home feature vector followed by the visitor feature vector.

    All four output lists are appended to in place; nothing is returned.
    """
    with open(txtfile, 'r') as f:
        rows = list(csv.reader(f.read().splitlines()))
    rows = rows[1:]  # drop the header row

    teams = mapping[season]

    # Stop at len(rows) - 1 so that a trailing unpaired row cannot trigger an
    # IndexError when the second row of the pair is read.
    for i in range(0, len(rows) - 1, 2):
        visitor_row = rows[i]
        home_row = rows[i + 1]

        # Games without a final score are ignored.
        if visitor_row[7].strip() == '' or home_row[7].strip() == '':
            continue

        date = convert_odds_date(visitor_row[0], year)
        team1 = name_cities_mapping[visitor_row[3].strip()] # Visitor
        team2 = name_cities_mapping[home_row[3].strip()] # Home
        score1 = int(visitor_row[7])
        score2 = int(home_row[7])

        # Both teams need a stored running average for this date.
        if date not in teams[team2] or date not in teams[team1]:
            continue

        home_count, home_features = teams[team2][date][0], teams[team2][date][1]
        visitor_count, visitor_features = teams[team1][date][0], teams[team1][date][1]

        # Vector form: home features first, visitor features second.
        vec = []
        vec.extend(home_features)
        vec.extend(visitor_features)
        featuresarr.append(vec)

        Narr.append([home_count, visitor_count])
        labelss.append(1 if score2 > score1 else 0)
        labelsm.append(score2 - score1)
            
# (odds file, calendar year the season starts in), in the order of season_names.
_odds_files = [
    ("nhl_odds_2012-13.csv", 2012),
    ("nhl_odds_2013-14.csv", 2013),
    ("nhl_odds_2014-15.csv", 2014),
    ("nhl_odds_2015-16.csv", 2015),
    ("nhl_odds_2016-17.csv", 2016),
]

# Simple feature set first, then the advanced one - every season for each.
_feature_sets = (
    (mapping_simp, labels_single_simp, labels_multi_simp, N_simp, simple_features),
    (mapping_adv, labels_single_adv, labels_multi_adv, N_adv, advanced_features),
)

for _src_mapping, _single, _multi, _counts, _rows in _feature_sets:
    for _season_idx, (_odds_path, _start_year) in enumerate(_odds_files):
        load_odds_file(
            _odds_path,
            _start_year,
            _src_mapping,
            season_names[_season_idx],
            lower_bounds[_season_idx],
            _single,
            _multi,
            _counts,
            _rows,
        )

In [16]:
# Import models from sklearn.

import numpy as np
from sklearn import decomposition
from sklearn.preprocessing import StandardScaler
import sklearn.linear_model as lm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score as cv
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.mixture import GaussianMixture
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.neural_network import MLPClassifier

In [17]:
# Used when varying the hyperparameter "N".

def get_mod_features_labels(cutoff, features, labels, Ns):
    """Keep only the games where both entries of `Ns[i]` reach `cutoff`.

    A game at position i is dropped when either `Ns[i][0]` or `Ns[i][1]` is
    below `cutoff`. Returns two new lists, the surviving features and their
    labels, in the original order.
    """
    kept = [
        idx for idx in range(len(features))
        if not (Ns[idx][0] < cutoff or Ns[idx][1] < cutoff)
    ]
    fts = [features[idx] for idx in kept]
    lbls = [labels[idx] for idx in kept]
    return fts, lbls

In [18]:
# Used previously for PCA. PCA didn't increase the accuracy
# and removed our ability to understand the features intuitively
# so we decided not to use it. Might be worth exploring in the future
# if the curse of dimensionality is an issue.

def PCA(clf, arr, labels):
    """Try every PCA dimensionality and report the best cross-validated score.

    The features are standardised once, then projected onto 1, 2, ...,
    len(arr[0]) principal components. For each projection `clf` is scored with
    10-fold cross-validation and the mean score is printed next to the number
    of components.

    Returns a tuple (best mean score, number of components that achieved it);
    (0, 0) if no projection scored above 0.
    """
    standardized = StandardScaler().fit_transform(arr)
    n_features = len(arr[0])

    best_avg, best_n = 0, 0
    for n_components in range(1, n_features + 1):
        projected = decomposition.PCA(n_components = n_components).fit_transform(standardized)
        fold_scores = cv(clf, projected, labels, cv=10)
        avg = sum(fold_scores)/len(fold_scores)
        print( n_components, avg )
        if avg > best_avg:
            best_avg, best_n = avg, n_components
    return (best_avg, best_n)

In [19]:
# Used for debugging

def get_playoff_feature(season, team, mapping, start):
    """Return the feature vector stored for the latest date on or before `start`.

    Parameters
    ----------
    season, team
        Keys selecting `mapping[season][team]`.
    mapping : dict
        Season -> Team -> entry dictionary whose date keys map to
        [games counted, feature vector].
    start : int
        Latest acceptable date (YYYYMMDD).

    Returns
    -------
    The second element of the entry stored under the chosen date.

    Raises
    ------
    KeyError
        If the team has no date key on or before `start`. The previous
        implementation decremented `start` forever in that case.
    """
    team_entries = mapping[season][team]
    # Date keys are numeric; the bookkeeping keys ('total', 'count', ...) are
    # strings and must not take part in the comparison.
    earlier_dates = [
        key for key in team_entries
        if not isinstance(key, str) and key <= start
    ]
    if not earlier_dates:
        raise KeyError(
            "no stored date on or before %s for team %s in season %s" % (start, team, season)
        )
    return team_entries[max(earlier_dates)][1]

#print(get_playoff_feature(season_names[0], "ANA", mapping, seasons[0]))

In [22]:
# Logistic Regression - Simple Features

fts, lbls = get_mod_features_labels(0, simple_features, labels_single_simp, N_simp)

logreg = lm.LogisticRegression()
logreg.fit(fts, lbls)

# "Train" is accuracy on the fitting data; "Test" is the mean 10-fold CV accuracy.
train_acc = logreg.score(fts, lbls)
cv_acc = np.mean(cv(logreg, fts, lbls, cv=10))
print("LR", "N", 0, "Train", train_acc, "Test", cv_acc)

LR N 0 Train 0.592599277978 Test 0.578518722272


In [20]:
# Logistic Regression - Adv Features

fts, lbls = get_mod_features_labels(0, advanced_features, labels_single_adv, N_adv)

logreg = lm.LogisticRegression()
logreg.fit(fts, lbls)

# "Train" is accuracy on the fitting data; "Test" is the mean 10-fold CV accuracy.
train_acc = logreg.score(fts, lbls)
cv_acc = np.mean(cv(logreg, fts, lbls, cv=10))
print("LR", "N", 0, "Train", train_acc, "Test", cv_acc)

LR N 0 Train 0.579241877256 Test 0.573831110473


In [113]:
# Logistic Regression - Adv - Feature Analysis

fts, lbls = get_mod_features_labels(0, advanced_features, labels_single_adv, N_adv)

# Width of one feature row (home vector + visitor vector).
print(len(advanced_features[0]))

logreg = lm.LogisticRegression()
logreg.fit(fts, lbls)

# "Train" is accuracy on the fitting data; "Test" is the mean 10-fold CV accuracy.
train_acc = logreg.score(fts, lbls)
cv_acc = np.mean(cv(logreg, fts, lbls, cv=10))
print("LR", "N", 0, "Train", train_acc, "Test", cv_acc)

# Learned weights, one per feature column, and the bias term.
print(logreg.coef_, logreg.intercept_)

42
LR N 0 Train 0.578880866426 Test 0.572024747147
[[ 0.01458429 -0.00319405  0.01777834 -0.00214508  0.00222488  0.02248043
  -0.1052488   0.12772923  0.08738838 -0.05980569 -0.03916719 -0.0060844
  -0.03010745 -0.04210995  0.01588167  0.01511821 -0.03830144  0.04947413
  -0.02174842  0.03038766 -0.11160686  0.09915301  0.08572148  0.01343153
  -0.13889024 -0.05335984 -0.03630351  0.03938682 -0.07569033 -0.07192053
  -0.00516776  0.01842421 -0.00248705  0.01563002  0.01808529 -0.01109703
  -0.01051419 -0.2715411   0.298647   -0.19092727  0.16976221 -0.27068811]] [ 0.01484725]


In [24]:
# SVM - Simp Features

Cs = [0.01, 0.1, 0.5, 1]
kernels = ["rbf", "linear"]
fts, lbls = get_mod_features_labels(0, simple_features, labels_single_simp, N_simp)

# Every (C, kernel) combination, C varying slowest.
svm_grid = [(c, kernel) for c in Cs for kernel in kernels]

for c, kernel in svm_grid:
    svmo = svm.SVC(gamma=0.001, C=c, kernel=kernel)
    svmo.fit(fts, lbls)
    train_acc = svmo.score(fts, lbls)
    cv_acc = np.mean(cv(svmo, fts, lbls, cv=10))
    print("SVM", "C", c, "Kernel", kernel, "Train", train_acc, "Test", cv_acc)
        

SVM C 0.01 Kernel rbf Train 0.545667870036 Test 0.54566796002
SVM C 0.01 Kernel linear Train 0.586281588448 Test 0.577256485306
SVM C 0.1 Kernel rbf Train 0.579963898917 Test 0.571489094542
SVM C 0.1 Kernel linear Train 0.586101083032 Test 0.574908608088
SVM C 0.5 Kernel rbf Train 0.597292418773 Test 0.576719530584
SVM C 0.5 Kernel linear Train 0.587725631769 Test 0.57328373294
SVM C 1 Kernel rbf Train 0.612635379061 Test 0.576359167871
SVM C 1 Kernel linear Train 0.587003610108 Test 0.574367744665


In [25]:
# SVM - Adv Features

Cs = [0.01, 0.1, 0.5, 1]
kernels = ["rbf", "linear"]
fts, lbls = get_mod_features_labels(0, advanced_features, labels_single_adv, N_adv)

# Every (C, kernel) combination, C varying slowest.
svm_grid = [(c, kernel) for c in Cs for kernel in kernels]

for c, kernel in svm_grid:
    svmo = svm.SVC(gamma=0.001, C=c, kernel=kernel)
    svmo.fit(fts, lbls)
    train_acc = svmo.score(fts, lbls)
    cv_acc = np.mean(cv(svmo, fts, lbls, cv=10))
    print("SVM", "C", c, "Kernel", kernel, "Train", train_acc, "Test", cv_acc)
        

SVM C 0.01 Kernel rbf Train 0.545667870036 Test 0.54566796002
SVM C 0.01 Kernel linear Train 0.563176895307 Test 0.560823603779
SVM C 0.1 Kernel rbf Train 0.56155234657 Test 0.556497009867
SVM C 0.1 Kernel linear Train 0.571119133574 Test 0.563351991117
SVM C 0.5 Kernel rbf Train 0.570577617329 Test 0.563000109215
SVM C 0.5 Kernel linear Train 0.571119133574 Test 0.565701827979
SVM C 1 Kernel rbf Train 0.578700361011 Test 0.565343422558
SVM C 1 Kernel linear Train 0.574368231047 Test 0.565520669742


In [65]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [63]:
# Decision Tree - Simp
# Fit an unconstrained decision tree on the simple features and report the
# score on the fitted data ("Train") next to the mean of cv(..., cv=10) ("Test").

fts, lbls = get_mod_features_labels(0, simple_features, labels_single_simp, N_simp)

# Seeded so the tree (and therefore the printed scores) is reproducible between
# runs; random_state=1 mirrors the seed used for the MLP models in this notebook.
tree = DecisionTreeClassifier(random_state=1)
tree.fit(fts, lbls)
# The line is tagged "DT"; it was previously tagged "LR", which made it
# indistinguishable from logistic-regression results.
print ("DT", "N", 0, "Train", tree.score(fts, lbls), "Test", np.mean(cv(tree, fts, lbls, cv=10)))

LR N 0 Train 1.0 Test 0.516614357354


In [64]:
# Decision Tree - Adv
# Fit an unconstrained decision tree on the advanced features and report the
# score on the fitted data ("Train") next to the mean of cv(..., cv=10) ("Test").

fts, lbls = get_mod_features_labels(0, advanced_features, labels_single_adv, N_adv)

# Build a fresh tree in `tree`. The original assigned the new classifier to
# `logreg` (overwriting the logistic-regression variable) and then fitted
# whatever `tree` object an earlier cell had left behind.
# Seeded for reproducibility; random_state=1 mirrors the MLP models.
tree = DecisionTreeClassifier(random_state=1)
tree.fit(fts, lbls)
# Tagged "DT" rather than the misleading "LR".
print ("DT", "N", 0, "Train", tree.score(fts, lbls), "Test", np.mean(cv(tree, fts, lbls, cv=10)))

LR N 0 Train 1.0 Test 0.511892530011


In [66]:
# Random Forest - Simp
# Fit a default random forest on the simple features and report the score on
# the fitted data ("Train") next to the mean of cv(..., cv=10) ("Test").

fts, lbls = get_mod_features_labels(0, simple_features, labels_single_simp, N_simp)

# The variable keeps the name `tree` used by the neighbouring cells.
# Seeded so the bootstrap samples and feature draws repeat between runs;
# random_state=1 mirrors the MLP models in this notebook.
tree = RandomForestClassifier(random_state=1)
tree.fit(fts, lbls)
# Tagged "RF"; it was previously tagged "LR" like the logistic-regression lines.
print ("RF", "N", 0, "Train", tree.score(fts, lbls), "Test", np.mean(cv(tree, fts, lbls, cv=10)))

LR N 0 Train 0.98321299639 Test 0.522390864108


In [67]:
# Random Forest - Advanced
# Fit a default random forest on the advanced features and report the score on
# the fitted data ("Train") next to the mean of cv(..., cv=10) ("Test").

fts, lbls = get_mod_features_labels(0, advanced_features, labels_single_adv, N_adv)

# The variable keeps the name `tree` used by the neighbouring cells.
# Seeded so the bootstrap samples and feature draws repeat between runs;
# random_state=1 mirrors the MLP models in this notebook.
tree = RandomForestClassifier(random_state=1)
tree.fit(fts, lbls)
# Tagged "RF"; it was previously tagged "LR" like the logistic-regression lines.
print ("RF", "N", 0, "Train", tree.score(fts, lbls), "Test", np.mean(cv(tree, fts, lbls, cv=10)))

LR N 0 Train 0.985740072202 Test 0.510826751442


In [None]:
# # # for cutoff in range(0, 50, 10):

# cutoff = 0

# # NEUNET Size (5, 10) Solv lbfgs Act identity Alpha 1e-05 Train 0.578519855596 Test 0.578520997741
# # NEUNET Size 5 Solv sgd Act relu Alpha 1e-05 Train 0.580866425993 Test 0.571109195381

# Cs = [0.01, 0.1, 0.5, 1]
# kernels = ["rbf", "linear"]

# for c in Cs:
#     for kernel in kernels:
#         svmo = svm.SVC(gamma=0.001, C=c, kernel=kernel) #lm.LogisticRegression(), C=100.
#         svmo.fit(fts, lbls)
#         print ("SVM", "C", c, "Kernel", kernel, "Train", svmo.score(fts, lbls), "Test", np.mean(cv(svmo, fts, lbls, cv=10)))
#         #     #print ("Vec", "N", cutoff, "Training accuracy", np.mean(cv(logreg, fts, lbls, cv=10)), logreg.score(fts, lbls))

# sizes = [(5), (10), (15), (5, 5), (10, 5), (5, 10), (8, 10, 12), (10, 15, 20)]
# solvers = ['adam', 'lbfgs', 'sgd']
# activations = ['identity', 'logistic', 'tanh', 'relu']
# alphas = [1e-5, 1e-4, 1e-3]
# for size in sizes:
#     for solver in solvers:
#         for activation in activations:
#             for alpha in alphas:
#                 clf = MLPClassifier(solver=solver, activation=activation, alpha=alpha, hidden_layer_sizes=size, random_state=1)
#                 clf.fit(fts, lbls)
                
#                 print ("NEUNET", "Size", size, "Solv", solver, "Act", activation, "Alpha", alpha, "Train", clf.score(fts, lbls), "Test", np.mean(cv(clf, fts, lbls, cv=10)))

# # # fts, lbls = get_mod_features_labels(cutoff, features_justlast5_diff, labels_single, N)
# # # logreg = lm.LogisticRegression()
# # # logreg.fit(fts, lbls)
# # # print ("Last5-D", "N", cutoff, "Training accuracy", logreg.score(fts, lbls))

# # # fts, lbls = get_mod_features_labels(cutoff, features_alsolast10_vec, labels_single, N)
# # # logreg = svm.SVC(gamma=0.001, C=0.5) #lm.LogisticRegression()
# # # logreg.fit(fts, lbls)
# # # print ("Last10", "N", cutoff, "Training accuracy", np.mean(cv(logreg, fts, lbls, cv=10)), logreg.score(fts, lbls))

# # # # fts, lbls = get_mod_features_labels(cutoff, features_alsolast5_diff, labels_single, N)
# # # # logreg = lm.LogisticRegression()
# # # # logreg.fit(fts, lbls)
# # # # print ("All-D", "N", cutoff, "Training accuracy", logreg.score(fts, lbls))

# # # fts, lbls = get_mod_features_labels(cutoff, features_alsolast20_vec, labels_single, N)
# # # logreg = svm.SVC(gamma=0.001, C=0.5) #lm.LogisticRegression()
# # # logreg.fit(fts, lbls)
# # # print ("Last20", "N", cutoff, "Training accuracy", np.mean(cv(logreg, fts, lbls, cv=10)), logreg.score(fts, lbls))

# # # fts, lbls = get_mod_features_labels(cutoff, features_alsolast30_vec, labels_single, N)
# # # logreg = svm.SVC(gamma=0.001, C=0.5) #lm.LogisticRegression()
# # # logreg.fit(fts, lbls)
# # # print ("Last30", "N", cutoff, "Training accuracy", np.mean(cv(logreg, fts, lbls, cv=10)), logreg.score(fts, lbls))

# # # fts, lbls = get_mod_features_labels(cutoff, features_alsolast10_vec, labels_single, N)
# # # logreg = svm.SVC(gamma=0.001, C=0.5) #lm.LogisticRegression()
# # # logreg.fit(fts, lbls)
# # # print ("All-V-10", "N", cutoff, "Training accuracy", np.mean(cv(logreg, fts, lbls, cv=10)), logreg.score(fts, lbls))

    

In [27]:
# Simple NN
# Grid over hidden-layer layouts, solvers and activations on the simple
# features. alpha and random_state are held fixed; each combination is scored
# on its own training data ("Train") and through cv(..., cv=10) ("Test").
fts, lbls = get_mod_features_labels(0, simple_features, labels_single_simp, N_simp)
sizes = [5, 15, (5, 10)] #  (4, 6, 8, 10, 12), (12, 10, 8, 6, 4), (4, 8, 12), (12, 8, 4), (14, 12, 10, 8), (8, 10, 12, 14)
solvers = ['adam', 'lbfgs']
activations = ['identity', 'logistic', 'relu', 'tanh']

# Flatten the three nested sweeps into one list; sizes vary slowest and
# activations fastest.
grid = [(layout, opt, act) for layout in sizes for opt in solvers for act in activations]
for size, solver, activation in grid:
    clf = MLPClassifier(hidden_layer_sizes=size, solver=solver, activation=activation, alpha=1e-5, random_state=1)
    clf.fit(fts, lbls)
    train_acc = clf.score(fts, lbls)
    cv_acc = np.mean(cv(clf, fts, lbls, cv=10))
    print ("NEUNET", "Size", size, "Solv", solver, "Act", activation, "Alpha", 1e-5, "Train", train_acc, "Test", cv_acc)


NEUNET Size 5 Solv adam Act identity Alpha 1e-05 Train 0.536823104693 Test 0.528319509082
NEUNET Size 5 Solv adam Act logistic Alpha 1e-05 Train 0.545667870036 Test 0.54566796002
NEUNET Size 5 Solv adam Act relu Alpha 1e-05 Train 0.508122743682 Test 0.524159085545
NEUNET Size 5 Solv adam Act tanh Alpha 1e-05 Train 0.545848375451 Test 0.5458481402
NEUNET Size 5 Solv lbfgs Act identity Alpha 1e-05 Train 0.577436823105 Test 0.570033650352
NEUNET Size 5 Solv lbfgs Act logistic Alpha 1e-05 Train 0.545667870036 Test 0.54566796002
NEUNET Size 5 Solv lbfgs Act relu Alpha 1e-05 Train 0.589711191336 Test 0.577616839785
NEUNET Size 5 Solv lbfgs Act tanh Alpha 1e-05 Train 0.545667870036 Test 0.54566796002
NEUNET Size 15 Solv adam Act identity Alpha 1e-05 Train 0.549277978339 Test 0.529024282702
NEUNET Size 15 Solv adam Act logistic Alpha 1e-05 Train 0.558122743682 Test 0.564618143842
NEUNET Size 15 Solv adam Act relu Alpha 1e-05 Train 0.541155234657 Test 0.548906972856
NEUNET Size 15 Solv adam Act

In [28]:
# Advanced NN
# Grid over hidden-layer layouts, solvers and activations on the advanced
# features. alpha and random_state are held fixed; each combination is scored
# on its own training data ("Train") and through cv(..., cv=10) ("Test").
fts, lbls = get_mod_features_labels(0, advanced_features, labels_single_adv, N_adv)
sizes = [5, 15, (5, 10)] #  (4, 6, 8, 10, 12), (12, 10, 8, 6, 4), (4, 8, 12), (12, 8, 4), (14, 12, 10, 8), (8, 10, 12, 14)
solvers = ['adam', 'lbfgs']
activations = ['identity', 'logistic', 'relu', 'tanh']

# Flatten the three nested sweeps into one list; sizes vary slowest and
# activations fastest.
grid = [(layout, opt, act) for layout in sizes for opt in solvers for act in activations]
for size, solver, activation in grid:
    clf = MLPClassifier(hidden_layer_sizes=size, solver=solver, activation=activation, alpha=1e-5, random_state=1)
    clf.fit(fts, lbls)
    train_acc = clf.score(fts, lbls)
    cv_acc = np.mean(cv(clf, fts, lbls, cv=10))
    print ("NEUNET", "Size", size, "Solv", solver, "Act", activation, "Alpha", 1e-5, "Train", train_acc, "Test", cv_acc)


NEUNET Size 5 Solv adam Act identity Alpha 1e-05 Train 0.529241877256 Test 0.552887532038
NEUNET Size 5 Solv adam Act logistic Alpha 1e-05 Train 0.564801444043 Test 0.561366426846
NEUNET Size 5 Solv adam Act relu Alpha 1e-05 Train 0.549458483755 Test 0.537343521246
NEUNET Size 5 Solv adam Act tanh Alpha 1e-05 Train 0.545667870036 Test 0.545306949189
NEUNET Size 5 Solv lbfgs Act identity Alpha 1e-05 Train 0.58285198556 Test 0.575816668863
NEUNET Size 5 Solv lbfgs Act logistic Alpha 1e-05 Train 0.580866425993 Test 0.559926288697
NEUNET Size 5 Solv lbfgs Act relu Alpha 1e-05 Train 0.580324909747 Test 0.572026056321
NEUNET Size 5 Solv lbfgs Act tanh Alpha 1e-05 Train 0.545848375451 Test 0.545125790952
NEUNET Size 15 Solv adam Act identity Alpha 1e-05 Train 0.554693140794 Test 0.536459889558
NEUNET Size 15 Solv adam Act logistic Alpha 1e-05 Train 0.566606498195 Test 0.56859578179
NEUNET Size 15 Solv adam Act relu Alpha 1e-05 Train 0.531588447653 Test 0.554306494625
NEUNET Size 15 Solv adam 

In [43]:
# Error Analysis - Use this Model
# Refit the single configuration chosen for the error analysis on the advanced
# features. The fitted model stays in `clf` and the data in `fts` / `lbls`.

size, solver, activation = 5, 'lbfgs', 'relu'
fts, lbls = get_mod_features_labels(0, advanced_features, labels_single_adv, N_adv)

clf = MLPClassifier(hidden_layer_sizes=size, solver=solver, activation=activation, alpha=1e-5, random_state=1)
clf.fit(fts, lbls)
train_acc = clf.score(fts, lbls)
cv_acc = np.mean(cv(clf, fts, lbls, cv=10))
print ("NEUNET", "Size", size, "Solv", solver, "Act", activation, "Alpha", 1e-5, "Train", train_acc, "Test", cv_acc)

# Sanity check: one label per feature row.
print(len(fts), len(lbls))

NEUNET Size 5 Solv lbfgs Act relu Alpha 1e-05 Train 0.580324909747 Test 0.572026056321
5540 5540


In [44]:
# Error Analysis - Take dev set
# Keep a random tenth of the rows in `fts` / `lbls` for the analysis cells.
#
# NOTE: this overwrites `fts` and `lbls`. Re-running the cell without first
# re-running the model cell samples a tenth of the already reduced data.
# NOTE(review): the rows are drawn from the data `clf` was fitted on, so this
# sample is not held out from training - confirm that is intended.

# A private, seeded generator makes the sample reproducible without touching
# numpy's global random state.
DEV_SEED = 1
indeces = list(range(len(lbls)))
np.random.RandomState(DEV_SEED).shuffle(indeces)
indeces = indeces[0:len(indeces) // 10]

fts2 = [fts[index] for index in indeces]
lbls2 = [lbls[index] for index in indeces]

fts = fts2
lbls = lbls2

print(len(fts), len(lbls))

554 554


In [74]:
# Error Analysis
# Accuracy of `clf` split by the two teams' values of one feature: the last
# entry of the first half of the row (team1) and the last entry of the row
# (team2).
# NOTE(review): the 0.5 threshold suggests this feature is a win rate - confirm
# against the feature builder.

both_win = {'correct': 0, 'total': 0}
both_loss = {'correct': 0, 'total': 0}
either = {'correct': 0, 'total': 0}

for i in range(len(lbls)):
    row = fts[i]
    team1 = row[len(row) // 2 - 1]
    team2 = row[len(row) - 1]
    label = lbls[i]
    predict = clf.predict([row])
    # Pick the bucket once, then update it in a single place. A value of
    # exactly 0.5 on either side falls into the mixed bucket.
    if team1 > 0.5 and team2 > 0.5:
        bucket = both_win
    elif team1 < 0.5 and team2 < 0.5:
        bucket = both_loss
    else:
        bucket = either
    bucket['total'] += 1
    bucket['correct'] += (1 if predict[0] == label else 0)

for title, bucket in (("Both Win: ", both_win), ("Both Loss: ", both_loss), ("1 Win/1 Loss: ", either)):
    # An empty bucket reports nan instead of raising ZeroDivisionError.
    accuracy = bucket['correct']*1./bucket['total'] if bucket['total'] else float('nan')
    print(title, accuracy, bucket['total'])

Both Win:  0.5588235294117647 1088
Both Loss:  0.5627871362940275 1306
1 Win/1 Loss:  0.5950413223140496 3146


In [106]:
# Feature Analysis
# Baseline that ignores the model: predict label 0 when team2's value of the
# compared feature is strictly larger than team1's, otherwise label 1 (ties
# therefore predict 1). Results use the same three buckets as the error
# analysis.
# NOTE(review): the 0.5 threshold suggests this feature is a win rate - confirm
# against the feature builder.

both_win = {'better_wins': 0, 'total': 0}
both_loss = {'better_wins': 0, 'total': 0}
either = {'better_wins': 0, 'total': 0}

for i in range(len(lbls)):
    row = fts[i]
    team1 = row[len(row) // 2 - 1]
    team2 = row[len(row) - 1]
    label = lbls[i]
    # The original also called clf.predict here, but the result was never
    # used; the call is dropped.
    pred = 0 if team2 > team1 else 1
    if team1 > 0.5 and team2 > 0.5:
        bucket = both_win
    elif team1 < 0.5 and team2 < 0.5:
        bucket = both_loss
    else:
        bucket = either
    bucket['total'] += 1
    bucket['better_wins'] += (1 if pred == label else 0)

for title, bucket in (("Both Win: ", both_win), ("Both Loss: ", both_loss), ("1 Win/1 Loss: ", either)):
    # An empty bucket reports nan instead of raising ZeroDivisionError.
    rate = bucket['better_wins']*1./bucket['total'] if bucket['total'] else float('nan')
    print(title, rate, bucket['total'])

Both Win:  0.5238970588235294 1088
Both Loss:  0.5474732006125574 1306
1 Win/1 Loss:  0.5718372536554355 3146


In [104]:
# Error Analysis: 2
# Accuracy of `clf` grouped by confidence. Each sample is counted in the FIRST
# band whose width covers (larger class probability - 0.5), so the bands are
# disjoint. The printed "0.5-d - 0.5+d" range is the outer edge of a band;
# samples already counted in a narrower band are not repeated.

distances = [.01, .02, .03, .04, .05, .075, .1, .2, .5]
# One counter per band, sized from `distances` (the original hard-coded 11
# slots for 9 bands).
corrects = [0] * len(distances)
totals = [0] * len(distances)

for i in range(len(lbls)):
    sample = [fts[i]]
    proba = clf.predict_proba(sample)[0]
    # Probability of the more likely of the two classes.
    confidence = proba[0] if proba[0] > proba[1] else proba[1]
    label = lbls[i]
    for j in range(len(distances)):
        if confidence - 0.5 <= distances[j]:
            totals[j] += 1
            if clf.predict(sample) == label:
                corrects[j] += 1
            break

for i in range(len(distances)):
    # An empty band reports nan instead of raising ZeroDivisionError.
    accuracy = corrects[i]*1./totals[i] if totals[i] else float('nan')
    print((0.5-distances[i]), "-", (0.5+distances[i]), " ", totals[i], " ", accuracy)

0.49 - 0.51   451   0.5144124168514412
0.48 - 0.52   462   0.5281385281385281
0.47 - 0.53   410   0.5585365853658537
0.46 - 0.54   457   0.5470459518599562
0.45 - 0.55   461   0.5574837310195228
0.425 - 0.575   978   0.5501022494887525
0.4 - 0.6   747   0.5689424364123159
0.3 - 0.7   1397   0.654974946313529
0.0 - 1.0   177   0.7062146892655368


In [71]:
# Score `clf` on whatever `fts` / `lbls` currently hold.
# NOTE(review): the recorded value equals the "Train" score printed when the
# model was fitted on the full data, and the execution counts are out of order,
# so this output may pre-date the dev-set sub-sampling - re-run in order to confirm.
print("Accuracy on dev set", clf.score(fts, lbls))

Accuracy on dev set 0.580324909747


In [None]:
# N: 0, 10, 20, 30, 40, 50
# models: logreg, svm (linear), svm (default - rbf), neural net

# Playoff results, directly imported (messy but works!)
# Keyed by season name (the keys are looked up in `season_names` when the
# features are built). Each row is [team1, team2, value1, value2]: the team
# codes are passed to get_playoff_feature, and a series is labelled 1 when
# value2 > value1, else 0.
# NOTE(review): the values look like series wins in a best-of-seven (the
# larger one is always 4) - confirm.

playoff_results = {
    "12-13":[
        ["PIT", "NYI", 4, 2, ],
        ["MTL", "OTT", 1, 4],
        ["WSH", "NYR", 3, 4],
        ["BOS", "TOR", 4, 3],
        ["CHI", "MIN", 4, 1],
        ["ANA", "DET", 3, 4],
        ["VAN", "S.J", 0, 4],
        ["STL", "L.A", 2, 4],
        ["PIT", "OTT", 4, 1],
        ["BOS", "NYR", 4, 1],
        ["CHI", "DET", 4, 3],
        ["L.A", "S.J", 4, 3],
        ["PIT", "BOS", 0, 4],
        ["CHI", "L.A", 4, 1],
        ["BOS", "CHI", 2, 4]
    ],
    "13-14":[
        ["BOS", "DET", 4, 1],
        ["T.B", "MTL", 0, 4],
        ["PIT", "CBJ", 4, 2],
        ["NYR", "PHI", 4, 3],
        ["COL", "MIN", 3, 4],
        ["STL", "CHI", 2, 4],
        ["ANA", "DAL", 4, 2],
        ["S.J", "L.A", 3, 4],
        ["BOS", "MTL", 3, 4],
        ["PIT", "NYR", 3, 4],
        ["MIN", "CHI", 2, 4],
        ["ANA", "L.A", 3, 4],
        ["MTL", "NYR", 2, 4],
        ["CHI", "L.A", 3, 4],
        ["NYR", "L.A", 1, 4]
    ],
    "14-15":[
        ["MTL", "OTT", 4, 2],
        ["T.B", "DET", 4, 3],
        ["NYR", "PIT", 4, 1],
        ["WSH", "NYI", 4, 3],
        ["STL", "MIN", 2, 4],
        ["NSH", "CHI", 2, 4],
        ["ANA", "WPG", 4, 0],
        ["VAN", "CGY", 2, 4],
        ["MTL", "T.B", 2, 4],
        ["NYR", "WSH", 4, 3],
        ["MIN", "CHI", 0, 4],
        ["ANA", "CGY", 4, 1],
        ["T.B", "NYR", 4, 3],
        ["CHI", "ANA", 4, 3],
        ["T.B", "CHI", 2, 4]
    ],
    "15-16":[
        ["FLA", "NYI", 2, 4],
        ["T.B", "DET", 4, 1],
        ["WSH", "PHI", 4, 2],
        ["PIT", "NYR", 4, 1],
        ["DAL", "MIN", 4, 2],
        ["STL", "CHI", 4, 3],
        ["ANA", "NSH", 3, 4],
        ["L.A", "S.J", 1, 4],
        ["NYI", "T.B", 1, 4],
        ["WSH", "PIT", 2, 4],
        ["DAL", "STL", 3, 4],
        ["NSH", "S.J", 3, 4],
        ["T.B", "PIT", 3, 4],
        ["STL", "S.J", 2, 4],
        ["PIT", "S.J", 4, 2]
    ],
    "16-17":[
        ["MTL", "NYR", 2, 4],
        ["OTT", "BOS", 4, 2],
        ["WSH", "TOR", 4, 2],
        ["PIT", "CBJ", 4, 1],
        ["CHI", "NSH", 0, 4],
        ["MIN", "STL", 1, 4],
        ["ANA", "CGY", 4, 0],
        ["EDM", "S.J", 4, 2],
        ["NYR", "OTT", 2, 4],
        ["WSH", "PIT", 3, 4],
        ["NSH", "STL", 4, 2],
        ["ANA", "EDM", 4, 3],
        ["OTT", "PIT", 3, 4],
        ["NSH", "ANA", 4, 2],
        ["PIT", "NSH", 4, 2]
    ]
}

In [None]:
# Turn every playoff series into one feature row (team1's features followed by
# team2's) and one label (1 when the second team's result is the larger one).
playoff_features = []
playoff_labels = []

for date, games in playoff_results.items():
    # The value passed as `start` depends only on the season, so look it up once.
    start = seasons[season_names.index(date)]
    for game in games:
        team1 = get_playoff_feature(date, game[0], mapping, start)
        team2 = get_playoff_feature(date, game[1], mapping, start)
        playoff_features.append(list(team1) + list(team2))
        playoff_labels.append(int(game[3] > game[2]))

In [None]:
# Models fitted directly on the playoff series: a logistic-regression baseline,
# an SVM sweep and an MLP sweep. Every line reports the score on the fitted
# data ("Train") and the mean of cv(..., cv=10) ("Test").
# NOTE(review): `cutoff` is not assigned in this cell - it must already exist
# in the kernel when this runs.
logregp = lm.LogisticRegression()
logregp.fit(playoff_features, playoff_labels)
lr_train = logregp.score(playoff_features, playoff_labels)
lr_cv = np.mean(cv(logregp, playoff_features, playoff_labels, cv=10))
print ("LR", "N", cutoff, "Train", lr_train, "Test", lr_cv)

fts2 = playoff_features
lbls2 = playoff_labels

# SVM C 100 Kernel rbf Train 1.0 Test 0.7 -> can get 70% accuracy on predicting playoffs
Cs = [0.01, 0.1, 0.5, 1, 10, 20, 40, 50, 75, 100, 1000, 10000]
kernels = ["rbf", "linear", "poly", "sigmoid"]

for c, kernel in [(c_value, kernel_name) for c_value in Cs for kernel_name in kernels]:
    svmo = svm.SVC(gamma=0.001, C=c, kernel=kernel) #lm.LogisticRegression(), C=100.
    svmo.fit(fts2, lbls2)
    svm_train = svmo.score(fts2, lbls2)
    svm_cv = np.mean(cv(svmo, fts2, lbls2, cv=10))
    print ("SVM", "C", c, "Kernel", kernel, "Train", svm_train, "Test", svm_cv)
    #     #print ("Vec", "N", cutoff, "Training accuracy", np.mean(cv(logreg, fts, lbls, cv=10)), logreg.score(fts, lbls))

sizes = [5, 10, 15, (5, 5), (10, 5), (5, 10), (8, 10, 12), (10, 15, 20)]
solvers = ['adam', 'lbfgs', 'sgd']
activations = ['identity', 'logistic', 'tanh', 'relu']
alphas = [1e-5, 1e-4, 1e-3]

# Flattened four-way sweep; sizes vary slowest and alphas fastest.
mlp_grid = [(layout, opt, act, reg)
            for layout in sizes
            for opt in solvers
            for act in activations
            for reg in alphas]
for size, solver, activation, alpha in mlp_grid:
    clf = MLPClassifier(hidden_layer_sizes=size, solver=solver, activation=activation, alpha=alpha, random_state=1)
    clf.fit(fts2, lbls2)
    mlp_train = clf.score(fts2, lbls2)
    mlp_cv = np.mean(cv(clf, fts2, lbls2, cv=10))
    print ("NEUNET", "Size", size, "Solv", solver, "Act", activation, "Alpha", alpha, "Train", mlp_train, "Test", mlp_cv)


In [None]:
import sklearn.linear_model as linear_model
from sklearn.model_selection import train_test_split
from sklearn import metrics
import numpy as np

from math import factorial

def comb(n, k):
    """Return the binomial coefficient C(n, k) as an exact integer.

    Uses integer floor division, which is exact here because the denominator
    always divides n!. The previous true-division form returned a float and
    raised OverflowError once the intermediate quotient exceeded float range.

    Raises:
        ValueError: from ``factorial`` when ``k`` or ``n - k`` is negative.
    """
    return factorial(n) // (factorial(k) * factorial(n - k))

def predictSeries(prob):
    """Outcome probabilities of a best-of-seven series.

    Args:
        prob: probability that side A wins any single game. Games are treated
            as independent, all with this same probability.

    Returns:
        ``[A_4, B_4, A_5, B_5, A_6, B_6, A_7, B_7]``, where ``A_n`` / ``B_n``
        is the probability that A / B wins the series in exactly ``n`` games.
        For ``prob`` in [0, 1] the eight values sum to 1.
    """
    lose = 1 - prob
    outcomes = []
    for losses in range(4):
        # The series winner takes the final game. Its other three wins and its
        # `losses` defeats can be ordered in C(3 + losses, 3) ways.
        ways = comb(3 + losses, 3)
        outcomes.append(ways * (prob**4) * (lose**losses))
        outcomes.append(ways * (lose**4) * (prob**losses))
    return outcomes


In [None]:
# Fit logistic regression on the regular-season games, then predict each
# playoff series by turning the single-game probability into a series
# probability.
cutoff = 0
fts, lbls = get_mod_features_labels(cutoff, features_vec, labels_single, N)
logreg = lm.LogisticRegression()
logreg.fit(fts, lbls)

print("Accuracy on regular season", np.mean(cv(logreg, fts, lbls, cv=10)))

cor = 0
tot = 0
for ft, lbl in zip(playoff_features, playoff_labels):
    # Probability of the model's first class; per the original note this is
    # the probability that team2 wins a single game.
    prob = logreg.predict_proba([ft])[0][0]
    series = predictSeries(prob)
    # Entries 0, 2, 4 and 6 are that side winning in 4, 5, 6 or 7 games.
    sum_prob = sum(series[0::2])
    pred_label = int(sum_prob > 0.5)
    cor += int(lbl == pred_label)
    tot += 1
print("Accuracy on playoff season", (cor*1./tot))

In [None]:
# Per-season cutoff dates (integers that read as YYYYMMDD). load_playoffs_file
# skips every game dated before the cutoff it is given. The list is indexed in
# parallel with season_names by the calls at the end of this cell.
date_cutoffs = [20130430, 20140416, 20150415, 20160413, 20170412]

probs = {} # Season -> Date -> [model probability, one entry per game]
payoffs = {} # Season -> Date -> [{'H': ..., 'A': ...}, one dict per game]
# Start every season with empty per-date tables; load_playoffs_file fills them.
for name in season_names:
    probs[name] = {}
    payoffs[name] = {}

def _american_to_decimal(cell):
    """Convert a moneyline cell such as ``"x (-150)"`` to a decimal payoff.

    The American odds are read from between the first "(" and the first ")".
    Negative odds -n pay (100 + n) / n per unit staked, stake included;
    non-negative odds +n pay (100 + n) / 100.
    """
    american = int(cell[cell.find("(")+1:cell.find(")")])
    if american < 0:
        return (100 - american) / (-american)
    return (100 + american) / 100


def load_playoffs_file(model, txtfile, year, mapping, season, date_cutoff):
    """Append model probabilities and betting payoffs for one odds file.

    The CSV has a header row followed by two rows per game (visitor first,
    then home). For every game dated on or after ``date_cutoff`` whose score
    column (index 7) is filled in on both rows, this appends to the
    module-level tables:

    * ``probs[season][date]``   - ``model.predict_proba(...)[0][0]`` for the
      row built from the visitor's features followed by the home team's;
    * ``payoffs[season][date]`` - ``{'H': home payoff, 'A': visitor payoff}``
      as decimal odds parsed from column index 10.

    Args:
        model: fitted classifier exposing ``predict_proba``.
        txtfile: path of the odds CSV.
        year: forwarded to ``convert_odds_date`` together with column 0.
        mapping: forwarded to ``get_playoff_feature``.
        season: key into ``probs`` / ``payoffs``, also forwarded to
            ``get_playoff_feature``.
        date_cutoff: games with ``date < date_cutoff`` are skipped.

    Changes from the original: an unpaired trailing row or an empty file is
    tolerated instead of raising IndexError, the odds parsing lives in one
    helper, and the unused integer score locals are gone.
    """
    with open(txtfile, 'r') as f:
        # Drop the header row. Slicing copes with an empty file.
        lines = list(csv.reader(f.read().splitlines()))[1:]

    # Rows are consumed in visitor/home pairs. Stopping at len - 1 ignores an
    # unpaired last row.
    for i in range(0, len(lines) - 1, 2):
        line1 = lines[i]
        line2 = lines[i+1]

        # Skip games that have no score on either row.
        if line1[7].strip() == '' or line2[7].strip() == '':
            continue

        date = convert_odds_date(line1[0], year)
        if date < date_cutoff:
            continue

        team1 = name_cities_mapping[line1[3].strip()] # Visitor
        team2 = name_cities_mapping[line2[3].strip()] # Home

        # Feature row: visitor features followed by home features, using the
        # game date as the `start` argument.
        feature_team1 = get_playoff_feature(season, team1, mapping, date)
        feature_team2 = get_playoff_feature(season, team2, mapping, date)
        vec = list(feature_team1) + list(feature_team2)
        # Probability of the model's first class; the original note marks it
        # as "for team2".
        prob2 = model.predict_proba([vec])[0][0]

        odds1 = _american_to_decimal(line1[10])
        odds2 = _american_to_decimal(line2[10])

        if date not in probs[season]:
            probs[season][date] = []
            payoffs[season][date] = []

        probs[season][date].append(prob2)
        payoffs[season][date].append({'H': odds2, 'A': odds1})
            
# Odds files to load as (file name, year argument, index into season_names /
# date_cutoffs). The two earliest seasons are disabled.
odds_files = [
    # ("nhl_odds_2012-13.csv", 2012, 0),
    # ("nhl_odds_2013-14.csv", 2013, 1),
    ("nhl_odds_2014-15.csv", 2014, 2),
    ("nhl_odds_2015-16.csv", 2015, 3),
    ("nhl_odds_2016-17.csv", 2016, 4),
]
for odds_csv, start_year, season_idx in odds_files:
    load_playoffs_file(logreg, odds_csv, start_year, mapping, season_names[season_idx], date_cutoffs[season_idx])


In [None]:
# Spot check: what was stored for season_names[2] on 20150415 (that season's
# cutoff date), probabilities first and payoffs second.
inspect_date = 20150415
for table in (probs, payoffs):
    print(table[season_names[2]][inspect_date])

In [None]:
import pickle

# Persist both lookup tables next to the notebook, one pickle file each.
for pickle_path, table in (('probs.pickle', probs), ('payoffs.pickle', payoffs)):
    with open(pickle_path, 'wb') as handle:
        pickle.dump(table, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Refit the regular-season logistic regression and tabulate its per-row
# predictions on the playoff rows against the playoff labels.
# NOTE(review): the playoff labels are 1 when the SECOND team has the larger
# result, while an earlier note reads predict_proba(...)[0][0] (the first
# class) as "team2 wins". If that note is right the two encodings are
# opposite and this matrix is mirrored - confirm before interpreting it.
cutoff = 0
fts, lbls = get_mod_features_labels(cutoff, features_vec, labels_single, N)
logreg = lm.LogisticRegression()
logreg.fit(fts, lbls)
playoff_predictions = logreg.predict(playoff_features)
confusion_matrix = metrics.confusion_matrix(playoff_labels, playoff_predictions)

In [None]:
import matplotlib.pyplot as plt

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    cm        -- square array of counts (rows: true class, columns: predicted).
    classes   -- tick labels, one per class, in matrix order.
    normalize -- when True, divide every row by its sum before printing and
                 plotting, and show the cells with two decimals.
    title     -- plot title.
    cmap      -- matplotlib colour map for the heat map.

    Draws on the current pyplot figure; the caller creates and shows it.
    """
    if not normalize:
        print('Confusion matrix, without normalization')
    else:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    cell_format = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text so it stays readable.
    half_max = cm.max() / 2.
    for row, col in np.ndindex(cm.shape[0], cm.shape[1]):
        text_color = "white" if cm[row, col] > half_max else "black"
        plt.text(col, row, format(cm[row, col], cell_format),
                 horizontalalignment="center",
                 color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
import itertools

np.set_printoptions(precision=2)

class_names = ["Visitor Wins", "Home Wins"]

# One figure for the raw counts, then one for the row-normalised matrix.
figure_specs = (
    (False, 'Confusion matrix on test set'),
    (True, 'Normalized confusion matrix'),
)
for use_normalized, figure_title in figure_specs:
    plt.figure()
    plot_confusion_matrix(confusion_matrix, classes=class_names,
                          normalize=use_normalized, title=figure_title)

plt.show()