## Imports

In [1]:
from datetime import timedelta
from collections import defaultdict
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV

## Reading Data and Splitting

In [2]:
data = pd.read_csv('data_with_points.csv', sep=',', index_col=0)
data['Date'] = pd.to_datetime(data['Date'])
# Create a column to represent the difference in points between the two teams
data['points diff'] = abs(data['HP'] - data['AP'])
data = data[['Date','WHH','WHD','WHA','HWW','AWW','points diff','watch']]

First we will scale the numerical data

In [3]:
scaler = StandardScaler()
data[['WHH','WHD','WHA','HWW','AWW','points diff']] = scaler.fit_transform(
    data[['WHH','WHD','WHA','HWW','AWW','points diff']])

Now we will split the data into training, validation and testing. I will save two seasons (2014-15 and 2017-18) for testing and use two seasons (2016-17 and 2012-13) for validation. The reason I did not want to use the final 4 seasons for validation and testing is that the training data may be distributed differently in the past.

In [4]:
data.head()

Unnamed: 0,Date,WHH,WHD,WHA,HWW,AWW,points diff,watch
8,2000-08-26,-0.789699,0.173834,1.331001,-2.178451,1.05211,-0.694574,1
10,2000-08-26,-0.637783,-0.133676,0.150127,-2.178451,4.2739,-0.694574,1
11,2000-08-26,-0.049106,-0.625693,-0.587919,1.046521,1.05211,-0.908086,0
12,2000-08-26,-0.3023,-0.502689,-0.440309,4.271493,1.05211,-1.014842,0
14,2000-08-26,-0.428897,-0.441187,-0.322222,1.046521,1.05211,-0.80133,0


In [5]:
data.tail()

Unnamed: 0,Date,WHH,WHD,WHA,HWW,AWW,points diff,watch
6241,2018-05-13,-0.789699,2.326408,1.331001,-0.086577,-0.07987,2.721611,0
6242,2018-05-13,2.482834,0.481345,-0.862472,-0.783869,-0.07987,2.081076,0
6243,2018-05-13,3.748804,1.096366,-0.912659,-0.435223,1.139186,5.604018,0
6244,2018-05-13,-0.422568,0.173834,-0.145091,-0.783869,-0.776473,-0.80133,0
6245,2018-05-13,-0.808689,2.941429,1.331001,-0.2609,-0.428171,1.22703,1


In [6]:
train = data[data['Date'] < '2012-07-13']
train = train.append(data[(data['Date'] > '2013-08-01') & (data['Date'] < '2014-07-13')])
train = train.append(data[(data['Date'] > '2015-08-01') & (data['Date'] < '2016-07-13')])
validate = data[(data['Date'] > '2016-08-01') & (data['Date'] < '2017-07-13')]
validate = validate.append(data[(data['Date'] > '2012-08-01') & (data['Date'] < '2013-07-13')])
test = data[(data['Date'] > '2014-08-01') & (data['Date'] < '2015-07-13')]
test = test.append(data[(data['Date'] > '2017-08-01') & (data['Date'] < '2018-07-13')])

In [7]:
train.tail()

Unnamed: 0,Date,WHH,WHD,WHA,HWW,AWW,points diff,watch
5507,2016-05-15,1.216864,-0.256681,-0.785715,-0.2609,-0.254021,2.935123,1
5508,2016-05-15,-0.770709,1.465379,1.035783,-0.2609,-0.776473,1.120274,1
5509,2016-05-15,0.583879,-0.625693,-0.676484,-0.609546,0.616733,0.479739,0
5510,2016-05-15,1.849849,0.887259,-0.87428,-0.435223,0.268432,0.800007,0
5511,2016-05-15,-0.397248,0.173834,-0.263178,-0.958191,0.442583,-0.481063,1


In [8]:
validate.head()

Unnamed: 0,Date,WHH,WHD,WHA,HWW,AWW,points diff,watch
5522,2016-08-27,-0.815018,1.711387,2.216656,-2.178451,-2.169679,-1.014842,0
5523,2016-08-27,-0.125065,-0.502689,-0.410788,-2.178451,1.05211,-1.014842,0
5524,2016-08-27,-0.523845,-0.010672,0.076323,-2.178451,1.05211,-0.694574,0
5525,2016-08-27,2.799327,0.481345,-0.883137,-2.178451,1.05211,-1.014842,0
5526,2016-08-27,-0.536505,-0.010672,0.150127,-2.178451,-2.169679,-0.908086,0


In [9]:
validate.tail()

Unnamed: 0,Date,WHH,WHD,WHA,HWW,AWW,points diff,watch
4412,2013-05-19,-0.492196,-0.010672,-0.047669,1.133683,1.313336,-0.481063,1
4413,2013-05-19,-0.808689,1.711387,1.921438,0.785037,0.268432,2.294588,0
4414,2013-05-19,1.109256,-0.133676,-0.785715,0.610714,0.965035,4.002681,1
4415,2013-05-19,-0.555494,-0.256681,0.297737,0.785037,0.442583,0.372984,1
4416,2013-05-19,-0.270651,-0.379685,-0.322222,1.308005,0.442583,-0.587818,1


In [10]:
test.head()

Unnamed: 0,Date,WHH,WHD,WHA,HWW,AWW,points diff,watch
4790,2014-08-30,2.482834,-0.256681,-0.847711,1.046521,-2.169679,-0.908086,0
4791,2014-08-30,0.647177,-0.896303,-0.661723,4.271493,1.05211,-0.80133,1
4792,2014-08-30,-0.871987,2.326408,4.873622,1.046521,-2.169679,-0.80133,0
4793,2014-08-30,-0.523845,-0.502689,0.297737,-2.178451,1.05211,-0.908086,1
4794,2014-08-30,-0.049106,-0.871702,-0.410788,1.046521,1.05211,-0.908086,0


In [11]:
test.tail()

Unnamed: 0,Date,WHH,WHD,WHA,HWW,AWW,points diff,watch
6241,2018-05-13,-0.789699,2.326408,1.331001,-0.086577,-0.07987,2.721611,0
6242,2018-05-13,2.482834,0.481345,-0.862472,-0.783869,-0.07987,2.081076,0
6243,2018-05-13,3.748804,1.096366,-0.912659,-0.435223,1.139186,5.604018,0
6244,2018-05-13,-0.422568,0.173834,-0.145091,-0.783869,-0.776473,-0.80133,0
6245,2018-05-13,-0.808689,2.941429,1.331001,-0.2609,-0.428171,1.22703,1


## The Classifiers

I am going to try a number of different classifiers and assess their quality on the validation set. I have tried some of these before and do not expect them to do well. These are all used because they allow prediction of probabilities for the two classifications (worth watching and not).

In [12]:
rfc = RandomForestClassifier(n_estimators=100, random_state=101)
dtc = DecisionTreeClassifier(random_state=101)
lin = LinearSVC(random_state=101, max_iter=1000000)
lin_svc = CalibratedClassifierCV(lin, cv=5)
rbf_svc = SVC(kernel='rbf', gamma='scale', random_state=101, probability=True)
sig_svc = SVC(kernel='sigmoid', gamma='scale', random_state=101, probability=True)
lrc = LogisticRegression(solver='saga', random_state=101)
clfs = [
    ('rfc', rfc),
    ('dtc', dtc),
    ('lin', lin_svc),
    ('rbf', rbf_svc),
    ('sig', sig_svc),
    ('log', lrc)
]

In [13]:
for clf in clfs:
    clf[1].fit(train[['WHH','WHD','WHA','HWW','AWW','points diff']],train['watch'])

I am assessing their quality by using them to predict for each week, which game is the most likely to be worth watching. I will then check if it was actually worth watching. If it was, that classifier gets a +1 to it's correct count. At the end we see what proportion of weeks the highest probability game was actually worth watching.

In [14]:
correct = defaultdict(int)
weeks = 0
# Loop over every week in the 2016 season going Weds to Tues
for lower_bound_date in pd.Series(pd.date_range('2016-08-24', freq='7D', periods=46, closed='left')):
    # Get the games in the given week
    games = validate[validate['Date'] >= lower_bound_date]
    games = games[games['Date'] <= lower_bound_date + timedelta(days=7)]
    games = games.reset_index(drop=True)    
    # If there was a game in that week (some weeks are empty)
    if len(games) > 0:
        # Loop over the classifiers
        for clf in clfs:
            # Predict the probabilities for the validation data
            probs = clf[1].predict_proba(games[['WHH','WHD','WHA','HWW','AWW','points diff']])
            # Sort the probabilities
            probs_list = [[i, list(probs[i])] for i in range(0, len(probs))]
            sorted_probs = sorted(probs_list, key=lambda x: x[1][1], reverse=True)[:3]
            # If the highest probability of being worth watching was then 
            # it was correct and update the dictionary
            if games.iloc[sorted_probs[0][0]]['watch'] == 1:
                correct[clf[0]] += 1
        # Keep a count of how many weeks in the season were predicted for
        weeks += 1
# Do the same thing for the 2012 season
for lower_bound_date in pd.Series(pd.date_range('2012-08-15', freq='7D', periods=46, closed='left')):
    games = validate[validate['Date'] >= lower_bound_date]
    games = games[games['Date'] <= lower_bound_date + timedelta(days=7)]
    games = games.reset_index(drop=True)    
    if len(games) > 0:
        for clf in clfs:
            probs = clf[1].predict_proba(games[['WHH','WHD','WHA','HWW','AWW','points diff']])
            probs_list = [[i, list(probs[i])] for i in range(0, len(probs))]
            sorted_probs = sorted(probs_list, key=lambda x: x[1][1], reverse=True)[:3]
            if games.iloc[sorted_probs[0][0]]['watch'] == 1:
                correct[clf[0]] += 1
        weeks += 1
for clf in clfs:
    print(f'The {clf[0]} classifier had an accuracy of {correct[clf[0]] / weeks}')

The rfc classifier had an accuracy of 0.4782608695652174
The dtc classifier had an accuracy of 0.36231884057971014
The lin classifier had an accuracy of 0.4057971014492754
The rbf classifier had an accuracy of 0.4057971014492754
The sig classifier had an accuracy of 0.36231884057971014
The log classifier had an accuracy of 0.37681159420289856


So the classifier that performed the best was the random forest. It had an accuracy of 47.8% over the weeks in 2016 and 2012. This is promising so let's see how it does on the test set.

In [15]:
final_train = train.append(validate)
rfc.fit(final_train[['WHH','WHD','WHA','HWW','AWW','points diff']], final_train['watch'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=101,
                       verbose=0, warm_start=False)

In [19]:
correct = 0
weeks = 0
for lower_bound_date in pd.Series(pd.date_range('2017-08-23', freq='7D', periods=46, closed='left')):
    games = test[test['Date'] >= lower_bound_date]
    games = games[games['Date'] <= lower_bound_date + timedelta(days=7)]
    games = games.reset_index(drop=True)    
    if len(games) > 0:
        probs = rfc.predict_proba(games[['WHH','WHD','WHA','HWW','AWW','points diff']])
        probs_list = [[i, list(probs[i])] for i in range(0, len(probs))]
        sorted_probs = sorted(probs_list, key=lambda x: x[1][1], reverse=True)[:3]
        if games.iloc[sorted_probs[0][0]]['watch'] == 1:
            correct += 1
        weeks += 1
for lower_bound_date in pd.Series(pd.date_range('2014-08-13', freq='7D', periods=46, closed='left')):
    games = test[test['Date'] >= lower_bound_date]
    games = games[games['Date'] <= lower_bound_date + timedelta(days=7)]
    games = games.reset_index(drop=True)    
    if len(games) > 0:
        probs = rfc.predict_proba(games[['WHH','WHD','WHA','HWW','AWW','points diff']])
        probs_list = [[i, list(probs[i])] for i in range(0, len(probs))]
        sorted_probs = sorted(probs_list, key=lambda x: x[1][1], reverse=True)[:3]
        if games.iloc[sorted_probs[0][0]]['watch'] == 1:
            correct += 1
        weeks += 1
print(f'Random forest had a final accuracy of {correct / weeks}')

Random forest had a final accuracy of 0.373134328358209


This final accuracy is 37.3% which is the highest I have encountered so far.