## Imports

In [1]:
from datetime import timedelta
from collections import defaultdict
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV

## Reading Data and Splitting

In [2]:
data = pd.read_csv('data_with_points.csv', sep=',', index_col=0)
data['Date'] = pd.to_datetime(data['Date'])
data['point_diff'] = abs(data['HP'] - data['AP'])
data = data[['Date','WHH','WHD','WHA','HWW','AWW','point_diff','watch']]

# Scaling the Data

In [3]:
scaler = StandardScaler()
data[['WHH','WHD','WHA','HWW','AWW','point_diff']] = scaler.fit_transform(
    data[['WHH','WHD','WHA','HWW','AWW','point_diff']])

In [4]:
data.head()

Unnamed: 0,Date,WHH,WHD,WHA,HWW,AWW,point_diff,watch
8,2000-08-26,-0.789699,0.173834,1.331001,-2.178451,1.05211,-0.694574,1
10,2000-08-26,-0.637783,-0.133676,0.150127,-2.178451,4.2739,-0.694574,1
11,2000-08-26,-0.049106,-0.625693,-0.587919,1.046521,1.05211,-0.908086,0
12,2000-08-26,-0.3023,-0.502689,-0.440309,4.271493,1.05211,-1.014842,0
14,2000-08-26,-0.428897,-0.441187,-0.322222,1.046521,1.05211,-0.80133,0


In [5]:
data.tail()

Unnamed: 0,Date,WHH,WHD,WHA,HWW,AWW,point_diff,watch
6241,2018-05-13,-0.789699,2.326408,1.331001,-0.086577,-0.07987,2.721611,0
6242,2018-05-13,2.482834,0.481345,-0.862472,-0.783869,-0.07987,2.081076,0
6243,2018-05-13,3.748804,1.096366,-0.912659,-0.435223,1.139186,5.604018,0
6244,2018-05-13,-0.422568,0.173834,-0.145091,-0.783869,-0.776473,-0.80133,0
6245,2018-05-13,-0.808689,2.941429,1.331001,-0.2609,-0.428171,1.22703,1


In [6]:
train = data[data['Date'] < '2016-06-13']
validate = data[data['Date'] >= '2016-08-01']
validate = validate[validate['Date'] <= '2017-07-13']
test = data[data['Date'] >= '2017-08-01']

In [7]:
train.tail()

Unnamed: 0,Date,WHH,WHD,WHA,HWW,AWW,point_diff,watch
5507,2016-05-15,1.216864,-0.256681,-0.785715,-0.2609,-0.254021,2.935123,1
5508,2016-05-15,-0.770709,1.465379,1.035783,-0.2609,-0.776473,1.120274,1
5509,2016-05-15,0.583879,-0.625693,-0.676484,-0.609546,0.616733,0.479739,0
5510,2016-05-15,1.849849,0.887259,-0.87428,-0.435223,0.268432,0.800007,0
5511,2016-05-15,-0.397248,0.173834,-0.263178,-0.958191,0.442583,-0.481063,1


In [8]:
validate.head()

Unnamed: 0,Date,WHH,WHD,WHA,HWW,AWW,point_diff,watch
5522,2016-08-27,-0.815018,1.711387,2.216656,-2.178451,-2.169679,-1.014842,0
5523,2016-08-27,-0.125065,-0.502689,-0.410788,-2.178451,1.05211,-1.014842,0
5524,2016-08-27,-0.523845,-0.010672,0.076323,-2.178451,1.05211,-0.694574,0
5525,2016-08-27,2.799327,0.481345,-0.883137,-2.178451,1.05211,-1.014842,0
5526,2016-08-27,-0.536505,-0.010672,0.150127,-2.178451,-2.169679,-0.908086,0


In [9]:
validate.tail()

Unnamed: 0,Date,WHH,WHD,WHA,HWW,AWW,point_diff,watch
5877,2017-05-21,-0.935286,6.631556,6.349714,0.436391,-0.950624,3.575658,0
5878,2017-05-21,-0.049106,-0.379685,-0.499353,-0.958191,-0.07987,1.333786,0
5879,2017-05-21,-0.523845,0.173834,0.002518,-0.783869,0.094281,-0.481063,0
5880,2017-05-21,-0.33395,-0.133676,-0.2927,0.262068,-0.254021,-0.267551,0
5881,2017-05-21,4.698282,2.326408,-0.945133,0.087746,0.442583,2.5081,1


In [10]:
test.head()

Unnamed: 0,Date,WHH,WHD,WHA,HWW,AWW,point_diff,watch
5892,2017-08-26,4.065297,1.711387,-0.933324,-2.178451,-2.169679,-0.908086,0
5893,2017-08-26,-0.397248,-0.441187,-0.086047,-2.178451,1.05211,-1.014842,0
5894,2017-08-26,0.773774,-0.625693,-0.706006,-2.178451,1.05211,-1.014842,0
5895,2017-08-26,-0.789699,1.711387,1.331001,4.271493,1.05211,-1.014842,0
5896,2017-08-26,-0.175703,-0.502689,-0.381266,-2.178451,4.2739,-1.014842,0


## The Classifiers

In [11]:
rfc = RandomForestClassifier(n_estimators=100, random_state=101)
dtc = DecisionTreeClassifier(random_state=101)
lin = LinearSVC(random_state=101, max_iter=1000000)
lin_svc = CalibratedClassifierCV(lin, cv=5)
rbf_svc = SVC(kernel='rbf', gamma='scale', random_state=101, probability=True)
sig_svc = SVC(kernel='sigmoid', gamma='scale', random_state=101, probability=True)
lrc = LogisticRegression(solver='saga', random_state=101)
clfs = [
    ('rfc', rfc),
    ('dtc', dtc),
    ('lin', lin_svc),
    ('rbf', rbf_svc),
    ('sig', sig_svc),
    ('log', lrc)
]

In [12]:
for clf in clfs:
    clf[1].fit(train[['WHH','WHD','WHA','HWW','AWW','point_diff']],train['watch'])

In [13]:
correct = defaultdict(int)
weeks = 0
# Loop over every week in the 2016 season going Weds to Tues
for lower_bound_date in pd.Series(pd.date_range('2016-08-24', freq='7D', periods=46, closed='left')):
    # Get the games in the given week
    games = validate[validate['Date'] >= lower_bound_date]
    games = games[games['Date'] <= lower_bound_date + timedelta(days=7)]
    games = games.reset_index(drop=True)    
    # If there was a game in that week (some weeks are empty)
    if len(games) > 0:
        # Loop over the classifiers
        for clf in clfs:
            # Predict the probabilities for the validation data
            probs = clf[1].predict_proba(games[['WHH','WHD','WHA','HWW','AWW','point_diff']])
            # Sort the probabilities
            probs_list = [[i, list(probs[i])] for i in range(0, len(probs))]
            sorted_probs = sorted(probs_list, key=lambda x: x[1][1], reverse=True)[:3]
            # If the highest probability of being worth watching was then 
            # it was correct and update the dictionary
            if games.iloc[sorted_probs[0][0]]['watch'] == 1:
                correct[clf[0]] += 1
        # Keep a count of how many weeks in the season were predicted for
        weeks += 1
for clf in clfs:
    print(f'The {clf[0]} classifier had an accuracy of {correct[clf[0]] / weeks}')

The rfc classifier had an accuracy of 0.47058823529411764
The dtc classifier had an accuracy of 0.2647058823529412
The lin classifier had an accuracy of 0.29411764705882354
The rbf classifier had an accuracy of 0.3235294117647059
The sig classifier had an accuracy of 0.2647058823529412
The log classifier had an accuracy of 0.29411764705882354


So this classifier gave the highest probability to a game actually worth watching 42.4% of the time. This is a bit better than random guessing and the best test accuracy I have encountered so far.

In 75.8% of the weeks, at least one of the top 3 predicted to be worth watching was actually worth watching. If we were to go with this model it could present 3 games as being likely worth watching a week and then the user could choose to watch the game at the most convenient time, or the game with the teams they most want to watch. Still it is not much better than random guessing the one to watch.

In [14]:
ada_clf = AdaBoostClassifier(base_estimator=rfc,n_estimators=200)
ada_clf.fit(train[['WHH','WHD','WHA','HWW','AWW','point_diff']],train['watch'])

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=RandomForestClassifier(bootstrap=True,
                                                         class_weight=None,
                                                         criterion='gini',
                                                         max_depth=None,
                                                         max_features='auto',
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         n_estimators=100,
                        

In [15]:
correct = 0
weeks = 0
# Loop over every week in the 2016 season going Weds to Tues
for lower_bound_date in pd.Series(pd.date_range('2016-08-24', freq='7D', periods=46, closed='left')):
    # Get the games in the given week
    games = validate[validate['Date'] >= lower_bound_date]
    games = games[games['Date'] <= lower_bound_date + timedelta(days=7)]
    games = games.reset_index(drop=True)    
    # If there was a game in that week (some weeks are empty)
    if len(games) > 0:
        # Predict the probabilities for the validation data
        probs = ada_clf.predict_proba(games[['WHH','WHD','WHA','HWW','AWW','point_diff']])
        # Sort the probabilities
        probs_list = [[i, list(probs[i])] for i in range(0, len(probs))]
        sorted_probs = sorted(probs_list, key=lambda x: x[1][1], reverse=True)[:3]
        # If the highest probability of being worth watching was then 
        # it was correct and update the dictionary
        if games.iloc[sorted_probs[0][0]]['watch'] == 1:
            correct += 1
        # Keep a count of how many weeks in the season were predicted for
        weeks += 1
print(f'Ada had an accuracy of {correct / weeks}')

Ada had an accuracy of 0.4117647058823529
