# Imports

In [1]:
import numpy as np
import pandas as pd
import random
import math

# Loading the Data and Discarding Some

In [2]:
data = pd.read_csv('./clean_data.csv',sep='\t',encoding='utf-8',index_col=0)

We will drop all the unnecessary columns of the DataFrame. The ones used for prediction will be the statistics related to the current form of the team in the league. These are the ones that will be available in real-life for prediction. Of course the predictor column, 'watch' is kept.

In [3]:
data.drop(labels=['Date','HomeTeam','AwayTeam',
                  'FTHG','FTAG','HS','AS','HST',
                  'AST','HTHG','HTAG','HF','AF',
                  'FTR','HTR','HY','AY','HR',
                  'AR','Season','WHH','WHD','WHA'],
         inplace=True,
         axis=1)
# Redorder the columns so we have home statistics, then away statistics
# then betting odds and finally the classifier.
data = data[['HGS','HGA','HYC','HRC','HWW','AGS','AGA','AYC','ARC','AWW','watch']]

In [4]:
data.sample(10)

Unnamed: 0,HGS,HGA,HYC,HRC,HWW,AGS,AGA,AYC,ARC,AWW,watch
483,1.2,1.266667,1.466667,0.133333,0.266667,1.571429,1.357143,1.5,0.142857,0.5,1
1912,1.166667,1.2,1.266667,0.066667,0.366667,1.517241,1.517241,1.586207,0.034483,0.517241,1
2877,1.5,1.035714,1.5,0.035714,0.464286,1.178571,1.178571,1.428571,0.142857,0.464286,1
1025,0.5,2.5,3.5,0.0,0.75,1.0,2.5,1.5,0.0,0.75,0
2056,0.75,1.875,1.625,0.25,0.5,0.875,1.625,1.125,0.0,0.5,0
6108,1.0,1.478261,1.391304,0.0,0.391304,2.0,0.913043,1.130435,0.086957,0.478261,0
5197,0.666667,1.166667,2.5,0.166667,0.333333,1.0,1.5,2.333333,0.0,0.333333,1
4111,1.5,2.0,1.666667,0.333333,0.5,1.0,0.833333,1.666667,0.166667,0.166667,0
2450,1.090909,0.909091,1.909091,0.0,0.363636,0.454545,2.090909,1.454545,0.090909,0.363636,1
5565,0.833333,1.666667,1.666667,0.0,0.5,2.666667,1.5,2.0,0.0,0.666667,1


We are trying to classify a new datapoint based on the previous game statistics in the league and betting odds for the coming game, into worth watching or not watching. The model will be trained on the data from previous seasons. This is a situation where k-Nearest Neighbors is appropriate. The model will take a new data point, and classify it based on the k-nearest datapoints. If the k-nearest have a majority of them as worth watching, them the new one will be classified as also worth watching. This model is going to treat the data as points living in a 13-dimensional space and you can imagine points being colored red if they are worth watching and blue if not.

# Implementing the Predictor by Hand

Rather than using an implementation of k-Nearest Neighbors from a package like scikit-learn I will implement it using just numpy and pandas.

In [None]:
# This function normalizes all the columns of a dataframe
# to have mean 0 and standard deviation 1. This ensures 
# that no column has a dominant impact on the model
def normalize(df):
    return (df - df.mean()) / df.std()

In [None]:
# This function takes the dataframe and a newpoint to classify. It returns
# a Series object with the distance from each of the rows of the dataframe
# to the newpoint. We use Euclidean distance
def find_distances(df, newpoint):
    assert type(newpoint) == pd.Series, 'The new point must be a pd.Series object'
    assert newpoint.shape[0] == (df.shape[1] - 1), f'Your point is the wrong shape, newpoint shape is {newpoint.shape[0]} and df columns  - 1 is {df.shape[1] - 1}'
    distances = []
    for i, row in df.drop('watch', axis=1).iterrows():
        distances.append(np.linalg.norm(row - newpoint))
    return pd.Series(distances)        

In [None]:
# Given a dataframe, a series of distances (found above)
# and a value of k this will return the
# k nearest rows of the dataframe
def nearest(df, distances, k):
    assert type(k) == int, 'k must be an integer'
    assert 1 <= k <= df.shape[0], 'k must be a positive integer'
    assert type(distances) == pd.Series, 'Distances must be a pd.Series object'
    copy = df.copy()
    copy['distances'] = distances
    return copy.sort_values(by='distances',axis=0)[:k].drop('distances', axis=1)

In [None]:
# Given a dataframe with k rows will return the majority
# ruling on 'watch' variable, in case of a tie it will return
# 0 = 'not worth watching'
def worth_watching(nearest):
    average = nearest['watch'].sum() / nearest['watch'].count()
    return int(average > 0.5)

In [None]:
# Now we can put these inside a general predcition funciton
def predict(dataframe, newpoint, k=3):
    df = normalize(dataframe)
    dists = find_distances(df, newpoint)
    near = nearest(df, dists, k)
    return worth_watching(near)

In [None]:
# Testing
newpoint = pd.Series({'HGS':2.7,'HGA':1.8,'HYC':3,
                      'HRC':0.1,'HWW':0.6,'AGS':2,
                      'AGA':2.3,'AYC':1.3,'ARC':0.15,
                      'AWW':0.7})
for k in range(1, 5):
    print(f'k={k}, watch={predict(data, newpoint, k)}')

k=1, watch=1
k=2, watch=1
k=3, watch=1
k=4, watch=1


# Model Selection and Testing

To determine the best choice of k-value for the model we will be splitting the data into training and testing subsets. I will assign 80% to training and 20% to testing. These will be randomly assigned. I will run 10-fold cross validation which means that the training subset will be split into 10 same size non-intersecting subsets. We will fit the model for values of k on 90% of the training data then validate it on the remaining 10%. Once we have done this 10 times we will have an accuracy for that value of k. Once we have done this for all values of k the one with the highest accuracy from 10-fold cross-validation will be selected as the optimal value.

Once we have tested all the values of k we will assess the final accuracy of the model by fitting on all the training data and testing on the test data. This process eliminates bias in the final accuracy value.

## Subsetting

You will notice below that the training fraction is only 10% of the data! This is because the prediction takes a very long time to run. This is due to the fact that when we are classifying a new data point we must calculate distance from this new point to all of the training points. In this kind of implementation there are no ways around it. I am going to use 10% for my own implementation and then we will use a package below for the full 80% training. It will be interesting to see the difference

In [None]:
random.seed(2718) # Euler
training_frac = 0.1 # Train on over 600 games
testing_frac = 0.05 # Test on over 300 games
indices_train = random.sample([i for i in range(len(data))], 
                              math.floor(training_frac * len(data)))
indices_test = random.sample([i for i in range(len(data)) if i not in indices_train], 
                             math.floor(testing_frac * len(data)))
data_train = data.iloc[indices_train].reset_index()
data_test = data.iloc[indices_test].reset_index()

## Finding the optimal value of k

In [None]:
accuracy = []
for k in range(1, 10):
    correct = 0
    for i in range(10):
        validate_indices_low = math.floor(i / 10 * len(data_train))
        validate_indices_high = math.floor((i + 1) / 10 * len(data_train))
        data_validate = data_train.iloc[range(validate_indices_low, 
                                              validate_indices_high)]
        data_fit = data_train.iloc[[i for i in range(len(data_train)) 
                                    if i not in range(validate_indices_low, 
                                                      validate_indices_high)]]
        watch_pred = pd.DataFrame(data_validate['watch'])
        watch_pred['pred'] = data_validate.apply(lambda row: predict(data_fit, row.drop('watch'), k), axis=1)
        correct += len(watch_pred[watch_pred['watch'] == watch_pred['pred']])
        print(f'{i} fold done for k={k}')
    accuracy.append(correct / len(data_train))
    print(f'The validation accuracy for k={k} is {round(accuracy[k - 1] * 100, 3)}%')

0 fold done for k=1
1 fold done for k=1
2 fold done for k=1
3 fold done for k=1
4 fold done for k=1
5 fold done for k=1
6 fold done for k=1
7 fold done for k=1
8 fold done for k=1
9 fold done for k=1
The validation accuracy for k=1 is 52.244%
0 fold done for k=2
1 fold done for k=2
2 fold done for k=2
3 fold done for k=2
4 fold done for k=2
5 fold done for k=2
6 fold done for k=2
7 fold done for k=2
8 fold done for k=2
9 fold done for k=2
The validation accuracy for k=2 is 54.167%
0 fold done for k=3
1 fold done for k=3
2 fold done for k=3
3 fold done for k=3
4 fold done for k=3
5 fold done for k=3
6 fold done for k=3
7 fold done for k=3
8 fold done for k=3
9 fold done for k=3
The validation accuracy for k=3 is 55.609%
0 fold done for k=4
1 fold done for k=4
2 fold done for k=4
3 fold done for k=4
4 fold done for k=4
5 fold done for k=4
6 fold done for k=4
7 fold done for k=4
8 fold done for k=4
9 fold done for k=4
The validation accuracy for k=4 is 54.327%
0 fold done for k=5
1 fold d

## Model Testing

As can be seen from the output above, the optimal value of k on the training/validation data was $k = 9$. We will finally test the model on the reserved testing data to get a true final accuracy.

In [None]:
final_k = 9
watch_pred = pd.DataFrame(data_test['watch'])
watch_pred['pred'] = data_test.apply(lambda row: predict(data_train, row.drop('watch'), final_k), axis=1)
correct = len(watch_pred[watch_pred['watch'] == watch_pred['pred']])
accuracy = correct / len(data_test)
print(f'The accuracy on the testing data for k={final_k} is {round(accuracy * 100, 3)}')

So we end with a thoroughly dissapointing final accuracy of 50.962% meaning we should probably just guess instead!

# Predicting Using Scikit-Learn

In [None]:
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier

In [None]:
random.seed(2718) # Euler
training_frac = 0.8
testing_frac = 0.2
indices_train = random.sample([i for i in range(len(data))], 
                              math.floor(training_frac * len(data)))
indices_test = random.sample([i for i in range(len(data)) if i not in indices_train], 
                             math.floor(testing_frac * len(data)))
data_train = data.iloc[indices_train].reset_index()
data_test = data.iloc[indices_test].reset_index()

In [None]:
accuracy = []
for k in range(1, 30):
    # By passing distance we give closer points a higher weight
    # I have set the algorithm to brute meaning that the distance to 
    # all points will be checked. As I have above
    model = KNeighborsClassifier(k, weights='distance',algorithm='brute')
    correct = 0
    for i in range(10):
        validate_indices_low = math.floor(i / 10 * len(data_train))
        validate_indices_high = math.floor((i + 1) / 10 * len(data_train))
        data_validate = data_train.iloc[range(validate_indices_low, 
                                              validate_indices_high)]
        data_fit = data_train.iloc[[i for i in range(len(data_train)) 
                                    if i not in range(validate_indices_low, 
                                                      validate_indices_high)]]
        watch_pred = pd.DataFrame(data_validate['watch'])
        model.fit(data_fit.drop('watch', axis=1), data_fit['watch'])
        watch_pred['pred'] = model.predict(data_validate.drop('watch', axis=1))
        correct += len(watch_pred[watch_pred['watch'] == watch_pred['pred']])
    accuracy.append(correct / len(data_train))
    print(f'The validation accuracy for k={k} is {round(accuracy[k - 1] * 100, 3)}%')

In [None]:
final_k = 2
# Same parameters as above
model = KNeighborsClassifier(final_k, weights='distance', algorithm='brute')
watch_pred = pd.DataFrame(data_test['watch'])
# This time we fit on all the training data
model.fit(data_train.drop('watch', axis=1), data_train['watch'])
# Predict on the remaining, unseen 20% of testing data
watch_pred['pred'] = model.predict(data_test.drop('watch', axis=1))
correct = len(watch_pred[watch_pred['watch'] == watch_pred['pred']])
accuracy = correct / len(data_test)
print(f'The accuracy on the testing data for k={final_k} is {round(accuracy * 100, 3)}')

The results above show that the k-Nearest Neighbors classifier is not performing well even for large values of k. This indicates that the predictor variables are not doing a good job of predicting! It is back to the drawing board for the model. Perhaps different variables will work better?