<a href="https://colab.research.google.com/github/speacock103/MachineLearningFall2020/blob/master/footballprediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Imports

In [12]:
import io
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import preprocessing

Retreive football game data set

In [13]:
from google.colab import files
uploaded = files.upload()

football_train = pd.read_csv(io.BytesIO(uploaded['train.csv']))
football_test = pd.read_csv(io.BytesIO(uploaded['test.csv']))

Saving test.csv to test (1).csv
Saving train.csv to train (1).csv


Preprocess (number encode) data

In [70]:
#converts string and other formatted data to numeric values, allowing for easier digestion into models
encoder = preprocessing.LabelEncoder()
football_train = football_train.apply(encoder.fit_transform)
football_test = football_test.apply(encoder.fit_transform)

Parse training/testing data into input and designated outputs

In [15]:
football_train_x = football_train.drop("Label", axis=1)
football_train_y = football_train["Label"]

In [16]:
football_test_x = football_test.drop("Label", axis=1)
football_test_y = football_test["Label"]

# Naive bayes prediction

fitting of training data to model

In [17]:
#instantiates NB model from sklearn library. conceptually this type of model looks at the probability of an outcome
#based on each individual feature. the testing data is then compared to the trained model, relating each feature to prior instances of each feature and
#computing the probable outcome
footballnb = GaussianNB()
footballnb.fit(football_train_x, football_train_y)

GaussianNB(priors=None, var_smoothing=1e-09)

predict testing data from said model

In [18]:
footballpred = footballnb.predict(football_test_x)

View prediction values (1 = win, 0 = loss)

In [19]:
footballpred

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0])

In [71]:
football_test_y

0     1
1     0
2     1
3     1
4     1
5     1
6     1
7     1
8     1
9     0
10    1
11    0
Name: Label, dtype: int64

Accuracy of prediction (relation of correct predictions to total predictions made)

In [20]:
acc = metrics.accuracy_score(footballpred, football_test_y)
acc

0.9166666666666666

Precision (ratio of true prediction values to total amount of that outcome predicted)

In [None]:
#there was a single false positive (prediction value of loss when actual value is win). from total of 10 predicted wins, precision of 90%

Recall (ratio of correctly predicted values to wrongly predicted values of that outcome)

In [None]:
#there were no false negatives. recall is 100%

F1 (average of precision and recall)

In [None]:
#2*(Recall * Precision) / (Recall + Precision), recall is 94.7%

# K-Nearest Neighbors Prediction

In [68]:
#implementation of KNN algorithm. Credit to both mavaladezt and tugot17 on github, their code has been translated into what is seen below
#relates data instances by looking at the positions of each individual data instance in a "feature space". This is determined by equating
#a position value based on the values of individual features. Data instances close to one another determine the grouping predictions for outcomes

def knn_distance(trainx,testx,k):
    distances = -2 * trainx@testx.T + np.sum(testx**2,axis=1) + np.sum(trainx**2,axis=1)[:, np.newaxis]
    distances[distances < 0] = 0
    distances = distances**.5
    indices = np.argsort(distances, 0)
    distances = np.sort(distances,0)
    return indices[0:k,:], distances[0:k,:]


def knn_prediction(trainx,trainy,testx,k):
    indices, distances = knn_distance(trainx,testx,k)
    trainy = trainy.flatten()
    rows, columns = indices.shape
    predictions = list()
    for j in range(columns):
        temp = list()
        for i in range(rows):
            cell = indices[i][j]
            temp.append(trainy[cell])
        predictions.append(max(temp,key=temp.count))
    predictions=np.array(predictions)
    return predictions

def knn_accuracy(testy,predictions):
    x=testy.flatten()==predictions.flatten()
    grade=np.mean(x)
    return np.round(grade*100,2)

from tqdm import tqdm_notebook

football_train_xarr = football_train_x.values

football_train_yarr = football_train_y.values

football_test_xarr = football_test_x.values

football_test_yarr = football_test_y.values

accuracies = []
k_values = [i for i in range(1,24)]
for k in tqdm_notebook(k_values):
    prediction = knn_prediction(football_train_xarr, football_train_yarr, football_test_xarr, k)
    print (prediction)
    accuracy = knn_accuracy(football_test_yarr, prediction)
    print (accuracy)
    accuracies.append(accuracy)

bestk = k_values[accuracies.index(max(accuracies))]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=23.0), HTML(value='')))

[1 0 0 0 1 1 1 1 1 0 1 0]
83.33
[1 0 0 0 1 1 1 1 1 0 1 0]
83.33
[1 1 1 1 1 1 1 1 1 1 1 1]
75.0
[1 1 1 1 1 1 1 1 1 1 1 1]
75.0
[1 1 1 1 1 1 1 1 1 1 1 1]
75.0
[1 1 1 1 1 1 1 1 1 1 1 0]
83.33
[1 1 1 1 1 1 1 1 1 1 1 1]
75.0
[1 1 1 1 1 1 1 1 1 1 1 1]
75.0
[1 1 1 1 1 1 1 1 1 1 1 1]
75.0
[1 1 1 1 1 1 1 1 1 1 1 1]
75.0
[1 1 1 1 1 1 1 1 1 1 1 1]
75.0
[1 1 1 1 1 1 1 1 1 1 1 1]
75.0
[1 1 1 1 1 1 1 1 1 1 1 1]
75.0
[1 1 1 1 1 1 1 1 1 1 1 1]
75.0
[1 1 1 1 1 1 1 1 1 1 1 1]
75.0
[1 1 1 1 1 1 1 1 1 1 1 1]
75.0
[1 1 1 1 1 1 1 1 1 1 1 1]
75.0
[1 1 1 1 1 1 1 1 1 1 1 1]
75.0
[1 1 1 1 1 1 1 1 1 1 1 1]
75.0
[1 1 1 1 1 1 1 1 1 1 1 1]
75.0
[1 1 1 1 1 1 1 1 1 1 1 1]
75.0
[1 1 1 1 1 1 1 1 1 1 1 1]
75.0
[1 1 1 1 1 1 1 1 1 1 1 1]
75.0



In [66]:
bestk

1

Accuracy

In [69]:
print (max(accuracies))

83.33


Precision

In [None]:
#no false postives. Thus precision is 100%

Recall

In [None]:
#2 false negatives. 7/9, recall = 77%

F1

In [None]:
#averaging the two, f1 = 87%