In [1]:
#Name: Shashank Shekhar [sshekha4]
## Discussed with: Rahul Aettapu [raettap]
# Basketball Dataset Taken: http://college.cengage.com/mathematics/brase/understandable_statistics/7e/students/datasets/mlr/frames/frame.html

In [2]:
import pandas as pd
import numpy as np
from random import seed
from math import sqrt
from sklearn.model_selection import KFold
from sklearn.linear_model import SGDRegressor
import matplotlib.pyplot as plt
import random
import csv
## Seed value for split
seed(123)

In [3]:
# Loading CSV file
def load_csv(filename):
    with open(filename, 'r') as f:
        reader = csv.reader(f, delimiter=',')
        # get header from first row
        headers = next(reader)
        data = list(reader)
        data = np.array(data).astype(float)
    return data

In [4]:
data = load_csv('./Basketball.csv')

In [5]:
## Normalizing the data (all the columns)
for ind in range(len(data[0])):
    column = [row[ind] for row in data]
    minval = min(column)
    maxval = max(column)
    for i in range(len(column)):
        data[i][ind] = (column[i] - minval)/(maxval - minval)
print(data)

[[0.57894737 0.75949367 0.49025974 0.65243902 0.2601626 ]
 [0.31578947 0.47468354 0.46753247 0.8429878  0.36178862]
 [0.36842105 0.53797468 0.53571429 0.78810976 0.52845528]
 [0.26315789 0.47468354 0.40584416 0.62042683 0.23577236]
 [0.63157895 0.63291139 0.51298701 1.         0.82926829]
 [0.36842105 0.75949367 0.45454545 0.81707317 1.        ]
 [0.31578947 0.50632911 0.63636364 0.80335366 0.26422764]
 [0.57894737 0.82278481 0.57792208 0.77134146 0.53658537]
 [0.63157895 0.82278481 0.46753247 0.875      0.07723577]
 [0.52631579 0.66455696 0.61363636 0.88567073 0.39430894]
 [0.63157895 0.88607595 0.73051948 0.59146341 0.70325203]
 [0.63157895 0.88607595 0.65584416 0.7820122  0.25609756]
 [0.31578947 0.50632911 0.26948052 0.70884146 0.21544715]
 [0.21052632 0.50632911 0.43181818 0.82012195 0.23577236]
 [0.26315789 0.47468354 0.48701299 0.80945122 0.71138211]
 [0.57894737 0.7278481  0.68831169 0.9695122  0.90243902]
 [0.42105263 0.56329114 0.68831169 0.89786585 0.66666667]
 [1.         0

In [6]:
def find_yhat(coef, row):
    yhat = coef[0]
    for i in range(len(row)-1):
        yhat += coef[i+1]*row[i]
    return yhat

In [7]:
def make_test_predictions(train, test, l_rate, t_epochs, lambdaval):
    predictions = []
    coef = [0.0 for i in range(len(train[0]))]
    for epoch in range(t_epochs):
        sum_error = 0.0
        for row in train:
            yhat = find_yhat(coef, row)
            error = row[-1] - yhat
            sum_error += error**2
            coef[0] = coef[0] + (2*l_rate*error) - (2*l_rate*lambdaval*coef[0])
            for i in range(len(row)-1):
                coef[i+1] = coef[i+1] + (2*l_rate*error*row[i]) - (2*l_rate*lambdaval*coef[i+1])
#         print('#Epoch={0}, lrate={1}, error={2}'.format(epoch, l_rate, sum_error))
#     print('')
#     print('')
    for row in test:
        yhat = find_yhat(coef, row)
        predictions.append(yhat)
    return predictions

In [8]:
### Experimental Section Starts
# def make_test_predictions(train, test, l_rate, t_epochs, lambdaval):
#     predictions = []
#     coef = [0.0 for i in range(len(train[0]))]
#     for epoch in range(t_epochs):
#         sum_error = 0.0
#         coef0_gradient = 0.0
#         coef_gradients = [0.0 for i in range(len(train[0]))]
#         for row in train:
#             yhat = find_yhat(coef, row)
#             error = row[-1] - yhat
#             sum_error += error**2
#             coef0_gradient += ((-2.0/len(train))*(error-(lambdaval*coef[0])))
#             for i in range(len(row)-1):
#                 coef_gradients[i+1] += ((-2.0/len(train))*((row[i]*error)-(lambdaval*coef[i+1])))
#         coef[0] -= (l_rate*coef0_gradient)
#         for i in range(len(row)-1):
#             coef[i+1] -= (l_rate*coef_gradients[i+1])
#         print('#Epoch={0}, lrate={1}, error={2}'.format(epoch, l_rate, sum_error))
#     for row in test:
#         yhat = find_yhat(coef, row)
#         predictions.append(yhat)
#     return predictions
### Experimental Section Ends

In [9]:
def find_rmse(preds, trvals):
    sum_error = 0.0
    for i in range(len(preds)):
        sum_error += (preds[i] - trvals[i])**2
    mean_error = sum_error/float(len(trvals))
    return sqrt(mean_error)

In [10]:
## Hyperparameters - total_folds, learning_rate, total_epochs
total_folds = 5
learning_rate = 0.001
total_epochs = 15
lambdaval = 0.001
# Performing Split
kf = KFold(n_splits=total_folds, shuffle=True)
rmse_across_folds = []
test_rmse_across_folds = []

In [11]:
def performSGD(lambdaval, total_epochs):
    for train_index, test_index in kf.split(data):
        train_data = []
        for i in range(len(train_index)):
            train_data.append(data[i])
        test_data = []
        for i in range(len(test_index)):
            test_data.append(data[i])
        predictions = make_test_predictions(train_data, test_data, learning_rate, total_epochs, lambdaval)
        truevalues = [row[-1] for row in test_data]
        rmse = find_rmse(predictions, truevalues)
        rmse_across_folds.append(rmse)
    return sum(rmse_across_folds)/float(len(rmse_across_folds))

In [12]:
def testSGD(lambdaval, total_epochs):
    ## Since the seed is fixed, the same split happens again
    for train_index, test_index in kf.split(data):
        train_data = []
        for i in range(len(train_index)):
            train_data.append(data[i])
        test_data = []
        for i in range(len(test_index)):
            test_data.append(data[i])
        ## Testing Part
        clf = SGDRegressor(alpha=lambdaval, max_iter=total_epochs, loss='squared_loss', penalty='l2')
        clf.fit([row[:-1] for row in train_data], [row[-1] for row in train_data])
        test_predictions = clf.predict([row[:-1] for row in test_data])
        truevalues = [row[-1] for row in test_data]
        test_rmse = find_rmse(test_predictions, truevalues)
        test_rmse_across_folds.append(test_rmse)
    return sum(test_rmse_across_folds)/float(len(test_rmse_across_folds))

In [13]:
## If looking to print the errors per epoch, uncomment the print statement in make_test_predictions function
if(abs(performSGD(lambdaval, total_epochs) - testSGD(lambdaval, total_epochs)) <= 1):
    print('Output and Expected average RMSE values match')
else:
    print('Output and Expected average RMSE values don\'t match')

#Epoch=0, lrate=0.001, error=8.832887170335121
#Epoch=1, lrate=0.001, error=8.769851858103879
#Epoch=2, lrate=0.001, error=8.707447087726193
#Epoch=3, lrate=0.001, error=8.645666543940703
#Epoch=4, lrate=0.001, error=8.584503974741754
#Epoch=5, lrate=0.001, error=8.523953190745788
#Epoch=6, lrate=0.001, error=8.464008064564117
#Epoch=7, lrate=0.001, error=8.404662530181891
#Epoch=8, lrate=0.001, error=8.345910582343393
#Epoch=9, lrate=0.001, error=8.287746275943416
#Epoch=10, lrate=0.001, error=8.230163725424761
#Epoch=11, lrate=0.001, error=8.173157104181781
#Epoch=12, lrate=0.001, error=8.116720643969883
#Epoch=13, lrate=0.001, error=8.06084863432094
#Epoch=14, lrate=0.001, error=8.00553542196459
#Epoch=0, lrate=0.001, error=8.832887170335121
#Epoch=1, lrate=0.001, error=8.769851858103879
#Epoch=2, lrate=0.001, error=8.707447087726193
#Epoch=3, lrate=0.001, error=8.645666543940703
#Epoch=4, lrate=0.001, error=8.584503974741754
#Epoch=5, lrate=0.001, error=8.523953190745788
#Epoch=6, 

In [14]:
print('RMSE errors across folds: ', rmse_across_folds)
print('Mean RMSE: {0}'.format(sum(rmse_across_folds)/float(len(rmse_across_folds))))

## Below is the Test RMSE using SGD Regressor
print('Test RMSE errors across folds (using SGDRegressor): ', test_rmse_across_folds)
print('Test Mean RMSE (using SGDRegressor): {0}'.format(sum(test_rmse_across_folds)/float(len(test_rmse_across_folds))))

RMSE errors across folds:  [0.5176399978475063, 0.5176399978475063, 0.5176399978475063, 0.5176399978475063, 0.4996685830333484]
Mean RMSE: 0.5140457148846747
Test RMSE errors across folds (using SGDRegressor):  [0.27485135153926765, 0.2752440743751121, 0.274630817872851, 0.27502871412906826, 0.27357122776068715]
Test Mean RMSE (using SGDRegressor): 0.27466523713539726
