In [None]:
import pandas as pd
import numpy as np
from proj1_helpers import *
from functions import *
%matplotlib inline
import matplotlib.pyplot as plt
%load_ext autoreload

In [None]:
def standardize(x):
    """
    Standardize a data set feature by feature (zero mean, unit variance).

    Statistics are computed along axis 0, so for a 2-D input each column is
    centred and scaled independently. The input array is not modified.

    Constant columns have a standard deviation of 0; they are divided by 1
    instead, so they come out as all zeros rather than NaN.

    :param: x: input data which needs to be standardized
    :return: tuple (standardized x, per-feature mean, per-feature std), where
             std is the value measured on the data (0 for constant columns)
    """
    x = np.asarray(x)
    mean_x = np.mean(x, axis=0)
    centered = x - mean_x
    std_x = np.std(centered, axis=0)
    # Avoid 0/0 -> NaN for features that never vary.
    safe_std = np.where(std_x == 0, 1.0, std_x)
    return centered / safe_std, mean_x, std_x


def build_model_data(x, y):
    """
    Build the regression design matrix by prepending a column of ones.

    :param: x: feature data, one row per sample
    :param: y: labels for the dataset; its length sets the number of rows
               of the ones column
    :return: tuple (tx, y) where tx is x with a leading all-ones column and
             y is returned unchanged
    """
    bias_column = np.ones(len(y))
    design_matrix = np.column_stack((bias_column, x))
    return design_matrix, y

In [None]:
# Preview the first rows of the raw training CSV to inspect its columns.
pd.read_csv("train.csv").head()

In [None]:
# Load the training set. load_csv_data comes from one of the star imports
# (presumably proj1_helpers -- verify); its three results are unpacked here
# as labels (yb), feature data (input_data) and sample ids (ids).
yb, input_data, ids = load_csv_data("train.csv")

In [None]:
# Standardize every feature; the returned mean and std are discarded.
# NOTE(review): the statistics are computed on the full data set, before the
# train/test split further down, so the held-out part influences the scaling.
input_data, _, _ = standardize(input_data) # column-wise standardization

In [None]:
# Prepend the all-ones bias column to the standardized features.
# NOTE(review): this overwrites input_data, so re-running this cell on its
# own prepends another ones column -- restart and run all cells instead.
input_data, _ = build_model_data(input_data, yb)
print(input_data)

In [None]:
# Split the data set into a training part and a held-out test part.
# ratio is forwarded to split_data (defined outside this notebook);
# presumably it is the fraction kept for training -- verify.
ratio = 0.8
x_train, x_test, y_train, y_test, ids_train, ids_test = split_data(input_data, yb, ids, ratio)

In [None]:
# Sanity check: shape of the training split after the bias column was added.
x_train.shape

In [None]:
# Fit least squares on the training split, then print the loss returned by
# the fit next to the loss of the same weights on the test split.
w, loss = least_squares(y_train, x_train)
print(loss, compute_loss(y_test, x_test, w))

In [None]:
# Fit ridge regression on the training split and compare with the test loss.
# 0.1 is presumably the regularization strength (lambda) -- confirm against
# the signature of ridge_regression.
w, loss = ridge_regression(y_train, x_train, 0.1)
print(loss, compute_loss(y_test, x_test, w))

In [None]:
# Gradient descent on least squares, starting from all-zero weights.
# One weight per column of the design matrix (bias column included), taken
# from the data instead of a hard-coded 31 so it survives feature changes.
w_initial = np.zeros(x_train.shape[1])
max_iters = 100
gamma = 0.05  # presumably the step size -- confirm against least_squares_GD
w, loss = least_squares_GD(y_train, x_train, w_initial, max_iters, gamma)
print(loss, compute_loss(y_test,x_test,w))

In [None]:
# Stochastic gradient descent on least squares, starting from all-zero weights.
# One weight per column of the design matrix (bias column included), taken
# from the data instead of a hard-coded 31 so it survives feature changes.
w_initial = np.zeros(x_train.shape[1])
max_iters = 1000
gamma = 0.005  # presumably the step size -- confirm against least_squares_SGD

# loss_function selects the cost function; 'rmse' is passed to both the
# optimizer and the evaluation so the two printed losses are comparable.
w, loss = least_squares_SGD(y_train, x_train, w_initial, max_iters, gamma, loss_function='rmse')
print('Training loss: {}'.format(loss))
print('Testing loss: {}'.format(compute_loss(y_test, x_test, w, loss_function='rmse')))

In [None]:
def calculate_predicted_labels(x, w):
    """
    Turn linear-model scores into hard -1 / +1 class labels.

    :param: x: design matrix, one row per sample
    :param: w: weight vector of the linear model
    :return: array of x.dot(w) with every score <= 0 replaced by -1 and
             every score > 0 replaced by 1
    """
    labels = x.dot(w)
    # Build both masks from the raw scores before overwriting anything.
    non_positive = labels <= 0
    positive = labels > 0
    labels[non_positive] = -1
    labels[positive] = 1
    return labels

# Hard -1/+1 predictions for both splits. w is whatever the most recently
# executed fitting cell produced (the SGD cell when run top to bottom).
training_predict_labels = calculate_predicted_labels(x_train, w)
testing_predict_labels = calculate_predicted_labels(x_test, w)

In [None]:
def print_accuracy(predict_labels, x, y, train=True):
    """
    Print the number of correct predictions and the accuracy in percent.

    :param: predict_labels: predicted labels, compared element-wise with y
    :param: x: data the predictions were made on; only x.shape[0] is used,
               as the number of samples
    :param: y: true labels
    :param: train: True to label the output as training results, False to
                   label it as testing results
    """
    split_name = 'Training' if train else 'Testing'
    total_correct_labels = np.sum(predict_labels == y)
    # The count line used to say "training" for the test split as well.
    print('Total correct labels in {}: {}'.format(split_name.lower(), total_correct_labels))
    print('{} accuracy: {}'.format(split_name, (total_correct_labels / x.shape[0]) * 100))

# Report accuracy on the training split, then on the held-out test split.
print_accuracy(training_predict_labels, x_train, y_train)
print_accuracy(testing_predict_labels, x_test, y_test, train=False)