In [1]:
import pandas as pd
import numpy as np
from proj1_helpers import *
from functions import *
%matplotlib inline
import matplotlib.pyplot as plt
%load_ext autoreload

In [119]:
def standardize(x):
    """Standardize the data set to zero mean and unit variance along axis 0.

    Parameters
    ----------
    x : array-like
        Data to standardize. Statistics are computed along axis 0.

    Returns
    -------
    x : np.ndarray
        Centered and scaled copy of the input (the argument is not modified).
    mean_x : np.ndarray or float
        Mean of the input along axis 0.
    std_x : np.ndarray or float
        Standard deviation of the input along axis 0. It is zero for a
        constant feature; the raw value is returned unchanged.
    """
    mean_x = np.mean(x, axis=0)
    x = x - mean_x
    std_x = np.std(x, axis=0)
    # A constant feature has zero spread, and dividing by it would turn the
    # whole column into NaN (0/0). Such a column is already all zeros after
    # centering, so scale it by 1 and leave it at zero.
    safe_std = np.where(std_x == 0, 1.0, std_x)
    x = x / safe_std
    return x, mean_x, std_x

In [5]:
def build_model_data(x, y):
    """Prepend a column of ones to x and return (tx, y).

    The number of rows of the bias column is taken from len(y). y itself is
    returned untouched as the second element of the tuple.
    """
    bias_column = np.ones(len(y))
    design_matrix = np.c_[bias_column, x]
    return design_matrix, y

In [6]:
def build_poly(x, degree):
    """Build the polynomial basis expansion of x for powers 0 up to degree.

    The result starts with a single column of ones (power 0), followed by the
    element-wise powers x**1, x**2, ..., x**degree stacked column-wise. A 2-D
    input with d columns therefore yields 1 + d * degree columns.

    Parameters
    ----------
    x : array-like
        Input data; a 1-D vector or a 2-D matrix with one sample per row.
    degree : int
        Highest power to include. For degree < 1 only the column of ones is
        returned.

    Returns
    -------
    np.ndarray
        The expanded feature matrix with len(x) rows.
    """
    x = np.asarray(x)
    # Collect every block first and concatenate once: growing the matrix with
    # np.c_ inside the loop re-copies all earlier columns on each iteration.
    blocks = [np.ones((len(x), 1))]
    blocks.extend(np.power(x, deg) for deg in range(1, degree + 1))
    return np.c_[tuple(blocks)]

In [101]:
def compute_loss(y, tx, w):
    """Return half the mean squared error of the tanh-squashed linear model.

    The prediction is tanh(tx . w) and the loss is
    0.5 * mean((y - prediction) ** 2).

    y: target values
    tx: feature matrix
    w: weight vector

    NOTE(review): because of the tanh this is not the plain least-squares
    objective that least_squares() minimizes -- confirm the squashing is
    intended.
    """
    prediction = np.tanh(tx.dot(w))
    residual = y - prediction
    return 0.5 * np.mean(np.square(residual))

In [102]:
def least_squares(y, tx):
    """Calculate the least squares solution via the normal equations.

    Solves (tx^T tx) w = tx^T y for w. If the normal equations are exactly
    singular (for example because of duplicated or collinear feature
    columns), falls back to np.linalg.lstsq, which returns the minimum-norm
    least squares solution instead of raising.

    Parameters
    ----------
    y : np.ndarray
        Target values.
    tx : np.ndarray
        Feature matrix with one sample per row.

    Returns
    -------
    w : np.ndarray
        The fitted weights.
    loss : float
        compute_loss(y, tx, w) evaluated at the fitted weights.
    """
    gram = tx.T.dot(tx)
    moment = tx.T.dot(y)
    try:
        w = np.linalg.solve(gram, moment)
    except np.linalg.LinAlgError:
        # solve() refuses singular systems; lstsq works on tx directly and
        # still yields a valid minimizer of the squared error.
        w = np.linalg.lstsq(tx, y, rcond=None)[0]
    loss = compute_loss(y, tx, w)
    return w, loss

In [133]:

def calculate_predicted_labels(x, w):
    """Turn the linear scores x . w into class labels -1 / 1.

    Scores less than or equal to zero become -1, strictly positive scores
    become 1. The labels are written into the score array, so the result
    keeps the dtype of x.dot(w).
    """
    scores = x.dot(w)
    # Both masks are taken from the raw scores before anything is overwritten.
    non_positive = scores <= 0
    positive = scores > 0
    scores[non_positive] = -1
    scores[positive] = 1
    return scores

In [132]:
def print_accuracy(predict_labels, x, y, train=True):
    """Print the number of correct predictions and the accuracy in percent.

    Parameters
    ----------
    predict_labels : np.ndarray
        Predicted labels, compared element-wise against y.
    x : np.ndarray
        Data the predictions were made on; only x.shape[0] is used, as the
        denominator of the accuracy.
    y : np.ndarray
        True labels.
    train : bool
        Selects the wording of both printed lines: "training" when True,
        "testing" when False.
    """
    # Derive the label once so both lines agree with the `train` flag; the
    # first line used to say "training" even for the test split.
    phase = 'training' if train else 'testing'
    total_correct_labels = np.sum(predict_labels == y)
    print('Total correct labels in {}: {}'.format(phase, total_correct_labels))
    print('{} accuracy: {}'.format(phase.capitalize(), (total_correct_labels / x.shape[0]) * 100))



# From this point on, we load the data and run the tests



In [120]:
# Load the training dataset.
# NOTE(review): load_csv_data is not defined in this notebook; presumably it
# comes from the star import of proj1_helpers -- confirm.
yb, input_data, ids = load_csv_data("train.csv")
# Standardize the features; the returned mean and std are discarded.
# input_data is rebound to the standardized copy, so re-running this cell
# alone would standardize already-standardized data.
input_data, _, _ = standardize(input_data)
# Disabled: prepending a bias column with build_model_data.
# input_data, _ = build_model_data(input_data, yb)

In [121]:
#to split train dataset to 2 parts for test and train
ratio = 0.7
x_train, x_test, y_train, y_test, ids_train, ids_test = split_data(input_data, yb, ids, ratio)

In [122]:
# Expand both splits with the polynomial basis up to degree 7.
tx_test = build_poly(x_test, 7)
tx_train = build_poly(x_train, 7)

# Show the shape of the expanded training matrix.
tx_train.shape

(175000, 211)

In [135]:
# Test least_squares(): fit on the polynomial training features, then print
# the training loss next to the loss of the same weights on the test split.
w, loss = least_squares(y_train, tx_train)
print(loss, compute_loss(y_test, tx_test, w))

0.2997644956487409 0.3000954280448022


In [137]:
# Turn the fitted weights into -1/1 label predictions for both splits and
# print the accuracy on each.
training_predict_labels = calculate_predicted_labels(tx_train, w)
testing_predict_labels = calculate_predicted_labels(tx_test, w)
print_accuracy(training_predict_labels, tx_train, y_train)
print_accuracy(testing_predict_labels, tx_test, y_test, train=False)

Total correct labels in training: 138357
Training accuracy: 79.06114285714285
Total correct labels in training: 59380
Testing accuracy: 79.17333333333333


In [129]:
# Test ridge_regression() on the raw (non-polynomial) features with a penalty
# of 0.001, then print the training loss next to the test loss.
# NOTE(review): ridge_regression is defined outside this notebook (presumably
# in functions.py) -- confirm. Note that w is rebound here.
w, loss = ridge_regression(y_train, x_train, 0.001)
print(loss, compute_loss(y_test,x_test,w))

0.3854638707246176 0.38596765840270747


In [114]:
# Test gradient descent on the raw features, starting from all-zero weights.
# NOTE(review): least_squares_GD is defined outside this notebook (presumably
# in functions.py) -- confirm.
w_initial = np.zeros(x_train.shape[1])
max_iters = 1009
gamma = 0.05
w, loss = least_squares_GD(y_train, x_train, w_initial, max_iters, gamma)
# Print the training loss next to the loss on the test split.
print(loss, compute_loss(y_test,x_test,w))

0.3891811177314048 0.38969699048762146


In [118]:
# Test stochastic gradient descent on the raw features, starting from
# all-zero weights. The recorded run of this cell ended in a
# KeyboardInterrupt, so no losses were printed.
# NOTE(review): least_squares_SGD is defined outside this notebook (presumably
# in functions.py) -- confirm.
w_initial = np.zeros(x_train.shape[1])
max_iters = 1000
gamma = 0.05
w, loss = least_squares_SGD(y_train, x_train, w_initial, max_iters, gamma)
# Print the training loss next to the loss on the test split.
print(loss, compute_loss(y_test,x_test,w))

KeyboardInterrupt: 

In [48]:
#test cross validation
from plots import cross_validation_visualization

def cross_validation_demo():
    """Sweep the ridge penalty with k-fold cross-validation and plot the RMSE.

    Works on the notebook-level x_train / y_train with 4 folds, polynomial
    degree 2 and 30 log-spaced lambdas between 1e-4 and 1. For every lambda
    the per-fold losses returned by cross_validation() are summed, averaged
    over the folds and converted with sqrt(2 * mean); the resulting train and
    test curves are passed to cross_validation_visualization().
    """
    seed = 1
    degree = 2
    k_fold = 4
    lambdas = np.logspace(-4, 0, 30)

    def rmse_from_total(total_loss):
        # Average the summed fold losses, then apply sqrt(2 * mean).
        return np.sqrt(2 * np.divide(total_loss, k_fold))

    # Split the data into k folds.
    k_indices = build_k_indices(y_train, k_fold, seed)

    # RMSE per lambda for the training folds and the held-out fold.
    rmse_tr, rmse_te = [], []
    for lambda_ in lambdas:
        total_tr, total_te = 0., 0.
        for fold in range(k_fold):
            fold_tr, fold_te = cross_validation(y_train, x_train, k_indices, fold, lambda_, degree)
            total_tr += fold_tr
            total_te += fold_te
        rmse_tr.append(rmse_from_total(total_tr))
        rmse_te.append(rmse_from_total(total_te))

    cross_validation_visualization(lambdas, np.array(rmse_tr), np.array(rmse_te))
    # A sweep over the polynomial degree at a fixed lambda would follow the
    # same pattern, looping over degrees instead of lambdas.

# The demo call is disabled.
#cross_validation_demo()
# Instead, build the degree-1 expansion of x_train and show its shape.
t = build_poly(x_train, 1)
t.shape

(200000, 32)

In [49]:
x_train.shape

(200000, 31)

array([4, 1, 5, 0, 7, 2, 3, 6])

array([9, 8])

In [8]:
def cross_validation(y, x, k_indices, k, lambda_, degree):
    """Return the train and test loss of ridge regression for one fold.

    Fold k is held out as the test set and the rows of all other folds form
    the training set. Both sets are expanded with build_poly(., degree), the
    ridge model is fitted on the expanded training features, and
    compute_loss() is evaluated on both expanded sets.

    Parameters
    ----------
    y : np.ndarray
        Target values, indexable by the row indices in k_indices.
    x : np.ndarray
        Input data with one sample per row.
    k_indices : np.ndarray
        2-D array of row indices, one fold per row (see build_k_indices).
    k : int
        Index of the fold to hold out.
    lambda_ : float
        Ridge penalty passed to ridge_regression().
    degree : int
        Polynomial degree passed to build_poly().

    Returns
    -------
    loss_tr, loss_te
        compute_loss() on the training folds and on the held-out fold.
    """
    # Fold k is the test set; the remaining folds, in their original order,
    # make up the training set.
    test_idx = k_indices[k]
    train_idx = np.delete(k_indices, k, axis=0).reshape(-1)
    x_tr, y_tr = x[train_idx], y[train_idx]
    x_te, y_te = x[test_idx], y[test_idx]

    # Form the data with the polynomial basis.
    tx_tr = build_poly(x_tr, degree)
    tx_te = build_poly(x_te, degree)

    # Fit on the expanded features so the weights match the matrices the loss
    # is evaluated on. ridge_regression is used elsewhere in this notebook as
    # `w, loss = ridge_regression(...)`, so only the weights are kept here.
    w, _ = ridge_regression(y_tr, tx_tr, lambda_)

    # Loss on the training folds and on the held-out fold.
    loss_tr = compute_loss(y_tr, tx_tr, w)
    loss_te = compute_loss(y_te, tx_te, w)
    return loss_tr, loss_te

In [7]:
def build_k_indices(y, k_fold, seed):
    """Split the row indices of y into k_fold equally sized random folds.

    Seeds NumPy's global random generator with `seed`, shuffles the indices
    0 .. y.shape[0] - 1 and returns a 2-D array with one fold per row. Every
    fold holds int(y.shape[0] / k_fold) indices; leftover rows that do not
    fill a whole fold are dropped.
    """
    n_rows = y.shape[0]
    fold_size = int(n_rows / k_fold)
    np.random.seed(seed)
    shuffled = np.random.permutation(n_rows)
    # Keep only the rows that fill complete folds and lay them out fold by fold.
    return shuffled[:k_fold * fold_size].reshape(k_fold, fold_size)