In [None]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [None]:
from proj1_helpers import *
from implementations import *

# Location of the training CSV, relative to the notebook's working directory.
DATA_TRAIN_PATH = 'data/train.csv'
# load_csv_data is provided by one of the star imports above (presumably
# proj1_helpers -- verify). Per the section heading: y holds the class labels,
# tX the feature matrix and ids the event ids.
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [None]:
# Standardize the training features; cross_validation_demo reads tX_stdzed as a
# global. Judging by the names, standardize also returns the mean and std it
# applied -- TODO confirm against its definition (it is star-imported).
tX_stdzed, tX_mean, tX_std = standardize(tX)

In [None]:
def build_poly(x, degree):
    """Polynomial basis functions for input data x, for j=0 up to j=degree.

    Args:
        x: 1-D array-like with N values of a single feature, or a 2-D array of
            shape (N, D) with one row per sample and one column per feature.
        degree: highest power to generate; a non-negative integer.

    Returns:
        1-D input: the (N, degree + 1) matrix whose column j holds x**j.
        2-D input: an (N, 1 + D * degree) matrix made of one constant column
        followed, feature after feature, by the powers 1..degree of that
        feature (no cross terms between features).

    Raises:
        ValueError: if x has neither one nor two dimensions.
    """
    x = np.asarray(x)
    if x.ndim == 1:
        return np.vander(x, degree + 1, increasing=True)
    if x.ndim != 2:
        raise ValueError("x must be a 1-D or 2-D array")

    n_samples, n_features = x.shape
    # Broadcast every entry against the exponents 1..degree -> (N, D, degree),
    # then merge the last two axes so each feature keeps its powers adjacent.
    powers = x[:, :, np.newaxis] ** np.arange(1, degree + 1)
    powers = powers.reshape(n_samples, n_features * degree)
    offset = np.ones((n_samples, 1), dtype=powers.dtype)
    return np.hstack((offset, powers))


def ridge_cross_validation(y, x, k_indices, k, lambda_, degree):
    """Train ridge regression on every fold except fold k and score it on fold k.

    Args:
        y: targets, one entry per sample.
        x: features; 1-D (one value per sample) or 2-D (one row per sample).
        k_indices: sequence of folds, each a sequence of sample indices
            (e.g. the output of build_k_indices).
        k: position in k_indices of the held-out fold.
        lambda_: ridge penalty handed to ridge_regression.
        degree: polynomial degree handed to build_poly.

    Returns:
        (loss_tr, loss_te): the loss returned by ridge_regression for the
        training folds and the value of compute_loss(mse, ...) on the
        held-out fold.

    Raises:
        ValueError: if k does not designate one of the folds.
    """
    n_folds = len(k_indices)
    if not 0 <= k < n_folds:
        raise ValueError("k must be in range(len(k_indices))")

    y = np.asarray(y)
    x = np.asarray(x)
    test_idx = np.asarray(k_indices[k])
    # Training folds are kept in their original order, skipping fold k.
    train_idx = np.concatenate(
        [k_indices[fold] for fold in range(n_folds) if fold != k])

    # Index along axis 0 so that every sample keeps its complete feature row.
    # ndarray.take without an axis reads from the flattened array, which on a
    # 2-D x returns unrelated scalars instead of the requested rows.
    train_y, test_y = y[train_idx], y[test_idx]
    train_x, test_x = x[train_idx], x[test_idx]

    train_poly = build_poly(train_x, degree)
    test_poly = build_poly(test_x, degree)

    w, loss_tr = ridge_regression(train_y, train_poly, lambda_)
    loss_te = compute_loss(mse, test_y, test_poly, w)

    return loss_tr, loss_te

In [None]:
import matplotlib.pyplot as plt


def cross_validation_visualization(lambds, mse_tr, mse_te):
    """Plot the train and test error curves against lambda and save the figure.

    Both curves are drawn on a log-scaled x axis (train in blue, test in red)
    and the current figure is saved under the name "cross_validation".
    """
    curves = (
        (mse_tr, 'b', 'train error'),
        (mse_te, 'r', 'test error'),
    )
    for errors, colour, caption in curves:
        plt.semilogx(lambds, errors, marker=".", color=colour, label=caption)

    plt.xlabel("lambda")
    plt.ylabel("rmse")
    plt.title("cross validation")
    plt.legend(loc=2)
    plt.grid(True)
    plt.savefig("cross_validation")

In [None]:
def build_k_indices(y, k_fold, seed):
    """Shuffle the row indices of y and cut them into k_fold equal folds.

    The global NumPy RNG is seeded with `seed` before shuffling. Each fold has
    int(len(y) / k_fold) indices; leftover indices at the end of the
    permutation are not assigned to any fold.

    Returns:
        Array with one row of sample indices per fold.
    """
    n_samples = y.shape[0]
    fold_size = int(n_samples / k_fold)
    np.random.seed(seed)
    shuffled = np.random.permutation(n_samples)

    folds = []
    for fold in range(k_fold):
        start = fold * fold_size
        folds.append(shuffled[start:start + fold_size])
    return np.array(folds)

In [None]:
def cross_validation_demo(l_min, l_max, degree):
    """Sweep the ridge penalty with 5-fold cross-validation, plot and report.

    Evaluates 50 log-spaced lambdas between 10**-l_min and 10**-l_max on the
    notebook-level globals `y` and `tX_stdzed`, plots the fold-averaged train
    and test losses and prints the best (lambda, averaged test loss) pair.

    Args:
        l_min: negated base-10 exponent of the first lambda of the sweep.
        l_max: negated base-10 exponent of the last lambda of the sweep.
        degree: polynomial degree passed on to ridge_cross_validation.

    Returns:
        None. The result is shown through the plot and the printed tuple.
    """
    seed = 1
    k_fold = 5
    lambdas = np.logspace(-l_min, -l_max, 50)
    k_indices = build_k_indices(y, k_fold, seed)
    rmse_tr = []
    rmse_te = []

    # (best lambda, lowest averaged test loss) seen so far. Lower-case
    # np.nan / np.inf are used because the capitalised aliases np.NaN / np.Inf
    # were removed in NumPy 2.0.
    te_min = (np.nan, np.inf)

    for lambda_ in lambdas:
        tr_avg = 0
        te_avg = 0

        for k_ in range(k_fold):
            loss_tr, loss_te = ridge_cross_validation(
                y, tX_stdzed, k_indices, k_, lambda_, degree)

            tr_avg += loss_tr
            te_avg += loss_te

        rmse_tr.append(tr_avg / k_fold)

        temp_rmse_te = te_avg / k_fold
        rmse_te.append(temp_rmse_te)

        # Strict comparison: on ties the earlier lambda of the sweep is kept.
        if temp_rmse_te < te_min[1]:
            te_min = (lambda_, temp_rmse_te)

    cross_validation_visualization(lambdas, rmse_tr, rmse_te)
    print(te_min)

In [None]:
# Degree 0, wide sweep: 50 log-spaced lambdas from 1e-30 to 1e0.
cross_validation_demo(30, 0, 0)

In [None]:
# Degree 0, narrow sweep: lambdas from 1e-6 to 1e-4.
cross_validation_demo(6, 4, 0)

In [None]:
# Degree 1, wide sweep: lambdas from 1e-30 to 1e0.
cross_validation_demo(30, 0, 1)

In [None]:
# Degree 1, narrow sweep: lambdas from 1e-6 to 1e-4.
cross_validation_demo(6, 4, 1)

In [None]:
# Degree 2, wide sweep: lambdas from 1e-30 to 1e0.
cross_validation_demo(30, 0, 2)

In [None]:
# Degree 2, narrow sweep: lambdas from 1e-17 to 1e-15.
cross_validation_demo(17, 15, 2)

In [None]:
# Degree 3, wide sweep: lambdas from 1e-10 to 1e0.
cross_validation_demo(10, 0, 3)

In [None]:
# Degree 3, narrow sweep: lambdas from 1e-10 to 1e-9.
cross_validation_demo(10, 9, 3)

In [None]:
# Degree 2 again, this time over lambdas from 1e-10 to 1e0.
cross_validation_demo(10, 0, 2)

In [None]:
# Degree 4, wide sweep: lambdas from 1e-10 to 1e0.
cross_validation_demo(10, 0, 4)

In [None]:
# Degree 4, narrow sweep: lambdas from 1e-5 to 1e-3.
cross_validation_demo(5, 3, 4)

In [None]:
# Degree 5, wide sweep: lambdas from 1e-10 to 1e0.
cross_validation_demo(10, 0, 5)

In [None]:
# Degree 5, narrow sweep: lambdas from 1e-5 to 1e-3.
cross_validation_demo(5, 3, 5)

In [None]:
# Degree 6, wide sweep: lambdas from 1e-10 to 1e0.
cross_validation_demo(10, 0, 6)

In [None]:
# Degree 6, narrow sweep: lambdas from 1e-3 to 1e-2.
cross_validation_demo(3, 2, 6)

In [None]:
# Degree 9, wide sweep: lambdas from 1e-10 to 1e0.
cross_validation_demo(10, 0, 9)

In [None]:
# Degree 9, narrow sweep: lambdas from 1e-3 to 1e-1.
cross_validation_demo(3, 1, 9)

In [None]:
# Degree 12, wide sweep: lambdas from 1e-10 to 1e0.
cross_validation_demo(10, 0, 12)

In [None]:
# Degree 12, narrow sweep: lambdas from 1e-3 to 1e-1.
cross_validation_demo(3, 1, 12)

In [None]:
# Degree 15, wide sweep: lambdas from 1e-10 to 1e0.
cross_validation_demo(10, 0, 15)

In [None]:
# Degree 15, narrow sweep: lambdas from 1e-1 to 1e0.
cross_validation_demo(1, 0, 15)

In [None]:
# Degree 16, wide sweep: lambdas from 1e-10 to 1e0.
cross_validation_demo(10, 0, 16)

In [None]:
# Degree 17, wide sweep: lambdas from 1e-10 to 1e0.
cross_validation_demo(10, 0, 17)

In [None]:
# Degree 18, wide sweep: lambdas from 1e-10 to 1e0.
cross_validation_demo(10, 0, 18)

In [None]:
# Degree 18, narrow sweep: lambdas from 10**-4.5 (about 3.2e-5) to 1e-4.
cross_validation_demo(4.5, 4, 18)

In [None]:
# Degree 18, second narrow sweep: lambdas from 1e-4 to 1e-2.
cross_validation_demo(4, 2, 18)

In [None]:
# Degree 21, wide sweep: lambdas from 1e-10 to 1e0.
cross_validation_demo(10, 0, 21)

In [None]:
# Degree 25, wide sweep: lambdas from 1e-10 to 1e0.
cross_validation_demo(10, 0, 25)

## Generate predictions and save output in CSV format for submission:

In [None]:
# Reload the training set; this repeats the load done at the top of the
# notebook with the same path.
DATA_TRAIN_PATH = 'data/train.csv'
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

# Load the test set. The first return value (the slot that holds y for the
# training file) is discarded.
DATA_TEST_PATH = 'data/test.csv'
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [None]:
# Standardize the training features, then scale the test features with the
# statistics returned for the TRAINING set so both share one transformation.
# (tX_mean / tX_std are presumably the per-feature mean and std -- confirm in
# standardize.)
tX_stdzed, tX_mean, tX_std = standardize(tX)
tX_test_stdzed = (tX_test-tX_mean)/tX_std

# Hard-coded hyper-parameters. NOTE(review): this lambda coincides with a grid
# point of np.logspace(-5, -3, 50), i.e. the cross_validation_demo(5, 3, 5)
# sweep, so it was presumably copied from that cell's printout -- confirm.
lambda_ = 0.00026826957952797245
degree  = 5

# expand_features_polynomial is not defined in this notebook (presumably
# star-imported from implementations -- verify).
# NOTE(review): the cross-validation cells expand features with build_poly
# instead; confirm that both expansions agree.
poly_tX_stdzed = expand_features_polynomial(tX_stdzed, degree)
poly_tX_test_stdzed = expand_features_polynomial(tX_test_stdzed, degree)

# Final fit on the full training set; the second return value is ignored.
w, _ = ridge_regression(y, poly_tX_stdzed, lambda_)

In [None]:
# Destination of the submission file, relative to the working directory.
OUTPUT_PATH = 'data/danielementary_ridge.csv'
# predict_labels and create_csv_submission are not defined in this notebook
# (star-imported helpers). The weights w come from the ridge fit in the
# previous cell.
y_pred = predict_labels(w, poly_tX_test_stdzed)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)