In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

In [2]:
from proj1_helpers import *
from implementations import *
# Locations of the two CSV files, relative to the notebook's working directory.
DATA_TRAIN_PATH = 'data/train.csv'
DATA_TEST_PATH = 'data/test.csv'

# load_csv_data yields three values per file; all three are kept for the training file.
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)
# For the test file the first returned value is discarded (bound to `_`).
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [3]:
def build_k_indices(y, k_fold, seed):
    """Shuffle the row indices of ``y`` and cut them into ``k_fold`` equal folds.

    The global NumPy random generator is seeded with ``seed`` before the
    permutation is drawn. Each fold holds ``int(len(y) / k_fold)`` indices;
    any leftover rows at the end of the permutation are not assigned to a fold.

    Returns:
        A NumPy array with one row of indices per fold.
    """
    n_samples = y.shape[0]
    fold_size = int(n_samples / k_fold)

    np.random.seed(seed)
    shuffled = np.random.permutation(n_samples)

    folds = []
    for fold in range(k_fold):
        start = fold * fold_size
        stop = (fold + 1) * fold_size
        folds.append(shuffled[start:stop])
    return np.array(folds)

In [4]:
def cross_validation_least_squares_GD(y, tx, initial_w, max_iters, gammas, k_fold, seed):
    """Pick the step size for least_squares_GD by k-fold cross-validation.

    For each candidate in ``gammas`` the model is trained on k-1 folds
    (every run restarts from ``initial_w``) and scored on the held-out fold
    with ``compute_loss(mse, ...)``. The gamma with the lowest mean held-out
    loss is then used for a final fit on all of ``y`` / ``tx``.

    Args:
        y: targets, indexed along the first axis.
        tx: feature matrix whose rows are aligned with ``y``.
        initial_w: starting weights handed to every least_squares_GD call.
        max_iters: iteration count handed to least_squares_GD.
        gammas: indexable sequence of candidate step sizes.
        k_fold: number of folds.
        seed: seed forwarded to build_k_indices for the shuffle.

    Returns:
        Tuple ``(mse_tr, mse_te, gamma, weights_final, loss)``:
        ``mse_tr`` / ``mse_te`` are lists with one mean training / held-out
        loss per candidate gamma, ``gamma`` is the selected step size, and
        ``weights_final`` / ``loss`` are what least_squares_GD returns for the
        final fit on the full data.
    """
    # split data in k fold
    k_indices = build_k_indices(y, k_fold, seed)
    fold_ids = np.arange(k_indices.shape[0])

    mse_tr = []
    mse_te = []

    for gamma in gammas:
        tr_tmp = []
        te_tmp = []
        for k in range(k_fold):
            # Fold k is held out; every other fold is flattened into the training set.
            tr_indice = k_indices[fold_ids != k].reshape(-1)
            test_tx = tx[k_indices[k]]
            test_y = y[k_indices[k]]
            train_tx = tx[tr_indice]
            train_y = y[tr_indice]

            # Train on the k-1 folds and score on the held-out fold.
            weights, loss_tr = least_squares_GD(train_y, train_tx, initial_w, max_iters, gamma)
            loss_te = compute_loss(mse, test_y, test_tx, weights)

            tr_tmp.append(loss_tr)
            te_tmp.append(loss_te)
        mse_tr.append(np.mean(tr_tmp))
        mse_te.append(np.mean(te_tmp))

        # Progress output: held-out losses accumulated so far.
        print(mse_te)

    # A diverged run can produce a NaN loss, and np.argmin would return the
    # index of that NaN. Mask NaNs to +inf so a diverged gamma is never chosen.
    scores = np.asarray(mse_te, dtype=float)
    best = np.argmin(np.where(np.isnan(scores), np.inf, scores))
    gamma = gammas[best]
    weights_final, loss = least_squares_GD(y, tx, initial_w, max_iters, gamma)

    return mse_tr, mse_te, gamma, weights_final, loss

In [None]:
def cross_validation_logistic_regression(y, tx, initial_w, max_iters, gammas, k_fold, seed):
    """Pick the step size for logistic_regression by k-fold cross-validation.

    For each candidate in ``gammas`` the model is trained on k-1 folds
    (every run restarts from ``initial_w``) and scored on the held-out fold
    with ``compute_loss_sigmoid``. The gamma with the lowest mean held-out
    loss is then used for a final fit on all of ``y`` / ``tx``.

    Args:
        y: targets, indexed along the first axis.
        tx: feature matrix whose rows are aligned with ``y``.
        initial_w: starting weights handed to every logistic_regression call.
        max_iters: iteration count handed to logistic_regression.
        gammas: indexable sequence of candidate step sizes.
        k_fold: number of folds.
        seed: seed forwarded to build_k_indices for the shuffle.

    Returns:
        Tuple ``(loss_sigmoid_tr, loss_sigmoid_te, gamma, weights_final,
        loss_sigmoid)``: the first two are lists with one mean training /
        held-out loss per candidate gamma, ``gamma`` is the selected step
        size, and ``weights_final`` / ``loss_sigmoid`` are what
        logistic_regression returns for the final fit on the full data.
    """
    # split data in k fold
    k_indices = build_k_indices(y, k_fold, seed)
    fold_ids = np.arange(k_indices.shape[0])

    loss_sigmoid_tr = []
    loss_sigmoid_te = []

    for gamma in gammas:
        tr_tmp = []
        te_tmp = []
        for k in range(k_fold):
            # Fold k is held out; every other fold is flattened into the training set.
            tr_indice = k_indices[fold_ids != k].reshape(-1)
            test_tx = tx[k_indices[k]]
            test_y = y[k_indices[k]]
            train_tx = tx[tr_indice]
            train_y = y[tr_indice]

            # Train on the k-1 folds and score on the held-out fold.
            # Uses the same trainer as the final fit below.
            weights, loss_tr = logistic_regression(train_y, train_tx, initial_w, max_iters, gamma)
            loss_te = compute_loss_sigmoid(test_y, test_tx, weights)

            tr_tmp.append(loss_tr)
            te_tmp.append(loss_te)
        loss_sigmoid_tr.append(np.mean(tr_tmp))
        loss_sigmoid_te.append(np.mean(te_tmp))

    # A diverged run can produce a NaN loss, and np.argmin would return the
    # index of that NaN. Mask NaNs to +inf so a diverged gamma is never chosen.
    scores = np.asarray(loss_sigmoid_te, dtype=float)
    best = np.argmin(np.where(np.isnan(scores), np.inf, scores))
    gamma = gammas[best]
    weights_final, loss_sigmoid = logistic_regression(y, tx, initial_w, max_iters, gamma)

    return loss_sigmoid_tr, loss_sigmoid_te, gamma, weights_final, loss_sigmoid

In [7]:
# Cross-validate the step size of least_squares_GD, then write a submission file.
max_iters = 200
k_fold = 5
seed = 42

# The mean/std returned here are reused further down to scale the test features.
tX_stdrzed, mean, std = standardize(tX)

# One starting weight per column of the standardized features, all set to 0.4.
initial_w = np.full(tX_stdrzed.shape[1], 0.4)

gammas = np.array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09])
(mse_tr_least_squares_GD,
 mse_te_least_squares_GD,
 gamma,
 weights,
 loss) = cross_validation_least_squares_GD(
    y, tX_stdrzed, initial_w, max_iters, gammas, k_fold, seed)

# Report each result on its own line.
print(mse_tr_least_squares_GD,
      mse_te_least_squares_GD,
      gamma,
      weights,
      loss,
      sep='\n')

# Scale the test features with the training statistics.
tX_test_stdrzd = (tX_test - mean) / std

y_pred = predict_labels(weights, tX_test_stdrzd)
OUTPUT_PATH = 'data/output_least_squares_GD.csv'
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

# AICrowd scores by starting point:
#   0.697 with initial_w = np.array([0.4 for i in range(tX_stdrzed.shape[1])])
#   0.693 with initial_w = np.array([0.0 for i in range(tX_stdrzed.shape[1])]), even if loss smaller

[0.4121670035023174]
[0.4121670035023174, 0.40661847241528604]
[0.4121670035023174, 0.40661847241528604, 0.40369759951339856]
[0.4121670035023174, 0.40661847241528604, 0.40369759951339856, 0.4017868157144959]
[0.4121670035023174, 0.40661847241528604, 0.40369759951339856, 0.4017868157144959, 0.4003942713904026]
[0.4121670035023174, 0.40661847241528604, 0.40369759951339856, 0.4017868157144959, 0.4003942713904026, 0.3992821847490182]
[0.4121670035023174, 0.40661847241528604, 0.40369759951339856, 0.4017868157144959, 0.4003942713904026, 0.3992821847490182, 0.3983287727459971]
[0.4121670035023174, 0.40661847241528604, 0.40369759951339856, 0.4017868157144959, 0.4003942713904026, 0.3992821847490182, 0.3983287727459971, 0.3974707695719626]
[0.4121670035023174, 0.40661847241528604, 0.40369759951339856, 0.4017868157144959, 0.4003942713904026, 0.3992821847490182, 0.3983287727459971, 0.3974707695719626, 3065736051812.7295]
[0.4121963988553795, 0.40663859579732325, 0.40371304333868474, 0.40179821384