In [2]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

In [3]:
from proj1_helpers import *
# Path of the training CSV, relative to the notebook's working directory.
DATA_TRAIN_PATH = 'data/train.csv'
# load_csv_data is presumably provided by the proj1_helpers star-import above.
# Assumes y = labels, tX = feature matrix, ids = sample identifiers — TODO confirm.
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [4]:
from implementations import *

In [5]:
# Standardize the training features. `mean` and `std` are kept because the
# test features are scaled with these same values further down.
tX_stdrzed, mean, std = standardize(tX)

In [6]:
def build_k_indices(y, k_fold, seed):
    """Shuffle the row indices of `y` and cut them into `k_fold` folds.

    NumPy's global random generator is seeded with `seed` before the
    permutation is drawn, so the same seed always yields the same folds.
    Every fold has `int(len(y) / k_fold)` indices; rows that do not fit
    into a full fold are left out.

    Returns an array with one row of indices per fold.
    """
    n_samples = y.shape[0]
    fold_size = int(n_samples / k_fold)

    np.random.seed(seed)
    shuffled = np.random.permutation(n_samples)

    folds = []
    for fold in range(k_fold):
        start = fold * fold_size
        folds.append(shuffled[start:start + fold_size])
    return np.array(folds)

In [7]:
def cross_validation_gammas(function, y, tx, initial_w, max_iters, gammas, k_fold, seed,
                            verbose=False):
    """Cross-validate a training function over several step sizes.

    The folds come from `build_k_indices(y, k_fold, seed)` and are the same
    for every gamma. For each gamma, every fold is held out once while
    `function` is trained on the remaining folds.

    Parameters
    ----------
    function : callable
        Called as `function(train_y, train_tx, initial_w, max_iters, gamma)`
        and expected to return `(weights, training_loss)`.
    y, tx : arrays indexed by sample along their first axis.
    initial_w : array-like
        Starting weights. Each training call receives its own copy, so a
        training function that updates its weights in place cannot carry
        state from one fold (or gamma) to the next.
    max_iters : forwarded unchanged to `function`.
    gammas : iterable of step sizes to evaluate.
    k_fold : int, number of folds.
    seed : seed forwarded to `build_k_indices`.
    verbose : bool, optional
        When True, print one progress line per gamma. Defaults to False.

    Returns
    -------
    mse_tr, mse_te : lists of `(gamma, mean_loss)` tuples, one per gamma.
        `mse_tr` averages the training loss returned by `function`;
        `mse_te` averages `compute_loss(mse, ...)` on the held-out fold,
        whatever loss `function` itself optimizes.
    """
    # split data in k fold
    k_indices = build_k_indices(y, k_fold, seed)
    fold_ids = np.arange(k_indices.shape[0])

    mse_tr = []
    mse_te = []

    for gamma in gammas:
        tr_tmp = []
        te_tmp = []
        for k in range(k_fold):
            # fold k is the test set, all other folds form the training set
            tr_indice = k_indices[~(fold_ids == k)].reshape(-1)
            test_tx = tx[k_indices[k]]
            test_y = y[k_indices[k]]
            train_tx = tx[tr_indice]
            train_y = y[tr_indice]

            # train on the k-1 folds, then score the held-out fold
            weights, loss_tr = function(train_y, train_tx, np.copy(initial_w), max_iters, gamma)
            loss_te = compute_loss(mse, test_y, test_tx, weights)
            tr_tmp.append(loss_tr)
            te_tmp.append(loss_te)
        mse_tr.append((gamma, np.mean(tr_tmp)))
        mse_te.append((gamma, np.mean(te_tmp)))
        if verbose:
            print("gamma={}: mean train loss={}, mean test loss={}".format(
                gamma, mse_tr[-1][1], mse_te[-1][1]))

    return mse_tr, mse_te

In [8]:
# Settings shared by the cross-validation cells below.
# max_iters is forwarded to the training function on every fold.
max_iters = 100
# Number of cross-validation folds.
k_fold = 5
# Seed for the shuffle performed in build_k_indices.
seed = 42

In [14]:
# Cross-validate the step size (gamma) for least_squares_GD.
# WARNING: gamma = 1. makes least_squares_GD return nan for the loss.
initial_w = np.full(tX_stdrzed.shape[1], 1.5)
gammas = np.array([0.001, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1])
mse_tr_least_squares_GD, mse_te_least_squares_GD = cross_validation_gammas(
    least_squares_GD, y, tX_stdrzed, initial_w, max_iters, gammas, k_fold, seed
)

# Training losses first, then held-out losses, one (gamma, mean loss) pair each.
print(mse_tr_least_squares_GD, mse_te_least_squares_GD, sep="\n")

[(0.001, 45.814939252517206), (0.01, 1.1570884690125212), (0.02, 0.802651741227361), (0.03, 0.7130170152799978), (0.04, 0.6492072148687056), (0.05, 0.6014476285437074), (0.06, 0.5651926606470817), (0.07, 0.5373073137667168), (0.08, 0.515557037124372), (0.09, 274914664.57558066), (0.1, 1.2574604422153355e+25), (0.1, 1.2574604422153355e+25)]
[(0.001, 45.48981802585414), (0.01, 1.1447516640538447), (0.02, 0.8004034428530963), (0.03, 0.7108303616686046), (0.04, 0.6470503297717095), (0.05, 0.5994189809913735), (0.06, 0.5633369443500535), (0.07, 0.5356344443945169), (0.08, 0.5140580666639419), (0.09, 321863003.0029902), (0.1, 2.1689801028412247e+25), (0.1, 2.1689801028412247e+25)]


In [None]:
# Cross-validate the step size (gamma) for logistic_regression, starting
# from an all-zero column vector with one row per feature.
initial_w = np.zeros(tX_stdrzed.shape[1]).reshape(-1, 1)
gammas = np.array([0.001, 0.01, 0.1])
mse_tr_logistic_regression, mse_te_logistic_regression = cross_validation_gammas(
    logistic_regression, y, tX_stdrzed, initial_w, max_iters, gammas, k_fold, seed
)

# Training losses first, then held-out losses, one (gamma, mean loss) pair each.
print(mse_tr_logistic_regression, mse_te_logistic_regression, sep="\n")

HERE 0


In [None]:
# Path of the test CSV, relative to the notebook's working directory.
DATA_TEST_PATH = 'data/test.csv'
# The first return value of load_csv_data is discarded here
# (presumably the label column, unused for the test set — TODO confirm).
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [None]:
# Scale the test features with the `mean` and `std` returned by
# standardize(tX) on the training data, not with statistics of the test set.
tX_test_stdrzd = (tX_test-mean)/std  # use the mean and std of the training data

In [None]:
# `weights` only ever exists as a local variable inside
# cross_validation_gammas, so it has to be fitted here before predicting.
# The final model is least_squares_GD trained on the whole training set with
# the step size that had the lowest finite mean held-out loss in the
# cross-validation above.
finite_cv = [(gamma, loss) for gamma, loss in mse_te_least_squares_GD
             if np.isfinite(loss)]
if not finite_cv:
    raise ValueError("no gamma produced a finite cross-validation loss")
best_gamma = min(finite_cv, key=lambda pair: pair[1])[0]

# Same starting point as the least_squares_GD cross-validation cell; the
# notebook-level `initial_w` is rebound by the logistic regression cell.
weights, final_train_loss = least_squares_GD(
    y, tX_stdrzed, np.full(tX_stdrzed.shape[1], 1.5), max_iters, best_gamma
)

# Predict labels for the standardized test features and write the submission.
OUTPUT_PATH = 'data/output.csv'
y_pred = predict_labels(weights, tX_test_stdrzd)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

In [None]:
# NOTE(review): y_pred was itself computed from `weights` and `tX_test_stdrzd`
# in the prediction cell, so this compares the model's predicted labels with
# its own output on the same features rather than with held-out labels —
# confirm this is the quantity you want to report.
compute_loss(mse, y_pred, tX_test_stdrzd, weights)