In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

In [2]:
from proj1_helpers import *
# Location of the training CSV, relative to the notebook.
DATA_TRAIN_PATH = 'data/train.csv'
# load_csv_data returns three values, bound here to the labels `y`,
# the feature matrix `tX` and the sample `ids` of the training set.
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [3]:
from implementations import *

In [None]:
# Standardize the training features. The returned mean and std are kept because
# the test features are later scaled with these same training statistics.
tX_stdrzed, mean, std = standardize(tX)

In [4]:
def build_k_indices(y, k_fold, seed):
    """Return shuffled row indices grouped into k_fold equally sized folds.

    The global NumPy random generator is seeded with `seed` before shuffling,
    so the same seed always produces the same folds. Every fold contains
    int(number_of_rows / k_fold) indices; rows that do not fill a complete
    fold are left out. The result is an array with one row of indices per fold.
    """
    n_samples = y.shape[0]
    fold_size = int(n_samples / k_fold)
    np.random.seed(seed)
    shuffled = np.random.permutation(n_samples)
    folds = []
    for fold in range(k_fold):
        start = fold * fold_size
        folds.append(shuffled[start:start + fold_size])
    return np.array(folds)

In [5]:
def cross_validation_gammas(function, y, tx, initial_w, max_iters, gammas, k_fold, seed):
    """Run k-fold cross-validation of `function` for every step size in `gammas`.

    `function` is called as function(y, tx, initial_w, max_iters, gamma) and must
    return a (weights, training_loss) pair. For each gamma, every fold is held out
    once while the remaining folds are used for training; the held-out loss is
    obtained from compute_loss(mse, ...).

    Returns two lists aligned with `gammas`: the training loss and the held-out
    loss, each averaged over the k folds.
    """
    # The folds are built once, so every gamma is scored on the same splits.
    folds = build_k_indices(y, k_fold, seed)
    fold_ids = np.arange(folds.shape[0])

    mean_train_losses = []
    mean_test_losses = []

    for gamma in gammas:
        train_losses = []
        test_losses = []
        for k in range(k_fold):
            # Fold k is held out; all other folds are flattened into one index vector.
            held_out = folds[k]
            kept = folds[fold_ids != k].ravel()
            test_tx, test_y = tx[held_out], y[held_out]
            train_tx, train_y = tx[kept], y[kept]

            # Every fold starts training again from initial_w.
            fitted_w, train_loss = function(train_y, train_tx, initial_w, max_iters, gamma)
            test_loss = compute_loss(mse, test_y, test_tx, fitted_w)

            train_losses.append(train_loss)
            test_losses.append(test_loss)
        mean_train_losses.append(np.mean(train_losses))
        mean_test_losses.append(np.mean(test_losses))

    return mean_train_losses, mean_test_losses

In [None]:
# Settings shared by the cross-validation cells below.
max_iters = 1000  # passed as max_iters to the training functions (100 was used for least_squares)
k_fold = 5        # number of cross-validation folds
seed = 42         # seed used when shuffling the rows into folds

In [None]:
# Cross-validation of the step size for least_squares_GD
initial_w = np.full(tX_stdrzed.shape[1], 0.4)
# WARNING: a step size of 1. makes least_squares_GD return nan for the loss!
gammas = np.array([0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1])
mse_tr_least_squares_GD, mse_te_least_squares_GD = cross_validation_gammas(
    least_squares_GD, y, tX_stdrzed, initial_w, max_iters, gammas, k_fold, seed
)

# Mean training / held-out loss per gamma, in the order of `gammas`.
print(mse_tr_least_squares_GD)
print(mse_te_least_squares_GD)

In [None]:
# Refit least_squares_GD on the whole training set with the best step size,
# i.e. the gamma whose mean held-out loss was the lowest.
gamma = gammas[np.argmin(mse_te_least_squares_GD)]
weights, loss = least_squares_GD(y, tX_stdrzed, initial_w, max_iters, gamma)

# One value per line: chosen gamma, fitted weights, final training loss.
print(gamma, weights, loss, sep='\n')

In [None]:
# Cross-validation of the step size for logistic_regression

# The labels must be remapped from -1/1 to 0/1 before logistic regression.
# Vectorized remap: -1 becomes 0, every other label becomes 1.
y_logistic = np.where(y == -1, 0, 1)

initial_w = np.full(tX_stdrzed.shape[1], 0.5)
gammas = np.array([0.0000001, 0.000001, 0.00001])
# NOTE(review): cross_validation_gammas scores the held-out fold with
# compute_loss(mse, ...) rather than with the loss returned by
# logistic_regression - confirm that this is the intended selection criterion.
mse_tr_logistic_regression, mse_te_logistic_regression = \
    cross_validation_gammas(logistic_regression, y_logistic, tX_stdrzed, initial_w, max_iters, gammas, k_fold, seed)

# Mean training / held-out loss per gamma, in the order of `gammas`.
print(mse_tr_logistic_regression)
print(mse_te_logistic_regression)


In [None]:
gamma = gammas[np.argmin(mse_te_logistic_regression)]
weights, loss = logistic_regression(y_logistic, tX_stdrzed, initial_w, max_iters, gamma)

print(gamma)
print(weights)
print(loss)

y_pred = predict_labels(weights, tX_test_stdrzd)

In [None]:
unique, counts = np.unique(y_pred, return_counts=True)
dict(zip(unique, counts))

In [None]:
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

In [None]:
DATA_TEST_PATH = 'data/test.csv'
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [None]:
tX_test_stdrzd = (tX_test-mean)/std #USE THE MEAN AND STD OF TRAINING DATA

In [None]:
OUTPUT_PATH = 'data/output.csv' 
y_pred = predict_labels(weights, tX_test_stdrzd)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

In [None]:
#least_squares_GD:

#max_iters = 100 
#k_fold = 5
#seed = 42
#initial_w = np.array([0.4 for i in range(tX_stdrzed.shape[1])]) 
#gamma : 0.08
#weights: [ 0.21327586 -0.1146719  -0.07368819  0.04893754  0.02225969  0.12098264
#  0.01944926  0.11727695  0.05210771 -0.26103589  0.1158705   0.12397883
#  0.02394943  0.11838262  0.12186015  0.12181576 -0.00271883  0.121864
#  0.12193441  0.04248975  0.12208034 -0.37029438  0.11649633 -0.0311746
#  0.06456901  0.06459276 -0.08128091  0.024305    0.02420671 -0.13281751]
#loss : 0.4187127661494721

# On AICrowd :  
# Categorical Accuracy : 0.672 
# F1-Score : 0.339

In [None]:
#logistic_regression:

#max_iters = 1000 
#k_fold = 5
#seed = 42
#initial_w = np.array([0.5 for i in range(tX_stdrzed.shape[1])])
#gamma : 1e-07
#weights: [  6.70968093 -35.37478552  -3.81622818  -0.19319657   2.63897026
#   2.21089965   2.48368267   0.90021442  -1.06382924   2.97263477
#  -0.65038781   1.45159447   2.70114369  18.05969885   0.24352419
#   0.17975174  -4.73092368   0.23675558   0.34633217   0.58217815
#   0.32184767  -3.89345914   0.13572754   3.97257752   4.31809735
#   4.34578467   0.38030864   2.58058968   2.55730877  -9.82233767]
#loss : 2847310.0023890096 ???


# On AICrowd :  
# Categorical Accuracy : 0.696 
# F1-Score : 0.267

In [None]:
# Least squares with cleaned columns

# The statistics of the cleaned features get their own names so that the
# full-feature `mean`/`std` (still needed to standardize the uncleaned test set)
# are not silently overwritten when this cell runs.
tX_stdrzed_cleaned, mean_cleaned, std_cleaned = standardize(remove_wrong_columns(tX))

max_iters = 200
k_fold = 5
seed = 42
initial_w = np.full(tX_stdrzed_cleaned.shape[1], 0.1)
gammas = np.array([0.01, 0.05, 0.09, 0.1, 0.2, 0.3])

mse_tr_least_squares_GD_cleaned, mse_te_least_squares_GD_cleaned = \
    cross_validation_gammas(least_squares_GD, y, tX_stdrzed_cleaned, initial_w, max_iters, gammas, k_fold, seed)

# Mean training / held-out loss per gamma, in the order of `gammas`.
print(mse_tr_least_squares_GD_cleaned)
print(mse_te_least_squares_GD_cleaned)

# Refit on the whole training set with the gamma of lowest mean held-out loss.
gamma = gammas[np.argmin(mse_te_least_squares_GD_cleaned)]
weights, loss = least_squares_GD(y, tX_stdrzed_cleaned, initial_w, max_iters, gamma)

print(gamma)
print(weights)
print(loss)

# Clean the test set the same way and scale it with the TRAINING mean and std.
tX_test_stdrzd_cleaned = (remove_wrong_columns(tX_test) - mean_cleaned) / std_cleaned

y_pred = predict_labels(weights, tX_test_stdrzd_cleaned)

# Number of test samples predicted for each label.
unique, counts = np.unique(y_pred, return_counts=True)
print(dict(zip(unique, counts)))

OUTPUT_PATH = 'data/output.csv'
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

# With max_iters = 200 this reaches 71.7% accuracy.
# With max_iters = 1000 the loss converged to 0.374, which is quite close to the loss obtained with max_iters = 200.

In [None]:
# Logistic regression with cleaned columns

# The statistics of the cleaned features get their own names so that the
# full-feature `mean`/`std` are not silently overwritten when this cell runs.
tX_stdrzed_cleaned, mean_cleaned, std_cleaned = standardize(remove_wrong_columns(tX))

# Remap the labels from -1/1 to 0/1: -1 becomes 0, every other label becomes 1.
y_logistic = np.where(y == -1, 0, 1)

max_iters = 1000
k_fold = 5
seed = 42

initial_w = np.full(tX_stdrzed_cleaned.shape[1], 0.5)
gammas = np.array([0.00000001, 0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1])

mse_tr_logistic_cleaned, mse_te_logistic_cleaned = \
    cross_validation_gammas(logistic_regression, y_logistic, tX_stdrzed_cleaned, initial_w, max_iters, gammas, k_fold, seed)

# Mean training / held-out loss per gamma, in the order of `gammas`.
print(mse_tr_logistic_cleaned)
print(mse_te_logistic_cleaned)

# Refit on the whole training set with the gamma of lowest mean held-out loss.
gamma = gammas[np.argmin(mse_te_logistic_cleaned)]
weights, loss = logistic_regression(y_logistic, tX_stdrzed_cleaned, initial_w, max_iters, gamma)

print(gamma)
print(weights)
print(loss)

# Clean the test set the same way and scale it with the TRAINING mean and std.
tX_test_stdrzd_cleaned = (remove_wrong_columns(tX_test) - mean_cleaned) / std_cleaned

y_pred = predict_labels(weights, tX_test_stdrzd_cleaned)

# Number of test samples predicted for each label.
unique, counts = np.unique(y_pred, return_counts=True)
print(dict(zip(unique, counts)))

  + (1 - y).T @ np.log(1 - sigm_tx_w))
  if np.abs(loss - previous_loss) < threshold:
  + (1 - y).T @ np.log(1 - sigm_tx_w))
  return - np.sum(y.T @ np.log(sigm_tx_w) \
  return - np.sum(y.T @ np.log(sigm_tx_w) \
  t_exp = np.exp(t)
  return t_exp /(1 + t_exp)
