In [1]:
# Useful starting lines
import numpy as np
from helpers import *
from implementations import *
from exploration import *
from process_data import *
from crossvalidation import *
from select_parameter import *

# Seed handed to build_k_indices (and to the ridge grid search) in the cells below.
# Presumably it fixes the fold assignment so results are repeatable -- TODO confirm in crossvalidation.
seed=10

# Load the dataset

In [2]:
# Training set: load_csv_data returns three values, unpacked here as y, tX and ids
# (presumably labels, feature matrix and sample ids -- confirm against load_csv_data).
y, tX, ids = load_csv_data('data/train.csv')
# Test set: the first returned value is discarded; only tX_test and ids_test are kept.
_, tX_test, ids_test = load_csv_data('data/test.csv')

# Models

We now apply the 6 methods expected for this project. We compare the performance of each method by performing cross-validation on the training set, which gives an estimate of the test accuracy. By evaluating the variance of the estimated test accuracy, and by comparing the estimated test accuracy with the training accuracy, we can assess whether our model is overfitting.

## 1. Least Squares with Gradient Descent

In [3]:
# Preprocessing settings (three entries each -- presumably one per jet class)
degrees = [5, 5, 5]
alphas = [4, 4, 5]

# Gradient-descent settings
max_iters = 500
gamma = 5e-05

# Build the k-fold partition of the training set
k_fold = 5
k_indices = build_k_indices(y, k_fold, seed)

# One cross-validation round per fold; each round yields (train accuracy, test accuracy)
fold_scores = [
    cross_validation_jet(y, tX, least_squares_GD, k_indices, fold, degrees, alphas,
                         max_iters=max_iters, gamma=gamma)
    for fold in range(k_fold)
]
accs_train = [train_acc for train_acc, _ in fold_scores]
accs_test = [test_acc for _, test_acc in fold_scores]

# Per-fold report followed by the summary statistics of the test accuracy
for fold, (train_acc, test_acc) in enumerate(zip(accs_train, accs_test)):
    print("Iter %d: Training accuracy: %f / Test accuracy : %f" % (fold, train_acc, test_acc))

print("\nAverage test accuracy: %f" % np.mean(accs_test))
print("Variance test accuracy: %f" % np.var(accs_test))

Iter 0: Training accuracy: 0.787600 / Test accuracy : 0.787160
Iter 1: Training accuracy: 0.787330 / Test accuracy : 0.788160
Iter 2: Training accuracy: 0.787775 / Test accuracy : 0.784340
Iter 3: Training accuracy: 0.787615 / Test accuracy : 0.786120
Iter 4: Training accuracy: 0.787000 / Test accuracy : 0.788820

Average test accuracy: 0.786920
Variance test accuracy: 0.000003


## 2. Least Squares with Stochastic Gradient Descent

In [6]:
# Preprocessing settings (three entries each -- presumably one per jet class)
degrees = [5, 5, 5]
alphas = [4, 4, 5]

# Stochastic-gradient-descent settings
max_iters = 100
gamma = 1e-05
batch_size = 1

# Build the k-fold partition of the training set
k_fold = 3
k_indices = build_k_indices(y, k_fold, seed)

# One cross-validation round per fold; each round yields (train accuracy, test accuracy)
fold_scores = [
    cross_validation_jet(y, tX, least_squares_SGD, k_indices, fold, degrees, alphas,
                         max_iters=max_iters, gamma=gamma, batch_size=batch_size)
    for fold in range(k_fold)
]
accs_train = [train_acc for train_acc, _ in fold_scores]
accs_test = [test_acc for _, test_acc in fold_scores]

# Per-fold report followed by the summary statistics of the test accuracy
for fold, (train_acc, test_acc) in enumerate(zip(accs_train, accs_test)):
    print("Iter %d: Training accuracy: %f / Test accuracy : %f" % (fold, train_acc, test_acc))

print("\nAverage test accuracy: %f" % np.mean(accs_test))
print("Variance test accuracy: %f" % np.var(accs_test))

Iter 0: Training accuracy: 0.715089 / Test accuracy : 0.716943
Iter 1: Training accuracy: 0.721263 / Test accuracy : 0.719691
Iter 2: Training accuracy: 0.715623 / Test accuracy : 0.716271

Average test accuracy: 0.717635
Variance test accuracy: 0.000002


## 3. Least Squares with Normal Equations 

In [7]:
# Preprocessing settings (three entries each -- presumably one per jet class)
degrees = [5, 5, 5]
alphas = [4, 4, 5]

# Build the k-fold partition of the training set
k_fold = 5
k_indices = build_k_indices(y, k_fold, seed)

# One cross-validation round per fold; each round yields (train accuracy, test accuracy)
fold_scores = [
    cross_validation_jet(y, tX, least_squares, k_indices, fold, degrees, alphas)
    for fold in range(k_fold)
]
accs_train = [train_acc for train_acc, _ in fold_scores]
accs_test = [test_acc for _, test_acc in fold_scores]

# Per-fold report followed by the summary statistics of the test accuracy
for fold, (train_acc, test_acc) in enumerate(zip(accs_train, accs_test)):
    print("Iter %d: Training accuracy: %f / Test accuracy : %f" % (fold, train_acc, test_acc))

print("\nAverage test accuracy: %f" % np.mean(accs_test))
print("Variance test accuracy: %f" % np.var(accs_test))

Iter 0: Training accuracy: 0.843625 / Test accuracy : 0.836580
Iter 1: Training accuracy: 0.842920 / Test accuracy : 0.840440
Iter 2: Training accuracy: 0.843120 / Test accuracy : 0.824320
Iter 3: Training accuracy: 0.843460 / Test accuracy : 0.840900
Iter 4: Training accuracy: 0.837225 / Test accuracy : 0.831080

Average test accuracy: 0.834664
Variance test accuracy: 0.000039


## 4. Ridge regression with Normal Equations

### Grid search to find the best parameters (alpha, lambda, degree) per jet class

In [None]:
# Candidate values explored by the grid search
degrees_candidates = [4, 5, 6]
alphas_candidates = [3, 4, 5]
# Four log-spaced values: 1e-3, 1e-4, 1e-5, 1e-6
lambdas_candidates = np.logspace(-3, -6, 4)

k_fold = 3

# Search the grid; the helper returns the selected degree, lambda, alpha and the
# associated accuracy (per jet class, according to the labels printed below).
opt_degree, opt_lambda, opt_alpha, accu = select_parameters_ridge_regression_jet(
    y,
    tX,
    degrees_candidates,
    lambdas_candidates,
    alphas_candidates,
    k_fold,
    seed,
)

# Report the selected parameters, one line per quantity
for label, value in (
    ('Optimal alphas per jet_class:', opt_alpha),
    ('Optimal degrees per jet_class:', opt_degree),
    ('Optimal lambdas per jet_class:', opt_lambda),
    ('Maximum accuracy predicted per jet_class:', accu),
):
    print(label, value)

In [8]:
# Preprocessing and regularisation settings
# (three entries each -- presumably one per jet class)
degrees = [5, 5, 5]
alphas = [4, 4, 5]
lambdas = [1e-06, 1e-05, 1e-03]

# Build the k-fold partition of the training set
k_fold = 5
k_indices = build_k_indices(y, k_fold, seed)

# One cross-validation round per fold; each round yields (train accuracy, test accuracy)
fold_scores = [
    cross_validation_jet(y, tX, ridge_regression, k_indices, fold, degrees, alphas, lambdas)
    for fold in range(k_fold)
]
accs_train = [train_acc for train_acc, _ in fold_scores]
accs_test = [test_acc for _, test_acc in fold_scores]

# Per-fold report followed by the summary statistics of the test accuracy
for fold, (train_acc, test_acc) in enumerate(zip(accs_train, accs_test)):
    print("Iter %d: Training accuracy: %f / Test accuracy : %f" % (fold, train_acc, test_acc))

print("\nAverage test accuracy: %f" % np.mean(accs_test))
print("Variance test accuracy: %f" % np.var(accs_test))

Iter 0: Training accuracy: 0.843205 / Test accuracy : 0.838920
Iter 1: Training accuracy: 0.842600 / Test accuracy : 0.841640
Iter 2: Training accuracy: 0.843100 / Test accuracy : 0.839000
Iter 3: Training accuracy: 0.843030 / Test accuracy : 0.840740
Iter 4: Training accuracy: 0.842665 / Test accuracy : 0.842580

Average test accuracy: 0.840576
Variance test accuracy: 0.000002


## 5. Logistic Regression with Stochastic Gradient Descent


In [9]:
# Preprocessing settings (three entries each -- presumably one per jet class)
degrees = [5, 5, 5]
alphas = [4, 4, 5]

# Stochastic-gradient-descent settings
max_iters = 100
gamma = 1e-05
batch_size = 1

# Build the k-fold partition of the training set
k_fold = 5
k_indices = build_k_indices(y, k_fold, seed)

# One cross-validation round per fold; each round yields (train accuracy, test accuracy).
# log=True is forwarded to cross_validation_jet for the logistic model.
fold_scores = [
    cross_validation_jet(y, tX, logistic_regression, k_indices, fold, degrees, alphas, log=True,
                         batch_size=batch_size, max_iters=max_iters, gamma=gamma)
    for fold in range(k_fold)
]
accs_train = [train_acc for train_acc, _ in fold_scores]
accs_test = [test_acc for _, test_acc in fold_scores]

# Per-fold report followed by the summary statistics of the test accuracy
for fold, (train_acc, test_acc) in enumerate(zip(accs_train, accs_test)):
    print("Iter %d: Training accuracy: %f / Test accuracy : %f" % (fold, train_acc, test_acc))

print("\nAverage test accuracy: %f" % np.mean(accs_test))
print("Variance test accuracy: %f" % np.var(accs_test))

Iter 0: Training accuracy: 0.710235 / Test accuracy : 0.710960
Iter 1: Training accuracy: 0.722310 / Test accuracy : 0.720140
Iter 2: Training accuracy: 0.724430 / Test accuracy : 0.720800
Iter 3: Training accuracy: 0.722135 / Test accuracy : 0.722940
Iter 4: Training accuracy: 0.681275 / Test accuracy : 0.680720

Average test accuracy: 0.711112
Variance test accuracy: 0.000248


## 6. Regularized Logistic Regression with Stochastic Gradient Descent

In [10]:
# Preprocessing settings (three entries each -- presumably one per jet class)
degrees = [5, 5, 5]
alphas = [4, 4, 5]

# Regularisation strength and stochastic-gradient-descent settings
lambdas = [0.1, 0.1, 0.1]
max_iters = 100
gamma = 1e-05
batch_size = 1

# Build the k-fold partition of the training set
k_fold = 5
k_indices = build_k_indices(y, k_fold, seed)

# One cross-validation round per fold; each round yields (train accuracy, test accuracy).
# log=True is forwarded to cross_validation_jet for the logistic model.
fold_scores = [
    cross_validation_jet(y, tX, reg_logistic_regression, k_indices, fold, degrees, alphas, lambdas, log=True,
                         batch_size=batch_size, max_iters=max_iters, gamma=gamma)
    for fold in range(k_fold)
]
accs_train = [train_acc for train_acc, _ in fold_scores]
accs_test = [test_acc for _, test_acc in fold_scores]

# Per-fold report followed by the summary statistics of the test accuracy
for fold, (train_acc, test_acc) in enumerate(zip(accs_train, accs_test)):
    print("Iter %d: Training accuracy: %f / Test accuracy : %f" % (fold, train_acc, test_acc))

print("\nAverage test accuracy: %f" % np.mean(accs_test))
print("Variance test accuracy: %f" % np.var(accs_test))

Iter 0: Training accuracy: 0.710245 / Test accuracy : 0.710960
Iter 1: Training accuracy: 0.722310 / Test accuracy : 0.720140
Iter 2: Training accuracy: 0.724430 / Test accuracy : 0.720800
Iter 3: Training accuracy: 0.722130 / Test accuracy : 0.722940
Iter 4: Training accuracy: 0.681280 / Test accuracy : 0.680700

Average test accuracy: 0.711108
Variance test accuracy: 0.000248


# Prediction

In [11]:
# Execute run.py in a shell via IPython's `!` escape.
# NOTE(review): presumably this script produces the final test-set predictions -- confirm in run.py.
! python run.py