In [None]:
# Useful starting lines
import numpy as np
# NOTE(review): the star imports below hide which project module provides each
# helper used later in the notebook (load_csv_data, process_data,
# ridge_regression, build_k_indices, ...).
from helpers import *
from methods import *
from process_data import *
from crossValidation import *

# Reload edited project modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

# Seed handed to build_k_indices in the cross-validation cells.
seed=20

# Load the dataset

In [None]:
from zipfile import ZipFile

# Archive that contains the test set.
file_name = 'Data/test.csv.zip'

# Open the archive read-only and unpack every member into Data/.
# The handle is called `archive` instead of `zip`: binding the name `zip` here
# would replace the built-in zip() for every cell executed afterwards.
with ZipFile(file_name, 'r') as archive:
    archive.extractall('Data/')

In [None]:
# Read the training file; load_csv_data returns three values, bound here to
# y, tX and ids (presumably labels, feature matrix and event ids — confirm in helpers).
y, tX, ids = load_csv_data('Data/train.csv')
# Same call for the test file; its first return value is discarded.
_, tX_test, ids_test = load_csv_data('Data/test.csv')

# Methods

## 1. Least Squares with Gradient Descent

#### Cross Validation

In [None]:
# Hyper-parameters of the gradient-descent fit
max_iters = 3000
gamma = 0.005

# Fold assignment of the training samples
k_fold = 2
k_indices = build_k_indices(y, k_fold, seed)

# One (training accuracy, test accuracy) pair per fold, in fold order
fold_scores = [
    cross_validation_least_squares_GD(y, tX, k_indices, fold, max_iters, gamma)
    for fold in range(k_fold)
]
accs_train = [train_score for train_score, test_score in fold_scores]
accs_test = [test_score for train_score, test_score in fold_scores]

# Per-fold report
for fold, (acc_train, acc_test) in enumerate(fold_scores):
    print("%d - Training accuracy: %f / Test accuracy : %f" % (fold, acc_train, acc_test))

# Summary statistics of the test accuracy over the folds
for template, reducer in (
    ("\nAverage test accuracy: %f", np.mean),
    ("Variance test accuracy: %f", np.var),
    ("Min test accuracy: %f", np.min),
    ("Max test accuracy: %f", np.max),
):
    print(template % reducer(accs_test))

0 - Training accuracy: 0.773952 / Test accuracy : 0.774912
1 - Training accuracy: 0.767112 / Test accuracy : 0.765072

Average test accuracy: 0.769992
Variance test accuracy: 0.000024
Min test accuracy: 0.765072
Max test accuracy: 0.774912


## 2. Least Squares with Stochastic Gradient Descent

#### Cross Validation

In [None]:
# TODO: wrap this run in cross-validation, as done in the gradient-descent section.

# Start from an all-zero weight vector with one entry per column of tX.
initial_w=np.zeros(tX.shape[1])
batch_size=1
# NOTE: max_iters and gamma are notebook globals; these assignments rebind the
# values set in the gradient-descent cell.
max_iters=1000
gamma=0.005

# Single fit on the full training set; the two returned values are bound to
# loss and weights.
loss, weights = least_squares_SGD(y, tX, initial_w, batch_size, max_iters, gamma)

## 3. Least Squares with Normal Equations 

#### Cross Validation

In [None]:
# TODO: add cross-validation for this method.

# Single fit on the full training set (no hyper-parameters are passed); the
# two returned values are bound to loss and weights.
loss, weights = least_squares(y, tX)

## 4. Ridge regression with Normal Equations

#### Lambda

In [None]:
# TO CHECK
# NOTE(review): here cross_validation_ridge_regression is called with two
# arguments and unpacked into three values, whereas the cross-validation cell
# further down calls it with (y, tX, k_indices, k, lambdas) and unpacks two.
# Unless the helper supports both forms, one of the two calls fails — confirm
# against its definition (presumably in crossValidation.py).

# To evaluate the best lambda that minimizes the test error
loss, weights, best_lambda = cross_validation_ridge_regression(y,tX)

#### Cross Validation

In [None]:
# Ridge penalties handed to the cross-validation helper
# (presumably one per jet subset, as in the prediction cell — confirm)
lambdas = [0.01, 0.01, 0.01]

# Fold assignment of the training samples
k_fold = 2
k_indices = build_k_indices(y, k_fold, seed)

# Accuracies collected fold by fold
accs_train, accs_test = [], []
for fold in range(k_fold):
    fold_result = cross_validation_ridge_regression(y, tX, k_indices, fold, lambdas)
    acc_train, acc_test = fold_result
    accs_train += [acc_train]
    accs_test += [acc_test]

# Per-fold report
for fold, acc_train in enumerate(accs_train):
    print("%d - Training accuracy: %f / Test accuracy : %f" % (fold, acc_train, accs_test[fold]))

# Summary statistics of the test accuracy over the folds
mean_acc = np.mean(accs_test)
var_acc = np.var(accs_test)
min_acc = np.min(accs_test)
max_acc = np.max(accs_test)
print("\nAverage test accuracy: %f" % mean_acc)
print("Variance test accuracy: %f" % var_acc)
print("Min test accuracy: %f" % min_acc)
print("Max test accuracy: %f" % max_acc)

0 - Training accuracy: 0.782840 / Test accuracy : 0.783928
1 - Training accuracy: 0.782848 / Test accuracy : 0.781256

Average test accuracy: 0.782592
Variance test accuracy: 0.000002
Min test accuracy: 0.781256
Max test accuracy: 0.783928


## 5. Logistic Regression with Stochastic Gradient Descent


#### Cross Validation

In [None]:
# TODO: wrap this run in cross-validation.

# Random starting weights, one per column of tX, uniform in [0, 1).
# They are drawn from a generator seeded with the notebook-wide `seed`, so a
# fresh "Restart & Run All" starts the optimisation from the same point
# instead of a different one on every run.
initial_w = np.random.default_rng(seed).random(tX.shape[1])
batch_size = 1
max_iters = 10000
gamma = 0.0009

# Single fit on the full training set; the two returned values are bound to
# loss and weights.
loss, weights = logistic_regression(y, tX, initial_w, batch_size, max_iters, gamma)

## 6. Regularized Logistic Regression with Stochastic Gradient Descent

#### Optimal Lambda

In [None]:
# TO DO

#### Cross Validation

In [None]:
# TODO: wrap this run in cross-validation.

lambda_ = 0.001
# Random starting weights, one per column of tX, uniform in [0, 1).
# They are drawn from a generator seeded with the notebook-wide `seed`, so a
# fresh "Restart & Run All" starts the optimisation from the same point
# instead of a different one on every run.
initial_w = np.random.default_rng(seed).random(tX.shape[1])
batch_size = 1
max_iters = 1000
gamma = 0.1

# Single fit on the full training set; the two returned values are bound to
# loss and weights.
loss, weights = reg_logistic_regression(y, tX, lambda_, initial_w, batch_size,  max_iters, gamma)

# Prediction (file.run)
So far, the best accuracy has been obtained with ridge regression.

In [None]:
# One row mask per jet value, for the training and the test set
msks_jet_train = get_jet_masks(tX)
msks_jet_test = get_jet_masks(tX_test)

# Ridge penalty used for each subset
lambdas = [0.01, 0.01, 0.01]

# Final prediction, filled subset by subset
y_pred = np.zeros(tX_test.shape[0])

for idx in range(len(msks_jet_train)):
    # Masks are looked up by position so this works for any indexable container
    train_mask = msks_jet_train[idx]
    test_mask = msks_jet_test[idx]

    # Pre-process the two subsets together
    x_train, x_test = process_data(tX[train_mask], tX_test[test_mask], True)

    # Fit on the training subset, then label the matching test subset
    loss, weights = ridge_regression(y[train_mask], x_train, lambdas[idx])
    y_pred[test_mask] = predict_labels(weights, x_test)

In [None]:
# Count the test examples predicted as 1 and report their share.
# The ratio is multiplied by 100 because the message labels it with '%';
# printing the raw fraction showed e.g. "0.28 %" for a 28 % share.
higgs = np.count_nonzero(y_pred==1)
print(f'From {y_pred.shape[0]} test examples, {higgs} are 1, i.e. the {100 * higgs / y_pred.shape[0]:.2f} %')

From 568238 test examples, 161790 are 1, i.e. the 0.28472224666424983 %


#### Generate predictions and save output in csv format for submission

In [None]:
# Write the submission next to the input files. The directory is spelled
# 'Data/' (capital D) like every other path in this notebook; a lowercase
# 'data/' names a different directory on case-sensitive file systems.
OUTPUT_PATH = 'Data/RidgeRegression.csv'
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

# OTHERS (old)

### Unbalanced Dataset

In [None]:
# Count the training examples labelled 1 and report their share.
# The ratio is multiplied by 100 because the message labels it with '%';
# printing the raw fraction would understate the share by a factor of 100.
higgs = np.count_nonzero(y==1)
print(f'From {y.shape[0]} training examples, {higgs} are 1, i.e. the {100 * higgs / y.shape[0]:.2f} %')

# Random Over Sampling
#tX, y = Random_Over_Sampling(tX, y)

#higgs = np.count_nonzero(y==1)
#print(f'Applying Random Over Sampling: \nFrom {y.shape[0]} training examples, {higgs} are 1, i.e. the {higgs/y.shape[0]} %')

# Preprocessing

In [None]:
# Pre-process the full training and test matrices together; the keyword
# argument asks process_data for a constant column (going by its name).
# NOTE: this rebinds tX and tX_test, so running the cell a second time feeds
# already-processed arrays back into process_data.
tX, tX_test = process_data(tX, tX_test, add_constant_col=True)

# Cross Validation
IDEA: run this cross-validation for each of the methods above.

In [None]:
def cross_validation(y, x, k_indices, k, regression_method, **args):
    """
    Evaluate `regression_method` on one fold of a k-fold split.

    Row `k` of `k_indices` selects the held-out samples; all remaining rows,
    flattened, select the training samples. The method is invoked as
    `regression_method(y=..., tx=..., **args)` and must return a
    `(loss, weights)` pair. No pre-processing is applied to either split.

    Returns the pair `(acc_train, acc_test)`: the accuracy of the labels
    predicted from the fitted weights on the training and on the held-out
    samples.
    """
    # Row selectors of the two splits, training first
    rows = {
        "train": np.delete(k_indices, (k), axis=0).ravel(),
        "test": k_indices[k],
    }
    features = {split: x[selector, :] for split, selector in rows.items()}
    targets = {split: y[selector] for split, selector in rows.items()}

    # Fit on the training split only; the returned loss is not used
    _, weights = regression_method(y=targets["train"], tx=features["train"], **args)

    # Label both splits first, then score them
    predicted = {split: predict_labels(weights, features[split]) for split in rows}
    acc_train = compute_accuracy(predicted["train"], targets["train"])
    acc_test = compute_accuracy(predicted["test"], targets["test"])

    return acc_train, acc_test

In [None]:
# Method evaluated by the generic cross_validation helper
regression_method = ridge_regression

# Penalty forwarded to the method as the keyword argument lambda_
lambda_ = 0.0005

# Fold assignment of the training samples
k_fold = 2
k_indices = build_k_indices(y, k_fold, seed)

# Accuracies collected fold by fold
accs_train = []
accs_test = []
fold = 0
while fold < k_fold:
    acc_train, acc_test = cross_validation(
        y, tX, k_indices, fold, regression_method, lambda_=lambda_
    )
    accs_train.append(acc_train)
    accs_test.append(acc_test)
    fold += 1

# Per-fold report
for fold in range(len(accs_train)):
    fold_report = (fold, accs_train[fold], accs_test[fold])
    print("%d - Training accuracy: %f / Test accuracy : %f" % fold_report)

# Summary statistics of the test accuracy over the folds
test_scores = np.asarray(accs_test)
print("\nAverage test accuracy: %f" % np.mean(test_scores))
print("Variance test accuracy: %f" % np.var(test_scores))
print("Min test accuracy: %f" % np.min(test_scores))
print("Max test accuracy: %f" % np.max(test_scores))

0 - Training accuracy: 0.775480 / Test accuracy : 0.776096
1 - Training accuracy: 0.775888 / Test accuracy : 0.774656

Average test accuracy: 0.775376
Variance test accuracy: 0.000001
Min test accuracy: 0.774656
Max test accuracy: 0.776096


In [None]:
# Only for non logistic methods
# Uses whichever `weights` the most recently executed fitting cell left in the
# notebook namespace.
y_pred = predict_labels(weights, tX_test)

In [None]:
# Only for Logistic methods
# Turn the predicted probabilities into labels in {-1, 1} with one vectorised
# threshold, so every entry receives a label. Masking `< 0.5` and `> 0.5`
# separately leaves a probability of exactly 0.5 unchanged, which is not a
# valid label; such ties are assigned to -1 here. Labels stay floats, as in
# the in-place version.
probabilities = sigmoid(tX_test@weights)
y_pred = np.where(probabilities > 0.5, 1.0, -1.0)