In [1]:
# Useful starting lines
import numpy as np
from helpers import *
from methods import *
from process_data import *
from crossValidation import *

%load_ext autoreload
%autoreload 2

# Single seed shared by the k-fold split and by NumPy's global RNG, so that the
# randomly initialised weights used further down are reproducible after
# Restart Kernel -> Run All.
seed = 20
np.random.seed(seed)

# Load the dataset

In [2]:
# Read the training and test CSV files through the project helper load_csv_data.
# Each call is unpacked into three values; the first value returned for the test
# file is discarded by binding it to `_`.
# NOTE(review): `_` is also IPython's "last output" variable; assigning to it here
# shadows that shortcut for the rest of the session.
# NOTE(review): the directory is spelled 'Data/' here but 'data/' in the submission
# cell; on a case-sensitive filesystem only one spelling can be right.
y, tX, ids = load_csv_data('Data/train.csv')
_, tX_test, ids_test = load_csv_data('Data/test.csv')

# Preprocessing

### Unbalanced Dataset

In [222]:
# Count the training examples labelled 1 and report their share of the dataset.
# The share is formatted with the `%` presentation type so the printed number
# really is a percentage (the previous version printed a 0-1 fraction followed
# by a "%" sign).
higgs = np.count_nonzero(y==1)
print(f'From {y.shape[0]} training examples, {higgs} are 1, i.e. the {higgs / y.shape[0]:.2%}')

# Random Over Sampling (optional experiment, disabled by default)
#tX, y = Random_Over_Sampling(tX, y)

#higgs = np.count_nonzero(y==1)
#print(f'Applying Random Over Sampling: \nFrom {y.shape[0]} training examples, {higgs} are 1, i.e. the {higgs / y.shape[0]:.2%}')

From 250000 training examples, 85667 are 1, i.e. the 0.342668 %
Applying Random Over Sampling: 
From 328666 training examples, 164333 are 1, i.e. the 0.5 %


SUPERIDEA: SPLIT THE DATASET IN 4 CLASSES

### Other Ideas

1. New features: Apply a polynomial basis to all the X features

2. PCA, correlation analysis (scatterplot, VIF, ...), manage the 0s in the last feature

3. Outlier analysis, leverage, Cook's distance ...

In [203]:
# YOU DON'T NEED THIS FOR RIDGE
# (the jet-split ridge cell further down calls process_data on each subset itself)
# NOTE(review): this rebinds tX / tX_test to the processed arrays, so the cell is
# not idempotent: running it twice feeds already-processed data back into
# process_data. Re-run the loading cell first if it has to be executed again.

tX, tX_test = process_data(tX, tX_test, add_constant_col=True)

# Methods

## 1. Least Squares with Gradient Descent

In [None]:
# Gradient-descent settings: step size, iteration budget and an all-zero
# starting point with one weight per column of tX.
gamma = 0.005
max_iters = 200
initial_w = np.zeros(tX.shape[1])

loss, weights = least_squares_GD(
    y, tX, initial_w, max_iters, gamma
)

## 2. Least Squares with Stochastic Gradient Descent

In [None]:
# Stochastic variant: one sample per step, more iterations, same step size.
gamma = 0.005
max_iters = 1000
batch_size = 1
initial_w = np.zeros(tX.shape[1])

loss, weights = least_squares_SGD(
    y, tX, initial_w, batch_size, max_iters, gamma
)

## 3. Least Squares with Normal Equations 

In [68]:
# Fit with the project helper least_squares (no hyper-parameters are passed);
# its result is unpacked as (loss, weights).
loss, weights = least_squares(y, tX)

## 4. Ridge regression with Normal Equations

In [None]:
# TO CHECK
# NOTE(review): this cell has never been executed (no execution count) and its call
# does not match the one in the next cell, which passes
# (y, tX, k_indices, k, lambdas) and unpacks two accuracies. Either this cell targets
# an older signature or it needs updating before it can run - confirm against
# crossValidation.py.

# To evaluate the best lambda that minimizes the test error
loss, weights, best_lambda = cross_validation_ridge_regression(y,tX)

In [3]:
# Ridge penalties handed to cross_validation_ridge_regression
# (three identical values).
lambdas = [0.01, 0.01, 0.01]

# Fold assignment, driven by the notebook-wide seed.
k_fold = 2
k_indices = build_k_indices(y, k_fold, seed)

# Per-fold accuracies, in fold order.
accs_train, accs_test = [], []

for k in range(k_fold):
    acc_train, acc_test = cross_validation_ridge_regression(y, tX, k_indices, k, lambdas)
    accs_train += [acc_train]
    accs_test += [acc_test]

# One report line per fold ...
for i, (acc_train, acc_test) in enumerate(zip(accs_train, accs_test)):
    print("%d - Training accuracy: %f / Test accuracy : %f" % (i, acc_train, acc_test))

# ... followed by summary statistics over the held-out accuracies.
print("\nAverage test accuracy: %f" % np.mean(accs_test))
print("Variance test accuracy: %f" % np.var(accs_test))
print("Min test accuracy: %f" % np.min(accs_test))
print("Max test accuracy: %f" % np.max(accs_test))

0 - Training accuracy: 0.782840 / Test accuracy : 0.783928
1 - Training accuracy: 0.782848 / Test accuracy : 0.781256

Average test accuracy: 0.782592
Variance test accuracy: 0.000002
Min test accuracy: 0.781256
Max test accuracy: 0.783928


#### Prediction

In [138]:
# WITHOUT SPLITTING THE DATASET
# Single ridge fit on whatever tX currently holds; the result is unpacked as
# (loss, weights).
# NOTE(review): unlike the jet-split cell below, no process_data call happens
# here, so this presumably expects the global preprocessing cell to have been
# run first - confirm.

lambda_ = 0.01

loss, weights = ridge_regression(y, tX, lambda_)

In [4]:
# Boolean/index masks selecting each jet subset, built separately for the
# training and the test features.
msks_jet_train = get_jet_masks(tX)
msks_jet_test = get_jet_masks(tX_test)

# One ridge penalty per subset, looked up by subset index.
lambdas = [0.01, 0.01, 0.01]

# Full-length prediction vector; each subset fills in its own rows.
y_pred = np.zeros(tX_test.shape[0])

for idx in range(len(msks_jet_train)):
    train_mask = msks_jet_train[idx]
    test_mask = msks_jet_test[idx]

    # Pre-process the train/test rows of this subset together.
    x_train, x_test = process_data(tX[train_mask], tX_test[test_mask], True)
    y_train = y[train_mask]

    # Fit on the subset, then write its predictions back into place.
    loss, weights = ridge_regression(y_train, x_train, lambdas[idx])
    y_test_pred = predict_labels(weights, x_test)
    y_pred[test_mask] = y_test_pred

## 5. Logistic Regression with Stochastic Gradient Descent


In [None]:
# SGD settings for plain logistic regression; the starting weights are drawn
# uniformly from [0, 1), one per column of tX.
gamma = 0.0009
max_iters = 10000
batch_size = 1
initial_w = np.random.random(tX.shape[1])

loss, weights = logistic_regression(
    y, tX, initial_w, batch_size, max_iters, gamma
)

## 6. Regularized Logistic Regression with Stochastic Gradient Descent

In [None]:
# SGD settings for the penalised logistic model; the starting weights are drawn
# uniformly from [0, 1), one per column of tX.
gamma = 0.1
max_iters = 1000
batch_size = 1
lambda_ = 0.001
initial_w = np.random.random(tX.shape[1])

loss, weights = reg_logistic_regression(
    y, tX, lambda_, initial_w, batch_size, max_iters, gamma
)

# Cross Validation
IDEA: insert CV in each of the methods above

In [184]:
# Estimator under evaluation and the penalty forwarded to it by keyword.
regression_method = ridge_regression
lambda_ = 0.0005

# Fold assignment, driven by the notebook-wide seed.
k_fold = 2
k_indices = build_k_indices(y, k_fold, seed)

# Evaluate every fold, then separate the (train, test) accuracy pairs.
fold_results = [
    cross_validation(y, tX, k_indices, k, regression_method, lambda_=lambda_)
    for k in range(k_fold)
]
accs_train = [train_acc for train_acc, _test_acc in fold_results]
accs_test = [test_acc for _train_acc, test_acc in fold_results]

# Per-fold report.
for i in range(len(accs_train)):
    fold_line = "%d - Training accuracy: %f / Test accuracy : %f" % (i, accs_train[i], accs_test[i])
    print(fold_line)

# Summary statistics over the held-out accuracies.
print("\nAverage test accuracy: %f" % np.mean(accs_test))
print("Variance test accuracy: %f" % np.var(accs_test))
print("Min test accuracy: %f" % np.min(accs_test))
print("Max test accuracy: %f" % np.max(accs_test))

0 - Training accuracy: 0.775480 / Test accuracy : 0.776096
1 - Training accuracy: 0.775888 / Test accuracy : 0.774656

Average test accuracy: 0.775376
Variance test accuracy: 0.000001
Min test accuracy: 0.774656
Max test accuracy: 0.776096


# Prediction

#### Generate predictions and save output in csv format for submission

In [139]:
# Only for non logistic methods
# Uses whichever `weights` was assigned last. NOTE(review): after the jet-split
# cell, `weights` holds only the final subset's fit and y_pred is already filled
# in, so running this cell would overwrite those per-subset predictions.
y_pred = predict_labels(weights, tX_test)

In [36]:
# Only for Logistic methods
# Turn predicted probabilities into -1/+1 labels with a single threshold at 0.5.
# A probability of exactly 0.5 is mapped to +1; the previous two-mask version
# left such entries at 0.5, which is not a valid label. Non-finite
# probabilities (NaN) fail the comparison and are mapped to -1.
y_pred = np.where(sigmoid(tX_test @ weights) >= 0.5, 1.0, -1.0)

In [5]:
# Write the test ids and their predicted labels to the submission CSV via the
# project helper create_csv_submission.
# NOTE(review): the input files are read from 'Data/' while this writes to 'data/';
# on a case-sensitive filesystem these are different directories - confirm the
# intended spelling.
OUTPUT_PATH = 'data/FIRSTsubmission.csv' 
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

ATTENTION: we are predicting too FEW 1s

In [6]:
# Share of test examples predicted as 1. The value is formatted with the `%`
# presentation type so it is printed as a real percentage rather than a 0-1
# fraction followed by a "%" sign.
higgs = np.count_nonzero(y_pred==1)
print(f'From {y_pred.shape[0]} test examples, {higgs} are 1, i.e. the {higgs / y_pred.shape[0]:.2%}')

From 568238 test examples, 161790 are 1, i.e. the 0.28472224666424983 %
