In [1]:
# Useful starting lines
import numpy as np
from helpers import *
from methods import *
from process_data import *
from crossvalidation import *
from select_parameter import *

# Reload imported modules automatically before each cell runs, so edits to the
# imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Seed passed to build_k_indices and the select_parameters_* helpers in the cells below.
seed=10

# Load the dataset

In [2]:
from zipfile import ZipFile

# Path of the zipped test set
file_name = 'Data/test.csv.zip'

# Open the archive read-only and unpack it into the directory the CSV files are loaded from.
# The handle is named `archive` so the built-in `zip` stays usable in later cells.
with ZipFile(file_name, 'r') as archive:
    archive.extractall('Data/')

In [3]:
# Load the training set.
# NOTE(review): the (labels, features, ids) return order is inferred from the variable names — confirm in helpers.
y, tX, ids = load_csv_data('Data/train.csv')
# The first return value for the test file is discarded into `_`.
# NOTE(review): `_` is later passed as an argument to cross_validation in the GD/SGD cells — confirm that is intended.
_, tX_test, ids_test = load_csv_data('Data/test.csv')

# Methods

## 1. Least Squares with Gradient Descent

#### Cross Validation

In [None]:
# Degree polynomial expansion
# NOTE(review): four entries — presumably one degree per data subset; confirm against cross_validation.
degrees = [10,10,10,10]

# Model parameters
max_iters = 5000
gamma = 0.005


# Split data in k-fold
k_fold = 2
k_indices = build_k_indices(y, k_fold, seed)


# Per-fold accuracies, filled by the loop below
accs_train = []
accs_test = []

# NOTE(review): `alpha` is not assigned in this cell; its first assignment in this notebook is in the
# "Search for hyperparameters: Degrees" cell further down. Unless a star-imported module provides it,
# this call depends on out-of-order execution. `_` is whatever that name is bound to at run time.
for k in range(k_fold):
    acc_train, acc_test = cross_validation(y, tX, least_squares_GD, _, k_indices, k, degrees, alpha,
                                           max_iters=max_iters, gamma=gamma)
    accs_train.append(acc_train)
    accs_test.append(acc_test)
    
# One line per fold, then summary statistics of the test accuracy across folds
for i in range(len(accs_train)):
    print("%d - Training accuracy: %f / Test accuracy : %f" % (i, accs_train[i], accs_test[i]))

print("\nAverage test accuracy: %f" % np.mean(accs_test))
print("Variance test accuracy: %f" % np.var(accs_test))
print("Min test accuracy: %f" % np.min(accs_test))
print("Max test accuracy: %f" % np.max(accs_test))

0 - Training accuracy: 0.780448 / Test accuracy : 0.781016
1 - Training accuracy: 0.779968 / Test accuracy : 0.778256

Average test accuracy: 0.779636
Variance test accuracy: 0.000002
Min test accuracy: 0.778256
Max test accuracy: 0.781016


## 2. Least Squares with Stochastic Gradient Descent

#### Cross Validation

In [None]:
# Degree polynomial expansion
# NOTE(review): four entries — presumably one degree per data subset; confirm against cross_validation.
degrees = [10,10,10,10]

# Model parameters
max_iters = 1000
gamma = 0.005
batch_size=1


# Split data in k-fold
k_fold = 2
k_indices = build_k_indices(y, k_fold, seed)


# Per-fold accuracies, filled by the loop below
accs_train = []
accs_test = []

# NOTE(review): `alpha` is not assigned in this cell; its first assignment in this notebook is in the
# "Search for hyperparameters: Degrees" cell further down. Unless a star-imported module provides it,
# this call depends on out-of-order execution. `_` is whatever that name is bound to at run time.
for k in range(k_fold):
    acc_train, acc_test = cross_validation(y, tX, least_squares_SGD, _, k_indices, k, degrees, alpha, 
                                           max_iters=max_iters, gamma=gamma, batch_size=batch_size)
    accs_train.append(acc_train)
    accs_test.append(acc_test)
    
# One line per fold, then summary statistics of the test accuracy across folds
for i in range(len(accs_train)):
    print("%d - Training accuracy: %f / Test accuracy : %f" % (i, accs_train[i], accs_test[i]))

print("\nAverage test accuracy: %f" % np.mean(accs_test))
print("Variance test accuracy: %f" % np.var(accs_test))
print("Min test accuracy: %f" % np.min(accs_test))
print("Max test accuracy: %f" % np.max(accs_test))

## 3. Least Squares with Normal Equations 

#### Search for hyperparameters: Degrees.

In [None]:
# Model parameters for least squares
#tuning parameters for each category
# Candidate polynomial degrees handed to the search helper
degrees_candidates = [9,10,11,12,13,14]
# NOTE(review): meaning of `alpha` is defined by select_parameters_least_squares / process_data — confirm there.
alpha = 0
k_fold = 3
par_degree, par_lambda, accu = select_parameters_least_squares(y,tX,degrees_candidates,alpha,k_fold,seed)
# Display the selected degrees and the matching accuracy (par_lambda is returned but not shown)
par_degree, accu

#### Cross Validation

In [11]:
# Degree polynomial expansion
#degrees = [8,11,9]
degrees = [7,10,9,9]
#degrees = [5,7,6,6]
# NOTE(review): `eta` is assigned here but not used anywhere in this cell.
eta=[10,10,14,14]

# NOTE(review): `alphas` has three entries while `degrees` has four — confirm which length cross_validation expects.
alphas = [0,0,0]

# Split data in k-fold
k_fold = 3
k_indices = build_k_indices(y, k_fold, seed)


# Per-fold accuracies, filled by the loop below
accs_train = []
accs_test = []

# NOTE(review): the saved output of this cell stops after printing fold 0 with
# "LinAlgError: Singular matrix", so the summary below was never produced with these settings.
for k in range(k_fold):
    print(k)
    acc_train, acc_test = cross_validation(y, tX, least_squares, k_indices, k, degrees, alphas)
    accs_train.append(acc_train)
    accs_test.append(acc_test)
    
# One line per fold, then summary statistics of the test accuracy across folds
for i in range(len(accs_train)):
    print("%d - Training accuracy: %f / Test accuracy : %f" % (i, accs_train[i], accs_test[i]))

print("\nAverage test accuracy: %f" % np.mean(accs_test))
print("Variance test accuracy: %f" % np.var(accs_test))
print("Min test accuracy: %f" % np.min(accs_test))
print("Max test accuracy: %f" % np.max(accs_test))

0


LinAlgError: Singular matrix

## 4. Ridge regression with Normal Equations

#### Search for hyperparameters: Lambdas, Degrees

In [44]:
# Model parameters for ridge regression
#tuning parameters for each category
# Candidate grids handed to select_parameters_ridge_regression
degrees_candidates = [6,7,8,9,10]
alphas_candidates=[2,3,4,5,6]
# Seven log-spaced values from 1e-7 to 1e-1
lambdas_candidates = np.logspace(-7,-1,7)
#lambdas_candidates=[1e-5]


k_fold = 2
par_degree, par_lambda, par_alpha, accu = select_parameters_ridge_regression(y,tX,degrees_candidates,lambdas_candidates,
                                                                  alphas_candidates,k_fold,seed)
# Display the selected values; in the saved output each of the four is a list with three entries.
par_degree, par_lambda, par_alpha, accu

6 1e-07 2
0.8521899271358796 0.8501080951237089
6 1e-07 2
0.8527904556009288 0.8501481303547122
6 1e-07 3
0.8521899271358796 0.8502882536632237
6 1e-07 3
0.8527504203699255 0.8503483065097286
6 1e-07 4
0.8519096805188566 0.85048842981824
6 1e-07 4
0.8527103851389223 0.8505484826647449
6 1e-07 5
0.8516894867483386 0.85048842981824
6 1e-07 5
0.8527304027544239 0.8503483065097286
6 1e-07 6
0.8517895748258467 0.8504083593562335
6 1e-07 6
0.8526903675234206 0.8497878132756826
6 1e-06 2
0.8521298742893747 0.8501881655857154
6 1e-06 2
0.852670349907919 0.8501080951237089
6 1e-06 3
0.8521498919048763 0.8503683241252302
6 1e-06 3
0.8525502442149091 0.8503483065097286
6 1e-06 4
0.8520498038273681 0.85048842981824
6 1e-06 4
0.852670349907919 0.8504283769717351
6 1e-06 5
0.8518496276723516 0.8503883417407319
6 1e-06 5
0.8525902794459124 0.8502882536632237
6 1e-06 6
0.851889662903355 0.8503883417407319
6 1e-06 6
0.8526903675234206 0.8497878132756826
6 1e-05 2
0.8513491872848107 0.8501681479702138
6

0.852169909520378 0.850048042277204
8 0.0001 4
0.8513491872848107 0.8507286412042597
8 0.0001 4
0.8520898390583713 0.8504083593562335
8 0.0001 5
0.8516294339018337 0.8505084474337417
8 0.0001 5
0.8520498038273681 0.8501080951237089
8 0.0001 6
0.8513491872848107 0.850208183201217
8 0.0001 6
0.8523100328288894 0.8498678837376892
8 0.001 2
0.8511289935142926 0.8469653294899512
8 0.001 2
0.852169909520378 0.8499479541996957
8 0.001 3
0.8507286412042597 0.8493674433501481
8 0.001 3
0.8516894867483386 0.850268236047722
8 0.001 4
0.8511690287452959 0.8499079189686924
8 0.001 4
0.8516294339018337 0.8503082712787253
8 0.001 5
0.8512090639762991 0.8496677075826727
8 0.001 5
0.851389222515814 0.8496877251981744
8 0.001 6
0.8510689406677876 0.8497277604291776
8 0.001 6
0.8514292577468172 0.8492473376571383
8 0.01 2
0.8501481303547122 0.8480462807270398
8 0.01 2
0.8506886059732565 0.8496276723516695
8 0.01 3
0.8498879013531908 0.8487268796540957
8 0.01 3
0.8505084474337417 0.8495275842741613
8 0.01

0.8485066858835776 0.8480662983425414
10 0.1 5
0.8482464568820562 0.8469853471054528
10 0.1 5
0.8478861398030266 0.8480863159580431
10 0.1 6
0.8479261750340299 0.8471855232604693
10 0.1 6
0.8477259988790136 0.8480262631115382
6 1e-07 2
0.822810275456515 0.8164396987516764
6 1e-07 2
0.8203342618384402 0.8166460332198494
6 1e-07 3
0.8227844836479934 0.816671825028371
6 1e-07 3
0.820669555349221 0.8172392448158465
6 1e-07 4
0.822810275456515 0.8168781594965439
6 1e-07 4
0.8205148044980914 0.8171876611988033
6 1e-07 5
0.8230939853502528 0.8168523676880223
6 1e-07 5
0.8207211389662643 0.8167492004539358
6 1e-07 6
0.8233519034354689 0.8171876611988033
6 1e-07 6
0.8210306406685237 0.8165944496028061
6 1e-06 2
0.8225781491798205 0.8164396987516764
6 1e-06 2
0.8203600536469617 0.8165944496028061
6 1e-06 3
0.822810275456515 0.8170329103476736
6 1e-06 3
0.8207727225833076 0.8176003301351491
6 1e-06 4
0.8230681935417311 0.8169555349221087
6 1e-06 4
0.8209274734344372 0.8173166202414113
6 1e-06 5
0

0.8237903641803364 0.8162333642835036
8 1e-05 6
0.821211183328175 0.8159496543897659
8 0.0001 2
0.8228618590735582 0.8162333642835036
8 0.0001 2
0.8205663881151346 0.8154338182193336
8 0.0001 3
0.8226039409883421 0.8168523676880223
8 0.0001 3
0.8202568864128753 0.8175745383266274
8 0.0001 4
0.8233003198184257 0.8166976168368926
8 0.0001 4
0.8205663881151346 0.8173939956669761
8 0.0001 5
0.8238677396059012 0.8162849479005467
8 0.0001 5
0.8203084700299185 0.8174455792840194
8 0.0001 6
0.8236356133292067 0.8158980707727226
8 0.0001 6
0.8210822242855669 0.8170329103476736
8 0.001 2
0.8218559785412153 0.8160528216238523
8 0.001 2
0.8200505519447023 0.816001238006809
8 0.001 3
0.8226039409883421 0.8165170741772413
8 0.001 3
0.8200247601361808 0.8174197874754978
8 0.001 4
0.8226555246053854 0.8172392448158465
8 0.001 4
0.819741050242443 0.8171102857732384
8 0.001 5
0.8228618590735582 0.8174455792840194
8 0.001 5
0.8204116372640049 0.8167749922624574
8 0.001 6
0.8227844836479934 0.816671825028

0.8166202414113277 0.8140410605591664
10 0.1 2
0.8154338182193336 0.8111781698132673
10 0.1 3
0.8171102857732384 0.8141442277932529
10 0.1 3
0.8151501083255958 0.8137573506654286
10 0.1 4
0.8168523676880223 0.8149695656659445
10 0.1 4
0.8157175281130713 0.8135510161972558
10 0.1 5
0.8174197874754978 0.8148148148148148
10 0.1 5
0.8166202414113277 0.8133962653461261
10 0.1 6
0.8180129990714949 0.814866398431858
10 0.1 6
0.816800784070979 0.814531104921077
6 1e-07 2
0.855449256982162 0.8478674423092829
6 1e-07 2
0.8566623473298227 0.848666979129332
6 1e-07 3
0.8556146783932067 0.8487496898348543
6 1e-07 3
0.8571861817981308 0.8486945493645061
6 1e-07 4
0.8563039342725594 0.8492735243031623
6 1e-07 4
0.8565244961539522 0.8489151112458989
6 1e-07 5
0.8567726282705191 0.8494113754790328
6 1e-07 5
0.8561936533318629 0.8493286647735105
6 1e-07 6
0.8578478674423092 0.8480052934851534
6 1e-07 6
0.8571034710926084 0.8496043671252516
6 1e-06 2
0.8551184141600728 0.8477847316037606
6 1e-06 2
0.8564

0.8575170246202201 0.8488599707755508
8 1e-05 4
0.8564693556836039 0.8481431446610239
8 1e-05 4
0.8570759008574343 0.8492459540679882
8 1e-05 5
0.856993190151912 0.8482258553665463
8 1e-05 5
0.8575721650905682 0.8488324005403766
8 1e-05 6
0.8580132888533539 0.8485291279534615
8 1e-05 6
0.8576824460312646 0.8494113754790328
8 0.0001 2
0.8549254225138541 0.8484464172479391
8 0.0001 2
0.8557525295690772 0.848391276777591
8 0.0001 3
0.8560006616856441 0.8496870778307739
8 0.0001 3
0.8566623473298227 0.8490253921865953
8 0.0001 4
0.8560833723911665 0.8491632433624658
8 0.0001 4
0.8564969259187781 0.8491081028921177
8 0.0001 5
0.8565244961539522 0.8487496898348543
8 0.0001 5
0.8568001985056932 0.8490529624217694
8 0.0001 6
0.8573240329740013 0.8485566981886355
8 0.0001 6
0.8566623473298227 0.8491356731272918
8 0.001 2
0.8542637368696755 0.8493286647735105
8 0.001 2
0.8549529927490281 0.8481982851313722
8 0.001 3
0.8546221499269389 0.8500454908880373
8 0.001 3
0.8554216867469879 0.84897025171

0.8528300846406219 0.8495216564197292
10 0.01 4
0.855173554630421 0.8487496898348543
10 0.01 5
0.8532160679330595 0.8503487634749525
10 0.01 5
0.8554768272173362 0.8481431446610239
10 0.01 6
0.8529403655813184 0.8505968955915194
10 0.01 6
0.8549529927490281 0.8486118386589837
10 0.1 2
0.8482258553665463 0.8469576245485374
10 0.1 2
0.8498249290066444 0.8458823853767473
10 0.1 3
0.8487772600700284 0.8459650960822696
10 0.1 3
0.8497973587714703 0.8469576245485374
10 0.1 4
0.8496870778307739 0.846075377022966
10 0.1 4
0.848942681481073 0.8462683686691848
10 0.1 5
0.8496870778307739 0.8470954757244079
10 0.1 5
0.8500454908880373 0.8467646329023186
10 0.1 6
0.8499627801825149 0.8469576245485374
10 0.1 6
0.850762317002564 0.8472884673706267


([7.0, 8.0, 6.0],
 [1e-06, 0.001, 0.01],
 [4.0, 5.0, 6.0],
 [0.8509488349747778, 0.8174455792840194, 0.8508725979432604])

#### Cross Validation

In [4]:
# Process data parameters
# These three lists repeat the values shown in the saved output of the hyperparameter search above.
degrees=[7, 8, 6]
alphas=[4.0, 5.0, 6.0]

# Model parameters
lambdas=[1e-06, 0.001, 0.01]

# Split data in k-fold
k_fold = 3
k_indices = build_k_indices(y, k_fold, seed)

# Per-fold accuracies, filled by the loop below
accs_train = []
accs_test = []

for k in range(k_fold):
    # Fold counter printed as a progress indicator
    print(k)
    acc_train, acc_test = cross_validation_jet(y, tX, ridge_regression, k_indices, k, degrees, alphas, lambdas)
    accs_train.append(acc_train)
    accs_test.append(acc_test)
    
# One line per fold, then summary statistics of the test accuracy across folds
for i in range(len(accs_train)):
    print("%d - Training accuracy: %f / Test accuracy : %f" % (i, accs_train[i], accs_test[i]))

print("\nAverage test accuracy: %f" % np.mean(accs_test))
print("Variance test accuracy: %f" % np.var(accs_test))
print("Min test accuracy: %f" % np.min(accs_test))
print("Max test accuracy: %f" % np.max(accs_test))

0
0.8421993687974751 0.8400513602054408
1
0.8428293713174853 0.8397393589574358
2
0.8423313693254773 0.8412273649094596
0 - Training accuracy: 0.842199 / Test accuracy : 0.840051
1 - Training accuracy: 0.842829 / Test accuracy : 0.839739
2 - Training accuracy: 0.842331 / Test accuracy : 0.841227

Average test accuracy: 0.840339
Variance test accuracy: 0.000000
Min test accuracy: 0.839739
Max test accuracy: 0.841227


## 5. Logistic Regression with Stochastic Gradient Descent


#### Cross Validation

In [None]:
# Degree polynomial expansion
# NOTE(review): four entries — presumably one degree per data subset; confirm against cross_validation.
degrees = [7,10,9,9]

# Model parameters
max_iters = 5000
gamma = 0.005
batch_size = 1

# Split data in k-fold
k_fold = 2
k_indices = build_k_indices(y, k_fold, seed)


# Per-fold accuracies, filled by the loop below
accs_train = []
accs_test = []

# Everything after `k` is passed by keyword here, unlike the other cross_validation calls in this notebook.
for k in range(k_fold):
    acc_train, acc_test = cross_validation(y, tX, logistic_regression, k_indices, k, batch_size=batch_size, 
                                           max_iters=max_iters, gamma=gamma, degrees=degrees)
    accs_train.append(acc_train)
    accs_test.append(acc_test)
    
# One line per fold, then summary statistics of the test accuracy across folds
for i in range(len(accs_train)):
    print("%d - Training accuracy: %f / Test accuracy : %f" % (i, accs_train[i], accs_test[i]))

print("\nAverage test accuracy: %f" % np.mean(accs_test))
print("Variance test accuracy: %f" % np.var(accs_test))
print("Min test accuracy: %f" % np.min(accs_test))
print("Max test accuracy: %f" % np.max(accs_test))

7.997182550126146 0 /5000
0.7524469806959074 1000 /5000
0.4363734465688057 2000 /5000
0.400023839876429 3000 /5000
0.40076385746875537 4000 /5000
10.371044956652037 0 /5000
1.5550707519015254 1000 /5000
0.661663323193932 2000 /5000
0.5936779549816508 3000 /5000
0.5260682969312651 4000 /5000
8.442467521375363 0 /5000
1.7929335626679517 1000 /5000
0.7223707415043099 2000 /5000
0.5914451124506169 3000 /5000
0.5454690010169485 4000 /5000
12.029170671306018 0 /5000
2.0525746152729285 1000 /5000
0.8818547205280977 2000 /5000
0.6671236655053144 3000 /5000
0.5709404738188759 4000 /5000
6.689309726685983 0 /5000
0.7984062586900572 1000 /5000
0.4669079564079578 2000 /5000
0.4200246548981903 3000 /5000
0.4292098025507569 4000 /5000
9.748515728107092 0 /5000
1.2804452477863466 1000 /5000
0.6366004683073873 2000 /5000
0.5702327086476892 3000 /5000
0.5406348712757687 4000 /5000
10.207457083659229 0 /5000
1.8709968020623324 1000 /5000
0.8871537214879525 2000 /5000
0.6549933484898737 3000 /5000
0.6273

## 6. Regularized Logistic Regression with Stochastic Gradient Descent

#### Optimal Lambda

In [None]:
# TO DO

#### Cross Validation

In [None]:
# TO DO: wrap this fit in k-fold cross-validation like the sections above;
# for now it is a single fit on the full training set.

# Regularisation strength and SGD settings
lambda_ = 0.001
batch_size = 1
max_iters = 1000
gamma = 0.1

# Draw the starting weights (uniform in [0, 1), one per feature column) from a generator
# seeded with the notebook-wide `seed`, so the starting point is the same on every run
# instead of depending on the unseeded global RNG.
initial_w = np.random.default_rng(seed).random(tX.shape[1])

loss, weights = reg_logistic_regression(y, tX, lambda_, initial_w, batch_size,  max_iters, gamma)

# Prediction (file.run)
So far, the best accuracy is obtained with ridge regression.

In [54]:
# Split data in subsets corresponding to a jet value
msks_jet_train = get_jet_masks(tX)
msks_jet_test = get_jet_masks(tX_test)

# Process data parameters
degrees=[7, 8, 6]
alphas=[4.0, 5.0, 6.0]

# Model parameters
lambdas=[1e-06, 0.001, 0.01]

# Vector to store the final prediction
y_pred = np.zeros(tX_test.shape[0])

for idx in range(len(msks_jet_train)):
    x_train = tX[msks_jet_train[idx]]
    x_test = tX_test[msks_jet_test[idx]]
    y_train = y[msks_jet_train[idx]]

    # Pre-processing of data
    x_train, x_test = process_data(x_train, x_test, alphas[idx])
    # Trasform the data and add an intercepta
    x_train, x_test = phi(x_train, x_test, degrees[idx])

    weights, loss = ridge_regression(y_train, x_train, lambdas[idx])

    y_test_pred = predict_labels(weights, x_test)

    y_pred[msks_jet_test[idx]] = y_test_pred

In [55]:
# Count the test samples predicted as 1 and report their share of all predictions.
higgs = np.count_nonzero(y_pred==1)
# `:.2%` multiplies the ratio by 100 and appends '%', so the printed figure is a real
# percentage rather than a 0-1 fraction followed by a '%' sign.
print(f'From {y_pred.shape[0]} test examples, {higgs} are 1, i.e. the {higgs/y_pred.shape[0]:.2%}')

From 568238 test examples, 177959 are 1, i.e. the 0.3131768730707908 %


#### Generate predictions and save output in csv format for submission

In [56]:
# Write the submission into the same directory the input files are read from. That directory
# is spelled 'Data/' (capital D) in every other path in this notebook, and a lowercase 'data/'
# is a different directory on case-sensitive file systems.
OUTPUT_PATH = 'Data/RidgeRegression.csv'
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

In [57]:
# Keep a second name for the ridge-regression predictions.
# This binds the same array object (no copy): in-place changes to y_pred are visible through y_pred1,
# while re-assigning y_pred in a later cell leaves y_pred1 pointing at this result.
y_pred1=y_pred

# OTHERS (old)

### Unbalanced Dataset

In [None]:
# Count the training samples labelled 1 and report their share of the training set.
higgs = np.count_nonzero(y==1)
# `:.2%` multiplies the ratio by 100 and appends '%', so the printed figure is a real
# percentage rather than a 0-1 fraction followed by a '%' sign.
print(f'From {y.shape[0]} training examples, {higgs} are 1, i.e. the {higgs/y.shape[0]:.2%}')

# Random Over Sampling
#tX, y = Random_Over_Sampling(tX, y)

#higgs = np.count_nonzero(y==1)
#print(f'Applying Random Over Sampling: \nFrom {y.shape[0]} training examples, {higgs} are 1, i.e. the {higgs/y.shape[0]:.2%}')

# Preprocessing

In [None]:
# Overwrites tX and tX_test with their processed versions, so re-running this cell
# processes already-processed data.
# NOTE(review): the prediction cell calls process_data(x_train, x_test, alphas[idx]) with a third
# positional argument and no add_constant_col keyword — confirm this call matches the current signature.
tX, tX_test = process_data(tX, tX_test, add_constant_col=True)

# Cross Validation
IDEA: insert CV in each of the methods above

In [None]:
def cross_validation(y, x, k_indices, k, regression_method, **args):
    """
    Run one fold of k-fold cross-validation with the given regression method.

    Fold `k` of `k_indices` is held out as the test split; the rows listed in
    all other folds form the training split. No pre-processing is applied:
    `x` is used as given.

    Parameters
    ----------
    y : array of labels, indexed by the entries of `k_indices`.
    x : 2-D feature array whose rows are indexed by the entries of `k_indices`.
    k_indices : 2-D array of row indices, one row per fold.
    k : index of the fold to hold out.
    regression_method : callable invoked as
        `regression_method(y=y_train, tx=x_train, **args)`. It must return a
        pair holding the fitted weights and the loss, in either order.
    **args : extra keyword arguments forwarded to `regression_method`.

    Returns
    -------
    (acc_train, acc_test) : the values of `compute_accuracy` on the training
        split and on the held-out split.
    """
    # get k'th subgroup in test, others in train
    msk_test = k_indices[k]
    msk_train = np.delete(k_indices, k, axis=0).ravel()

    x_train, y_train = x[msk_train, :], y[msk_train]
    x_test, y_test = x[msk_test, :], y[msk_test]

    # Cells in this notebook unpack regression results both as (loss, weights) and as
    # (weights, loss). Accept either order: the weight vector is the element with more
    # entries than the scalar loss. On a tie the second element is used, which keeps the
    # historical (loss, weights) behaviour.
    first, second = regression_method(y=y_train, tx=x_train, **args)
    weights = first if np.size(first) > np.size(second) else second

    # compute accuracy of the predicted labels for train and test data
    acc_train = compute_accuracy(predict_labels(weights, x_train), y_train)
    acc_test = compute_accuracy(predict_labels(weights, x_test), y_test)

    return acc_train, acc_test

In [None]:
# Method handed to the cross_validation function defined in the cell above
# (signature: y, x, k_indices, k, regression_method, **args).
regression_method = ridge_regression

# Model parameters
lambda_ = 0.0005

# Split data in k-fold
k_fold = 2
k_indices = build_k_indices(y, k_fold, seed)


# Per-fold accuracies, filled by the loop below
accs_train = []
accs_test = []

for k in range(k_fold):
    # lambda_ travels through **args to regression_method
    acc_train, acc_test = cross_validation(y, tX, k_indices, k, regression_method, lambda_=lambda_)
    accs_train.append(acc_train)
    accs_test.append(acc_test)
    
# One line per fold, then summary statistics of the test accuracy across folds
for i in range(len(accs_train)):
    print("%d - Training accuracy: %f / Test accuracy : %f" % (i, accs_train[i], accs_test[i]))

print("\nAverage test accuracy: %f" % np.mean(accs_test))
print("Variance test accuracy: %f" % np.var(accs_test))
print("Min test accuracy: %f" % np.min(accs_test))
print("Max test accuracy: %f" % np.max(accs_test))

0 - Training accuracy: 0.775480 / Test accuracy : 0.776096
1 - Training accuracy: 0.775888 / Test accuracy : 0.774656

Average test accuracy: 0.775376
Variance test accuracy: 0.000001
Min test accuracy: 0.774656
Max test accuracy: 0.776096


In [None]:
# TO CHECK

# To evaluate the best lambda that minimizes the test error
# NOTE(review): the result is unpacked as (loss, weights, best_lambda), while the prediction cell
# unpacks ridge_regression as (weights, loss) — confirm the return order of this helper.
loss, weights, best_lambda = cross_validation_ridge_regression(y,tX)

In [None]:
# Only for non logistic methods
# Uses whatever `weights` was bound to by the most recently run fitting cell.
y_pred = predict_labels(weights, tX_test)

In [None]:
# Only for Logistic methods
# Turn the predicted probabilities into labels in {-1, 1} in a single pass.
# Every entry gets a label: a probability above 0.5 becomes 1, everything else (including
# exactly 0.5) becomes -1, so no raw probability value can remain in the label vector.
y_pred = np.where(sigmoid(tX_test@weights) > 0.5, 1.0, -1.0)