In [1]:
from utils.proj1_helpers import load_csv_data, predict_labels, create_csv_submission, cross_validation_visualization, sort_predictions
from utils.preprocessing import adjust_features, nan_to_mean, standardize, split_jets
from utils.implementations import ridge_regression, logistic_regression, build_k_indices, cross_validation
import time
import numpy as np

In [2]:
# Input CSV datasets consumed by load_csv_data in the loading cell.
train_path, test_path = './data/train.csv', './data/test.csv'
# Output path; presumably the destination of the prediction CSV (not used in the visible cells).
OUTPUT = 'jet_pred/prediction.csv'

In [3]:
# Load the training and test CSV files and report the elapsed wall-clock time.
start_time = time.time()
print("Loading datasets")

# Each call returns three values, unpacked here as (labels, features, ids).
# NOTE(review): the meaning of each returned value is inferred from the variable names — confirm in proj1_helpers.
train_y, train_x, train_ids = load_csv_data(train_path)
test_y, test_x, idstest_ids = load_csv_data(test_path)

print("Datasets loaded in: {}".format(time.time() - start_time))

Loading datasets
Datasets loaded in: 23.776541709899902


In [4]:
# Split the train/test data with split_jets. The first three results are later
# indexed by `jet_to_train`, so they are per-jet collections.
# NOTE(review): the content of `ids` is not visible here — confirm in utils.preprocessing.
(x_jets_train, x_jets_test, y_jets_train, ids) = split_jets(
    train_x, train_y, test_x, test_y, idstest_ids
)

In [5]:
# Replace both per-jet feature collections with the output of adjust_features.
# NOTE(review): the exact transformation lives in utils.preprocessing — confirm there.
x_jets_train, x_jets_test = adjust_features(
    x_jets_train,
    x_jets_test,
)

In [122]:
# Settings for the cross-validation run on a single jet subset.
seed, k_fold = 33, 8  # both are forwarded to build_k_indices; k_fold is also the number of folds looped over
jet_to_train = 7      # index of the jet subset selected from the per-jet collections
degree = 4            # forwarded to cross_validation (presumably the polynomial expansion degree — confirm)
# 30 candidate regularisation strengths, log-spaced from 1e-9 to 1e-1.
lambdas = np.logspace(start=-9, stop=-1, num=30)

In [123]:
# Overwrite the selected jet subset's train/test features with the output of standardize.
# NOTE(review): which statistics standardize uses is not visible here — confirm in utils.preprocessing.
x_jets_train[jet_to_train], x_jets_test[jet_to_train] = standardize(
    x_jets_train[jet_to_train],
    x_jets_test[jet_to_train],
)

In [124]:
# Fold indices for the selected jet subset, plus the accumulators used by the lambda search.
k_indices = build_k_indices(y_jets_train[jet_to_train], k_fold, seed)
rmse_tr = []  # one mean training loss per evaluated lambda
rmse_te = []  # one mean test loss per evaluated lambda
# Start from +inf so the first evaluated lambda always becomes the incumbent.
# A finite sentinel (previously 50000) silently leaves best_lambda at 0 whenever
# every mean test loss exceeds it, which is possible because the loss values
# recorded for some jets are already in the tens of thousands.
best_loss = np.inf
best_lambda = 0

In [125]:
# Grid search over `lambdas`: for each candidate, average the train/test loss
# returned by cross_validation over the k folds and keep the lambda with the
# lowest mean test loss. rmse_tr / rmse_te receive one entry per lambda, so
# they stay aligned with `lambdas`.
for lambda_ in lambdas:
    temp_tr = np.zeros(k_fold)
    temp_te = np.zeros(k_fold)
    for k in range(k_fold):
        # `ws` (third return value) is not used by the search itself.
        tr_loss, te_loss, ws = cross_validation(
            y_jets_train[jet_to_train], x_jets_train[jet_to_train],
            k_indices, k, lambda_, degree)
        temp_tr[k] = tr_loss
        temp_te[k] = te_loss

    # Compute each fold average once and reuse it below.
    mean_tr = np.mean(temp_tr)
    mean_te = np.mean(temp_te)

    if mean_te < best_loss:
        best_loss = mean_te
        best_lambda = lambda_

    print("lambda = {}: mean test loss = {}, mean train loss = {} | "
          "best lambda so far = {} (mean test loss = {})".format(
              lambda_, mean_te, mean_tr, best_lambda, best_loss))

    rmse_tr.append(mean_tr)
    rmse_te.append(mean_te)
    # Intentionally no `break` here: an unconditional break stopped the search
    # after the first lambda, so best_lambda was always lambdas[0].

Loss after 0 iterations = 12542.4982322
Loss after 100 iterations = 12931.5009788
Loss after 200 iterations = 11348.5881245
Loss after 300 iterations = 11013.4454034
Loss after 400 iterations = 11302.5287797
Loss after 500 iterations = 13022.1582169
Loss after 600 iterations = 10915.9797169
Loss after 700 iterations = 10425.5586777
Loss after 800 iterations = 12742.133632
Loss after 900 iterations = 11034.318325
Loss after 1000 iterations = 11893.3481169
Loss after 1100 iterations = 10489.3460143
Loss after 1200 iterations = 10605.0016986
Loss after 1300 iterations = 11590.7248905
Loss after 1400 iterations = 10904.2436325
Loss after 1500 iterations = 11490.6416796
Loss after 1600 iterations = 10878.9840757
Loss after 1700 iterations = 11234.7508774
Loss after 1800 iterations = 11674.1803457
Loss after 1900 iterations = 10129.9688206
Loss after 2000 iterations = 10048.3046082
Loss after 2100 iterations = 11409.7499755
Loss after 2200 iterations = 10316.918371
Loss after 2300 iterations

KeyboardInterrupt: 

jet_to_train = 0
degree = 1
543.454079519
3788.08242136

jet_to_train = 0
degree = 2
527.606257202
3660.47960373

jet_to_train = 1
degree = 1
3945.51887954
27574.96887

jet_to_train = 1
degree = 2
3908.64952551
27278.0120927

jet_to_train = 1
degree = 3
BAD

jet_to_train = 2
degree = 1
234.197314724 
1630.67508124

jet_to_train = 2
degree = 2
233.081735551
1602.74241649

jet_to_train = 2
degree = 3
236.43029354
1564.26179544

jet_to_train = 3
degree = 1
4085.45459208
28568.0052354

jet_to_train = 3
degree = 2
4009.65515713
27994.504916

jet_to_train = 4
degree = 1
108.914269772
754.685716006

jet_to_train = 4
degree = 2
103.775478397
705.367011582

jet_to_train = 4
degree = 3
109.135289693
693.425231758

jet_to_train = 5
degree = 1
2437.15647835
17025.3416627

jet_to_train = 5
degree = 2
2381.56121261
16565.3445096

jet_to_train = 5
degree = 3
2347.78598357
16311.0713315

jet_to_train = 6
degree = 1
45.6583051767
314.975605086

jet_to_train = 6
degree = 2
44.5916296727
294.565592769

jet_to_train = 7
degree = 1
1164.10894967
8119.8504021

jet_to_train = 7
degree = 2
1118.08420289
7722.93721149

jet_to_train = 7
degree = 3
1094.32793334
7572.61130825

To understand how we obtain 83.10% accuracy, we should look at the model's performance at various stages. The stages are: testing on the raw data; the data divided into 4 categories, each representing a specific jet number; and the data divided into 8 categories, each representing a jet number with and without the mass feature.
The best results are obtained with 8 different classifiers in both models, because each classifier is then trained on a specific subset of the whole dataset, using the set of features that contributes specifically to that subset.
Further investigation is needed to understand why the logistic regression model underperforms the ridge regression model, even though we are dealing with a binary classification problem.