In [1]:
from utils.proj1_helpers import load_csv_data, predict_labels, create_csv_submission, cross_validation_visualization, sort_predictions
from utils.preprocessing import adjust_features, nan_to_mean, standardize, split_jets
from utils.implementations import ridge_regression, logistic_regression, build_k_indices, cross_validation
import time
import numpy as np

In [2]:
# Input CSVs, relative to the directory the notebook is run from.
train_path = "./data/train.csv"
test_path = "./data/test.csv"

# Path for the predictions CSV.
OUTPUT = "jet_pred/prediction.csv"

In [3]:
# Time the CSV loading so the cost of this cell is visible in its output.
start_time = time.time()
print("Loading datasets")

# load_csv_data returns three values per file; judging by the names they are
# labels, features and ids, in that order (TODO confirm against proj1_helpers).
# NOTE: the name `idstest_ids` is kept as-is because the jet-splitting cell reads it.
train_y, train_x, train_ids = load_csv_data(train_path)
test_y, test_x, idstest_ids = load_csv_data(test_path)

print("Datasets loaded in: {}".format(time.time() - start_time))

Loading datasets
Datasets loaded in: 23.776541709899902


In [4]:
# Split train and test data into per-jet collections. The cells below index the
# results by `jet_to_train`, so each returned object is indexable by jet group.
(
    x_jets_train,
    x_jets_test,
    y_jets_train,
    ids,
) = split_jets(train_x, train_y, test_x, test_y, idstest_ids)

In [5]:
# Run the project's feature adjustment on the train and test jet collections together.
# NOTE: this rebinds both names, so re-running the cell applies the adjustment again.
(x_jets_train, x_jets_test) = adjust_features(
    x_jets_train,
    x_jets_test,
)

In [102]:
# Which jet subset to work on and the polynomial degree passed to cross_validation.
jet_to_train = 7
degree = 1

# Cross-validation setup: number of folds and the seed used to build them.
k_fold = 8
seed = 33

# Candidate regularisation strengths: 30 log-spaced values from 1e-9 to 1e-1.
lambdas = np.logspace(start=-9, stop=-1, num=30)

In [103]:
# Standardize only the selected jet subset, train and test together, and store
# the results back into the per-jet collections.
# NOTE: this overwrites the stored subsets, so re-running the cell standardizes
# already-standardized data.
(
    x_jets_train[jet_to_train],
    x_jets_test[jet_to_train],
) = standardize(
    x_jets_train[jet_to_train],
    x_jets_test[jet_to_train],
)

In [104]:
# Fold indices for the selected jet subset, built from its labels, the fold
# count and the seed (presumably the seed makes the split repeatable; confirm
# in utils.implementations.build_k_indices).
k_indices = build_k_indices(y_jets_train[jet_to_train], k_fold, seed)

# Mean train / test loss for each lambda evaluated by the sweep cell.
rmse_tr = []
rmse_te = []

# Start from +inf rather than a magic threshold (was 50000): with a finite
# sentinel, a jet whose mean test loss is always above it would never update
# `best_lambda`, which would silently stay at its placeholder value.
best_loss = np.inf
best_lambda = 0

In [None]:
# Sweep every candidate lambda: run k-fold cross-validation for each one, record
# the mean train/test loss, and remember the lambda with the lowest mean test loss.
#
# The loop used to end with an unconditional `break`, so only lambdas[0] was ever
# evaluated: `best_lambda` was always the first candidate and rmse_tr / rmse_te
# held a single entry. The break is removed so the sweep actually covers `lambdas`.
# This makes the cell proportionally slower (len(lambdas) * k_fold trainings).
for lambda_ in lambdas:
    temp_tr = np.zeros(k_fold)
    temp_te = np.zeros(k_fold)
    for k in range(k_fold):
        # `ws` (the fitted weights of the last fold) is not used in this cell; the
        # name is kept because it is a notebook-level variable other cells may read.
        tr_loss, te_loss, ws = cross_validation(y_jets_train[jet_to_train], x_jets_train[jet_to_train], k_indices, k, lambda_, degree)
        temp_tr[k] = tr_loss
        temp_te[k] = te_loss

    # Average over folds once and reuse the values below.
    mean_tr = np.mean(temp_tr)
    mean_te = np.mean(temp_te)
    print(mean_te)
    print(mean_tr)

    if mean_te < best_loss:
        best_loss = mean_te
        best_lambda = lambda_
    print("After lambdas iteration, the best lambda is : " + str(best_lambda) + " for Lambda : " + str(lambda_) + " with best loss = " + str(best_loss))

    rmse_tr.append(mean_tr)
    rmse_te.append(mean_te)

Loss after 0 iterations = 12542.4982322
Loss after 100 iterations = 11156.2113406
Loss after 200 iterations = 10837.1688132
Loss after 300 iterations = 10643.4696134
Loss after 400 iterations = 10487.5327036
Loss after 500 iterations = 10354.6962542
Loss after 600 iterations = 10239.1328158
Loss after 700 iterations = 10137.1285404
Loss after 800 iterations = 10046.0058978
Loss after 900 iterations = 9963.76136662
Loss after 1000 iterations = 9888.86449159
Loss after 1100 iterations = 9820.12728974
Loss after 1200 iterations = 9756.61481224
Loss after 1300 iterations = 9697.58210392
Loss after 1400 iterations = 9642.42882236
Loss after 1500 iterations = 9590.66601606
Loss after 1600 iterations = 9541.89146354
Loss after 1700 iterations = 9495.77114838
Loss after 1800 iterations = 9452.02519636
Loss after 1900 iterations = 9410.41709731
Loss after 2000 iterations = 9370.74537045
Loss after 2100 iterations = 9332.83706482
Loss after 2200 iterations = 9296.54264963
Loss after 2300 iterati

Loss after 9100 iterations = 8162.90756243
Loss after 9200 iterations = 8155.33567155
Loss after 9300 iterations = 8147.87602991
Loss after 9400 iterations = 8140.52615439
Loss after 9500 iterations = 8133.28363491
Loss after 9600 iterations = 8126.1461318
Loss after 9700 iterations = 8119.11137314
Loss after 9800 iterations = 8112.17715234
Loss after 9900 iterations = 8105.34132574
Accuracy: 0.797292069632%
Loss after 0 iterations = 12542.4982322
Loss after 100 iterations = 11155.4100264
Loss after 200 iterations = 10842.429843
Loss after 300 iterations = 10653.2918345
Loss after 400 iterations = 10500.4012612
Loss after 500 iterations = 10369.6051758
Loss after 600 iterations = 10255.424433
Loss after 700 iterations = 10154.3517569
Loss after 800 iterations = 10063.8349983
Loss after 900 iterations = 9981.95151166
Loss after 1000 iterations = 9907.2263218
Loss after 1100 iterations = 9838.51148843
Loss after 1200 iterations = 9774.90210385
Loss after 1300 iterations = 9715.67640619
L

Loss after 8200 iterations = 8207.58294948
Loss after 8300 iterations = 8198.7913422
Loss after 8400 iterations = 8190.13956557
Loss after 8500 iterations = 8181.62428321
Loss after 8600 iterations = 8173.24226547
Loss after 8700 iterations = 8164.99038502
Loss after 8800 iterations = 8156.86561277
Loss after 8900 iterations = 8148.86501394
Loss after 9000 iterations = 8140.98574433
Loss after 9100 iterations = 8133.2250468
Loss after 9200 iterations = 8125.58024788
Loss after 9300 iterations = 8118.04875457
Loss after 9400 iterations = 8110.62805128
Loss after 9500 iterations = 8103.31569687
Loss after 9600 iterations = 8096.10932196
Loss after 9700 iterations = 8089.00662618
Loss after 9800 iterations = 8082.00537568
Loss after 9900 iterations = 8075.10340072
Accuracy: 0.785686653772%
Loss after 0 iterations = 12542.4982322
Loss after 100 iterations = 11182.9797474
Loss after 200 iterations = 10869.1001333
Loss after 300 iterations = 10677.3661872
Loss after 400 iterations = 10522.35

Loss after 7300 iterations = 8345.40770345
Loss after 7400 iterations = 8335.33608756
Loss after 7500 iterations = 8325.43642344
Loss after 7600 iterations = 8315.70429573
Loss after 7700 iterations = 8306.13544235
Loss after 7800 iterations = 8296.72574765
Loss after 7900 iterations = 8287.47123592
Loss after 8000 iterations = 8278.36806525
Loss after 8100 iterations = 8269.41252178
Loss after 8200 iterations = 8260.60101418
Loss after 8300 iterations = 8251.93006853
Loss after 8400 iterations = 8243.39632336
Loss after 8500 iterations = 8234.99652503
Loss after 8600 iterations = 8226.72752333
Loss after 8700 iterations = 8218.58626727
Loss after 8800 iterations = 8210.56980111
Loss after 8900 iterations = 8202.67526059
Loss after 9000 iterations = 8194.89986936
Loss after 9100 iterations = 8187.2409355
Loss after 9200 iterations = 8179.69584833
Loss after 9300 iterations = 8172.26207527
Loss after 9400 iterations = 8164.93715889
Loss after 9500 iterations = 8157.71871412
Loss after 9

Loss after 6400 iterations = 8484.27906713
Loss after 6500 iterations = 8472.63083265
Loss after 6600 iterations = 8461.20139723
Loss after 6700 iterations = 8449.9845742
Loss after 6800 iterations = 8438.97441635
Loss after 6900 iterations = 8428.16520374
Loss after 7000 iterations = 8417.55143227
Loss after 7100 iterations = 8407.12780304
Loss after 7200 iterations = 8396.8892123
Loss after 7300 iterations = 8386.83074214
Loss after 7400 iterations = 8376.94765164
Loss after 7500 iterations = 8367.23536864
Loss after 7600 iterations = 8357.68948193
Loss after 7700 iterations = 8348.30573394
Loss after 7800 iterations = 8339.08001388
Loss after 7900 iterations = 8330.00835116
Loss after 8000 iterations = 8321.08690929
Loss after 8100 iterations = 8312.31198006
Loss after 8200 iterations = 8303.67997804
Loss after 8300 iterations = 8295.18743541
Loss after 8400 iterations = 8286.83099699
Loss after 8500 iterations = 8278.60741561
Loss after 8600 iterations = 8270.51354769


jet_to_train = 0
degree = 1
543.454079519
3788.08242136

jet_to_train = 0
degree = 2
527.606257202
3660.47960373

jet_to_train = 1
degree = 1
3945.51887954
27574.96887

jet_to_train = 1
degree = 2
3908.64952551
27278.0120927

jet_to_train = 1
degree = 3
BAD

jet_to_train = 2
degree = 1
234.197314724 
1630.67508124

jet_to_train = 2
degree = 2
233.081735551
1602.74241649

jet_to_train = 2
degree = 3
236.43029354
1564.26179544

jet_to_train = 3
degree = 1
4085.45459208
28568.0052354

jet_to_train = 3
degree = 2
4009.65515713
27994.504916

jet_to_train = 4
degree = 1
108.914269772
754.685716006

jet_to_train = 4
degree = 2
103.775478397
705.367011582

jet_to_train = 4
degree = 3
109.135289693
693.425231758

jet_to_train = 5
degree = 1
2437.15647835
17025.3416627

jet_to_train = 5
degree = 2
2381.56121261
16565.3445096

jet_to_train = 5
degree = 3
2347.78598357
16311.0713315

jet_to_train = 6
degree = 1
45.6583051767
314.975605086

jet_to_train = 6
degree = 2
44.5916296727
294.565592769

jet_to_train = 7
degree=1

To understand how we obtain 83.10% accuracy, we should take a look at the model performance at various stages. The stages are: testing on the raw data; the data divided into 4 categories, each representing a specific jet number; and the data divided into 8 categories, each representing a jet number with and without the mass feature.
The best results are obtained with 8 different classifiers in both models because now each classifier is trained on a specific subset of the whole dataset with a certain set of features that contributes specifically to that subset.
Further investigation should be done to understand why the logistic regression model underperforms the ridge regression model, even though we are dealing with a binary classification problem.