In [1]:
from implementations import *
from helpers import load_csv_data, predict_labels, create_csv_submission, pre_process_data, normalize_data, \
    pre_process_data_jets, load_csv_data_general
import numpy as np
from validation import split_data, validate

In [2]:
# Location of the training CSV, relative to the notebook's working directory.
data_path = "data/train.csv"
# Load the full training set once. Later cells index the result as a 2-D array:
# column 1 is passed as the target and columns 2+ as the features of each fit.
data = load_csv_data_general(data_path)

In [3]:
def normalize_data_features(data):
    """Standardize every feature column in place: (x - mean(x)) / std(x).

    Columns 0 and 1 are never touched (elsewhere in this notebook column 0 is
    written out next to the predictions and column 1 is used as the target).
    Columns from index 2 onward are standardized *per column*, except for the
    ``DER_mass_MMC`` column (index 2) and the ``PRI_jet_num`` column
    (index 24), whose raw values are saved first and restored afterwards.
    NOTE(review): presumably those two columns stay raw because later
    preprocessing splits the data on them -- confirm against
    ``pre_process_data_jets``.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array with at least 25 columns. It is modified in place.
        NOTE(review): assumes a floating-point dtype; an integer array would
        truncate the standardized values on assignment.

    Returns
    -------
    numpy.ndarray
        The same array object that was passed in, now standardized.
    """
    PRI_jet_num_index = 24
    DER_mass_MMC_index = 2
    first_feature_index = 2

    # Nothing to normalize; also avoids NumPy's mean-of-empty-slice warning.
    if data.shape[0] == 0:
        return data

    # Save the raw values of the two columns that must survive unchanged.
    jets = data[:, PRI_jet_num_index].copy()
    MMCs = data[:, DER_mass_MMC_index].copy()

    features = data[:, first_feature_index:]
    # axis=0 gives one mean / std per feature column. Without it NumPy reduces
    # over the whole matrix and every feature would be shifted and scaled by
    # the same global scalar, which does not standardize individual features.
    mean_data = np.mean(features, axis=0)
    std_data = np.std(features, axis=0)
    # A constant column has std == 0; divide by 1 instead so it becomes all
    # zeros rather than NaN.
    std_data[std_data == 0] = 1.0
    data[:, first_feature_index:] = (features - mean_data) / std_data

    # Put the raw values back.
    data[:, PRI_jet_num_index] = jets
    data[:, DER_mass_MMC_index] = MMCs

    return data

In [4]:
# Hyper-parameters read by the training loop further down in this notebook.
max_iters = 100  # forwarded as the `max_iters` argument of every reg_logistic_regression_GD call
gamma = 0.3  # forwarded as the `gamma` argument (presumably the gradient-descent step size -- confirm in implementations)
initial_w = np.ones(len(data[0,2:]))  # all-ones vector with one entry per column from index 2 onward; the training loop builds its own per-subset vectors instead

# Regularisation strengths, one per data subset returned by pre_process_data_jets.
# lambda0..lambda3 go with the x_tr0..x_tr3 subsets and the *_wm values with the
# matching *_wm subsets. NOTE(review): the meaning of the `_wm` suffix is defined
# by pre_process_data_jets -- confirm there.
lambda_ = 0.000001  # not referenced by the training loop visible in this notebook
lambda0 = 0.001
lambda1 = 0.08
lambda2 = 0.0006000000000000001
lambda3 = 6.000000000000001e-05
lambda0_wm = 1.99e-10
lambda1_wm = 0.0004
lambda2_wm = 3e-5
lambda3_wm = 5e-3



In [5]:
# Number of train/evaluate rounds run by the loop below. With folds_n > 1 each
# round scores the model on a held-out split of `data`; otherwise the loop trains
# on all of `data` and writes predictions for data/test.csv.
folds_n = 10
# One size-weighted accuracy per round, appended by the loop below.
validation_scores = []

In [6]:
# Train one regularised logistic-regression model per data subset and either
# validate it (folds_n > 1) or write a submission file (folds_n == 1).
#
# pre_process_data_jets returns eight subsets; the names and lambdas below are
# listed in that same order so they can be zipped together.
subset_names = ["jet0", "jet1", "jet2", "jet3", "jet0_wm", "jet1_wm", "jet2_wm", "jet3_wm"]
subset_lambdas = [lambda0, lambda1, lambda2, lambda3, lambda0_wm, lambda1_wm, lambda2_wm, lambda3_wm]

# Start from an empty list so that re-running this cell does not mix scores
# from a previous run into the reported mean.
validation_scores = []

for i in range(0, folds_n):

    if folds_n > 1:
        # NOTE(review): split_data is called with identical arguments on every
        # round; whether the rounds are disjoint folds or independent random
        # splits depends on its implementation -- confirm in validation.py.
        x_tr, x_te = split_data(data, 1 / folds_n)
    else:
        test_data_path = "data/test.csv"
        test_data = load_csv_data_general(test_data_path)
        x_tr = data
        x_te = test_data

    train_subsets = pre_process_data_jets(x_tr)
    if len(train_subsets) != len(subset_names):
        # zip() below would silently drop subsets on a length mismatch.
        raise ValueError("pre_process_data_jets returned " + str(len(train_subsets)) + " subsets, expected "
                         + str(len(subset_names)))

    # Start ML algorithm.
    # For every subset: column 1 is the target, columns 2+ are the features and
    # the initial weights are all ones (one per feature column).
    fits = [
        reg_logistic_regression_GD(subset[:, 1], subset[:, 2:], np.ones(subset[:, 2:].shape[1]),
                                   max_iters, gamma, subset_lambda)
        for subset, subset_lambda in zip(train_subsets, subset_lambdas)
    ]
    losses = [fit[0] for fit in fits]
    weights = [fit[1] for fit in fits]

    # Test algorithm
    test_subsets = pre_process_data_jets(x_te)
    if len(test_subsets) != len(subset_names):
        raise ValueError("pre_process_data_jets returned " + str(len(test_subsets)) + " subsets, expected "
                         + str(len(subset_names)))
    predictions = [
        predict_labels(subset_w, subset[:, 2:])
        for subset_w, subset in zip(weights, test_subsets)
    ]

    if folds_n > 1:
        scores = [
            validate(predicted, subset[:, 1])
            for predicted, subset in zip(predictions, test_subsets)
        ]
        for subset_name, subset_score in zip(subset_names, scores):
            print("Accuracy score-" + subset_name + ":" + str(subset_score))

        # Overall accuracy: per-subset scores weighted by subset size.
        sizes = [len(predicted) for predicted in predictions]
        final_score = sum(subset_score * size for subset_score, size in zip(scores, sizes)) / sum(sizes)

        print("================ step " + str(i + 1) + " : " + str(final_score) + " ================")

        validation_scores.append(final_score)
    else:
        # Extract prediction to upload in Kaggle: column 0 of every test subset
        # is written next to the corresponding predictions.
        create_csv_submission(np.concatenate([subset[:, 0] for subset in test_subsets]),
                              np.concatenate(predictions),
                              "reg_logistic")
        # Only announce the file when one was actually written.
        print("Successfully written to file!")

# Report the mean only when validation rounds ran; np.mean of an empty list
# would emit a RuntimeWarning and print nan.
if validation_scores:
    cv_score = np.mean(np.array(validation_scores))
    print("================ Final validation Score ================")
    print("MEAN-Accuracy score:" + str(cv_score))

Accuracy score-jet0:0.6765827267658273
Accuracy score-jet1:0.6130803160600388
Accuracy score-jet2:0.5648996228536626
Accuracy score-jet3:0.6802163667523564
Accuracy score-jet0_wm:0.9409534358624803
Accuracy score-jet1_wm:0.9048176892663641
Accuracy score-jet2_wm:0.8382575757575758
Accuracy score-jet3_wm:0.9292237442922374
Accuracy score-jet0:0.6753375784672357
Accuracy score-jet1:0.6120382732532266
Accuracy score-jet2:0.5760495042542718
Accuracy score-jet3:0.6788892462742575
Accuracy score-jet0_wm:0.9392523364485982
Accuracy score-jet1_wm:0.9089435685870045
Accuracy score-jet2_wm:0.8439133681852129
Accuracy score-jet3_wm:0.9287321830457614
Accuracy score-jet0:0.6755827628841109
Accuracy score-jet1:0.6141667327675101
Accuracy score-jet2:0.5698884409862192
Accuracy score-jet3:0.6782916621955145
Accuracy score-jet0_wm:0.9397815647443798
Accuracy score-jet1_wm:0.9068231841526045
Accuracy score-jet2_wm:0.8424812030075188
Accuracy score-jet3_wm:0.9256198347107438
Accuracy score-jet0:0.675484