In [1]:
from implementations import *
from helpers import load_csv_data, predict_labels, create_csv_submission, pre_process_data, normalize_data, \
    pre_process_data_jets, load_csv_data_general
import numpy as np
from validation import split_data, validate

In [2]:
# Load the training set with the project helper `load_csv_data_general`.
# The path is relative to the notebook's working directory.
# NOTE(review): later cells index the result as a 2-D array and treat column 0 as
# the sample id, column 1 as the label and columns 2+ as features — confirm against
# helpers.load_csv_data_general.
data_path = "data/train.csv"
data = load_csv_data_general(data_path)

In [3]:
def normalize_data_features(data):
    """Standardize every feature column as (x - mean(x)) / std(x), in place.

    Columns 0 and 1 are never touched. Every column from index 2 onwards is
    standardized with its own mean and standard deviation (computed along
    axis 0), so features measured on different scales end up comparable.
    The columns at index 2 (DER_mass_MMC) and index 24 (PRI_jet_num) are
    restored to their original values afterwards.
    NOTE(review): presumably those two columns are kept raw because a later
    step selects rows on them — confirm against the callers.

    A column whose standard deviation is 0 (constant column) is only centred,
    i.e. it becomes all zeros instead of NaN/inf.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array with at least 25 columns. It is modified in place.

    Returns
    -------
    numpy.ndarray
        The same array object that was passed in.
    """
    PRI_jet_num_index = 24
    DER_mass_MMC_index = 2

    # Remember the columns that must survive the standardization unchanged.
    jets = data[:, PRI_jet_num_index].copy()
    MMCs = data[:, DER_mass_MMC_index].copy()

    # Per-column statistics: axis=0 gives one mean / std per feature rather
    # than a single scalar computed over the whole feature matrix.
    mean_data = np.mean(data[:, 2:], axis=0)
    std_data = np.std(data[:, 2:], axis=0)
    # Avoid dividing by zero for constant columns.
    std_data = np.where(std_data == 0, 1.0, std_data)

    data[:, 2:] = (data[:, 2:] - mean_data) / std_data

    # Put the raw values of the protected columns back.
    data[:, PRI_jet_num_index] = jets
    data[:, DER_mass_MMC_index] = MMCs

    return data

In [6]:
# Define the parameters of the algorithm.
# These globals are read by the training / validation cell below.
# Passed to logistic_regression_GD as its `max_iters` argument.
max_iters = 100
# Passed to logistic_regression_GD as its `gamma` argument
# (presumably the gradient-descent step size — confirm in implementations).
gamma = 0.3
# All-ones vector with one entry per column of `data` from index 2 onwards.
# NOTE(review): the loop below builds its own per-subset start vectors and does
# not read this one — confirm whether it is still needed.
initial_w = np.ones(len(data[0,2:]))
# Number of loop iterations; each one calls split_data with ratio 1 / folds_n.
# A value of 1 switches the loop to "train on everything and write a submission".
folds_n = 10
# Collects one overall accuracy per loop iteration.
validation_scores = []

In [7]:
# Train one logistic-regression model per subset returned by pre_process_data_jets.
#   folds_n > 1  : repeat a train / hold-out split folds_n times and report accuracies.
#   folds_n == 1 : train on the full training set and write predictions for data/test.csv.

# Labels of the eight subsets, in the order pre_process_data_jets returns them.
# They are used verbatim in the per-subset accuracy printout.
subset_names = ["jet0", "jet1", "jet2", "jet3", "jet0_wm", "jet1_wm", "jet2_wm", "jet3_wm"]

# Start from an empty list so that re-running this cell on its own does not mix
# scores from a previous run into the mean reported at the end.
validation_scores = []

for i in range(0, folds_n):

    if folds_n > 1:
        # NOTE(review): split_data is called afresh on every iteration and the
        # scores differ between iterations, so it presumably draws a new random
        # split each time — confirm, and seed it if reproducibility matters.
        x_tr, x_te = split_data(data, 1 / folds_n)
    else:
        test_data_path = "data/test.csv"
        test_data = load_csv_data_general(test_data_path)
        x_tr = data
        x_te = test_data

    tr_subsets = pre_process_data_jets(x_tr)
    assert len(tr_subsets) == len(subset_names), "unexpected number of training subsets"

    # Start ML algorithm: one model per subset.
    # Column 1 is passed as the target, columns 2+ as the feature matrix, and the
    # start vector is all ones with one entry per feature column.
    subset_weights = []
    for tr_subset in tr_subsets:
        subset_initial_w = np.ones(len(tr_subset[:, 2:].T))
        subset_loss, subset_w = logistic_regression_GD(
            tr_subset[:, 1], tr_subset[:, 2:], subset_initial_w, max_iters, gamma)
        subset_weights.append(subset_w)

    # Test algorithm: split the held-out / test rows the same way and predict
    # each subset with the model trained on the matching training subset.
    te_subsets = pre_process_data_jets(x_te)
    assert len(te_subsets) == len(subset_names), "unexpected number of test subsets"
    predictions = [predict_labels(subset_w, te_subset[:, 2:])
                   for subset_w, te_subset in zip(subset_weights, te_subsets)]

    if folds_n > 1:
        subset_scores = [validate(y_pred, te_subset[:, 1])
                         for y_pred, te_subset in zip(predictions, te_subsets)]

        for subset_name, subset_score in zip(subset_names, subset_scores):
            print("Accuracy score-" + subset_name + ":" + str(subset_score))

        # Overall accuracy: per-subset accuracies weighted by the number of
        # predictions made for each subset.
        subset_sizes = [len(y_pred) for y_pred in predictions]
        final_score = sum(score * size for score, size in zip(subset_scores, subset_sizes)) / sum(subset_sizes)

        print("================ step " + str(i + 1) + " : " + str(final_score) + " ================")

        validation_scores.append(final_score)
    else:
        # Extract prediction to upload in Kaggle: column 0 of every test subset is
        # passed as the ids, in the same subset order as the predictions.
        create_csv_submission(np.concatenate([te_subset[:, 0] for te_subset in te_subsets]),
                              np.concatenate(predictions),
                              "logistic")

if folds_n > 1:
    # Cross-validation mode: summarise the per-iteration scores.
    cv_score = np.mean(np.array(validation_scores))
    print("================ Final validation Score ================")
    print("MEAN-Accuracy score:" + str(cv_score))
else:
    # Submission mode: there are no validation scores, so do not average an
    # empty list (that yields NaN plus a RuntimeWarning). cv_score stays defined
    # as NaN for any later cell that reads it.
    cv_score = float("nan")
    if folds_n == 1:
        # Only claim success when the submission branch actually ran.
        print("Successfully written to file!")

Accuracy score-jet0:0.6751531019124573
Accuracy score-jet1:0.6144622381959575
Accuracy score-jet2:0.5490950916645149
Accuracy score-jet3:0.6777431823062057
Accuracy score-jet0_wm:0.9396511479862496
Accuracy score-jet1_wm:0.9066414214972327
Accuracy score-jet2_wm:0.8442437923250564
Accuracy score-jet3_wm:0.9278273809523809
Accuracy score-jet0:0.6759200746291114
Accuracy score-jet1:0.6137048909766084
Accuracy score-jet2:0.4839246769692564
Accuracy score-jet3:0.6787940670679278
Accuracy score-jet0_wm:0.9407631377225109
Accuracy score-jet1_wm:0.9090909090909091
Accuracy score-jet2_wm:0.8427980443775855
Accuracy score-jet3_wm:0.9336890243902439
Accuracy score-jet0:0.6761469544825928
Accuracy score-jet1:0.6139206349206349
Accuracy score-jet2:0.5025340215861098
Accuracy score-jet3:0.6807731653475475
Accuracy score-jet0_wm:0.9403016783513916
Accuracy score-jet1_wm:0.9078850405305822
Accuracy score-jet2_wm:0.8444444444444444
Accuracy score-jet3_wm:0.9273700305810397
Accuracy score-jet0:0.676058