In [1]:
from utils.proj1_helpers import load_csv_data, predict_labels, create_csv_submission, cross_validation_visualization, sort_predictions
from utils.preprocessing import adjust_features, nan_to_mean, standardize, split_jets
from utils.train_utils import build_poly, get_lambda, get_degree

from utils.implementations import ridge_regression, logistic_regression, build_k_indices, cross_validation, least_squares, least_squares_sgd, least_squares_gd,reg_logistic_regression
import time
import numpy as np

In [2]:
# Locations of the two input CSV files and of the submission file to write.
train_path, test_path = './data/train.csv', './data/test.csv'
OUTPUT = 'jet_pred.csv'

In [3]:

# Read the training file, then the test file, and report the total wall-clock
# time spent loading both.
start_time = time.time()
print("Loading datasets")
(train_y, train_x, train_ids), (test_y, test_x, idstest_ids) = (
    load_csv_data(train_path),
    load_csv_data(test_path),
)
elapsed_seconds = time.time() - start_time
print("Datasets loaded in: " + str(elapsed_seconds))

Loading datasets
Datasets loaded in: 24.359423875808716


In [4]:
# Split the loaded arrays with split_jets; the four returned values are indexed
# per subset by the cells below.
# NOTE(review): presumably the split is by jet number - confirm in utils.preprocessing.
(
    x_jets_train,
    x_jets_test,
    y_jets_train,
    ids,
) = split_jets(train_x, train_y, test_x, test_y, idstest_ids)

In [5]:
# Run adjust_features over the train/test subsets and rebind both names to the
# returned values (the pre-adjustment lists are no longer reachable by name).
# NOTE(review): the exact transformation lives in utils.preprocessing - confirm there.
x_jets_train, x_jets_test = adjust_features(x_jets_train, x_jets_test)

In [6]:
# Five independent, initially empty accumulators that the training loop appends
# to once per subset: polynomial train/test features, fitted weights, final
# losses, and predicted labels.
px_train, px_test, ws, losses, preds = ([] for _ in range(5))

In [7]:
# Train one logistic-regression model per subset and predict the matching test
# subset. Results are appended, in subset order, to px_train, px_test, ws,
# losses and preds.

# Gradient-descent settings for logistic_regression.
MAX_ITERS = 10000
GAMMA = 1e-7

# Empty the accumulators in place so that re-running this cell on its own does
# not stack a second set of results on top of the previous run.
for accumulator in (px_train, px_test, ws, losses, preds):
    accumulator.clear()

for i in range(len(x_jets_train)):

    # Per-subset hyper-parameters. lambda_ is not consumed by the plain
    # logistic regression below; it is only needed by the ridge alternative.
    lambda_ = get_lambda(i)
    degree = get_degree(i)

    # Standardizing dataset. The result is written back into the lists, as
    # before, so the standardized arrays stay available after the loop.
    x_jets_train[i], x_jets_test[i] = standardize(x_jets_train[i], x_jets_test[i])

    # Building polynomial features. Local names are used for the current subset
    # so the code never depends on what index i of the accumulators holds.
    tx_train = build_poly(degree=degree, x=x_jets_train[i])
    tx_test = build_poly(degree=degree, x=x_jets_test[i])
    px_train.append(tx_train)
    px_test.append(tx_test)

    # Start from all-zero weights, one per polynomial feature column, and pass
    # the vector explicitly instead of relying on the callee's handling of None.
    # NOTE(review): assumes logistic_regression accepts a 1-D weight vector - confirm.
    initial_w = np.zeros(tx_train.shape[1])

    # Training model
    # Alternatives that were tried:
    #   w, loss = ridge_regression(lambda_=lambda_, tx=tx_train, y=y_jets_train[i])
    #   w, loss = least_squares(y=y_jets_train[i], tx=tx_train)
    w, loss = logistic_regression(
        y=y_jets_train[i],
        tx=tx_train,
        initial_w=initial_w,
        max_iters=MAX_ITERS,
        gamma=GAMMA,
    )
    print(loss)
    ws.append(w)
    losses.append(loss)

    # Predicting labels
    preds.append(predict_labels(w, tx_test))


Loss after 0 iterations = 18107.0837978
Loss after 100 iterations = 8749.85708982
Loss after 200 iterations = 6810.57390865
Loss after 300 iterations = 6058.33377739
Loss after 400 iterations = 5653.67610125
Loss after 500 iterations = 5395.30042256
Loss after 600 iterations = 5214.34342068
Loss after 700 iterations = 5080.59272905
Loss after 800 iterations = 4978.22449781
Loss after 900 iterations = 4897.88622668
Loss after 1000 iterations = 4833.58638912
Loss after 1100 iterations = 4781.2741199
Loss after 1200 iterations = 4738.10670073
Loss after 1300 iterations = 4702.03404072
Loss after 1400 iterations = 4671.54575654
Loss after 1500 iterations = 4645.50924712
Loss after 1600 iterations = 4623.06217362
Loss after 1700 iterations = 4603.53907934
Loss after 1800 iterations = 4586.42020534
Loss after 1900 iterations = 4571.29513035
Loss after 2000 iterations = 4557.83652709
Loss after 2100 iterations = 4545.78094882
Loss after 2200 iterations = 4534.91457936
Loss after 2300 iteratio

Loss after 9200 iterations = 31495.647865
Loss after 9300 iterations = 31490.9667293
Loss after 9400 iterations = 31486.3750936
Loss after 9500 iterations = 31481.8703476
Loss after 9600 iterations = 31477.4499887
Loss after 9700 iterations = 31473.1116153
Loss after 9800 iterations = 31468.8529212
Loss after 9900 iterations = 31464.6716905
31460.6064853
Loss after 0 iterations = 5241.57897939
Loss after 100 iterations = 3482.19384508
Loss after 200 iterations = 2902.70131929
Loss after 300 iterations = 2613.62418687
Loss after 400 iterations = 2445.42164558
Loss after 500 iterations = 2336.87281391
Loss after 600 iterations = 2262.33667447
Loss after 700 iterations = 2208.87255548
Loss after 800 iterations = 2169.00792334
Loss after 900 iterations = 2138.27568027
Loss after 1000 iterations = 2113.89215535
Loss after 1100 iterations = 2094.05302813
Loss after 1200 iterations = 2077.54986238
Loss after 1300 iterations = 2063.55042629
Loss after 1400 iterations = 2051.46746535
Loss after

Loss after 8300 iterations = 32023.3922806
Loss after 8400 iterations = 32018.6346368
Loss after 8500 iterations = 32013.9308518
Loss after 8600 iterations = 32009.2790914
Loss after 8700 iterations = 32004.6776488
Loss after 8800 iterations = 32000.1249333
Loss after 8900 iterations = 31995.6194602
Loss after 9000 iterations = 31991.1598417
Loss after 9100 iterations = 31986.7447786
Loss after 9200 iterations = 31982.3730527
Loss after 9300 iterations = 31978.0435206
Loss after 9400 iterations = 31973.7551069
Loss after 9500 iterations = 31969.506799
Loss after 9600 iterations = 31965.2976418
Loss after 9700 iterations = 31961.126733
Loss after 9800 iterations = 31956.9932189
Loss after 9900 iterations = 31952.89629
31948.8756139
Loss after 0 iterations = 2046.17047701
Loss after 100 iterations = 1671.23067959
Loss after 200 iterations = 1485.89382986
Loss after 300 iterations = 1371.10552429
Loss after 400 iterations = 1291.32689822
Loss after 500 iterations = 1232.04127745
Loss afte

Loss after 7400 iterations = 19020.9710188
Loss after 7500 iterations = 19014.3762703
Loss after 7600 iterations = 19007.8857682
Loss after 7700 iterations = 19001.4966003
Loss after 7800 iterations = 18995.2060324
Loss after 7900 iterations = 18989.0114979
Loss after 8000 iterations = 18982.9105885
Loss after 8100 iterations = 18976.9010466
Loss after 8200 iterations = 18970.9807598
Loss after 8300 iterations = 18965.1477584
Loss after 8400 iterations = 18959.4002161
Loss after 8500 iterations = 18953.7364569
Loss after 8600 iterations = 18948.1549704
Loss after 8700 iterations = 18942.6544401
Loss after 8800 iterations = 18937.2337912
Loss after 8900 iterations = 18931.8922679
Loss after 9000 iterations = 18926.6295553
Loss after 9100 iterations = 18921.4459613
Loss after 9200 iterations = 18916.3426791
Loss after 9300 iterations = 18911.3221345
Loss after 9400 iterations = 18906.3883869
Loss after 9500 iterations = 18901.5474628
Loss after 9600 iterations = 18896.8073602
Loss after 

Loss after 6500 iterations = 8971.03248557
Loss after 6600 iterations = 8963.85184194
Loss after 6700 iterations = 8956.81390949
Loss after 6800 iterations = 8949.91339537
Loss after 6900 iterations = 8943.14528164
Loss after 7000 iterations = 8936.50480809
Loss after 7100 iterations = 8929.9874563
Loss after 7200 iterations = 8923.58893486
Loss after 7300 iterations = 8917.30516567
Loss after 7400 iterations = 8911.13227127
Loss after 7500 iterations = 8905.06656305
Loss after 7600 iterations = 8899.10453031
Loss after 7700 iterations = 8893.24283023
Loss after 7800 iterations = 8887.47827842
Loss after 7900 iterations = 8881.80784042
Loss after 8000 iterations = 8876.22862376
Loss after 8100 iterations = 8870.73787089
Loss after 8200 iterations = 8865.33295286
Loss after 8300 iterations = 8860.01136392
Loss after 8400 iterations = 8854.77071712
Loss after 8500 iterations = 8849.60874131
Loss after 8600 iterations = 8844.5232797
Loss after 8700 iterations = 8839.51229068
Loss after 88

Final training loss per subset (format: loss - subset index):

- 4312.79911789 - 0
- 31460.6064 - 1
- 1820.590 - 2
- 31948.875 - 3
- 797.932472075 - 4
- 18879.0641026 - 5
- 335.232308935 - 6
- 8780.58062848 - 7

In [None]:
# Scratch check: displays how many entries ids holds at this point in the notebook.
len(ids)

In [8]:
# Sorting predictions
# ids and preds are rebound to the values returned by sort_predictions, so
# running this cell a second time feeds the already-sorted values back in.
ids, preds = sort_predictions(ids, preds)
print("Creating submission")


Creating submission


In [None]:
# Scratch check: displays the length of preds.
len(preds)

In [None]:
# Scratch check: displays the length of ids, for comparison with len(preds).
len(ids)

In [9]:
# Creating output files
# Passes ids and preds to create_csv_submission together with the OUTPUT file
# name ('jet_pred.csv'), then prints a confirmation line.
create_csv_submission(ids, preds, OUTPUT)
print("Created submission")

Created submission
