In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
from proj1_helpers import *
DATA_TRAIN_PATH = '../data/train.csv' 
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [3]:
from implementations import *

## Do your thing crazy machine learning thing here :) ...

In [4]:
# Separate the data according to the value of column 24 (PRI_jet_num) 

def separate(y, tX, ids):
    
    split_x = []
    split_y = []
    split_ids = []
    
    jet_column_nbr = 22
    
    for i in range(4):
        
        split_x.append(tX[np.where(tX[:,jet_column_nbr] == i)])
        split_y.append(y[np.where(tX[:,jet_column_nbr] == i)])
        split_ids.append(ids[np.where(tX[:,jet_column_nbr] == i)])
    
    
    
    return split_x, split_y, split_ids

In [5]:
split_x, split_y, split_ids = separate(y, tX, ids)

In [6]:
#remove the columns from each set of data given a boolean array

def removeNone(data, selection):
   
    cleaned=[]
    
    for i in range(4):
        curr_data = data[i]
        
        cleaned.append(curr_data[:,selection[i]])
      
    return cleaned
    
    

In [7]:
#print statistics about the None values (-999) for each columns
#returns a boolean array that can be used to filter the columns that have 100% of undefined values (-999)
def dataStatistics(data):
    
    stats=[]
    
    for i in range(len(data)):
        
        print("Statistics ")
        print("Type :")
        print(i)
        
        
        nones = (data[i] == -999)
    
        mean = np.sum(nones, axis=0)/nones.shape[0]
        print(mean) 
        stats.append(mean != 1)
    
    return stats
    

In [8]:
selection = dataStatistics(split_x)

Statistics 
Type :
0
[0.26145747 0.         0.         0.         1.         1.
 1.         0.         0.         0.         0.         0.
 1.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         1.
 1.         1.         1.         1.         1.         0.        ]
Statistics 
Type :
1
[0.09751883 0.         0.         0.         1.         1.
 1.         0.         0.         0.         0.         0.
 1.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         1.         1.         1.         0.        ]
Statistics 
Type :
2
[0.05859584 0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.        ]
Statistics 
Type :
3
[0.0

In [9]:
cleaned = removeNone(split_x, selection)



# Now we can either drop the lines with residual Nones or replace the Nones by the median value

In [10]:
#replace the value of column 0 (can be None sometimes) by the median value of this column

def putMedianInsteadOfNone(cleaned):
    
    completed_data = []
    
    for i in range(len(cleaned)):
        #current PRI_jet_num
        current = cleaned[i]
        
        median = np.median(current[np.where(current[:,0] != -999)], axis = 0)
        
        #replace -999 by median value
        current[np.where(current[:,0] == -999)] = median
        
        completed_data.append(current)
    
    
    return completed_data
    
    

In [11]:
cleaned_with_median = putMedianInsteadOfNone(cleaned)



In [12]:
#Instead of putting the median we can simply drop the data where columns 0 == -999
def dropLineIfNone(cleaned, split_y, split_ids):
    
    res_x=[]
    res_y=[]
    res_ids=[]
    
    for i in range(len(cleaned)):
        
        current = cleaned[i]
        
        drop_indexes = np.where(current[:,0] != -999)
        
        res_x.append(current[drop_indexes])
        res_y.append(current[drop_indexes])
        res_ids.append(current[drop_indexes])
        
    return res_x, res_y, res_ids

In [13]:
dropped_x, dropped_y, dropped_ids = dropLineIfNone(cleaned, split_y, split_ids)


## At this point, the first values in each of the split data has a PRI_jet_num = 0, then 1 and so on. The data is clean and we can work with it.

## Features Expension

In [14]:
#method to perform polynomial feature expension

def build_poly(x, degree):
   
    x_extended = x

    for d in range (2, degree +1):
        x_extended = np.c_[x_extended, x**d]
        

    return x_extended

## Cross Validation

In [15]:
#method to split the training set into a (new) training set and a test set (same as in lab03)

def split_data(x, y, ratio, seed=1):
    """
    split the dataset based on the split ratio. If ratio is 0.8 
    you will have 80% of your data set dedicated to training 
    and the rest dedicated to testing
    """
    # set seed
    np.random.seed(seed)
 
    # split the data based on the given ratio

    training_nbr = int(x.shape[0] * ratio)
    indexes = np.random.choice(x.shape[0],training_nbr, replace=False)
    
    x_train = x[indexes]
    y_train = y[indexes]
    x_test = np.delete(x, indexes, axis = 0)
    y_test = np.delete(y, indexes, axis = 0)
    
    
    return x_train, y_train, x_test, y_test

In [35]:
#perform cross-validation 

def crossValidation(x, y, splitRatio, degrees, seed =1):
    
    x_train, y_train, x_test, y_test = split_data(x, y, splitRatio, seed)
    
    a_training = []
    a_testing = []
    weights = []
    degr = []
    
    # define parameter (just add more for loops if there are more parameters for the model)
    lambdas = np.arange(0,0.000001,0.0000001)
    
    for ind, lambda_ in enumerate(lambdas):
        
        for ind_d, d in enumerate(degrees):
            
            
            #perform polynomial feature expension
            x_test_poly = build_poly(x_test,d)
            x_train_poly = build_poly(x_train, d)
           
            
            #normalize data (DANGER: the test set must be normalized with the training set's mean and std)
            mean = np.mean(x_train_poly, axis =0)
            std = np.std(x_train_poly, axis = 0)
            
              
            #put 1 if std = 0
            std = std + (std == 0)

            
            x_train_ready = (x_train_poly - mean) / std
            x_test_ready = (x_test_poly - mean) / std
            
            
            #add bias term
            bias_tr = np.ones(shape=x_train.shape)
            bias_te = np.ones(shape=x_test.shape)
            
            x_train_ready = np.c_[bias_tr, x_train_ready]
            x_test_ready = np.c_[bias_te, x_test_ready]
            
            
            #Models
        
            #ideal :  lambdas = np.arange(0,0.000001,0.0000001) => 81.9 %
            w_star, e_tr = ridge_regression(y_train,x_train_ready, lambda_)
        
            #ideal : lambdas = np.arange(0,0.3,0.1)
            #w_star, e_tr = logistic_regression(y_train, x_train_ready,np.ones(x_train_ready.shape[1])  ,400, lambda_)
        
            #don't usel least squares with lambda bigger than 0.35 ideal: lambdas = np.arange(0.001,0.13,0.01)
            #w_star, e_tr = least_squares_GD(y_train, x_train_ready,np.ones(x_train_ready.shape[1])  ,400, lambda_)    
            #w_star, e_tr = least_squares_SGD(y_train, x_train,np.ones(x_train.shape[1])  ,400, lambda_)
        
            #DON'T REALLY NEED TO DO CROSS VALIDATION FOR THIS ONE ;) BUT PRACTICAL TO RUN IT HERE
            #w_star, e_tr = least_squares(y_train, x_train_ready)  
        
            degr.append(d)
        
            #compare the prediction with the reality
            accuracy_training = np.count_nonzero(predict_labels(w_star, x_train_ready) + y_train)/len(y_train)
            accuracy_testing = np.count_nonzero(predict_labels(w_star, x_test_ready) + y_test)/len(y_test)
        
            a_training.append(accuracy_training)
            a_testing.append(accuracy_testing)
            weights.append(w_star)
            print("lambda={l:.5f},degree={deg}, Training Accuracy={tr}, Testing Accuracy={te}".format(
                   l=lambda_, tr=a_training[ind*len(degrees)+ind_d], te=a_testing[ind*len(degrees)+ind_d], deg=d))
        
            #plt.plot(lambdas, a_training,'r--' , lambdas, a_testing, 'g--')
            #plt.show
    
    return weights[np.argmax(a_testing)], degr[np.argmax(a_testing)], a_testing[np.argmax(a_testing)], x_train

In [17]:
#perform cross-validation 

def crossValidationForLogistic_reg(x, y, splitRatio, degrees, seed =1):
    
    x_train, y_train, x_test, y_test = split_data(x, y, splitRatio, seed)
    
    a_training = []
    a_testing = []
    weights = []
    degr = []
    
    index = 0
    
    # define parameter (just add more for loops if there are more parameters for the model)
    lambdas = np.arange(0.0001,0.3,0.1)
    gammas = np.arange(0.01,1,0.3)
    
    for ind, lambda_ in enumerate(lambdas):
        
        for ind_d, d in enumerate(degrees):
            
            for ind_g, gamma in enumerate(gammas):
            
                #perform polynomial feature expension
                x_test_poly = build_poly(x_test,d)
                x_train_poly = build_poly(x_train, d)
            
                #normalize data (DANGER: the test set must be normalized with the training set's mean and std)
                mean = np.mean(x_train_poly, axis =0)
                std = np.std(x_train_poly, axis = 0)
            
                #put 1 if std = 0
                std = std + (std == 0)
            
                x_train_ready = (x_train_poly - mean) / std
                x_test_ready = (x_test_poly - mean) / std
                
               
                #add bias term
                
                bias_tr = np.ones(shape=x_train.shape)
                bias_te = np.ones(shape=x_test.shape)
            
                x_train_ready = np.c_[bias_tr, x_train_ready]
                x_test_ready = np.c_[bias_te, x_test_ready]
                
           

                #Model
        
                #ideal :lambdas = np.arange(0,0.3,0.01)
                #       gammas = np.arange(0,3,0.5)
                w_star, e_tr = reg_logistic_regression(y_train, x_train_ready, lambda_, np.ones(x_test_ready.shape[1]), 30, gamma)
        
           
                degr.append(d)
        
                #compare the prediction with the reality
                accuracy_training = np.count_nonzero(predict_labels(w_star, x_train_ready) + y_train)/len(y_train)
                accuracy_testing = np.count_nonzero(predict_labels(w_star, x_test_ready) + y_test)/len(y_test)
        
                a_training.append(accuracy_training)
                a_testing.append(accuracy_testing)
                weights.append(w_star)
                print("lambda={l:.5f},degree={deg}, gamma={ga:.5f}, Training Accuracy={tr}, Testing Accuracy={te}".format(
                       l=lambda_, tr=a_training[index], te=a_testing[index], deg=d, ga=gamma))
        
                #increment index
                index = index + 1
    
    return weights[np.argmax(a_testing)], degr[np.argmax(a_testing)], a_testing[np.argmax(a_testing)], x_train

In [18]:
#Since we separated the data according to PRI_jet_num
# we have to make separate prediction and then put them together for the submission

def put_together(labels, indices):
    
    #First build first chunk
    ids_0 = np.matrix(indices[0]).T
    lab_0 = np.matrix(labels[0]).T
    
    unsorted_res = np.concatenate((ids_0, lab_0), axis=1)
    
    for i in range(1,len(labels)):
        ids = np.matrix(indices[i]).T
        lab = np.matrix(labels[i]).T
        by_jet_num = np.concatenate((ids, lab), axis=1)
        unsorted_res = np.concatenate((unsorted_res, by_jet_num), axis=0)
    
    sorted_res = unsorted_res[np.lexsort(np.fliplr(unsorted_res).T)]
    
    return sorted_res[0,:,:][:,1]

## Generate predictions and save ouput in csv format for submission:

In [19]:
DATA_TEST_PATH = '../data/test.csv' 
y_donotUse, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [33]:
#separate data with respect to column 24 and remove None
split_x_test, _, split_ids_test =  separate(y_donotUse, tX_test, ids_test)


split_x_cleaned_test = removeNone(split_x_test, dataStatistics(split_x_test))

#median instead of None
split_x_with_median = putMedianInsteadOfNone(split_x_cleaned_test)



#line dropped when None
#split_x_drop_lines, split_y_dropped_split_indexes_dropped = dropLineIfNone(split_x_cleaned_test, _, split_ids_test)

#degrees for polynomial feature expension
degrees = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]

y_res = []

acc = []




for i in range(len(cleaned_with_median)):
    
    
    
    #training: chose either cross calidation or cross validation for logistic regression with regularization
    w_star, d, accuracy, training_set = crossValidation(cleaned_with_median[i], split_y[i], 0.9, degrees ,6)
    #w_star, d, accuracy, training_set = crossValidationForLogistic_reg(cleaned_with_median[i], split_y[i], 0.9, degrees ,6)
    
    
    #polynomial feature expension and normalization using the training data
    mean = np.mean(build_poly(training_set,d), axis = 0)
    std = np.std(build_poly(training_set,d), axis = 0)
    
      
    #put 1 if std = 0
    std = std + (std == 0)
    
    extended_and_normalized = (build_poly(split_x_with_median[i], d) - mean) / std
    
    #adding bias term
    bias = np.ones(shape=split_x_with_median[i].shape)          
    x_test_ready = np.c_[bias, extended_and_normalized]
    
    #prediction
    y_res.append(predict_labels(w_star, x_test_ready))


    acc.append(accuracy)

print("Accuracy per jet nbr: \n")
print(acc)


Statistics 
Type :
0
[0.2605448 0.        0.        0.        1.        1.        1.
 0.        0.        0.        0.        0.        1.        0.
 0.        0.        0.        0.        0.        0.        0.
 0.        0.        1.        1.        1.        1.        1.
 1.        0.       ]
Statistics 
Type :
1
[0.09834149 0.         0.         0.         1.         1.
 1.         0.         0.         0.         0.         0.
 1.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         1.         1.         1.         0.        ]
Statistics 
Type :
2
[0.05881481 0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.        ]
Statistics 
Type :
3
[0.06376737 0.         0.     

lambda=0.00000,degree=7, Training Accuracy=0.8384025978358781, Testing Accuracy=0.8454763811048839
lambda=0.00000,degree=8, Training Accuracy=0.8385805318001357, Testing Accuracy=0.8461769415532426
lambda=0.00000,degree=9, Training Accuracy=0.8389586414741829, Testing Accuracy=0.8457766212970377
lambda=0.00000,degree=10, Training Accuracy=0.838847432746522, Testing Accuracy=0.8461769415532426
lambda=0.00000,degree=11, Training Accuracy=0.8390809710746099, Testing Accuracy=0.8462770216172938
lambda=0.00000,degree=12, Training Accuracy=0.8391588171839726, Testing Accuracy=0.8464771817453963
lambda=0.00000,degree=13, Training Accuracy=0.8392366632933352, Testing Accuracy=0.8465772618094476
lambda=0.00000,degree=14, Training Accuracy=0.8393478720209963, Testing Accuracy=0.8465772618094476
lambda=0.00000,degree=15, Training Accuracy=0.8393589928937623, Testing Accuracy=0.8462770216172938
lambda=0.00000,degree=16, Training Accuracy=0.8394145972575928, Testing Accuracy=0.8464771817453963
lamb

lambda=0.00000,degree=10, Training Accuracy=0.8387584657643932, Testing Accuracy=0.8460768614891914
lambda=0.00000,degree=11, Training Accuracy=0.8388251910009897, Testing Accuracy=0.8461769415532426
lambda=0.00000,degree=12, Training Accuracy=0.838969762346949, Testing Accuracy=0.8462770216172938
lambda=0.00000,degree=13, Training Accuracy=0.8390587293290778, Testing Accuracy=0.8460768614891914
lambda=0.00000,degree=14, Training Accuracy=0.8391365754384404, Testing Accuracy=0.8459767814251401
lambda=0.00000,degree=15, Training Accuracy=0.8391588171839726, Testing Accuracy=0.8465772618094476
lambda=0.00000,degree=16, Training Accuracy=0.8391810589295048, Testing Accuracy=0.8466773418734987
lambda=0.00000,degree=1, Training Accuracy=0.7147258163894024, Testing Accuracy=0.7120567375886525
lambda=0.00000,degree=2, Training Accuracy=0.7606929458797231, Testing Accuracy=0.7595099935525468
lambda=0.00000,degree=3, Training Accuracy=0.7714396251558269, Testing Accuracy=0.7676337846550613
lamb

lambda=0.00000,degree=13, Training Accuracy=0.787487999541475, Testing Accuracy=0.7810444874274661
lambda=0.00000,degree=14, Training Accuracy=0.7875023284471765, Testing Accuracy=0.7809155383623468
lambda=0.00000,degree=15, Training Accuracy=0.7876886042212956, Testing Accuracy=0.7807865892972276
lambda=0.00000,degree=16, Training Accuracy=0.7877315909384001, Testing Accuracy=0.7806576402321083
lambda=0.00000,degree=1, Training Accuracy=0.7146685007665965, Testing Accuracy=0.7119277885235332
lambda=0.00000,degree=2, Training Accuracy=0.7607216036911261, Testing Accuracy=0.7596389426176661
lambda=0.00000,degree=3, Training Accuracy=0.7714826118729312, Testing Accuracy=0.7677627337201806
lambda=0.00000,degree=4, Training Accuracy=0.7808393872959922, Testing Accuracy=0.7774339136041264
lambda=0.00000,degree=5, Training Accuracy=0.7834615770393615, Testing Accuracy=0.7783365570599613
lambda=0.00000,degree=6, Training Accuracy=0.7849517832323145, Testing Accuracy=0.7788523533204385
lambda=

lambda=0.00000,degree=16, Training Accuracy=0.7865996073879837, Testing Accuracy=0.7800128949065119
lambda=0.00000,degree=1, Training Accuracy=0.7369488983480735, Testing Accuracy=0.722707423580786
lambda=0.00000,degree=2, Training Accuracy=0.7791844026377892, Testing Accuracy=0.7633981738785233
lambda=0.00000,degree=3, Training Accuracy=0.7990119318056506, Testing Accuracy=0.7852322350138944
lambda=0.00000,degree=4, Training Accuracy=0.807745748880704, Testing Accuracy=0.797141722905915
lambda=0.00000,degree=5, Training Accuracy=0.8104585253964403, Testing Accuracy=0.7943628423977769
lambda=0.00000,degree=6, Training Accuracy=0.813612403784654, Testing Accuracy=0.8017070265978563
lambda=0.00000,degree=7, Training Accuracy=0.8230078736684238, Testing Accuracy=0.8108376339817388
lambda=0.00000,degree=8, Training Accuracy=0.83090359718577, Testing Accuracy=0.8229456133386265
lambda=0.00000,degree=9, Training Accuracy=0.8333517125780199, Testing Accuracy=0.8233425962683605
lambda=0.00000,

lambda=0.00000,degree=3, Training Accuracy=0.798923711431155, Testing Accuracy=0.7850337435490274
lambda=0.00000,degree=4, Training Accuracy=0.8077016386934562, Testing Accuracy=0.797141722905915
lambda=0.00000,degree=5, Training Accuracy=0.8099733133367151, Testing Accuracy=0.796744739976181
lambda=0.00000,degree=6, Training Accuracy=0.8121567676054785, Testing Accuracy=0.7979356887653831
lambda=0.00000,degree=7, Training Accuracy=0.8123773185417172, Testing Accuracy=0.7999206034140532
lambda=0.00000,degree=8, Training Accuracy=0.8140755607507554, Testing Accuracy=0.7995236204843191
lambda=0.00000,degree=9, Training Accuracy=0.8150239297765819, Testing Accuracy=0.8005160778086542
lambda=0.00000,degree=10, Training Accuracy=0.8156855825852981, Testing Accuracy=0.8015085351329893
lambda=0.00000,degree=11, Training Accuracy=0.8152444807128206, Testing Accuracy=0.8019055180627233
lambda=0.00000,degree=12, Training Accuracy=0.8155311969299309, Testing Accuracy=0.8017070265978563
lambda=0.0

lambda=0.00000,degree=6, Training Accuracy=0.8093948964756605, Testing Accuracy=0.815065403698692
lambda=0.00000,degree=7, Training Accuracy=0.8178172156213968, Testing Accuracy=0.8204781235904375
lambda=0.00000,degree=8, Training Accuracy=0.8307013586002908, Testing Accuracy=0.8272440234551195
lambda=0.00000,degree=9, Training Accuracy=0.8362661051787236, Testing Accuracy=0.8272440234551195
lambda=0.00000,degree=10, Training Accuracy=0.8357146438060862, Testing Accuracy=0.8281461434370772
lambda=0.00000,degree=11, Training Accuracy=0.837970622148694, Testing Accuracy=0.8258908434821831
lambda=0.00000,degree=12, Training Accuracy=0.8381711535569258, Testing Accuracy=0.821831303563374
lambda=0.00000,degree=13, Training Accuracy=0.839073544893969, Testing Accuracy=0.8240866035182679
lambda=0.00000,degree=14, Training Accuracy=0.8378202235925202, Testing Accuracy=0.8249887235002256
lambda=0.00000,degree=15, Training Accuracy=0.8372687622198827, Testing Accuracy=0.8245376635092467
lambda=0

lambda=0.00000,degree=9, Training Accuracy=0.8108988820373991, Testing Accuracy=0.8141632837167343
lambda=0.00000,degree=10, Training Accuracy=0.8117010076703264, Testing Accuracy=0.8123590437528191
lambda=0.00000,degree=11, Training Accuracy=0.8114503434100366, Testing Accuracy=0.8141632837167343
lambda=0.00000,degree=12, Training Accuracy=0.8123026018950218, Testing Accuracy=0.8132611637347767
lambda=0.00000,degree=13, Training Accuracy=0.8127537975635434, Testing Accuracy=0.8141632837167343
lambda=0.00000,degree=14, Training Accuracy=0.8131548603800071, Testing Accuracy=0.8155164636896707
lambda=0.00000,degree=15, Training Accuracy=0.8133052589361809, Testing Accuracy=0.8159675236806495
lambda=0.00000,degree=16, Training Accuracy=0.81430791597734, Testing Accuracy=0.8155164636896707
lambda=0.00000,degree=1, Training Accuracy=0.7263247606156314, Testing Accuracy=0.7388362652232747
lambda=0.00000,degree=2, Training Accuracy=0.7623702812453, Testing Accuracy=0.774018944519621
lambda=0.

In [34]:
OUTPUT_PATH = '../data/submission.csv'

#reassemble the data for the submission
y_pred = put_together(y_res, split_ids_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)