# ML project 1


In [5]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

from proj1_helpers import *
from helpers import *
from implementations import *

from data_pre_processing import preprocess_data

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Loading data

In [6]:
#load the full test set (no sub-sampling); the result is indexed below as
#[0] -> y, [1] -> x, [2] -> ids
te_data = load_csv_data("data/test.csv", sub_sample=False)

In [7]:
#load the full training set (no sub-sampling); the result is indexed below as
#[0] -> y, [1] -> x
tr_data = load_csv_data("data/train.csv", sub_sample=False)

In [8]:
#get testing data: x is stored at index 1 and y at index 0 of the loaded tuple
x_te = te_data[1]
y_te = te_data[0]
#we only need the ids to make the submission file
ids_te = te_data[2]

In [9]:
#get training data: x is stored at index 1 and y at index 0 of the loaded tuple
x = tr_data[1]
y = tr_data[0]

### Preprocess data

In [10]:
#get the training data split by set (xx[jet], yy[jet] are used per jet below)
#and the jet indices, which are needed later to assemble the overall prediction
xx, yy, jet_indices = preprocess_data(x, y, augment=True, clean=True)

In [11]:
#get the testing data split by set and the jet indices
#(same preprocessing options as for the training data)
xx_te, yy_te, jet_indices_te = preprocess_data(x_te, y_te, augment=True, clean=True)

### Regression functions

In [12]:
def cross_validation(y, x, k_indices, k, lambda_):
    """
    Evaluate ridge regression on a single fold of a k-fold split.

    The rows listed in k_indices[k] are held out for evaluation and
    every other row is used to fit the model with penalty lambda_.

    Returns a tuple (accuracy on the fitting rows,
    accuracy on the held-out rows, fitted weights).
    """
    fold_rows = k_indices[k]

    # held-out part: exactly the rows of fold k
    held_out_x, held_out_y = x[fold_rows], y[fold_rows]

    # fitting part: everything that is left once fold k is removed
    fit_x, fit_y = (np.delete(arr, fold_rows, axis=0) for arr in (x, y))

    # ridge_regression returns a pair; only its first element (the weights) is used
    weights, _ = ridge_regression(fit_y, fit_x, lambda_)

    # score the same weights on both parts
    fit_accuracy = accuracy(fit_y, fit_x, weights)
    held_out_accuracy = accuracy(held_out_y, held_out_x, weights)

    return fit_accuracy, held_out_accuracy, weights

In [13]:
def best_model_cross_validation(x, y, seed, degrees_range, k_fold=4, lambdas=None):
    """
    Grid-search the polynomial degree and the ridge penalty with
    k-fold cross validation and return the best combination.

    Every degree in the inclusive range degrees_range[0]..degrees_range[1]
    is combined with every value of lambdas. For each combination the
    held-out accuracy and the weights returned by cross_validation are
    averaged over the k_fold folds. The combination with the highest mean
    held-out accuracy wins; on ties the first one evaluated is kept
    (degrees in increasing order, lambdas in the given order).

    Parameters:
        x: features, expanded with build_poly_matrix(x, degree)
        y: targets, also used to build the fold indices
        seed: forwarded to build_k_indices
        degrees_range: pair (lowest degree, highest degree), both inclusive
        k_fold: number of folds
        lambdas: iterable of ridge penalties to try. It is required even
            though the signature defaults to None.

    Returns:
        (best mean held-out accuracy, fold-averaged weights,
         best degree, best lambda).
        If no mean accuracy is comparable (e.g. all are NaN) nothing is
        selected and (-inf, None, None, None) is returned.

    Raises:
        ValueError: if lambdas is None or empty, or if the degree range
            is empty.
    """
    if lambdas is None:
        raise ValueError("lambdas must be an iterable of ridge penalties, got None")

    # Materialise once: an iterator/generator would otherwise be exhausted
    # after the first degree and silently skip all remaining degrees.
    lambdas = list(lambdas)
    degrees = range(degrees_range[0], degrees_range[1] + 1)

    if not lambdas:
        raise ValueError("lambdas is empty: nothing to search over")
    if len(degrees) == 0:
        raise ValueError("degrees_range is empty: nothing to search over")

    # split data in k fold
    k_indices = build_k_indices(y, k_fold, seed)

    # Start below any reachable accuracy so that a candidate is always
    # selected, even when every mean accuracy is 0.
    best_accuracy = float("-inf")
    best_lambda = None
    best_degree = None
    best_w = None

    for degree in degrees:
        # Add the columns with raised power (once per degree, shared by all lambdas).
        x_poly = build_poly_matrix(x, degree)

        for lambda_ in lambdas:
            # accumulators for the k-fold average
            accuracy_sum = 0
            w_sum = 0

            for k in range(k_fold):
                _, accuracy_te, w_star = \
                    cross_validation(y, x_poly, k_indices, k, lambda_)

                accuracy_sum += accuracy_te
                # We also average the weights
                w_sum = w_sum + w_star

            mean_accuracy = accuracy_sum / k_fold

            # strict comparison: the first best candidate is kept on ties
            if mean_accuracy > best_accuracy:
                best_accuracy = mean_accuracy
                best_lambda = lambda_
                best_degree = degree
                best_w = w_sum / k_fold

    return best_accuracy, best_w, best_degree, best_lambda

### Regression

In [14]:
#arrays to store the best results 
#and parameters for each jet (one entry per jet, in jet order).
accuracy_arr = []
w_arr = []
lambda_arr = []
degree_arr = []

#Here we already give the best parameters
#since we already found them.
#Each entry of degrees is an inclusive [lowest, highest] range; with equal bounds
#and a single lambda per jet, the search below evaluates one candidate per jet.
degrees = [[5,5], [9, 9], [13, 13]]
lambdas =[[1e-08], [8.53167852417e-07], [1.26896100317e-07]]
#We tried different seeds and this was the best one.
#NOTE(review): choosing the seed by its score tunes the fold split to the data,
#so the cross-validation accuracy is likely optimistic - confirm this is intended.
seed = 120

#for each jet
for jet in range(3):
    #print something to get an idea of the progress
    print('jet : ', jet)
    best_accuracy, best_w, best_degree, best_lambda = \
        best_model_cross_validation(xx[jet], yy[jet], seed, degrees[jet], lambdas=lambdas[jet])  
    
    accuracy_arr.append(best_accuracy)
    w_arr.append(best_w)
    lambda_arr.append(best_lambda)
    degree_arr.append(best_degree)
    
print('==> Done.')

jet :  0
jet :  1
jet :  2
==> Done.


### Display the best results for each jet

In [15]:
#This is to print the best parameters and the accuracy for each jet.
#The accuracy is computed on the whole training subset of the jet,
#using the weights stored in w_arr for that jet.
for i in range(3):
    print("jet = ", i)
    #build the polynomial matrix of the best degree for this jet
    tx = build_poly_matrix(xx[i], degree_arr[i])
    #compute the accuracy
    acc1 = accuracy(yy[i], tx, w_arr[i])

    print("\tRidge regression accuracy : ", acc1)
    print("\tbest lambda : ", lambda_arr[i])
    print("\tbest degree : ", degree_arr[i])

jet =  0
	Ridge regression accuracy :  0.844434658153
	best lambda :  1e-08
	best degree :  5
jet =  1
	Ridge regression accuracy :  0.806883833694
	best lambda :  8.53167852417e-07
	best degree :  9
jet =  2
	Ridge regression accuracy :  0.832292571303
	best lambda :  1.26896100317e-07
	best degree :  13


### Compute the overall accuracy

In [16]:
def get_overall_predictions(xx, weight_array, degree_array, jet_indices):
    """
    Build the prediction over all samples from the 3 per-jet models.

    Each of the 3 jet subsets in xx is expanded with build_poly_matrix
    using its own degree from degree_array; the expanded matrices, the
    weights and the jet indices are then handed to arrange_prediction,
    whose result is returned unchanged.
    """
    # one polynomial matrix per jet subset, in jet order
    poly_matrices = [
        build_poly_matrix(xx[jet], degree_array[jet]) for jet in (0, 1, 2)
    ]

    # compute the predictions
    return arrange_prediction(weight_array, poly_matrices, jet_indices)

In [3]:
#get the prediction for the training set, assembled from the 3 per-jet models
y_p = get_overall_predictions(xx, w_arr, degree_arr, jet_indices)
#compute the accuracy with the prediction already given
#(compared against the original, unsplit y)
accuracy(y, y_pred=y_p)

0.829264

### Create the submission

In [20]:
#get the prediction for the testing set, using the weights and degrees
#found on the training set
y_p_te = get_overall_predictions(xx_te, w_arr, degree_arr, jet_indices_te)

In [21]:
#write the test ids and their predictions to the submission file
create_csv_submission(ids_te, y_p_te, "submission.csv")

### Run cell: full pipeline in a single cell (load, preprocess, train, create submission)

In [1]:
#Imports
import numpy as np

from proj1_helpers import *
from helpers import *
from implementations import *
from cross_validation import *

from data_pre_processing import preprocess_data


# Load data
TEST_PATH = "data/test.csv"
TRAIN_PATH = "data/train.csv"

OUTPUT_PATH = "data/submission.csv"

te_data = load_csv_data(TEST_PATH, sub_sample=False)
tr_data = load_csv_data(TRAIN_PATH, sub_sample=False)

#get testing data: x is stored at index 1 and y at index 0 of the loaded tuple
x_te = te_data[1]
y_te = te_data[0]
#we only need the ids to make the submission file
ids_te = te_data[2]

#get training data
x = tr_data[1]
y = tr_data[0]

#get the training data split by set and the jet indices
xx, yy, jet_indices = preprocess_data(x, y, augment=True, clean=True)

#get the testing data split by set and the jet indices
xx_te, yy_te, jet_indices_te = preprocess_data(x_te, y_te, augment=True, clean=True)

#Prepare hyperparameters:
#Here we already give the best parameters
#since we already found them.
#One [lowest, highest] degree pair and one list of lambdas per jet.
degrees = [[5,5], [9, 9], [13, 13]]
lambdas =[[1e-08], [8.53167852417e-07], [1.26896100317e-07]]
#We tried different seeds and this was the best one.
seed = 120

#arrays to store the best results 
#and parameters for each jet (one entry per jet, in jet order).
accuracy_arr = []
w_arr = []
lambda_arr = []
degree_arr = []

#for each jet
for jet in range(3):
    #print something to get an idea of the progress
    print('jet : ', jet)
    best_accuracy, best_w, best_degree, best_lambda = \
        best_model_cross_validation(xx[jet], yy[jet], seed, degrees[jet], lambdas=lambdas[jet])  
    
    accuracy_arr.append(best_accuracy)
    w_arr.append(best_w)
    lambda_arr.append(best_lambda)
    degree_arr.append(best_degree)

print('Training done, creating submission.')
#get the prediction for the testing set
y_p_te = get_overall_predictions(xx_te, w_arr, degree_arr, jet_indices_te)
#write the test ids and their predictions to OUTPUT_PATH
create_csv_submission(ids_te, y_p_te, OUTPUT_PATH)
print('==> Done.')

jet :  0
jet :  1
jet :  2
Training done, creating submission.
==> Done.
