In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

from scripts.proj1_helpers import *
from scripts.preprocess import standardize_with_power_terms

from implementations_cross_validation import least_squares_GD
from implementations_cross_validation import least_squares_SGD
from implementations_cross_validation import least_squares
from implementations_cross_validation import ridge_regression
from implementations_cross_validation import logistic_regression
from implementations_cross_validation import reg_logistic_regression

%load_ext autoreload
%autoreload 2

In [2]:
# Load the training CSV. The loader hands back three values: the labels
# (raw_y), the feature matrix (raw_x) and a third array (ind) that is not
# used again in the cells visible here.
# NOTE(review): `ind` is presumably the per-sample id column -- confirm in
# scripts.proj1_helpers.load_csv_data.
train_csv_path = 'higgs-data/train.csv'
raw_y, raw_x, ind = load_csv_data(train_csv_path)

## * Data Processing
1. Based on PRI_JET_NUM (column index 22), which takes the integer values 0, 1, 2 and 3, we divide the training data into 4 groups. Each of these 4 groups is then divided again into 2 subsets, depending on whether DER_MASS_MMC (column index 0) holds the outlier value -999. This approach gives us 8 subsets to train on, from which we obtain the 8 corresponding models.
<br><br>
<b>We obtain exactly eight models (w00, w01, w10, w11, w20, w21, w30, w31)</b>.
<br><br>Each model name carries the suffix {PRI_JET_NUM}{DER_MASS_MMC_OUTLIER_TAG}, where the tag is 0 for real values and 1 for outliers (-999).
<br>E.g. for PRI_JET_NUM=0 and DER_MASS_MMC!=-999 we get w00.<br>
<br>
2. We standardize the data using power terms.

### Creating Subsets

In [3]:
def create_subsets(x, y, jet_col=22, mass_col=0, missing_value=-999):
    """Split the samples by jet count and by whether the mass is missing.

    For every distinct value found in column ``jet_col`` of ``x`` (visited in
    the sorted order returned by ``np.unique``) two subsets are appended, in
    this order:

    1. rows whose ``mass_col`` entry is different from ``missing_value``;
    2. rows whose ``mass_col`` entry is equal to ``missing_value``.

    With the default arguments this reproduces the original hard-coded
    behaviour: grouping on column 22 (PRI_JET_NUM) and testing column 0
    (DER_MASS_MMC) against -999.

    Parameters
    ----------
    x : 2-D array
        Feature matrix, one row per sample.
    y : 1-D array
        Labels, aligned with the rows of ``x``.
    jet_col : int, optional
        Index of the column used to form the groups (default 22).
    mass_col : int, optional
        Index of the column checked for the missing-value marker (default 0).
    missing_value : number, optional
        Marker that flags a missing entry in ``mass_col`` (default -999).

    Returns
    -------
    sets_x, sets_y : list, list
        Two parallel lists holding ``2 * k`` arrays each, where ``k`` is the
        number of distinct values in column ``jet_col``. A subset may be
        empty when no row matches its condition.
    """
    x = np.asarray(x)
    y = np.asarray(y)

    jet_values = x[:, jet_col]
    # Loop-invariant: compute the missing-mass mask once instead of twice
    # per group.
    mass_missing = x[:, mass_col] == missing_value

    sets_x = []
    sets_y = []
    for jet_value in np.unique(jet_values):
        in_group = jet_values == jet_value
        # Order matters to callers: the "mass present" subset comes first,
        # the "mass missing" subset second.
        for mask in (in_group & ~mass_missing, in_group & mass_missing):
            sets_x.append(x[mask, :])
            sets_y.append(y[mask])

    return sets_x, sets_y

# Build the subsets once; create_subsets returns them grouped by PRI_JET_NUM,
# each group contributing its "DER_MASS_MMC != -999" part first and its
# "DER_MASS_MMC == -999" part second.
sets_x, sets_y = create_subsets(raw_x, raw_y)

# Naming: first digit = PRI_JET_NUM (0, 1, 2 or 3),
#         second digit = 0 when DER_MASS_MMC != -999, 1 when DER_MASS_MMC == -999.
#
#   index 0 -> 00    index 1 -> 01
#   index 2 -> 10    index 3 -> 11
#   index 4 -> 20    index 5 -> 21
#   index 6 -> 30    index 7 -> 31
x00, x01, x10, x11, x20, x21, x30, x31 = (sets_x[k] for k in range(8))
y00, y01, y10, y11, y20, y21, y30, y31 = (sets_y[k] for k in range(8))

### Feature Reduction

In [4]:
# Column indices removed from each subset, listed in the same order as sets_x
# (PRI_JET_NUM 0..3; for each, DER_MASS_MMC != -999 first, then == -999).
#
# Column 22 is PRI_JET_NUM itself, which is constant inside a subset. Column 0
# (DER_MASS_MMC) is additionally removed from the "== -999" subsets, where it
# is constant at -999.
# NOTE(review): the other dropped columns are presumably undefined or
# uninformative for the given jet count -- confirm against the dataset
# documentation.
dropped_features = [
    # PRI_JET_NUM = 0 and DER_MASS_MMC != -999
    [4, 5, 6, 11, 12, 15, 18, 20, 22, 23, 24, 25, 26, 27, 28, 29],
    # PRI_JET_NUM = 0 and DER_MASS_MMC == -999
    [0, 4, 5, 6, 11, 12, 15, 18, 20, 22, 23, 24, 25, 26, 27, 28, 29],
    # PRI_JET_NUM = 1 and DER_MASS_MMC != -999
    [4, 5, 6, 11, 12, 15, 18, 20, 22, 26, 27, 28],
    # PRI_JET_NUM = 1 and DER_MASS_MMC == -999
    [0, 4, 5, 6, 11, 12, 15, 18, 20, 22, 26, 27, 28],
    # PRI_JET_NUM = 2 and DER_MASS_MMC != -999
    [11, 15, 18, 20, 22, 28],
    # PRI_JET_NUM = 2 and DER_MASS_MMC == -999
    [0, 11, 15, 18, 20, 22, 28],
    # PRI_JET_NUM = 3 and DER_MASS_MMC != -999
    [11, 15, 18, 20, 22, 28],
    # PRI_JET_NUM = 3 and DER_MASS_MMC == -999
    [0, 11, 15, 18, 20, 22, 28],
]

# Always start from the untouched subsets stored in sets_x rather than from
# x00 ... x31 themselves. The previous form (x00 = np.delete(x00, ...)) consumed
# its own output, so running this cell a second time deleted further, wrong
# columns (or raised IndexError). Reading from sets_x makes the cell idempotent.
x00, x01, x10, x11, x20, x21, x30, x31 = (
    np.delete(subset, columns, axis=1)
    for subset, columns in zip(sets_x, dropped_features)
)

### Data Standardization Using Power Terms

In [5]:
# Run every reduced subset through standardize_with_power_terms with the same
# settings (positional arguments 2 and True, plus with_sqrt=True).
# NOTE(review): 2 is presumably the highest power added and True presumably
# toggles an extra option -- check scripts.preprocess for the exact meaning.
(standardize_x00, standardize_x01,
 standardize_x10, standardize_x11,
 standardize_x20, standardize_x21,
 standardize_x30, standardize_x31) = (
    standardize_with_power_terms(subset, 2, True, with_sqrt=True)
    for subset in (x00, x01, x10, x11, x20, x21, x30, x31)
)


# Main Implementations

In [6]:
# Final datasets: the standardized feature matrices and their labels, kept in
# two parallel lists (00, 01, 10, 11, 20, 21, 30, 31) so the training cells can
# iterate over them with zip().
standardize_x = [
    standardize_x00, standardize_x01,
    standardize_x10, standardize_x11,
    standardize_x20, standardize_x21,
    standardize_x30, standardize_x31,
]
sets_y = [
    y00, y01,
    y10, y11,
    y20, y21,
    y30, y31,
]

# Settings shared by the training cells below.
# max_iters and gamma are passed to the iterative methods (GD, SGD and both
# logistic regressions); lambda_ is passed to ridge_regression and
# reg_logistic_regression.
# NOTE(review): gamma is presumably the step size and lambda_ the
# regularization strength -- confirm in implementations_cross_validation.
max_iters = 5000
gamma = 2e-6
lambda_ = 1e-6

## 1. Least Squares Using Gradient Descent

In [7]:
# Train least squares with gradient descent on each subset, starting from an
# all-zero weight vector, and print whatever accuracies the helper returns,
# one per line.
for x, y in zip(standardize_x, sets_y):
    initial_w = np.zeros((x.shape[1],))
    accuracies = least_squares_GD(y, x, initial_w, max_iters, gamma)
    report = "\n".join(str(accuracy) for accuracy in accuracies)
    print(report)
    print("\n")

0.724881420247
0.802051831719
0.675002143408
0.767654059773
0.699243047209
0.777100271003
0.678445400493
0.81990521327


## 2. Least Squares Using Stochastic Gradient Descent

In [8]:
# Train least squares with stochastic gradient descent on each subset,
# starting from an all-zero weight vector, and print the returned accuracies,
# one per line.
# NOTE(review): no random seed is set anywhere in the visible notebook; if
# least_squares_SGD (or its cross-validation split) draws random numbers, the
# printed values will differ between runs -- confirm whether the helper seeds
# internally.
for x, y in zip(standardize_x, sets_y):
    initial_w = np.zeros((x.shape[1],))
    accuracies = least_squares_SGD(y, x, initial_w, max_iters, gamma)
    report = "\n".join(str(accuracy) for accuracy in accuracies)
    print(report)
    print("\n")

0.723770158558
0.797266776404
0.673430310651
0.767389579476
0.705146857275
0.774051490515
0.675400009668
0.811103588355


## 3. Least Squares Using Normal Equations

In [7]:
# Fit least squares via the normal equations on each subset and print the
# returned accuracies, one per line, followed by a blank separator.
for x, y in zip(standardize_x, sets_y):
    accuracies = least_squares(y, x)
    report = "\n".join(str(accuracy) for accuracy in accuracies)
    print(report)
    print("\n")

0.798617698875
0.800108415774
0.797262501694
0.802412250983
0.795771784795
0.801057053801
0.799566336902
0.805800243935
0.791028594661
0.8040384876


0.947932618683
0.948698315467
0.93989280245
0.946018376723
0.946784073507
0.948315467075
0.950995405819
0.950612557427
0.946784073507
0.952526799387


0.761503286653
0.759359817091
0.75421549014
0.761646184624
0.762503572449
0.753929694198
0.761789082595
0.760503000857
0.753643898257
0.761074592741


0.916666666667
0.911375661376
0.920634920635
0.912698412698
0.915343915344
0.931216931217
0.916666666667
0.914021164021
0.915343915344
0.915343915344


0.800506115563
0.790805567271
0.783002952341
0.800084352594
0.790383804302
0.795444959933
0.778996204133
0.793125263602
0.801771404471
0.784268241248


0.915254237288
0.905084745763
0.891525423729
0.898305084746
0.894915254237
0.871186440678
0.898305084746
0.932203389831
0.898305084746
0.918644067797


0.787717601547
0.79835589942
0.78916827853
0.791102514507
0.773694390716
0.774661508704
0.76

## 4. Ridge Regression

In [8]:
# Fit ridge regression (with the shared lambda_) on each subset and print the
# returned accuracies, one per line, followed by a blank separator.
for x, y in zip(standardize_x, sets_y):
    accuracies = ridge_regression(y, x, lambda_)
    report = "\n".join(str(accuracy) for accuracy in accuracies)
    print(report)
    print("\n")

0.798888738311
0.800243935493
0.797262501694
0.802818810137
0.795229705922
0.801057053801
0.798888738311
0.805935763654
0.790622035506
0.804174007318


0.947549770291
0.948698315467
0.93989280245
0.946401225115
0.946784073507
0.948315467075
0.950995405819
0.950612557427
0.946401225115
0.952526799387


0.762074878537
0.758502429266
0.752643612461
0.761217490712
0.76264647042
0.753929694198
0.761646184624
0.759931408974
0.753501000286
0.760645898828


0.916666666667
0.912698412698
0.920634920635
0.912698412698
0.915343915344
0.931216931217
0.916666666667
0.912698412698
0.915343915344
0.914021164021


0.800927878532
0.791016448756
0.783213833825
0.799873471109
0.789540278364
0.793757908056
0.778785322649
0.79396878954
0.800084352594
0.78342471531


0.918644067797
0.901694915254
0.891525423729
0.898305084746
0.894915254237
0.877966101695
0.898305084746
0.925423728814
0.908474576271
0.918644067797


0.790618955513
0.79835589942
0.790618955513
0.791586073501
0.771276595745
0.77417794971
0.77

## 5. Logistic Regression using Gradient Descent

In [34]:
# Train logistic regression with gradient descent on each subset, starting
# from an all-zero weight vector, and print the returned accuracies, one per
# line.
for x, y in zip(standardize_x, sets_y):
    initial_w = np.zeros((x.shape[1],))
    accuracies = logistic_regression(y, x, initial_w, max_iters, gamma)
    report = "\n".join(str(accuracy) for accuracy in accuracies)
    print(report)
    print("\n")

0.8060577314
0.94839796348
0.788331285188
0.918011108172
0.82054525903
0.906165311653
0.809252187364
0.93432633717


## 6. Regularized Logistic Regression using Gradient Descent

In [9]:
# Train regularized logistic regression with gradient descent on each subset
# (same max_iters and gamma as above, plus lambda_), starting from an all-zero
# weight vector, and print the returned accuracies, one per line.
for x, y in zip(standardize_x, sets_y):
    initial_w = np.zeros((x.shape[1],))
    accuracies = reg_logistic_regression(y, x, initial_w, max_iters, gamma, lambda_)
    report = "\n".join(str(accuracy) for accuracy in accuracies)
    print(report)
    print("\n")

0.8060577314
0.94839796348
0.788345574576
0.918011108172
0.82054525903
0.906165311653
0.809300526901
0.93432633717
