In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

from scripts.proj1_helpers import *
from scripts.preprocess import standardize_with_power_terms

from implementations import least_squares_GD
from implementations import least_squares_SGD
from implementations import least_squares
from implementations import ridge_regression
from implementations import logistic_regression
from implementations import reg_logistic_regression

%load_ext autoreload
%autoreload 2

In [2]:
# Load the training set. load_csv_data returns three values, unpacked here as
# the labels (raw_y), the feature matrix (raw_x) and a third value (ind) that
# is later handed to create_csv_submission.
# NOTE(review): `ind` is rebound when the test set is loaded further down, so
# re-running this cell afterwards changes what the submission step sees.
raw_y, raw_x, ind = load_csv_data('higgs-data/train.csv')

## * Data Processing
1. Based on PRI_JET_NUM (feature 22), which takes integer values in the inclusive range [0, 3], we divide the training data into 4 groups. We then divide each of these groups into 2 subsets according to whether DER_MASS_MMC (feature 0) holds the outlier value (-999). This approach gives us 8 subsets, on which we train the 8 corresponding models.
<br><br>
<b>We obtain exactly eight models (w00, w01, w10, w11, w20, w21, w30, w31)</b>.
<br><br>Each model name carries the suffix {PRI_JET_NUM}{DER_MASS_MMC_OUTLIERS_TAG}, where the tag is 0 to indicate real values and 1 to indicate outliers (-999).
<br>E.g. for PRI_JET_NUM=0 and DER_MASS_MMC!=-999 we get w00.<br>
<br>
2. We standardize the data using power terms.

### Creating Subsets

In [3]:
def create_subsets(x, y):
    """Split the samples by the value in column 22 and by whether column 0 is -999.

    For every distinct value found in column 22 of ``x`` (visited in the
    sorted order returned by ``np.unique``) two subsets are emitted, in this
    order: first the rows whose column 0 differs from -999, then the rows
    whose column 0 equals -999.

    Args:
        x: 2-D array of features; columns 0 and 22 drive the split.
        y: array indexable by a boolean row mask, aligned with the rows of ``x``.

    Returns:
        A pair ``(sets_x, sets_y)`` of lists holding the row-filtered pieces
        of ``x`` and ``y``; both lists have two entries per distinct value of
        column 22.
    """
    group_column = x[:, 22]
    has_value = x[:, 0] != -999

    parts_x, parts_y = [], []
    for group_value in np.unique(group_column):
        in_group = group_column == group_value
        # Rows with a real value in column 0 come first, the -999 rows second.
        for row_mask in (in_group & has_value, in_group & ~has_value):
            parts_x.append(x[row_mask, :])
            parts_y.append(y[row_mask])

    return parts_x, parts_y

sets_x, sets_y = create_subsets(raw_x, raw_y)

# Name each subset as {PRI_JET_NUM}{tag}: tag 0 means DER_MASS_MMC != -999,
# tag 1 means DER_MASS_MMC == -999. create_subsets emits them in this order.

# PRI_JET_NUM = 0
x00, y00 = sets_x[0], sets_y[0]   # DER_MASS_MMC != -999
x01, y01 = sets_x[1], sets_y[1]   # DER_MASS_MMC == -999

# PRI_JET_NUM = 1
x10, y10 = sets_x[2], sets_y[2]   # DER_MASS_MMC != -999
x11, y11 = sets_x[3], sets_y[3]   # DER_MASS_MMC == -999

# PRI_JET_NUM = 2
x20, y20 = sets_x[4], sets_y[4]   # DER_MASS_MMC != -999
x21, y21 = sets_x[5], sets_y[5]   # DER_MASS_MMC == -999

# PRI_JET_NUM = 3
x30, y30 = sets_x[6], sets_y[6]   # DER_MASS_MMC != -999
x31, y31 = sets_x[7], sets_y[7]   # DER_MASS_MMC == -999

### Features Reduction

In [4]:
# Drop, per subset, the feature columns that are not used for that subset.
#
# Every reduced matrix is built from the untouched subsets in `sets_x`
# (sets_x[2*jet + tag]) instead of from x00..x31 themselves. np.delete returns
# a new array, so the cell is idempotent: re-running it no longer deletes a
# second round of columns from an already reduced matrix.
#
# Column 22 (PRI_JET_NUM) is constant inside every subset by construction, and
# column 0 (DER_MASS_MMC) is constant (-999) inside the "== -999" subsets.
# NOTE(review): the remaining dropped columns are presumably undefined or
# constant for the given jet count - confirm against the data description.

# if PRI_JET_NUM = 0 and DER_MASS_MMC != -999
# We drop features 4, 5, 6, 11, 12, 15, 18, 20, 22, 23, 24, 25, 26, 27, 28, 29
x00 = np.delete(sets_x[0], [4, 5, 6, 11, 12, 15, 18, 20, 22, 23, 24, 25, 26, 27, 28, 29], 1)

# if PRI_JET_NUM = 0 and DER_MASS_MMC == -999
# We drop features 0, 4, 5, 6, 11, 12, 15, 18, 20, 22, 23, 24, 25, 26, 27, 28, 29
x01 = np.delete(sets_x[1], [0, 4, 5, 6, 11, 12, 15, 18, 20, 22, 23, 24, 25, 26, 27, 28, 29], 1)

# if PRI_JET_NUM = 1 and DER_MASS_MMC != -999
# We drop features 4, 5, 6, 11, 12, 15, 18, 20, 22, 26, 27, 28
x10 = np.delete(sets_x[2], [4, 5, 6, 11, 12, 15, 18, 20, 22, 26, 27, 28], 1)

# if PRI_JET_NUM = 1 and DER_MASS_MMC == -999
# We drop features 0, 4, 5, 6, 11, 12, 15, 18, 20, 22, 26, 27, 28
x11 = np.delete(sets_x[3], [0, 4, 5, 6, 11, 12, 15, 18, 20, 22, 26, 27, 28], 1)

# if PRI_JET_NUM == 2 and DER_MASS_MMC != -999
# We drop features 11, 15, 18, 20, 22, 28
x20 = np.delete(sets_x[4], [11, 15, 18, 20, 22, 28], 1)

# if PRI_JET_NUM == 2 and DER_MASS_MMC == -999
# We drop features 0, 11, 15, 18, 20, 22, 28
x21 = np.delete(sets_x[5], [0, 11, 15, 18, 20, 22, 28], 1)

# if PRI_JET_NUM == 3 and DER_MASS_MMC != -999
# We drop features 11, 15, 18, 20, 22, 28
x30 = np.delete(sets_x[6], [11, 15, 18, 20, 22, 28], 1)

# if PRI_JET_NUM == 3 and DER_MASS_MMC == -999
# We drop features 0, 11, 15, 18, 20, 22, 28
x31 = np.delete(sets_x[7], [0, 11, 15, 18, 20, 22, 28], 1)

### Data Standardization Using Power Terms

In [5]:
# Run every reduced subset through standardize_with_power_terms with the same
# settings, in subset order, and bind the results to the standardize_x* names.
# NOTE(review): the positional arguments 2 and True are presumably the
# polynomial degree and a flag of the helper - confirm in scripts.preprocess.
(standardize_x00, standardize_x01,
 standardize_x10, standardize_x11,
 standardize_x20, standardize_x21,
 standardize_x30, standardize_x31) = [
    standardize_with_power_terms(reduced_subset, 2, True, with_sqrt=True)
    for reduced_subset in (x00, x01, x10, x11, x20, x21, x30, x31)
]


# Main Implementations

In [6]:
# Final datasets: the i-th feature matrix pairs with the i-th label vector.
standardize_x = [
    standardize_x00, standardize_x01,
    standardize_x10, standardize_x11,
    standardize_x20, standardize_x21,
    standardize_x30, standardize_x31,
]
sets_y = [
    y00, y01,
    y10, y11,
    y20, y21,
    y30, y31,
]

# Hyperparameters shared by the training cells below.
max_iters = 5000   # passed as max_iters to the iterative solvers
gamma     = 2e-6   # passed as gamma to the iterative solvers
lambda_   = 1e-6   # passed as lambda_ to ridge_regression

## 1. Least Squares Using Gradient Descent

In [7]:
# Fit one model per subset with least_squares_GD, starting from an all-zero
# weight vector, and collect the fitted weights in ws_1 (same order as
# standardize_x / sets_y).
ws_1 = []
for x, y in zip(standardize_x, sets_y):
    # One initial weight per column of the standardized feature matrix.
    initial_w = np.zeros(x.shape[1])
    w, loss   = least_squares_GD(y, x, initial_w, max_iters, gamma)
    # Fraction of predictions that match the labels. This is measured on the
    # same rows used for fitting, so it is not a held-out estimate.
    print(np.mean(predict_labels(w, x) == y))
    ws_1.append(w)
    

0.724881420247
0.802051831719
0.675002143408
0.767654059773
0.699243047209
0.777100271003
0.678445400493
0.81990521327


## 2. Least Squares Using Stochastic Gradient Descent

In [8]:
# Fit one model per subset with least_squares_SGD, starting from an all-zero
# weight vector, and collect the fitted weights in ws_2 (same order as
# standardize_x / sets_y).
#
# Seed NumPy's global generator first so the stochastic updates, and therefore
# the printed accuracies, can be reproduced on a fresh run.
# NOTE(review): assumes least_squares_SGD draws its samples from np.random's
# global state - confirm in implementations.py.
np.random.seed(1)

ws_2 = []
for x, y in zip(standardize_x, sets_y):
    # One initial weight per column of the standardized feature matrix.
    initial_w = np.zeros(x.shape[1])
    w, loss   = least_squares_SGD(y, x, initial_w, max_iters, gamma)
    # Fraction of predictions that match the labels. This is measured on the
    # same rows used for fitting, so it is not a held-out estimate.
    print(np.mean(predict_labels(w, x) == y))
    ws_2.append(w)
    

0.723770158558
0.797266776404
0.673430310651
0.767389579476
0.705146857275
0.774051490515
0.675400009668
0.811103588355


## 3. Least Squares Using Normal Equations

In [7]:
# Fit one model per subset with least_squares (no iteration parameters are
# passed) and collect the fitted weights in ws_3 (same order as
# standardize_x / sets_y).
ws_3 = []
for x, y in zip(standardize_x, sets_y):
    w, loss = least_squares(y, x)
    # Fraction of predictions that match the labels. This is measured on the
    # same rows used for fitting, so it is not a held-out estimate.
    print(np.mean(predict_labels(w, x) == y))
    ws_3.append(w)
    

0.799756064507
0.948512804808
0.759581035123
0.918407828617
0.791342484239
0.907859078591
0.785227437521
0.938388625592


## 4. Ridge Regression

In [8]:
# Fit one model per subset with ridge_regression and collect the fitted
# weights in ws_4 (same order as standardize_x / sets_y).
# NOTE(review): lambda_ is read from notebook state at run time; the
# regularized logistic regression cell rebinds it to 0.01, so the result of
# this cell depends on the order in which the cells were executed.
ws_4 = []
for x, y in zip(standardize_x, sets_y):
    w, loss = ridge_regression(y, x, lambda_)
    # Fraction of predictions that match the labels. This is measured on the
    # same rows used for fitting, so it is not a held-out estimate.
    print(np.mean(predict_labels(w, x) == y))
    ws_4.append(w)
    

0.799796720423
0.948551085251
0.759266668572
0.918275588469
0.791131633879
0.907520325203
0.785904191038
0.939065673663


## 5. Logistic Regression using Gradient Descent

In [34]:
# Fit one model per subset with logistic_regression, starting from an
# all-zero weight vector, and collect the fitted weights in ws_5 (same order
# as standardize_x / sets_y).
ws_5 = []
for x, y in zip(standardize_x, sets_y):
    # Map y to (y+1)/2 for training; assuming the labels are -1/+1 this
    # yields targets of either zero or one.
    mapped_y = (y+1)/2
    
    # One initial weight per column of the standardized feature matrix.
    initial_w = np.zeros(x.shape[1])
    w, loss   = logistic_regression(mapped_y, x, initial_w, max_iters, gamma)
    
    # Accuracy is measured against the original labels y, not mapped_y, and on
    # the same rows used for fitting (not a held-out estimate).
    print(np.mean(predict_labels(w, x) == y))
    ws_5.append(w)
    

0.8060577314
0.94839796348
0.788331285188
0.918011108172
0.82054525903
0.906165311653
0.809252187364
0.93432633717


## 6. Regularized Logistic Regression using Gradient Descent

In [9]:
# Fit one model per subset with reg_logistic_regression, starting from an
# all-zero weight vector, and collect the fitted weights in ws_6 (same order
# as standardize_x / sets_y).
ws_6 = []

# Regularization strength for this cell only. It gets its own name so that it
# does not overwrite the shared `lambda_` read by the ridge regression cell;
# otherwise re-running that cell after this one would silently use 0.01.
lambda_reg_logistic = 0.01

for x, y in zip(standardize_x, sets_y):
    # Map y to (y+1)/2 for training; assuming the labels are -1/+1 this
    # yields targets of either zero or one.
    mapped_y = (y+1)/2

    # One initial weight per column of the standardized feature matrix.
    initial_w = np.zeros(x.shape[1])
    w, loss   = reg_logistic_regression(mapped_y, x, initial_w, max_iters, gamma, lambda_reg_logistic)

    # Accuracy is measured against the original labels y, not mapped_y, and on
    # the same rows used for fitting (not a held-out estimate).
    print(np.mean(predict_labels(w, x) == y))
    ws_6.append(w)
    

0.8060577314
0.94839796348
0.788345574576
0.918011108172
0.82054525903
0.906165311653
0.809300526901
0.93432633717


# Testing and Making a Submission to Kaggle

In [35]:
# Load the test set. This rebinds `ind` (previously bound when the training
# set was loaded); make_submission_file below reads `ind` from notebook state
# when it writes the CSV, so this cell must run before the submission call.
test_y, test_x, ind = load_csv_data('higgs-data/test.csv')

def make_submission_file(x, y, w, filename="prediction.csv", ids=None):
    """Predict labels subset by subset and write them to a submission CSV.

    The rows of ``x`` are split the same way as the training data: by
    PRI_JET_NUM (column 22, values 0..3) and by whether DER_MASS_MMC
    (column 0) equals -999. Each subset has its feature columns reduced with
    the same column lists as the feature-reduction cell, is passed through
    ``standardize_with_power_terms`` and is scored with its own weights.

    Args:
        x: 2-D array of raw (unreduced) features.
        y: only its length is used, to size the prediction vector.
        w: sequence of at least eight weight vectors ordered
            w00, w01, w10, w11, w20, w21, w30, w31.
        filename: path handed to ``create_csv_submission``.
        ids: identifiers handed to ``create_csv_submission``. When omitted,
            the notebook-level ``ind`` is used, as before.

    Raises:
        IndexError: if ``w`` holds fewer than eight entries.

    Rows that match none of the eight subsets keep the initial prediction 1.
    """
    if ids is None:
        # Backward-compatible default: the ids loaded next to the test data.
        ids = ind

    # Columns dropped per PRI_JET_NUM when DER_MASS_MMC is a real value; the
    # "== -999" subsets additionally drop column 0.
    dropped_by_jet = (
        [4, 5, 6, 11, 12, 15, 18, 20, 22, 23, 24, 25, 26, 27, 28, 29],
        [4, 5, 6, 11, 12, 15, 18, 20, 22, 26, 27, 28],
        [11, 15, 18, 20, 22, 28],
        [11, 15, 18, 20, 22, 28],
    )

    # Fail up front, as the original unpacking did, if a model is missing.
    weights = [w[i] for i in range(8)]

    y_pred = np.ones(len(y))
    mass_missing = x[:, 0] == -999

    for jet_num, dropped in enumerate(dropped_by_jet):
        in_jet = x[:, 22] == jet_num
        variants = (
            (in_jet & ~mass_missing, dropped),        # tag 0: real mass value
            (in_jet & mass_missing, [0] + dropped),   # tag 1: mass is -999
        )
        for tag, (mask, columns) in enumerate(variants):
            if not mask.any():
                # Nothing to predict; avoids standardizing an empty matrix.
                continue
            x_tmp = np.delete(x[mask, :], columns, 1)
            # NOTE(review): the subset is standardized by calling the helper on
            # these rows themselves; whether that matches the scaling seen at
            # training time cannot be determined from this notebook.
            stand_x = standardize_with_power_terms(x_tmp, 2, True, with_sqrt=True)
            y_pred[mask] = predict_labels(weights[2 * jet_num + tag], stand_x)

    create_csv_submission(ids, y_pred, filename)

### 5. Testing Logistic Regression

In [36]:
# Write the predictions of the logistic regression models (ws_5) for the test
# set to final-submission.csv. test_y is only used for its length.
make_submission_file(test_x, test_y, ws_5, "final-submission.csv")