In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from functools import partial
import math
%load_ext autoreload
%autoreload 2

# Load the training data into feature matrix, class labels, and event ids:

In [2]:
from proj1_helpers import *
from helpers import *

DATA_TRAIN_PATH = '../data/train.csv' 
y, X, ids = load_csv_data(DATA_TRAIN_PATH)

# Feature processing

We try to obtain column names of data, and its name -> index mappings.

In [3]:
str_x = np.genfromtxt(DATA_TRAIN_PATH, delimiter=",", skip_header=0, dtype=str)
columns = str_x[0, 2:]
columns

array(['DER_mass_MMC', 'DER_mass_transverse_met_lep', 'DER_mass_vis',
       'DER_pt_h', 'DER_deltaeta_jet_jet', 'DER_mass_jet_jet',
       'DER_prodeta_jet_jet', 'DER_deltar_tau_lep', 'DER_pt_tot',
       'DER_sum_pt', 'DER_pt_ratio_lep_tau', 'DER_met_phi_centrality',
       'DER_lep_eta_centrality', 'PRI_tau_pt', 'PRI_tau_eta',
       'PRI_tau_phi', 'PRI_lep_pt', 'PRI_lep_eta', 'PRI_lep_phi',
       'PRI_met', 'PRI_met_phi', 'PRI_met_sumet', 'PRI_jet_num',
       'PRI_jet_leading_pt', 'PRI_jet_leading_eta', 'PRI_jet_leading_phi',
       'PRI_jet_subleading_pt', 'PRI_jet_subleading_eta',
       'PRI_jet_subleading_phi', 'PRI_jet_all_pt'], 
      dtype='<U27')

In [4]:
col_idx = {}
for i in range(len(columns)):
    col_idx[columns[i]] = i
print(col_idx)

{'PRI_lep_pt': 16, 'PRI_tau_eta': 14, 'DER_deltaeta_jet_jet': 4, 'DER_lep_eta_centrality': 12, 'PRI_jet_all_pt': 29, 'PRI_tau_pt': 13, 'PRI_jet_leading_phi': 25, 'DER_pt_tot': 8, 'PRI_jet_subleading_pt': 26, 'PRI_jet_leading_eta': 24, 'PRI_tau_phi': 15, 'DER_pt_h': 3, 'PRI_lep_eta': 17, 'PRI_met_phi': 20, 'DER_deltar_tau_lep': 7, 'DER_mass_transverse_met_lep': 1, 'DER_met_phi_centrality': 11, 'DER_mass_jet_jet': 5, 'PRI_jet_num': 22, 'PRI_met': 19, 'DER_sum_pt': 9, 'DER_prodeta_jet_jet': 6, 'PRI_lep_phi': 18, 'PRI_jet_subleading_eta': 27, 'DER_pt_ratio_lep_tau': 10, 'PRI_jet_subleading_phi': 28, 'DER_mass_vis': 2, 'PRI_jet_leading_pt': 23, 'DER_mass_MMC': 0, 'PRI_met_sumet': 21}


## Clean data

We throw away data whose DER_mass_MMC is -999.

In [5]:
def valid_mass_indices(X):
    """returns a list of row indices where DER_mass_MMC field has valid value"""
    return np.where(X[:, col_idx['DER_mass_MMC']] != -999)[0]

In [6]:
def reset_invalid_val(X):
    """replaces -999 with 0, as 0 will have less bad effects on the training result"""
    for col in range(X.shape[1]):
        invalid_idx = np.where(X[:, col] == -999)[0]
        X[invalid_idx, col] = 0
    return X

In [7]:
def clean_data(y, tx):
    """clean data"""

    # throw away rows with invalid mass value
    idx = valid_mass_indices(tx)
    tx = tx[idx, :]
    y = y[idx]
    
    # replace -999 with 0
    tx = reset_invalid_val(tx)
    
    # tx = delete_derived_features(tx)
    
    return y, tx

In [8]:
y, tX = clean_data(y, X)

In [9]:
tX.shape

(211886, 30)

In [10]:
y.shape

(211886,)

# General gradient descent

In [11]:
# general gradient descent
def gradient_descent(y, tx, initial_x, gamma, max_iters,
                     compute_gradient, compute_loss):
    """
    General gradient descent algorithm
    
    Params:
        y (array): training values
        tx (array): each row contains the data associated to a sample.
                    each column contains all the sample value for a feature
        initial_w (array): the initial weight vector
        gamma (float): step size
        max_iters (int): maximum number of iterations to run
        compute_gradient (func): a function that computes gradient
        compute_less (func): a function that computes loss

    Returns:
        w (array): the last weight vector
        loss (float): the last loss value
    """
    losses = []
    w = initial_w
    
    for n_iter in range(max_iters):
        gradient = compute_gradient(y, tx, w)
        loss = compute_loss(y, tx, w)
        w = w - gamma * gradient

        # if (n_iter % 100 == 0):
        #    print(loss)
        
        losses.append(loss)
    
    return w, losses[-1]

# General stochastic gradient descent

In [12]:
def stochastic_gradient_descent(y, tx, initial_w, batch_size, gamma, max_iters, seed,
                                compute_gradient, compute_loss):
    """
    General stochastic gradient descent algorithm
    
    Params:
        y (array): training values
        tx (array): each row contains the data associated to a sample.
                    each column contains all the sample value for a feature
        batch_size (int): batch size
        initial_w (array): the initial weight vector
        gamma (float): step size
        max_iters (int): maximum number of iterations to run
        seed (int): random number generation seed
        compute_gradient (func): a function that computes gradient
        compute_less (func): a function that computes loss

    Returns:
        w (array): the last weight vector
        loss (float): the last loss value
    """
    
    losses = []
    w = initial_w
    
    num_batches = math.floor(y.shape[0] / batch_size) 
    batches = batch_iter(y, tx, seed, batch_size, num_batches)

    for n_iter in range(max_iters):
        batch_y, batch_tx = next(batches)
        gradient = compute_gradient(batch_y, batch_tx, w)
        loss = compute_loss(y, tx, w)

        # if (n_iter % 100 == 0):
        #    print(loss)
        
        w = w - gamma * gradient
        
        losses.append(loss)
        
    return w, losses[-1]

## Linear regression using gradient descent 

In [13]:
# Cost functions
def calculate_mse(e):
    """Calculate the mse for vector e."""
    return 1/2*np.mean(e**2)

def calculate_mae(e):
    """Calculate the mae for vector e."""
    return np.mean(np.abs(e))

def compute_loss_mse(y, tx, w):
    """Calculate the loss using mse """
    e = y - tx.dot(w)
    return calculate_mse(e)

def compute_gradient_mse(y, tx, w):
    """ compute the gradient associated to the MSE cost function"""
    e = y - (tx @ w)
    return -1/y.shape[0] * (tx.T @ e)

In [14]:
def least_squares_GD(y, tx, initial_w, gamma, max_iters):
    return gradient_descent(y, tx, initial_w, gamma, max_iters,
                            compute_gradient_mse, compute_loss_mse)

In [15]:
max_iters = 1000
gamma = 1e-6
initial_w = np.zeros(tX.shape[1])

w_linear_gd, loss_linear_gd = least_squares_GD(y, tX, initial_w, gamma, max_iters)

In [16]:
loss_linear_gd

0.40265042191731415

## Linear regression using stochastic gradient descent 

In [17]:
def least_squares_SGD(y, tx, initial_w, gamma, max_iters):
    batch_size = y.shape[0]//2
    seed = 3
    
    return stochastic_gradient_descent(y, tx, initial_w, batch_size, gamma, max_iters,
                                       seed, compute_gradient_mse, compute_loss_mse)

In [18]:
max_iters = 5000
gamma = 4e-7
initial_w = np.zeros(tX.shape[1])

w_linear_sgd, loss_linear_sgd = least_squares_SGD(y, tX, initial_w, gamma, max_iters)

In [19]:
loss_linear_sgd

0.3985179351390859

## Least squares regression using normal equations

In [20]:
def least_squares(y, tx):
    w = (np.linalg.inv(tx.T @ tx) @ tx.T @ y)
    loss = compute_loss_mse(y, tx, w)
    return w, loss

In [21]:
w_ls, loss_ls = least_squares(y, tX)

In [22]:
loss_ls

0.37717616465985404

In [23]:
w_ls

array([  2.96490590e-03,  -8.44190753e-03,  -7.26535537e-03,
         1.07588572e-03,  -1.59616405e-03,   3.99733431e-04,
        -2.96780330e-03,   1.36877265e-01,  -1.59044104e-03,
        -4.95865902e+00,  -2.98661958e-01,   7.19197758e-02,
         3.61840394e-01,   4.96119931e+00,  -1.08333759e-03,
        -9.52686303e-04,   4.96974908e+00,  -6.56814462e-04,
         1.26463254e-03,  -3.14363964e-04,   4.87948602e-04,
        -6.63146763e-04,  -7.43474417e-02,   4.79290812e-04,
         9.72092505e-04,   1.03122590e-03,   2.23111208e-04,
         1.61676265e-03,  -2.60772556e-03,   4.95769084e+00])

## Ridge regression using normal equations

In [24]:
def ridge_regression(y, tx, lambda_):
    w = np.linalg.inv(tx.T @ tx + lambda_ * np.identity(tx.shape[1])) @ tx.T @ y
    loss = compute_loss_mse(y, tx, w)
    return w, loss

In [25]:
lamb = 23
w_ridge, loss_ridge = ridge_regression(y, tX, lamb)

In [26]:
loss_ridge

0.37717992915741932

## Logistic regression using gradient descent

In [27]:
def sigmoid(t):
    """apply sigmoid function on t."""
    return 1 / (1 + np.exp(-t))

In [28]:
def compute_loss_log(y, tx, w):
    """compute cost by negative log likelihood."""
    xw = tx @ w
    return np.sum(np.log(1 + np.exp(xw)) - y * xw)

In [29]:
def compute_gradient_log(y, tx, w):
    """compute the gradient of loss."""
    xw = tx @ w
    return tx.T @ (np.apply_along_axis(sigmoid, 0, xw) - y)

In [30]:
def logistic_regression(y, tx, initial_w, gamma, max_iters):
    """logistic regression algorithm using gradient descent"""
    return gradient_descent(y, tx, initial_w, gamma, max_iters,
                            compute_gradient_log, compute_loss_log)

In [31]:
max_iters = 800
gamma = 3e-13
initial_w = np.zeros(tX.shape[1])

w_log, loss_log = logistic_regression(y, tX, initial_w, gamma, max_iters)

In [32]:
loss_log

-60469.70129449368

## Regularized  logistic  regression  using  gradient  descent

In [33]:
def compute_loss_rlog(y, tx, w, lambda_):
    return compute_loss_log(y, tx, w) + lambda_ * w.T @ w

In [34]:
def compute_gradient_rlog(y, tx, w, lambda_):
    return compute_gradient_log(y, tx, w) + 2 * lambda_ * w

In [35]:
def reg_logistic_regression(y, tx, lambda_, initial_w, gamma, max_iters):
    """regularized logistic regression algorithm using gradient descent"""
    compute_gradient = partial(compute_gradient_rlog, lambda_ = lambda_)
    compute_loss = partial(compute_loss_rlog, lambda_ = lambda_)
    return gradient_descent(y, tx, initial_w, gamma, max_iters,
                            compute_gradient, compute_loss)

In [36]:
max_iters = 1000
gamma = 1e-13
# initial_w = np.random.rand(tX.shape[1])#
initial_w = np.zeros(tX.shape[1])

lambdas = np.logspace(-4, 2, 30)

w_rlog, loss_rlog = reg_logistic_regression(y, tX, lambdas, initial_w, gamma, max_iters)

In [37]:
loss_rlog

15438.02691224187

# Make submission

In [38]:
DATA_TEST_PATH = '../data/test.csv'
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [39]:
tX_test = reset_invalid_val(tX_test)

In [40]:
OUTPUT_PATH = '../data/submission.csv'
y_pred = predict_labels(w_ridge, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)