# Required ML methods

1. *** least_squares_GD(y, tx, initial_w, gamma, max_iters) ***
Linear regression using gradient descent 

2. *** least_squares_SGD(y, tx, initial_w, gamma, max_iters) ***
Linear regression using stochastic gradient descent 

3. *** least_squares(y, tx) ***
Least squares regression using normal equations

4. *** ridge_regression(y, tx, lambda_) ***
Ridge regression using normal equations

5. *** logistic_regression(y, tx, initial_w, gamma, max_iters) ***
Logistic regression using gradient descent or SGD 

6. *** reg_logistic_regression(y, tx, lambda_, initial_w, gamma, max_iters) ***
Regularized  logistic  regression  using  gradient  descent or SGD

In [3]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from functools import partial
import math
%load_ext autoreload
%autoreload 2

In [43]:
from proj1_helpers import *
from helpers import *

# Relative path to the training CSV.
DATA_TRAIN_PATH = '../data/train.csv' 
# load_csv_data is not defined in this notebook (presumably it comes from one
# of the helper modules star-imported above). Presumably y holds the labels,
# tX the feature matrix and ids the row identifiers — TODO confirm.
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

# Feature processing

We read the column names from the data file and record the position of each column.

In [55]:
# Read the whole CSV as strings; with skip_header=0 the header line is kept
# as row 0 of the result.
# NOTE(review): only row 0 is used in this cell, so parsing every row may be
# avoidable — check whether str_x is needed elsewhere before changing it.
str_x = np.genfromtxt(DATA_TRAIN_PATH, delimiter=",", skip_header=0, dtype=str)
# Feature names are the header entries from the third column onwards (the
# first two columns are skipped — presumably id and label; verify in the CSV).
columns = str_x[0, 2:]
columns

array(['DER_mass_MMC', 'DER_mass_transverse_met_lep', 'DER_mass_vis',
       'DER_pt_h', 'DER_deltaeta_jet_jet', 'DER_mass_jet_jet',
       'DER_prodeta_jet_jet', 'DER_deltar_tau_lep', 'DER_pt_tot',
       'DER_sum_pt', 'DER_pt_ratio_lep_tau', 'DER_met_phi_centrality',
       'DER_lep_eta_centrality', 'PRI_tau_pt', 'PRI_tau_eta',
       'PRI_tau_phi', 'PRI_lep_pt', 'PRI_lep_eta', 'PRI_lep_phi',
       'PRI_met', 'PRI_met_phi', 'PRI_met_sumet', 'PRI_jet_num',
       'PRI_jet_leading_pt', 'PRI_jet_leading_eta', 'PRI_jet_leading_phi',
       'PRI_jet_subleading_pt', 'PRI_jet_subleading_eta',
       'PRI_jet_subleading_phi', 'PRI_jet_all_pt'], 
      dtype='<U27')

In [56]:
# Lookup table from feature name to its position within `columns`.
col_idx = {name: position for position, name in enumerate(columns)}
col_idx

{'DER_deltaeta_jet_jet': 4,
 'DER_deltar_tau_lep': 7,
 'DER_lep_eta_centrality': 12,
 'DER_mass_MMC': 0,
 'DER_mass_jet_jet': 5,
 'DER_mass_transverse_met_lep': 1,
 'DER_mass_vis': 2,
 'DER_met_phi_centrality': 11,
 'DER_prodeta_jet_jet': 6,
 'DER_pt_h': 3,
 'DER_pt_ratio_lep_tau': 10,
 'DER_pt_tot': 8,
 'DER_sum_pt': 9,
 'PRI_jet_all_pt': 29,
 'PRI_jet_leading_eta': 24,
 'PRI_jet_leading_phi': 25,
 'PRI_jet_leading_pt': 23,
 'PRI_jet_num': 22,
 'PRI_jet_subleading_eta': 27,
 'PRI_jet_subleading_phi': 28,
 'PRI_jet_subleading_pt': 26,
 'PRI_lep_eta': 17,
 'PRI_lep_phi': 18,
 'PRI_lep_pt': 16,
 'PRI_met': 19,
 'PRI_met_phi': 20,
 'PRI_met_sumet': 21,
 'PRI_tau_eta': 14,
 'PRI_tau_phi': 15,
 'PRI_tau_pt': 13}

## Clean data

We throw away the rows whose DER_mass_MMC is undefined (i.e. equal to the placeholder value -999).

In [76]:
# Keep only the rows whose DER_mass_MMC is not the -999 placeholder.
X = tX  # plain assignment: X and tX are the same object, not a copy
valid_mass_idx = np.where(tX[:, col_idx['DER_mass_MMC']] != -999)[0]
# Selecting rows with an index array (assuming tX is a NumPy array) yields a
# new array, so later edits to valid_mass_dat do not touch tX.
valid_mass_dat = X[valid_mass_idx, :]

In [77]:
valid_mass_dat.shape

(211886, 30)

In [78]:
# replace other -999 with 0
valid_dat = valid_mass_dat
for i in range(valid_mass_dat.shape[1]):
    idx = np.where(valid_dat[:, i] == -999)[0]
    valid_dat[idx, i] = 0

In [54]:
# delete features : 
# variables prefixed with DER (for DERived) are quantities computed from the primitive features, which were selected by  the physicists of ATLAS
# throw out DER_... except DER_mass_MMC

In [None]:
# add polynomial features

# General gradient descent

In [3]:
# general gradient descent
def gradient_descent(y, tx, initial_x, gamma, max_iters,
                     compute_gradient, compute_loss):
    """Run plain gradient descent and return the final weights and last loss.

    Args:
        y: targets, forwarded unchanged to both callbacks.
        tx: feature matrix, forwarded unchanged to both callbacks.
        initial_x: starting weight vector. The parameter name is kept as it
            was so that existing callers are unaffected.
        gamma: step size applied to the gradient.
        max_iters: number of update steps to perform.
        compute_gradient: callable ``(y, tx, w) -> gradient``.
        compute_loss: callable ``(y, tx, w) -> loss``.

    Returns:
        A tuple ``(w, loss)``. ``w`` is the weight vector after ``max_iters``
        updates. ``loss`` is the loss evaluated at the weights used for the
        last update step (i.e. just before that step is applied). When
        ``max_iters`` is 0 no step is taken and the loss of the initial
        weights is returned.
    """
    # Start from the argument that was actually passed in. Previously this
    # read a global called `initial_w`, silently ignoring the parameter.
    w = initial_x
    loss = None

    for _ in range(max_iters):
        gradient = compute_gradient(y, tx, w)
        loss = compute_loss(y, tx, w)
        w = w - gamma * gradient

    if loss is None:
        # No iteration ran: report the loss of the untouched starting point
        # instead of failing on an empty history.
        loss = compute_loss(y, tx, w)

    return w, loss

# General stochastic gradient descent

In [4]:
def stochastic_gradient_descent(y, tx, initial_w, batch_size, gamma, max_iters, seed,
                                compute_gradient, compute_loss):
    """Run mini-batch stochastic gradient descent.

    Each step draws the next ``(batch_y, batch_tx)`` pair from ``batch_iter``,
    computes the gradient on that batch only, and evaluates the loss on the
    full ``y`` / ``tx``.

    Args:
        y: targets.
        tx: feature matrix.
        initial_w: starting weight vector.
        batch_size: number of samples per mini-batch.
        gamma: step size applied to the gradient.
        max_iters: number of update steps to perform.
        seed: forwarded to ``batch_iter``.
        compute_gradient: callable ``(y, tx, w) -> gradient``.
        compute_loss: callable ``(y, tx, w) -> loss``.

    Returns:
        A tuple ``(w, loss)``. ``w`` is the weight vector after ``max_iters``
        updates. ``loss`` is the full-data loss evaluated at the weights used
        for the last update step. When ``max_iters`` is 0 the loss of the
        initial weights is returned.
    """
    w = initial_w

    num_batches = math.floor(y.shape[0] / batch_size)
    # batch_iter is defined outside this block.
    # NOTE(review): if it yields only `num_batches` items, `next` below will
    # raise StopIteration once max_iters exceeds that count — confirm.
    batches = batch_iter(y, tx, seed, batch_size, num_batches)

    loss = None
    for _ in range(max_iters):
        batch_y, batch_tx = next(batches)
        gradient = compute_gradient(batch_y, batch_tx, w)
        loss = compute_loss(y, tx, w)
        w = w - gamma * gradient

    if loss is None:
        # No iteration ran: report the loss of the starting point instead of
        # failing on an empty history.
        loss = compute_loss(y, tx, w)

    return w, loss

## Linear regression using gradient descent 

In [5]:
# TODO: perhaps import it instead of copy?
def calculate_mse(e):
    """Return half of the mean of the squared entries of the error vector e."""
    squared_errors = e ** 2
    return np.mean(squared_errors) / 2

def calculate_mae(e):
    """Return the mean of the absolute entries of the error vector e."""
    absolute_errors = np.absolute(e)
    return np.mean(absolute_errors)

def compute_loss_mse(y, tx, w):
    """Return the MSE loss of the linear prediction tx.w against y.

    The residual y - tx.w is handed to calculate_mse, which defines the
    exact scaling of the loss.
    """
    predictions = np.dot(tx, w)
    residual = y - predictions
    return calculate_mse(residual)

def compute_gradient_mse(y, tx, w):
    """Return the gradient of the MSE cost with respect to w.

    Computed as -(1/N) * tx^T (y - tx w), where N is the length of y.
    """
    residual = y - np.matmul(tx, w)
    scale = -1 / y.shape[0]
    return scale * np.matmul(tx.T, residual)

In [6]:
def least_squares_GD(y, tx, initial_w, gamma, max_iters):
    """Linear regression fitted by gradient descent on the MSE loss.

    Delegates to gradient_descent with the MSE gradient and MSE loss and
    returns its result unchanged.
    """
    result = gradient_descent(
        y, tx, initial_w, gamma, max_iters,
        compute_gradient_mse, compute_loss_mse,
    )
    return result

In [7]:
# Settings for the gradient-descent run: 100 update steps, step size 1e-7.
max_iters = 100
gamma = 1e-7
# Start from an all-zero weight vector with one entry per column of tX.
initial_w = np.zeros(tX.shape[1])

# NOTE(review): the fit uses the raw tX, not the cleaned valid_dat built in
# the "Clean data" cells — confirm this is intended.
w_linear_gd, loss_linear_gd = least_squares_GD(y, tX, initial_w, gamma, max_iters)

In [8]:
loss_linear_gd

0.41560782934140184

## Linear regression using stochastic gradient descent 

In [9]:
def least_squares_SGD(y, tx, initial_w, gamma, max_iters):
    """Linear regression fitted by stochastic gradient descent on the MSE loss.

    The mini-batch size is fixed to half the number of samples (integer
    division) and the seed is fixed to 3; both are forwarded to
    stochastic_gradient_descent, whose result is returned unchanged.
    """
    half_of_samples = y.shape[0] // 2
    fixed_seed = 3

    return stochastic_gradient_descent(
        y, tx, initial_w, half_of_samples, gamma, max_iters,
        fixed_seed, compute_gradient_mse, compute_loss_mse,
    )

In [10]:
# Settings for the SGD run: 100 update steps, step size 1e-8.
max_iters = 100
gamma = 1e-8
# Start from an all-zero weight vector with one entry per column of tX.
initial_w = np.zeros(tX.shape[1])

# Fit on y and the raw tX; the batch size and seed are fixed inside least_squares_SGD.
w_linear_sgd, loss_linear_sgd = least_squares_SGD(y, tX, initial_w, gamma, max_iters)

In [11]:
loss_linear_sgd

0.43947820815603295

## Least squares regression using normal equations

In [12]:
def least_squares(y, tx):
    """Least-squares regression via the normal equations.

    Solves (tx^T tx) w = tx^T y for w and evaluates the MSE loss at that w.

    Args:
        y: targets.
        tx: feature matrix.

    Returns:
        A tuple ``(w, loss)`` with the fitted weights and their MSE loss as
        computed by compute_loss_mse.

    Raises:
        numpy.linalg.LinAlgError: if tx^T tx is singular.
    """
    gram = tx.T @ tx
    # Solve the linear system directly rather than forming the inverse:
    # same solution, but cheaper and numerically more accurate.
    w = np.linalg.solve(gram, tx.T @ y)
    loss = compute_loss_mse(y, tx, w)
    return w, loss

In [13]:
# Closed-form least-squares fit on y and the raw tX (no hyper-parameters).
w_ls, loss_ls = least_squares(y, tX)

In [14]:
loss_ls

0.33968680990826089

## Ridge regression using normal equations

In [15]:
def ridge_regression(y, tx, lambda_):
    """Ridge regression via the regularized normal equations.

    Solves (tx^T tx + lambda_ * I) w = tx^T y for w.

    Args:
        y: targets.
        tx: feature matrix.
        lambda_: regularization strength added to the diagonal of tx^T tx.

    Returns:
        A tuple ``(w, loss)`` with the fitted weights and their MSE loss as
        computed by compute_loss_mse (the penalty term is not included in
        the reported loss).

    Raises:
        numpy.linalg.LinAlgError: if the regularized matrix is singular.
    """
    regularized_gram = tx.T @ tx + lambda_ * np.identity(tx.shape[1])
    # Solve the linear system directly rather than forming the inverse:
    # same solution, but cheaper and numerically more accurate.
    w = np.linalg.solve(regularized_gram, tx.T @ y)
    loss = compute_loss_mse(y, tx, w)
    return w, loss

In [16]:
# Regularization strength handed to ridge_regression as lambda_.
lamb = 23
# Closed-form ridge fit on y and the raw tX.
w_ridge, loss_ridge = ridge_regression(y, tX, lamb)

In [17]:
loss_ridge

0.33968793894774219

## Logistic regression using gradient descent

In [18]:
def sigmoid(t):
    """Apply the logistic sigmoid 1 / (1 + exp(-t)) element-wise to t.

    Uses the identity sigmoid(t) = exp(-log(1 + exp(-t))) with
    np.logaddexp(0, -t) for the inner logarithm, so that large negative
    inputs do not overflow inside exp.

    Args:
        t: scalar or NumPy array.

    Returns:
        The sigmoid of t, with the same shape as t.
    """
    return np.exp(-np.logaddexp(0, -t))

In [19]:
def compute_loss_log(y, tx, w):
    """Compute the logistic-regression cost (negative log-likelihood).

    Evaluates sum_i [ log(1 + exp(x_i . w)) - y_i * (x_i . w) ]. The log
    term is computed with np.logaddexp(0, xw), which stays finite for large
    xw where exp(xw) would overflow to inf.

    Args:
        y: targets (this formula presumes labels in {0, 1} — TODO confirm
            against the data loader).
        tx: feature matrix.
        w: weight vector.

    Returns:
        The summed loss over all samples.
    """
    xw = tx @ w
    return np.sum(np.logaddexp(0, xw) - y * xw)

In [20]:
def compute_gradient_log(y, tx, w):
    """Compute the gradient of the logistic-regression cost.

    Evaluates tx^T (sigmoid(tx w) - y).

    Args:
        y: targets.
        tx: feature matrix.
        w: weight vector.

    Returns:
        The gradient with respect to w.
    """
    xw = tx @ w
    # sigmoid already works element-wise on arrays, so it is applied
    # directly instead of through np.apply_along_axis.
    return tx.T @ (sigmoid(xw) - y)

In [21]:
## using gradient descent
def logistic_regression(y, tx, initial_w, gamma, max_iters):
    """Logistic regression fitted by gradient descent.

    Delegates to gradient_descent with the logistic gradient and logistic
    loss and returns its result unchanged.
    """
    result = gradient_descent(
        y, tx, initial_w, gamma, max_iters,
        compute_gradient_log, compute_loss_log,
    )
    return result

In [22]:
# Settings for the logistic-regression run: 100 update steps, step size 1e-14.
max_iters = 100
gamma = 1e-14
# Start from an all-zero weight vector with one entry per column of tX.
initial_w = np.zeros(tX.shape[1])

# Fit on y and the raw tX.
w_log, loss_log = logistic_regression(y, tX, initial_w, gamma, max_iters)

In [23]:
loss_log

23641.840065980567

## Regularized  logistic  regression  using  gradient  descent

In [24]:
def compute_loss_rlog(y, tx, w, lambda_):
    """Return the logistic loss plus the L2 penalty lambda_ * (w . w)."""
    data_term = compute_loss_log(y, tx, w)
    penalty = (lambda_ * w.T) @ w
    return data_term + penalty

In [25]:
def compute_gradient_rlog(y, tx, w, lambda_):
    """Return the logistic gradient plus the L2 penalty gradient 2 * lambda_ * w."""
    data_gradient = compute_gradient_log(y, tx, w)
    penalty_gradient = 2 * lambda_ * w
    return data_gradient + penalty_gradient

In [26]:
def reg_logistic_regression(y, tx, lambda_, initial_w, gamma, max_iters):
    """Regularized logistic regression fitted by gradient descent.

    Args:
        y: targets.
        tx: feature matrix.
        lambda_: L2 regularization strength.
        initial_w: starting weight vector.
        gamma: step size.
        max_iters: number of update steps.

    Returns:
        Whatever gradient_descent returns for the regularized gradient and
        regularized loss.
    """
    # Bind lambda_ so both callbacks have the (y, tx, w) call signature that
    # gradient_descent uses. The previous code wrapped an unassigned local
    # name and referenced a loss function that does not exist, so the
    # function could not run at all.
    regularized_gradient = partial(compute_gradient_rlog, lambda_=lambda_)
    regularized_loss = partial(compute_loss_rlog, lambda_=lambda_)
    return gradient_descent(y, tx, initial_w, gamma, max_iters,
                            regularized_gradient, regularized_loss)

In [27]:
# Settings for this run: 100 update steps, step size 1e-14.
max_iters = 100
gamma = 1e-14
# Start from an all-zero weight vector with one entry per column of tX.
initial_w = np.zeros(tX.shape[1])

# NOTE(review): this calls logistic_regression, not reg_logistic_regression,
# and passes no lambda_, so w_rlog / loss_rlog hold the result of the
# unregularized fit. Confirm and switch to reg_logistic_regression with an
# explicit lambda_ once that function is known to work.
w_rlog, loss_rlog = logistic_regression(y, tX, initial_w, gamma, max_iters)

In [28]:
loss_rlog

23641.840065980567