# Machine Learning - Project 1

In [1]:
# Setup and imports
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

import proj1_helpers

## Implementations of ML Methods

### Test data:

In [148]:
# Tiny hand-checkable fixture: 4 samples, 3 features, all-zero integer weights.
y = np.arange(1, 5)                  # [1, 2, 3, 4]
tx = np.arange(1, 13).reshape(4, 3)  # rows [1, 2, 3] ... [10, 11, 12]
w = np.zeros(3, dtype=int)           # [0, 0, 0]

### Helper functions:

In [3]:
def compute_loss_mse(y, tx, w):
    """Return the halved mean squared error of the linear model.

    With residual ``e = y - tx.w`` and ``N = len(y)`` the value returned is
    ``(e . e) / (2 * N)``; note the extra factor 1/2 in the result.
    """
    residual = y - np.dot(tx, w)
    n_samples = len(y)
    squared_norm = np.dot(residual, residual)
    return squared_norm / (2 * n_samples)

In [4]:
def compute_loss_mae(y, tx, w):
    """Return the mean absolute error of the linear model.

    Computed as ``sum(|y - tx.w|) / len(y)``.
    """
    residual = y - np.dot(tx, w)
    total_abs_error = np.sum(np.absolute(residual))
    return total_abs_error / len(y)

In [5]:
def compute_rmse(y, tx, w):
    """Return the root mean square error of the linear model.

    compute_loss_mse returns the squared error divided by 2N, so the value
    is doubled before the square root is taken.
    """
    halved_mse = compute_loss_mse(y, tx, w)
    return np.sqrt(2 * halved_mse)

In [6]:
def compute_gradient_mse(y, tx, w):
    """Compute the MSE gradient."""
    N = len(y)
    e = y - np.dot(tx, w)
    return (-1/N) * np.dot(np.transpose(tx), e)

In [7]:
def compute_stochastic_subgradient_mae(y, tx, w):
    """Return a subgradient of the MAE loss computed on the given samples.

    Intended to be called with a small batch (a few rows of ``tx`` and the
    matching entries of ``y``), which makes the result a stochastic
    subgradient of the loss over the whole data set.

    The result is ``-(1 / N) * tx^T . sign(y - tx.w)`` with ``N = len(y)``.
    """
    n_samples = len(y)
    residual = y - np.dot(tx, w)
    # sign(e) is a valid subgradient of |e|; np.sign returns 0 at e == 0,
    # which lies inside the subdifferential [-1, 1] at that point.
    residual_sign = np.sign(residual)
    return (-1 / n_samples) * np.dot(np.transpose(tx), residual_sign)

In [8]:
def batch_iter(y, tx, batch_size, num_batches=1, shuffle=True):
    """
    Generate a minibatch iterator for a dataset.

    Takes two index-aligned arrays (the target values `y` and the input data
    `tx`) and yields up to `num_batches` pairs `(batch_y, batch_tx)` holding
    at most `batch_size` matching elements each.

    When `shuffle` is true the data is permuted once (using the global NumPy
    random state) before batches are cut, so ordering in the original data
    does not bias the minibatches. Batches are consecutive, non-overlapping
    slices of that (possibly shuffled) data; once the data is exhausted no
    further batches are produced, so fewer than `num_batches` may be yielded.

    Example of use :
    for minibatch_y, minibatch_tx in batch_iter(y, tx, 32):
        <DO-SOMETHING>
    """
    data_size = len(y)

    if shuffle:
        shuffle_indices = np.random.permutation(np.arange(data_size))
        shuffled_y = y[shuffle_indices]
        shuffled_tx = tx[shuffle_indices]
    else:
        shuffled_y = y
        shuffled_tx = tx

    for batch_num in range(num_batches):
        start_index = batch_num * batch_size
        end_index = min((batch_num + 1) * batch_size, data_size)
        # Strict '<' (not '!='): once start_index runs past the end of the
        # data, end_index is clamped below it and the slice would be empty.
        if start_index < end_index:
            yield shuffled_y[start_index:end_index], shuffled_tx[start_index:end_index]

In [9]:
def split_data(x, y, ratio, seed=1):
    """
    Randomly split a dataset into a training part and a testing part.

    `ratio` is the fraction of rows that goes to training: with ratio 0.8,
    80% of the rows (rounded down) are used for training and the remainder
    for testing. The global NumPy random state is re-seeded with `seed`, so
    the same seed always produces the same split.

    Returns x_train, x_test, y_train, y_test.
    """
    np.random.seed(seed)  # makes the permutation below reproducible
    n_samples = x.shape[0]
    order = np.random.permutation(n_samples)
    cut = int(ratio * n_samples)
    train_part = order[:cut]
    test_part = order[cut:]

    return x[train_part], x[test_part], y[train_part], y[test_part]

### Functions to implement for project 1 submission:

In [10]:
"""Linear regression using gradient descent"""
def least_squares_GD(y, tx, initial_w, max_iters, gamma):
    """Linear regression by gradient descent on the MSE loss.

    Starting from `initial_w`, performs `max_iters` updates of the form
    ``w <- w - gamma * grad`` where ``grad`` is the MSE gradient evaluated
    on the full data set.

    Returns a tuple ``(w, rmse)``: the final weights and the root mean
    square error of those weights on ``(y, tx)``.
    """
    w = initial_w
    for _ in range(max_iters):
        # The loss value is not needed for the update itself, so only the
        # gradient is evaluated inside the loop.
        w = w - gamma * compute_gradient_mse(y, tx, w)

    rmse = compute_rmse(y, tx, w)

    return w, rmse

In [11]:
"""Linear regression using stochastic gradient descent"""
def stochastic_subgradient_descent_mae(y, tx, initial_w, max_iters, gamma):
    """Stochastic subgradient descent on the MAE loss.

    Each of the `max_iters` iterations draws one random sample (batch size
    1), takes a step of size `gamma` along the negative MAE subgradient for
    that sample, then evaluates the MAE loss on the full data set and prints
    a progress line.

    Returns a tuple ``(w, loss)``: the final weights and the MAE loss of
    those weights on ``(y, tx)``. With ``max_iters == 0`` this is
    `initial_w` and its loss.
    """
    batch_size = 1
    w = initial_w
    # Evaluate the loss up front so that it is defined even when the loop
    # body never runs (max_iters == 0).
    loss = compute_loss_mae(y, tx, w)
    for n_iter in range(max_iters):
        for minibatch_y, minibatch_tx in batch_iter(y, tx, batch_size):
            g = compute_stochastic_subgradient_mae(minibatch_y, minibatch_tx, w)
            w = w - gamma * g
        loss = compute_loss_mae(y, tx, w)
        # The progress line reports the first two weights only, so w needs
        # at least two entries.
        print("Stochastic Subgradient Descent({bi}/{ti}): loss={l}, w0={w0}, w1={w1}".format(
              bi=n_iter, ti=max_iters - 1, l=loss, w0=w[0], w1=w[1]))

    return w, loss

In [12]:
def least_squares(y, tx):
    """Calculates the explicit least squares solution.

    Finds the weights minimising ``||y - tx.w||^2``. When `tx` does not have
    full column rank the minimiser is not unique; the minimum-norm solution
    (the one given by the pseudoinverse) is returned in that case.

    Returns a tuple ``(w, rmse)``: the optimal weights and the root mean
    square error of those weights on ``(y, tx)``.
    """
    # np.linalg.lstsq covers both the full-rank and the rank-deficient case:
    # singular values below the cutoff are treated as zero instead of being
    # inverted, and no N x N factor is materialised for tall matrices.
    w = np.linalg.lstsq(tx, y, rcond=None)[0]

    rmse = compute_rmse(y, tx, w)

    return w, rmse

In [154]:
def ridge_regression(y, tx, lambda_):
    """Ridge regression using normal equations.

    Solves ``(tx^T tx + 2 * N * lambda_ * I) w = tx^T y`` for ``w``, where
    ``N`` is the number of rows of `tx`. The ``2 * N`` factor is what the
    normal equations look like when the data term is the halved MSE
    (squared error divided by ``2 * N``) and the penalty is
    ``lambda_ * ||w||^2``.

    Returns a tuple ``(w, rmse)``: the weights and the root mean square
    error of those weights on ``(y, tx)`` (the penalty is not included in
    the reported error).

    Raises numpy.linalg.LinAlgError if the regularised Gram matrix is
    singular, which can only happen when ``lambda_`` is 0 (or negative).
    """
    n_samples, n_features = tx.shape

    gram = np.dot(tx.T, tx) + 2 * n_samples * lambda_ * np.identity(n_features)
    # Solve the linear system directly rather than forming the inverse:
    # cheaper and numerically better behaved.
    w = np.linalg.solve(gram, np.dot(tx.T, y))

    rmse = compute_rmse(y, tx, w)

    return w, rmse

In [14]:
"""Logistic regression using gradient descent or SGD"""
def logistic_regression(y, tx, initial_w, max_iters, gamma):
    """Placeholder for logistic regression; not implemented yet.

    Always raises NotImplementedError.
    """
    raise NotImplementedError

In [15]:
"""Regularized logistic regression using gradient descent or SGD"""
def reg_logistic_regression(y, tx, lambda_, initial_w, max_iters, gamma):
    """Placeholder for regularized logistic regression; not implemented yet.

    Always raises NotImplementedError.
    """
    raise NotImplementedError

## Higgs Data Analysis

In [16]:
# NOTE(review): wildcard import. load_csv_data (used here) and predict_labels /
# create_csv_submission (used in later cells) are not defined in this notebook,
# so they presumably come from proj1_helpers - confirm, then import them by name.
from proj1_helpers import *

# Training CSV, relative to the notebook's working directory.
data_path = "./data/train.csv"
# Three values come back; judging by the names they are the labels, the feature
# matrix and the sample ids - TODO confirm against proj1_helpers.load_csv_data.
yb, input_data, ids = load_csv_data(data_path, sub_sample=False)

# 90% of the rows go to training, the remaining 10% to testing; the fixed seed
# makes the split repeatable.
x_train, x_test, y_train, y_test = split_data(input_data, yb, ratio=0.9, seed=123456789)

In [18]:
def correctness(yb, y_pred):
    """Print how many predictions in `y_pred` equal the labels in `yb`.

    Entry ``i`` of `y_pred` is compared with entry ``i`` of `yb`; entries of
    `yb` beyond ``len(y_pred)`` are ignored. Prints the number of correct and
    incorrect predictions and the percentage of correct ones (reported as
    0.0 when `y_pred` is empty). Returns None.

    Raises IndexError if `yb` has fewer entries than `y_pred`.
    """
    total = len(y_pred)
    if len(yb) < total:
        raise IndexError("yb has fewer entries than y_pred")

    # Flatten both sides so that an (N, 1) column and an (N,) vector are
    # compared entry by entry instead of being broadcast against each other.
    predicted = np.ravel(y_pred)
    expected = np.ravel(yb)[:total]
    correct = int(np.count_nonzero(predicted == expected))

    incorrect = total - correct
    # Guard the division so an empty prediction vector does not raise.
    perc = correct / total * 100 if total else 0.0
    print("Total correct:", correct, "\nTotal incorrect:", incorrect, "\nCorrect percentage:", perc, "%")

In [None]:
# lambda_ = 0 switches the ridge penalty off, so this is an unregularised
# least-squares fit on the complete (unsplit) data set.
lambda_ = 0
weights, rmse = ridge_regression(yb, input_data, lambda_)

In [None]:
# Predictions are made on input_data, i.e. the same rows the previous cell
# fitted on, so the accuracy printed below is a training accuracy.
y_pred = predict_labels(weights, input_data)
# NOTE(review): presumably writes ids and predictions to a file named
# "test_output" - confirm against proj1_helpers.create_csv_submission.
create_csv_submission(ids, y_pred, "test_output")
correctness(yb, y_pred)

### Experimentation

In [19]:
"""
Builds polynomial basis function for input data matrix tx, for j=0 up to j=degree,
where the result will be a matrix of form [1, tx, tx^2, ..., tx^j]
"""

def build_simple_poly(tx, degree):
    """Expand `tx` with element-wise powers 1..degree plus a leading ones column.

    The result has the column layout ``[1, tx, tx^2, ..., tx^degree]`` where
    each power is taken element by element (no cross terms between
    features). With ``degree`` below 1 only the ones column is returned.
    """
    bias = np.ones((tx.shape[0], 1))
    powers = [np.power(tx, exponent) for exponent in range(1, degree + 1)]
    return np.column_stack([bias] + powers)

In [161]:
import itertools

def build_mult_combinations(tx, deg):
    """Return all products of `deg` columns of `tx`, chosen with replacement.

    For every combination with replacement of `deg` column indices of `tx`
    (in the order produced by itertools.combinations_with_replacement) the
    selected columns are multiplied element by element; each product becomes
    one column of the result. For example, with two columns ``a, b`` and
    ``deg == 2`` the result has the columns ``a*a, a*b, b*b``.

    `tx` must be two-dimensional. The result has one row per row of `tx`;
    it has zero columns when `tx` has no columns and ``deg >= 1``, and a
    single column of ones when ``deg == 0``.
    """
    N = tx.shape[0]
    columns = []
    for comb in itertools.combinations_with_replacement(range(tx.shape[1]), deg):
        # Start from ones (float) and multiply the selected columns in.
        product = np.ones(N)
        for idx in comb:
            product = product * tx[:, idx]
        columns.append(product)

    if not columns:
        # column_stack cannot build an array from an empty list.
        return np.empty((N, 0))
    # Assemble the matrix once instead of re-copying it for every column.
    return np.column_stack(columns)
        

"""
Builds polynomial basis function for input data matrix tx, for j=0 up to j=degree,
where the result will be a matrix of form 
[1, tx, comb_mult(tx, 2), ..., comb_mult(tx, j)]
"""
def build_advanced_poly(tx, degree):
    """Expand `tx` with all monomials of its columns up to total degree `degree`.

    The result starts with a column of ones, followed, for every
    j = 1..degree, by the columns returned by build_mult_combinations(tx, j),
    i.e. all products of j columns of `tx` chosen with replacement (cross
    terms between features included). With ``degree`` below 1 only the ones
    column is returned.
    """
    bias = np.ones((tx.shape[0], 1))
    blocks = [bias]
    for order in range(1, degree + 1):
        blocks.append(build_mult_combinations(tx, order))
    return np.column_stack(blocks)

In [172]:
# Apply the same degree-2 element-wise expansion to the full data set and to
# both splits, so one weight vector can be used with any of the three.
x_poly = build_simple_poly(input_data, degree=2)
x_train_poly = build_simple_poly(x_train, degree=2)
x_test_poly = build_simple_poly(x_test, degree=2)

# Fit on the training split only; lambda_=0 means no ridge penalty.
weights, rmse = ridge_regression(y_train, x_train_poly, lambda_=0)

In [173]:
# Accuracy is measured on x_poly, which is built from the complete input_data
# (training and test rows together), not on the held-out split alone.
y_pred = predict_labels(weights, x_poly)
correctness(yb, y_pred)

print("-----------------------------")

# RMSE of the same weights, reported separately for each split.
rmse_train = compute_rmse(y_train, x_train_poly, weights)
rmse_test = compute_rmse(y_test, x_test_poly, weights)
print("Train RMSE:", rmse_train, ", Test RMSE:", rmse_test)

Total correct: 192608 
Total incorrect: 57392 
Correct percentage: 77.0432 %
-----------------------------
Train RMSE: 0.797452588264 , Test RMSE: 0.796328693096
