In [1]:
# Setup and imports

import os
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
%load_ext autoreload
%autoreload 2

import proj1_helpers

from mlcomp.config import DATA_PATH
from mlcomp.data import load_csv_data, create_csv_submission
from mlcomp.helpers import split_data, compute_rmse
from mlcomp.performance import correctness
from mlcomp.feature_eng import build_advanced_poly

# Location of the training CSV: the file 'train.csv' inside the project's DATA_PATH directory.
TRAIN_PATH = os.path.join(DATA_PATH, 'train.csv')

In [2]:
# Read the full training file (sub_sample=False). The three returned values are
# presumably the labels, the feature matrix and the row ids -- confirm against
# mlcomp.data.load_csv_data.
yb, input_data, ids = load_csv_data(TRAIN_PATH, sub_sample=False)
# Make the labels an explicit (N, 1) column so they line up with the (D, 1)
# weight columns created by the training functions below.
# NOTE(review): logistic_regression_loss below is the {0, 1}-label form of the
# negative log-likelihood. If load_csv_data encodes the classes as {-1, +1}
# the loss is unbounded below -- confirm the label encoding.
yb = yb.reshape((yb.shape[0], 1))

In [3]:
input_data.shape

(250000, 30)

## Logistic Regression

In [4]:
def sigmoid(t):
    """Apply the logistic sigmoid 1 / (1 + exp(-t)) element-wise on t.

    Parameters
    ----------
    t : scalar or array-like
        Input value(s).

    Returns
    -------
    Scalar for scalar input, otherwise an ndarray with the shape of ``t``,
    holding values in [0, 1].

    The naive formula evaluates ``exp(-t)``, which overflows for strongly
    negative ``t``. Here the exponential is only ever taken of a non-positive
    number, so it stays in [0, 1] and cannot overflow.
    """
    t = np.asarray(t)
    z = np.exp(-np.abs(t))  # always in [0, 1]
    # t >= 0: 1 / (1 + exp(-t));  t < 0: exp(t) / (1 + exp(t)) -- same function.
    out = np.where(t >= 0, 1 / (1 + z), z / (1 + z))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # n-d arrays untouched.
    return out[()]

In [5]:
def logistic_regression_loss(y, tx, w):
    """Compute the cost by negative log likelihood.

    Evaluates ``sum_n log(1 + exp(tx[n] . w)) - y[n] * (tx[n] . w)``.

    Parameters
    ----------
    y : array-like, shape (N,) or (N, 1)
        Targets, one per row of ``tx``.
    tx : ndarray, shape (N, D)
        Feature matrix.
    w : ndarray, shape (D,) or (D, 1)
        Weights.

    Returns
    -------
    The summed loss. It is a scalar when both ``y`` and ``w`` are 1-D and an
    array of shape (1,) when either of them is a column, which is the same
    shape the row-by-row accumulation produced.

    All rows are processed in one vectorised pass, and ``np.logaddexp(0, z)``
    is used for ``log(1 + exp(z))`` so large scores do not overflow to inf.
    """
    N = tx.shape[0]
    if N == 0:
        # Empty sum, same value the accumulation loop starts from.
        return 0

    # Put scores and targets in (N, k) layout so they multiply row-for-row
    # whatever mix of 1-D / column inputs was passed.
    xw = np.dot(tx, w).reshape(N, -1)
    targets = np.reshape(y, (N, -1))
    loss = np.sum(np.logaddexp(0, xw) - targets * xw, axis=0)

    if np.ndim(w) <= 1 and np.ndim(y) <= 1:
        return loss[0]
    return loss

In [6]:
def logistic_regression_gradient(y, tx, w):
    """Gradient of the logistic loss with respect to w.

    Computed as ``tx^T (sigmoid(tx w) - y)``. The subtraction is a plain
    element-wise one, so ``y`` has to have the same layout as ``tx w``
    (e.g. both (N, 1) columns).
    """
    residual = sigmoid(tx.dot(w)) - y
    return tx.T.dot(residual)

In [7]:
def sigmoid_diff(x):
    """Derivative of the sigmoid at x: sigmoid(x) * (1 - sigmoid(x))."""
    s = sigmoid(x)
    return s * (1 - s)

def logistic_regression_hessian(y, tx, w):
    """Return the hessian of the loss function.

    Computes ``tx^T (S * tx)`` where ``S = sigmoid_diff(tx w)`` holds one
    curvature weight per sample. ``y`` is accepted for signature symmetry
    with the loss and gradient but is not used.

    The row scaling relies on broadcasting ``S`` against ``tx``, so ``w``
    should be a (D, 1) column (making ``S`` an (N, 1) column).
    """
    curvature = sigmoid_diff(tx.dot(w))
    scaled_rows = curvature * tx
    return tx.T.dot(scaled_rows)

In [8]:
def newton_step(y, tx, w, gamma):
    """
    Do one step on Newton's method.

    Parameters
    ----------
    y : targets, passed through to the loss / gradient / hessian helpers.
    tx : feature matrix, shape (N, D).
    w : current weights.
    gamma : step size multiplying the Newton direction.

    Returns
    -------
    (loss, w) : the loss evaluated at the *incoming* weights (before the
    update) and the updated weights ``w - gamma * H^{-1} grad``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the hessian is singular.
    """
    loss = logistic_regression_loss(y, tx, w)
    grad = logistic_regression_gradient(y, tx, w)
    hess = logistic_regression_hessian(y, tx, w)

    # Solve H d = grad for the Newton direction directly. Building the full
    # inverse first (solving against the identity) and then multiplying does
    # D times the work and is less accurate for ill-conditioned hessians.
    direction = np.linalg.solve(hess, grad)
    w = w - gamma * direction

    return loss, w

In [14]:
def logistic_regression(y, tx, max_iter=100, threshold=0.0001, gamma=0.01):
    """Fit logistic-regression weights with damped Newton steps.

    Parameters
    ----------
    y : targets, passed through to ``newton_step``.
    tx : ndarray, shape (N, D)
        Feature matrix; used as given (no bias column is added here).
    max_iter : int, optional
        Maximum number of Newton steps.
    threshold : float, optional
        Stop once two consecutive losses differ by less than this.
    gamma : float, optional
        Initial step size.

    Returns
    -------
    w : ndarray, shape (D, 1)
        Weights after the last step (started from zeros).

    Notes
    -----
    Once at least two losses have been recorded, the step size is overridden
    from the table below according to the latest loss. If the loss is above
    every bound, gamma keeps whatever value it had -- it is never reset to
    the initial value. The loss is printed at every iteration.
    """
    # (upper bound on the latest loss, step size to use); the first matching
    # row wins, so rows are ordered from the tightest bound to the loosest.
    gamma_schedule = (
        (1, 0.00000001),
        (15, 0.0000001),
        (200, 0.00001),
        (5000, 0.0001),
        (20000, 0.001),
    )

    losses = []
    w = np.zeros((tx.shape[1], 1))

    # start the logistic regression
    for step in range(max_iter):
        if len(losses) > 1:
            for bound, scheduled_gamma in gamma_schedule:
                if losses[-1] < bound:
                    gamma = scheduled_gamma
                    break

        # get loss and update w.
        loss, w = newton_step(y, tx, w, gamma)
        # log info
        print("Current iteration={i}, the loss={l}".format(i=step, l=loss))
        # converge criterion
        losses.append(loss)
        if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
            break

    return w

In [10]:
def logistic_regression_gradient_descent_step(y, tx, w, gamma):
    """
    Perform a single gradient-descent update for logistic regression.
    Return the loss (evaluated at the incoming w) and the updated w.
    """
    current_loss = logistic_regression_loss(y, tx, w)
    update = gamma * logistic_regression_gradient(y, tx, w)
    return current_loss, w - update

In [11]:
def logistic_regression_gradient_descent(y, x, max_iter=10000, threshold=1e-8, gamma=0.01):
    """Fit logistic-regression weights with plain gradient descent.

    Parameters
    ----------
    y : ndarray, shape (N, 1)
        Targets; ``y.shape[0]`` sets the number of rows of the bias column.
    x : ndarray, shape (N, D)
        Raw features. A leading column of ones is prepended here, so the
        returned weights have D + 1 rows.
    max_iter : int, optional
        Maximum number of gradient steps.
    threshold : float, optional
        Stop once two consecutive losses differ by less than this.
    gamma : float, optional
        Constant step size.

    Returns
    -------
    w : ndarray, shape (D + 1, 1)
        Weights after the last step (started from zeros); row 0 is the bias.

    The loss is printed every 10th iteration.
    """
    losses = []

    # build tx
    tx = np.c_[np.ones((y.shape[0], 1)), x]
    w = np.zeros((tx.shape[1], 1))

    # start the logistic regression
    for step in range(max_iter):
        # get loss and update w.
        loss, w = logistic_regression_gradient_descent_step(y, tx, w, gamma)
        # log info
        if step % 10 == 0:
            print("Current iteration={i}, loss={l}".format(i=step, l=loss))
        # converge criterion
        losses.append(loss)
        if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
            break

    return w

## Testing on data

In [12]:
# Earlier, larger column selection kept for reference:
#important_cols = [0, 1, 2, 7, 10, 11, 13, 19]
# Column indices of input_data handed to build_advanced_poly together with `degree`.
# Presumably these are the columns that get expanded into polynomial terms up
# to that degree -- confirm against mlcomp.feature_eng.build_advanced_poly.
important_cols = [0, 1, 2, 10, 11, 13]
degree = 3

tx = build_advanced_poly(input_data, degree, important_cols)
# Alternative design matrix: the raw features with a leading column of ones.
#tx = np.c_[np.ones((input_data.shape[0], 1)), input_data]

# Split features and labels into train / test parts with a fixed seed.
# ratio=0.9 presumably means 90% of the rows go to training -- confirm
# against mlcomp.helpers.split_data.
x_train, x_test, y_train, y_test = split_data(tx, yb, ratio=0.9, seed=123)

In [15]:
# Fit the weights on the training split with the damped-Newton routine
# defined above; it prints the loss at every iteration.
weights = logistic_regression(y_train, x_train)

Current iteration=0, the loss=[ 155958.11562618]
Current iteration=1, the loss=[ 147764.91582331]
Current iteration=2, the loss=[ 139728.40216235]
Current iteration=3, the loss=[ 131833.3326417]
Current iteration=4, the loss=[ 124065.45170514]
Current iteration=5, the loss=[ 116411.29955439]
Current iteration=6, the loss=[ 108858.0447108]
Current iteration=7, the loss=[ 101393.34246607]
Current iteration=8, the loss=[ 94005.23708427]
Current iteration=9, the loss=[ 86682.1489092]
Current iteration=10, the loss=[ 79412.98473542]
Current iteration=11, the loss=[ 72187.24477356]
Current iteration=12, the loss=[ 64994.78643889]
Current iteration=13, the loss=[ 57825.36075265]
Current iteration=14, the loss=[ 50668.42096854]
Current iteration=15, the loss=[ 43513.11888416]
Current iteration=16, the loss=[ 36348.2143641]
Current iteration=17, the loss=[ 29161.88546131]
Current iteration=18, the loss=[ 21941.52324619]
Current iteration=19, the loss=[ 14673.53836265]
Current iteration=20, the 

In [16]:
# NOTE(review): this import sits outside the setup cell, which already does
# `import proj1_helpers`; consider moving it there.
from proj1_helpers import predict_labels

# Predict labels with found weights and print some useful information about quality of fit
# NOTE(review): predictions are made on the full `tx` and compared with the
# full `yb`, i.e. on the rows used for training plus the held-out rows, so
# the reported percentage is not a held-out score. x_test / y_test are not
# used in the cells shown here.
y_pred = predict_labels(weights, tx)
correctness(yb, y_pred)

Total correct: 172182 
Total incorrect: 77818 
Correct percentage: 68.8728 %


In [None]:
# Write the predictions to "test_output.csv".
# NOTE(review): `ids` and `y_pred` both come from train.csv here; an actual
# submission presumably needs the ids and predictions of the test set -- confirm.
create_csv_submission(ids, y_pred, "test_output.csv")