In [1]:
# Setup and imports

import os
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
%load_ext autoreload
%autoreload 2

import proj1_helpers

from mlcomp.config import DATA_PATH
from mlcomp.data import load_csv_data, create_csv_submission
from mlcomp.helpers import split_data, compute_rmse
from mlcomp.performance import correctness
from mlcomp.feature_eng import build_advanced_poly

# Location of the training CSV inside the project's configured data directory.
TRAIN_PATH = os.path.join(DATA_PATH, 'train.csv')

In [2]:
# Load the full training set (sub_sample=False).
yb, input_data, ids = load_csv_data(TRAIN_PATH, sub_sample=False)
# Keep a copy of the labels in their original encoding, as a column vector.
yb_orig = yb.copy()
yb_orig = yb_orig.reshape((yb_orig.shape[0], 1))
# Re-encode label -1 as 0 and turn the labels into a column vector.
yb[yb == -1] = 0
yb = yb.reshape((yb.shape[0], 1))

In [3]:
# Inspect the re-encoded label column vector.
yb

array([[ 1.],
       [ 0.],
       [ 0.],
       ..., 
       [ 1.],
       [ 0.],
       [ 0.]])

## Logistic Regression

In [4]:
def sigmoid(t):
    """Apply the logistic sigmoid 1 / (1 + exp(-t)) element-wise to t.

    The value is computed from exp(-|t|), which never overflows, so very
    large positive or negative inputs saturate to 1.0 / 0.0 without
    emitting a floating-point overflow warning.

    Args:
        t: scalar or array-like of real numbers.

    Returns:
        A float scalar for scalar input, otherwise a float array with the
        same shape as t.
    """
    t = np.asarray(t, dtype=float)
    # exp(-|t|) always lies in (0, 1], so it cannot overflow.
    decay = np.exp(-np.abs(t))
    # t >= 0: 1 / (1 + e^-t);  t < 0: e^t / (1 + e^t)  (algebraically equal).
    result = np.where(t >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # np.where wraps scalar input into a 0-d array; unwrap it so scalar
    # callers still receive a scalar.
    return result[()] if result.ndim == 0 else result

In [5]:
def logistic_regression_loss(y, tx, w):
    """Compute the logistic-regression cost (negative log-likelihood).

    Evaluates sum_n [ log(1 + exp(x_n^T w)) - y_n * x_n^T w ] in one
    vectorized pass. log(1 + exp(z)) is computed with np.logaddexp so large
    scores do not overflow to inf.

    Args:
        y: labels, shape (N,) or (N, 1).
        tx: feature matrix, shape (N, D).
        w: weights, shape (D,) or (D, 1).

    Returns:
        The summed loss: an array of shape (1,) when either y or w is a
        column vector, a scalar when both are 1-D.
    """
    y = np.asarray(y)
    scores = np.dot(tx, w)
    # Bring y and the scores to the same rank so that y * scores pairs
    # sample n with sample n instead of broadcasting to an (N, N) table.
    if scores.ndim == 1 and y.ndim == 2:
        scores = scores[:, np.newaxis]
    elif scores.ndim == 2 and y.ndim == 1:
        y = y[:, np.newaxis]
    # logaddexp(0, z) == log(exp(0) + exp(z)) == log(1 + exp(z)), overflow-free.
    return np.sum(np.logaddexp(0, scores) - y * scores, axis=0)

In [6]:
def logistic_regression_gradient(y, tx, w):
    """Gradient of the logistic loss with respect to w.

    Computed as tx^T (sigmoid(tx w) - y).
    """
    predicted = sigmoid(np.dot(tx, w))
    residual = predicted - y
    return np.dot(tx.T, residual)

In [7]:
def sigmoid_diff(x):
    """Derivative of the sigmoid at x, i.e. sigmoid(x) * (1 - sigmoid(x))."""
    activation = sigmoid(x)
    return activation * (1 - activation)

def logistic_regression_hessian(y, tx, w):
    """Return the Hessian of the logistic loss, tx^T diag(S) tx.

    S holds sigmoid'(x_n^T w) for every sample n. Instead of building the
    N x N diagonal matrix, each row of tx is scaled by its S entry.

    Args:
        y: labels; unused, kept so the signature matches the loss and
            gradient functions.
        tx: feature matrix, shape (N, D).
        w: weights, shape (D,) or (D, 1).

    Returns:
        The (D, D) Hessian matrix.
    """
    S = sigmoid_diff(np.dot(tx, w))
    # With a 1-D w, S has shape (N,). Multiplying that directly with the
    # (N, D) matrix would broadcast over columns (an error, or silently wrong
    # when N == D), so make it a column that scales rows.
    if np.ndim(S) == 1 and np.ndim(tx) == 2:
        S = S[:, np.newaxis]
    return np.dot(tx.T, S * tx)

In [8]:
def newton_step(y, tx, w, gamma):
    """Do one damped step of Newton's method for logistic regression.

    Args:
        y: labels.
        tx: feature matrix.
        w: current weights.
        gamma: step size applied to the Newton direction.

    Returns:
        (loss, w): the loss evaluated at the incoming weights (before the
        update) and the updated weights.

    Raises:
        numpy.linalg.LinAlgError: if the Hessian is singular.
    """
    loss = logistic_regression_loss(y, tx, w)
    grad = logistic_regression_gradient(y, tx, w)
    hess = logistic_regression_hessian(y, tx, w)

    # Solve H d = grad for the Newton direction directly rather than forming
    # the explicit inverse of H first: cheaper and numerically better behaved.
    direction = np.linalg.solve(hess, grad)
    w = w - gamma * direction

    return loss, w

In [50]:
def logistic_regression(y, tx, max_iter=100, threshold=0.1, gamma=0.3):
    """Fit logistic regression with damped Newton steps.

    Starts from all-zero weights and repeats newton_step until the change in
    loss between two consecutive iterations drops below `threshold`, or
    `max_iter` iterations have run. The loss is printed every iteration.

    Args:
        y: labels.
        tx: feature matrix, shape (N, D).
        max_iter: maximum number of Newton iterations.
        threshold: stop once |loss_k - loss_{k-1}| is below this value.
        gamma: step size passed to newton_step.

    Returns:
        The fitted weights, shape (D, 1).
    """
    losses = []
    w = np.zeros((tx.shape[1], 1))

    for iteration in range(max_iter):
        # newton_step returns the loss at the current w together with the
        # updated w.
        loss, w = newton_step(y, tx, w, gamma)
        # log info
        print("Current iteration={i}, the loss={l}".format(i=iteration, l=loss))
        # convergence criterion: loss has stopped changing
        losses.append(loss)
        if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
            break

    return w

In [10]:
def logistic_regression_gradient_descent_step(y, tx, w, gamma):
    """
    Do one step of gradient descent for logistic regression.
    Return the loss at the incoming w and the updated w.
    """
    current_loss = logistic_regression_loss(y, tx, w)
    step = gamma * logistic_regression_gradient(y, tx, w)
    return current_loss, w - step

In [18]:
def logistic_regression_gradient_descent(y, tx, max_iter=10000, threshold=1e-8, gamma=0.01):
    """Fit logistic regression with plain gradient descent.

    Starts from all-zero weights and repeats gradient-descent steps until the
    change in loss between two consecutive iterations drops below
    `threshold`, or `max_iter` iterations have run. The loss is printed every
    10th iteration.

    Args:
        y: labels.
        tx: feature matrix, shape (N, D).
        max_iter: maximum number of iterations.
        threshold: stop once |loss_k - loss_{k-1}| is below this value.
        gamma: step size of each gradient step.

    Returns:
        The fitted weights, shape (D, 1).
    """
    losses = []
    w = np.zeros((tx.shape[1], 1))

    for iteration in range(max_iter):
        # get loss and update w.
        loss, w = logistic_regression_gradient_descent_step(y, tx, w, gamma)
        # log info
        if iteration % 10 == 0:
            print("Current iteration={i}, loss={l}".format(i=iteration, l=loss))
        # convergence criterion: loss has stopped changing
        losses.append(loss)
        if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
            break

    return w

In [16]:
def replace_nan_by_median(tx, nan_value):
    """Replace every entry equal to nan_value by the median of its column.

    The median of each column is taken over the entries that are not missing
    (np.nanmedian). The input is never modified: the work is done on a float
    copy, which also lets integer inputs hold the temporary NaN markers.

    Args:
        tx: 2-D array-like of numbers, shape (N, D).
        nan_value: sentinel value that marks a missing entry.

    Returns:
        A new float array of shape (N, D) with missing entries filled in.
    """
    # np.array(...) copies, so the caller's data is left untouched.
    data = np.array(tx, dtype=float)
    data[data == nan_value] = np.nan
    col_median = np.nanmedian(data, axis=0)
    return np.where(np.isnan(data), col_median, data)

## Testing on data

In [33]:
# Small sanity check of column-wise min-max scaling: every column is mapped
# onto [0, 1] by subtracting its minimum and dividing by its range.
x = np.array([[1, 2, 3], [4, -5, 6], [-7, 8, 9000]])
# np.ptp (peak-to-peak = max - min) is used as a function because the
# ndarray.ptp method no longer exists in NumPy 2.0.
x = (x - x.min(axis=0)) / np.ptp(x, axis=0)
x

array([[  7.27272727e-01,   5.38461538e-01,   0.00000000e+00],
       [  1.00000000e+00,   0.00000000e+00,   3.33444481e-04],
       [  0.00000000e+00,   1.00000000e+00,   1.00000000e+00]])

In [47]:
# Feature pipeline: impute missing values, expand a subset of columns into
# polynomial features, scale each column, then split into train / test parts.

from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import normalize

# Entries equal to -999 are treated as missing and replaced by the column median.
tx = replace_nan_by_median(input_data, -999)
# Indices of the input columns that go into the polynomial expansion.
important_cols = [0, 1, 2, 7, 10, 11, 13, 19]
degree = 4
poly = PolynomialFeatures(degree)
x_poly = poly.fit_transform(tx[:, important_cols])
# Scale every column (axis=0) by its 'max' norm.
# NOTE(review): the scale is computed on all rows before the split below, so
# the held-out rows influence the scaling of the training rows.
x_poly = normalize(x_poly, axis=0, norm='max')

# NOTE(review): presumably ratio=0.9 puts 90% of the rows into the training
# part -- confirm against split_data.
x_train, x_test, y_train, y_test = split_data(x_poly, yb, ratio=0.9, seed=123)

In [48]:
# Largest entry of the scaled feature matrix.
x_poly.max()

1.0

In [51]:
# Fit the weights with Newton's method; the commented-out line below is the
# plain gradient-descent alternative.
weights = logistic_regression(y_train, x_train)
#weights = logistic_regression_gradient_descent(y_train, x_train)

Current iteration=0, the loss=[ 155958.11562618]
Current iteration=1, the loss=[ 130022.17113303]
Current iteration=2, the loss=[ 115757.16928569]
Current iteration=3, the loss=[ 106910.87901842]
Current iteration=4, the loss=[ 101148.44961195]
Current iteration=5, the loss=[ 97314.83565042]
Current iteration=6, the loss=[ 94741.07652506]
Current iteration=7, the loss=[ 93004.16100331]
Current iteration=8, the loss=[ 91840.99929786]
Current iteration=9, the loss=[ 91070.55479957]
Current iteration=10, the loss=[ 90564.49776047]
Current iteration=11, the loss=[ 90233.18911906]
Current iteration=12, the loss=[ 90019.78209352]
Current iteration=13, the loss=[ 89881.28157526]
Current iteration=14, the loss=[ 89792.96301521]
Current iteration=15, the loss=[ 89735.78461935]
Current iteration=16, the loss=[ 89700.10910937]
Current iteration=17, the loss=[ 89677.93835444]
Current iteration=18, the loss=[ 89663.78904491]
Current iteration=19, the loss=[ 89652.4453274]


  This is separate from the ipykernel package so we can avoid doing imports until


Current iteration=20, the loss=[ 89645.6816281]
Current iteration=21, the loss=[ 89641.64638013]
Current iteration=22, the loss=[ 89639.46429906]
Current iteration=23, the loss=[ 89638.14616562]
Current iteration=24, the loss=[ 89634.37263575]
Current iteration=25, the loss=[ 89631.74950051]
Current iteration=26, the loss=[ 89629.84712511]
Current iteration=27, the loss=[ 89628.54992638]
Current iteration=28, the loss=[ 89627.68590933]
Current iteration=29, the loss=[ 89627.21618255]
Current iteration=30, the loss=[ 89626.95842696]
Current iteration=31, the loss=[ 89626.49345555]
Current iteration=32, the loss=[ 89626.13343175]
Current iteration=33, the loss=[ 89625.94493734]
Current iteration=34, the loss=[ 89625.84748253]


In [52]:
from proj1_helpers import predict_labels

# Predict labels with found weights and print some useful information about quality of fit
# NOTE(review): the prediction runs on x_poly, i.e. on all rows including the
# ones the weights were trained on; the held-out x_test / y_test split is not
# used here, so the reported percentage is not a held-out accuracy.
y_pred = predict_labels(weights, x_poly)
# Compared against yb_orig, the labels in their original (pre-recoding) encoding.
correctness(yb_orig, y_pred)

Total correct: 205391 
Total incorrect: 44609 
Correct percentage: 82.15639999999999 %


In [55]:
# Load the full test set (sub_sample=False).
# NOTE(review): this path is hard-coded, whereas TRAIN_PATH is built from
# DATA_PATH -- confirm both point at the same directory and consider
# os.path.join(DATA_PATH, 'test.csv').
test_path = "../data/test.csv"
yb_test, input_data_test, ids_test = load_csv_data(test_path, sub_sample=False)

In [56]:
# Build the test features with the same steps as the training features:
# median imputation of -999 entries, then a degree-4 polynomial expansion of
# the selected columns.
tx_test = replace_nan_by_median(input_data_test, -999)
important_cols = [0, 1, 2, 7, 10, 11, 13, 19]
degree = 4
poly_test = PolynomialFeatures(degree)
x_poly_test = poly_test.fit_transform(tx_test[:, important_cols])
# The weights were learned on features divided by the TRAINING column maxima,
# so the test features must be divided by those same maxima. Scaling by the
# test set's own maxima would put the two feature sets on different scales.
# The training scale is recomputed from `tx` / `poly` because the unscaled
# training matrix was overwritten when it was normalized.
train_col_scale = np.abs(poly.transform(tx[:, important_cols])).max(axis=0)
# Leave all-zero columns unscaled instead of dividing by zero.
train_col_scale[train_col_scale == 0] = 1.0
x_poly_test = x_poly_test / train_col_scale

In [68]:
# Predict labels for the test features with the fitted weights.
y_test_pred = predict_labels(weights, x_poly_test)

In [81]:
# Check the shape of the test predictions.
y_test_pred.shape

(568238, 1)

In [82]:
# Write the test ids and their predicted labels to the submission file.
create_csv_submission(ids_test, y_test_pred, "test_output.csv")