In [None]:
# Setup and imports

import os
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
%load_ext autoreload
%autoreload 2

import proj1_helpers

from mlcomp.config import DATA_PATH
from mlcomp.data import load_csv_data, create_csv_submission
from mlcomp.helpers import split_data, compute_rmse, logistic_regression_loss, logistic_regression_gradient, sigmoid
from mlcomp.performance import eval_correctness
from mlcomp.feature_eng import build_advanced_poly, replace_nan_by_median
from mlcomp.models import logistic_regression

# Path of train.csv inside the directory given by DATA_PATH (from mlcomp.config).
TRAIN_PATH = os.path.join(DATA_PATH, 'train.csv')

In [None]:
# Read the complete training file (sub-sampling disabled).
yb, input_data, ids = load_csv_data(TRAIN_PATH, sub_sample=False)

# Unmodified copy of the labels as a column vector, taken before any remapping.
yb_orig = yb.copy().reshape((len(yb), 1))

# Remap label -1 to 0 in place, then reshape the working labels into a column vector too.
yb[yb == -1] = 0
yb = yb.reshape((len(yb), 1))

In [None]:
# Display the label column vector after -1 has been remapped to 0.
yb

## Logistic Regression

In [None]:
def logistic_regression_gradient_descent_step(y, tx, w, gamma):
    """
    Perform a single gradient descent update for logistic regression.

    The loss is evaluated at the incoming weights `w` (i.e. before the update),
    then `w` is moved by `gamma` times the gradient in the descent direction.
    Return the tuple (loss, updated w).
    """
    current_loss = logistic_regression_loss(y, tx, w)
    updated_w = w - gamma * logistic_regression_gradient(y, tx, w)
    return current_loss, updated_w

In [None]:
def logistic_regression_gradient_descent(y, tx, max_iter=10000, threshold=1e-8, gamma=0.01, verbose=True):
    """
    Fit logistic regression weights with plain gradient descent.

    Starts from an all-zero column vector of weights and repeatedly applies
    logistic_regression_gradient_descent_step. Iteration stops as soon as the
    loss changes by less than `threshold` between two consecutive steps, or
    after `max_iter` steps.

    Parameters:
        y: labels, forwarded unchanged to the step function.
        tx: feature matrix; `tx.shape[1]` determines the number of weights.
        max_iter: maximum number of gradient steps (default 10000).
        threshold: convergence tolerance on the absolute change of the loss
            between consecutive steps (default 1e-8).
        gamma: step size handed to every gradient step (default 0.01).
        verbose: when True (default), print the loss every 10 iterations.

    Returns:
        The weights after the last performed step, shape (tx.shape[1], 1).
    """
    w = np.zeros((tx.shape[1], 1))

    # Only the previous loss is needed for the convergence test, so no full history is kept.
    previous_loss = None

    for step in range(max_iter):
        # The returned loss is the one measured before this step's update of w.
        loss, w = logistic_regression_gradient_descent_step(y, tx, w, gamma)

        if verbose and step % 10 == 0:
            print("Current iteration={i}, loss={l}".format(i=step, l=loss))

        # Converged once two consecutive losses are (almost) identical.
        if previous_loss is not None and np.abs(loss - previous_loss) < threshold:
            break
        previous_loss = loss

    return w

## Testing on data

In [None]:
# Clean the raw features (presumably -999 is the missing-value marker that
# replace_nan_by_median swaps for a column median — confirm in mlcomp.feature_eng).
tx = replace_nan_by_median(input_data, -999)
# Column indices and polynomial degree handed to build_advanced_poly; how exactly they
# are used is defined by that helper. The same two values are reused for the test set.
important_cols = [0, 2, 7, 1, 11, 13, 5, 9]
degree = 5
x_poly = build_advanced_poly(tx, degree, important_cols)

# Split the expanded features and the remapped labels yb with a fixed seed
# (presumably ratio=0.7 is the training share — confirm in split_data).
x_train, x_test, y_train, y_test = split_data(x_poly, yb, ratio=0.7, seed=123)

In [None]:
# Start from all-zero weights: one row per column of the expanded feature matrix.
initial_w = np.zeros((x_poly.shape[1],1))
max_iters = 100
gamma = 0.7  # passed as the gamma argument (presumably the step size — confirm in mlcomp.models)

# Train on the training split only; x_test / y_test are not used in this call.
weights, loss = logistic_regression(y_train, x_train, initial_w, max_iters, gamma)

In [None]:
# Display the weights returned by logistic_regression.
weights

In [None]:
def predict_labels(weights, data, cutoff=0):
    """Generate class predictions (-1 or 1) given weights and a data matrix.

    The linear score ``np.dot(data, weights)`` is compared with ``cutoff``:
    scores strictly greater than ``cutoff`` become 1, all other scores
    (including NaN, which never compares as greater) become -1.

    Parameters:
        weights: weight vector/matrix multiplied with ``data``.
        data: data matrix whose rows are the samples to classify.
        cutoff: threshold applied to the raw linear score (default 0).

    Returns:
        Array with the same shape and dtype as ``np.dot(data, weights)``,
        containing only -1 and 1.
    """
    scores = np.dot(data, weights)
    # Decide every label from the untouched scores in one pass. Overwriting the scores
    # in place with -1 first and then testing "> cutoff" would flip those fresh -1
    # entries to 1 whenever cutoff < -1.
    return np.where(scores > cutoff, 1, -1).astype(scores.dtype)

In [None]:


# Predict labels with found weights and print some useful information about quality of fit.
# NOTE(review): this scores the full x_poly matrix, which includes the rows the model was
# trained on, against yb_orig (the label copy taken before -1 was remapped to 0) — confirm
# whether the held-out x_test split was meant to be evaluated instead.
# NOTE(review): predict_labels compares the raw linear score np.dot(data, weights) with the
# cutoff and applies no sigmoid, so 0.5 is a score threshold, not a probability — confirm.
y_pred = predict_labels(weights, x_poly, 0.5)
eval_correctness(yb_orig, y_pred, verbose=True)

In [None]:
# Build the test-set path from DATA_PATH, the same way TRAIN_PATH is built, instead of a
# literal relative to the current working directory, so both CSV files are resolved
# from the same data directory.
test_path = os.path.join(DATA_PATH, 'test.csv')
yb_test, input_data_test, ids_test = load_csv_data(test_path, sub_sample=False)

In [None]:
# Build the test features with the same steps as the training features:
# replace_nan_by_median followed by build_advanced_poly with the same degree and columns.
# No additional normalisation is applied: `normalize` is never imported in this notebook
# (the call raised NameError on a fresh kernel) and the training features were not
# normalised either, so the trained weights expect unscaled inputs.
tx_test = replace_nan_by_median(input_data_test, -999)
x_poly_test = build_advanced_poly(tx_test, degree, important_cols)

In [None]:
# Label the test-set features with the trained weights, using the same 0.5 score cutoff
# as the evaluation on the training data.
y_pred_submit = predict_labels(weights, x_poly_test, 0.5)

In [None]:
# Sanity-check the shape of the test predictions (stored in y_pred_submit).
y_pred_submit.shape

In [None]:
# Write the test predictions (stored in y_pred_submit) to the submission file.
create_csv_submission(ids_test, y_pred_submit, "test_output.csv")