In [1]:
import numpy as np
import pandas as pd
import helpers
import implementation
import matplotlib.pyplot as plt
from matplotlib import colors
from matplotlib.ticker import PercentFormatter

In [63]:
def quantile_normalize(data, q=0.75):
    """Centre each column on its median and scale it by a central quantile range.

    The scale of a column is the distance between its ``(1 - q) / 2`` and
    ``1 - (1 - q) / 2`` quantiles, i.e. the width of the central interval that
    covers a fraction ``q`` of the values (the interquartile range for
    ``q=0.5``).

    Parameters
    ----------
    data : array_like
        Values to normalise; quantiles are taken along axis 0.
    q : float, optional
        Fraction of the values covered by the scaling interval.

    Returns
    -------
    numpy.ndarray
        ``(data - median) / (q_high - q_low)``.  Columns whose quantile range
        is zero (e.g. constant columns) are only centred, so they come out as
        zeros instead of NaN/inf.
    """
    low = (1 - q) / 2
    high = 1 - low
    # Compute the three quantiles in a single call instead of three.
    q_low, median, q_high = np.quantile(data, [low, 0.5, high], axis=0)
    spread = q_high - q_low
    # Dividing by a zero spread would produce NaN/inf; divide by 1 instead.
    spread = np.where(spread == 0, 1.0, spread)
    return (data - median) / spread

def mim_max_normalize(data):
    """Rescale each column linearly so that its values span [0, 1].

    The function name is left unchanged so existing callers keep working.

    Parameters
    ----------
    data : numpy.ndarray
        Values to rescale; minima and maxima are taken along axis 0.

    Returns
    -------
    numpy.ndarray
        ``(data - min) / (max - min)`` per column.  Columns where
        ``max == min`` (constant columns) are mapped to zeros instead of
        being divided by zero.
    """
    col_min = data.min(axis=0)
    col_range = data.max(axis=0) - col_min
    # A constant column has zero range; divide by 1 to avoid NaN.
    col_range = np.where(col_range == 0, 1, col_range)
    return (data - col_min) / col_range

def z_normalize(data):
    """Standardise each column to zero mean and unit standard deviation.

    Parameters
    ----------
    data : numpy.ndarray
        Values to standardise; mean and standard deviation are taken along
        axis 0.

    Returns
    -------
    numpy.ndarray
        ``(data - mean) / std`` per column.  Columns with zero standard
        deviation (constant columns) are only centred, so they come out as
        zeros instead of NaN.
    """
    col_std = data.std(axis=0)
    # Zero-variance columns would otherwise yield 0 / 0 = NaN.
    col_std = np.where(col_std == 0, 1.0, col_std)
    return (data - data.mean(axis=0)) / col_std

def accuracy(y, tx, w):
    """Return the fraction of samples whose predicted class equals ``y``.

    A sample is predicted as class 1 when its linear score ``tx.dot(w)`` is
    strictly positive and as class 0 otherwise, so ``y`` is expected to hold
    0/1 labels.
    """
    predicted = (tx.dot(w) > 0).astype(int)
    n_hits = np.sum(predicted == y)
    return n_hits / len(y)

def build_test_train(y, tx, proportion=0.9, seed=1):
    """Randomly split labels and features into a train part and a test part.

    The global NumPy random generator is re-seeded with ``seed`` before the
    permutation is drawn, so two calls with the same seed and the same number
    of samples produce the same split.

    Parameters
    ----------
    y : numpy.ndarray
        Labels, one per sample.
    tx : numpy.ndarray
        Feature rows, indexed like ``y``.
    proportion : float, optional
        Fraction of the samples assigned to the training part.
    seed : int, optional
        Seed passed to ``np.random.seed``.

    Returns
    -------
    tuple
        ``(y_te, y_tr, tx_te, tx_tr)`` -- note that the test parts come first.
    """
    np.random.seed(seed)
    n_samples = len(y)
    shuffled = np.random.permutation(n_samples)

    # The first `n_train` shuffled positions form the training part.
    n_train = int(proportion * n_samples)
    train_idx, test_idx = shuffled[:n_train], shuffled[n_train:]

    return y[test_idx], y[train_idx], tx[test_idx], tx[train_idx]

# Loading data

In [3]:
# Read the training CSV through the project helper. The three returned values
# are presumably the labels, the raw feature matrix and the sample ids
# (TODO confirm against helpers.load_csv_data).
yb, input_data, ids = helpers.load_csv_data("./data/train.csv")

# Finding the best model for the classification

### logistic regression and least squares
At first, we have to normalize and format the data.

In [4]:
# Standardise every feature column, then prepend a column of ones so that the
# first weight acts as the intercept.
x = z_normalize(input_data)
tx = np.concatenate((np.ones((len(x), 1)), x), axis=1)

# Labels for logistic regression: strictly positive entries of yb become 1,
# all other entries become 0.
y = (yb > 0).astype(int)

Then, we train our two models.

In [5]:
# Fit both baseline models on the whole normalised data set.
w_log_reg, loss = implementation.logistic_regression(
    y,
    tx,
    initial_w=np.zeros(tx.shape[1]),
    max_iters=2000,
    gamma=3e-6,
)
# NOTE: `loss` is reused, so the logistic-regression loss is overwritten here.
w_ls, loss = implementation.least_squares(yb, tx)

We can see that logistic regression achieves a slightly better accuracy than least squares on the data used for training.

In [6]:
# Both accuracies are measured on the same (y, tx) data the models were fitted
# on, so they are training accuracies. accuracy() thresholds tx.dot(w) at 0,
# which is why the least-squares weights are scored against the 0/1 labels.
print('Accuracy for logistic regression : ',accuracy(y, tx, w_log_reg))
print('Accuracy for least squares       : ',accuracy(y, tx, w_ls))

Accuracy for logistic regression :  0.75024
Accuracy for least squares       :  0.744972


#### Testing overfitting
We split our dataset into a training set and a test set.

First, we train the model on the training set; then we evaluate it on the test set.

In [15]:
# Random split with the defaults of build_test_train (proportion=0.9, seed=1).
# The function returns the test parts before the train parts.
y_te,  y_tr,  tx_te, tx_tr = build_test_train(y, tx)

In [16]:
# Refit logistic regression on the training part only, with the same
# hyper-parameters as the full-data fit.
w_log_reg, loss = implementation.logistic_regression(
    y_tr,
    tx_tr,
    initial_w=np.zeros(tx_tr.shape[1]),
    max_iters=2000,
    gamma=3e-6,
)

In [17]:
# Compare the accuracy on the data used for fitting with the accuracy on the
# held-out part to see how much the model overfits.
print('Accuracy for training set : ',accuracy(y_tr, tx_tr, w_log_reg))
print('Accuracy for testing set  : ',accuracy(y_te, tx_te, w_log_reg))

Accuracy for training set :  0.7504977777777778
Accuracy for testing set  :  0.74872


We can see that the accuracies on the training set and on the test set are very close.
This means that our model does not overfit much, so adding a regularization term would not yield a significant improvement on the test set.

Looking at the leaderboard, we notice that these results are not good enough: many teams achieve an accuracy above 0.8, so our model is probably too simple.

### Using interaction of predictors

In [34]:
def build_interaction_tx(input_data, normalisation_function):
    """Build a design matrix with all pairwise feature interactions.

    Steps:
      1. normalise ``input_data`` with ``normalisation_function``;
      2. append the product of every pair of distinct normalised features
         (pairs ``(i, j)`` with ``j < i``, ordered by ``i`` then ``j``);
      3. normalise the expanded feature set again with the same function;
      4. prepend a column of ones.

    Parameters
    ----------
    input_data : numpy.ndarray
        Raw features, one row per sample.
    normalisation_function : callable
        Maps a 2-D (samples x features) array to a normalised array of the
        same shape.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_samples, 1 + n + n * (n - 1) / 2)`` where ``n`` is
        the number of input features.
    """
    base = normalisation_function(input_data)
    n_samples, n_features = len(base), base.shape[1]
    n_pairs = n_features * (n_features - 1) // 2

    # Feature-major buffer: one row per base or interaction feature.
    features = np.empty((n_features + n_pairs, n_samples))
    features[:n_features] = base.T

    # For feature i, write its products with features 0..i-1 in one shot.
    cursor = n_features
    for i in range(1, n_features):
        np.multiply(features[i], features[:i], out=features[cursor:cursor + i])
        cursor += i

    expanded = normalisation_function(features.T)
    intercept = np.ones((len(expanded), 1))
    return np.concatenate((intercept, expanded), axis=1)

In [94]:
# Rebuild the design matrix with pairwise interaction terms. This overwrites
# the `tx` defined earlier in the notebook.
tx = build_interaction_tx(input_data, z_normalize)
# Both calls use the default seed of build_test_train, so they draw the same
# permutation: the 0/1 labels (y_*), the raw labels (yb_*) and the feature
# rows stay aligned. tx_te / tx_tr are simply computed twice.
y_te, y_tr, tx_te, tx_tr = build_test_train(y, tx)
yb_te, yb_tr, tx_te, tx_tr = build_test_train(yb, tx)

In [95]:
# Least squares is fitted on the raw yb labels of the training part, but it is
# scored against the 0/1 labels because accuracy() thresholds tx.dot(w) at 0.
w_ls, loss_tr = implementation.least_squares(yb_tr, tx_tr)
accuracy_te_ls = accuracy(y_te, tx_te, w_ls)
accuracy_tr_ls = accuracy(y_tr, tx_tr, w_ls)

In [119]:
# Hyper-parameters for logistic regression on the interaction features:
# start from all-zero weights, one weight per column of tx.
initial_w = np.zeros(tx.shape[1])
max_iters = 70000
gamma     = 0.0000005
# Fit on the training part only; `loss` is overwritten by this call.
w_log_reg, loss   = implementation.logistic_regression(y_tr, tx_tr, initial_w, max_iters, gamma)

0.6577288888888889        0.65376
0.8006222222222222        0.7992
0.8077955555555556        0.80568
0.8105955555555555        0.80876
0.8121733333333333        0.80996
0.8132222222222222        0.8104
0.8139155555555555        0.8108
0.8144533333333334        0.81144
0.8147244444444445        0.81236
0.8150222222222222        0.81264
0.8152622222222222        0.813
0.8153777777777778        0.8136
0.8156533333333333        0.8138
0.81584                   0.81408
0.8158888888888889        0.814
0.8159466666666667        0.81412
0.8161022222222222        0.81424
0.8161777777777778        0.8142
0.8161688888888889        0.81432
0.8161155555555556        0.8142


In [121]:
# Accuracy of the logistic model with interaction features on the held-out
# part and on the training part; only the held-out value is printed.
accuracy_te = accuracy(y_te, tx_te, w_log_reg)
accuracy_tr = accuracy(y_tr, tx_tr, w_log_reg)

print(accuracy_te)

0.81424


# Trash

In [118]:
def GD(y, tx, initial_w, max_iters, gamma, gradient_func, loss_func):
    """Run plain gradient descent and return the final weights and loss.

    Every 500 iterations (including iteration 0) the accuracy on ``(y, tx)``
    and on the held-out set is printed on one line.  The held-out set is NOT a
    parameter: it is read from the notebook-level variables ``y_te`` and
    ``tx_te``, which must exist before this function is called.

    Parameters
    ----------
    y, tx : labels and features the gradient and loss are evaluated on.
    initial_w : starting weights (not modified in place).
    max_iters : number of gradient steps.
    gamma : step size.
    gradient_func : callable ``(y, tx, w) -> gradient``.
    loss_func : callable ``(y, tx, w) -> loss``.

    Returns
    -------
    tuple
        ``(w, loss)`` with ``loss`` evaluated at the final weights.
    """
    weights = initial_w

    for step in range(max_iters):
        if step % 500 == 0:
            train_acc = accuracy(y, tx, weights)
            heldout_acc = accuracy(y_te, tx_te, weights)
            print(str(train_acc).ljust(25, ' '), heldout_acc)
        # Out-of-place update so the caller's initial_w is left untouched.
        weights = weights - gamma * gradient_func(y, tx, weights)

    return weights, loss_func(y, tx, weights)

def logistic_regression(y, tx, initial_w, max_iters, gamma):
    """Fit logistic regression by gradient descent.

    Delegates to ``GD`` with the logistic gradient and logistic loss taken
    from the ``implementation`` module, and returns its ``(w, loss)`` result.
    """
    return GD(
        y,
        tx,
        initial_w,
        max_iters,
        gamma,
        gradient_func=implementation.logistic_gradient,
        loss_func=implementation.logistic_loss,
    )