In [1]:
# Render matplotlib figures inline, directly below the cell that creates them
%matplotlib inline
# Load IPython's autoreload extension; mode 2 re-imports modules before each
# cell execution, so edits to the imported project modules are picked up
# without restarting the kernel
%load_ext autoreload
%autoreload 2

In [107]:
import numpy as np

import sys
sys.path.append('../')

from scripts.proj1_helpers import *
from scripts.implementations import *

# Load train data

In [13]:
# Read the labels, the feature matrix and the sample ids of the training set
y, tx, ids = load_csv_data('../data/train.csv')

# Standardise the features and keep the statistics that were used, so that
# the exact same transformation can be applied to other data later on
tx, mean_tx, std_tx = standardise(tx)

# Sanity check 1: dimensions of labels and features
shape_report = 'Shape: y: {}, x:{}\n'.format(y.shape, tx.shape)
print(shape_report)

# Sanity check 2: per-column mean and standard deviation after standardising
column_means = np.mean(tx, axis=0)
column_stds = np.std(tx, axis=0)
print(column_means, column_stds)

# Load test data

In [None]:
# Read the labels, the feature matrix and the sample ids of the test set
test_csv_path = '../data/test.csv'
y_test, tx_test, ids_test = load_csv_data(test_csv_path)

# Reuse the mean and std obtained on the training features: the test set must
# go through the same transformation as the data the models are fitted on
tx_test = standardise_to_fixed(tx_test, mean_tx, std_tx)

# 0 Random baseline guessing

In [119]:
# Get baseline frequency of the two classes (+1 / -1) in the training data.
# np.mean over the boolean mask is vectorised, whereas the builtin sum()
# walks the array element by element in Python.
prior_probs = [np.mean(y == 1), np.mean(y == -1)]

# Draw one label per test sample according to the class priors. A dedicated,
# seeded generator makes the baseline submission reproducible and leaves the
# global NumPy random state untouched.
baseline_rng = np.random.RandomState(0)
y_test_pred = baseline_rng.choice([1., -1.], size=len(y_test), p=prior_probs)

# Save in submission file
# NOTE(review): 'basline' is misspelt in the file name; kept as is so that the
# output path does not change -- rename if nothing depends on it.
create_csv_submission(ids_test, y_test_pred, '../data/random_basline_submission.csv')

After submission to the AICrowd platform, the random-guess model reaches an accuracy of 55% on the test set. This is therefore our baseline: any model that scores below it has probably overfitted or diverged.

# 1 Least-squares gradient descent with shuffle split 

In [122]:
# Define hyperparameters for gradient descent
max_iters = 50
gamma = .1

num_samples, num_dim = tx.shape
# Initial weights vector to train a linear model (one weight per feature column)
initial_w = np.zeros(num_dim)

# Results of every split, each inner dict being keyed by the split index
res = {
    'weights': {},
    'accuracy': {},
    'loss': {}
}
n_iter = 0

for train_data, eval_data in train_eval_split(y, tx, train_size=.7, num_splits=5):
    # Get training data
    y_train, tx_train = train_data

    # Run gradient descent under MSE loss to find optimal weights
    final_w, final_loss = least_squares_GD(y_train, tx_train, initial_w, max_iters, gamma)

    # Get validation set
    y_eval, tx_eval = eval_data

    # Get predictions from current model
    y_pred = predict_labels(final_w, tx_eval)

    acc = get_accuracy(y_pred, y_eval)

    print('Accuracy of predictions using least-squares gradient descent', acc, '\n')

    # Store what was fitted on *this* split. The values returned by
    # least_squares_GD are named final_w / final_loss; no other weight or loss
    # variable is defined in this cell.
    res['weights'][n_iter] = final_w
    res['loss'][n_iter] = final_loss
    res['accuracy'][n_iter] = acc

    n_iter += 1

# Select model with highest accuracy on validation set: max() returns the split
# index whose stored accuracy is largest, then the matching weights are fetched
iter_max_acc = max(res['accuracy'], key=res['accuracy'].get)
w_max_acc = res['weights'][iter_max_acc]

Gradient Descent(0/49): loss=0.4636087303159818, gradient=0.7850384279684134
Gradient Descent(1/49): loss=0.44856737946369746, gradient=0.42135566888668025
Gradient Descent(2/49): loss=0.43907337842098365, gradient=0.3239920655829415
Gradient Descent(3/49): loss=0.4326351490635258, gradient=0.2651820191539501
Gradient Descent(4/49): loss=0.428106852004358, gradient=0.2214672862075326
Gradient Descent(5/49): loss=0.4248031527475808, gradient=0.18834801945977317
Gradient Descent(6/49): loss=0.42229957092091547, gradient=0.163237290452147
Gradient Descent(7/49): loss=0.4203298629959851, gradient=0.14417116938505284
Gradient Descent(8/49): loss=0.41872496575343865, gradient=0.12962655796411296
Gradient Descent(9/49): loss=0.41737599713032336, gradient=0.11843430296549144
Gradient Descent(10/49): loss=0.4162116878207794, gradient=0.10971152110227202
Gradient Descent(11/49): loss=0.4151845274903265, gradient=0.10280296649612077
Gradient Descent(12/49): loss=0.41426219842172696, gradient=0.09

Gradient Descent(10/49): loss=0.3921403376329163, gradient=0.0229728546973048
Gradient Descent(11/49): loss=0.39208910207371, gradient=0.02270146646570106
Gradient Descent(12/49): loss=0.39203904358779296, gradient=0.022438162604586195
Gradient Descent(13/49): loss=0.39199011906774084, gradient=0.02218175426612966
Gradient Descent(14/49): loss=0.39194229016810234, gradient=0.02193132993247839
Gradient Descent(15/49): loss=0.39189552227461916, gradient=0.02168618563760748
Gradient Descent(16/49): loss=0.39184978373460055, gradient=0.02144577334208858
Gradient Descent(17/49): loss=0.3918050452788412, gradient=0.021209662635221685
Gradient Descent(18/49): loss=0.39176127958464035, gradient=0.020977512237130347
Gradient Descent(19/49): loss=0.3917184609432256, gradient=0.020749048721963103
Gradient Descent(20/49): loss=0.3916765650047805, gradient=0.02052405057518716
Gradient Descent(21/49): loss=0.39163556858143445, gradient=0.020302336202183465
Gradient Descent(22/49): loss=0.39159544949

Gradient Descent(17/49): loss=0.39010295529124167, gradient=0.007877280576943044
Gradient Descent(18/49): loss=0.39009691382045275, gradient=0.007793295909653595
Gradient Descent(19/49): loss=0.39009099861799945, gradient=0.007711208291203996
Gradient Descent(20/49): loss=0.3900852058441769, gradient=0.007630810567350824
Gradient Descent(21/49): loss=0.3900795319729104, gradient=0.007551938401720225
Gradient Descent(22/49): loss=0.39007397373532643, gradient=0.007474459832794413
Gradient Descent(23/49): loss=0.3900685280765424, gradient=0.007398267636524544
Gradient Descent(24/49): loss=0.39006319212219526, gradient=0.007323273686185222
Gradient Descent(25/49): loss=0.3900579631522196, gradient=0.007249404746061085
Gradient Descent(26/49): loss=0.3900528385800815, gradient=0.007176599303582832
Gradient Descent(27/49): loss=0.3900478159361572, gradient=0.007104805160387658
Gradient Descent(28/49): loss=0.39004289285429455, gradient=0.007033977583143347
Gradient Descent(29/49): loss=0.39

In [121]:
# Label the test samples with the weights that reached the highest
# validation accuracy
y_test_pred = predict_labels(w_max_acc, tx_test)

# Write ids and predicted labels to the submission file
gd_submission_path = '../data/test_gd_submission.csv'
create_csv_submission(ids_test, y_test_pred, gd_submission_path)