In [1]:
import pandas as pd
import numpy as np

In [2]:
from data.load_data import load_faces, load_features

features = load_features()
faces = load_faces()

In [4]:
# Randomly split training and testing datasets
np.random.seed(67)

num_of_samples = features.shape[0]
idx = np.random.permutation(range(num_of_samples))
cut = int(0.8 * num_of_samples)
train_idx = idx[:cut]
valid_idx = idx[cut:]

In [6]:
num_of_training_samples = len(train_idx)

In [14]:
number_of_minibatches = 4
minibatch_chunk_size = num_of_training_samples // number_of_minibatches
minibatch_chunk_size

217159

In [10]:
ground_truth = ['pol_dat_us', 'pol_dat_ca', 'pol_dat_uk', 'pol_fb_us']

from utils import label_func, get_labels

get_labels(faces.iloc[[1,2,4,5]])

array([0., 1., 0., 0.])

In [65]:
from sklearn.linear_model import SGDClassifier, Lasso

model = SGDClassifier(alpha=0.01, penalty='elasticnet')
# model = Lasso(alpha=0.1)

In [66]:
def get_binary_accuracy(preds, actual):
    num_of_correct_predictions = (preds == actual).sum()
    accuracy = num_of_correct_predictions / len(actual)
    return accuracy

In [67]:
from tqdm import tqdm, trange

with trange(number_of_minibatches) as t:
    for i in t:
        t.set_description(f'Minibatch {i}')
        indexes = train_idx[minibatch_chunk_size * i : minibatch_chunk_size * (i + 1)]
        X = features[indexes]
        y = get_labels(faces.iloc[indexes])
        model.partial_fit(X, y, classes=[0,1])

Minibatch 3: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [01:40<00:00, 25.17s/it]


In [68]:
y_preds = model.predict(features[valid_idx]).astype(float)

In [69]:
y_valid = get_labels(faces.iloc[valid_idx])

In [70]:
get_binary_accuracy(y_preds, y_valid)

0.6499477341487113

In [71]:
from joblib import dump

In [72]:
dump(model, 'saved_model/lasso.joblib')

['saved_model/lasso.joblib']