In [2]:
import pandas as pd
import numpy as np

In [3]:
from data.load_data import load_faces, load_features

# Load the two project datasets through the helpers imported from data.load_data.
# Later cells index `features` positionally (features[idx]) and `faces` through
# .iloc with the same index arrays.
# NOTE(review): this assumes the two objects are row-aligned — confirm in load_data.
features = load_features()
faces = load_faces()

In [4]:
# Reproducible shuffle split of row positions: 80% training, 20% validation.
np.random.seed(67)

num_of_samples = features.shape[0]
# Given an int, permutation() shuffles np.arange(num_of_samples).
idx = np.random.permutation(num_of_samples)
# The first 80% (rounded down) of the shuffled positions train; the rest validate.
cut = int(num_of_samples * 0.8)
train_idx, valid_idx = idx[:cut], idx[cut:]

In [5]:
# Size of the training split; the next cell derives the minibatch size from it.
num_of_training_samples = len(train_idx)

In [6]:
# How many chunks the training rows are cut into.
number_of_minibatches = 4
# Floor division: number_of_minibatches chunks of exactly this size cover all
# training rows except a remainder of at most number_of_minibatches - 1.
minibatch_chunk_size = num_of_training_samples // number_of_minibatches
# Display the resulting chunk size.
minibatch_chunk_size

217159

In [7]:
# NOTE(review): `ground_truth` is not referenced again in the visible cells;
# presumably these are label column names — confirm against utils.
ground_truth = ['pol_dat_us', 'pol_dat_ca', 'pol_dat_uk', 'pol_fb_us']

from utils import label_func, get_labels

# Quick check of get_labels on four rows of `faces` (positions 1, 2, 4 and 5).
get_labels(faces.iloc[[1,2,4,5]])

array([0., 1., 0., 0.])

In [14]:
from sklearn.linear_model import SGDClassifier, Lasso

# Linear classifier fitted by SGD: logistic loss with an elastic-net penalty.
# NOTE(review): recent scikit-learn releases spell this loss 'log_loss' —
# check the installed version before upgrading.
model = SGDClassifier(
    loss='log',
    penalty='elasticnet',
    alpha=0.01,
)
# Alternative estimator kept for reference:
# model = Lasso(alpha=0.1)

In [15]:
def get_binary_accuracy(preds, actual):
    """Return the fraction of predictions that exactly match the true labels.

    Parameters
    ----------
    preds : array-like
        Predicted labels.
    actual : array-like
        Ground-truth labels; must have the same shape as ``preds``.

    Returns
    -------
    numpy.float64
        Share of positions where ``preds == actual``, in ``[0, 1]``.
        ``nan`` when the inputs are empty (there is nothing to score).

    Raises
    ------
    ValueError
        If ``preds`` and ``actual`` have different shapes.
    """
    # Accept plain lists/tuples as well as arrays; on lists `==` would yield a
    # single bool, which has no .sum().
    preds = np.asarray(preds)
    actual = np.asarray(actual)

    # Mismatched shapes would otherwise broadcast (e.g. (n,) vs (n, 1) -> (n, n))
    # and silently produce a meaningless ratio.
    if preds.shape != actual.shape:
        raise ValueError(
            f"preds and actual must have the same shape, "
            f"got {preds.shape} and {actual.shape}"
        )

    # Avoid a 0/0 division (and its RuntimeWarning) on empty input.
    if actual.size == 0:
        return np.float64("nan")

    return np.mean(preds == actual)

In [16]:
from tqdm import tqdm, trange

# Cut the training positions into `number_of_minibatches` nearly equal chunks.
# np.array_split gives the remainder to the leading chunks, so every training
# row is used exactly once; slicing by a floor-divided chunk size would
# silently drop up to number_of_minibatches - 1 rows.
minibatches = np.array_split(train_idx, number_of_minibatches)

with trange(number_of_minibatches) as t:
    for i in t:
        t.set_description(f'Minibatch {i}')
        indexes = minibatches[i]
        X = features[indexes]
        y = get_labels(faces.iloc[indexes])
        # Pass the full class list: a single chunk is not guaranteed to
        # contain both labels.
        model.partial_fit(X, y, classes=[0,1])

Minibatch 3: 100%|████████████████████████████████| 4/4 [01:35<00:00, 23.85s/it]


In [17]:
# Hard class predictions for the held-out rows, cast to float
# (the get_labels sample output shown earlier is a float array).
y_preds = model.predict(features[valid_idx]).astype(float)

In [18]:
# Per-class probabilities for the same held-out rows.
y_preds_proba = model.predict_proba(features[valid_idx])

In [19]:
# Display the probability matrix (two columns; the model was fitted with classes=[0,1]).
y_preds_proba

array([[0.16891302, 0.83108698],
       [0.406648  , 0.593352  ],
       [0.6331258 , 0.3668742 ],
       ...,
       [0.43365634, 0.56634366],
       [0.66080738, 0.33919262],
       [0.39161475, 0.60838525]])

In [20]:
# Labels for the held-out rows, produced by the same helper used for training.
y_valid = get_labels(faces.iloc[valid_idx])

In [21]:
# Validation accuracy of the hard predictions.
get_binary_accuracy(y_preds, y_valid)

0.6503621770223661

In [27]:
# Hard class predictions on the training rows, cast to float like y_preds.
y_train_pred = model.predict(features[train_idx]).astype(float)

In [31]:
# Labels for the training rows.
y_train = get_labels(faces.iloc[train_idx])

In [32]:
# Training accuracy, for comparison with the validation accuracy.
get_binary_accuracy(y_train_pred, y_train)

0.6523791323408193

In [25]:
from joblib import dump

In [26]:
import os

# joblib.dump does not create missing parent directories; make sure the target
# folder exists so a fresh checkout does not fail after the long training cell.
os.makedirs('saved_model', exist_ok=True)

# NOTE(review): the file is named "lasso", but the object saved here is the
# SGDClassifier built earlier in this notebook — the name is kept so existing
# loaders keep working.
dump(model, 'saved_model/lasso.joblib')

['saved_model/lasso.joblib']