In [10]:
import pandas as pd
import numpy as np

In [1]:
from data.load_data import load_faces, load_features

features = load_features()
faces = load_faces()

In [3]:
import torch 

# Check if GPU is available
print(torch.cuda.is_available())

True


In [4]:
from torchvision import models
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

In [5]:
features.shape

(1085795, 2048)

In [6]:
ground_truth = ['pol_dat_us', 'pol_dat_ca', 'pol_dat_uk', 'pol_fb_us']

from utils import label_func, get_labels

get_labels(faces.iloc[[1,2,4,5]])

array([0., 1., 0., 0.])

In [8]:
import numpy as np

# Randomly split training and testing datasets
np.random.seed(67)
torch.manual_seed(67)

num_of_samples = features.shape[0]
idx = np.random.permutation(range(num_of_samples))
cut = int(0.8 * num_of_samples)
train_idx = idx[:cut]
valid_idx = idx[cut:]

In [9]:
len(train_idx), len(valid_idx)

(868636, 217159)

In [10]:
features[0]

array([-0.46469116,  0.7696707 ,  0.28533868, ...,  0.87747054,
        2.11865328, -0.50703953])

In [11]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, indexes, label_func=label_func):
        self.indexes,self.label_func = indexes,label_func
        
    def __getitem__(self, i):
        index = self.indexes[i]
        
        sample = torch.tensor(features[index]).float()
        label = label_func(faces.iloc[index])
        
        return sample,label
    
    def __len__(self): return len(self.indexes)



In [12]:
train_ds = CustomDataset(train_idx)
valid_ds = CustomDataset(valid_idx)

In [13]:
train_dl = DataLoader(train_ds, batch_size=64, num_workers=6)
valid_dl = DataLoader(valid_ds, batch_size=64, num_workers=6)

In [14]:
one_batch = next(iter(train_dl))

In [15]:
one_batch[0]

tensor([[-0.6111,  0.2389, -0.6342,  ...,  0.0221, -0.2860, -0.2405],
        [ 0.7539,  1.9130, -0.1706,  ...,  0.3420,  0.2061, -0.5174],
        [-0.6160,  0.4719, -0.5864,  ..., -0.5715,  3.1570, -0.5152],
        ...,
        [ 2.2167,  0.8044,  3.0293,  ..., -0.5455, -0.6586, -0.5183],
        [ 2.4345,  1.0463, -0.6355,  ...,  3.4711,  1.0346, -0.1659],
        [ 0.0053,  1.4662, -0.6334,  ..., -0.5711, -0.6581, -0.5154]])

In [26]:
# Define a simple binary classifier that takes a 2048 feature long tensor as input
class BinaryClassifier(nn.Module):
    def __init__(self):
        super(BinaryClassifier, self).__init__()        
        
        # Number of input features is 2048
        self.layer_1 = nn.Linear(2048, 2048)
        self.layer_2 = nn.Linear(2048, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.1)
        
    def forward(self, inputs):
        x = self.relu(self.layer_1(inputs))
        x = self.dropout(x)
        x = self.layer_2(x)
        
        return x

In [27]:
import torch.optim as optim

LEARNING_RATE = 0.0001

model = BinaryClassifier()
loss_function = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [28]:
# Run a minibatch of data through the model to check we get any errors
preds = model(one_batch[0])
preds[:5],preds.shape

(tensor([[ 0.0632],
         [ 0.0171],
         [ 0.0563],
         [ 0.1310],
         [-0.4004]], grad_fn=<SliceBackward0>),
 torch.Size([64, 1]))

In [29]:
def binary_acc(y_pred, y_test):
    # Transform outputs to 0 and 1
    y_pred_tag = torch.round(torch.sigmoid(y_pred))

    # Calculate percentage of correct predictions
    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    
    return acc

In [30]:
binary_acc(preds, one_batch[1].unsqueeze(1))

tensor(0.4688)

In [31]:
# Specify that we want to use the GPU if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [32]:
from tqdm import tqdm

In [33]:
def plot_losses_and_metrics(losses, val_losses, accuracies, val_accuracies):
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 3))

    axes[0].plot(losses)
    axes[0].plot(val_losses)
    axes[0].set_title('model loss')
    axes[0].set_ylabel('loss')
    axes[0].set_xlabel('epoch')
    axes[0].legend(['train', 'test'], loc='upper left')

    axes[1].plot(accuracies)
    axes[1].plot(val_accuracies)
    axes[1].set_title('binary accuracy')
    axes[1].set_ylabel('acc')
    axes[1].set_xlabel('epoch')
    axes[1].legend(['train', 'test'], loc='upper left')
    
    fig.tight_layout()

In [34]:
EPOCHS = 8

losses = []
val_losses = []
accuracies = []
val_accuracies = []

# Move model to GPU if possible
model = model.to(device)
# Tells PyTorch we are in training mode
model.train()

try:
    for e in range(12):

        # Set loss and accuracy to zero at start of each epoch
        epoch_training_loss = 0
        epoch_training_accuracy = 0
        epoch_valid_loss = 0
        epoch_valid_accuracy = 0

        with tqdm(train_dl, unit="batch") as tepoch:
            for x_batch, y_batch in tepoch:
                tepoch.set_description(f"Epoch {e}")
                # Transfer the tensors to the GPU if possible
                x_batch, y_batch = x_batch.to(device), y_batch.to(device)

                # Zero out gradients before backpropagation (PyTorch cumulates the gradient otherwise)
                optimizer.zero_grad()

                # Predict a minibatch of outputs
                y_pred = model(x_batch)

                # Calculate the loss (unsqueeze adds a dimension to y)
                loss = loss_function(y_pred, y_batch.unsqueeze(1))
                training_acc = binary_acc(y_pred, y_batch.unsqueeze(1))

                # Backpropagation. Gradients are calculated
                loss.backward()
                optimizer.step()

                batch_loss = loss.item()
                batch_acc = training_acc.item()
                epoch_training_loss += batch_loss
                epoch_training_accuracy += batch_acc
                losses.append(batch_loss)
                accuracies.append(batch_acc)
                
                # tepoch.set_postfix(loss=loss.item(), accuracy=100. * training_acc.item())

        for x_batch, y_batch in valid_dl:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)

            valid_y_pred = model(x_batch)
            valid_loss = loss_function(valid_y_pred, y_batch.unsqueeze(1))
            valid_acc = binary_acc(valid_y_pred, y_batch.unsqueeze(1))

            batch_valid_loss = valid_loss.item()
            batch_valid_accuracy = valid_acc.item()
            epoch_valid_loss += batch_valid_loss
            epoch_valid_accuracy += batch_valid_accuracy
            val_losses.append(batch_valid_loss)
            val_accuracies.append(batch_valid_accuracy)

        avg_train_loss = epoch_training_loss/len(train_dl)
        avg_valid_loss = epoch_training_loss/len(valid_dl)

        avg_train_accuracy = epoch_training_accuracy/len(train_dl)
        avg_valid_accuracy = epoch_valid_accuracy/len(valid_dl)

        print(f'End of Epoch {e}: | Training Loss: {avg_train_loss:.5f} | Training accuracy: {avg_train_accuracy} | Validation Loss: {avg_valid_loss} | Validation Accuracy: {avg_valid_accuracy}')
  
except Exception as e:
    print("Something went wrong in training")
    print(e)

Epoch 0: 100%|████████████████████████| 13573/13573 [00:53<00:00, 254.64batch/s]


End of Epoch 0: | Training Loss: 0.62025 | Training accuracy: 0.651943097379453 | Validation Loss: 2.480461270097314 | Validation Accuracy: 0.6548399486538938


Epoch 1: 100%|████████████████████████| 13573/13573 [00:52<00:00, 261.01batch/s]


End of Epoch 1: | Training Loss: 0.61460 | Training accuracy: 0.6579195435296757 | Validation Loss: 2.4578549569301376 | Validation Accuracy: 0.6529063894317371


Epoch 2: 100%|████████████████████████| 13573/13573 [00:53<00:00, 254.57batch/s]


End of Epoch 2: | Training Loss: 0.61119 | Training accuracy: 0.6613696374661673 | Validation Loss: 2.444232104350172 | Validation Accuracy: 0.6509682264971466


Epoch 3: 100%|████████████████████████| 13573/13573 [00:52<00:00, 259.04batch/s]


End of Epoch 3: | Training Loss: 0.60766 | Training accuracy: 0.6650534214490745 | Validation Loss: 2.430119082299677 | Validation Accuracy: 0.6480218505395744


Epoch 4: 100%|████████████████████████| 13573/13573 [00:53<00:00, 255.89batch/s]


End of Epoch 4: | Training Loss: 0.60383 | Training accuracy: 0.6687694385418322 | Validation Loss: 2.4148004531638536 | Validation Accuracy: 0.6452964527788202


Epoch 5: 100%|████████████████████████| 13573/13573 [00:52<00:00, 256.26batch/s]


End of Epoch 5: | Training Loss: 0.59978 | Training accuracy: 0.6730478905080423 | Validation Loss: 2.3985745873237807 | Validation Accuracy: 0.6423040396969109


Epoch 6: 100%|████████████████████████| 13573/13573 [00:52<00:00, 260.01batch/s]


End of Epoch 6: | Training Loss: 0.59542 | Training accuracy: 0.6774921127578746 | Validation Loss: 2.3811578803811315 | Validation Accuracy: 0.6384869044186722


Epoch 7: 100%|████████████████████████| 13573/13573 [00:52<00:00, 258.01batch/s]


End of Epoch 7: | Training Loss: 0.59085 | Training accuracy: 0.6815025036612353 | Validation Loss: 2.3628586027763205 | Validation Accuracy: 0.6349644067361907


Epoch 8: 100%|████████████████████████| 13573/13573 [00:52<00:00, 260.11batch/s]


End of Epoch 8: | Training Loss: 0.58625 | Training accuracy: 0.6859611979170624 | Validation Loss: 2.344487473652406 | Validation Accuracy: 0.6339430402466039


Epoch 9: 100%|████████████████████████| 13573/13573 [00:52<00:00, 257.42batch/s]


End of Epoch 9: | Training Loss: 0.58144 | Training accuracy: 0.690380916422086 | Validation Loss: 2.3252310089198014 | Validation Accuracy: 0.6283350608316532


Epoch 10: 100%|███████████████████████| 13573/13573 [00:55<00:00, 246.02batch/s]


End of Epoch 10: | Training Loss: 0.57622 | Training accuracy: 0.6955688025634676 | Validation Loss: 2.3043756449938884 | Validation Accuracy: 0.6252597809259373


Epoch 11: 100%|███████████████████████| 13573/13573 [00:53<00:00, 255.10batch/s]


End of Epoch 11: | Training Loss: 0.57078 | Training accuracy: 0.6998803099139933 | Validation Loss: 2.2826022378441992 | Validation Accuracy: 0.6245514668818425


In [55]:
y_pred_list = np.array([])

model.eval()

with torch.no_grad():
    for X_batch, y_batch in valid_dl:
        X_batch = X_batch.to(device)
        y_test_pred = model(X_batch)
        y_test_pred = torch.sigmoid(y_test_pred)
        y_pred_tag = torch.round(y_test_pred)
        y_pred_tag = y_pred_tag.squeeze(1).cpu().numpy()
        y_pred_list = np.append(y_pred_list, y_pred_tag)
                     
len(y_pred_list)

217159

In [35]:
torch.save(model.state_dict(), './saved_model/weights-2.pth')