In [10]:
import pandas as pd
import numpy as np

In [1]:
from data.load_data import load_faces, load_features

features = load_features()
# faces = load_faces()

In [2]:
import torch 

# Check if GPU is available
print(torch.cuda.is_available())

True


In [3]:
from torchvision import models
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

In [4]:
features.shape

(1085795, 2048)

In [6]:
# ground_truth = ['pol_dat_us', 'pol_dat_ca', 'pol_dat_uk', 'pol_fb_us']

# from utils import label_func, get_labels

# get_labels(faces.iloc[[1,2,4,5]])

array([0., 1., 0., 0.])

In [5]:
import numpy as np

# Randomly split training and testing datasets
np.random.seed(67)
torch.manual_seed(67)

num_of_samples = features.shape[0]
idx = np.random.permutation(range(num_of_samples))
cut = int(0.8 * num_of_samples)
train_idx = idx[:cut]
valid_idx = idx[cut:]

In [6]:
len(train_idx), len(valid_idx)

(868636, 217159)

In [10]:
features[0]

array([-0.46469116,  0.7696707 ,  0.28533868, ...,  0.87747054,
        2.11865328, -0.50703953])

In [11]:
random_labels = np.random.randint(0,2,size=num_of_samples).astype(float)

In [13]:
random_labels.shape

(1085795,)

In [17]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, indexes):
        self.indexes = indexes
        
    def __getitem__(self, i):
        index = self.indexes[i]
        
        sample = torch.tensor(features[index]).float()
        label = random_labels[i]
        
        return sample,label
    
    def __len__(self): return len(self.indexes)



In [18]:
train_ds = CustomDataset(train_idx)
valid_ds = CustomDataset(valid_idx)

In [19]:
train_dl = DataLoader(train_ds, batch_size=64, num_workers=6)
valid_dl = DataLoader(valid_ds, batch_size=64, num_workers=6)

In [20]:
one_batch = next(iter(train_dl))

In [21]:
one_batch[0]

tensor([[-0.6111,  0.2389, -0.6342,  ...,  0.0221, -0.2860, -0.2405],
        [ 0.7539,  1.9130, -0.1706,  ...,  0.3420,  0.2061, -0.5174],
        [-0.6160,  0.4719, -0.5864,  ..., -0.5715,  3.1570, -0.5152],
        ...,
        [ 2.2167,  0.8044,  3.0293,  ..., -0.5455, -0.6586, -0.5183],
        [ 2.4345,  1.0463, -0.6355,  ...,  3.4711,  1.0346, -0.1659],
        [ 0.0053,  1.4662, -0.6334,  ..., -0.5711, -0.6581, -0.5154]])

In [22]:
# Define a simple binary classifier that takes a 2048 feature long tensor as input
class BinaryClassifier(nn.Module):
    def __init__(self):
        super(BinaryClassifier, self).__init__()        
        
        # Number of input features is 2048
        self.layer_1 = nn.Linear(2048, 2048)
        self.layer_2 = nn.Linear(2048, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.1)
        
    def forward(self, inputs):
        x = self.relu(self.layer_1(inputs))
        x = self.dropout(x)
        x = self.layer_2(x)
        
        return x

In [23]:
import torch.optim as optim

LEARNING_RATE = 0.0001

model = BinaryClassifier()
loss_function = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [24]:
# Run a minibatch of data through the model to check we get any errors
preds = model(one_batch[0])
preds[:5],preds.shape

(tensor([[ 0.2164],
         [ 0.1106],
         [ 0.0976],
         [-0.1226],
         [-0.0781]], grad_fn=<SliceBackward0>),
 torch.Size([64, 1]))

In [25]:
def binary_acc(y_pred, y_test):
    # Transform outputs to 0 and 1
    y_pred_tag = torch.round(torch.sigmoid(y_pred))

    # Calculate percentage of correct predictions
    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    
    return acc

In [26]:
binary_acc(preds, one_batch[1].unsqueeze(1))

tensor(0.4062)

In [27]:
# Specify that we want to use the GPU if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [28]:
from tqdm import tqdm

In [29]:
def plot_losses_and_metrics(losses, val_losses, accuracies, val_accuracies):
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 3))

    axes[0].plot(losses)
    axes[0].plot(val_losses)
    axes[0].set_title('model loss')
    axes[0].set_ylabel('loss')
    axes[0].set_xlabel('epoch')
    axes[0].legend(['train', 'test'], loc='upper left')

    axes[1].plot(accuracies)
    axes[1].plot(val_accuracies)
    axes[1].set_title('binary accuracy')
    axes[1].set_ylabel('acc')
    axes[1].set_xlabel('epoch')
    axes[1].legend(['train', 'test'], loc='upper left')
    
    fig.tight_layout()

In [30]:
EPOCHS = 12

losses = []
val_losses = []
accuracies = []
val_accuracies = []

# Move model to GPU if possible
model = model.to(device)
# Tells PyTorch we are in training mode
model.train()

try:
    for e in range(EPOCHS):

        # Set loss and accuracy to zero at start of each epoch
        epoch_training_loss = 0
        epoch_training_accuracy = 0
        epoch_valid_loss = 0
        epoch_valid_accuracy = 0

        with tqdm(train_dl, unit="batch") as tepoch:
            for x_batch, y_batch in tepoch:
                tepoch.set_description(f"Epoch {e}")
                # Transfer the tensors to the GPU if possible
                x_batch, y_batch = x_batch.to(device), y_batch.to(device)

                # Zero out gradients before backpropagation (PyTorch cumulates the gradient otherwise)
                optimizer.zero_grad()

                # Predict a minibatch of outputs
                y_pred = model(x_batch)

                # Calculate the loss (unsqueeze adds a dimension to y)
                loss = loss_function(y_pred, y_batch.unsqueeze(1))
                training_acc = binary_acc(y_pred, y_batch.unsqueeze(1))

                # Backpropagation. Gradients are calculated
                loss.backward()
                optimizer.step()

                batch_loss = loss.item()
                batch_acc = training_acc.item()
                epoch_training_loss += batch_loss
                epoch_training_accuracy += batch_acc
                losses.append(batch_loss)
                accuracies.append(batch_acc)
                
                # tepoch.set_postfix(loss=loss.item(), accuracy=100. * training_acc.item())

        for x_batch, y_batch in valid_dl:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)

            valid_y_pred = model(x_batch)
            valid_loss = loss_function(valid_y_pred, y_batch.unsqueeze(1))
            valid_acc = binary_acc(valid_y_pred, y_batch.unsqueeze(1))

            batch_valid_loss = valid_loss.item()
            batch_valid_accuracy = valid_acc.item()
            epoch_valid_loss += batch_valid_loss
            epoch_valid_accuracy += batch_valid_accuracy
            val_losses.append(batch_valid_loss)
            val_accuracies.append(batch_valid_accuracy)

        avg_train_loss = epoch_training_loss/len(train_dl)
        avg_valid_loss = epoch_training_loss/len(valid_dl)

        avg_train_accuracy = epoch_training_accuracy/len(train_dl)
        avg_valid_accuracy = epoch_valid_accuracy/len(valid_dl)

        print(f'End of Epoch {e}: | Training Loss: {avg_train_loss:.5f} | Training accuracy: {avg_train_accuracy} | Validation Loss: {avg_valid_loss} | Validation Accuracy: {avg_valid_accuracy}')
  
except Exception as e:
    print("Something went wrong in training")
    print(e)

Epoch 0: 100%|████████████████████████| 13573/13573 [00:50<00:00, 267.37batch/s]


End of Epoch 0: | Training Loss: 0.69693 | Training accuracy: 0.5007356056140868 | Validation Loss: 2.78708555634986 | Validation Accuracy: 0.5017895287953199


Epoch 1: 100%|████████████████████████| 13573/13573 [00:51<00:00, 265.34batch/s]


End of Epoch 1: | Training Loss: 0.69478 | Training accuracy: 0.509069180151326 | Validation Loss: 2.77850067385845 | Validation Accuracy: 0.5021065272825497


Epoch 2: 100%|████████████████████████| 13573/13573 [00:51<00:00, 265.38batch/s]


End of Epoch 2: | Training Loss: 0.69295 | Training accuracy: 0.5220772134285462 | Validation Loss: 2.7712053648063253 | Validation Accuracy: 0.5018862067564277


Epoch 3: 100%|████████████████████████| 13573/13573 [00:50<00:00, 267.02batch/s]


End of Epoch 3: | Training Loss: 0.69046 | Training accuracy: 0.5345586629480548 | Validation Loss: 2.7612111838444595 | Validation Accuracy: 0.5007629009226033


Epoch 4: 100%|████████████████████████| 13573/13573 [00:50<00:00, 269.22batch/s]


End of Epoch 4: | Training Loss: 0.68755 | Training accuracy: 0.5448951305675935 | Validation Loss: 2.7495966587609257 | Validation Accuracy: 0.5009746716945538


Epoch 5: 100%|████████████████████████| 13573/13573 [00:50<00:00, 268.95batch/s]


End of Epoch 5: | Training Loss: 0.68432 | Training accuracy: 0.5544473139958489 | Validation Loss: 2.736674931528241 | Validation Accuracy: 0.5006524118241944


Epoch 6: 100%|████████████████████████| 13573/13573 [00:50<00:00, 271.21batch/s]


End of Epoch 6: | Training Loss: 0.68081 | Training accuracy: 0.5632869154643022 | Validation Loss: 2.7226368679510036 | Validation Accuracy: 0.5008135417593741


Epoch 7: 100%|████████████████████████| 13573/13573 [00:49<00:00, 271.57batch/s]


End of Epoch 7: | Training Loss: 0.67710 | Training accuracy: 0.5718078038357497 | Validation Loss: 2.707792886256983 | Validation Accuracy: 0.5006478081117607


Epoch 8: 100%|████████████████████████| 13573/13573 [00:50<00:00, 269.23batch/s]


End of Epoch 8: | Training Loss: 0.67303 | Training accuracy: 0.5798527604704676 | Validation Loss: 2.691530888674279 | Validation Accuracy: 0.5006247895495921


Epoch 9: 100%|████████████████████████| 13573/13573 [00:50<00:00, 271.25batch/s]


End of Epoch 9: | Training Loss: 0.66867 | Training accuracy: 0.5879848780711415 | Validation Loss: 2.6740820148332927 | Validation Accuracy: 0.5003202868582715


Epoch 10: 100%|███████████████████████| 13573/13573 [00:49<00:00, 271.67batch/s]


End of Epoch 10: | Training Loss: 0.66430 | Training accuracy: 0.5958440009621753 | Validation Loss: 2.656597651893692 | Validation Accuracy: 0.5002236088971637


Epoch 11: 100%|███████████████████████| 13573/13573 [00:50<00:00, 269.66batch/s]


End of Epoch 11: | Training Loss: 0.65954 | Training accuracy: 0.6032727460542214 | Validation Loss: 2.6375643334503605 | Validation Accuracy: 0.5013067966780385


In [55]:
y_pred_list = np.array([])

model.eval()

with torch.no_grad():
    for X_batch, y_batch in valid_dl:
        X_batch = X_batch.to(device)
        y_test_pred = model(X_batch)
        y_test_pred = torch.sigmoid(y_test_pred)
        y_pred_tag = torch.round(y_test_pred)
        y_pred_tag = y_pred_tag.squeeze(1).cpu().numpy()
        y_pred_list = np.append(y_pred_list, y_pred_tag)
                     
len(y_pred_list)

217159

In [35]:
torch.save(model.state_dict(), './saved_model/weights-2.pth')