In [1]:
!pip install -Uqq fastai fastbook

In [2]:
import pandas as pd
import numpy as np

In [6]:
import torch 

# Check if GPU is available
print(torch.cuda.is_available())

True


In [4]:
from torchvision import models
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from fastai.data.core import DataLoaders, CrossEntropyLossFlat
from fastai.data.block import CategoryBlock
from fastai.basics import *

In [22]:
# Load a small chunk of the full dataset for testing purposes
faces = pd.read_csv('./faces_smaller_chunk.csv')
features = pd.read_csv('./vgg_chunks/chunk_1.csv', usecols=range(1, 2049))

In [23]:
faces.shape

(10859, 43)

In [28]:
features.shape

(10859, 2048)

In [29]:
features_tensor = torch.tensor(features.values).float()

In [30]:
features_tensor.shape

torch.Size([10859, 2048])

In [31]:
ground_truth = ['pol_dat_us', 'pol_dat_ca', 'pol_dat_uk', 'pol_fb_us']

In [37]:
def label_func(row):
    '''
    Checks each column of ground_truth and extracts whichever column is not null as the label
    '''
    for i in ground_truth:
        if ~np.isnan(row[i]):
            return row[i]
    return np.nan

def get_labels(data):
    '''
    Returns array of labels from entire dataset
    '''
    return data.apply(lambda row: label_func(row), axis=1).to_numpy()

get_labels(faces.iloc[[1,2,4,5]])

array([0., 1., 0., 0.])

In [38]:
# Randomly split training and testing datasets
num_of_samples = features_tensor.shape[0]
idx = np.random.permutation(range(num_of_samples))

cut = int(0.8 * num_of_samples)
train_idx = idx[:cut]
train_x = features_tensor[train_idx]
train_y = get_labels(faces.iloc[train_idx])

valid_idx = idx[cut:]
valid_x = features_tensor[valid_idx]
valid_y = get_labels(faces.iloc[train_idx])

In [41]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, data, labels):
        self.data,self.labels = data,labels
        
    def __getitem__(self, i):
        sample = self.data[i]
        label = self.labels[i]
        return sample,label
    
    def __len__(self): return len(self.data)

train_ds = CustomDataset(train_x, train_y)
valid_ds = CustomDataset(valid_x, valid_y)

In [42]:
# Dataloader will be passing a batch of 64 samples at a time to our model
train_dl = DataLoader(train_ds, batch_size=64)
valid_dl = DataLoader(valid_ds, batch_size=64)

In [49]:
# Define a simple binary classifier that takes a 2048 feature long tensor as input
class BinaryClassifier(nn.Module):
    def __init__(self):
        super(BinaryClassifier, self).__init__()        
        
        # Number of input features is 2048
        self.layer_1 = nn.Linear(2048, 64) 
        self.layer_2 = nn.Linear(64, 64)
        self.layer_out = nn.Linear(64, 1) 
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(64)
        self.batchnorm2 = nn.BatchNorm1d(64)
        
    def forward(self, inputs):
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.layer_out(x)
        
        return x

In [47]:
LEARNING_RATE = 0.001
EPOCHS = 30

In [56]:
import torch.optim as optim

model = BinaryClassifier()
loss_function = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [55]:
# Run a minibatch of data through the model to check we get any errors
minibatch = train_x[:64]
model.float()
model(train_x.float())

tensor([[-0.5161],
        [ 0.5439],
        [-0.4218],
        ...,
        [ 0.0981],
        [ 0.3573],
        [-0.1066]], grad_fn=<AddmmBackward>)

In [57]:
def binary_acc(y_pred, y_test):
    # Transform outputs to 0 and 1
    y_pred_tag = torch.round(torch.sigmoid(y_pred))

    # Calculate percentage of correct predictions
    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    acc = torch.round(acc * 100)
    
    return acc

In [58]:
# Specify that we want to use the GPU if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [61]:
# Move model to GPU if possible
model = model.to(device)
# Tells PyTorch we are in training mode
model.train()

for e in range(0, EPOCHS):
    
    # Set loss and accuracy to zero at start of each epoch
    epoch_training_loss = 0
    epoch_training_accuracy = 0
    epoch_valid_loss = 0
    epoch_valid_accuracy = 0
    
    for x_batch, y_batch in train_dl:
        # Transfer the tensors to the GPU if possible
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        
        # Zero out gradients before backpropagation (PyTorch cumulates the gradient otherwise)
        optimizer.zero_grad()
        
        # Predict a minibatch of outputs
        y_pred = model(x_batch)
        
        # Calculate the loss (unsqueeze adds a dimension to y)
        loss = loss_function(y_pred, y_batch.unsqueeze(1))
        training_acc = binary_acc(y_pred, y_batch.unsqueeze(1))
        
        # Backpropagation. Gradients are calculated
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_training_accuracy += training_acc.item()
    
    for x_batch, y_batch in valid_dl:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        
        valid_y_pred = model(x_batch)
        valid_loss = loss_function(valid_y_pred, y_batch.unsqueeze(1))
        valid_acc = binary_acc(valid_y_pred, y_batch.unsqueeze(1))
        
        epoch_valid_loss += valid_loss.item()
        epoch_valid_accuracy += valid_acc.item()
        
    avg_train_loss = epoch_training_loss/len(train_dl)
    avg_valid_loss = epoch_training_loss/len(valid_dl)
    
    avg_train_accuracy = epoch_training_accuracy/len(train_dl)
    avg_valid_accuracy = epoch_valid_accuracy/len(valid_dl)
    
    print(f'Epoch {e}: | Training Loss: {avg_train_loss:.5f} | Training accuracy: {avg_train_accuracy} | Validation Loss: {avg_valid_loss} | Validation Accuracy: {avg_valid_accuracy}')

NameError: name 'epoch_loss' is not defined

In [165]:
y_pred_list = []

net.eval()

with torch.no_grad():
    for X_batch, y_batch in valid_dl:
        X_batch = X_batch.to(device)
        y_test_pred = net(X_batch)
        y_test_pred = torch.sigmoid(y_test_pred)
        y_pred_tag = torch.round(y_test_pred)
        y_pred_list.append(y_pred_tag.cpu().numpy())
y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

In [166]:
!pip install -Uqq sklearn

In [167]:
from sklearn.metrics import confusion_matrix, classification_report 

In [171]:
len(y_pred_list)

34

In [168]:
confusion_matrix(valid_labels, y_pred_list)

ValueError: Found input variables with inconsistent numbers of samples: [8687, 34]