In [5]:
!pip install seaborn



In [7]:
import pandas as pd
import numpy as np

In [8]:
import torch 

# Check if GPU is available
print(torch.cuda.is_available())

True


In [9]:
from torchvision import models
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

In [10]:
# Load a small chunk of the full dataset for testing purposes
faces = pd.read_csv('./faces_smaller_chunk.csv')
features = pd.read_csv('./vgg_chunks/chunk_1.csv', usecols=range(1, 2049))

In [11]:
faces.head()

Unnamed: 0.1,Unnamed: 0,userid,gender,age,country,facial_hair,pol,pol_dat_us,pol_dat_ca,pol_dat_uk,...,left_eye_status.no_glass_eye_open,left_eye_status.normal_glass_eye_close,left_eye_status.dark_glasses,right_eye_status.normal_glass_eye_open,right_eye_status.no_glass_eye_close,right_eye_status.occlusion,right_eye_status.no_glass_eye_open,right_eye_status.normal_glass_eye_close,right_eye_status.dark_glasses,ethnicity.value
0,1,1,0.0,,united states,0.001713,liberal,,,,...,1.8,0.0,0.0,0.0,0.0,99.9,0.1,0.0,0.0,black
1,2,2,1.0,,united states,0.005151,liberal,,,,...,100.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,black
2,3,3,0.0,,united states,0.977098,conservative,,,,...,0.3,0.0,15.5,39.7,0.0,0.0,0.0,0.0,60.2,india
3,4,4,1.0,24.36,united states,0.000478,liberal,,,,...,98.9,0.0,0.5,0.0,0.0,10.2,81.7,0.0,8.1,black
4,5,5,1.0,,united states,0.005617,liberal,,,,...,0.0,0.0,0.0,2.3,0.0,97.3,0.1,0.1,0.2,asian


In [12]:
features.shape

(10859, 2048)

In [13]:
features_tensor = torch.tensor(features.values).float()

In [8]:
features_tensor.shape

torch.Size([10859, 2048])

In [10]:
ground_truth = ['pol_dat_us', 'pol_dat_ca', 'pol_dat_uk', 'pol_fb_us']

def label_func(row):
    '''
    Checks each column of ground_truth and extracts whichever column is not null as the label
    '''
    for i in ground_truth:
        if ~np.isnan(row[i]):
            return row[i]
    return np.nan

def get_labels(data):
    '''
    Returns array of labels from entire dataset
    '''
    return data.apply(lambda row: label_func(row), axis=1).to_numpy()

get_labels(faces.iloc[[1,2,4,5]])

In [11]:
# Randomly split training and testing datasets
num_of_samples = features_tensor.shape[0]
idx = np.random.permutation(range(num_of_samples))

cut = int(0.8 * num_of_samples)
train_idx = idx[:cut]
train_x = features_tensor[train_idx]
train_y = get_labels(faces.iloc[train_idx])

valid_idx = idx[cut:]
valid_x = features_tensor[valid_idx]
valid_y = get_labels(faces.iloc[valid_idx])

In [12]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, data, labels):
        self.data,self.labels = data,labels
        
    def __getitem__(self, i):
        sample = self.data[i]
        label = self.labels[i]
        return sample,label
    
    def __len__(self): return len(self.data)

train_ds = CustomDataset(train_x, train_y)
valid_ds = CustomDataset(valid_x, valid_y)

In [13]:
# Dataloader will be passing a batch of 64 samples at a time to our model
train_dl = DataLoader(train_ds, batch_size=64)
valid_dl = DataLoader(valid_ds, batch_size=64)

In [14]:
# Define a simple binary classifier that takes a 2048 feature long tensor as input
class BinaryClassifier(nn.Module):
    def __init__(self):
        super(BinaryClassifier, self).__init__()        
        
        # Number of input features is 2048
        self.layer_1 = nn.Linear(2048, 1024) 
        self.layer_2 = nn.Linear(1024, 480)
        self.layer_3 = nn.Linear(480, 240)
        self.layer_4 = nn.Linear(240, 64)
        self.layer_out = nn.Linear(64, 1) 
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(1024)
        self.batchnorm2 = nn.BatchNorm1d(480)
        self.batchnorm3 = nn.BatchNorm1d(240)
        self.batchnorm4 = nn.BatchNorm1d(64)
        
    def forward(self, inputs):
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.relu(self.layer_3(x))
        x = self.batchnorm3(x)
        x = self.dropout(x)
        x = self.relu(self.layer_4(x))
        x = self.batchnorm4(x)
        x = self.layer_out(x)
        
        return x

In [25]:
LEARNING_RATE = 0.0001
EPOCHS = 20

In [26]:
import torch.optim as optim

model = BinaryClassifier()
loss_function = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [27]:
# Run a minibatch of data through the model to check we get any errors
minibatch = train_x[:64]
model
model(train_x)

tensor([[-0.6764],
        [ 0.1567],
        [-0.2527],
        ...,
        [ 0.2894],
        [-1.0348],
        [-0.2516]], grad_fn=<AddmmBackward>)

In [28]:
def binary_acc(y_pred, y_test):
    # Transform outputs to 0 and 1
    y_pred_tag = torch.round(torch.sigmoid(y_pred))

    # Calculate percentage of correct predictions
    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    acc = torch.round(acc * 100)
    
    return acc

In [29]:
# Specify that we want to use the GPU if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [30]:
# Move model to GPU if possible
model = model.to(device)
# Tells PyTorch we are in training mode
model.train()

for e in range(0, EPOCHS):
    
    # Set loss and accuracy to zero at start of each epoch
    epoch_training_loss = 0
    epoch_training_accuracy = 0
    epoch_valid_loss = 0
    epoch_valid_accuracy = 0
    
    for x_batch, y_batch in train_dl:
        # Transfer the tensors to the GPU if possible
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        
        # Zero out gradients before backpropagation (PyTorch cumulates the gradient otherwise)
        optimizer.zero_grad()
        
        # Predict a minibatch of outputs
        y_pred = model(x_batch)
        
        # Calculate the loss (unsqueeze adds a dimension to y)
        loss = loss_function(y_pred, y_batch.unsqueeze(1))
        training_acc = binary_acc(y_pred, y_batch.unsqueeze(1))
        
        # Backpropagation. Gradients are calculated
        loss.backward()
        optimizer.step()
        
        epoch_training_loss += loss.item()
        epoch_training_accuracy += training_acc.item()
    
    for x_batch, y_batch in valid_dl:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        
        valid_y_pred = model(x_batch)
        valid_loss = loss_function(valid_y_pred, y_batch.unsqueeze(1))
        valid_acc = binary_acc(valid_y_pred, y_batch.unsqueeze(1))
        
        epoch_valid_loss += valid_loss.item()
        epoch_valid_accuracy += valid_acc.item()
        
    avg_train_loss = epoch_training_loss/len(train_dl)
    avg_valid_loss = epoch_training_loss/len(valid_dl)
    
    avg_train_accuracy = epoch_training_accuracy/len(train_dl)
    avg_valid_accuracy = epoch_valid_accuracy/len(valid_dl)
    
    print(f'Epoch {e}: | Training Loss: {avg_train_loss:.5f} | Training accuracy: {avg_train_accuracy} | Validation Loss: {avg_valid_loss} | Validation Accuracy: {avg_valid_accuracy}')

Epoch 0: | Training Loss: 0.66802 | Training accuracy: 59.01470588235294 | Validation Loss: 2.6720792487227545 | Validation Accuracy: 59.529411764705884
Epoch 1: | Training Loss: 0.50308 | Training accuracy: 77.33823529411765 | Validation Loss: 2.012335631463454 | Validation Accuracy: 61.05882352941177
Epoch 2: | Training Loss: 0.35523 | Training accuracy: 89.11029411764706 | Validation Loss: 1.4209346602368957 | Validation Accuracy: 60.73529411764706
Epoch 3: | Training Loss: 0.21611 | Training accuracy: 96.11029411764706 | Validation Loss: 0.8644352900056875 | Validation Accuracy: 62.38235294117647
Epoch 4: | Training Loss: 0.12393 | Training accuracy: 98.55882352941177 | Validation Loss: 0.49570416327149613 | Validation Accuracy: 61.88235294117647
Epoch 5: | Training Loss: 0.09151 | Training accuracy: 98.70588235294117 | Validation Loss: 0.3660219546958513 | Validation Accuracy: 63.14705882352941
Epoch 6: | Training Loss: 0.07900 | Training accuracy: 98.65441176470588 | Validation L

In [31]:
y_pred_list = np.array([])

model.eval()

with torch.no_grad():
    for X_batch, y_batch in valid_dl:
        X_batch = X_batch.to(device)
        y_test_pred = model(X_batch)
        y_test_pred = torch.sigmoid(y_test_pred)
        y_pred_tag = torch.round(y_test_pred)
        y_pred_tag = y_pred_tag.squeeze(1).cpu().numpy()
        y_pred_list = np.append(y_pred_list, y_pred_tag)
                     
len(y_pred_list)

2172

In [32]:
!pip install -Uqq sklearn

In [33]:
from sklearn.metrics import confusion_matrix, classification_report 

In [34]:
confusion_matrix(valid_y, y_pred_list)

array([[1030,  439],
       [ 367,  336]])

In [35]:
sns.heatmap(confusion_matrix(valid_y, y_pred_list))

NameError: name 'sns' is not defined