In [8]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from torchvision import models
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torch.optim as optim
import numpy as np

In [2]:
from data.load_data import load_faces, load_features

features = load_features()
faces = load_faces()

In [3]:
import torch 

# Check if GPU is available
print(torch.cuda.is_available())

True


In [4]:
# Specify that we want to use the GPU if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [5]:
features.shape

(1085795, 2048)

In [5]:
ground_truth = ['pol_dat_us', 'pol_dat_ca', 'pol_dat_uk', 'pol_fb_us']

from utils import label_func, get_labels

get_labels(faces.iloc[[1,2,4,5]])

array([0., 1., 0., 0.])

In [6]:
def get_not_nan_labels(colname):
    col = faces[colname]
    return col[col.notna()]

In [7]:
samples = {k: get_not_nan_labels(k) for k in ground_truth}

In [23]:
# Randomly split training and testing datasets
np.random.seed(67)
torch.manual_seed(67)

num_of_samples = features.shape[0]

valid_sample_name = ground_truth[1]
valid_sample = samples[valid_sample_name]

In [24]:
valid_idx = np.array(valid_sample.index) - 1
valid_y = np.array(valid_sample.values)

In [58]:
# Define a simple binary classifier that takes a 2048 feature long tensor as input
class BinaryClassifier(nn.Module):
    def __init__(self):
        super(BinaryClassifier, self).__init__()        
        
        # Number of input features is 2048
        self.layer_1 = nn.Linear(2048, 2048)
        self.layer_2 = nn.Linear(2048, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.1)
        
    def forward(self, inputs):
        x = self.relu(self.layer_1(inputs))
        x = self.dropout(x)
        x = self.layer_2(x)
        
        return x

In [56]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, indexes, label_func=label_func):
        self.indexes,self.label_func = indexes,label_func
        
    def __getitem__(self, i):
        index = self.indexes[i]
        
        sample = torch.tensor(features[index]).float()
        label = label_func(faces.iloc[index])
        
        return sample,label
    
    def __len__(self): return len(self.indexes)



In [60]:
def binary_acc(y_pred, y_test):
    # Transform outputs to 0 and 1
    y_pred_tag = torch.round(torch.sigmoid(y_pred))

    # Calculate percentage of correct predictions
    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    
    return acc

In [61]:
def plot_losses_and_metrics(losses, val_losses, accuracies, val_accuracies):
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 3))

    axes[0].plot(losses)
    axes[0].plot(val_losses)
    axes[0].set_title('model loss')
    axes[0].set_ylabel('loss')
    axes[0].set_xlabel('epoch')
    axes[0].legend(['train', 'test'], loc='upper left')

    axes[1].plot(accuracies)
    axes[1].plot(val_accuracies)
    axes[1].set_title('binary accuracy')
    axes[1].set_ylabel('acc')
    axes[1].set_xlabel('epoch')
    axes[1].legend(['train', 'test'], loc='upper left')
    
    fig.tight_layout()

In [80]:
def get_data_sets(valid_set=1):
    sample_to_exclude = ground_truth[valid_set]
    
    training_sample_names = [name for name in ground_truth if ground_truth != valid_set]
    
    def get_idx(sample):
        return np.array(sample.index) - 1
    
    def get_y(sample):
        return np.array(sample.values)
    
    train_idx = np.concatenate([get_idx(samples[n]) for n in training_sample_names])
    train_y = np.concatenate([get_y(samples[n]) for n in training_sample_names])
    valid_idx = get_idx(samples[sample_to_exclude])
    valid_y = get_y(samples[sample_to_exclude])
    
    return train_idx, train_y, valid_idx, valid_y

In [92]:
train_idx, train_y, valid_idx, valid_y = get_data_sets(0)

In [93]:
len(train_idx), len(valid_idx)

(1085795, 862770)

In [94]:
train_ds = CustomDataset(train_idx)
valid_ds = CustomDataset(valid_idx)

train_dl = DataLoader(train_ds, batch_size=64, num_workers=6)
valid_dl = DataLoader(valid_ds, batch_size=64, num_workers=6)

In [95]:
LEARNING_RATE = 1e-4

model = BinaryClassifier()
loss_function = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
EPOCHS = 10

losses = []
val_losses = []
accuracies = []
val_accuracies = []

# Move model to GPU if possible
model = model.to(device)
# Tells PyTorch we are in training mode
model.train()

try:
    for e in range(EPOCHS):

        # Set loss and accuracy to zero at start of each epoch
        epoch_training_loss = 0
        epoch_training_accuracy = 0
        epoch_valid_loss = 0
        epoch_valid_accuracy = 0

        with tqdm(train_dl, unit="training", total=len(train_dl)) as tepoch:
            for x_batch, y_batch in tepoch:
                tepoch.set_description(f"Epoch {e}")
                # Transfer the tensors to the GPU if possible
                x_batch, y_batch = x_batch.to(device), y_batch.to(device)

                # Zero out gradients before backpropagation (PyTorch cumulates the gradient otherwise)
                optimizer.zero_grad()

                # Predict a minibatch of outputs
                y_pred = model(x_batch)

                # Calculate the loss (unsqueeze adds a dimension to y)
                loss = loss_function(y_pred, y_batch.unsqueeze(1))
                training_acc = binary_acc(y_pred, y_batch.unsqueeze(1))

                # Backpropagation. Gradients are calculated
                loss.backward()
                optimizer.step()

                batch_loss = loss.item()
                batch_acc = training_acc.item()
                epoch_training_loss += batch_loss
                epoch_training_accuracy += batch_acc
                losses.append(batch_loss)
                accuracies.append(batch_acc)
                
                # tepoch.set_postfix(loss=loss.item(), accuracy=100. * training_acc.item())

        with tqdm(valid_dl, unit="training", total=len(valid_dl)) as tepoch:
            for x_batch, y_batch in tepoch:
                x_batch, y_batch = x_batch.to(device), y_batch.to(device)

                valid_y_pred = model(x_batch)
                valid_loss = loss_function(valid_y_pred, y_batch.unsqueeze(1))
                valid_acc = binary_acc(valid_y_pred, y_batch.unsqueeze(1))

                batch_valid_loss = valid_loss.item()
                batch_valid_accuracy = valid_acc.item()
                epoch_valid_loss += batch_valid_loss
                epoch_valid_accuracy += batch_valid_accuracy
                val_losses.append(batch_valid_loss)
                val_accuracies.append(batch_valid_accuracy)

        avg_train_loss = epoch_training_loss/len(train_dl)
        avg_valid_loss = epoch_training_loss/len(valid_dl)

        avg_train_accuracy = epoch_training_accuracy/len(train_dl)
        avg_valid_accuracy = epoch_valid_accuracy/len(valid_dl)

        print(f'End of Epoch {e}: | Training Loss: {avg_train_loss:.5f} | Training accuracy: {avg_train_accuracy} | Validation Loss: {avg_valid_loss} | Validation Accuracy: {avg_valid_accuracy}')
  
except Exception as e:
    print("Something went wrong in training")
    print(e)

Epoch 0: 100%|█████████████████████| 16966/16966 [01:04<00:00, 263.26training/s]
100%|██████████████████████████████| 13481/13481 [00:41<00:00, 321.79training/s]


End of Epoch 0: | Training Loss: 0.61044 | Training accuracy: 0.6616339822514192 | Validation Loss: 0.7682439211309531 | Validation Accuracy: 0.6473471923438973


Epoch 1: 100%|█████████████████████| 16966/16966 [01:06<00:00, 256.25training/s]
100%|██████████████████████████████| 13481/13481 [00:40<00:00, 334.02training/s]


End of Epoch 1: | Training Loss: 0.60557 | Training accuracy: 0.6664243665918291 | Validation Loss: 0.7621139514287398 | Validation Accuracy: 0.6469945200658076


Epoch 2: 100%|█████████████████████| 16966/16966 [01:05<00:00, 258.94training/s]
100%|██████████████████████████████| 13481/13481 [00:43<00:00, 309.17training/s]


End of Epoch 2: | Training Loss: 0.60234 | Training accuracy: 0.6694972929064429 | Validation Loss: 0.7580559906910056 | Validation Accuracy: 0.6440267320662929


Epoch 3: 100%|█████████████████████| 16966/16966 [01:06<00:00, 255.80training/s]
100%|██████████████████████████████| 13481/13481 [00:43<00:00, 308.12training/s]


End of Epoch 3: | Training Loss: 0.59902 | Training accuracy: 0.6727429122965839 | Validation Loss: 0.7538776902960453 | Validation Accuracy: 0.6419210833741281


Epoch 4: 100%|█████████████████████| 16966/16966 [01:06<00:00, 256.21training/s]
100%|██████████████████████████████| 13481/13481 [00:42<00:00, 315.07training/s]


End of Epoch 4: | Training Loss: 0.59542 | Training accuracy: 0.6762597411221017 | Validation Loss: 0.7493382946356483 | Validation Accuracy: 0.6396528447419791


Epoch 5: 100%|█████████████████████| 16966/16966 [01:06<00:00, 254.12training/s]
100%|██████████████████████████████| 13481/13481 [00:42<00:00, 319.15training/s]


End of Epoch 5: | Training Loss: 0.59164 | Training accuracy: 0.679928370397158 | Validation Loss: 0.7445863255755597 | Validation Accuracy: 0.6379969030492658


Epoch 6:  38%|████████▍             | 6522/16966 [00:26<00:41, 250.62training/s]

In [55]:
y_pred_list = np.array([])

model.eval()

with torch.no_grad():
    for X_batch, y_batch in valid_dl:
        X_batch = X_batch.to(device)
        y_test_pred = model(X_batch)
        y_test_pred = torch.sigmoid(y_test_pred)
        y_pred_tag = torch.round(y_test_pred)
        y_pred_tag = y_pred_tag.squeeze(1).cpu().numpy()
        y_pred_list = np.append(y_pred_list, y_pred_tag)
                     
len(y_pred_list)

217159

In [35]:
torch.save(model.state_dict(), './saved_model/weights-2.pth')

In [11]:
def get_not_nan_labels(colname):
    col = faces[colname]
    return col[col.notna()]

In [12]:
ground_truth = ['pol_dat_us', 'pol_dat_ca', 'pol_dat_uk', 'pol_fb_us']
samples = {k: get_not_nan_labels(k) for k in ground_truth}

In [29]:
models = {}
LEARNING_RATE = 1e-4
EPOCHS = 4


loss_function = nn.BCEWithLogitsLoss()


with tqdm(enumerate(ground_truth)) as t:
    for i, sample_name in t:
        losses = []
        val_losses = []
        accuracies = []
        val_accuracies = []
        
        t.set_description(f'Sample {sample_name}')
        model = BinaryClassifier()
        model = model.to(device)
        optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
        
        sample = samples[sample_name]
        train_idx = np.array(sample.index) - 1
        train_y = np.array(sample.values)
        valid_sample_name = ground_truth[i + 1] if i < len(ground_truth) - 1 else ground_truth[0]
        valid_sample = samples[valid_sample_name]
        
        train_ds = CustomDataset(train_idx)
        valid_ds = CustomDataset(valid_idx)
        train_dl = DataLoader(train_ds, batch_size=64, num_workers=6)
        valid_dl = DataLoader(valid_ds, batch_size=64, num_workers=6)
        
        for e in range(EPOCHS):

            # Set loss and accuracy to zero at start of each epoch
            epoch_training_loss = 0
            epoch_training_accuracy = 0
            epoch_valid_loss = 0
            epoch_valid_accuracy = 0

            for x_batch, y_batch in train_dl:
                # Transfer the tensors to the GPU if possible
                x_batch, y_batch = x_batch.to(device), y_batch.to(device)

                # Zero out gradients before backpropagation (PyTorch cumulates the gradient otherwise)
                optimizer.zero_grad()

                # Predict a minibatch of outputs
                y_pred = model(x_batch)

                # Calculate the loss (unsqueeze adds a dimension to y)
                loss = loss_function(y_pred, y_batch.unsqueeze(1))
                training_acc = binary_acc(y_pred, y_batch.unsqueeze(1))

                # Backpropagation. Gradients are calculated
                loss.backward()
                optimizer.step()

                batch_loss = loss.item()
                batch_acc = training_acc.item()
                epoch_training_loss += batch_loss
                epoch_training_accuracy += batch_acc
                losses.append(batch_loss)
                accuracies.append(batch_acc)

                # tepoch.set_postfix(loss=loss.item(), accuracy=100. * training_acc.item())

            for x_batch, y_batch in valid_dl:
                x_batch, y_batch = x_batch.to(device), y_batch.to(device)

                valid_y_pred = model(x_batch)
                valid_loss = loss_function(valid_y_pred, y_batch.unsqueeze(1))
                valid_acc = binary_acc(valid_y_pred, y_batch.unsqueeze(1))

                batch_valid_loss = valid_loss.item()
                batch_valid_accuracy = valid_acc.item()
                epoch_valid_loss += batch_valid_loss
                epoch_valid_accuracy += batch_valid_accuracy
                val_losses.append(batch_valid_loss)
                val_accuracies.append(batch_valid_accuracy)

            avg_train_loss = epoch_training_loss/len(train_dl)
            # avg_valid_loss = epoch_training_loss/len(valid_dl)

            avg_train_accuracy = epoch_training_accuracy/len(train_dl)
            # avg_valid_accuracy = epoch_valid_accuracy/len(valid_dl)

            print(f'End of Epoch {e}: | Training Loss: {avg_train_loss:.5f} | Training accuracy: {avg_train_accuracy})

            models[sample_name] = model

Sample pol_dat_us: : 0it [00:00, ?it/s]

End of Epoch 0: | Training Loss: 1.21762 | Training accuracy: 0.5854974593874627 | Validation Loss: 4.836399648151718 | Validation Accuracy: 0.5864419353556197
End of Epoch 1: | Training Loss: 1.21176 | Training accuracy: 0.5885336306627281 | Validation Loss: 4.813125032944689 | Validation Accuracy: 0.5921748726945382
End of Epoch 2: | Training Loss: 1.14215 | Training accuracy: 0.5927725595285787 | Validation Loss: 4.536643441116376 | Validation Accuracy: 0.5836764195419158


Sample pol_dat_ca: : 1it [03:41, 221.61s/it]

End of Epoch 3: | Training Loss: 1.07523 | Training accuracy: 0.5944138973361085 | Validation Loss: 4.270829016213951 | Validation Accuracy: 0.5828569587287161
End of Epoch 0: | Training Loss: 1.06064 | Training accuracy: 0.6173057673770661 | Validation Loss: 0.32781731031743544 | Validation Accuracy: 0.5153329931045885
End of Epoch 1: | Training Loss: 1.22054 | Training accuracy: 0.6242760962480812 | Validation Loss: 0.37723687746236423 | Validation Accuracy: 0.5527006692582885
End of Epoch 2: | Training Loss: 1.27672 | Training accuracy: 0.6379647282785865 | Validation Loss: 0.3946010704988076 | Validation Accuracy: 0.5328086855029386


Sample pol_dat_uk: : 2it [04:45, 128.82s/it]

End of Epoch 3: | Training Loss: 1.25659 | Training accuracy: 0.6510867492941247 | Validation Loss: 0.38837942625420213 | Validation Accuracy: 0.5352210308182008
End of Epoch 0: | Training Loss: 1.04045 | Training accuracy: 0.5888155361361433 | Validation Loss: 0.229609266770143 | Validation Accuracy: 0.5217295226870948
End of Epoch 1: | Training Loss: 1.34689 | Training accuracy: 0.6033926645967767 | Validation Loss: 0.29723658561527655 | Validation Accuracy: 0.5796935505985019
End of Epoch 2: | Training Loss: 1.44451 | Training accuracy: 0.6190642010226587 | Validation Loss: 0.3187786445328455 | Validation Accuracy: 0.5723552329791737


Sample pol_fb_us: : 3it [05:45, 97.55s/it]  

End of Epoch 3: | Training Loss: 1.43196 | Training accuracy: 0.633432718001316 | Validation Loss: 0.31600981920363247 | Validation Accuracy: 0.5225371453926433
End of Epoch 0: | Training Loss: 1.20367 | Training accuracy: 0.6311618779649102 | Validation Loss: 0.5986447097718136 | Validation Accuracy: 0.5919170647982506
End of Epoch 1: | Training Loss: 1.15676 | Training accuracy: 0.6424537173944627 | Validation Loss: 0.5753120152145365 | Validation Accuracy: 0.5599751925742409
End of Epoch 2: | Training Loss: 1.31015 | Training accuracy: 0.652107153423307 | Validation Loss: 0.6516000181866565 | Validation Accuracy: 0.5545704341770694


Sample pol_fb_us: : 4it [06:58, 104.70s/it]

End of Epoch 3: | Training Loss: 1.29072 | Training accuracy: 0.6599829679981792 | Validation Loss: 0.6419389991751582 | Validation Accuracy: 0.5684052477110535





In [43]:
acc_matrix = np.zeros([4,4])

with tqdm(enumerate(ground_truth), total=4) as t:
    for i, sample_name in enumerate(ground_truth):
        t.set_description(f'Sample {sample_name}')
        model = models[sample_name]
        model.to(device)
        
        for j, test_sample in enumerate(ground_truth):
            t.set_postfix({'imputing':test_sample})
            
            sample = samples[test_sample]
            indexes = np.array(sample.index) - 1
            values = np.array(sample.values)
            
            valid_ds = CustomDataset(indexes)
            valid_dl = DataLoader(valid_ds, batch_size=64, num_workers=6)
            
            valid_acc = 0
            
             
            for x_batch, y_batch in valid_dl:
                x_batch, y_batch = x_batch.to(device), y_batch.to(device)

                valid_y_pred = model(x_batch)
                valid_acc = binary_acc(valid_y_pred, y_batch.unsqueeze(1))

                batch_valid_accuracy = valid_acc.item()
                valid_acc += batch_valid_accuracy
                
            avg_valid_accuracy = valid_acc/len(valid_dl).c
            t.set_postfix({'accuracy':avg_valid_accuracy})
            acc_matrix[i,j] = avg_valid_accuracy

Sample pol_fb_us:   0%| | 0/4 [03:50<?, ?it/s, accuracy=tensor(0.0007, device='c


In [44]:
acc_matrix

array([[1.09784145e-04, 9.91420355e-04, 1.23241264e-03, 6.16113772e-04],
       [8.90141673e-05, 1.29647285e-03, 1.64321670e-03, 7.34597212e-04],
       [5.93427758e-05, 8.77025712e-04, 2.67022708e-03, 5.92417084e-04],
       [9.19813101e-05, 8.77025712e-04, 1.43781467e-03, 7.34597212e-04]])

In [36]:
len(valid_dl)

1688

In [38]:
len(valid_ds) // 64

1687

In [40]:
valid_acc.item()

0.6599999666213989