In [1]:
# Import the required libraries
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from torchvision import models
from transformers import BertTokenizer, VisualBertModel, VisualBertForPreTraining, logging
from PIL import Image
from tqdm import tqdm
import os
import warnings
warnings.filterwarnings("ignore")
logging.set_verbosity_error()
import numpy as np

In [2]:
# Read the dataframe containing the path to image, text and label
train_df = pd.read_json("/kaggle/input/facebook-hmc/facebook/train.json")
dev_df = pd.read_json("/kaggle/input/facebook-hmc/facebook/dev.json")
train_df.head()

Unnamed: 0,id,img,text,label
0,42953,train/non_hateful/42953.png,it their charact not their color that matter,0
1,23058,train/non_hateful/23058.png,dont be afraid to love again everyon is not li...,0
2,13894,train/non_hateful/13894.png,put bow on your pet,0
3,37408,train/non_hateful/37408.png,i love everyth and everybodi except for squirr...,0
4,82403,train/non_hateful/82403.png,everybodi love chocol chip cooki even hitler,0


In [3]:
# Declare some global variables
BATCH_SIZE = 32
EPOCHS = 5
ROOT_PATH = '/kaggle/input/facebook-hmc/facebook'
IMAGE_SIZE = 224*224
NUM_CLASSES = 2
TEXTUAL_DIMENSION = 512
VISUAL_DIMENSION = 512
CHECKPOINT = '/kaggle/working/model.pt'
DEVICE = torch.device('cuda' if torch.cuda.is_available() else'cpu')

In [4]:
# Initialize the dataset and maintain the dataloader
class DynamicDataset(Dataset):
    def __init__(self, json_path, transform = None):
        self.df = pd.read_json(json_path)
        self.transform = transform

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        img_path = self.df.loc[index, 'img']
        img_file = os.path.join(ROOT_PATH, img_path)
        image = Image.open(img_file).convert("RGB")
        if self.transform is not None:
            image = self.transform(image)
        
        text = self.df.loc[index, 'text']
        if 'label' not in self.df.columns:
            return image, text
        label = self.df.loc[index, 'label']

        return image ,text, label

In [5]:
# Define a transform function for image preprocessing
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# Create objects of each set of data
train_data = DynamicDataset(os.path.join(ROOT_PATH, 'train.json'), transform = transform)
dev_data = DynamicDataset(os.path.join(ROOT_PATH, 'dev.json'), transform = transform)

# Create a dataloader
train_loader = DataLoader(train_data, batch_size = BATCH_SIZE, shuffle = True)
dev_loader = DataLoader(dev_data, batch_size = BATCH_SIZE, shuffle = True)

In [6]:
# Construct a class for extracting visual features from resnet50 architecture
class Visual_Feature(nn.Module):
    def __init__(self):
        super().__init__()

        # Define resnet50 model
        resnet50 = models.resnet50(weights = models.ResNet50_Weights.DEFAULT)
        convolution_layers = nn.Sequential(
            nn.Conv2d(2048, 1024, kernel_size=(3, 3), stride = (1, 1), padding = (1, 1)),
            nn.ReLU(),
            nn.Conv2d(1024, 512, kernel_size=(3, 3), stride = (1, 1), padding = (1, 1)),
            nn.ReLU(),
        )
        
        # Freeze parameters
        for param in resnet50.parameters():
            param.requires_grad = False

        self.resnet50 = nn.Sequential(*list(resnet50.children())[:-1])
        self.convolution_layers = convolution_layers

    def get_visual_features(self, images, get_conv_features):
        # Extract visual features from resnet50 model
        
        """
            Based on the value of get_conv_features, convolution layers are applied.
            This is required because visual bert requires the input visual fetures to be 2048.
            ResNet50 by default gives 2048 features and hence there is no need to apply conv. layers.
        """
        
        # Define the scope of the variable
        visual_features = None
        if(get_conv_features):
            visual_features = self.convolution_layers(self.resnet50(images))
        else:
            visual_features = self.resnet50(images)
        visual_features = visual_features.reshape(BATCH_SIZE, 1, -1)

        return visual_features

In [7]:
class Textual_Feature(nn.Module):
    def __init__(self):
        super().__init__()

        # Define virtual bert model
        visual_bert = VisualBertModel.from_pretrained('uclanlp/visualbert-vqa')
        dense_layers = nn.Sequential(
#             nn.Linear(30522, 20000), # 30522 is the dimensions returned by the visual bert pretrained model
#             nn.ReLU(),
#             nn.Linear(20000, 10000),
#             nn.ReLU(),
#             nn.Linear(10000, 5000),
#             nn.ReLU(),
#             nn.Linear(5000, 2000),
#             nn.ReLU(),
#             nn.Linear(2000, 1000),
#             nn.ReLU(),
#             nn.Linear(1000, 512),
#             nn.ReLU(),
            
            nn.Linear(768, 512),
            nn.ReLU(),
        )
        
        # Freeze parameters
#         for param in visual_bert.parameters():
#             param.requires_grad = False

        self.visual_bert = visual_bert
        self.dense_layers = dense_layers

        # Define tokenizer
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    
    def get_textual_features(self, images, texts):
        # Define indices and attention mask
        inputs = self.tokenizer(texts, padding = True, truncation = True, return_tensors = 'pt').to(DEVICE)
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        token_ids = inputs['token_type_ids']

        # Extract visual features
        resnet50 = Visual_Feature().to(DEVICE)
        visual_features = resnet50.get_visual_features(images.to(DEVICE), get_conv_features = False)
        visual_token_ids = torch.ones(visual_features.shape[:-1], dtype=torch.long).to(DEVICE)
        visual_attention_mask = torch.ones(visual_features.shape[:-1], dtype=torch.float).to(DEVICE)

        # Extract textual features from virtual bert model
        textual_features = self.visual_bert(
            input_ids = input_ids,
            attention_mask = attention_mask,
            token_type_ids = token_ids,
            visual_embeds = visual_features, # pass the visual features as received from resnet50
            visual_token_type_ids = visual_token_ids,
            visual_attention_mask = visual_attention_mask,
        )
        
        textual_features = textual_features[0][:, 0, :] # Extract the first token of last hidden state
        textual_features = self.dense_layers(textual_features)

        return textual_features

In [8]:
# Test visual bert (WORKS BUT SKIPPED TO PRESERVE MEMORY)
# vbert = Textual_Feature().to(DEVICE)
# for images, texts, labels in tqdm(train_loader):
#     images = images.to(DEVICE)
#     textual_feature = vbert.get_textual_features(images, texts)
#     print(textual_feature.shape)
#     break

In [9]:
# Test resnet50 (WORKS BUT SKIPPED TO PRESERVE MEMORY)
# resnet50 = Visual_Feature()
# resnet50.to(DEVICE)
# image = Image.open(os.path.join(ROOT_PATH, 'dev/hateful/01456.png'))
# image = transform(image).reshape(1, 3, 224, 224)
# visual_features = resnet50.get_visual_features(image.to(DEVICE), get_conv_features = True)
# print(visual_features.shape)

In [10]:
class Fusion(nn.Module):
    def __init__(self):
        super().__init__()

        # Define fusion layers
        fusion_layers = nn.Sequential(
            nn.Linear((VISUAL_DIMENSION + TEXTUAL_DIMENSION), 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
        )

        self.fusion_layers = fusion_layers
    
    def forward(self, images, texts):
        # Initialize text and visual classes
        visual_class = Visual_Feature().to(DEVICE)
        textual_class = Textual_Feature().to(DEVICE)

        # Extract visual and textual features
        visual_features = visual_class.get_visual_features(images, get_conv_features = True).reshape(BATCH_SIZE, -1)
        
        textual_features = textual_class.get_textual_features(images, texts)

        # Concatenate visual and textual features
        features = torch.cat((visual_features, textual_features), dim = 1)

        # Pass through fusion layers
        output = self.fusion_layers(features)

        return output

In [11]:
# Define fusion model
fusion = Fusion().to(DEVICE)
print(fusion)

# Define loss function and optimizer
criterion = nn.BCEWithLogitsLoss()

# Define optimizer
optimizer = optim.Adam(fusion.parameters(), lr = 0.001)

Fusion(
  (fusion_layers): Sequential(
    (0): Linear(in_features=1024, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=256, bias=True)
    (3): ReLU()
    (4): Linear(in_features=256, out_features=128, bias=True)
    (5): ReLU()
    (6): Linear(in_features=128, out_features=64, bias=True)
    (7): ReLU()
    (8): Linear(in_features=64, out_features=32, bias=True)
    (9): ReLU()
    (10): Linear(in_features=32, out_features=1, bias=True)
  )
)


In [12]:
def train_model(model):
    # Initialize required variables
    train_loss = 0
    train_acc = 0
    total = 0
    correct = 0
    
    # Loop over all the batches
    for images, texts, labels in tqdm(train_loader):
        images = images.to(DEVICE)
        labels = torch.reshape(labels, (-1, 1)).to(dtype = torch.float32, device = DEVICE)

        optimizer.zero_grad(set_to_none = True) # Uses less memory
        outputs = fusion(images, texts)

        predicted = torch.sigmoid(outputs)
        
        # For debugging purposes only
        print(outputs.reshape(1, -1))
        print(predicted.reshape(1, -1))
        predicted = torch.round(predicted)
        print(predicted.reshape(1, -1))
        print((predicted == labels).sum().item())
        print(labels.sum().item())
        
        total += labels.size(0) # Must be adding equivalent to batch size
        correct += (predicted == labels).sum().item()

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * images.size(0)
        train_acc = 100 * correct / total
        print(f"Train Accuracy = {train_acc:.4f}")
        # For debugging purposes only
    train_acc = 100 * correct / total
    train_loss /= len(train_data)
    return train_acc, train_loss
        

# Debugging Insights
* The outputs are almost same to each other and similar is the case for predicted variables. This means that the model is not able to train well.

In [13]:
# NOT CHECKED
def eval_model(model):    
    # Initialize the required variables
    dev_loss = 0
    dev_acc = 0
    total = 0
    correct = 0
    
    for images, texts, labels in tqdm(dev_loader):
        images = images.to(DEVICE)
        labels = torch.reshape(labels, (-1, 1)).to(dtype = torch.float32, device = DEVICE)
        
        outputs = model(images, texts)
        predicted = torch.round(torch.sigmoid(outputs)) # threshold issues
        
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        
        loss = criterion(outputs, labels)
        dev_loss += loss.item() * images.size(0)
        
    dev_acc = 100 * correct / total
    dev_loss /= len(dev_data)
    
    return dev_acc, dev_loss

In [14]:
# NOT CHECKED
def save_model(prev_acc, curr_acc, epoch, model, optimizer):
    # Compare and save
    if curr_acc > prev_acc:
        # Save the model
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        }, CHECKPOINT)
        
        # Return new highest accuracy
        return curr_acc
    return prev_acc

In [15]:
prev_dev_acc = 0
dev_acc = 0
try:
    for epoch in range(EPOCHS):
        # Train model
        fusion = fusion.train()
        train_acc, train_loss = train_model(fusion)
        print(f"OVERALL - Epoch {epoch+1}/{EPOCHS}, Train Loss = {train_loss:.4f}, Train Accuracy = {train_acc:.4f}")

        # Evaluate model
        fusion.eval()
        dev_acc, dev_loss = eval_model(fusion)
        print(f"OVERALL - Epoch {epoch+1}/{EPOCHS}, Dev Loss = {dev_loss:.4f}, Dev Accuracy = {dev_acc:.4f}")

        # Save best model
        prev_dev_acc = save_model(prev_dev_acc, dev_acc, epoch + 1, fusion, optimizer)
    
except Exception as e:
    # Log the exception
    print(e)

    # Save best model
    prev_dev_acc = save_model(prev_dev_acc, dev_acc, epoch, fusion, optimizer)

  0%|          | 0/682 [00:00<?, ?it/s]

tensor([[-0.1631, -0.1628, -0.1635, -0.1635, -0.1635, -0.1627, -0.1643, -0.1636,
         -0.1651, -0.1626, -0.1630, -0.1639, -0.1642, -0.1642, -0.1633, -0.1639,
         -0.1645, -0.1641, -0.1631, -0.1648, -0.1638, -0.1647, -0.1647, -0.1656,
         -0.1643, -0.1650, -0.1667, -0.1644, -0.1637, -0.1629, -0.1649, -0.1625]],
       device='cuda:0', grad_fn=<ReshapeAliasBackward0>)
tensor([[0.4593, 0.4594, 0.4592, 0.4592, 0.4592, 0.4594, 0.4590, 0.4592, 0.4588,
         0.4594, 0.4593, 0.4591, 0.4590, 0.4590, 0.4593, 0.4591, 0.4590, 0.4591,
         0.4593, 0.4589, 0.4591, 0.4589, 0.4589, 0.4587, 0.4590, 0.4588, 0.4584,
         0.4590, 0.4592, 0.4594, 0.4589, 0.4595]], device='cuda:0',
       grad_fn=<ReshapeAliasBackward0>)
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.]], device='cuda:0',
       grad_fn=<ReshapeAliasBackward0>)
15
17.0


  0%|          | 1/682 [00:05<1:06:26,  5.85s/it]

Train Accuracy = 46.8750


  0%|          | 2/682 [00:09<54:31,  4.81s/it]  

tensor([[-0.1532, -0.1520, -0.1534, -0.1530, -0.1533, -0.1520, -0.1532, -0.1523,
         -0.1525, -0.1531, -0.1520, -0.1522, -0.1521, -0.1520, -0.1527, -0.1526,
         -0.1530, -0.1528, -0.1525, -0.1527, -0.1530, -0.1524, -0.1530, -0.1523,
         -0.1526, -0.1518, -0.1540, -0.1529, -0.1521, -0.1534, -0.1518, -0.1532]],
       device='cuda:0', grad_fn=<ReshapeAliasBackward0>)
tensor([[0.4618, 0.4621, 0.4617, 0.4618, 0.4617, 0.4621, 0.4618, 0.4620, 0.4619,
         0.4618, 0.4621, 0.4620, 0.4621, 0.4621, 0.4619, 0.4619, 0.4618, 0.4619,
         0.4619, 0.4619, 0.4618, 0.4620, 0.4618, 0.4620, 0.4619, 0.4621, 0.4616,
         0.4619, 0.4621, 0.4617, 0.4621, 0.4618]], device='cuda:0',
       grad_fn=<ReshapeAliasBackward0>)
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.]], device='cuda:0',
       grad_fn=<ReshapeAliasBackward0>)
13
19.0
Train Accuracy = 43.7500


  0%|          | 3/682 [00:13<48:21,  4.27s/it]

tensor([[-0.1417, -0.1418, -0.1425, -0.1413, -0.1426, -0.1423, -0.1412, -0.1428,
         -0.1423, -0.1433, -0.1435, -0.1418, -0.1422, -0.1414, -0.1423, -0.1435,
         -0.1426, -0.1424, -0.1413, -0.1416, -0.1424, -0.1424, -0.1421, -0.1415,
         -0.1428, -0.1420, -0.1425, -0.1415, -0.1429, -0.1424, -0.1427, -0.1420]],
       device='cuda:0', grad_fn=<ReshapeAliasBackward0>)
tensor([[0.4646, 0.4646, 0.4644, 0.4647, 0.4644, 0.4645, 0.4647, 0.4644, 0.4645,
         0.4642, 0.4642, 0.4646, 0.4645, 0.4647, 0.4645, 0.4642, 0.4644, 0.4645,
         0.4647, 0.4647, 0.4644, 0.4644, 0.4645, 0.4647, 0.4644, 0.4646, 0.4644,
         0.4647, 0.4643, 0.4645, 0.4644, 0.4646]], device='cuda:0',
       grad_fn=<ReshapeAliasBackward0>)
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.]], device='cuda:0',
       grad_fn=<ReshapeAliasBackward0>)
18
14.0
Train Accuracy = 47.9167


  1%|          | 4/682 [00:17<45:05,  3.99s/it]

tensor([[-0.1379, -0.1383, -0.1377, -0.1387, -0.1385, -0.1376, -0.1388, -0.1383,
         -0.1386, -0.1383, -0.1381, -0.1378, -0.1379, -0.1378, -0.1389, -0.1387,
         -0.1378, -0.1366, -0.1372, -0.1377, -0.1381, -0.1380, -0.1388, -0.1380,
         -0.1374, -0.1374, -0.1389, -0.1383, -0.1382, -0.1375, -0.1389, -0.1371]],
       device='cuda:0', grad_fn=<ReshapeAliasBackward0>)
tensor([[0.4656, 0.4655, 0.4656, 0.4654, 0.4654, 0.4657, 0.4654, 0.4655, 0.4654,
         0.4655, 0.4655, 0.4656, 0.4656, 0.4656, 0.4653, 0.4654, 0.4656, 0.4659,
         0.4657, 0.4656, 0.4655, 0.4655, 0.4654, 0.4656, 0.4657, 0.4657, 0.4653,
         0.4655, 0.4655, 0.4657, 0.4653, 0.4658]], device='cuda:0',
       grad_fn=<ReshapeAliasBackward0>)
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.]], device='cuda:0',
       grad_fn=<ReshapeAliasBackward0>)
20
12.0
Train Accuracy = 51.5625


  1%|          | 5/682 [00:20<42:55,  3.80s/it]

tensor([[-0.1400, -0.1399, -0.1404, -0.1406, -0.1406, -0.1404, -0.1401, -0.1406,
         -0.1404, -0.1401, -0.1404, -0.1405, -0.1396, -0.1402, -0.1397, -0.1404,
         -0.1401, -0.1401, -0.1402, -0.1403, -0.1403, -0.1401, -0.1412, -0.1403,
         -0.1405, -0.1404, -0.1400, -0.1398, -0.1403, -0.1399, -0.1401, -0.1396]],
       device='cuda:0', grad_fn=<ReshapeAliasBackward0>)
tensor([[0.4651, 0.4651, 0.4650, 0.4649, 0.4649, 0.4649, 0.4650, 0.4649, 0.4649,
         0.4650, 0.4650, 0.4649, 0.4651, 0.4650, 0.4651, 0.4650, 0.4650, 0.4650,
         0.4650, 0.4650, 0.4650, 0.4650, 0.4648, 0.4650, 0.4649, 0.4650, 0.4651,
         0.4651, 0.4650, 0.4651, 0.4650, 0.4652]], device='cuda:0',
       grad_fn=<ReshapeAliasBackward0>)
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.]], device='cuda:0',
       grad_fn=<ReshapeAliasBackward0>)
11
21.0
Train Accuracy = 48.1250


  1%|          | 5/682 [00:22<51:05,  4.53s/it]


KeyboardInterrupt: 