In [1]:
import torch
import torch.nn as nn
import torchvision
import pandas as pd
import numpy as np
import os
from pytorch_pretrained_bert.modeling import BertModel
from torch.utils.data import Dataset
from PIL import Image
import torchvision.transforms as transforms
from pytorch_pretrained_bert import BertTokenizer
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# **Image Handling Resnet-152**

In [2]:
class ImageEncoder(nn.Module):
    """Pretrained ResNet-152 trunk that turns an image batch into a short sequence of embeddings.

    The torchvision ResNet-152 is cut off before its last two children (the
    final adaptive average pooling layer and the classification layer), so
    ``self.model`` emits the convolutional feature map.
    """

    def __init__(self):
        super(ImageEncoder, self).__init__()
        backbone = torchvision.models.resnet152(pretrained=True)
        # Keep every child module except the trailing pooling and classification layers.
        feature_layers = list(backbone.children())[:-2]
        self.model = nn.Sequential(*feature_layers)

    def forward(self, x):
        """Encode ``x`` and return a contiguous tensor of shape (batch, 7, channels).

        For a (1, 3, 224, 224) input the notebook records the intermediate
        shapes (1, 2048, 7, 7) -> (1, 2048, 7, 1) -> (1, 2048, 7) -> (1, 7, 2048).
        """
        feature_map = self.model(x)
        # Average-pool the spatial grid down to 7 rows x 1 column.
        pooled = F.adaptive_avg_pool2d(feature_map, (7, 1))
        # Merge the two spatial axes, then put the 7 positions ahead of the channels.
        return pooled.flatten(start_dim=2).transpose(1, 2).contiguous()


### Important Note:






How the N (here N = 7) 2048-dimensional image embeddings are obtained from a single image:
<br>
<br>
Below is an example that uses a random tensor as a stand-in for a sample image:
```
img_enc = ImageEncoder()
img = torch.randn(1, 3, 224, 224)
img
```


```
tensor([[[[ 0.4623, -0.0570,  0.1685,  ..., -0.6377, -0.4702,  0.8996],
          [ 0.5874,  0.1590, -0.2373,  ..., -1.7897, -0.3391, -1.0945],
          [ 0.6259,  1.3741,  0.6457,  ..., -0.3259,  0.2340,  0.5563],
          ...,
          [-0.3431,  0.8013, -1.1648,  ...,  0.3589, -1.0933,  0.0880],
          [ 0.3228, -2.2501,  1.8554,  ...,  0.6990,  1.2223, -0.6696],
          [ 0.0949,  0.3022, -1.7768,  ...,  0.5936,  1.3039,  1.4402]],

         [[-0.7338,  0.3525, -0.0956,  ..., -0.5781, -0.8532, -0.9768],
          [ 0.3267, -0.4692,  0.2099,  ...,  0.8854, -0.0515, -0.9874],
          [ 2.0738, -0.5577,  0.3773,  ...,  0.9743, -2.0519,  0.0128],
          ...,
          [-0.1382, -0.8803,  0.6664,  ..., -0.3854, -1.2113,  1.0680],
          [-0.8094,  0.6352, -0.1113,  ..., -2.2602,  0.3099,  0.2487],
          [-0.3672,  1.2410,  0.0260,  ..., -0.0627,  0.2084, -0.2197]],

         [[ 0.6515, -0.2968, -0.1592,  ..., -0.0610,  0.3312, -0.9807],
          [-1.9452, -1.1792, -0.3001,  ...,  0.5704,  1.4844, -1.4242],
          [ 0.1115, -0.1929,  0.0363,  ...,  0.8737,  0.2437,  0.4418],
          ...,
          [ 1.6531,  0.0160, -0.6031,  ...,  0.8056, -0.5860, -0.2903],
          [-0.1911, -1.4188, -0.2629,  ..., -1.3827, -0.7149, -2.4575],
          [-1.5174, -1.5290, -0.3920,  ...,  1.0713,  0.4248, -0.2714]]]])
```





```
img.size()
```


```
torch.Size([1, 3, 224, 224])
```



**Note:**<br>
Below are the tensor shapes at each step of `ImageEncoder.forward`, starting from the output of the truncated ResNet-152 model. The input for this example is of shape `(1, 3, 224, 224)`: a batch size of `1`, `3` channels, and an image of `224x224` pixels.
<br>
`img_enc.forward(img)`
```
Model output torch.Size([1, 2048, 7, 7])
Pooling output torch.Size([1, 2048, 7, 1])
Flattening output torch.Size([1, 2048, 7])
Transpose output torch.Size([1, 7, 2048])

tensor([[[1.0920, 0.5761, 0.6760,  ..., 0.5043, 0.0468, 0.8262],
         [1.6031, 0.7189, 1.2634,  ..., 0.7476, 0.2092, 0.3963],
         [1.4418, 0.3756, 1.0606,  ..., 0.6728, 0.8360, 0.1597],
         ...,
         [0.8339, 0.6820, 0.6216,  ..., 0.0877, 0.6460, 0.4525],
         [0.1193, 0.1641, 0.5969,  ..., 0.2471, 0.5955, 0.0536],
         [0.0970, 0.1573, 1.4045,  ..., 0.0740, 0.2112, 0.4067]]],
       grad_fn=<CopyBackwards>)

```



In [3]:
# Load the cleaned tweet table and peek at the tweet id stored in row 1.
df = pd.read_csv('clean_datav5.csv')
df.loc[1, 'tweet_id']

1052207832081129472

In [4]:
# Display the loaded DataFrame (the rendered output elides the middle rows).
df

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,tweet_id,text,missing_text,Text_Only_Informative,Image_Only_Informative,Directed_Hate,Generalized_Hate,Sarcasm,Allegation,Justification,Refutation,Support,Oppose
0,0,0,0,1052237153789390853,new post domestic violence awareness caught me...,0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,1,1,1,1052207832081129472,domestic violence awareness caught metoo,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,2,2,2,1052183746344960000,mother nature metoo,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,3,3,1052156864840908800,ption no2,0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,4,4,4,1052095305133510656,high time metoo named shamed men medium advert...,0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7973,7973,7973,7973,1052099226799353856,one priyaramani make billion people metooindia...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7974,7974,7974,7974,1052099000688631809,thought metoo limited woman condeming wake rea...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7975,7975,7975,7975,1052098808178302977,wake metoo movement hairstylist sapna bhavani ...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7976,7976,7976,7976,1052098776490340352,metoo icc step sexual harassment,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [114]:
class Vocab(object):
    """Two-way mapping between token strings and integer ids.

    Attributes:
        stoi: dict mapping token string -> index.
        itos: list whose position ``i`` holds the token with index ``i``.
        vocab_size: number of tokens currently stored.
    """

    def __init__(self, emptyInit=False):
        """Start empty when ``emptyInit`` is true, otherwise seed the five special tokens."""
        if emptyInit:
            seed_tokens = []
        else:
            seed_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]

        self.stoi = {}
        self.itos = []
        for position, token in enumerate(seed_tokens):
            self.stoi[token] = position
            self.itos.append(token)
        self.vocab_size = len(self.itos)

    def add(self, words):
        """Append every token in ``words`` that is not yet known, then refresh ``vocab_size``."""
        for token in words:
            if token not in self.stoi:
                # The next free index is always the current length of the list.
                self.stoi[token] = len(self.itos)
                self.itos.append(token)
        self.vocab_size = len(self.itos)

In [131]:
class TextNImageDataset(Dataset):
    """Dataset yielding ``(text_ids, label, image)`` triples for tweet text + image pairs.

    Args:
        data: pandas DataFrame with at least the columns ``'text'``,
            ``'tweet_id'`` and the column named by ``label_name``.
        image_path: directory containing one ``<tweet_id>.jpg`` per tweet.
        label_name: name of the column holding the target label.
        transforms: callable applied to the PIL image (real or placeholder).
        tokenizer: object exposing ``tokenize(str) -> list of tokens``.
        vocab: object exposing a ``stoi`` dict with at least an ``"[UNK]"`` entry.
    """

    def __init__(self, data, image_path, label_name, transforms, tokenizer, vocab):
        self.data = data
        self.image_path = image_path
        self.label_name = label_name
        self.transforms = transforms
        self.tokenizer = tokenizer
        # 512 positions in total, minus 7 image embeddings and the 2 special
        # tokens placed around them (see the 7 + 2 used by the model classes).
        self.max_sent_len = 512 - 7 - 2
        self.vocab = vocab

    def __getitem__(self, index):
        """Return ``(text_ids, label, image)`` for the row at *position* ``index``.

        ``text_ids`` is a 1-D LongTensor of vocabulary ids (unknown tokens map
        to ``"[UNK]"``), ``label`` is a LongTensor of shape ``(1,)`` and
        ``image`` is whatever ``self.transforms`` returns.
        """
        # Positional lookup: samplers hand out 0..len-1, which only coincides
        # with the DataFrame's labels when its index is the default RangeIndex.
        raw_text = self.data['text'].iloc[index]
        if not isinstance(raw_text, str):
            # A missing text (e.g. NaN) is treated as an empty sentence.
            raw_text = ""
        tokens = self.tokenizer.tokenize(raw_text)[:self.max_sent_len]
        unk_id = self.vocab.stoi["[UNK]"]
        text = torch.LongTensor([self.vocab.stoi.get(tok, unk_id) for tok in tokens])

        tweet_id = self.data['tweet_id'].iloc[index]
        label = torch.LongTensor([int(self.data[self.label_name].iloc[index])])

        image_file = os.path.join(self.image_path, str(tweet_id) + ".jpg")
        try:
            image = Image.open(image_file).convert("RGB")
        except (OSError, ValueError):
            # Missing or unreadable image: fall back to a uniform grey
            # placeholder. Only loading errors are caught here, so failures
            # inside ``self.transforms`` are no longer silently masked.
            image = Image.fromarray(128 * np.ones((256, 256, 3), dtype=np.uint8))
        image = self.transforms(image)

        return text, label, image

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return len(self.data)

In [132]:
# Image preprocessing pipeline: resize to 256, take the central 224x224 crop,
# convert to a tensor and normalise each channel with the statistics below.
img_transformations = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.46777044, 0.44531429, 0.40661017],
        std=[0.12221994, 0.12145835, 0.14380469],
    ),
])

In [133]:
# Build the default vocabulary, which is seeded with only the five special tokens,
# and display its string -> index mapping.
# NOTE(review): these indices (e.g. "[CLS]" -> 2) are this notebook's own numbering;
# a later cell shows bert_tokenizer.ids_to_tokens[2] == '[unused1]', so they do not
# line up with the 'bert-base-uncased' ids -- confirm before feeding them to BERT.
vocab = Vocab()
vocab.stoi

{'[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[SEP]': 3, '[MASK]': 4}

In [134]:
# Load the lower-casing tokenizer for the 'bert-base-uncased' checkpoint.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [135]:
# Dataset over the full table, using the 'Sarcasm' column as the target label.
data1 = TextNImageDataset(
    data=df,
    image_path='/home/soham/Desktop/IEEE-BigMM/Data/train_images',
    label_name='Sarcasm',
    transforms=img_transformations,
    tokenizer=bert_tokenizer,
    vocab=vocab,
)

In [136]:
# Number of samples in the dataset.
len(data1)

7978

In [137]:
# Fetch the first sample to sanity-check the dataset output.
text, label, img = data1[0]

ValueError: too many dimensions 'str'

In [138]:
# Inspect the transformed image tensor returned by the dataset.
# To view it as a picture instead, uncomment the line below:
# torchvision.transforms.ToPILImage()(img).show()
# NOTE(review): the preceding cell's recorded output is a ValueError, so the value
# shown here presumably comes from an earlier run -- re-run from a fresh kernel.
img

tensor([[[-1.4850, -1.4529, -0.5545,  ..., -1.1000, -1.0679, -1.0679],
         [-1.4850, -1.4208, -0.5545,  ..., -1.0679, -1.0679, -1.1000],
         [-1.5492, -1.4208, -0.5866,  ..., -1.1000, -1.1000, -1.1320],
         ...,
         [-1.1000, -1.1000, -1.0679,  ..., -2.6401, -2.6722, -2.6722],
         [-1.1320, -1.1320, -1.1000,  ..., -2.6722, -2.6722, -2.6722],
         [-1.1320, -1.1320, -1.1000,  ..., -2.6722, -2.6722, -2.6722]],

        [[-1.1157, -1.1157, -0.3085,  ..., -1.0834, -1.1157, -1.1157],
         [-1.1157, -1.0834, -0.2762,  ..., -1.0511, -1.1157, -1.1480],
         [-1.1480, -1.0834, -0.3085,  ..., -1.0834, -1.1480, -1.1803],
         ...,
         [-0.6637, -0.6314, -0.6314,  ..., -2.3426, -2.3749, -2.3749],
         [-0.6637, -0.6637, -0.6637,  ..., -2.3749, -2.3749, -2.3749],
         [-0.6637, -0.6637, -0.6960,  ..., -2.3749, -2.3749, -2.3749]],

        [[-0.7823, -0.8368, -0.1550,  ..., -0.8095, -0.8368, -0.8368],
         [-0.7823, -0.8095, -0.1278,  ..., -0

In [None]:
# Display the token-id tensor the dataset returned for the first sample.
text

In [69]:
def collate_function_for_dataloader(batch, task_type='singlelabel', max_text_len=512 - 7 - 2):
    """Pad a list of ``(text_ids, label, image)`` rows into batch tensors.

    Args:
        batch: list of rows; ``row[0]`` is a 1-D tensor of token ids,
            ``row[1]`` the label tensor and ``row[2]`` the image tensor.
        task_type: ``'multilabel'`` stacks the label tensors along a new
            leading axis; any other value concatenates them and casts to long.
        max_text_len: upper bound on the padded text length; longer rows are
            truncated. Defaults to ``512 - 7 - 2``.

    Returns:
        Tuple ``(text_tensors, label_tensors, text_segment,
        text_attention_mask, batch_image_tensors)``. The three text tensors
        are long tensors of shape ``(batch_size, padded_len)``; positions that
        hold a real token have segment id 1 and attention mask 1, padding
        positions are 0 everywhere.

    Raises:
        ValueError: if ``batch`` is empty.
    """
    # Clamp once so the padded width and every per-row slice agree.
    lengths = [min(len(row[0]), max_text_len) for row in batch]
    batch_size = len(batch)
    padded_len = max(lengths)

    text_tensors = torch.zeros(batch_size, padded_len).long()
    text_attention_mask = torch.zeros(batch_size, padded_len).long()
    text_segment = torch.zeros(batch_size, padded_len).long()

    batch_image_tensors = torch.stack([row[2] for row in batch])

    # stack adds a new batch axis (one label vector per row); cat joins the
    # per-row 1-element tensors into a flat vector of labels.
    if task_type == 'multilabel':
        label_tensors = torch.stack([row[1] for row in batch])
    else:
        label_tensors = torch.cat([row[1] for row in batch]).long()

    for i, (row, length) in enumerate(zip(batch, lengths)):
        # Slice the source too: an over-long row would otherwise not fit the
        # truncated destination slice.
        text_tensors[i, :length] = row[0][:length]
        text_segment[i, :length] = 1
        text_attention_mask[i, :length] = 1

    return text_tensors, label_tensors, text_segment, text_attention_mask, batch_image_tensors
        
    
    

In [70]:
# Shuffled mini-batches of 4 samples, padded by the custom collate function.
data1loader = torch.utils.data.DataLoader(
    data1,
    batch_size=4,
    shuffle=True,
    collate_fn=collate_function_for_dataloader,
)

In [71]:
# Create an iterator over the loader so single batches can be pulled by hand.
dataiter = iter(data1loader)

In [72]:
# Pull one batch. Use the built-in next(): a `.next()` method is not part of
# the iterator protocol and is not guaranteed to exist on DataLoader iterators.
next(dataiter)

TypeError: can't assign a str to a torch.LongTensor

In [139]:
# Look up the id the BERT tokenizer assigns to the '[UNK]' token.
# NOTE(review): the recorded output below is an AttributeError about `.stoi`, which
# this line cannot raise -- the output looks stale; re-run the cell.
bert_tokenizer.vocab['[UNK]']

AttributeError: 'collections.OrderedDict' object has no attribute 'stoi'

In [107]:
# Reverse lookup: which token the BERT tokenizer stores under id 2.
bert_tokenizer.ids_to_tokens[2]

'[unused1]'

In [97]:
# Tokenize a sample tweet to see how out-of-vocabulary words are split into
# sub-word pieces (the '##'-prefixed entries in the output).
bert_tokenizer.tokenize('written metoo created downplay rapesonnuns christian father amp bishop clear tanushreedutta convertedchristian rest left reader ')

['written',
 'met',
 '##oo',
 'created',
 'down',
 '##play',
 'rape',
 '##son',
 '##nu',
 '##ns',
 'christian',
 'father',
 'amp',
 'bishop',
 'clear',
 'tan',
 '##ush',
 '##ree',
 '##du',
 '##tta',
 'converted',
 '##christ',
 '##ian',
 'rest',
 'left',
 'reader']

In [15]:
# Load the pretrained 'bert-base-uncased' model so its sub-modules can be inspected.
bert = BertModel.from_pretrained('bert-base-uncased')

In [16]:
# Inspect the encoder sub-module of the loaded BERT model.
bert.encoder

BertEncoder(
  (layer): ModuleList(
    (0): BertLayer(
      (attention): BertAttention(
        (self): BertSelfAttention(
          (query): Linear(in_features=768, out_features=768, bias=True)
          (key): Linear(in_features=768, out_features=768, bias=True)
          (value): Linear(in_features=768, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (output): BertSelfOutput(
          (dense): Linear(in_features=768, out_features=768, bias=True)
          (LayerNorm): BertLayerNorm()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (intermediate): BertIntermediate(
        (dense): Linear(in_features=768, out_features=3072, bias=True)
      )
      (output): BertOutput(
        (dense): Linear(in_features=3072, out_features=768, bias=True)
        (LayerNorm): BertLayerNorm()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (1): BertLayer(
      (attention): BertAttention(
        (self)

In [27]:
# Inspect the embedding sub-module (word, position and token-type embeddings).
bert.embeddings

BertEmbeddings(
  (word_embeddings): Embedding(30522, 768, padding_idx=0)
  (position_embeddings): Embedding(512, 768)
  (token_type_embeddings): Embedding(2, 768)
  (LayerNorm): BertLayerNorm()
  (dropout): Dropout(p=0.1, inplace=False)
)

In [18]:
# Inspect the pooler sub-module (a 768 -> 768 linear layer followed by Tanh).
bert.pooler

BertPooler(
  (dense): Linear(in_features=768, out_features=768, bias=True)
  (activation): Tanh()
)

In [20]:
# Check whether a CUDA device is usable in this environment.
torch.cuda.is_available()

False

In [21]:
class ImageEmbeddingsForBert(nn.Module):
    """Wrap pooled image features as a ``[CLS] img_1 ... img_N [SEP]`` BERT input segment.

    Args:
        embeddings: BERT embedding module providing ``word_embeddings``,
            ``position_embeddings``, ``token_type_embeddings``, ``LayerNorm``
            and ``dropout``; these sub-modules are reused (shared), not copied.
        vocabObject: object whose ``stoi`` dict maps ``"[CLS]"`` and
            ``"[SEP]"`` to their row in ``embeddings.word_embeddings``.
    """

    def __init__(self, embeddings, vocabObject):
        # Must run before any sub-module is assigned, otherwise nn.Module
        # refuses the attribute assignments below.
        super(ImageEmbeddingsForBert, self).__init__()
        self.vocab = vocabObject
        # Projects each 2048-dimensional image feature from the image encoder
        # to 768 dimensions, the size of BERT's hidden layer.
        self.img_embeddings = nn.Linear(2048, 768)

        self.position_embeddings = embeddings.position_embeddings
        self.token_type_embeddings = embeddings.token_type_embeddings
        self.word_embeddings = embeddings.word_embeddings
        self.LayerNorm = embeddings.LayerNorm
        self.dropout = embeddings.dropout

    def _special_token_embeddings(self, token, batch_size, device):
        """Return the word embedding of ``token`` repeated as (batch_size, 1, hidden)."""
        token_id = torch.tensor([self.vocab.stoi[token]], dtype=torch.long, device=device)
        return self.word_embeddings(token_id.unsqueeze(0).expand(batch_size, 1))

    def forward(self, batch_input_imgs, token_type_ids):
        """Embed image features for the BERT encoder.

        Args:
            batch_input_imgs: float tensor of shape (batch, N, 2048).
            token_type_ids: long tensor of shape (batch, N + 2).

        Returns:
            Tensor of shape (batch, N + 2, 768): summed token, position and
            token-type embeddings after LayerNorm and dropout.
        """
        batch_size = batch_input_imgs.size(0)
        # N image embeddings plus the surrounding [CLS] and [SEP] (7 + 2 in this notebook).
        seq_length = batch_input_imgs.size(1) + 2
        # Follow the input's device rather than guessing from CUDA availability.
        device = batch_input_imgs.device

        cls_token_embeddings = self._special_token_embeddings("[CLS]", batch_size, device)
        sep_token_embeddings = self._special_token_embeddings("[SEP]", batch_size, device)
        batch_image_embeddings_768 = self.img_embeddings(batch_input_imgs)

        token_embeddings = torch.cat(
            [cls_token_embeddings, batch_image_embeddings_768, sep_token_embeddings], dim=1
        )

        position_ids = torch.arange(seq_length, dtype=torch.long, device=device)
        position_ids = position_ids.unsqueeze(0).expand(batch_size, seq_length)
        position_embeddings = self.position_embeddings(position_ids)

        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = token_embeddings + position_embeddings + token_type_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)

        return embeddings


In [22]:
class MultiModalBertEncoder(nn.Module):
    """BERT encoder fed with ``[CLS] image embeddings [SEP]`` followed by the text embeddings.

    Args:
        no_of_classes: output size of the ``clf`` linear layer kept on this module.
        tokenizer: BERT tokenizer exposing ``vocab`` and ``ids_to_tokens``.
            When omitted, the 'bert-base-uncased' tokenizer is loaded.
    """

    def __init__(self, no_of_classes, tokenizer=None):
        super(MultiModalBertEncoder, self).__init__()
        bert = BertModel.from_pretrained('bert-base-uncased')
        if tokenizer is None:
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
        self.tokenizer = tokenizer
        self.embeddings = bert.embeddings

        # The [CLS]/[SEP] ids must index BERT's own word-embedding table. The
        # default Vocab() numbering ([CLS] -> 2) does not: this notebook shows
        # id 2 is '[unused1]' in the BERT vocabulary. Mirror the tokenizer instead.
        self.vocab = Vocab(emptyInit=True)
        self.vocab.stoi = tokenizer.vocab
        self.vocab.itos = tokenizer.ids_to_tokens
        self.vocab.vocab_size = len(tokenizer.vocab)

        self.image_embeddings = ImageEmbeddingsForBert(self.embeddings, self.vocab)
        self.image_encoder = ImageEncoder()
        self.encoder = bert.encoder
        self.pooler = bert.pooler
        # Not used by forward(); kept so the module's attributes stay unchanged.
        self.clf = nn.Linear(768, no_of_classes)

    def forward(self, input_text, text_attention_mask, text_segment, input_image):
        """Encode an image/text batch and return BERT's pooled output.

        Args:
            input_text: long tensor of token ids, shape (batch, text_len).
            text_attention_mask: long tensor, 1 for real tokens and 0 for padding.
            text_segment: long tensor of token-type ids for the text.
            input_image: image batch accepted by ``ImageEncoder``.

        Returns:
            The pooler's output for the last encoded layer.
        """
        batch_size = input_text.size(0)
        # Build helper tensors on the input's device instead of assuming CUDA.
        device = input_text.device

        # (batch, N, 2048) sequence of pooled image features (N is 7 in this notebook).
        image = self.image_encoder(input_image)
        image_positions = image.size(1) + 2  # plus [CLS] and [SEP]

        # Every image position is attended to; text positions follow their own mask.
        image_attention_mask = torch.ones(
            batch_size, image_positions, dtype=torch.long, device=device
        )
        attention_mask = torch.cat([image_attention_mask, text_attention_mask], dim=1)
        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
        extended_attention_mask = extended_attention_mask.to(
            dtype=next(self.parameters()).dtype
        )
        # Masked positions receive a large negative additive bias.
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

        # The image segment uses token type 0.
        image_token_type_ids = torch.zeros(
            batch_size, image_positions, dtype=torch.long, device=device
        )
        image_embedding_out = self.image_embeddings(image, image_token_type_ids)
        text_embedding_out = self.embeddings(input_text, text_segment)

        # Encoder input layout: [CLS] image embeddings [SEP] text embeddings.
        encoder_input = torch.cat([image_embedding_out, text_embedding_out], dim=1)
        encoded_layers = self.encoder(
            encoder_input, extended_attention_mask, output_all_encoded_layers=False
        )
        return self.pooler(encoded_layers[-1])
        
        

In [23]:
class MultiModalBertClf(nn.Module):
    """Classifier head on top of ``MultiModalBertEncoder``.

    Args:
        no_of_classes: number of output logits.
        tokenizer: forwarded to ``MultiModalBertEncoder``; defaults to ``None``.
    """

    def __init__(self, no_of_classes, tokenizer=None):
        super(MultiModalBertClf, self).__init__()
        self.no_of_classes = no_of_classes
        # The encoder's constructor takes (no_of_classes, tokenizer).
        self.enc = MultiModalBertEncoder(self.no_of_classes, tokenizer)
        self.clf = nn.Linear(768, self.no_of_classes)

    def forward(self, text, text_attention_mask, text_segment, image):
        """Return logits of shape (batch, no_of_classes).

        Note the argument order (mask before segment): the collate function in
        this notebook returns the segment tensor before the attention mask.
        """
        pooled = self.enc(text, text_attention_mask, text_segment, image)
        return self.clf(pooled)