In [2]:
import pandas as pd
import os

In [3]:
# Annotation directories; they are resolved one level above the working directory.
covid_path = "Dataset/covid/annotations"
politics_path = "Dataset/politics/annotations"

# Read the JSON-lines training split of each domain and give it a fresh 0..n-1 index.
covid_anno = pd.read_json(f"../{covid_path}/train.jsonl", lines=True)
covid_anno = covid_anno.reset_index(drop=True)
politics_anno = pd.read_json(f"../{politics_path}/train.jsonl", lines=True)
politics_anno = politics_anno.reset_index(drop=True)

# Per row: one flat list concatenating the hero, villain, victim and other entity lists.
covid_entities = covid_anno.apply(
    lambda row: row["hero"] + row["villain"] + row["victim"] + row["other"],
    axis=1,
)
politics_entities = politics_anno.apply(
    lambda row: row["hero"] + row["villain"] + row["victim"] + row["other"],
    axis=1,
)

In [4]:
# Stack both annotation frames row-wise (the default axis) and renumber the rows 0..n-1.
final_ds = pd.concat(objs=[covid_anno, politics_anno], ignore_index=True)

In [11]:
# Prepare target variable
def prepare_target(hero, villain, victim, other):
    """Serialize the four role-wise entity lists into one tagged target string.

    Every non-empty role becomes a segment of the form
    ``<begin tag> e1 [SEP] e2 ... <end tag>``. Segments are emitted in the
    fixed order hero, villain, victim, other; roles without entities are
    omitted.

    Args:
        hero: Entity strings tagged with ``[BH] ... [EH]``.
        villain: Entity strings tagged with ``[BVIL] ... [EVIL]``.
        victim: Entity strings tagged with ``[BVIC] ... [EVIC]``.
        other: Entity strings tagged with ``[BO] ... [EO]``.
            Any of the four may be an empty list or ``None``.

    Returns:
        The segments joined by a single space, without leading or trailing
        whitespace; the empty string when no role has any entity.
    """
    role_segments = (
        ("[BH]", hero, "[EH]"),
        ("[BVIL]", villain, "[EVIL]"),
        ("[BVIC]", victim, "[EVIC]"),
        ("[BO]", other, "[EO]"),
    )
    # Joining complete segments (instead of appending "... [EH] " piece by
    # piece) guarantees the result never ends in a stray space, regardless of
    # which role happens to be the last non-empty one.
    return " ".join(
        begin + " " + " [SEP] ".join(entities) + " " + end
        for begin, entities, end in role_segments
        if entities is not None and len(entities) > 0
    )

# Derive the tagged target string of every row from its four role columns.
final_ds["Target"] = final_ds.apply(
    lambda row: prepare_target(row["hero"], row["villain"], row["victim"], row["other"]),
    axis=1,
)

In [13]:
# Show the first five rows, including the newly built Target column.
final_ds.head(n=5)

Unnamed: 0,OCR,image,hero,villain,victim,other,Target
0,Bernie or Elizabeth?\nBe informed.Compare them...,covid_memes_18.png,[],[],[],"[bernie sanders, elizabeth warren]",[BO] bernie sanders [SEP] elizabeth warren [EO]
1,Extending the\nBrexit deadline until\nOctober ...,covid_memes_19.png,[],[uk government],[],[],[BVIL] uk government [EVIL]
2,kwai\ngkwa 0964\n#nnevvy\napplause to Thais fr...,covid_memes_252.png,[thais],[],[],[hong kong],[BH] thais [EH] [BO] hong kong [EO]
3,"So, I order this\nfoce mask to\nprotect ogains...",covid_memes_255.png,[],[china],[],"[face mask, made in china, coronavirus]",[BVIL] china [EVIL] [BO] face mask [SEP] made ...
4,best candidate for\nJA\n2020\njoe biden\nKamal...,covid_memes_20.png,[joe biden],[],[],"[bernie sanders, kamala harris, tiktok]",[BH] joe biden [EH] [BO] bernie sanders [SEP] ...
...,...,...,...,...,...,...,...
5547,Trump could shoot someone\non the Senate floor...,memes_5039.png,[],[donald trump],[],"[senate floor, republican]",[BVIL] donald trump [EVIL] [BO] senate floor [...
5548,MANY PEOPLE ASK\nME WHY ALL MY SCHOOL\nRECORDS...,memes_2635.png,[],[],[],"[school, university, joe biden]",[BO] school [SEP] university [SEP] joe biden [EO]
5549,my bes\nfriend\nmy\nmother\nconsclence\nmy the...,memes_1384.png,[],[],[],"[msnbc, bernie sanders, democratic party, joe ...",[BO] msnbc [SEP] bernie sanders [SEP] democrat...
5550,THE\nN-WORD\nPASS\nSigned and\napproved by\nBe...,memes_944.png,[],[],[],[barack obama],[BO] barack obama [EO]


#### Preprocess text

In [17]:
# NLTK stop-word setup. NOTE(review): `stopwords` is imported here but is not
# referenced in the cells visible in this part of the notebook.
import nltk
from nltk.corpus import stopwords  
# Downloads the 'stopwords' corpus package into the local nltk_data directory.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sid\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [18]:
# Inspect the raw OCR strings of the first five rows before any cleaning.
final_ds["OCR"].values[:5]

array(['Bernie or Elizabeth?\nBe informed.Compare them on the issues that matter.\nIssue: Who makes the dankest memes?\n',
       'Extending the\nBrexit deadline until\nOctober 31st in\norder to ensure a deal\neveryone can agree with.\nUsing the extension\nto elect a new prime\nminister and then\ntake a recess until one\nmonth before the deadline\nimgflip.com\n',
       'kwai\ngkwa 0964\n#nnevvy\napplause to Thais from Hong Kong\nWHY THAIS DID NOT GET HURT?\nCULTURAL DIFFERENCE\nJUST STAY IN THE WALL\n',
       'So, I order this\nfoce mask to\nprotect ogainst\nfhe Corond virus\nHooold up, it says\nsomething here.\n"Made in China"\n',
       'best candidate for\nJA\n2020\njoe biden\nKamala harris\nBernie sanders\nTikTok\n@eliguthrie\n'],
      dtype=object)

In [24]:
def preprocess_text(text):
    """Flatten OCR text to a single line and mask user handles and URLs.

    The text is split on any run of whitespace, so newlines, tabs, carriage
    returns and repeated spaces all act as one separator. Each token is then
    normalized:

    * a token that starts with ``@`` and is longer than one character
      becomes ``@user``;
    * a token that starts with ``http`` becomes ``http``;
    * every other token is kept unchanged.

    Args:
        text: The raw OCR string.

    Returns:
        The normalized tokens joined by single spaces, with no leading or
        trailing whitespace; the empty string for empty or all-whitespace
        input.
    """
    tokens = []
    # str.split() without an argument splits on every whitespace run and drops
    # empty tokens. Splitting on " " alone left tabs and "\r" glued to tokens
    # (which could hide a leading "@" or "http") and turned the trailing
    # newline of the OCR text into a trailing space.
    for token in text.split():
        if token.startswith("@") and len(token) > 1:
            token = "@user"
        elif token.startswith("http"):
            token = "http"
        tokens.append(token)
    return " ".join(tokens)

# Replace the raw OCR column with its cleaned, single-line version.
final_ds["OCR"] = final_ds["OCR"].apply(preprocess_text)

In [25]:
# Display the frame after OCR cleaning; the rendered output elides the middle rows.
final_ds

Unnamed: 0,OCR,image,hero,villain,victim,other,Target
0,Bernie or Elizabeth? Be informed.Compare them ...,covid_memes_18.png,[],[],[],"[bernie sanders, elizabeth warren]",[BO] bernie sanders [SEP] elizabeth warren [EO]
1,Extending the Brexit deadline until October 31...,covid_memes_19.png,[],[uk government],[],[],[BVIL] uk government [EVIL]
2,kwai gkwa 0964 #nnevvy applause to Thais from ...,covid_memes_252.png,[thais],[],[],[hong kong],[BH] thais [EH] [BO] hong kong [EO]
3,"So, I order this foce mask to protect ogainst ...",covid_memes_255.png,[],[china],[],"[face mask, made in china, coronavirus]",[BVIL] china [EVIL] [BO] face mask [SEP] made ...
4,best candidate for JA 2020 joe biden Kamala ha...,covid_memes_20.png,[joe biden],[],[],"[bernie sanders, kamala harris, tiktok]",[BH] joe biden [EH] [BO] bernie sanders [SEP] ...
...,...,...,...,...,...,...,...
5547,Trump could shoot someone on the Senate floor ...,memes_5039.png,[],[donald trump],[],"[senate floor, republican]",[BVIL] donald trump [EVIL] [BO] senate floor [...
5548,MANY PEOPLE ASK ME WHY ALL MY SCHOOL RECORDS A...,memes_2635.png,[],[],[],"[school, university, joe biden]",[BO] school [SEP] university [SEP] joe biden [EO]
5549,my bes friend my mother consclence my therapis...,memes_1384.png,[],[],[],"[msnbc, bernie sanders, democratic party, joe ...",[BO] msnbc [SEP] bernie sanders [SEP] democrat...
5550,THE N-WORD PASS Signed and approved by Beak Ob...,memes_944.png,[],[],[],[barack obama],[BO] barack obama [EO]


In [26]:
# Running tally: lower-cased entity string -> number of times it has been seen.
entity_dict = {}


def count_entity(x):
    """Add every entity in ``x`` to the module-level ``entity_dict`` tally.

    Entities are lower-cased before counting, so differently-cased spellings
    share one counter. ``None`` and empty collections are ignored.

    Args:
        x: An iterable of entity strings, or ``None``.

    Returns:
        None. The function works only through its side effect on
        ``entity_dict``.
    """
    if x is None or len(x) == 0:
        return
    for entity in x:
        normalized = entity.lower()
        entity_dict[normalized] = entity_dict.get(normalized, 0) + 1

# Feed every per-row entity list of both datasets into the tally. count_entity
# returns None, so the resulting Series carries no information and is parked in `_`.
_ = covid_entities.map(count_entity)
_ = politics_entities.map(count_entity)

In [32]:
# Entities counted fewer than five times, in the order they were first tallied.
filtered_entities = [entity for entity, count in entity_dict.items() if count < 5]

In [43]:
def get_low_occuring_indexes(row, low_count_entities=None):
    """Return the row's index label if it mentions any low-frequency entity.

    Args:
        row: A DataFrame row as passed by ``DataFrame.apply(..., axis=1)``;
            its ``name`` is the index label and ``entities`` is the row's
            list of entity strings.
        low_count_entities: Optional collection of lower-cased entity strings
            to test membership against. Defaults to the module-level
            ``filtered_entities``.

    Returns:
        ``row.name`` when at least one entity of the row (compared in lower
        case) is contained in ``low_count_entities``; otherwise ``None``.
    """
    if low_count_entities is None:
        low_count_entities = filtered_entities
    for entity in row.entities:
        # The entity counts are keyed by lower-cased strings, so the lookup
        # must normalize the same way; comparing the raw string would miss
        # every entity that contains an upper-case character.
        if entity.lower() in low_count_entities:
            return row.name
    return None
    
# Renumber the rows, attach each row's flat entity list, then compute for every
# row either its own index label (it contains a low-frequency entity) or None.
final_ds = final_ds.reset_index(drop=True)
final_ds["entities"] = final_ds.apply(
    lambda row: row["hero"] + row["villain"] + row["victim"] + row["other"],
    axis=1,
)
indexes = final_ds.apply(get_low_occuring_indexes, axis=1)

In [45]:
# Keep only the rows that were flagged, i.e. whose entry is not missing.
indexes[indexes.notna()]

1          1.0
2          2.0
3          3.0
4          4.0
5          5.0
         ...  
5545    5545.0
5546    5546.0
5547    5547.0
5548    5548.0
5551    5551.0
Length: 3051, dtype: float64

In [47]:
# Spot-check the `other` entities of one flagged row.
final_ds.loc[5551, "other"]

['biden obama meme', 'john robinson', 'memes', 'joe biden', 'barack obama']

In [23]:
# VisualBERT forward-pass example.
# NOTE(review): as written this cell stops with
# "NameError: name 'get_visual_embeddings' is not defined"; `image_path` is
# not defined in the visible cells either. Both must be provided before this
# cell can run.
import torch
from transformers import BertTokenizer, VisualBertModel

# Load the pretrained VisualBERT checkpoint and the BERT tokenizer for the text input.
model = VisualBertModel.from_pretrained("uclanlp/visualbert-vqa-coco-pre")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize a sample question and request PyTorch tensors.
inputs = tokenizer("What is the man eating?", return_tensors="pt")
# this is a custom function that returns the visual embeddings given the image path
# TODO: implement `get_visual_embeddings` and set `image_path`.
visual_embeds = get_visual_embeddings(image_path)

# Token-type ids and attention mask of ones, shaped like `visual_embeds`
# without its last dimension.
visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
# Add the visual inputs to the tokenizer output so everything is passed to the model together.
inputs.update(
    {
        "visual_embeds": visual_embeds,
        "visual_token_type_ids": visual_token_type_ids,
        "visual_attention_mask": visual_attention_mask,
    }
)
outputs = model(**inputs)
# Keep the model's last hidden state from the returned outputs.
last_hidden_state = outputs.last_hidden_state

Downloading: 100%|██████████| 631/631 [00:00<00:00, 214kB/s]
Downloading: 100%|██████████| 428M/428M [04:31<00:00, 1.65MB/s]    
Some weights of the model checkpoint at uclanlp/visualbert-vqa-coco-pre were not used when initializing VisualBertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing VisualBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing VisualBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a 

NameError: name 'get_visual_embeddings' is not defined