In [1]:
import json
import sys
import random
from tqdm import tqdm
from collections import defaultdict
import pyarrow as pa
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer, util
import torch

# Import data

In [2]:
%%time
root = '.'


def load_json_lines(path):
    """Parse a JSON-lines file into a list with one decoded object per line.

    The file is opened in a ``with`` block so its handle is closed as soon as
    parsing finishes, instead of being left open until garbage collection.
    """
    with open(path) as fp:
        return [json.loads(line) for line in fp]


# train_data = load_json_lines(f"{root}/cosmos/train_data.json")
test_data = load_json_lines(f"{root}/cosmos/test_data.json")
# NOTE: the validation split is loaded under the name `train_data`; every later
# cell that reads `train_data` therefore works on val_data.json.
train_data = load_json_lines(f"{root}/cosmos/val_data.json")

CPU times: user 2.54 s, sys: 111 ms, total: 2.65 s
Wall time: 2.65 s


In [3]:
train_data[10]

{'img_local_path': 'val/10.jpg',
 'articles': [{'caption': "Swiss guards enter St. Peter's Square after white smoke came out of the Sistine Chapel chimney, signifying the election of a new pope.",
   'article_url': 'https://edition.cnn.com/2013/03/13/world/gallery/st-peters-pope-reaction/index.html',
   'caption_modified': 'Swiss guards enter PERSON FAC after white smoke came out of the Sistine Chapel chimney, signifying the election of a new pope.',
   'entity_list': [["St. Peter's", 'PERSON'], ['Square', 'FAC']]},
  {'caption': "Swiss guards enter St. Peter's Square after white smoke came out of the Sistine Chapel chimney, signifying the election of a new pope.",
   'article_url': 'https://edition.cnn.com/2013/03/13/world/gallery/st-peters-pope-reaction/index.html',
   'caption_modified': 'Swiss guards enter PERSON FAC after white smoke came out of the Sistine Chapel chimney, signifying the election of a new pope.',
   'entity_list': [["St. Peter's", 'PERSON'], ['Square', 'FAC']]}],


# Remove duplicate captions

In [16]:
len(train_data)

41006

In [30]:
def remove_duplicate(dataset):
    """Drop repeated captions from every record of ``dataset`` in place.

    For each record, ``record['articles']`` is replaced by a list that keeps only
    the first article for each distinct ``'caption_modified'`` value, preserving
    the original order. A ``0`` is printed (without newline) for every record
    that ends up with no articles at all.

    Args:
        dataset: iterable of dicts, each with an ``'articles'`` list whose items
            carry a ``'caption_modified'`` key.

    Returns:
        The set of modified captions seen in the LAST record only. The set is
        rebuilt for every record; this return value is kept as-is for backward
        compatibility. An empty set is returned for an empty dataset.
    """
    # Bound before the loop so an empty dataset returns an empty set instead of
    # raising UnboundLocalError at the `return`.
    seen_caption = set()
    for data in tqdm(dataset):
        # Duplicates are detected per record, not across the whole dataset.
        seen_caption = set()
        new_list = []
        for caption in data['articles']:
            if caption['caption_modified'] not in seen_caption:
                new_list.append(caption)
                seen_caption.add(caption['caption_modified'])
        if len(new_list) == 0:
            print('0', end='')
        data['articles'] = new_list
    return seen_caption

remove_duplicate(train_data)


100%|██████████| 41006/41006 [00:00<00:00, 277042.57it/s]


{'CARDINAL through photos: Using trucks and excavators to block the road to translate corona in FAC.',
 'Cable car over GPE, GPE.',
 'GPE ... with &#MONEY tram&#39; :-)',
 'Interesting photos, attractive photos, cable car',
 'PERSON, ORG cars are seen above the city of FAC.',
 'Pictures of CARDINAL Bahman 1398, Pictures of CARDINAL Bahman 1398, Pictures of 1 February 2020',
 'View from the lookout PERSON during the NORP, the traditional fair of miniatures wishes, in GPE, GPE, DATE. ORG offers CARDINAL tours through the streets of the GPE, so that NORP and foreigners know about this traditional holiday that intends that the GPE, its main figure, be recognized as a ORG.'}

# SBERT Paraphrase Mining

In [31]:
# Sentence-embedding model handed to util.paraphrase_mining further down.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [32]:
# Project-local helper module (not part of the import cell at the top of the notebook).
import notebook_util
# NOTE(review): presumably returns the index of the GPU with the least memory in use;
# the value is only displayed here and is not stored in a variable — confirm whether
# the following set_device call is meant to use it.
notebook_util.pick_gpu_lowest_memory()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


0

In [33]:
# Make CUDA device 0 the current device. The index is hard-coded.
# NOTE(review): it matches the value displayed by the GPU-picking cell in the recorded run — confirm before re-running on another machine.
torch.cuda.set_device(0)

In [34]:
def neg_cos_sim(a, b):
    """
    Negated pairwise cosine similarity between the rows of ``a`` and ``b``.

    Inputs that are not tensors are converted with ``torch.tensor``; a 1-D input
    is treated as a single row.
    :return: Matrix with res[i][j] = -cos_sim(a[i], b[j])
    """
    def _as_rows(value):
        # Convert to a tensor if needed, then promote a lone vector to shape (1, dim).
        tensor = value if isinstance(value, torch.Tensor) else torch.tensor(value)
        return tensor.unsqueeze(0) if len(tensor.shape) == 1 else tensor

    unit_a = torch.nn.functional.normalize(_as_rows(a), p=2, dim=1)
    unit_b = torch.nn.functional.normalize(_as_rows(b), p=2, dim=1)
    # Dot products of L2-normalised rows are cosine similarities; flip the sign.
    similarity = torch.mm(unit_a, unit_b.transpose(0, 1))
    return torch.neg(similarity)

In [35]:
# Single list of sentences - Possible tens of thousands of sentences
# Flatten the entity-masked caption of every article of every record into one list.
sentences = []

for data in tqdm(train_data):
    sentences.extend(article['caption_modified'] for article in data['articles'])

# Mine caption pairs with the negated cosine score, keeping top_k=1 per sentence.
paraphrases = util.paraphrase_mining(
    model,
    sentences,
    top_k=1,
    score_function=neg_cos_sim,
)


100%|██████████| 41006/41006 [00:00<00:00, 430571.11it/s]


In [9]:
print(len(sentences))
print(len(paraphrases))

264768
435678


In [36]:
# Sentinel meaning "no partner recorded": one past the last valid index, so that
# `sentences[furthest_list[i]]` raises IndexError — which the case generators treat
# as "fall back to a random caption" — instead of reading the uninitialised memory
# that np.empty used to leave in every slot never assigned below.
NO_FURTHEST = len(sentences)
furthest_list = np.full((len(sentences),), NO_FURTHEST, dtype=int)
for entry in paraphrases:
    # Each entry is indexed as [score, index_a, index_b].
    index_a, index_b = entry[1], entry[2]
    # Keep the FIRST partner seen for an index rather than the last one.
    # NOTE(review): this assumes paraphrase_mining returns pairs sorted by decreasing
    # score, so the first hit is the best-scoring (least similar) one — confirm.
    if furthest_list[index_a] == NO_FURTHEST:
        furthest_list[index_a] = index_b
    # A pair is listed once, so record it for the second member as well.
    if furthest_list[index_b] == NO_FURTHEST:
        furthest_list[index_b] = index_a

In [37]:
# Reverse lookup: caption text -> position in `sentences`.
# When the same caption occurs more than once, the highest position wins.
sentences_dict = {text: position for position, text in enumerate(sentences)}

# Gen case functions

In [3]:
# Take 2 random correct captions
def gen_positive_case(data):
    """Build a pair from two different captions of the same image.

    Args:
        data: record with ``'img_local_path'`` and an ``'articles'`` list whose
            items carry a ``'caption'`` key.

    Returns:
        ``[img_local_path, [cap1], [cap2], [False]]``, or ``None`` when the record
        has fewer than two articles or the two sampled captions are identical.

    Note:
        Sampling uses the stdlib ``random`` module, so ``np.random.seed`` does not
        affect it; call ``random.seed`` for reproducible output.
    """
    articles = data['articles']
    # random.sample raises ValueError when asked for 2 items out of fewer than 2.
    if len(articles) < 2:
        return None
    caption_1_idx, caption_2_idx = random.sample(range(len(articles)), 2)
    cap1 = articles[caption_1_idx]['caption']
    cap2 = articles[caption_2_idx]['caption']
    # Distinct articles can still carry the same raw caption text.
    if cap1 == cap2:
        return None
    return [data['img_local_path'], [cap1], [cap2], [False]]

In [13]:
# Take 2 random false captions
def gen_negative_case_1(data, train_data):
    """Pair the image of ``data`` with two captions drawn from other records.

    Each caption is drawn independently: a record is sampled with ``np.random``
    until its ``'img_local_path'`` differs from that of ``data``, then one of its
    articles is sampled and its ``'caption_modified'`` is used.

    Returns:
        ``[img_local_path, [cap1], [cap2], [True]]``
    """
    def draw_foreign_caption():
        # Resample until the record belongs to a different image.
        candidate = train_data[np.random.randint(len(train_data))]
        while data['img_local_path'] == candidate['img_local_path']:
            candidate = train_data[np.random.randint(len(train_data))]
        pool = candidate['articles']
        return pool[np.random.randint(len(pool))]['caption_modified']

    first_caption = draw_foreign_caption()
    second_caption = draw_foreign_caption()
    return [data['img_local_path'], [first_caption], [second_caption], [True]]

In [4]:
# Take 1 random correct caption, 1 random false caption
def gen_negative_case_2(data, train_data):
    """Pair one caption of ``data`` with one caption taken from another record.

    A coin flip decides whether the record's own caption goes first or second.
    Both captions are read from the ``'caption'`` key, and the foreign record is
    resampled until its ``'img_local_path'`` differs from that of ``data``.

    Returns:
        ``[img_local_path, [cap1], [cap2], [True]]``
    """
    own_caption_first = np.random.rand() > 0.5

    # A random caption that belongs to this image.
    own_articles = data['articles']
    own_caption = own_articles[np.random.randint(len(own_articles))]['caption']

    # A random caption from a random record of a different image.
    foreign = train_data[np.random.randint(len(train_data))]
    while data['img_local_path'] == foreign['img_local_path']:
        foreign = train_data[np.random.randint(len(train_data))]
    foreign_caption = foreign['articles'][np.random.randint(len(foreign['articles']))]['caption']

    if own_caption_first:
        cap1, cap2 = own_caption, foreign_caption
    else:
        cap1, cap2 = foreign_caption, own_caption
    return [data['img_local_path'], [cap1], [cap2], [True]]

In [15]:
def gen_negative_case_3(data, train_data, furthest_list, sentences_dict, sentences, count):
    """Pair one caption of ``data`` with the caption recorded as furthest from it.

    Args:
        data: record providing the image path and the first caption.
        train_data: pool of records used for the random fallback.
        furthest_list: maps a sentence index to the index of its furthest sentence.
        sentences_dict: maps caption text to its index in ``sentences``.
        sentences: list of all modified captions.
        count: list that receives the caption index each time the fallback is used.

    Returns:
        ``[img_local_path, [cap1], [cap2], [True]]``

    Raises:
        KeyError: if the sampled caption is missing from ``sentences_dict``.
    """
    # pick 1 random correct caption
    cap1 = data['articles'][np.random.randint(len(data['articles']))]['caption_modified']
    # then find its furthest distance caption
    cap1_index = sentences_dict[cap1]
    try:
        furthest_index = int(furthest_list[cap1_index])
    except (IndexError, TypeError, ValueError):
        furthest_index = -1
    # Explicit range check instead of a bare `except`: a negative value would
    # otherwise index from the end of `sentences` and return an unrelated caption,
    # and a bare except would also swallow KeyboardInterrupt.
    if 0 <= furthest_index < len(sentences):
        cap2 = sentences[furthest_index]
    else:
        # No usable furthest caption: fall back to a random caption of another image.
        data_random = train_data[np.random.randint(len(train_data))]
        while data['img_local_path'] == data_random['img_local_path']:
            data_random = train_data[np.random.randint(len(train_data))]
        cap2 = data_random['articles'][np.random.randint(len(data_random['articles']))]['caption_modified']
        count.append(cap1_index)
    return [data['img_local_path'], [cap1], [cap2], [True]]

In [16]:
def gen_negative_case_4(data, train_data, furthest_list, sentences_dict, sentences, count):
    """Pair a caption from another image with the caption recorded as furthest from it.

    Args:
        data: record providing only the image path.
        train_data: pool of records the captions are drawn from.
        furthest_list: maps a sentence index to the index of its furthest sentence.
        sentences_dict: maps caption text to its index in ``sentences``.
        sentences: list of all modified captions.
        count: list that receives the caption index each time the fallback is used.

    Returns:
        ``[img_local_path, [cap1], [cap2], [True]]``

    Raises:
        KeyError: if the sampled caption is missing from ``sentences_dict``.
    """
    # pick 1 random false caption
    data_random = train_data[np.random.randint(len(train_data))]
    while data['img_local_path'] == data_random['img_local_path']:
        data_random = train_data[np.random.randint(len(train_data))]
    cap1 = data_random['articles'][np.random.randint(len(data_random['articles']))]['caption_modified']
    # then find its furthest distance caption
    cap1_index = sentences_dict[cap1]
    try:
        furthest_index = int(furthest_list[cap1_index])
    except (IndexError, TypeError, ValueError):
        furthest_index = -1
    # Explicit range check instead of a bare `except`: a negative value would
    # otherwise index from the end of `sentences` and return an unrelated caption,
    # and a bare except would also swallow KeyboardInterrupt.
    if 0 <= furthest_index < len(sentences):
        cap2 = sentences[furthest_index]
    else:
        # No usable furthest caption: fall back to a second random foreign caption.
        data_random = train_data[np.random.randint(len(train_data))]
        while data['img_local_path'] == data_random['img_local_path']:
            data_random = train_data[np.random.randint(len(train_data))]
        cap2 = data_random['articles'][np.random.randint(len(data_random['articles']))]['caption_modified']
        count.append(cap1_index)
    return [data['img_local_path'], [cap1], [cap2], [True]]

In [17]:
def gen_negative_case_5(data, train_data):
    """Take a positive caption pair of ``data`` and attach it to another record's image.

    Returns:
        ``[other_img_local_path, [cap1], [cap2], [False]]``, or ``None`` when
        ``gen_positive_case`` cannot produce a pair for ``data``.

    NOTE(review): the label stays ``[False]`` as produced by ``gen_positive_case``,
    whereas the other negative generators emit ``[True]`` — confirm this is intended.
    """
    result = gen_positive_case(data)
    # gen_positive_case may return None; assigning result[0] would raise TypeError.
    if result is None:
        return None
    # choose random image, resampling so the pair is not re-attached to its own
    # image (the same exclusion the other negative generators apply)
    data_random = train_data[np.random.randint(len(train_data))]
    while data['img_local_path'] == data_random['img_local_path']:
        data_random = train_data[np.random.randint(len(train_data))]
    result[0] = data_random['img_local_path']
    return result


In [18]:
def gen_negative_case_6(data, train_data):
    """Take a positive caption pair of ``data`` and attach it to another record's image.

    Returns:
        ``[other_img_local_path, [cap1], [cap2], [False]]``, or ``None`` when
        ``gen_positive_case`` cannot produce a pair for ``data``.

    NOTE(review): this generator follows the same approach as gen_negative_case_5;
    consider consolidating the two. The label also stays ``[False]`` as produced
    by ``gen_positive_case`` — confirm this is intended for a negative case.
    """
    result = gen_positive_case(data)
    # gen_positive_case may return None; assigning result[0] would raise TypeError.
    if result is None:
        return None
    # choose random image, resampling so the pair is not re-attached to its own
    # image (the same exclusion the other negative generators apply)
    data_random = train_data[np.random.randint(len(train_data))]
    while data['img_local_path'] == data_random['img_local_path']:
        data_random = train_data[np.random.randint(len(train_data))]
    result[0] = data_random['img_local_path']
    return result

In [19]:
from SentenceNegator import SentenceNegator
sn = SentenceNegator()

def gen_negative_sentence(data):
    """Pair a random modified caption of ``data`` with its negation from ``sn.negate``.

    Returns:
        ``[img_local_path, [caption], [negated_caption], [True]]``, or ``None``
        when the negator hands back the caption unchanged.
    """
    articles = data['articles']
    original = articles[np.random.randint(len(articles))]['caption_modified']
    negated = sn.negate(original)
    if negated != original:
        return [data['img_local_path'], [original], [negated], [True]]
    return None

In [17]:
# Seed BOTH random sources: the negative-case generators draw from np.random, while
# gen_positive_case samples with the stdlib `random` module, which np.random.seed
# does not affect. Without the second seed the positive pairs change on every run.
np.random.seed(42)
random.seed(42)
# train_data_sample = np.random.choice(train_data, size=int(len(train_data)*50/100))
# Not OOC cases
l = []           # generated rows: [image path, [caption 1], [caption 2], [label]]
count = []       # filled by gen_negative_case_3/4 when they fall back to a random caption
count_true = 0   # number of positive rows appended
count_negative = 0
count_case = [0, 0, 0]  # rows appended per negative case: [case 1, case 2, case 3]
dup = 0
cc = 0
for data in tqdm(train_data):
    # A positive pair needs at least two captions for the same image.
    if len(data['articles']) > 1:
        r = gen_positive_case(data)
        if r is not None:
            l.append(r)
            count_true += 1

    # 2 random false cap
    # if np.random.rand() > 0.5:
    #     l.append(gen_negative_case_1(data, train_data))
    #     count_case[0]+=1
    # # 1 true, 1 false
    # The threshold 0 makes this branch run for (practically) every record; the
    # rand() call is kept so the np.random stream stays the same as before.
    if np.random.rand() > 0:
        l.append(gen_negative_case_2(data, train_data))
        count_case[1] += 1
    # # 1 true, 1 false furthest
    # if np.random.rand() > 0.3:
    #     count_case[2]+=1
    #     l.append(gen_negative_case_3(data, train_data, furthest_list, sentences_dict, sentences, count))

    # 1 false, 1 furthest
    # if np.random.rand() > 0.5:
    #     l.append(gen_negative_case_4(data, train_data, furthest_list, sentences_dict, sentences, count))
    # 2 false from 1 other image

    # 2 false from 1 furthest image
    # 2 false from 1 other image

    # 2 false from 1 furthest image



100%|██████████| 41006/41006 [00:00<00:00, 65277.29it/s]


In [18]:
count_true,count_negative,count_case,len(l)

(15902, 0, [0, 41006, 0], 56908)

In [19]:
# Column layout read by the image-loading and Arrow-export cells below.
pair_columns = ["image", "caption_1", "caption_2", "label"]
dataframe = pd.DataFrame(l, columns=pair_columns)

In [19]:
# Alternative column naming for the same rows.
# NOTE(review): this cell rebinds `dataframe` and carries the same execution count
# as the cell above. The image-loading cell further down reads dataframe['image'],
# a column that does not exist under this naming — run only one of the two cells.
json_columns = ["img_local_path", "caption1", "caption2", "context_label"]
dataframe = pd.DataFrame(l, columns=json_columns)

In [20]:
dataframe

Unnamed: 0,image,caption_1,caption_2,label
0,val/0.jpg,"[Merlin Jackson, 75, a Zulu member, is a publi...",[The Aga Khan Music Initiative performs at the...,[True]
1,val/1.jpg,[Pictures / Shopping riots in Mexico],[Protesters block the entrance to Pemex gas st...,[False]
2,val/1.jpg,[A resident wearing mask and raincoat voluntee...,[Pictures / Shopping riots in Mexico],[True]
3,val/2.jpg,[The allegations have inspired an online campa...,"[Father Jerome, featured in our Dispatch from ...",[True]
4,val/3.jpg,[Armed police officers lean over the bonnet of...,"[Senator Mitch McConnell, the majority leader,...",[True]
...,...,...,...,...
56903,val/41006.jpg,[Sam Oosterhoff celebrates his victory in the ...,"[Raphia mango wood and rattan sideboard, £599,...",[True]
56904,val/41007.jpg,"[Beatrice Mtetwa, lawyerMtetwa is one of Zimba...",[Beatrice Mtetwa is one of Zimbabwe’s most res...,[False]
56905,val/41007.jpg,[Chile is advancing in its goals for reliance ...,[Beatrice Mtetwa is one of Zimbabwe’s most res...,[True]
56906,val/41008.jpg,[View from the lookout Jacha Kollo during the ...,"[Cable car over La Paz, Bolivia.]",[False]


In [20]:
# Write the pairs as JSON lines: one record (row) per line.
# NOTE(review): the file is called train.json although `train_data` was loaded from
# val_data.json — confirm the intended split before reusing this file.
dataframe.to_json('train.json', orient='records', lines=True)


In [41]:
dataframe[0:20]

Unnamed: 0,image,caption_1,caption_2,label
0,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,"[This photo taken DATE, shows apps for ORG, OR...","[The technical infrastructure of ORG, PRODUCT ...",[False]
1,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,"[This photo taken DATE, shows apps for ORG, OR...",[Children ride a manually operated PERSON whee...,[True]
2,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,[ORG actually owns ORG but has been planning a...,[‘Staunch Characters’ Battle Over Painting of ...,[True]
3,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,[Mr. PERSON wants to increase the utility of t...,[Mr. PERSON does not want to increase the util...,[True]
4,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,[Mr. PERSON wants to increase the utility of t...,[PERSON conducts CARDINAL of his healing cerem...,[True]
5,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,[Mr. PERSON wants to increase the utility of t...,[The mother was slaughtered and the newborn ba...,[True]
6,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,"[A statue depicting PERSON stands in GPE, GPE,...",[GPE had fiercely objected to its neighbor's u...,[False]
7,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,"[A statue depicting PERSON stands in GPE, GPE,...",[A statue depicting PERSON does not stand in G...,[True]
8,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,[Bloggers who quit jobs to travel the world en...,[PERSON listening to his ORG headphones near O...,[True]
9,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,[Participants in a rally in GPE in support of ...,"[A statue depicting PERSON stands in GPE, GPE,...",[True]


In [21]:
def load_image(path):
    """Return the raw bytes of the file at ``path``, or ``None`` if it cannot be read.

    Only failures to open or read the file are turned into ``None``: ``OSError``
    (missing file, permissions, ...), plus ``TypeError``/``ValueError`` for a path
    value that ``open`` rejects. Anything else (e.g. ``KeyboardInterrupt``)
    propagates instead of being silently swallowed by a bare ``except``.
    """
    try:
        with open(path, "rb") as fp:
            return fp.read()
    except (OSError, TypeError, ValueError):
        return None

In [22]:
# Register tqdm's `progress_apply` on pandas objects.
tqdm.pandas()

# Replace every image path with the raw file bytes (None where loading failed).
# This overwrites the 'image' column, so the cell is not meant to be run twice.
image_bytes = dataframe['image'].progress_apply(load_image)
dataframe['image'] = image_bytes

  from pandas import Panel
100%|██████████| 56908/56908 [00:18<00:00, 3059.45it/s]


In [30]:
# Keep only the rows whose image bytes were loaded (load_image yields None on failure).
has_image = dataframe['image'].notnull()
dataframe = dataframe[has_image]

# PyArrow

In [23]:
# Convert the pandas frame into a pyarrow Table for the Arrow file written below.
table = pa.Table.from_pandas(dataframe)


In [24]:
# Serialise `table` to an Arrow file named after the split.
split = 'val'
arrow_path = f"dataset/normal_cap/cosmos_{split}.arrow"
with pa.OSFile(arrow_path, "wb") as sink, pa.RecordBatchFileWriter(sink, table.schema) as writer:
    writer.write_table(table)
# Data 2: the train file is missing

# Val

# Test

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [None]:
def modify_caption_replace_entities(caption_text):
    """
        Utility function to replace the named entities found by spaCy in the caption
        with their entity labels (e.g. PERSON, GPE, ORG)
        Args:
            caption_text (str): Original caption with named entities
        Returns:
            caption_modified (str): Modified caption after replacing named entities
    """
    doc = nlp(caption_text)
    caption_modified = caption_text
    for ent in doc.ents:
        # Replaces only the first remaining occurrence of the entity text.
        # NOTE(review): the match is plain substring search, so it can hit the same
        # characters inside a longer word or an already inserted label — confirm
        # this is acceptable for the captions processed here.
        caption_modified = caption_modified.replace(ent.text, ent.label_, 1)
    return caption_modified

In [25]:
# One row per test record: [image path, [caption1], [caption2], [context_label == True]].
l_test = [
    [
        record['img_local_path'],
        [record['caption1']],
        [record['caption2']],
        [record['context_label'] == True],
    ]
    for record in tqdm(test_data)
]

100%|██████████| 1700/1700 [00:00<00:00, 482266.95it/s]


In [26]:
# Same column layout as the frame built from the generated pairs.
test_columns = ["image", "caption_1", "caption_2", "label"]
dataframe_test = pd.DataFrame(l_test, columns=test_columns)

In [27]:
dataframe_test

Unnamed: 0,image,caption_1,caption_2,label
0,test/0.jpg,[Julian Castro at his announcement in San Anto...,[Julian Castro at his announcement in San Anto...,[False]
1,test/1.jpg,[Supporters of Tanzania's ruling Chama Cha Map...,[A person sits on a truck as supporters of the...,[False]
2,test/2.jpg,"[53,000 dead people turned up on the state’s v...",[These social media posts did not link to a re...,[True]
3,test/3.jpg,"[Actor, musician, director and devoted followe...",[A shocking report about the former child acto...,[True]
4,test/4.jpg,[Men from the Maasai tribe perform a tradition...,"[And on the same day in Kenya's Narok county, ...",[False]
...,...,...,...,...
1695,test/1695.jpg,[President Obama trademarked the name 'Obamaca...,[There was no truth that Obama family millions...,[True]
1696,test/1696.jpg,[A photograph shows a soldier carrying a donke...,[Coronavirus meme featuring “WWII donkey” is n...,[True]
1697,test/1697.jpg,[Homeless people living on streets in Denver],[The State Capitol Building in Colorado],[False]
1698,test/1698.jpg,[The castle's esplanade was a perfect spot for...,[Picture shows an Edinburgh skier],[False]


In [28]:
# Register tqdm's `progress_apply` on pandas objects.
tqdm.pandas()

# Replace every test image path with the raw file bytes (None where loading failed).
test_image_bytes = dataframe_test['image'].progress_apply(load_image)
dataframe_test['image'] = test_image_bytes

  from pandas import Panel
100%|██████████| 1700/1700 [00:00<00:00, 1710.96it/s]


In [29]:
dataframe_test

Unnamed: 0,image,caption_1,caption_2,label
0,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,[Julian Castro at his announcement in San Anto...,[Julian Castro at his announcement in San Anto...,[False]
1,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,[Supporters of Tanzania's ruling Chama Cha Map...,[A person sits on a truck as supporters of the...,[False]
2,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,"[53,000 dead people turned up on the state’s v...",[These social media posts did not link to a re...,[True]
3,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,"[Actor, musician, director and devoted followe...",[A shocking report about the former child acto...,[True]
4,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,[Men from the Maasai tribe perform a tradition...,"[And on the same day in Kenya's Narok county, ...",[False]
...,...,...,...,...
1695,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,[President Obama trademarked the name 'Obamaca...,[There was no truth that Obama family millions...,[True]
1696,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,[A photograph shows a soldier carrying a donke...,[Coronavirus meme featuring “WWII donkey” is n...,[True]
1697,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,[Homeless people living on streets in Denver],[The State Capitol Building in Colorado],[False]
1698,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,[The castle's esplanade was a perfect spot for...,[Picture shows an Edinburgh skier],[False]


# PyArrow

In [30]:
# Convert the test frame to a pyarrow Table and serialise it to the test Arrow file.
table = pa.Table.from_pandas(dataframe_test)
split = 'test'
arrow_path = f"dataset/normal_cap/cosmos_{split}.arrow"
with pa.OSFile(arrow_path, "wb") as sink, pa.RecordBatchFileWriter(sink, table.schema) as writer:
    writer.write_table(table)