In [1]:
import json
import sys
import random
from tqdm import tqdm
from collections import defaultdict
import pyarrow as pa
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer, util
import torch

# Import data

In [13]:
%%time
root = '.'

# train_data = list(
#     map(json.loads, open(f"{root}/cosmos/train_data.json").readlines())
# )
test_data = list(
    map(json.loads, open(f"{root}/cosmos/test_data.json").readlines())
)
train_data = list(map(json.loads, open(f"{root}/cosmos/val_data.json").readlines()))

CPU times: user 3.01 s, sys: 89.8 ms, total: 3.1 s
Wall time: 3.11 s


In [4]:
torch.cuda.set_device(0)

# Gen case functions

In [12]:
# Take 2 random correct caption
def gen_positive_case(data):
    caption_1_idx, caption_2_idx = random.sample(range(0,len(data['articles'])),2)
    cap1 = data['articles'][caption_1_idx]['caption_modified']
    cap2 = data['articles'][caption_2_idx]['caption_modified']
    if cap1 == cap2:
        return None
    return [data['img_local_path'],[cap1],[cap2],[False]]

In [13]:
# Take 2 random false caption
def gen_negative_case_1(data, train_data):
    data_random1 = train_data[np.random.randint(len(train_data))]
    while data['img_local_path'] == data_random1['img_local_path']:
        data_random1 = train_data[np.random.randint(len(train_data))]
    cap1 = data_random1['articles'][np.random.randint(len(data_random1['articles']))]['caption_modified']
    data_random2 = train_data[np.random.randint(len(train_data))]
    while data['img_local_path'] == data_random2['img_local_path']:
        data_random2 = train_data[np.random.randint(len(train_data))]
    cap2 = data_random2['articles'][np.random.randint(len(data_random2['articles']))]['caption_modified']
    return [data['img_local_path'],[cap1],[cap2],[True]]

In [14]:
# Take 1 random correct caption, 1 random false caption
def gen_negative_case_2(data, train_data):
     # Pick 1st correct caption
    if np.random.rand()>0.5:
        cap1 = data['articles'][np.random.randint(len(data['articles']))]['caption_modified']
        # Pick a random article then pick its first caption
        data_random = train_data[np.random.randint(len(train_data))]
        while data['img_local_path'] == data_random['img_local_path']:
            data_random = train_data[np.random.randint(len(train_data))]
        cap2 = data_random['articles'][np.random.randint(len(data_random['articles']))]['caption_modified']
    else:
        cap2 = data['articles'][np.random.randint(len(data['articles']))]['caption_modified']
        # Pick a random article then pick its first caption
        data_random = train_data[np.random.randint(len(train_data))]
        while data['img_local_path'] == data_random['img_local_path']:
            data_random = train_data[np.random.randint(len(train_data))]
        cap1 = data_random['articles'][np.random.randint(len(data_random['articles']))]['caption_modified']
    return [data['img_local_path'],[cap1],[cap2],[True]]

In [15]:
def gen_negative_case_3(data, train_data, furthest_list, sentences_dict, sentences, count):
    # pick 1 random correct caption
    cap1 = data['articles'][np.random.randint(len(data['articles']))]['caption_modified']
    # then find its furthest distance caption
    cap1_index = sentences_dict[cap1]
    try:
        cap2 = sentences[furthest_list[cap1_index]]
    except:
        data_random = train_data[np.random.randint(len(train_data))]
        while data['img_local_path'] == data_random['img_local_path']:
            data_random = train_data[np.random.randint(len(train_data))]
        cap2 = data_random['articles'][np.random.randint(len(data_random['articles']))]['caption_modified']
        count.append(cap1_index)
    return [data['img_local_path'],[cap1],[cap2],[True]]

In [16]:
def gen_negative_case_4(data,train_data, furthest_list, sentences_dict, sentences, count):
    # pick 1 random false caption
    data_random = train_data[np.random.randint(len(train_data))]
    while data['img_local_path'] == data_random['img_local_path']:
        data_random = train_data[np.random.randint(len(train_data))]
    cap1 = data_random['articles'][np.random.randint(len(data_random['articles']))]['caption_modified']
    # then find its furthest distance caption
    cap1_index = sentences_dict[cap1]
    try:
        cap2 = sentences[furthest_list[cap1_index]]
    except:
        data_random = train_data[np.random.randint(len(train_data))]
        while data['img_local_path'] == data_random['img_local_path']:
            data_random = train_data[np.random.randint(len(train_data))]
        cap2 = data_random['articles'][np.random.randint(len(data_random['articles']))]['caption_modified']
        count.append(cap1_index)
    return [data['img_local_path'],[cap1],[cap2],[True]]

In [17]:
def gen_negative_case_5(data, train_data):
      # choose random image
    result = gen_positive_case(data)
    result[0] = train_data[np.random.randint(len(train_data))]['img_local_path']
    return result


In [18]:
def gen_negative_case_6(data, train_data):
      # choose random image
    result = gen_positive_case(data)
    result[0] = train_data[np.random.randint(len(train_data))]['img_local_path']
    return result

In [19]:
from SentenceNegator import SentenceNegator
sn = SentenceNegator()

def gen_negative_sentence(data):
    cap1 = data['articles'][np.random.randint(len(data['articles']))]['caption_modified']
    cap2 = sn.negate(cap1)
    if cap2 != cap1:
        return [data['img_local_path'],[cap1],[cap2],[True]]
    else:
        return None

In [14]:
np.random.seed(42)
# train_data_sample = np.random.choice(train_data, size=int(len(train_data)*50/100))
# Not OOC cases
l = []

for data in tqdm(train_data):
    # articles = [article['caption_modified'] for article in data['articles']]
    # l.append([data['img_local_path'],articles,data['img_local_path'],'train'])
    for article in data['articles']:
        l.append([data['img_local_path'],article['caption']])

100%|██████████| 41006/41006 [00:00<00:00, 528677.96it/s]


In [15]:
dataframe = pd.DataFrame(
    l, columns=["image", "caption"],
)

In [16]:
dataframe.to_json('cosmos_caption_val.json', orient='records')

In [20]:
dataframe.to_json('train.json', orient='records', lines=True)


In [20]:
def load_image(path):
   try:
      with open(path, "rb") as fp:
        return fp.read()
   except:
      return None

In [31]:
tqdm.pandas()

dataframe['image'] = dataframe['image'].progress_apply(lambda x: load_image(x))

  from pandas import Panel
100%|██████████| 161754/161754 [00:17<00:00, 8995.33it/s] 


In [32]:
dataframe = dataframe[dataframe.image.notnull()]

In [33]:
dataframe_train = dataframe.copy()

In [38]:
dataframe_full = pd.concat([dataframe_train,dataframe_val])

In [46]:
np.random.seed(42)
dataframe_shuffled = dataframe_full.sample(frac=1).copy()

In [52]:
df_train = dataframe_shuffled[:141932]
df_val = dataframe_shuffled[141932:172346]
df_test = dataframe_shuffled[172346:]


In [53]:
len(df_train),len(df_val),len(df_test)

(141932, 30414, 30414)

In [54]:
df_train['split']='train'
df_val['split']='val'
df_test['split']='test'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['split']='train'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_val['split']='val'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['split']='test'


# PyArrow

In [56]:
table = pa.Table.from_pandas(df_train)
split = 'train'
with pa.OSFile(f"dataset/captioning/cosmos_{split}.arrow", "wb") as sink:
    with pa.RecordBatchFileWriter(sink, table.schema) as writer:
        writer.write_table(table)

table = pa.Table.from_pandas(df_val)
split = 'val'
with pa.OSFile(f"dataset/captioning/cosmos_{split}.arrow", "wb") as sink:
    with pa.RecordBatchFileWriter(sink, table.schema) as writer:
        writer.write_table(table)

table = pa.Table.from_pandas(df_test)
split = 'test'
with pa.OSFile(f"dataset/captioning/cosmos_{split}.arrow", "wb") as sink:
    with pa.RecordBatchFileWriter(sink, table.schema) as writer:
        writer.write_table(table)

# Val

# Test

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [None]:
def modify_caption_replace_entities(caption_text):
    """
        Utility function to replace named entities in the caption with their corresponding hypernyms
        Args:
            caption_text (str): Original caption with named entities
        Returns:
            caption_modified (str): Modified caption after replacing named entities
    """
    doc = nlp(caption_text)
    caption_modified = caption_text
    caption_entity_list = []
    for ent in doc.ents:
        caption_entity_list.append((ent.text, ent.label_))
        caption_modified = caption_modified.replace(ent.text, ent.label_, 1)
    return caption_modified

In [72]:
l_test  = []
for data in tqdm(test_data):
    cap1 = modify_caption_replace_entities(data['caption1'])
    cap2 = modify_caption_replace_entities(data['caption2'])
    l_test.append([data['img_local_path'],[cap1],[cap2],[data['context_label']==True]])

100%|██████████| 1700/1700 [00:19<00:00, 88.33it/s]


In [73]:
dataframe_test = pd.DataFrame(
    l_test, columns=["image", "caption_1", "caption_2", "label"],
)

In [74]:
dataframe_test

Unnamed: 0,image,caption_1,caption_2,label
0,test/0.jpg,"[PERSON at his announcement in GPE, GPE, on DA...","[PERSON at his announcement in GPE, GPE, on DA...",[False]
1,test/1.jpg,[Supporters of GPE's ruling ORG party come out...,[A person sits on a truck as supporters of the...,[False]
2,test/2.jpg,[CARDINAL dead people turned up on the state’s...,[These social media posts did not link to a re...,[True]
3,test/3.jpg,"[Actor, musician, director and devoted followe...",[A shocking report about the former child acto...,[True]
4,test/4.jpg,[Men from the LOC tribe perform a traditional ...,"[And on DATE in GPE's Narok county, young PERS...",[False]
...,...,...,...,...
1695,test/1695.jpg,[President PERSON trademarked the name 'WORK_O...,[There was no truth that PERSON family MONEY w...,[True]
1696,test/1696.jpg,[A photograph shows a soldier carrying a donke...,[Coronavirus meme featuring “EVENT donkey” is ...,[True]
1697,test/1697.jpg,[Homeless people living on streets in GPE],[ORG in GPE],[False]
1698,test/1698.jpg,[The castle's esplanade was a perfect spot for...,[Picture shows an ORG skier],[False]


In [75]:
tqdm.pandas()

dataframe_test['image'] = dataframe_test['image'].progress_apply(lambda x: load_image(x))

  from pandas import Panel
100%|██████████| 1700/1700 [00:00<00:00, 9703.16it/s]


In [76]:
dataframe_test

Unnamed: 0,image,caption_1,caption_2,label
0,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,"[PERSON at his announcement in GPE, GPE, on DA...","[PERSON at his announcement in GPE, GPE, on DA...",[False]
1,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,[Supporters of GPE's ruling ORG party come out...,[A person sits on a truck as supporters of the...,[False]
2,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,[CARDINAL dead people turned up on the state’s...,[These social media posts did not link to a re...,[True]
3,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,"[Actor, musician, director and devoted followe...",[A shocking report about the former child acto...,[True]
4,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,[Men from the LOC tribe perform a traditional ...,"[And on DATE in GPE's Narok county, young PERS...",[False]
...,...,...,...,...
1695,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,[President PERSON trademarked the name 'WORK_O...,[There was no truth that PERSON family MONEY w...,[True]
1696,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,[A photograph shows a soldier carrying a donke...,[Coronavirus meme featuring “EVENT donkey” is ...,[True]
1697,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,[Homeless people living on streets in GPE],[ORG in GPE],[False]
1698,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,[The castle's esplanade was a perfect spot for...,[Picture shows an ORG skier],[False]


# PyArrow

In [77]:
table = pa.Table.from_pandas(dataframe_test)
split = 'test'
with pa.OSFile(f"dataset_50/cosmos_{split}.arrow", "wb") as sink:
    with pa.RecordBatchFileWriter(sink, table.schema) as writer:
        writer.write_table(table)