In [1]:
import os
import os.path as op
import json
import cv2
import base64
import numpy as np
from tqdm import tqdm

import numpy as np
import matplotlib.pyplot as plt

from PIL import Image, ImageDraw

from maskrcnn_benchmark.structures.tsv_file_ops import tsv_reader, tsv_writer
from maskrcnn_benchmark.structures.tsv_file_ops import generate_linelist_file
from maskrcnn_benchmark.structures.tsv_file_ops import generate_hw_file
from maskrcnn_benchmark.structures.tsv_file import TSVFile
from maskrcnn_benchmark.data.datasets.utils.image_ops import img_from_base64

In [2]:
# Dataset splits handled by every cell below.
splits = ['train', 'val', 'test']

# Source dataset root and the folder that holds its images.
orig_root = '../../../../datasets/coco_2014'
img_root = op.join(orig_root, 'images')

# Destination folder for the exported TSV files.
exp_root = '../../../../datasets/coco_my_mini_nn'

In [5]:
# Peek at one record to see which fields locate its image on disk.
# NOTE(review): `karpathy` is not defined in any cell visible here — it must be loaded before this runs.
probe = karpathy['images'][555]
for field in ('filepath', 'filename', 'cocoid'):
    print(probe[field])

KeyError: 'filepath'

In [5]:
# Map each image id (as a string) to its path relative to the image root.
karpathy_dict = {}
for record in karpathy['images']:
    karpathy_dict[str(record['cocoid'])] = op.join(record['filepath'], record['filename'])

karpathy_dict['382088']

'val2014/COCO_val2014_000000382088.jpg'

In [6]:
# Every record must end up under its own key; fewer keys than records means duplicate cocoids.
n_records, n_keys = len(karpathy['images']), len(karpathy_dict)
assert n_records == n_keys, f'duplicate cocoid values: {n_records} records but {n_keys} unique ids'
n_records, n_keys

(123287, 123287)

In [3]:
# Folder holding one `<split>_caption.json` file per split.
cap_source = '../../../../datasets/coco/coco_oscar_preexacted_vvl'

# split name -> parsed JSON content of that split's caption file
captions = {}
for split_name in splits:
    caption_path = op.join(cap_source, f'{split_name}_caption.json')
    with open(caption_path) as caption_file:
        captions[split_name] = json.load(caption_file)

In [4]:
# Show the first caption record of the test split.
first_test_record = captions['test'][0]
print(first_test_record)

{'image_id': '179765', 'id': 38, 'caption': 'A black Honda motorcycle parked in front of a garage.'}


In [10]:
# split -> {image id -> image path relative to the image root}
img_list = {}

for split in splits:
    # Several captions share one image, so the dict keeps a single entry per image id
    # (in first-seen order). The stored path depends only on the image id, so repeated
    # ids can never disagree and no consistency check is needed.
    img_list[split] = {
        sample['image_id']: karpathy_dict[sample['image_id']]
        for sample in tqdm(captions[split])
    }

print(list(img_list['train'])[0], captions['train'][0])  # sample
print([len(img_list[split]) for split in splits])  # train, val, test

100%|██████████| 566747/566747 [00:00<00:00, 1985489.31it/s]
100%|██████████| 25010/25010 [00:00<00:00, 2013620.18it/s]
100%|██████████| 25010/25010 [00:00<00:00, 2010301.51it/s]

318556 {'image_id': '318556', 'id': 48, 'caption': 'A very clean and well decorated empty bathroom'}
[113287, 5000, 5000]





In [39]:
# Cap on the number of images exported per split; set to None to export every image.
max_images_per_split = 50

for split in splits[::-1]:
    rows = []     # (image id, base64-encoded JPEG) per image
    rows_hw = []  # [image id, JSON string with width/height] per image

    for n_done, (img_key, img_relpath) in enumerate(tqdm(img_list[split].items()), start=1):
        img_path = op.join(img_root, img_relpath)
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread reports a missing or undecodable file by returning None
            # instead of raising, so fail here with the offending path.
            raise FileNotFoundError(f'Could not read image: {img_path}')

        # Re-encode the decoded image as JPEG and keep it as base64.
        img_encoded_str = base64.b64encode(cv2.imencode('.jpg', img)[1])
        rows.append((img_key, img_encoded_str))

        height, width = img.shape[:2]
        rows_hw.append([img_key, json.dumps([{'width': width, 'height': height}])])

        if max_images_per_split is not None and n_done >= max_images_per_split:
            break

    exp_encoded_img_file = op.join(exp_root, f'{split}.img.tsv')
    exp_hw_file = op.join(exp_root, f'{split}.hw.tsv')
    tsv_writer(rows, exp_encoded_img_file)
    tsv_writer(rows_hw, exp_hw_file)

  1%|          | 49/5000 [00:01<02:51, 28.82it/s]
  1%|          | 49/5000 [00:01<02:19, 35.52it/s]
  0%|          | 49/113287 [00:00<34:19, 54.98it/s]


In [5]:
# Language codes of the translated caption files and the folder that holds them.
langs = ['EN', 'RU', 'FR', 'SP']
lang_caps_dir = '../../../../datasets/coco'

# language code -> list stored under the file's 'annotations' key
lang_captions = {}

for lang in langs:
    # The translated captions contain non-ASCII text, so decode explicitly as UTF-8
    # rather than relying on the platform's default encoding.
    with open(op.join(lang_caps_dir, f'COCO_Final_{lang}.json'), encoding='utf-8') as f:
        lang_captions[lang] = json.load(f)['annotations']

In [6]:
# Display the last English and Russian entries side by side.
tuple(lang_captions[code][-1] for code in ('EN', 'RU'))

({'file_name': 'COCO_val2014_000000047720.jpg',
  'caption': 'a cup of coffee sits next to a panini sandwich on a counter'},
 {'file_name': 'COCO_val2014_000000047720.jpg',
  'caption': 'чашка кофе стоит рядом с бутербродом с панини на прилавке'})

In [7]:
# image id (string, no zero padding) -> list of {language code: caption} dicts
id_to_captions = dict()

# Translations are paired with the English entries purely by list position,
# so every language list must be as long as the English one.
n_en = len(lang_captions['EN'])
mismatched = [code for code in langs if len(lang_captions[code]) != n_en]
assert not mismatched, f'caption lists not aligned with EN ({n_en} entries): {mismatched}'

for i, el in enumerate(lang_captions['EN']):
    # e.g. 'COCO_val2014_000000047720.jpg' -> '47720': take the numeric suffix of the
    # file stem and drop its zero padding. Parsing the suffix does not depend on the
    # length of the split prefix or of the file extension.
    stem = op.splitext(el['file_name'])[0]
    cap_image_id = str(int(stem.rsplit('_', 1)[-1]))
    cap_translations = {lang: lang_captions[lang][i]['caption'] for lang in langs}

    id_to_captions.setdefault(cap_image_id, []).append(cap_translations)

In [8]:
# Show the first caption record of the test split.
first_test_record = captions['test'][0]
print(first_test_record)

{'image_id': '179765', 'id': 38, 'caption': 'A black Honda motorcycle parked in front of a garage.'}


In [23]:
def clean_cap(s):
    """Normalize a caption so that two spellings of the same sentence compare equal.

    Surrounding whitespace is stripped, then every single quote, double quote,
    newline and backslash is removed. Only the text before the first '.' is
    kept, and the result is lower-cased.
    """
    text = s.strip()
    for unwanted in ("'", '"', '\n', '\\'):
        text = text.replace(unwanted, '')
    before_first_dot, _, _ = text.partition('.')
    return before_first_dot.lower()

# split -> {language code -> list of caption records with the caption text in that language}
translated_caps = {}

for split in splits:
    split_caps = {lang: [] for lang in langs}

    no_ids = 0    # captions whose image id has no entry in id_to_captions
    no_equal = 0  # captions whose cleaned text matches none of that image's English captions

    for el in tqdm(captions[split]):
        # `cap_id` rather than `id`, so the builtin is not shadowed for the rest of the session.
        image_id, cap_id = el['image_id'], el['id']
        caption = clean_cap(el['caption'])

        if image_id not in id_to_captions:
            no_ids += 1
            continue

        # First translation set whose English text equals this caption after cleaning;
        # found once and reused for every language.
        match = next(
            (trans for trans in id_to_captions[image_id] if clean_cap(trans['EN']) == caption),
            None,
        )
        if match is None:
            no_equal += 1
            continue

        for lang in langs:
            split_caps[lang].append({'image_id': image_id, 'id': cap_id, 'caption': match[lang]})

    translated_caps[split] = split_caps
    print(no_ids, no_equal)

100%|██████████| 566747/566747 [00:03<00:00, 147319.92it/s]
100%|██████████| 25010/25010 [00:00<00:00, 142014.64it/s]
  0%|          | 0/25010 [00:00<?, ?it/s]

65 91
0 1


100%|██████████| 25010/25010 [00:00<00:00, 141083.51it/s]

5 1





In [25]:
# Spot-check one validation caption against its Russian translation.
# Captions without a match were dropped while building translated_caps, so list
# positions can drift apart; pair the two records by caption id instead of by index.
i = 14
source_record = captions['val'][i]
print(source_record)
print(next((c for c in translated_caps['val']['RU'] if c['id'] == source_record['id']), None))

{'image_id': '238836', 'id': 9264, 'caption': 'The car is at the intersection while the sun is setting.'}
{'image_id': '238836', 'id': 9264, 'caption': 'Автомобиль стоит на перекрестке, пока садится солнце.'}


In [30]:
for lang in langs:
    # Per-language output folder. It gets its own name so that the `exp_root`
    # export path set in the configuration cell is not overwritten.
    lang_out_dir = op.join(lang_caps_dir, lang)
    os.makedirs(lang_out_dir, exist_ok=True)

    for split in tqdm(splits):
        split_captions = translated_caps[split][lang]

        # Generate captions in COCO format: one `images` entry is emitted per caption,
        # with the image id used as both `id` and `file_name`.
        images = [{'id': cap['image_id'], 'file_name': cap['image_id']} for cap in split_captions]
        out_captions_coco_fmt = {'annotations': split_captions, 'images': images, 'type': 'captions', 'info': 'dummy', 'licenses': 'dummy'}

        # Save JSON. ensure_ascii=False writes non-ASCII characters verbatim, so the
        # file encoding is pinned to UTF-8 instead of the platform default.
        with open(op.join(lang_out_dir, f'{split}_caption.json'), 'w', encoding='utf-8') as fp:
            json.dump(split_captions, fp, ensure_ascii=False)

        with open(op.join(lang_out_dir, f'{split}_caption_coco_format.json'), 'w', encoding='utf-8') as fp:
            json.dump(out_captions_coco_fmt, fp, ensure_ascii=False)

print('Done')

100%|██████████| 3/3 [00:06<00:00,  2.29s/it]
100%|██████████| 3/3 [00:07<00:00,  2.36s/it]
100%|██████████| 3/3 [00:07<00:00,  2.37s/it]
100%|██████████| 3/3 [00:07<00:00,  2.35s/it]

Done



