In [1]:
import json
import os
import os.path as op
import csv

from tqdm import tqdm

In [2]:
# data_source: directory holding the original `{split}_caption.json` files and
#              the COCO-format validation file that are read below.
# data_export: directory holding the `{split}.label.tsv` files that define the
#              target splits; the regrouped caption files are written here.
data_source, data_export = (
    '../datasets/coco_caption',
    '../datasets/coco_kar_feats',
)

In [3]:
# Alphabetically sorted names of every entry in the source directory.
sorted(entry.name for entry in os.scandir(data_source))

['coco-train-words.p',
 'test.feature.lineidx',
 'test.feature.tsv',
 'test.label.lineidx',
 'test.label.tsv',
 'test.yaml',
 'test_caption.json',
 'test_caption_coco_format.json',
 'train.feature.lineidx',
 'train.feature.tsv',
 'train.label.lineidx',
 'train.label.tsv',
 'train.yaml',
 'train_caption.json',
 'val.feature.lineidx',
 'val.feature.tsv',
 'val.label.lineidx',
 'val.label.tsv',
 'val.yaml',
 'val_caption.json',
 'val_caption_coco_format.json']

In [4]:
splits = ['train', 'val', 'test']


def read_split_ids(tsv_path):
    """Return the set of first-column values of a tab-separated file.

    Blank rows are skipped, so an empty line in the file (for example a
    trailing one) does not raise ``IndexError``.
    """
    # newline='' is what the csv module asks for when it is given a file
    # object. The explicit encoding keeps the read independent of the locale;
    # this assumes the TSV is UTF-8 (or plain ASCII) -- TODO confirm.
    with open(tsv_path, newline='', encoding='utf-8') as tsv_file:
        return {row[0] for row in csv.reader(tsv_file, delimiter='\t') if row}


# Image ids per split, taken from the first column of `{split}.label.tsv`
# in the export directory.
coco_ids = {
    split: read_split_ids(op.join(data_export, f'{split}.label.tsv'))
    for split in splits
}

In [5]:
# Peek at five ids of the validation split. The ids are stored in a set, which
# has no defined order, so which five are shown is arbitrary.
[*coco_ids['val']][:5]

['313420', '126216', '506942', '373255', '203081']

In [6]:
# Number of distinct ids per split, in the order train, val, test.
print(list(map(len, (coco_ids[name] for name in splits))))

[82783, 5000, 5000]


In [7]:
# Source caption records for each split, keyed by split name.
captions = {}

for split in splits:
    # Decode explicitly as UTF-8 instead of relying on the locale's default
    # encoding, which would break on non-ASCII captions on some platforms.
    with open(op.join(data_source, f'{split}_caption.json'), encoding='utf-8') as f:
        captions[split] = json.load(f)


In [8]:
# Inspect the first record of the source test captions to see its fields.
captions['test'][0]

{'image_id': '179765',
 'id': 38,
 'caption': 'A black Honda motorcycle parked in front of a garage.'}

In [9]:
# Caption count per split, integer-divided by 5, in the order train, val, test.
# NOTE(review): 5 is presumably the nominal number of captions per image, so
# this approximates the number of images -- confirm.
print(list(map(lambda name: len(captions[name]) // 5, splits)))

[113349, 5002, 5002]


In [10]:
# Pool the captions of all source splits, then give each target split every
# caption whose image id is in that split's id set (original order is kept).
# Captions whose image id belongs to no target split are left out.
captions_all = captions['train'] + captions['val'] + captions['test']
new_captions = {
    split: [cap for cap in captions_all if cap['image_id'] in coco_ids[split]]
    for split in splits
}

In [11]:
# Inspect the first caption record assigned to the new validation split.
new_captions['val'][0]

{'image_id': '106140',
 'id': 98,
 'caption': 'A large passenger airplane flying through the air.'}

In [12]:
# Write each regrouped caption list into the export directory as
# `{split}_caption.json`.
for split_name in splits:
    out_path = op.join(data_export, f'{split_name}_caption.json')
    with open(out_path, 'w') as out_file:
        json.dump(new_captions[split_name], out_file)

In [13]:
# Load the existing COCO-format validation file from the source directory.
# The encoding is given explicitly so the read does not depend on the locale.
with open(op.join(data_source, 'val_caption_coco_format.json'), encoding='utf-8') as f:
    val_caps_cf = json.load(f)

# Show the top-level keys that the rebuilt file has to provide.
val_caps_cf.keys()

dict_keys(['annotations', 'images', 'type', 'info', 'licenses'])

In [14]:
# One image entry per validation id. The ids are sorted because iterating a set
# of strings directly gives an order that can change from one interpreter run
# to the next, which would make the exported file differ between runs.
images = [{'id': img_id, 'file_name': img_id} for img_id in sorted(coco_ids['val'])]

# Same top-level keys as the source COCO-format file: annotations and images
# are rebuilt for the new validation split, the other entries are copied over.
new_val_caps_cf = {
    'annotations': new_captions['val'],
    'images': images,
    'type': val_caps_cf['type'],
    'info': val_caps_cf['info'],
    'licenses': val_caps_cf['licenses'],
}

In [15]:
# new_val_caps_cf['images'][0]

In [16]:
# Save the rebuilt COCO-format validation file into the export directory.
export_path = op.join(data_export, 'val_caption_coco_format.json')
with open(export_path, 'w') as out_file:
    json.dump(new_val_caps_cf, out_file)