In [1]:
import os
import os.path as op
import json
import cv2
import base64
import random
import numpy as np
from tqdm import tqdm

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from PIL import Image, ImageDraw

from maskrcnn_benchmark.structures.tsv_file import TSVFile

In [2]:
# Root of the original dataset: its dot-free sub-folders are scanned for images
# below, and 'Final_EN.json' (the captions) is read from here as well.
orig_root = '/mnt/Toshiba2TB/dataset_ImageCaption'
# Output directory for the generated '{split}.img.tsv' / '{split}.hw.tsv' files.
# NOTE(review): exp_root is re-assigned to a different path further down.
exp_root = '../../../../datasets_proc/big_nn_nms1'

In [3]:
# Map every image file name to its full path on disk.
img_paths = {}

# Dataset folders are the entries of orig_root that have no dot in their name.
folder_list = [entry for entry in os.listdir(orig_root) if '.' not in entry]

# Only the first five folders of the listing are scanned.
for folder in folder_list[:5]:
    print(folder, flush=True)
    folder_dir = op.join(orig_root, folder)
    img_list = os.listdir(folder_dir)
    for img_filename in tqdm(img_list):
        # A file name seen in an earlier folder is overwritten by the later one.
        img_paths[img_filename] = op.join(folder_dir, img_filename)


CC_DATASET1


100%|██████████| 449055/449055 [00:00<00:00, 854793.42it/s]

CC_DATASET2



100%|██████████| 436727/436727 [00:00<00:00, 815164.81it/s]

CC_DATASET3



100%|██████████| 369671/369671 [00:00<00:00, 832508.74it/s]

CC_DATASET4



100%|██████████| 385483/385483 [00:00<00:00, 729705.98it/s]

CC_val



100%|██████████| 9988/9988 [00:00<00:00, 307582.29it/s]


In [4]:
# sample: path stored for the first file name in img_paths
first_img_name = list(img_paths)[0]
img_paths[first_img_name]

'/mnt/Toshiba2TB/dataset_ImageCaption/CC_DATASET1/0000006ee283bee151d361963b2907c1.jpg'

In [5]:
# Load the caption file and keep only its 'annotations' list.
captions_json_path = op.join(orig_root, 'Final_EN.json')
with open(captions_json_path) as f:
    dataset_captions = json.load(f)['annotations']

len(dataset_captions)

2620065

In [6]:
# Distinct image file names referenced by at least one caption.
img_keys = {annotation['file_name'] for annotation in dataset_captions}
len(img_keys)

1762097

In [7]:
# Number of captions attached to each image file name.
captions_by_num = {}

for caption in tqdm(dataset_captions):
    file_name = caption['file_name']
    captions_by_num[file_name] = captions_by_num.get(file_name, 0) + 1

100%|██████████| 2620065/2620065 [00:00<00:00, 2765091.24it/s]


In [8]:
# calculate counts of captions per image over dataset
# (maps "captions per image" -> "number of images with that many captions")

counts = {}

for count in captions_by_num.values():
    counts[count] = counts.get(count, 0) + 1

# The dict keys are unique, so sorting the (count, n_images) pairs orders them by count.
print(sorted(counts.items()))

# remove images with more than 16 captions

img_keys_filtered = [name for name in captions_by_num if captions_by_num[name] <= 16]
len(img_keys_filtered)

[(1, 1536397), (2, 7550), (3, 7552), (4, 32013), (5, 174719), (6, 329), (7, 45), (8, 118), (9, 266), (10, 579), (11, 648), (12, 948), (13, 827), (14, 11), (15, 24), (16, 36), (17, 2), (19, 2), (20, 4), (21, 2), (25, 1), (26, 3), (29, 1), (31, 1), (35, 2), (36, 1), (39, 1), (64, 1), (99, 1), (104, 1), (105, 3), (109, 1), (111, 1), (117, 1), (118, 1), (120, 1), (128, 1), (165, 1), (167, 1), (384, 1)]


1762062

In [9]:
# filter keys: keep only captioned images that also exist on disk

json_keys = set(img_keys_filtered)
file_keys = set(img_paths)

img_keys_filtered = list(json_keys & file_keys)

In [10]:
# Split dataset to train/val/test

# The keys were collected through a set of strings, whose iteration order can
# change between interpreter runs, and the shuffle used the unseeded global RNG.
# Sorting first and shuffling with a fixed seed makes the split reproducible
# across kernel restarts (and makes re-running this cell idempotent).
SPLIT_SEED = 42
img_keys_filtered = sorted(img_keys_filtered)
random.Random(SPLIT_SEED).shuffle(img_keys_filtered)

# 1% of the images goes to 'test', 1% to 'val', the rest to 'train'.
split_cnt = len(img_keys_filtered) // 100
print(split_cnt)

splits = {
    'train': img_keys_filtered[split_cnt * 2:],
    'val': img_keys_filtered[split_cnt: split_cnt * 2],
    'test': img_keys_filtered[: split_cnt],
}

15406


In [11]:
# Shrink image if big

def scale_img(img):
    """Return ``img`` shrunk to fit inside a 1000x1000 box, keeping aspect ratio.

    ``img`` must expose ``.shape`` whose first two entries are height and width.
    Images that already fit are returned unchanged (the same object); larger
    ones are resized with ``cv2.INTER_AREA`` using one factor for both axes.
    """
    limit_h = 1000
    limit_w = 1000
    img_h, img_w = img.shape[:2]

    # Guard clause: nothing to do when both sides are within the limits.
    if img_h <= limit_h and img_w <= limit_w:
        return img

    # The smaller of the two ratios guarantees that both sides fit.
    ratio = min(limit_h / float(img_h), limit_w / float(img_w))
    return cv2.resize(img, None, fx=ratio, fy=ratio, interpolation=cv2.INTER_AREA)

In [13]:
def tsv_writer(values, tsv_file):
    """Write rows to ``tsv_file`` together with a companion ``.lineidx`` index.

    Args:
        values: iterable of rows; each row is an iterable of fields. ``bytes``
            fields are decoded as UTF-8, every other field goes through ``str``.
        tsv_file: destination path. The index is written next to it with the
            extension replaced by ``.lineidx``.

    Each row becomes one tab-separated, newline-terminated line. The index file
    holds, one per line, the offset at which the matching row starts.

    The TSV is written in binary mode and offsets are counted on the encoded
    bytes: counting characters of the text line (as was done before) yields
    wrong offsets as soon as a field contains a non-ASCII character.

    Both files are first written to ``*.tmp`` and moved into place only after
    all rows were written.

    Raises:
        AssertionError: if ``values`` or any row is ``None``.
    """
    #mkdir(op.dirname(tsv_file))
    assert values is not None
    lineidx_file = op.splitext(tsv_file)[0] + '.lineidx'
    tsv_file_tmp = tsv_file + '.tmp'
    lineidx_file_tmp = lineidx_file + '.tmp'
    offset = 0
    with open(tsv_file_tmp, 'wb') as fp, open(lineidx_file_tmp, 'w') as fpidx:
        for value in values:
            assert value is not None
            fields = [v.decode('utf-8') if isinstance(v, bytes) else str(v) for v in value]
            line = ('\t'.join(fields) + '\n').encode('utf-8')
            fp.write(line)
            fpidx.write(str(offset) + '\n')
            # Advance by the number of bytes actually written.
            offset += len(line)
    # os.replace overwrites an existing destination on every platform.
    os.replace(tsv_file_tmp, tsv_file)
    os.replace(lineidx_file_tmp, lineidx_file)

In [14]:
# Show the names of the splits defined above.
splits.keys()

dict_keys(['train', 'val', 'test'])

In [None]:
def _iter_encoded_rows(img_names, rows_hw):
    """Yield ``[img_key, base64_jpeg]`` rows for every image that can be processed.

    For each yielded row the matching ``[img_key, hw_json]`` row is appended to
    ``rows_hw``, so the image TSV and the height/width TSV stay aligned.
    Files that cannot be read or re-encoded are skipped.
    """
    for img_p in tqdm(img_names):  # [:20000]):
        img = cv2.imread(img_paths[img_p])
        if img is None:
            # cv2.imread returns None when the file cannot be read or decoded
            continue

        img = scale_img(img)
        height, width = img.shape[:2]

        # imencode returns (success_flag, buffer); do not write a bad buffer
        encoded_ok, jpeg_buf = cv2.imencode('.jpg', img)
        if not encoded_ok:
            continue

        # Strip only the extension, so a file name containing extra dots
        # keeps its full stem as key.
        img_key = op.splitext(img_p)[0]

        rows_hw.append([img_key, json.dumps([{'height': height, 'width': width}])])
        yield [img_key, base64.b64encode(jpeg_buf)]


# Make sure the output directory exists before any TSV is written.
os.makedirs(exp_root, exist_ok=True)

for split in splits:
    exp_encoded_img_file = op.join(exp_root, f'{split}.img.tsv')
    exp_hw_file = op.join(exp_root, f'{split}.hw.tsv')

    rows_hw = []

    # tsv_writer consumes the generator row by row, so encoded images are
    # streamed to disk instead of being held in memory; it also writes the
    # '{split}.img.lineidx' index that used to be produced by duplicated code here.
    tsv_writer(_iter_encoded_rows(splits[split], rows_hw), exp_encoded_img_file)

    # rows_hw is complete once the generator above has been exhausted.
    tsv_writer(rows_hw, exp_hw_file)

print('Done.')

In [None]:
# NOTE(review): disabled cell. It rebuilds only the '{split}.hw.tsv' files
# (height/width rows) without writing the image TSV; the export cell above
# already produces the same rows. Kept for reference only.
# for split in splits:
#     exp_hw_file = op.join(exp_root, f'{split}.hw.tsv')
    
#     rows_hw = []
#     for img_p in tqdm(splits[split]):
#         img_key = img_p.split('.')[0]
#         img_path = img_paths[img_p]

#         img = cv2.imread(img_path)
#         if img is None:
#             continue
#         img = scale_img(img)
#         height = img.shape[0]
#         width = img.shape[1]
#         row_hw = [img_key, json.dumps([{'height': height, 'width': width}])]
#         rows_hw.append(row_hw)
    
#     tsv_writer(rows_hw, exp_hw_file)

# print('Done.')

In [15]:
# Peek at the data: `k` is the first file name of the 'val' split (its display
# is commented out below); the cell output is the first raw caption record.
k = splits['val'][0]
dataset_captions[0]
#k

{'file_name': 'a1650e00b6261e99a6bbe6fe13919302.jpg',
 'caption': 'author : a life in photography -- in pictures',
 'category_id': -1}

In [16]:
# Run cell only if we need to re-read keys from generated TSVs
exp_root = '/mnt/Toshiba2TB/cc_vvl_nms1'

# split name -> first column of every row in that split's label TSV
img_splits = {}

for split in splits:
    label_tsv_path = op.join(exp_root, f'{split}.label.tsv')
    tsv = TSVFile(label_tsv_path)
    row_count = tsv.num_rows()
    keys = []
    for row_idx in tqdm(range(row_count)):
        keys.append(tsv.seek(row_idx)[0])
    img_splits[split] = keys

len(img_splits['train'])

100%|██████████| 1509745/1509745 [00:10<00:00, 150876.44it/s]
100%|██████████| 15406/15406 [00:00<00:00, 102118.22it/s]
100%|██████████| 15406/15406 [00:00<00:00, 103780.96it/s]


1509745

In [17]:
# Make dict with img_key : {img_key, id, caption}
#
# The key is the file name without its last four characters (the '.jpg'
# suffix in this dataset); 'id' is the 1-based position of the caption
# in dataset_captions.
# NOTE(review): the dict is keyed by image, so when an image has several
# captions only the last one survives here. Confirm that this is intended.

ds_caps_by_key = {}
for caption_id, entry in enumerate(dataset_captions, start=1):
    key = entry['file_name'][:-4]
    ds_caps_by_key[key] = {
        'image_id': key,
        'id': caption_id,
        'caption': entry['caption'],
    }

ds_caps_by_key['a1650e00b6261e99a6bbe6fe13919302']

{'image_id': 'a1650e00b6261e99a6bbe6fe13919302',
 'id': 1,
 'caption': 'author : a life in photography -- in pictures'}

In [18]:
# Directory the caption JSON files are saved to below. This is the same value
# the TSV re-read cell assigns; repeating it here keeps the path set even when
# that optional cell was skipped.
exp_root = '/mnt/Toshiba2TB/cc_vvl_nms1'

In [19]:
# Filter captions for splits

# The cell that re-reads keys from the generated TSVs is optional, so
# `img_splits` may not exist. Referencing it directly would raise NameError
# before the fallback branch could ever run, so look it up defensively.
tsv_split_keys = globals().get('img_splits')

for split in tqdm(splits):

    # Get current split samples from dataset

    if tsv_split_keys is not None:
        # Keys read back from the TSVs are used as-is for the lookup.
        out_captions = [ds_caps_by_key[img_key] for img_key in tsv_split_keys[split]]
    else:
        # The in-memory split holds file names; drop the 4-character extension.
        out_captions = [ds_caps_by_key[img_key[:-4]] for img_key in splits[split]]

    # Generate captions in COCO format

    idim = [{'id': cap['image_id'], 'file_name': cap['image_id']} for cap in out_captions]

    out_captions_coco_fmt = {'annotations': out_captions, 'images': idim, 'type': 'captions', 'info': 'dummy', 'licenses': 'dummy'}

    # Save JSON

    with open(os.path.join(exp_root, f'{split}_caption.json'), 'w') as fp:
        json.dump(out_captions, fp)

    with open(os.path.join(exp_root, f'{split}_caption_coco_format.json'), 'w') as f:
        json.dump(out_captions_coco_fmt, f)

# Preview the first entries of the last processed split.
out_captions[:3]

100%|██████████| 3/3 [00:20<00:00,  6.76s/it]


[{'image_id': '8d71c508fc505357f78ebd7653edc54f',
  'id': 788901,
  'caption': 'an aerial view of the campus'},
 {'image_id': 'bd4d9437fef6643a77f06fb47c3db104',
  'id': 841564,
  'caption': "there 's only one     grime artist made sure the crowd knew she was back for good , with her name emblazoned across the stage"},
 {'image_id': 'cca63ad121aa5ee9ce1f83d0b7e3df8d',
  'id': 630792,
  'caption': 'false teeth biting a bullet in a cocktail glass'}]