# Convert dataset from Open Images to RecordIO format


- Based on Open Images Dataset format (https://storage.googleapis.com/openimages/web/download.html)
- Use `openimage_extractor.ipynb` to extract and download a subset of data from Open Images if needed.

I am using a collection of existing resources from GluonCV and MXNet to perform some tasks. There is no need to install MXNet if this notebook is executed on a SageMaker Notebook Instance (just use a `conda_mxnet_***` kernel), though you may still need to install GluonCV.

This notebook comes with a tiny dataset in Open Images Dataset format, located under `data` folder. Follow the structure of the sample dataset if you want to create your own.

In [None]:
# Install GluonCV; the next cell imports its dataset classes and visualization helpers.
!pip install gluoncv

In [None]:
import os
import random
import glob
from matplotlib import pyplot as plt

import numpy as np
import cv2

from gluoncv.data import COCODetection, RecordFileDetection
from gluoncv.utils import viz

import importlib
import utils

In [None]:
# Reload the local `utils` module so that edits made to it are picked up
# without restarting the kernel.
importlib.reload(utils)

In [None]:
# Specify your own classes here. Only annotations whose class name appears in
# this list are kept; the position in the list becomes the numeric class id.
CLASSES = ['monitor', 'vase', 'camera']
# One random RGB colour (three values in 0..255) per class.
COLORS = [[random.randint(0, 255) for _ in range(3)] for _ in CLASSES]

# Folder that contains one sub-folder per data split (e.g. `test`), each of
# which holds one sub-folder per class in the Open Images layout.
DATA_ROOT = 'data/openimage_like_sample'
DATA_SPLIT_NAME = 'test'
IMAGE_EXT = '.png'

# The LST and RecordIO files are written next to the split folder and share
# its name: <DATA_ROOT>/<split>.lst and <DATA_ROOT>/<split>.rec
lst_dir_path = os.path.join(DATA_ROOT, DATA_SPLIT_NAME)
lst_file_path = lst_dir_path + '.lst'
# Swap only the file extension; str.replace would also rewrite any other
# '.lst' occurrence earlier in the path.
rec_file_path = os.path.splitext(lst_file_path)[0] + '.rec'

# Record indices that are plotted as a visual sanity check
sample_idx = [0, 1]

##### Use a wrapper class to specify your own classes and image format

In [None]:
class OpenImageDetection:
    """Object-detection dataset over an Open Images style folder layout.

    Images are looked up as ``<root>/<split>/<class dir>/<image id><image_ext>``
    and their annotation files as
    ``<root>/<split>/<class dir>/Label/<image id>.txt``. Each annotation line
    has the form ``<class name> <x1> <y1> <x2> <y2>``.

    Indexing the dataset returns ``(image, labels)`` where ``image`` is an RGB
    array and ``labels`` is a float array of shape ``(N, 5)`` holding
    ``[x1, y1, x2, y2, class_id]`` per box. Boxes whose class name is not in
    ``CLASSES`` are dropped.

    Parameters
    ----------
    root : str
        Folder that contains the split folders.
    split : str
        Name of the split folder to load.
    image_ext : str
        File extension (including the dot) of the images to collect.
    """

    CLASSES = CLASSES

    def __init__(self, root, split, image_ext='.jpg'):
        self.root = root
        self.split = split
        self._image_ext = image_ext
        self._anno_ext = '.txt'
        self._items = self._load_items()
        # class name -> numeric class id (position in `classes`)
        self._index_map = {name: i for i, name in enumerate(self.classes)}

    def __str__(self):
        return f'{self.__class__.__name__}({self.split})'

    @property
    def classes(self):
        """Class names; the position of a name is its numeric class id."""
        return type(self).CLASSES

    def __len__(self):
        return len(self._items)

    def __getitem__(self, idx):
        """Return ``(rgb_image, labels)`` for the item at position ``idx``.

        Raises
        ------
        OSError
            If the image file cannot be read or decoded.
        """
        _, image_path, _ = self._items[idx]
        labels = self._load_labels(idx)
        bgr = cv2.imread(image_path)
        if bgr is None:
            # cv2.imread signals failure by returning None instead of raising
            raise OSError(f'Failed to read image {image_path}')
        return cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB), labels

    def _load_items(self):
        """Collect ``[image_id, image_path, label_paths]`` for every labelled image.

        The same image may appear under several class folders; it is listed
        once, together with all of its annotation files. Images without any
        annotation file are reported and skipped. Paths are sorted so the
        item order does not depend on the file system's listing order.
        """
        items = []
        seen = set()  # image ids already collected (O(1) membership test)
        split_root = os.path.join(self.root, self.split)
        image_paths = sorted(glob.glob(os.path.join(split_root, '*/*' + self._image_ext)))
        for image_path in image_paths:
            image_id = os.path.splitext(os.path.basename(image_path))[0]
            if image_id in seen:
                continue
            # Escape the id so characters such as '[' or '*' in a file name
            # are matched literally instead of being treated as wildcards.
            label_pattern = os.path.join(
                split_root, '*/Label/' + glob.escape(image_id) + self._anno_ext)
            label_paths = sorted(glob.glob(label_pattern))
            if not label_paths:
                print(f'WARNING: no labels found for image {image_path}')
                continue
            items.append([image_id, image_path, label_paths])
            seen.add(image_id)
        return items

    def _load_labels(self, idx):
        """Read all annotation files of item ``idx``.

        Returns
        -------
        numpy.ndarray
            Float array of shape ``(N, 5)`` with ``[x1, y1, x2, y2, class_id]``
            rows; ``(0, 5)`` when no box belongs to a known class.

        Raises
        ------
        ValueError
            If a non-empty annotation line does not have five fields or its
            coordinates are not numeric.
        """
        out_labels = []
        for annot_path in self._items[idx][-1]:
            with open(annot_path) as fin:
                for line in fin:
                    line = line.strip()
                    if not line:
                        continue  # tolerate empty files and blank lines
                    # Split from the right so that class names containing
                    # spaces stay in one piece.
                    fields = line.rsplit(None, 4)
                    if len(fields) != 5:
                        raise ValueError(
                            f'Malformed annotation line in {annot_path}: {line!r}')
                    cls, x1, y1, x2, y2 = fields
                    class_id = self._index_map.get(cls)
                    if class_id is None:
                        continue  # class not selected in CLASSES
                    out_labels.append(
                        [float(x1), float(y1), float(x2), float(y2), class_id])

        # Keep the array 2-D even when empty so callers can always slice columns.
        return np.array(out_labels, dtype=float).reshape(-1, 5)

##### Load the included dataset and create an `.lst` file

Also display the dataset images with bounding boxes. You may want to skip plotting those images for large datasets.

In [None]:
# Write one LST record per dataset image and plot the sample records.
print(f'Creating LST file {lst_file_path}')
# Use the configured extension so the dataset and the im2rec `--encoding`
# option cannot drift apart.
dataset = OpenImageDetection(root=DATA_ROOT, split=DATA_SPLIT_NAME, image_ext=IMAGE_EXT)
print('Dataset length:', len(dataset))

# Counts written records and doubles as the record index, so indices stay
# contiguous even when an image is skipped.
records_written = 0
with open(lst_file_path, 'w') as lst_out:
    for idx in range(len(dataset)):
        image, labels = dataset[idx]
        if labels.size == 0:
            # No box of the selected classes: nothing to pack for this image.
            print(f'WARNING: no boxes for classes {CLASSES} in {dataset._items[idx][1]}, skipping')
            continue
        h, w = image.shape[:2]
        # Keep only "<class dir>/<file name>", i.e. the path relative to lst_dir_path
        image_path = '/'.join(dataset._items[idx][1].split('/')[-2:])
        bboxes, cids = labels[:, :4], labels[:, 4:5]
        lst_record = utils.build_lst_record(image_path, w, h, bboxes, cids, records_written)
        lst_out.write(lst_record + '\n')
        if records_written in sample_idx:
            viz.plot_bbox(image, bboxes=bboxes, labels=cids, class_names=CLASSES)
        records_written += 1

# Report the counter rather than the loop variable, which is undefined when
# the dataset is empty.
print(f'- finished, {records_written} records written')
plt.show()

### Build RecordIO file

Use an existing script from Apache MXNet to complete this task. If you struggle with the download, you can use the copy of the script included alongside this notebook, named `im2rec_local.py`.

In [None]:
# Download the im2rec.py tool from the master branch of the Apache MXNet repository
# into the current working directory.
!wget https://raw.githubusercontent.com/apache/incubator-mxnet/master/tools/im2rec.py

##### Build

Run the downloaded or included script. Don't worry if you see a `count:0` message after the cell finishes - the script only reports once every 1000 records, so the message is a bit misleading.

In [None]:
# Build the RecordIO file from the LST file created above; image paths in the
# LST file are resolved relative to $lst_dir_path.
# NOTE(review): `--pack-label` is presumably what keeps the multi-value
# bounding-box labels in each record - confirm against the im2rec.py help text.
!python im2rec.py $lst_file_path $lst_dir_path --encoding $IMAGE_EXT --num-thread 4 --pack-label

# or uncomment and run the following if you want to use the included script
#!python im2rec_local.py $lst_file_path $lst_dir_path --encoding $IMAGE_EXT --num-thread 4 --pack-label

##### Test

Extract the same sample from the created RecordIO file and show with boxes

In [None]:
# Read the sample records back from the RecordIO file and plot them with
# their boxes, to compare against the plots produced while writing the LST file.
print(f'Loading records from {rec_file_path}')
rec_dataset = RecordFileDetection(rec_file_path)
rec_count = len(rec_dataset)
print('Dataset length:', rec_count)
for idx in sample_idx:
    if not 0 <= idx < rec_count:
        # A dataset smaller than the sample list should not abort the check.
        print(f'WARNING: sample index {idx} is out of range, skipping')
        continue
    img, labels = rec_dataset[idx]
    # Columns 0-3 are the box corners, column 4 is the class id
    viz.plot_bbox(img, bboxes=labels[:, :4], labels=labels[:, 4:5], class_names=CLASSES)
plt.show()
