# Prepare Dataset

In [12]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [13]:
# Root folder of the dataset; the cells below look for JPEGImages/ and
# Annotations/ inside it and write examples.txt and label_map.pbtxt there.
DATASET_DIR = "/data/lego/dataset/"

## Rename jpg and xml to have matching names

In [None]:
import os
import glob

In [None]:
# Collect every JPEG in the dataset. glob returns paths in arbitrary,
# filesystem-dependent order, so sort them to make the sequential ids
# assigned in the renaming step reproducible across runs and machines.
jpg_files = sorted(glob.glob(os.path.join(DATASET_DIR, 'JPEGImages/*.jpg')))

In [None]:
# Rename every image/annotation pair to a shared zero-padded sequential id
# and remember the ids in `examples`.
examples = []
for i, jpg_path in enumerate(jpg_files, start=1):
    example_id = '{0:08d}'.format(i)  # eg '00000020'

    # Derive the xml path from the jpg path. Only the file extension is
    # swapped; a blanket str.replace('jpg', 'xml') would also rewrite any
    # 'jpg' appearing in a directory or file name.
    stem, _ = os.path.splitext(jpg_path)
    xml_path = stem.replace('JPEGImages/bbox-image', 'Annotations/bbox-annotation') + '.xml'

    # Move the annotation first: if it is missing, os.rename raises before
    # the image has been touched, so the pair is never left half-renamed.
    os.rename(xml_path, os.path.join(DATASET_DIR, 'Annotations', example_id + '.xml'))
    os.rename(jpg_path, os.path.join(DATASET_DIR, 'JPEGImages', example_id + '.jpg'))

    examples.append(example_id)

## Save example ids

In [None]:
# Save one example id per line. The joined ids form a single string, so use
# write(); writelines() would iterate it and emit one character at a time.
with open(os.path.join(DATASET_DIR, 'examples.txt'), 'wt') as f:
    f.write('\n'.join(examples))

## Read unique label names

In [16]:
import os
import glob
import xml.etree.ElementTree as ET

In [17]:
# Gather the distinct <name> values of every <object> element found in the
# annotation files under Annotations/.
xml_files = glob.glob(os.path.join(DATASET_DIR, 'Annotations/*.xml'))
names = set()
for annotation_path in xml_files:
    annotation_root = ET.parse(annotation_path).getroot()
    names.update(
        node.find('name').text for node in annotation_root.iter('object')
    )

In [18]:
# Report how many distinct label names were collected.
num_classes = len(names)
print('number of classes:', num_classes)

number of classes: 45


## Save label map protobuf

In [19]:
from object_detection.protos import string_int_label_map_pb2

In [42]:
# Build the label map. Ids start at 1 because the Object Detection API
# reserves id 0 for the background class, and names are sorted so the
# name -> id assignment is identical on every run (set order is not).
label_map = string_int_label_map_pb2.StringIntLabelMap()
for name_id, name in enumerate(sorted(names), start=1):
    item = label_map.item.add()
    item.id = name_id
    item.name = name


In [44]:
# Write the label map message, rendered via str(), to label_map.pbtxt
# inside the dataset folder.
label_map_path = os.path.join(DATASET_DIR, 'label_map.pbtxt')
with open(label_map_path, 'wt') as pbtxt_file:
    pbtxt_file.write(str(label_map))