This notebook fleshes out the skeleton provided by TensorFlow (https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/preparing_inputs.md)

### Import statements

In [1]:
from platform import python_version
import os
import xml.etree.ElementTree as ET

import tensorflow as tf
from object_detection.utils import dataset_util

### Check Python version and change input file directory as well as output file name

You should run this script one level above the folder in which the annotations are contained.

In [2]:
# Check version--needs to be 3.6 or above to use `os.fsencode()`
print(python_version())

# Input string for directory over which to iterate
directory_str = "./annotations"

# Input string for file to which serialized records should be written
output_str = "./tf_records/new_test"

3.6.8


### Write a `tf_example` for one record

In [3]:
# Set file directory. Initialize empty lists and counters
directory = os.fsencode(directory_str)

# Do we need to include all labels here?
LABEL_DICT = {
    "chair": 1,
    "stool": 2,
    "oven": 3,
    "stove": 4,
    "ladder": 5,
    "sofa": 6
}

def create_tf_example(full_path):
    
    # Construct full file path, read XML tree object, set tree root, and obtain image size
    tree = ET.parse(full_path)
    root = tree.getroot()
    size = root.find('size')

    height = int(size.find('height').text) # Image height
    width = int(size.find('width').text) # Image width
    filename = root.find('filename').text # Filename of the image. Empty if image is not from file
    encoded_full_path = full_path.encode()
    
    with tf.io.gfile.GFile(full_path, 'rb') as file:
        encoded_image_data = file.read() # Encoded image bytes
        
    image_format = b'jpg' # b'jpeg' or b'png' NOTE, the example script uses b'jpg', which is what I used

    xmins = [] # List of normalized left x coordinates in bounding box (1 per box)
    xmaxs = [] # List of normalized right x coordinates in bounding box
             # (1 per box)
    ymins = [] # List of normalized top y coordinates in bounding box (1 per box)
    ymaxs = [] # List of normalized bottom y coordinates in bounding box
             # (1 per box)
    classes_text = [] # List of string class name of bounding box (1 per box)
    classes = [] # List of integer class id of bounding box (1 per box)

    # Iterate through each annotated 'object' in an image
    for obj in root.findall('object'):
        # Extract bounding box coordinates and calculate width, height, and area
        bndbox = obj.find('bndbox')
        
        # Normalize x-coordinates to image width and y-coordinates to image height
        xmin = float(bndbox.find('xmin').text)/width
        ymin = float(bndbox.find('ymin').text)/height
        xmax = float(bndbox.find('xmax').text)/width
        ymax = float(bndbox.find('ymax').text)/height
        
        # Extract name (i.e., category) and use LABEL_DICT dictionary to get label ID
        name = obj.find('name').text
        name_id = LABEL_DICT[name]
        encoded_name = name.encode()
        
        # Append information to lists
        xmins.append(xmin)
        ymins.append(ymin)
        xmaxs.append(xmax)
        ymaxs.append(ymax)
        classes_text.append(encoded_name)
        classes.append(name_id)
        
    # I think that `filename` and `source_id` should encode the full path (not just the file name), but
    # I could be wrong
    tf_example = tf.train.Example(features=tf.train.Features(feature={
        'image/height': dataset_util.int64_feature(height),
        'image/width': dataset_util.int64_feature(width),
        'image/filename': dataset_util.bytes_feature(encoded_full_path),
        'image/source_id': dataset_util.bytes_feature(encoded_full_path),
        'image/encoded': dataset_util.bytes_feature(encoded_image_data),
        'image/format': dataset_util.bytes_feature(image_format),
        'image/object/bbox/xmin': dataset_util.float_list_feature(xmins),
        'image/object/bbox/xmax': dataset_util.float_list_feature(xmaxs),
        'image/object/bbox/ymin': dataset_util.float_list_feature(ymins),
        'image/object/bbox/ymax': dataset_util.float_list_feature(ymaxs),
        'image/object/class/text': dataset_util.bytes_list_feature(classes_text),
        'image/object/class/label': dataset_util.int64_list_feature(classes),
    }))        

    return tf_example

### Iterate through directory, convert each Pascal/VOC record, serialize to string, and write to disk

In [4]:
# Iterate through file structure
for file in sorted(os.listdir(directory)):
    filename = os.fsdecode(file)
    full_path = os.path.join(directory_str, filename)
    tf_example = create_tf_example(full_path)
        
    # Skip files that don't contain a '.xml' extension
    if not filename.endswith(".xml"):
        print(full_path)
        continue 
        
#   write_to_file = tf.python_io.TFRecorderWriter(output_path)
    write_to_file = tf.compat.v2.io.TFRecordWriter(output_str)
    write_to_file.write(tf_example.SerializeToString())
    
write_to_file.close()

### Read serialized file from disk and retrieve one record for inspection

In [5]:
# Taken from https://www.tensorflow.org/tutorials/load_data/tfrecord

filenames = [output_str]
raw_dataset = tf.data.TFRecordDataset(filenames)
raw_dataset

<TFRecordDatasetV2 shapes: (), types: tf.string>

In [6]:
for raw_record in raw_dataset.take(1):
    example = tf.train.Example()
    example.ParseFromString(raw_record.numpy())
    print(example)

features {
  feature {
    key: "image/encoded"
    value {
      bytes_list {
        value: "<annotation>\n\t<folder>kitchens_renamed_flattened</folder>\n\t<filename>white_kitchen.EH.090.jpg</filename>\n\t<path>/home/errett/datasets/kitchens_renamed_flattened/white_kitchen.EH.090.jpg</path>\n\t<source>\n\t\t<database>Unknown</database>\n\t</source>\n\t<size>\n\t\t<width>810</width>\n\t\t<height>1215</height>\n\t\t<depth>3</depth>\n\t</size>\n\t<segmented>0</segmented>\n\t<object>\n\t\t<name>stove</name>\n\t\t<pose>Unspecified</pose>\n\t\t<truncated>1</truncated>\n\t\t<difficult>0</difficult>\n\t\t<bndbox>\n\t\t\t<xmin>1</xmin>\n\t\t\t<ymin>763</ymin>\n\t\t\t<xmax>132</xmax>\n\t\t\t<ymax>1211</ymax>\n\t\t</bndbox>\n\t</object>\n\t<object>\n\t\t<name>chair</name>\n\t\t<pose>Unspecified</pose>\n\t\t<truncated>0</truncated>\n\t\t<difficult>0</difficult>\n\t\t<bndbox>\n\t\t\t<xmin>405</xmin>\n\t\t\t<ymin>617</ymin>\n\t\t\t<xmax>525</xmax>\n\t\t\t<ymax>794</ymax>\n\t\t</bndbox>\n\t</object