# Data Preparation (from .xml to .tfrecord)

In [1]:
import os, glob, pandas as pd, xml.etree.ElementTree as ET, tensorflow as tf, cv2, numpy as np

from object_detection.utils import dataset_util as DU, label_map_util as LMU

from collections import namedtuple

In [2]:
label_map = "./label_map.pbtxt"
label_map_dict = LMU.get_label_map_dict(label_map)

label_map_dict

{'pothole': 1, 'crack': 2}

In [3]:
def class_text_to_num(text):
    return label_map_dict[text]

In [4]:
def xml_to_csv(path):
    xml_list = []
    for file in glob.glob(path + "/*.xml"):
        tree = ET.parse(file)
        root = tree.getroot()
        for member in root.findall("object"):
            print(root.find("filename").text)
            if member[0].text == "plain": continue
            xml_list.append((
                root.find("filename").text,
                int(member[4][0].text),
                int(member[4][1].text),
                int(member[4][2].text),
                int(member[4][3].text),
                int(class_text_to_num(member[0].text)),
            ))  # filename, x1, y1, x2, y2, class
    columns = ["filename", "x1", "y1", "x2", "y2", "class"]
    return pd.DataFrame(xml_list, columns = columns)

In [5]:
def regroup(dataframe, group = "filename"):
    data = namedtuple("data", ["filename", "objects"])
    group_by = dataframe.groupby(group)
    return [data(filename, group_by.get_group(x)) for filename, x in zip(group_by.groups.keys(), group_by.groups)]

In [6]:
def bytes_feature(value):
    """Returns a bytes_list from a string / byte."""
    # If the value is an eager tensor BytesList won't unpack a string from an EagerTensor.
    if isinstance(value, type(tf.constant(0))):
        value = value.numpy() 
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

In [7]:
def create_tf_ex(group, path):
    image = cv2.imread(os.path.join(path, group.filename), cv2.IMREAD_COLOR)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    filename = group.filename.encode("utf-8")
    label = []
    bbox = []
    for i, data in group.objects.iterrows():
        bbox.append([data["y1"] / 512, data["x1"] / 512, data["y2"] / 512, data["x2"] / 512])
        label.append(data["class"])
    features = tf.train.Features(feature = {
        "filename": DU.bytes_feature(filename),
        "pic": bytes_feature(tf.io.serialize_tensor(image)),
        "bbox": bytes_feature(tf.io.serialize_tensor(np.array(bbox))),
        "label": bytes_feature(tf.io.serialize_tensor(np.array(label))),
    })
    return tf.train.Example(features = features)

In [8]:
def create_tf_record(group_x, path, writer):
    i = 0
    for group in group_x:
        tf_ex = create_tf_ex(group, path)
        writer.write(tf_ex.SerializeToString())
        i += 1
    writer.close()
    print("DONE {}\n{}".format(path, i))

In [9]:
path_train = "../dataset/train/augmented"
path_test = "../dataset/test/augmented"

train_out = "./train.tfrecord"
test_out = "./test.tfrecord"

writer_train = tf.io.TFRecordWriter(train_out)
writer_test = tf.io.TFRecordWriter(test_out)

df_train = xml_to_csv(path_train)
df_test = xml_to_csv(path_test)

group_train = regroup(df_train)
group_test = regroup(df_test)

pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
crack
pothole
crack
pothole
crack
pothole
crack
pothole
crack
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
po

pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole
pothole


crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
crack
.


KeyError: '.'

In [None]:
create_tf_record(group_train, path_train, writer_train)
create_tf_record(group_test, path_test, writer_test)