# ODSI-DB to Pascal VOC - Format Conversion

References:
1. [ODSI-DB](https://sites.uef.fi/spectral/odsi-db)
2. [Pascal VOC](https://towardsdatascience.com/coco-data-format-for-object-detection-a4c5eaf518c5)
3. ["Motivation"](https://imageai.readthedocs.io/en/latest/customdetection)

In [1]:
from random import random
import xml.etree.ElementTree as ET
from os import path, listdir, makedirs, symlink
from cv2 import imread, cvtColor, COLOR_BGR2RGB

anotations_min_freq = 200
train_validation_ratio = 0.8
basepath = 'odsi_db/dataset'

# ----------------------------------------------------------------------------------------------------

def write_xml_annotation(img_folder, img_file, img_size, obj_name, boundbox, xml_filename):

    annotation = ET.Element("annotation")
    ET.SubElement(annotation, "folder").text = img_folder   # Folder that contains the images
    ET.SubElement(annotation, "filename").text = img_file   # Name of the physical file that exists in the folder
    ET.SubElement(annotation, "path").text = path.join('./'+img_folder, img_file)
    source = ET.SubElement(annotation, "source")
    ET.SubElement(source, "database").text = "ODSI-DB"      # Database name
    size = ET.SubElement(annotation, "size")
    ET.SubElement(size, "width").text = str(img_size[0])    # Image width
    ET.SubElement(size, "height").text = str(img_size[1])   # Image height
    ET.SubElement(size, "depth").text = str(img_size[2])    # Image depth (1 for black/white, 3 for color)
    ET.SubElement(annotation, "segmented").text = "0"       # 
    obj = ET.SubElement(annotation, "object")
    ET.SubElement(obj, "name").text = obj_name              # Object name
    ET.SubElement(obj, "pose").text = "Unspecified"         # Object pose
    ET.SubElement(obj, "truncated").text = "0"              # '1' if the object is partially visible in the bounding box
    ET.SubElement(obj, "difficult").text = "0"              # '1' if the object is difficult to recognize
    bndbox = ET.SubElement(obj, "bndbox")
    ET.SubElement(bndbox, "xmin").text = str(boundbox[0])   # Bounding box top-left x coordinate
    ET.SubElement(bndbox, "ymin").text = str(boundbox[1])   # Bounding box top-left y coordinate
    ET.SubElement(bndbox, "xmax").text = str(boundbox[2])   # Bounding box bottom-right x coordinate
    ET.SubElement(bndbox, "ymax").text = str(boundbox[3])   # Bounding box bottom-right y coordinate
    tree = ET.ElementTree(annotation)
    tree.write(xml_filename)

def bounding_box(points):
    xmin = min(int(point.split(';')[1]) for point in points)
    ymin = min(int(point.split(';')[0]) for point in points)
    xmax = max(int(point.split(';')[1]) for point in points)
    ymax = max(int(point.split(';')[0]) for point in points)
    return [xmin, ymin, xmax, ymax]

# ----------------------------------------------------------------------------------------------------
    
files = {'masks': None, 'annot': None, 'tiff': None}

annotations_hist = {}

# First pass (annotations_hist build-up)
files_cnt = 0
for filename in sorted(listdir(basepath)):
    
    if filename.startswith('invalid_'):
        continue
        
    if filename.endswith('_annot.csv'):
        with open(path.join(basepath, filename)) as f:
            lis = [line.split(',') for line in f]
            for x in lis:
                class_name = x[0].split(';')[0].replace(' ','')
                if class_name != 'Specularreflection':
                    if class_name in annotations_hist:
                        annotations_hist[class_name] += 1
                    else:
                        annotations_hist[class_name] = 0
        files_cnt += 1

# Annotation Whitelist:
annotations_hist_sorted = sorted(annotations_hist.items(), key=lambda x: x[1], reverse=True)

annotations_whitelist = []
for el in annotations_hist_sorted:
    if el[1] >= anotations_min_freq:
        annotations_whitelist.append(el[0])

# Filesystem prepare:
dataset_basename = 'dataset'
if not path.exists(dataset_basename):
    makedirs(dataset_basename)
for setname in ('train', 'validation'):
    set_basepath = path.join(dataset_basename, setname)
    if not path.exists(set_basepath):
        makedirs(set_basepath)
    if not path.exists(path.join(set_basepath, 'annotations')):
        makedirs(path.join(set_basepath, 'annotations'))
    if not path.exists(path.join(set_basepath, 'images')):
        makedirs(path.join(set_basepath, 'images'))

# Second pass (XML write):
cnts = {'train': 0, 'validation': 0}
for filename in sorted(listdir(basepath)):
    
    if filename.startswith('invalid_'):
        continue

    if filename.endswith('_annot.csv'):
        with open(path.join(basepath, filename)) as f:
            lis = [line.split(',') for line in f]
            for i, x in enumerate(lis):
                class_name = x[0].split(';')[0].replace(' ','')
                setname = 'train' if random() < train_validation_ratio else 'validation'
                set_basepath = path.join(dataset_basename, setname)
                cnts[setname] += 1                    
                if class_name in annotations_whitelist:
                    tiff_file = path.join(basepath, filename).replace('_annot.csv', '.tif')
                    tiff_file_link = path.join(path.join(set_basepath, 'images'), path.basename(tiff_file)).replace('.tif', '_%d.tif' % i)
                    symlink('../../../' + tiff_file, tiff_file_link)
                    xml_filename = path.join(path.join(set_basepath, 'annotations'), path.basename(tiff_file_link).replace('.tif', '.xml'))
                    img_tiff = cvtColor(imread(tiff_file_link), COLOR_BGR2RGB)
                    write_xml_annotation(img_folder=path.dirname(tiff_file_link),
                                         img_file=path.basename(tiff_file_link),
                                         img_size=img_tiff.shape,
                                         obj_name=class_name,
                                         boundbox=bounding_box([yx.replace(' ','') for yx in x[3:-1]]),
                                         xml_filename=xml_filename)

# ----------------------------------------------------------------------------------------------------
# Annotations Histogram (overall):
print('%d images overall\n' % files_cnt)
print('Training Set size: %d' % cnts['train'])
print('Validation Set size: %d' % cnts['validation'])
print('\n')
print('Annotaions Histogram (sorted):\n')
print(annotations_hist_sorted)
print('\n')
print('%d Annotaions in Whitelist (freq>%d):\n' % (len(annotations_whitelist), anotations_min_freq))
print(annotations_whitelist)

215 images overall

Training Set size: 7001
Validation Set size: 1821


Annotaions Histogram (sorted):

[('Enamel', 2286), ('Marginalgingiva', 859), ('Attachedgingiva', 328), ('Metal', 311), ('Skin', 310), ('Oralmucosa', 306), ('Plastic', 276), ('Outoffocusarea', 225), ('Lip', 217), ('Stain', 217), ('Shadow/Noise', 177), ('Gingivitis', 166), ('Calculus', 148), ('Attrition/Erosion', 133), ('Initialcaries', 85), ('Hair', 80), ('Prosthetics', 76), ('Hardpalate', 63), ('Softpalate', 60), ('Tongue', 58), ('Root', 58), ('Microfracture', 40), ('Bloodvessel', 34), ('Dentinecaries', 30), ('Plaque', 21), ('Inflammation', 20), ('Fluorosis', 19), ('Ulcer', 16), ('Mole', 14), ('Pigmentation', 14), ('Leukoplakia', 8), ('Makeup', 4), ('Fibroma', 0), ('Malignantlesion', 0)]


10 Annotaions in Whitelist (freq>200):

['Enamel', 'Marginalgingiva', 'Attachedgingiva', 'Metal', 'Skin', 'Oralmucosa', 'Plastic', 'Outoffocusarea', 'Lip', 'Stain']


In [2]:
print('Completed Successfully')

Completed Successfully
