In [3]:
import numpy as np
from pathlib import Path
import xml.etree.cElementTree as ET
from PIL import Image


def _parse_polygon_annotation(annot):
    """Split ``'x1,y1,...,xn,yn,label'`` into ``(label, xmin, ymin, xmax, ymax)``."""
    *coords, label = annot.split(',')
    coords = [value.strip() for value in coords]
    if len(coords) < 4 or len(coords) % 2:
        raise ValueError(
            'expected "x1,y1,x2,y2,...,label" with at least two points, got %r' % (annot,)
        )
    xs, ys = coords[0::2], coords[1::2]
    # key=float compares numerically (and rejects non-numeric text with a
    # ValueError) while returning the original token, so '291' is written
    # back as '291' rather than '291.0'.
    return (label,
            min(xs, key=float), min(ys, key=float),
            max(xs, key=float), max(ys, key=float))


def create_labimg_xml(image_path, annotation_list):
    """Write a LabelImg / Pascal VOC style XML annotation file for one image.

    The XML is written into the image's own directory, using the image file
    name with its last suffix replaced by ``.xml``.

    Parameters
    ----------
    image_path : str or Path
        Image to describe. It is opened once to read width, height and the
        channel count of its RGB conversion.
    annotation_list : iterable of str
        One record per object: the comma-separated vertices of a polygon
        (``x1,y1,x2,y2,...``) followed by the label as the last field, e.g.
        ``'291,473,385,481,383,504,289,496,Hello'``. The label is the text
        after the last comma, so it must not contain commas itself.

    Raises
    ------
    ValueError
        If a record does not hold a whole number of (x, y) pairs (at least
        two) before the label, or a coordinate is not numeric.

    Notes
    -----
    ``bndbox`` is the axis-aligned box enclosing *all* vertices of the record,
    so rotated or skewed quadrilaterals are fully covered regardless of the
    order in which their corners are listed.
    """
    image_path = Path(image_path)
    # The context manager releases the file handle as soon as the pixels are read.
    with Image.open(image_path) as image_file:
        img = np.array(image_file.convert('RGB'))
    height, width, depth = img.shape

    annotation = ET.Element('annotation')
    ET.SubElement(annotation, 'folder').text = str(image_path.parent.name)
    ET.SubElement(annotation, 'filename').text = str(image_path.name)
    ET.SubElement(annotation, 'path').text = str(image_path)

    source = ET.SubElement(annotation, 'source')
    ET.SubElement(source, 'database').text = 'Unknown'

    size = ET.SubElement(annotation, 'size')
    ET.SubElement(size, 'width').text = str(width)
    ET.SubElement(size, 'height').text = str(height)
    ET.SubElement(size, 'depth').text = str(depth)

    ET.SubElement(annotation, 'segmented').text = '0'

    for annot in annotation_list:
        label, xmin, ymin, xmax, ymax = _parse_polygon_annotation(annot)

        obj = ET.SubElement(annotation, 'object')
        ET.SubElement(obj, 'name').text = label
        ET.SubElement(obj, 'pose').text = 'Unspecified'
        ET.SubElement(obj, 'truncated').text = '0'
        ET.SubElement(obj, 'difficult').text = '0'

        bndbox = ET.SubElement(obj, 'bndbox')
        ET.SubElement(bndbox, 'xmin').text = str(xmin)
        ET.SubElement(bndbox, 'ymin').text = str(ymin)
        ET.SubElement(bndbox, 'xmax').text = str(xmax)
        ET.SubElement(bndbox, 'ymax').text = str(ymax)

    # with_suffix swaps only the final extension, so 'scan.v2.tif' becomes
    # 'scan.v2.xml' instead of colliding with other 'scan.*' images.
    xml_file_name = image_path.with_suffix('.xml')
    ET.ElementTree(annotation).write(xml_file_name)


# --------------------------------------------------------------------------------
# Example input: each record describes one quadrilateral as four (x, y) corner
# points (eight comma-separated numbers) followed by the text label.
anotation_list = ['291,473,385,481,383,504,289,496,Hello',
                  '270,507,330,507,330,516,270,516,SUPERLATIVE']

# Writes the matching .xml file into the same directory as the image.
# NOTE(review): absolute, machine-specific path; this cell only runs where that file exists.
create_labimg_xml('/Users/natashalove/Downloads/25_25_data/MRCNN_25_25_pascal/images/000001293.tif', anotation_list)

In [8]:
from xml.etree import ElementTree
 
# function to extract bounding boxes from an annotation file
def extract_boxes(filename):
    """Read all bounding boxes and the image size from an XML annotation file.

    Returns a ``(boxes, width, height)`` tuple. ``boxes`` holds one
    ``[xmin, ymin, xmax, ymax]`` list of ints per ``<bndbox>`` element found
    anywhere in the document; coordinate text is parsed as a float and then
    truncated to int. ``width`` and ``height`` are ints read from ``<size>``.
    """
    document_root = ElementTree.parse(filename).getroot()

    # One [xmin, ymin, xmax, ymax] entry per <bndbox>, in document order.
    boxes = [
        [
            int(float(bndbox.find('xmin').text)),
            int(float(bndbox.find('ymin').text)),
            int(float(bndbox.find('xmax').text)),
            int(float(bndbox.find('ymax').text)),
        ]
        for bndbox in document_root.findall('.//bndbox')
    ]

    # Image dimensions are read after the boxes, as whole numbers.
    width = int(document_root.find('.//size/width').text)
    height = int(document_root.find('.//size/height').text)
    return boxes, width, height
 
# extract details from one annotation file (boxes plus image width and height)
boxes, w, h = extract_boxes('/Users/natashalove/Dropbox/CV_training_resources/labels_all/labels/000049601.xml')
# summarize extracted details
print(boxes, w, h)

[[237, 246, 256, 256]] 256 256


In [21]:
from pathlib import Path
import os
import pandas as pd

In [22]:
# Directory holding the image files; generate_train_df reads this module-level
# name to build each row's 'filename'.
images_path = Path('/Users/natashalove/Dropbox/CV_training_resources/labels_all/images')
# Directory that is searched recursively for the .xml annotation files.
anno_path = Path('/Users/natashalove/Dropbox/CV_training_resources/labels_all/labels')

In [23]:
def filelist(root, file_type):
    """Recursively list files under ``root`` whose name ends with ``file_type``.

    Returns a list of paths, each built by joining the containing directory
    (as reported by ``os.walk``) with the file name. The order is whatever
    ``os.walk`` yields.
    """
    matches = []
    for folder, _subfolders, names in os.walk(root):
        for name in names:
            if name.endswith(file_type):
                matches.append(os.path.join(folder, name))
    return matches

In [24]:
def generate_train_df(anno_path, images_dir=None):
    """Collect the boxes of every XML annotation file into one DataFrame.

    Parameters
    ----------
    anno_path : str or Path
        Directory searched recursively (via ``filelist``) for ``.xml`` files.
    images_dir : str or Path, optional
        Directory prepended to each annotation's ``<filename>``. When omitted,
        the notebook-level ``images_path`` is used, as before.

    Returns
    -------
    pandas.DataFrame
        Columns ``filename, width, height, class, xmin, ymin, xmax, ymax``
        with one row per ``<object>`` element. A file with several objects
        contributes several rows sharing the same ``filename``; a file with no
        object contributes none. ``width``/``height`` and the box coordinates
        are ints (coordinate text such as ``'12.0'`` is truncated to ``12``),
        ``class`` is the raw ``<name>`` text and ``filename`` is a ``Path``.
    """
    image_root = Path(images_path if images_dir is None else images_dir)
    columns = ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax']

    records = []
    # A distinct loop name keeps the `anno_path` argument intact.
    for xml_path in filelist(anno_path, '.xml'):
        root = ET.parse(xml_path).getroot()
        image_file = image_root / root.find('./filename').text
        width = int(float(root.find('./size/width').text))
        height = int(float(root.find('./size/height').text))
        # Every <object> becomes a row, so no box of a multi-object file is dropped.
        for obj in root.findall('./object'):
            box = obj.find('./bndbox')
            records.append({
                'filename': image_file,
                'width': width,
                'height': height,
                'class': obj.find('./name').text,
                'xmin': int(float(box.find('xmin').text)),
                'ymin': int(float(box.find('ymin').text)),
                'xmax': int(float(box.find('xmax').text)),
                'ymax': int(float(box.find('ymax').text)),
            })
    # Explicit columns keep the schema stable even when nothing was found.
    return pd.DataFrame(records, columns=columns)

In [25]:
# Parse every .xml file found under anno_path into a single annotation table.
df_train = generate_train_df(anno_path)

In [26]:
# Sanity check: (rows, columns) of the table, then a preview of its first rows.
print(df_train.shape)
df_train.head()

(49604, 8)


Unnamed: 0,filename,width,height,class,xmin,ymin,xmax,ymax
0,/Users/natashalove/Dropbox/CV_training_resourc...,256,256,1,0,238,16,256
1,/Users/natashalove/Dropbox/CV_training_resourc...,256,256,1,0,89,231,256
2,/Users/natashalove/Dropbox/CV_training_resourc...,256,256,2,143,77,256,256
3,/Users/natashalove/Dropbox/CV_training_resourc...,256,256,2,0,0,140,124
4,/Users/natashalove/Dropbox/CV_training_resourc...,256,256,1,0,29,148,256


In [28]:
# Persist the table to the current working directory.
# NOTE(review): index=False is not passed, so per the pandas docs the row index
# is written as a leading unnamed column; confirm that is intended.
df_train.to_csv('df_train.csv', encoding='utf-8')

In [29]:
# Second copy of the same table, written without an explicit encoding.
# NOTE(review): pandas documents 'utf-8' as the to_csv default, so this file is
# presumably identical to df_train.csv; confirm which encoding was intended.
df_train.to_csv('df_train_backup_no_utf.csv')