In [None]:
def get_label2id(labels_lists: list) -> dict:
    """Map each label name to a 1-based integer id.

    Args:
        labels_lists: Iterable of label names, e.g. ``['Person', 'Car']``.
            Ids follow the iteration order and start at 1.

    Returns:
        Dict ``{label: id}``. When a label occurs more than once, the id of
        its last occurrence is kept.
    """
    # enumerate() needs no len(), so any iterable (not only a list) works.
    return {label: label_id
            for label_id, label in enumerate(labels_lists, start=1)}


def get_annpaths(annpaths_list_path: str = None, ext: str = ''):
    """Collect the paths of the annotation files stored in a directory.

    Args:
        annpaths_list_path: Directory that contains the annotation files.
        ext: File extension to keep, with or without the leading dot
            (``'xml'`` or ``'.xml'``), compared case-insensitively. The
            default ``''`` keeps every file.

    Returns:
        Sorted list of file paths (directory joined with the file name), or
        ``None`` after printing a message when ``annpaths_list_path`` is None.
    """
    if annpaths_list_path is None:
        print("the path is None")
        return None

    suffix = (ext or '').lower()
    if suffix and not suffix.startswith('.'):
        suffix = '.' + suffix

    # sorted(): os.listdir() returns entries in arbitrary order; sorting keeps
    # the result reproducible between runs.
    candidates = (
        (filename, os.path.join(annpaths_list_path, filename))
        for filename in sorted(os.listdir(annpaths_list_path))
    )
    # Sub-directories are skipped: only regular files can be annotations.
    return [
        file_path
        for filename, file_path in candidates
        if filename.lower().endswith(suffix) and os.path.isfile(file_path)
    ]


def get_image_info(annotation_root, extract_num_from_imgid=True):
    """Build the COCO ``images`` entry for one annotation xml root.

    The file name is the basename of the ``<path>`` element when present,
    otherwise the text of ``<filename>``. The image id is the file name
    without its extension; when ``extract_num_from_imgid`` is true, only the
    second ``_``-separated piece of that name is kept (still a string).

    Returns:
        Dict with the keys ``file_name``, ``height``, ``width`` and ``id``.
    """
    full_path = annotation_root.findtext('path')
    filename = (annotation_root.findtext('filename') if full_path is None
                else os.path.basename(full_path))

    stem, _extension = os.path.splitext(os.path.basename(filename))
    # e.g. 'img_0007' -> '0007'
    img_id = stem.split('_')[1] if extract_num_from_imgid else stem

    size_node = annotation_root.find('size')
    width = int(size_node.findtext('width'))
    height = int(size_node.findtext('height'))

    return {
        'file_name': filename,
        'height': height,
        'width': width,
        'id': img_id
    }


def get_coco_annotation_from_obj(obj, label2id):
    """Build a COCO annotation dict from one Pascal VOC ``<object>`` element.

    Args:
        obj: Element holding a ``<name>`` child and a ``<bndbox>`` child with
            ``xmin``, ``ymin``, ``xmax`` and ``ymax``.
        label2id: Mapping from label name to category id. Labels are matched
            case-insensitively.

    Returns:
        Dict with the keys ``area``, ``iscrowd``, ``bbox`` (as
        ``[x, y, width, height]``), ``category_id``, ``ignore`` and
        ``segmentation`` (always empty).

    Raises:
        AssertionError: If the label is unknown or the box has no positive
            width and height.
    """
    # Lower-case BOTH the mapping keys and the xml label; lowering only the
    # keys made every label containing an upper-case letter unmatchable.
    lowered_label2id = {name.lower(): label_id
                        for name, label_id in label2id.items()}
    label = obj.findtext('name')
    # Explicit raise instead of `assert`, so the check survives `python -O`.
    if label is None or label.lower() not in lowered_label2id:
        raise AssertionError(f"Error: {label} is not in label2id !")
    category_id = lowered_label2id[label.lower()]

    bndbox = obj.find('bndbox')
    # int(float(...)) also accepts coordinates written as '12.0'.
    xmin, ymin, xmax, ymax = (
        int(float(bndbox.findtext(tag)))
        for tag in ('xmin', 'ymin', 'xmax', 'ymax')
    )
    # Only the top-left corner is shifted by one, as in the original code
    # (presumably 1-based VOC pixels to 0-based COCO -- TODO confirm).
    xmin -= 1
    ymin -= 1
    if not (xmax > xmin and ymax > ymin):
        raise AssertionError(
            f"Box size error !: (xmin, ymin, xmax, ymax): {xmin, ymin, xmax, ymax}")

    o_width = xmax - xmin
    o_height = ymax - ymin
    return {
        'area': o_width * o_height,
        'iscrowd': 0,
        'bbox': [xmin, ymin, o_width, o_height],
        'category_id': category_id,
        'ignore': 0,
        'segmentation': []  # This script is not for segmentation
    }


def convert_xmls_to_cocojson(annotation_paths,
                             label2id,
                             output_jsonpath_file: str,
                             extract_num_from_imgid: bool = True):
    """Convert Pascal VOC xml annotation files into one COCO json file.

    Args:
        annotation_paths: Iterable of paths to the xml annotation files.
        label2id: Mapping from label name to category id.
        output_jsonpath_file: Path of the json file to write.
        extract_num_from_imgid: Forwarded to ``get_image_info`` to control
            how the image id is derived from the file name.
    """
    output_json_dict = {
        "images": [],
        "type": "instances",
        "annotations": [],
        "categories": []
    }
    bnd_id = 1  # progressive annotation id, unique across all images
    print('Start converting !')
    for a_path in tqdm(annotation_paths):
        # Read annotation xml
        ann_root = ET.parse(a_path).getroot()

        img_info = get_image_info(annotation_root=ann_root,
                                  extract_num_from_imgid=extract_num_from_imgid)
        img_id = img_info['id']
        output_json_dict['images'].append(img_info)

        for obj in ann_root.findall('object'):
            ann = get_coco_annotation_from_obj(obj=obj, label2id=label2id)
            ann.update({'image_id': img_id, 'id': bnd_id})
            output_json_dict['annotations'].append(ann)
            bnd_id += 1

    output_json_dict['categories'].extend(
        {'supercategory': 'none', 'id': label_id, 'name': label}
        for label, label_id in label2id.items()
    )

    # Serialize before opening the file, so a serialization error cannot
    # leave a truncated output file behind.
    output_json = json.dumps(output_json_dict)
    # The original opened the undefined name `output_jsonpath` (NameError).
    with open(output_jsonpath_file, 'w') as f:
        f.write(output_json)



# README
'''
To convert your xml annotations to json format, you need to provide three pieces of information:

1- A list containing all the labels, assigned to a variable named `labels`. Example:
labels = ['Person', 'Car', 'Bicycle', 'Bus', 'Motorbike', 'Train', 'Aeroplane', 'Chair', 'Bottle', 'Dining', 'Table', 'Potted', 'Plant TV/Monitor', 'Sofa', 'Bird', 'Cat', 'Cow', 'Dog', 'Horse', 'Sheep']

2- The path of the directory that contains all your xml files, saved in a variable named `annpaths_list_path`. Example:
annpaths_list_path = '/home/pptr/annotations/xml/'


3- The path where the created json file will be saved, including the file name.
Save this string into a variable named `output_jsonpath_file`. Example:

output_jsonpath_file = '/home/pptr/annotations/json/file.json'

Once these three variables are filled in, you can just run the cell below to convert your Pascal VOC xml files to the COCO json format.
'''

In [None]:
# Label names; category ids are assigned in list order, starting at 1.
# NOTE(review): 'Dining', 'Table', 'Potted', 'Plant TV/Monitor' look like a
# mis-split of 'Dining Table', 'Potted Plant', 'TV/Monitor' -- confirm before
# changing, since the category ids would shift.
labels = ['Person', 'Car', 'Bicycle', 'Bus', 'Motorbike', 'Train', 'Aeroplane', 'Chair', 'Bottle', 'Dining', 'Table', 'Potted', 'Plant TV/Monitor', 'Sofa', 'Bird', 'Cat', 'Cow', 'Dog', 'Horse', 'Sheep']
# Directory that contains the Pascal VOC xml files.
annpaths_list_path = '/home/pptr/annotaions/xml/'
# Destination COCO json file. It gets its own variable: the original line
# re-assigned `annpaths_list_path` (overwriting the input directory) and was
# missing its closing quote.
output_jsonpath_file = '/home/pptr/annotations/json/file.json'

label2id = get_label2id(labels)
ann_paths = get_annpaths(annpaths_list_path=annpaths_list_path, ext='xml')
convert_xmls_to_cocojson(
        annotation_paths=ann_paths,
        label2id=label2id,
        output_jsonpath_file=output_jsonpath_file,
        extract_num_from_imgid=True