# In [4]:
import os
import json
import xml.etree.ElementTree as ET
from collections import defaultdict

def _voc_int(text):
    """Parse a VOC numeric field, accepting float-formatted values like '48.7'."""
    # Some annotation files store coordinates as floats; int() alone would
    # raise ValueError on them, so go through float() and truncate.
    return int(float(text))


def convert_voc_to_coco(voc_dir, output_file):
    """Convert a Pascal VOC style directory into a COCO style JSON file.

    Every ``.jpg`` entry of ``<voc_dir>/JPEGImages`` becomes one image record,
    and every ``<object>`` of the matching ``<voc_dir>/Annotations/<stem>.xml``
    becomes one annotation record. The result, a dict with the keys
    ``images``, ``annotations`` and ``categories``, is dumped to
    *output_file* as JSON.

    Args:
        voc_dir: Directory containing the ``JPEGImages`` and ``Annotations``
            sub-directories.
        output_file: Path of the JSON file to write.

    Raises:
        FileNotFoundError: If an image has no matching annotation XML file.
        KeyError: If an object name is not one of the 20 listed categories.
    """
    category_names = (
        "aeroplane", "bicycle", "bird", "boat", "bottle",
        "bus", "car", "cat", "chair", "cow",
        "diningtable", "dog", "horse", "motorbike", "person",
        "pottedplant", "sheep", "sofa", "train", "tvmonitor",
    )
    # Category ids are 1-based, in the order listed above.
    categories = [
        {"id": index, "name": name}
        for index, name in enumerate(category_names, start=1)
    ]
    category_mapping = {cat['name']: cat['id'] for cat in categories}

    images = []
    annotations = []
    annotation_id = 1

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # image ids reproducible. The id is the index in the sorted listing, so
    # non-.jpg entries still leave gaps in the id sequence.
    listing = sorted(os.listdir(os.path.join(voc_dir, 'JPEGImages')))
    for image_id, file_name in enumerate(listing):
        if not file_name.endswith('.jpg'):
            continue
        image = {
            "id": image_id,
            "file_name": file_name,
            "height": 0,  # stays 0 when the XML has no <size> element
            "width": 0,   # stays 0 when the XML has no <size> element
        }
        images.append(image)

        # Swap only the final extension; str.replace() would also rewrite a
        # '.jpg' occurring in the middle of the file name.
        stem = os.path.splitext(file_name)[0]
        xml_file = os.path.join(voc_dir, 'Annotations', stem + '.xml')
        root = ET.parse(xml_file).getroot()

        for size in root.findall('size'):
            image['width'] = _voc_int(size.find('width').text)
            image['height'] = _voc_int(size.find('height').text)

        for obj in root.findall('object'):
            # Tolerate whitespace around the class name in the XML text.
            category = obj.find('name').text.strip()
            category_id = category_mapping[category]
            bbox = obj.find('bndbox')
            xmin = _voc_int(bbox.find('xmin').text)
            ymin = _voc_int(bbox.find('ymin').text)
            xmax = _voc_int(bbox.find('xmax').text)
            ymax = _voc_int(bbox.find('ymax').text)
            o_width = xmax - xmin
            o_height = ymax - ymin
            annotations.append({
                "id": annotation_id,
                "image_id": image_id,
                "category_id": category_id,
                # Box is stored as [x, y, width, height].
                "bbox": [xmin, ymin, o_width, o_height],
                "area": o_width * o_height,
                "iscrowd": 0,
            })
            annotation_id += 1

    coco_format = {
        "images": images,
        "annotations": annotations,
        "categories": categories,
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(coco_format, f)

# Example usage: read the dataset under ./Pascal_dataset_test/VOC2012/train
# and write the converted result to voc2012_train.json.
voc_dir, output_file = './Pascal_dataset_test/VOC2012/train', 'voc2012_train.json'
convert_voc_to_coco(voc_dir=voc_dir, output_file=output_file)
