In [10]:
import os
import json
import logging
import structlog

logger = structlog.getLogger(__name__)


def convert_nested_to_flat_bbox(bb):
    """
    bb : [[X1,Y1],[X2,Y2]] BBOX
    return: [X1,Y1,X2,Y2] BBOX
    """
    return [bb[0][0], bb[0][1], bb[1][0], bb[1][1]]



def convert_relative_to_abs_bbox(bbox, image_width, image_height):
    """
    Converts a given bbox with relative coordinates to absolute coordinates
    """
    return [
        [bbox[0][0] * image_width, bbox[0][1] * image_height],
        [bbox[1][0] * image_width, bbox[1][1] * image_height],
    ]



def convert_page_to_annotation_page(page, categories):
    document_id = page["file_name"].split("_")[0]
    url = f"images/{page['file_name']}"
    return {
        "width": page["page_width"],
        "height": page["page_height"],
        "file_name": url,
        "url": url,
        "operator_job_id": document_id,
        "image_id": url,
        "annotations": [
            {
                "class": categories.index(area["class"]),
                "bbox": convert_nested_to_flat_bbox(
                    convert_relative_to_abs_bbox(area["bbox"], page["page_width"], page["page_height"])
                ),
                "bbox_mode": 0,
                "category_id": categories.index(area["class"]),
            }
            for area in page["areas"]
        ],
    }

def load_data_from_disc(data_dir):
    categories = ["table", "footer", "header", "order_line_item_header", "order_line_item"]
    all_annotations = []
    json_data_dir = f"{data_dir}/training_data_json"
    for file in os.listdir(json_data_dir):
        if file.endswith(".json"):
            with open(os.path.join(json_data_dir, file)) as f:
                page = json.load(f)
                try:
                    all_annotations.append(convert_page_to_annotation_page(page, categories))
                except Exception as e:
                    logger.error(
                        f"Could not convert page (id: {page['file_name'].split('_')[0]}) to annotation page:\n {e}"
                    )

    return all_annotations, categories

In [11]:
data_dir = 'training_data'
all_annotations, categories=load_data_from_disc(data_dir)

2022-12-19 15:10.38 [error    ] Could not convert page (id: e945dc6a-686a-4776-bae9-072f41d71b5e) to annotation page:
 'other' is not in list
2022-12-19 15:10.48 [error    ] Could not convert page (id: 1e1c1c6e-38e8-4eec-b554-61563a2da962) to annotation page:
 'other' is not in list
2022-12-19 15:10.48 [error    ] Could not convert page (id: 15896fc1-58ac-4ea1-ac20-ec3177cc70ae) to annotation page:
 'other' is not in list
2022-12-19 15:11.01 [error    ] Could not convert page (id: 464fe721-db0e-41c4-87a7-05267d11ccb2) to annotation page:
 'other' is not in list
2022-12-19 15:11.02 [error    ] Could not convert page (id: 50653579-1bf0-46cb-b0fa-bafb27ab6e09) to annotation page:
 'other' is not in list
2022-12-19 15:11.04 [error    ] Could not convert page (id: c54c0074-8066-4d1b-9789-dd6ffd4ef2ec) to annotation page:
 'other' is not in list
2022-12-19 15:11.06 [error    ] Could not convert page (id: 71bfcec7-1107-4460-b5e9-e27d276a9790) to annotation page:
 'other' is not in list
2022-1

In [16]:
all_annotations[0:2]

[{'width': 1654,
  'height': 2339,
  'file_name': 'images/3dca9056-ee77-49c9-afef-6f5373498352_page_2.png',
  'url': 'images/3dca9056-ee77-49c9-afef-6f5373498352_page_2.png',
  'operator_job_id': '3dca9056-ee77-49c9-afef-6f5373498352',
  'image_id': 'images/3dca9056-ee77-49c9-afef-6f5373498352_page_2.png',
  'annotations': [{'class': 1,
    'bbox': [66.87915802001953,
     2030.6337890625,
     1518.9124755859375,
     2219.17138671875],
    'bbox_mode': 0,
    'category_id': 1},
   {'class': 0,
    'bbox': [86.63439053890606,
     793.5302474194966,
     1558.9188953097703,
     1545.5781052156458],
    'bbox_mode': 0,
    'category_id': 0},
   {'class': 3,
    'bbox': [111.22433140706895,
     793.5302474194966,
     1558.9188953097703,
     821.3129743790756],
    'bbox_mode': 0,
    'category_id': 3},
   {'class': 4,
    'bbox': [86.63439053890606,
     856.7053902528831,
     1529.4443085606774,
     907.6589114967512],
    'bbox_mode': 0,
    'category_id': 4},
   {'class': 4,
  

In [15]:
with open(str(data_dir) + '/all_annotations.json','w') as json_file:
    json.dump(all_annotations, json_file)
with open(str(data_dir) + '/categories.json','w') as json_file:
    json.dump(categories, json_file)