# In [None]:
import argparse
import json
from pathlib import Path
from typing import Any, Dict, List, Tuple
import math

from lxml import etree
from pdf2image import convert_from_path


# Dataset-level metadata; emitted verbatim as the "info" section of the
# generated COCO file.
COCO_INFO = {
    "year": 2014,
    "version": "1.0",
    "description": "The MS COCO format version of GROTOAP2.",
    "contributor": "Dominika Tkaczyk, Pawel Szostek and Łukasz Bolikowski",
    "url": "https://www.dlib.org/dlib/november14/tkaczyk/11tkaczyk.html",
}


# Emitted verbatim as the "licenses" section of the generated COCO file;
# a single license entry with id 1.
COCO_LICENSES = [{"id": 1, "name": "CC-BY License"}]


# Zone label -> COCO category id. Ids are 1-based and follow the order of the
# labels listed below, so new labels must be appended at the end to keep the
# existing ids stable.
COCO_CATEGORIES_MAP = {
    label: number
    for number, label in enumerate(
        (
            "BIB_INFO",
            "BODY_CONTENT",
            "REFERENCES",
            "AFFILIATION",
            "PAGE_NUMBER",
            "ABSTRACT",
            "AUTHOR",
            "DATES",
            "TITLE",
            "COPYRIGHT",
            "ACKNOWLEDGMENT",
            "UNKNOWN",
            "FIGURE",
            "CORRESPONDENCE",
            "CONFLICT_STATEMENT",
            "TABLE",
            "TYPE",
            "KEYWORDS",
            "EDITOR",
            "AUTHOR_TITLE",
            "GLOSSARY",
            "EQUATION",
        ),
        start=1,
    )
}


# The "categories" section of the COCO file: one entry per label above, in the
# same order, with an empty supercategory.
COCO_CATEGORIES = [
    {"id": number, "name": label, "supercategory": ""}
    for label, number in COCO_CATEGORIES_MAP.items()
]


# Resolution (dots per inch) used both when rendering PDF pages and when
# scaling annotation coordinates, so boxes line up with the rendered image.
DPI = 100


def pt_to_bbox_value(pt: float) -> float:
    """Scale a length given in points (1/72 inch) to pixels at ``DPI``."""
    scaled = DPI * pt
    return scaled / 72


def zone_to_annotation(zone_node: etree._Element) -> Dict[str, Any]:
    """Convert one ``Zone`` element into a COCO annotation dict.

    The label is read from ``Classification/Category/@Value`` and the bounding
    box from the ``Vertex`` children of ``ZoneCorners``. Vertex coordinates
    are given in points and are converted to pixels with ``pt_to_bbox_value``.

    The box is the axis-aligned extent of all listed corners, so the result
    does not depend on the order in which the corners appear and never has a
    negative width, height or area.

    Args:
        zone_node: A ``Zone`` element.

    Returns:
        A dict with ``category_id``, ``bbox`` (``[x, y, width, height]``),
        ``area`` and ``iscrowd``. ``id`` and ``image_id`` are not set here.

    Raises:
        KeyError: If the zone's label is not in ``COCO_CATEGORIES_MAP``.
        ValueError: If the zone lists fewer than two corners.
    """
    classification_label = (
        zone_node.find("Classification").find("Category").get("Value")
    )

    # The original vertex coordinates are given as pt values.
    corners = [
        (
            pt_to_bbox_value(float(vertex.get("x"))),
            pt_to_bbox_value(float(vertex.get("y"))),
        )
        for vertex in zone_node.find("ZoneCorners").findall("Vertex")
    ]
    if len(corners) < 2:
        raise ValueError(
            f"Zone needs at least two corners to form a box, got {len(corners)}"
        )

    xs, ys = zip(*corners)
    x1, x2 = min(xs), max(xs)
    y1, y2 = min(ys), max(ys)

    # Origin is truncated and size is rounded up, exactly as before, so
    # output for well-formed (top-left, bottom-right) zones is unchanged.
    w = math.ceil(x2 - x1)
    h = math.ceil(y2 - y1)

    return {
        "category_id": COCO_CATEGORIES_MAP[classification_label],
        "bbox": [int(x1), int(y1), w, h],
        "area": w * h,
        "iscrowd": 0,
    }


def read_annotations_from_xml(xml_path: Path) -> List[Dict[str, Any]]:
    """Read the annotations of the first page from an annotation XML file.

    Only the first ``Page`` element under the document root is used; each of
    its ``Zone`` children is converted with ``zone_to_annotation``.

    Args:
        xml_path: Path of the XML file to parse.

    Returns:
        One annotation dict per ``Zone`` on the first page.

    Raises:
        AssertionError: If the first ``Page`` element's ``PageID`` value is
            not ``"0"``.
    """
    root = etree.parse(xml_path).getroot()

    first_page_node = root.find("Page")
    page_id = first_page_node.find("PageID").get("Value")
    if page_id != "0":
        # Raised explicitly instead of via ``assert`` so the check still runs
        # under ``python -O``; the exception type is kept for compatibility.
        raise AssertionError(
            f"First page node is NOT corresponding to the first page in the article: {xml_path}"
        )

    return [
        zone_to_annotation(zone) for zone in first_page_node.findall("Zone")
    ]


def convert_pdf_first_page_to_image(
    pdf_path: Path, output_image_path: Path
) -> Dict[str, Any]:
    """Render the first page of a PDF at ``DPI`` and save it as an image.

    Args:
        pdf_path: PDF file to render.
        output_image_path: Where the rendered page is written. No explicit
            format is passed when saving, so the on-disk format is left to
            the imaging library (presumably inferred from the file name).

    Returns:
        A dict with the image ``width``, ``height`` and ``file_name`` (the
        base name of ``output_image_path``), i.e. the start of a COCO image
        entry; ``id`` is not set here.

    Raises:
        AssertionError: If the output file does not exist after saving.
    """
    image = convert_from_path(
        pdf_path, dpi=DPI, first_page=1, last_page=1, fmt="jpeg"
    )[0]
    # Encoded image data is bytes, so the handle must be opened in binary
    # mode; a text-mode handle is not a valid target for it.
    with output_image_path.open("wb") as fw:
        image.save(fw)
    assert (
        output_image_path.exists()
    ), f"Cannot find converted image for this PDF: {pdf_path}"

    return {
        "width": image.width,
        "height": image.height,
        "file_name": output_image_path.name,
    }


def convert_one_article_to_image_and_annotations(
    article_id: str, pdf_path: Path, xml_path: Path, image_output_folder: Path
) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]:
    """Render an article's first page and load that page's annotations.

    The image is written to ``image_output_folder`` as ``<stem>_0.jpg``, where
    ``<stem>`` is the stem of ``article_id`` treated as a path (its final
    component without suffix).

    Returns:
        A ``(image, annotations)`` pair: the image description returned by
        ``convert_pdf_first_page_to_image`` and the annotation list returned
        by ``read_annotations_from_xml``.
    """
    image_name = f"{Path(article_id).stem}_0.jpg"
    target_path = image_output_folder / image_name

    # The page is rendered before the XML is read, as in the original flow.
    image_info = convert_pdf_first_page_to_image(pdf_path, target_path)
    annotations = read_annotations_from_xml(xml_path)
    return image_info, annotations


def convert_articles_to_coco_data(
    articles_id: List[str], root_pdf_xml_dir: Path, image_output_folder: Path
) -> Dict[str, Any]:
    """Build a COCO-format dataset dict from a list of article ids.

    For each id, ``<root_pdf_xml_dir>/<id>.pdf`` is rendered to an image in
    ``image_output_folder`` and ``<root_pdf_xml_dir>/<id>.cxml`` supplies the
    annotations. Articles whose PDF is missing or empty, or whose XML is
    missing, are reported on stdout and skipped; they consume no ids.

    Args:
        articles_id: Article ids, relative to ``root_pdf_xml_dir`` and
            without file extension.
        root_pdf_xml_dir: Directory holding the ``.pdf`` / ``.cxml`` pairs.
        image_output_folder: Directory the rendered images are written to.

    Returns:
        A dict with the ``info``, ``licenses``, ``categories``, ``images`` and
        ``annotations`` sections. Image and annotation ids are consecutive
        integers starting at 1.
    """
    images: List[Dict[str, Any]] = []
    annotations: List[Dict[str, Any]] = []

    for article_id in articles_id:
        pdf_path = root_pdf_xml_dir.joinpath(f"{article_id}.pdf")
        xml_path = root_pdf_xml_dir.joinpath(f"{article_id}.cxml")
        if not pdf_path.exists():
            print(f"Cannot find PDF for this article: {pdf_path}")
            continue
        if pdf_path.stat().st_size == 0:
            print(f"This PDF file is empty, skip it: {pdf_path}")
            continue
        # Checked before rendering so a missing annotation file neither
        # aborts the whole run nor leaves an orphan image behind.
        if not xml_path.exists():
            print(f"Cannot find XML for this article: {xml_path}")
            continue

        image, anns = convert_one_article_to_image_and_annotations(
            article_id, pdf_path, xml_path, image_output_folder
        )

        # Ids are derived from what has been collected so far, so skipped
        # articles leave no gaps.
        image_id = len(images) + 1
        image["id"] = image_id
        for ann in anns:
            ann["id"] = len(annotations) + 1
            ann["image_id"] = image_id
            annotations.append(ann)
        images.append(image)

    return {
        "info": COCO_INFO,
        "licenses": COCO_LICENSES,
        "categories": COCO_CATEGORIES,
        "images": images,
        "annotations": annotations,
    }


# Command-line entry point: read article ids, convert them, and write the
# resulting COCO dataset as JSON.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--articles_id_file",
        help="Article id should contain data segments as well, one example will be '99/3227843'",
        required=True,
    )

    parser.add_argument("--root_pdf_xml_dir", required=True)

    parser.add_argument("--image_output_folder", required=True)

    parser.add_argument("--output_coco_file", required=True)

    # parse_known_args: unrecognized command-line arguments are ignored
    # rather than rejected.
    args, _ = parser.parse_known_args()

    articles_id_file = Path(args.articles_id_file)
    with articles_id_file.open() as fp:
        # One id per line; blank lines are dropped so they do not turn into
        # empty article ids.
        articles_id = [line.strip() for line in fp if line.strip()]
    root_pdf_xml_dir = Path(args.root_pdf_xml_dir)
    image_output_folder = Path(args.image_output_folder)
    # Images are written straight into this folder, so make sure it exists.
    image_output_folder.mkdir(parents=True, exist_ok=True)

    coco_data = convert_articles_to_coco_data(
        articles_id, root_pdf_xml_dir, image_output_folder
    )
    output_coco_file = Path(args.output_coco_file)
    with output_coco_file.open("w") as fw:
        json.dump(coco_data, fw)