In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

import json
from pprint import PrettyPrinter as pprint
from IPython.display import JSON
import re

In [27]:
class COCOData:
    """
        COCOData class allows DeepFigure dataset to be converted to COCO Format.

        Functions Available:

        1. read_src_folder(src_path, dest_path): Reads all the DeepFigure JSON label files from the provided parent path and stores the destination path for later use to save the converted labels.
        2. create_dict_layout(): Creates a basic layout for COCO format with basic static information.
        3. set_image_properties(file_name, image_id): Sets Image Properties. Used in convert_to_coco() function.
        4. set_caption_properties(object_dict, doc_object): Set Caption Properties. Used in set_object_properties() function.
        5. set_object_properties(doc_object, doc_object_id, image_id): Set the Object properties. Used in convert_to_coco() function.
        6. convert_to_coco(): Convert the source dataset to COCO format and store the converted data in coco_dictionary.
        7. save_coco_dataset(): Saves the converted dataset into the destination folder (Destination Folder was provided in read_src_folder function).
    """
    
    import os
    import numpy as np
    import matplotlib.pyplot as plt
    from pathlib import Path

    import json
    from pprint import PrettyPrinter as pprint
    from IPython.display import JSON
    import re
    from tqdm import tqdm
    import traceback
    
    def __init__(self):
        self.src_file_path = []
        self.coco_file_path = []
        self.src_dictionary = []
        self.coco_dictionary = []
    
    def read_src_folder(self, src_path, dest_path):
       """
       Stores the full path of the JSON files into self.src_file_path
       Stores the content of the source JSON files into self.coco_dictionary
       Stores the full path to the new files (in COCO format)
       """
       i=0
       # Fetch each JSON file from the folders
       for path in self.tqdm(self.Path(src_path).rglob('*.json'), desc="Loading Source Files"):
            if i == 100000:
                return
            i+=1
            # Open the file and read the content in JSON datatype
            file = open(path)
            json_file = self.json.load(file)

            # Prepare string for coco format json file
            coco_file_path = str(path).replace(".json", "-coco.json")
            coco_file_path = coco_file_path.replace("deepfigures-labels", "deepfigures-labels-coco")
            # coco_file_path = self.os.path.join(dest_path, self.re.split("jcdl-deepfigures-labels", coco_file_path)[1])
            coco_file_path = coco_file_path.replace(src_path, dest_path)

            self.src_file_path.append(str(path))
            self.src_dictionary.append(json_file)
            self.coco_file_path.append(str(coco_file_path))

    def create_dict_layout(self):
        temp_dict = {}
        temp_dict["info"] = {
            "year": "",
            "version": "1",
            "description": "",
            "contributor": "",
            "url": "",
            "date_created": "",
        }
        temp_dict["licenses"] = []
        temp_dict["categories"] = [
            {
                "id": 0,
                "name": "Table",
                "supercategory": ""
            },
            {
                "id": 1,
                "name": "Figure",
                "supercategory": ""
            }
        ]
        temp_dict["images"] = []
        temp_dict["annotations"] = []
        
        return temp_dict
    
    # Image denotes the image of a page where a set of objects exist
    def set_image_properties(self, file_name, image_id):
        # Get parent folder and the json file name separately.
        image_path, image_name = self.os.path.split(file_name)
        image_dict = {
            "id": image_id,
            "license": "",
            "file_name": image_name,
            "height": "",
            "width": "",
            "date_captured": "",
        } 
        return image_dict
     
    # Object denotes either a Table or Figure
    def set_object_properties(self, doc_object, doc_object_id, image_id):
        object_dict = {}
        object_dict["id"] = doc_object_id
        object_dict["image_id"] = image_id
        object_dict["iscrowd"] = 0
        object_dict["segmentation"] = []

        # Checks if the figure type is Table or Figure category
        # if doc_object["figure_type"] == "Figure":
        #     object_dict["category_id"] = 1
        # elif doc_object["figure_type"] == "Table":
        #     object_dict["category_id"] = 0
        
        object_dict["category_id"] = 1 if doc_object["figure_type"] == "Figure" else 0
        object_width = doc_object["figure_boundary"]["x2"] - doc_object["figure_boundary"]["x1"],
        object_height = doc_object["figure_boundary"]["y2"] - doc_object["figure_boundary"]["y1"],

        object_dict["bbox"] = [
            int(doc_object["figure_boundary"]["x1"]),
            int(doc_object["figure_boundary"]["y1"]),
            int(object_width[0]),
            int(object_height[0])
        ]
        object_dict["area"] = int(object_width[0] * object_height[0]) 
        self.set_caption_properties(object_dict, doc_object)
        
        return object_dict
    
    def set_caption_properties(self, object_dict, doc_object):
        object_dict["caption"] = doc_object["caption_text"]
                               
    def convert_to_coco(self):
        try:
            # Fetch each JSON file present in the folders
            for i in self.tqdm(range(len(self.src_file_path)), desc="Convering Source JSON to COCO JSON"):
                json_dict = self.create_dict_layout()
                json_dict["info"]["contributor"] = "PMC Open Access Subset" if (self.re.search("pmc", self.src_file_path[i]))  else "arXiv"
                json_dict["info"]["description"] = "Exported from PMC Open Access Subset" if (self.re.search("pmc", self.src_file_path[i]))  else "Exported from arXiv Open-access Archive"
                
                # Init Image ID
                image_id = 0
                # Init Object ID
                doc_object_id = 0
                
                # Each Image present in the file is fetched and added to a cocoData object
                for file_name, content in self.src_dictionary[i].items():
                    image_dict = self.set_image_properties(file_name, image_id)
                    for doc_object in content:
                        object_dict = self.set_object_properties(doc_object, doc_object_id, image_id)
                        # Add the object properties to the annotations key in COCO
                        json_dict["annotations"].append(object_dict)
                        # Increment the object ID for next annotated object in the file
                        doc_object_id += 1
                                                                 
                    # Increment the Image ID for the next Image in the file
                    image_id+=1
                    # Extract Image width and height if annotations exist. There has to be atleast one annotation for an image to have the dimension attributes.
                    if len(content) > 0:
                        image_dict["width"] = content[0]["page_width"]
                        image_dict["height"] = content[0]["page_height"]
                    # Add the image properties to the Images key in COCO
                    json_dict["images"].append(image_dict)

                self.coco_dictionary.append(json_dict)
        except:
            self.traceback.print_exc()

    # Converts final dictionary in COCO format for storing into file.
    def save_coco_dataset(self):
        try:
            for i in self.tqdm(range(len(self.coco_file_path))):
                coco_file_dir = self.os.path.split(self.coco_file_path[i])[0]
                if not self.os.path.exists(coco_file_dir):
                    # Creates the parent folder and all the subfolders for the file.
                    #   Does not throw an error if parent or any subfolders already exists.
                    self.Path(coco_file_dir).mkdir(parents=True, exist_ok=True)

                output_file = open(self.coco_file_path[i], mode="w")
                output_file.writelines(self.json.dumps(self.coco_dictionary[i], indent=4))
        except:
            self.traceback.print_exc()
        finally:
            output_file.close()

In [28]:
coco = COCOData()


In [29]:
coco.read_src_folder(src_path="/home/shivamsnaik/Downloads/jcdl-deepfigures-labels/", dest_path="/home/shivamsnaik/Downloads/jcdl-deepfigures-labels/")

Loading Source Files: 100000it [00:07, 12989.36it/s]


In [30]:
coco.convert_to_coco()

Convering Source JSON to COCO JSON: 100%|██████████| 100000/100000 [00:07<00:00, 12972.04it/s]


In [32]:
coco.coco_file_path[:10]

['/home/shivamsnaik/Downloads/jcdl-deepfigures-labels-coco/deepfigures-labels-coco/arxiv/figure-jsons/1203/1203.3974-coco.json',
 '/home/shivamsnaik/Downloads/jcdl-deepfigures-labels-coco/deepfigures-labels-coco/arxiv/figure-jsons/1203/1203.2412-coco.json',
 '/home/shivamsnaik/Downloads/jcdl-deepfigures-labels-coco/deepfigures-labels-coco/arxiv/figure-jsons/1203/1203.5948-coco.json',
 '/home/shivamsnaik/Downloads/jcdl-deepfigures-labels-coco/deepfigures-labels-coco/arxiv/figure-jsons/1203/1203.1375-coco.json',
 '/home/shivamsnaik/Downloads/jcdl-deepfigures-labels-coco/deepfigures-labels-coco/arxiv/figure-jsons/1203/1203.3836-coco.json',
 '/home/shivamsnaik/Downloads/jcdl-deepfigures-labels-coco/deepfigures-labels-coco/arxiv/figure-jsons/1203/1203.0012-coco.json',
 '/home/shivamsnaik/Downloads/jcdl-deepfigures-labels-coco/deepfigures-labels-coco/arxiv/figure-jsons/1203/1203.4190-coco.json',
 '/home/shivamsnaik/Downloads/jcdl-deepfigures-labels-coco/deepfigures-labels-coco/arxiv/figure-j

In [None]:
ls