## Generating Document Level Json

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


1. Standardize the format of json files from different annotators:

In [None]:
import json
import os

# Function to standardize the format of file name from different annotators
def modify_file_name(file_name, separator='-'):
    """Rename ``<main><separator><N>[.ext]`` to ``<main>_page-<N-1>[.ext]``.

    Args:
        file_name: Image file name whose stem ends with a 1-based page number.
        separator: Character that precedes the page number in the stem.
            Defaults to ``'-'``, the convention handled by this cell.

    Returns:
        The standardized name, with the page number shifted down by one and
        written as ``_page-<number>``; the extension (if any) is kept as is.

    Raises:
        ValueError: If the stem has no ``separator`` or the text after it is
            not an integer.
    """
    # Split off the extension at the last dot (there may be none)
    stem, dot, extension = file_name.rpartition('.')
    if not dot:
        stem, extension = file_name, ''

    # Split the stem at the last separator to isolate the page number
    filename_main, found, page_number = stem.rpartition(separator)
    if not found:
        raise ValueError(
            f"{file_name!r} has no {separator!r} before the page number")
    try:
        new_page_number = "page-" + str(int(page_number) - 1)
    except ValueError:
        raise ValueError(
            f"{file_name!r} does not end with a numeric page number") from None

    new_filename = f"{filename_main}_{new_page_number}"

    # If there was an extension, add it back
    if dot:
        new_filename += '.' + extension

    return new_filename

# Function to modify json data
def modify_json(data):
    """Standardize the ``file_name`` of every entry in ``data["images"]``.

    The entries are updated in place and the same ``data`` object is returned.
    """
    for image_entry in data["images"]:
        standardized_name = modify_file_name(image_entry["file_name"])
        image_entry["file_name"] = standardized_name
    return data

input_folder_path = '/content/drive/MyDrive/data/annotation/annotated_jsons_prashitha/'
output_folder_path = '/content/drive/MyDrive/data/annotation/modified_annotated_jsons/'

# Create the destination folder up front (no error if it already exists)
os.makedirs(output_folder_path, exist_ok=True)

# Rewrite every JSON export of this annotator with standardized image names
for file_name in os.listdir(input_folder_path):
    # Anything that is not a JSON file is left alone
    if not file_name.endswith('.json'):
        continue

    input_file_path = os.path.join(input_folder_path, file_name)
    output_file_path = os.path.join(output_folder_path, file_name)

    # Read the original annotations
    with open(input_file_path, 'r') as file:
        data = json.load(file)

    # Standardize the image file names
    modified_data = modify_json(data)

    # Store the result under the same file name in the output folder
    with open(output_file_path, 'w') as file:
        json.dump(modified_data, file, indent=4)



In [None]:
import json
import os

# Function to standardize the format of file name from different annotators
def modify_file_name(file_name, separator='_'):
    """Rename ``<main><separator><N>[.ext]`` to ``<main>_page-<N-1>[.ext]``.

    Args:
        file_name: Image file name whose stem ends with a 1-based page number.
        separator: Character that precedes the page number in the stem.
            Defaults to ``'_'``, the convention handled by this cell.

    Returns:
        The standardized name, with the page number shifted down by one and
        written as ``_page-<number>``; the extension (if any) is kept as is.

    Raises:
        ValueError: If the stem has no ``separator`` or the text after it is
            not an integer.
    """
    # Split off the extension at the last dot (there may be none)
    stem, dot, extension = file_name.rpartition('.')
    if not dot:
        stem, extension = file_name, ''

    # Split the stem at the last separator to isolate the page number
    filename_main, found, page_number = stem.rpartition(separator)
    if not found:
        raise ValueError(
            f"{file_name!r} has no {separator!r} before the page number")
    try:
        new_page_number = "page-" + str(int(page_number) - 1)
    except ValueError:
        raise ValueError(
            f"{file_name!r} does not end with a numeric page number") from None

    new_filename = f"{filename_main}_{new_page_number}"

    # If there was an extension, add it back
    if dot:
        new_filename += '.' + extension

    return new_filename

# Function to modify json data
def modify_json(data):
    """Standardize the ``file_name`` of every entry in ``data["images"]``.

    The entries are updated in place and the same ``data`` object is returned.
    """
    for image_entry in data["images"]:
        standardized_name = modify_file_name(image_entry["file_name"])
        image_entry["file_name"] = standardized_name
    return data

input_folder_path = '/content/drive/MyDrive/data/annotation/annotated_jsons_yan/'
output_folder_path = '/content/drive/MyDrive/data/annotation/modified_annotated_jsons/'

# Create the destination folder up front (no error if it already exists)
os.makedirs(output_folder_path, exist_ok=True)

# Rewrite every JSON export of this annotator with standardized image names
for file_name in os.listdir(input_folder_path):
    # Anything that is not a JSON file is left alone
    if not file_name.endswith('.json'):
        continue

    input_file_path = os.path.join(input_folder_path, file_name)
    output_file_path = os.path.join(output_folder_path, file_name)

    # Read the original annotations
    with open(input_file_path, 'r') as file:
        data = json.load(file)

    # Standardize the image file names
    modified_data = modify_json(data)

    # Store the result under the same file name in the output folder
    with open(output_file_path, 'w') as file:
        json.dump(modified_data, file, indent=4)



2. Merge all the annotated json files:

In [None]:
import json
import glob
# This cell merges all standardized annotation JSON files into a single dictionary.
# Glob pattern matching every JSON file in the folder written by the standardization cells.
path_to_json = '/content/drive/MyDrive/data/annotation/modified_annotated_jsons/*.json'

def _claim_id(preferred, used):
    """Return ``preferred`` if it is still free in ``used``, else a fresh integer id.

    The id that is handed out is added to ``used`` before returning.
    """
    if preferred in used:
        preferred = max((i for i in used if isinstance(i, int)), default=0) + 1
    used.add(preferred)
    return preferred


def merge_jsons(json_files):
    """Merge several annotation JSON files into one dictionary.

    Every file must provide the keys ``info``, ``categories``, ``images`` and
    ``annotations``. Ids are only unique inside one file, so plain
    concatenation lets records from different files share an id. To keep the
    merged data consistent:

    * an image or annotation keeps its id when that id is still free and gets
      a new one otherwise;
    * categories are matched across files by name (the k-th category with a
      given name in one file is the k-th with that name in every file) and
      stored once;
    * ``image_id`` and ``category_id`` of each annotation are rewritten to the
      ids used in the merged result.

    Merging a single file therefore returns its records unchanged.

    Args:
        json_files: Iterable of paths to the JSON files to merge.

    Returns:
        dict with the list-valued keys ``info``, ``categories``, ``images``
        and ``annotations``.

    Raises:
        KeyError: If a file lacks one of the four top-level keys, or a
            category/image/annotation lacks ``id`` (or a category ``name``).
    """
    merged_json = {'info': [], 'categories': [], 'images': [], 'annotations': []}

    used_category_ids = set()
    used_image_ids = set()
    used_annotation_ids = set()
    merged_category_ids = {}  # (name, occurrence within a file) -> merged category id

    for file in json_files:
        with open(file, 'r') as f:
            data = json.load(f)

        merged_json['info'].extend(data['info'])

        # Categories: reuse the merged id of an equally named category
        category_remap = {}
        seen_in_file = {}
        for category in data['categories']:
            name = category['name']
            occurrence = seen_in_file.get(name, 0)
            seen_in_file[name] = occurrence + 1
            key = (name, occurrence)
            if key not in merged_category_ids:
                merged_category_ids[key] = _claim_id(category['id'], used_category_ids)
                merged_json['categories'].append(dict(category, id=merged_category_ids[key]))
            category_remap[category['id']] = merged_category_ids[key]

        # Images: keep the id when possible, remember every remapping
        image_remap = {}
        for image in data['images']:
            new_image_id = _claim_id(image['id'], used_image_ids)
            image_remap[image['id']] = new_image_id
            merged_json['images'].append(dict(image, id=new_image_id))

        # Annotations: fresh id if needed, and follow the image/category remaps
        for annotation in data['annotations']:
            merged_annotation = dict(
                annotation, id=_claim_id(annotation['id'], used_annotation_ids))
            if 'image_id' in merged_annotation:
                old_image_id = merged_annotation['image_id']
                merged_annotation['image_id'] = image_remap.get(old_image_id, old_image_id)
            if 'category_id' in merged_annotation:
                old_category_id = merged_annotation['category_id']
                merged_annotation['category_id'] = category_remap.get(old_category_id, old_category_id)
            merged_json['annotations'].append(merged_annotation)

    return merged_json

def save_merged_json(merged_json, output_file):
    """Serialize ``merged_json`` as JSON text into ``output_file`` (overwriting it)."""
    with open(output_file, 'w') as handle:
        handle.write(json.dumps(merged_json))

# Retrieve all .json files from the specified folder. glob returns them in
# arbitrary order, so sort the paths to make the merged file reproducible.
json_files = sorted(glob.glob(path_to_json))

# Merge the JSON files
merged_json = merge_jsons(json_files)

# Specify the path to save the merged JSON file
output_file_path = '/content/drive/MyDrive/data/annotation/merged_jsons.json'

# Save the merged JSON
save_merged_json(merged_json, output_file_path)



3. Load the merged document-level annotations saved as "merged_jsons.json" and check their top-level keys:

In [None]:
# Read the merged annotation file back from Drive into `anno_json`.
with open('/content/drive/MyDrive/data/annotation/merged_jsons.json') as f:
  anno_json = json.load(f)
# Show the top-level keys of the loaded dictionary.
print(anno_json.keys())

dict_keys(['info', 'categories', 'images', 'annotations'])


## Reading order functions

In [None]:
def detect_left(bbox,box_list):
  """Tell whether another box lies completely to the left of ``bbox`` on the same rows.

  Boxes are ``[x, y, width, height]`` lists. A box counts when it starts left
  of ``bbox``, its right edge (``x + width``) ends before ``bbox`` starts, and
  its vertical extent is not entirely above or below that of ``bbox``.

  Returns:
      True if such a box exists in ``box_list``, otherwise False.
  """
  upper_y = bbox[1]
  lower_y = bbox[1]+bbox[3]
  for box in box_list:
    if box == bbox:
      continue  # a box is never its own neighbour
    if box[0]<bbox[0] and abs(bbox[0]-box[0])-box[2] > 0:
      if not (box[1]+box[3]<upper_y or box[1]>lower_y):
        return True
  return False

def detect_right(bbox,box_list):
  """Tell whether another box lies completely to the right of ``bbox`` on the same rows.

  Mirror image of ``detect_left``: the other box must start beyond the right
  edge of ``bbox`` (``bbox`` x plus ``bbox`` width) and overlap it vertically.

  Returns:
      True if such a box exists in ``box_list``, otherwise False.
  """
  upper_y = bbox[1]
  lower_y = bbox[1]+bbox[3]
  for box in box_list:
    if box == bbox:
      continue
    if box[0]>bbox[0] and abs(bbox[0]-box[0])-bbox[2] > 0:
      if not (box[1]+box[3]<upper_y or box[1]>lower_y):
        return True
  return False

def y_order(box_list):
  """Sort ``box_list`` in place by the top y coordinate and return the same list."""
  box_list.sort(key=lambda x:x[1])
  return box_list

def reading_order(box_list):
  """Order ``[x, y, width, height]`` boxes top-to-bottom, left column before right column.

  The boxes are visited from top to bottom. A box with a neighbour on its
  right belongs to a left-hand column, a box with a neighbour on its left to a
  right-hand column; both kinds are collected until a box without horizontal
  neighbours is met. At that point the collected columns are emitted (left
  column first, each ordered recursively) and only then the box itself, since
  the collected boxes start above it.

  Note: ``box_list`` is sorted in place by ``y_order``.

  Returns:
      A new list with the boxes in reading order (empty for empty input).
  """
  if len(box_list) == 0:
    return []
  ordered_list = y_order(box_list)
  result_list = []
  left_box = []   # boxes that have a neighbour on their left  -> right-hand column
  right_box = []  # boxes that have a neighbour on their right -> left-hand column
  for box in ordered_list:
    if detect_left(box,box_list):
      left_box.append(box)
    elif detect_right(box,box_list):
      right_box.append(box)
    else:
      # Flush the pending columns before the box that follows them,
      # in the same column order as the final flush below.
      result_list.extend(reading_order(right_box))
      result_list.extend(reading_order(left_box))
      result_list.append(box)
      left_box = []
      right_box = []
  result_list.extend(reading_order(right_box))
  result_list.extend(reading_order(left_box))
  return result_list

In [None]:
def reading_order_id(obj_dict,ro_list):
  """Translate an ordered list of bounding boxes back into object ids.

  Args:
      obj_dict: Mapping of object id to a dict holding ``'bbox'`` and
          ``'category_id'``.
      ro_list: Bounding boxes in reading order.

  Returns:
      ``(ordered_id, ordered_label)``: the ids of the matching objects and
      their ``category_id`` values, both in the order of ``ro_list``.
      Each object is used at most once, so objects that share an identical
      bbox are not emitted repeatedly. Boxes without a matching object are
      skipped.
  """
  ordered_id = []
  ordered_label = []
  assigned = set()  # ids already matched to an earlier box
  for box in ro_list:
    for obj in obj_dict:
      if obj not in assigned and obj_dict[obj]['bbox'] == box:
        assigned.add(obj)
        ordered_id.append(obj)
        ordered_label.append(obj_dict[obj]['category_id'])
        break
  return ordered_id, ordered_label

In [None]:
# Inspect the image records: number of fields and field names of the first
# entry, then the full second entry as the cell output.
print(len(anno_json['images'][0].keys()))
print(anno_json['images'][0].keys())
anno_json['images'][1]

9
dict_keys(['id', 'datatorch_id', 'storage_id', 'path', 'width', 'height', 'file_name', 'metadata', 'date_captured'])


{'id': 2,
 'datatorch_id': 'acf7f749-43ec-4961-8bd0-45c64e799193',
 'storage_id': 'b030e81e-0259-4f73-84ac-33ab52ab8d91',
 'path': 'a079127_c137_2008_bow river2008a_11669409-03.png',
 'width': 1700,
 'height': 2200,
 'file_name': 'a079127_c137_2008_bow river2008a_11669409_page-2.png',
 'metadata': {},
 'date_captured': '2023-09-19T02:57:18.303Z'}

In [None]:
file_json = {} # bounding box information for each page of a report, keyed by report name
for img in anno_json['images']:
  img_name = img['file_name']
  # '<report>_page-<n>.<ext>' -> report name and integer page number
  file_name = img_name.split('_page-')[0]
  page_id = int(img_name.split('_page-')[1].split('.')[0])
  # Create the report entry the first time it is seen, then register the page
  # (a single code path instead of two identical if/else branches).
  doc_pages = file_json.setdefault(file_name, {'page': {}})['page']
  doc_pages[page_id] = {
    'objects': {},
    'width': img['width'],
    'height': img['height'],
    'image_name': img['file_name'],
    'image_id': img['id'],
    'box_list': [],
  }
# Reordering page dictionary keys based on natural page order
new_file_json = {}
for doc in file_json:
  pages = sorted(file_json[doc]['page'])
  new_file_json[doc] = {}
  new_file_json[doc]['page'] = {}
  for page in pages:
    new_file_json[doc]['page'][page] = file_json[doc]['page'][page]
print(len(new_file_json))

1


In [None]:
# Create a lookup dictionary for images: image id -> full image record
img_id_dict = {img['id']: img for img in anno_json['images']}
# Prints the whole mapping (one entry per image)
print(img_id_dict)

{1: {'id': 1, 'datatorch_id': '2c091924-70e4-461c-8c9a-25fa3598edf3', 'storage_id': 'b030e81e-0259-4f73-84ac-33ab52ab8d91', 'path': 'a079127_c137_2008_bow river2008a_11669409-02.png', 'width': 1700, 'height': 2200, 'file_name': 'a079127_c137_2008_bow river2008a_11669409_page-1.png', 'metadata': {}, 'date_captured': '2023-09-19T02:57:19.162Z'}, 2: {'id': 2, 'datatorch_id': 'acf7f749-43ec-4961-8bd0-45c64e799193', 'storage_id': 'b030e81e-0259-4f73-84ac-33ab52ab8d91', 'path': 'a079127_c137_2008_bow river2008a_11669409-03.png', 'width': 1700, 'height': 2200, 'file_name': 'a079127_c137_2008_bow river2008a_11669409_page-2.png', 'metadata': {}, 'date_captured': '2023-09-19T02:57:18.303Z'}, 3: {'id': 3, 'datatorch_id': '9fca344b-b94f-4423-b2d9-6e7c64ba6271', 'storage_id': 'b030e81e-0259-4f73-84ac-33ab52ab8d91', 'path': 'a079127_c137_2008_bow river2008a_11669409-05.png', 'width': 1700, 'height': 2200, 'file_name': 'a079127_c137_2008_bow river2008a_11669409_page-4.png', 'metadata': {}, 'date_capt

In [None]:
# Create a lookup dictionary for categories: category id -> category name.
# If several categories share an id, the last one in the list wins.
cate_id_dict = {category['id']: category['name'] for category in anno_json['categories']}

In [None]:
cate_id_dict

{1: 'references',
 2: 'appendix_list',
 3: 'list',
 4: 'form_body',
 5: 'form_title',
 6: 'cross',
 7: 'table_caption',
 8: 'table',
 9: 'figure_caption',
 10: 'figure',
 11: 'list_of_tables',
 12: 'list_of_figures',
 13: 'table_of_contents',
 14: 'subsubsubsubsubsection',
 15: 'subsubsubsubsubsection',
 16: 'subsubsubsection',
 17: 'subsubsection',
 18: 'subsection',
 19: 'summary',
 20: 'abstract',
 21: 'section',
 22: 'paragraph',
 23: 'title',
 24: 'other',
 25: 'form',
 26: 'report_title'}

In [None]:
# Attach every annotation to the page of the report it belongs to.
for objt in anno_json['annotations']:
  img_name = img_id_dict[objt['image_id']]['file_name']
  name_parts = img_name.split('_page-')
  file_name = name_parts[0]
  page_id = int(name_parts[1].split('.')[0])
  page_entry = new_file_json[file_name]['page'][page_id]
  page_entry['box_list'].append(objt['bbox'])
  page_entry['objects'][objt['id']] = {
    'bbox': objt['bbox'],
    'segmentation': objt['segmentation'],
    'category_id': objt['category_id'],
    'category': cate_id_dict[objt['category_id']],
    'page': page_id,
    'relations': {
      'child': [],
      'parent': [],
      'above': [],      # For cross-page bottom component
      'following': [],  # For cross-page top component
      'context': [],
    },
  }

In [None]:
# Flatten all document components into one list per document, following the reading order.
for doc in new_file_json:
  doc_entry = new_file_json[doc]
  doc_entry['components'] = []
  doc_entry['object_page_list'] = []
  doc_entry['ordered_id'] = []
  doc_entry['ordered_label'] = []
  for page in doc_entry['page']:
    page_entry = doc_entry['page'][page]
    box_list = page_entry['box_list']
    ro_list = reading_order(box_list)
    reordered_id,_ = reading_order_id(page_entry['objects'],ro_list)
    # Remember the per-page order, then append the objects to the document-level lists
    page_entry['reading_order'] = reordered_id
    for objt_id in reordered_id:
      component = page_entry['objects'][objt_id]
      doc_entry['components'].append(component)
      doc_entry['object_page_list'].append(component['page'])
      doc_entry['ordered_id'].append(objt_id)

In [None]:
# Show the flattened components of the last document visited by the loop above
# (`doc` is the loop variable left over from that cell).
new_file_json[doc]['components']

[{'bbox': [24.8144728332124,
   78.2993028769126,
   2273.85418208303,
   3208.38326351056],
  'segmentation': [[24.8144728332124,
    78.2993028769126,
    2298.6686549162423,
    78.2993028769126,
    2298.6686549162423,
    3286.6825663874724,
    24.8144728332124,
    3286.6825663874724]],
  'category_id': 13,
  'category': 'figure',
  'page': 0,
  'relations': {'child': [],
   'parent': [],
   'above': [],
   'following': [],
   'context': []}},
 {'bbox': [147.226975091042,
   83.0534947924253,
   1338.82520145666,
   671.178860888565],
  'segmentation': [[147.226975091042,
    83.0534947924253,
    1486.052176547702,
    83.0534947924253,
    1486.052176547702,
    754.2323556809904,
    147.226975091042,
    754.2323556809904]],
  'category_id': 3,
  'category': 'table_of_contents',
  'page': 0,
  'relations': {'child': [],
   'parent': [],
   'above': [],
   'following': [],
   'context': []}},
 {'bbox': [197.971520466132,
   158.135905863074,
   1289.36991697014,
   395.642275

# OCR and Matching

1. Read textline from pdfminer processing results:

In [None]:
import os
# File names of the pdfminer text-line results in the textline folder.
json_list = os.listdir('/content/drive/MyDrive/data/pdfminer_processing/textline')

In [None]:
# Number of entries found in the textline folder
print(len(json_list))

2179


In [None]:
# Example file name; the pattern is '<report>_page-<n>.json'
json_list[0]

'a072600_m38_0008_2006a_15492738_page-1.json'

In [None]:
# Load one text-line file to inspect its structure.
with open('/content/drive/MyDrive/data/pdfminer_processing/textline/'+json_list[0]) as f:
  check = json.load(f)

In [None]:
# Dump the loaded example (prints the whole dictionary)
print(check)

{'a072600_m38_0008_2006a_15492738': {'0': {'LTTextBox': {'bbox': [126, 127, 129, 111], 'text': ' \n'}}, '1': {'LTTextBox': {'bbox': [90, 127, 93, 111], 'text': ' \n'}}, '2': {'LTTextBox': {'bbox': [90, 140, 93, 124], 'text': ' \n'}}, '3': {'LTTextBox': {'bbox': [90, 153, 165, 138], 'text': 'Project Name:  \n'}}, '4': {'LTTextBox': {'bbox': [90, 166, 93, 151], 'text': ' \n'}}, '5': {'LTTextBox': {'bbox': [90, 178, 93, 164], 'text': ' \n'}}, '6': {'LTTextBox': {'bbox': [90, 191, 93, 176], 'text': ' \n'}}, '7': {'LTTextBox': {'bbox': [90, 204, 93, 189], 'text': ' \n'}}, '8': {'LTTextBox': {'bbox': [90, 216, 152, 202], 'text': 'Report Title: \n'}}, '9': {'LTTextBox': {'bbox': [90, 242, 93, 227], 'text': ' \n'}}, '10': {'LTTextBox': {'bbox': [90, 254, 93, 240], 'text': ' \n'}}, '11': {'LTTextBox': {'bbox': [90, 267, 93, 252], 'text': ' \n'}}, '12': {'LTTextBox': {'bbox': [90, 279, 93, 265], 'text': ' \n'}}, '13': {'LTTextBox': {'bbox': [90, 292, 129, 278], 'text': 'Author:  \n'}}, '14': {'L

In [None]:
from tqdm import tqdm
# Attach the pdfminer text lines of every annotated page to its page entry.
for doc in tqdm(new_file_json):
  print(doc)
  for page in new_file_json[doc]['page']:
    path = '/content/drive/MyDrive/data/pdfminer_processing/textline/'+doc+"_page-" + str(page) + ".json"
    try:
      with open(path) as f:
        text_line = json.load(f)
    except (OSError, ValueError) as err:
      # Best effort: a missing or unreadable text-line file must not stop the
      # run, but report which page failed and why (ValueError covers invalid
      # JSON and decoding errors) instead of swallowing every exception.
      print(f"{doc}: could not load {path} ({err})")
      continue
    new_file_json[doc]['page'][page]['pdfminer_textline'] = text_line
    print(text_line)

100%|██████████| 1/1 [00:00<00:00, 102.84it/s]

a079201_e47-1427_2008 s_14830539
a079201_e47-1427_2008 s_14830539
a079201_e47-1427_2008 s_14830539
a079201_e47-1427_2008 s_14830539
a079201_e47-1427_2008 s_14830539
a079201_e47-1427_2008 s_14830539
a079201_e47-1427_2008 s_14830539
a079201_e47-1427_2008 s_14830539
a079201_e47-1427_2008 s_14830539
a079201_e47-1427_2008 s_14830539
a079201_e47-1427_2008 s_14830539
a079201_e47-1427_2008 s_14830539
a079201_e47-1427_2008 s_14830539
a079201_e47-1427_2008 s_14830539
a079201_e47-1427_2008 s_14830539





In [None]:
# Persist the per-document structure (pages, objects, attached pdfminer text lines) to Drive.
# NOTE(review): this runs before the cells below match text lines to objects, although the
# file name says "matched" — confirm that this is the intended save point.
with open('/content/drive/MyDrive/data/annotation/annotation_textline_matched_text.json','w') as f:
  json.dump(new_file_json,f)

In [None]:
def get_intersection_rate(bbox1, bbox2, h):
  """Return the share of ``bbox2``'s area that is covered by ``bbox1``.

  Args:
      bbox1: Box given as ``[x, y, width, height]``.
      bbox2: Box given as four corner values ``[x0, y_a, x1, y_b]``; indices
          1 and 3 are swapped before use, i.e. ``y_b`` is taken as the upper
          and ``y_a`` as the lower edge.
      h: Accepted but not used by the computation.

  Returns:
      ``0`` when the boxes do not overlap, otherwise the intersection area
      divided by the area of ``bbox2``.
  """
  # Corner form of the first box
  left1, top1 = bbox1[0], bbox1[1]
  right1, bottom1 = bbox1[0]+bbox1[2], bbox1[1]+bbox1[3]
  # Corner form of the second box (y values swapped)
  left2, top2, right2, bottom2 = bbox2[0], bbox2[3], bbox2[2], bbox2[1]
  line_area = (bottom2-top2) * (right2-left2)

  overlap_height = min(bottom1, bottom2) - max(top1, top2)
  overlap_width = min(right1, right2) - max(left1, left2)

  if overlap_height <= 0 or overlap_width <= 0:
    return 0
  return (overlap_height*overlap_width) / line_area

In [None]:
# Give every annotated object an empty container for its matched text lines.
for doc in new_file_json:
  for page in new_file_json[doc]['page']:
    page_objects = new_file_json[doc]['page'][page]['objects']
    for obj in page_objects:
      objt = page_objects[obj]
      objt['textline'] = {'lines': {}, 'bbox': [], 'ordered_list': []}

In [None]:
# Show the objects of the last page visited by the loop above
# (`doc` and `page` are that loop's leftover variables).
new_file_json[doc]['page'][page]['objects']


{6: {'bbox': [199.349355863074,
   167.756704151493,
   1066.9510199563,
   71.7680058266569],
  'segmentation': [[199.349355863074,
    167.756704151493,
    1266.3003758193738,
    167.756704151493,
    1266.3003758193738,
    239.52470997814993,
    199.349355863074,
    239.52470997814993]],
  'category_id': 8,
  'category': 'section',
  'page': 8,
  'relations': {'child': [],
   'parent': [],
   'above': [],
   'following': [],
   'context': []},
  'textline': {'lines': {}, 'bbox': [], 'ordered_list': []}},
 22: {'bbox': [60.5978779315378,
   689.270879825201,
   1057.38195251275,
   440.177102403496],
  'segmentation': [[60.5978779315378,
    689.270879825201,
    1117.979830444288,
    689.270879825201,
    1117.979830444288,
    1129.447982228697,
    60.5978779315378,
    1129.447982228697]],
  'category_id': 7,
  'category': 'paragraph',
  'page': 8,
  'relations': {'child': [],
   'parent': [],
   'above': [],
   'following': [],
   'context': []},
  'textline': {'lines': {}

In [None]:
# Assign every pdfminer text line to the annotated object that covers most of it.
# Documents with at least one page lacking text lines are collected in issue_doc_list.
issue_doc_list = []
for doc in new_file_json:
  for page in new_file_json[doc]['page']:
    page_entry = new_file_json[doc]['page'][page]
    if 'pdfminer_textline' in page_entry.keys():
      textline = page_entry['pdfminer_textline'][doc]
      height = page_entry['height']  # same for every object on the page
      for line in textline:
        if 'LTTextBox' in textline[line].keys():
          bbox_line = textline[line]['LTTextBox']['bbox']
          text_list = textline[line]['LTTextBox']['text']
          merged_objt_id = -1
          rate = 0
          # Lines that hold only a blank are not matched
          if text_list ==' \n':
            continue
          # Keep the object with the highest coverage of this line
          for obj in page_entry['objects']:
            objt = page_entry['objects'][obj]
            if 'textline' not in objt.keys():
              print(doc)
            bbox_objt = objt['bbox']
            new_rate = get_intersection_rate(bbox_objt,bbox_line,height)
            if new_rate > rate:
              merged_objt_id = obj
              rate = new_rate
          if merged_objt_id != -1:
            tbbox = bbox_line
            textline_bbox = [tbbox[0],tbbox[3],tbbox[2],tbbox[1]]
            # Store a copy: writing the swapped bbox into the source record
            # would make a second run of this cell see already-swapped
            # coordinates and match nothing.
            matched_line = dict(textline[line]['LTTextBox'])
            matched_line['bbox'] = textline_bbox
            target = page_entry['objects'][merged_objt_id]['textline']
            # Only record the box once per line so re-running the cell does
            # not pile up duplicate boxes.
            if line not in target['lines']:
              target['bbox'].append(textline_bbox)
            target['lines'][line] = matched_line
    else:
      if doc not in issue_doc_list:
        issue_doc_list.append(doc)

In [None]:
# Names of the documents currently held in new_file_json
new_file_json.keys()

dict_keys(['a075987_e15_851_2007a_9796980'])

In [None]:
# Inspect the pages of the first document that is actually loaded. A hard-coded
# report name raises KeyError whenever that report is not part of the current run.
new_file_json[next(iter(new_file_json))]['page']


{0: {'objects': {21: {'bbox': [197.679650400583,
     1423.50391114348,
     1319.06936635106,
     481.775965040058],
    'segmentation': [[197.679650400583,
      1423.50391114348,
      1516.749016751643,
      1423.50391114348,
      1516.749016751643,
      1905.2798761835381,
      197.679650400583,
      1905.2798761835381]],
    'category_id': 2,
    'category': 'other',
    'page': 0,
    'relations': {'child': [],
     'parent': [],
     'above': [],
     'following': [],
     'context': []},
    'textline': {'lines': {}, 'bbox': [], 'ordered_list': []},
    'text': ''},
   31: {'bbox': [304.00262199563,
     170.88640203933,
     953.584151493081,
     1033.32638018937],
    'segmentation': [[304.00262199563,
      170.88640203933,
      1257.586773488711,
      170.88640203933,
      1257.586773488711,
      1204.2127822287,
      304.00262199563,
      1204.2127822287]],
    'category_id': 1,
    'category': 'report_title',
    'page': 0,
    'relations': {'child': [],
   

In [None]:
# NOTE(review): this cell re-defines the four functions of the
# "Reading order functions" section; the definitions here are the ones in
# effect for the text-line ordering below.
def detect_left(bbox,box_list):
  """Tell whether another box lies completely to the left of ``bbox`` on the same rows.

  Boxes are ``[x, y, width, height]`` lists. A box counts when it starts left
  of ``bbox``, its right edge (``x + width``) ends before ``bbox`` starts, and
  its vertical extent is not entirely above or below that of ``bbox``.

  Returns:
      True if such a box exists in ``box_list``, otherwise False.
  """
  upper_y = bbox[1]
  lower_y = bbox[1]+bbox[3]
  for box in box_list:
    if box == bbox:
      continue  # a box is never its own neighbour
    if box[0]<bbox[0] and abs(bbox[0]-box[0])-box[2] > 0:
      if not (box[1]+box[3]<upper_y or box[1]>lower_y):
        return True
  return False

def detect_right(bbox,box_list):
  """Tell whether another box lies completely to the right of ``bbox`` on the same rows.

  Mirror image of ``detect_left``: the other box must start beyond the right
  edge of ``bbox`` (``bbox`` x plus ``bbox`` width) and overlap it vertically.

  Returns:
      True if such a box exists in ``box_list``, otherwise False.
  """
  upper_y = bbox[1]
  lower_y = bbox[1]+bbox[3]
  for box in box_list:
    if box == bbox:
      continue
    if box[0]>bbox[0] and abs(bbox[0]-box[0])-bbox[2] > 0:
      if not (box[1]+box[3]<upper_y or box[1]>lower_y):
        return True
  return False

def y_order(box_list):
  """Sort ``box_list`` in place by the top y coordinate and return the same list."""
  box_list.sort(key=lambda x:x[1])
  return box_list

def reading_order(box_list):
  """Order ``[x, y, width, height]`` boxes top-to-bottom, left column before right column.

  The boxes are visited from top to bottom. A box with a neighbour on its
  right belongs to a left-hand column, a box with a neighbour on its left to a
  right-hand column; both kinds are collected until a box without horizontal
  neighbours is met. At that point the collected columns are emitted (left
  column first, each ordered recursively) and only then the box itself, since
  the collected boxes start above it.

  Note: ``box_list`` is sorted in place by ``y_order``.

  Returns:
      A new list with the boxes in reading order (empty for empty input).
  """
  if len(box_list) == 0:
    return []
  ordered_list = y_order(box_list)
  result_list = []
  left_box = []   # boxes that have a neighbour on their left  -> right-hand column
  right_box = []  # boxes that have a neighbour on their right -> left-hand column
  for box in ordered_list:
    if detect_left(box,box_list):
      left_box.append(box)
    elif detect_right(box,box_list):
      right_box.append(box)
    else:
      # Flush the pending columns before the box that follows them,
      # in the same column order as the final flush below.
      result_list.extend(reading_order(right_box))
      result_list.extend(reading_order(left_box))
      result_list.append(box)
      left_box = []
      right_box = []
  result_list.extend(reading_order(right_box))
  result_list.extend(reading_order(left_box))
  return result_list

In [None]:
def reading_order_id_textline(obj_dict,ro_list):
  """Translate ordered ``[x, y, width, height]`` boxes back into text-line ids.

  Args:
      obj_dict: Mapping of line id to a dict whose ``'bbox'`` is passed
          through ``box_convertor`` before being compared with the boxes.
      ro_list: Converted line boxes in reading order.

  Returns:
      ``(ordered_id, ordered_label)``: the matching line ids in the order of
      ``ro_list`` and an (always empty) label list. Each line is used at most
      once, so lines that share an identical bbox are not emitted repeatedly.
  """
  ordered_id = []
  ordered_label = []
  assigned = set()  # line ids already matched to an earlier box
  for box in ro_list:
    for obj in obj_dict:
      if obj not in assigned and box_convertor(obj_dict[obj]['bbox']) == box:
        assigned.add(obj)
        ordered_id.append(obj)
        break
  return ordered_id, ordered_label

In [None]:
def box_convertor(box):
  """Turn a corner box ``[x0, y0, x1, y1]`` into ``[x0, y0, x1-x0, y1-y0]``."""
  x0, y0 = box[0], box[1]
  return [x0, y0, box[2]-x0, box[3]-y0]

In [None]:
# Put the text lines of every object (tables and figures excepted) into reading order.
for doc in tqdm(new_file_json):
  for page in new_file_json[doc]['page']:
    page_objects = new_file_json[doc]['page'][page]['objects']
    for obj in page_objects:
      objt = page_objects[obj]
      if objt['category'] in ['table','figure']:
        continue
      line_box_list = objt['textline']['bbox']
      # Corner boxes -> [x, y, width, height], the layout reading_order works on
      new_line_box_list = [box_convertor(line_box) for line_box in line_box_list]
      ro_list = reading_order(new_line_box_list)
      ordered_id,_ = reading_order_id_textline(objt['textline']['lines'],ro_list)
      objt['textline']['ordered_list'] = ordered_id

100%|██████████| 1/1 [00:00<00:00, 2327.58it/s]


In [None]:
# Ordered line ids of the last object visited by the loop above
objt['textline']['ordered_list']

[]

In [None]:
# Full text-line container of that same object
objt['textline']

{'lines': {}, 'bbox': [], 'ordered_list': []}

2. Save the extracted text content of each annotated json file as a new json file:

In [None]:
import os

# Folder that receives one JSON file per document; create it if it is missing
# so the first write cannot fail on a non-existent directory.
output_dir = "/content/drive/MyDrive/data/annotated_extraction_text"
os.makedirs(output_dir, exist_ok=True)

for doc in tqdm(new_file_json):
  objt_data = []
  for page in new_file_json[doc]['page']:
    for obj in new_file_json[doc]['page'][page]['objects']:

      objt = new_file_json[doc]['page'][page]['objects'][obj]
      # Tables and figures carry no extracted text
      if objt['category'] in ['table','figure']:
        continue
      # Concatenate the matched lines in reading order, separated by one space
      lines = objt['textline']['lines']
      objt['text'] = ' '.join(
        lines[line_id]['text'] for line_id in objt['textline']['ordered_list'])
      objt_data.append(objt)
  # Report whether anything was collected; the file is written either way
  # (an empty list when no object qualified).
  if objt_data:
    print(f"{doc}: Data found and will be saved.")
  else:
    print(f"{doc}: No data found, saving an empty file.")
  with open(f"{output_dir}/{doc}.json", "w") as file:
    json.dump(objt_data, file)

1


100%|██████████| 1/1 [00:00<00:00, 98.88it/s]

a079127_c137_2008_bow river2008a_11669409: Data found and will be saved.





In [None]:
# Last object processed by the export loop above, now including its 'text' field
objt

{'bbox': [175.426687254189,
  397.414322796796,
  1315.74677348871,
  287.072023306628],
 'segmentation': [[175.426687254189,
   397.414322796796,
   1491.173460742899,
   397.414322796796,
   1491.173460742899,
   684.486346103424,
   175.426687254189,
   684.486346103424]],
 'category_id': 22,
 'category': 'list',
 'page': 8,
 'relations': {'child': [],
  'parent': [],
  'above': [],
  'following': [],
  'context': []},
 'textline': {'lines': {}, 'bbox': [], 'ordered_list': []},
 'text': ''}