In [1]:
import os
import json
import shutil
import zipfile

# Working directories that the two dataset archives are unpacked into.
SOURCE2_DIR = '/content/SOURCE2/'
SOURCE3_DIR = '/content/SOURCE3/'

# Create both extraction targets up front (no error if they already exist).
for extract_root in (SOURCE2_DIR, SOURCE3_DIR):
    os.makedirs(extract_root, exist_ok=True)

# Unpack each archive into its matching working directory.
for archive_path, extract_root in (
    ('/content/drive/MyDrive/Datasets/SOURCE2/detection.zip', SOURCE2_DIR),
    ('/content/drive/MyDrive/Datasets/SOURCE3/detection.zip', SOURCE3_DIR),
):
    with zipfile.ZipFile(archive_path, 'r') as archive:
        archive.extractall(extract_root)

In [2]:
import numpy as np

def _list_split_files(dataset_dir, split):
    """Return the joined path of every directory entry in ``dataset_dir/split``."""
    split_dir = os.path.join(dataset_dir, split)
    return [os.path.join(split_dir, entry) for entry in os.listdir(split_dir)]


# SOURCE3: page files per split, then the per-split annotation JSON paths.
source3_train_pages = _list_split_files(SOURCE3_DIR, 'train')
source3_test_pages = _list_split_files(SOURCE3_DIR, 'test')
source3_validation_pages = _list_split_files(SOURCE3_DIR, 'validation')

source3_test_annotations = os.path.join(SOURCE3_DIR, 'test_annotations.json')
source3_train_annotations = os.path.join(SOURCE3_DIR, 'train_annotations.json')
source3_validation_annotations = os.path.join(SOURCE3_DIR, 'validation_annotations.json')

# SOURCE2: same layout as above.
source2_train_pages = _list_split_files(SOURCE2_DIR, 'train')
source2_test_pages = _list_split_files(SOURCE2_DIR, 'test')
source2_validation_pages = _list_split_files(SOURCE2_DIR, 'validation')

source2_test_annotations = os.path.join(SOURCE2_DIR, 'test_annotations.json')
source2_train_annotations = os.path.join(SOURCE2_DIR, 'train_annotations.json')
source2_validation_annotations = os.path.join(SOURCE2_DIR, 'validation_annotations.json')

# Combined page lists per split: SOURCE3 pages first, SOURCE2 pages after.
train_pages = [*source3_train_pages, *source2_train_pages]
validation_pages = [*source3_validation_pages, *source2_validation_pages]
test_pages = [*source3_test_pages, *source2_test_pages]

In [3]:
def get_source2_annotation(img_id, json_path):
    """Read the word boxes of one image from a SOURCE2 annotation file.

    Args:
        img_id: Top-level key of the image inside the JSON document.
        json_path: Path of the JSON annotation file to read.

    Returns:
        A ``(bboxes, labels)`` tuple: the ``'bbox'`` value of every entry
        under ``"words"`` for that image, and a list of zeros of the same
        length (one class id per box).
    """
    with open(json_path, 'r') as handle:
        per_image = json.load(handle)

    word_entries = per_image[img_id]["words"]

    boxes = []
    for entry in word_entries:
        boxes.append(entry['bbox'])

    # Every box is assigned the same class id, 0.
    class_ids = [0] * len(word_entries)

    return boxes, class_ids

def get_source3_annotation(img_id, json_path):
    """Read all boxes of one image from a SOURCE3 annotation file.

    Args:
        img_id: Top-level key of the image inside the JSON document.
        json_path: Path of the JSON annotation file to read.

    Returns:
        A ``(bboxes, labels)`` tuple: the ``'bboxes'`` lists of every entry
        under ``"regions_contents"`` concatenated in order, and a list of
        zeros of the same length (one class id per box).
    """
    with open(json_path, 'r') as handle:
        per_image = json.load(handle)

    regions = per_image[img_id]["regions_contents"]

    # Flatten the per-region box lists into one list, keeping region order.
    boxes = [box for region in regions for box in region['bboxes']]

    # Every box is assigned the same class id, 0.
    class_ids = [0] * len(boxes)

    return boxes, class_ids

In [5]:
def parse_annotations(pages, source2_annotations, source3_annotations, annotations):
    """Fill ``annotations`` in place with the boxes and labels of every page.

    The source of a page is decided by whether ``SOURCE2_DIR`` or
    ``SOURCE3_DIR`` occurs in its path (SOURCE2 is checked first). Pages
    matching neither directory are skipped.

    Args:
        pages: Iterable of page file paths.
        source2_annotations: Path of the SOURCE2 annotation JSON for this split.
        source3_annotations: Path of the SOURCE3 annotation JSON for this split.
        annotations: Dict updated in place; maps page id (file name without
            its extension) to ``{"bboxes": ..., "labels": ...}``.

    Raises:
        ValueError: If a page id is already present in ``annotations``.
    """
    for page_path in pages:
        # splitext strips the real extension whatever its length, instead of
        # assuming it is exactly four characters (".jpg"/".png").
        page_id = os.path.splitext(os.path.basename(page_path))[0]

        if SOURCE2_DIR in page_path:
            bboxes, labels = get_source2_annotation(page_id, source2_annotations)
        elif SOURCE3_DIR in page_path:
            bboxes, labels = get_source3_annotation(page_id, source3_annotations)
        else:
            # Not from a known source: nothing to record for this page.
            continue

        # Direct dict membership test; no per-page copy of all keys.
        if page_id in annotations:
            raise ValueError(f"Page with id: {page_id} is already in annotations")

        annotations[page_id] = {"bboxes": bboxes, "labels": labels}

In [6]:
# One independent result dict per split; parse_annotations fills them in place.
train_annotations, validation_annotations, test_annotations = {}, {}, {}

for split_pages, source2_json, source3_json, split_annotations in (
    (train_pages, source2_train_annotations,
     source3_train_annotations, train_annotations),
    (validation_pages, source2_validation_annotations,
     source3_validation_annotations, validation_annotations),
    (test_pages, source2_test_annotations,
     source3_test_annotations, test_annotations),
):
    parse_annotations(split_pages, source2_json, source3_json, split_annotations)

In [7]:
from PIL import Image

def construct_data_for_yolo(pages, out_dir, annotations):
    """Copy each page into ``out_dir`` and write its label ``.txt`` file.

    For every page the image is copied under its own file name, and a text
    file named ``<page id>.txt`` is written next to it with one line per box:
    ``label x_center y_center width height``, where the four geometry values
    are divided by the image width/height.

    Args:
        pages: Iterable of page image paths.
        out_dir: Existing directory receiving the copies and label files.
        annotations: Dict mapping page id (file name without its extension)
            to ``{"bboxes": [...], "labels": [...]}``. Each box is treated
            as ``[x_min, y_min, x_max, y_max]``.

    Raises:
        KeyError: If a page id has no entry in ``annotations``.
    """
    for page_path in pages:
        page_name = os.path.basename(page_path)
        # splitext strips the real extension whatever its length, instead of
        # assuming it is exactly four characters (".jpg"/".png").
        page_id = os.path.splitext(page_name)[0]

        # Only the size is needed; the context manager closes the underlying
        # file right away instead of leaving one open handle per page.
        with Image.open(page_path) as page_image:
            img_width, img_height = page_image.size
        shutil.copy(page_path, os.path.join(out_dir, page_name))

        page_annotation = annotations[page_id]
        page_labels = page_annotation['labels']
        label_path = os.path.join(out_dir, page_id + '.txt')
        with open(label_path, 'w') as label_file:
            for idx, bbox in enumerate(page_annotation['bboxes']):
                x_min, y_min, x_max, y_max = bbox[0], bbox[1], bbox[2], bbox[3]
                width = x_max - x_min
                height = y_max - y_min
                # Corner box -> centre/size, each scaled by the image size.
                x_center = (x_min + width / 2) / img_width
                y_center = (y_min + height / 2) / img_height
                row = [page_labels[idx], x_center, y_center,
                       width / img_width, height / img_height]
                label_file.write(" ".join(str(value) for value in row) + "\n")


In [8]:
# Export every split into its own folder: train, then validation, then test.
for out_dir, split_pages, split_annotations in (
    ('/content/detection/train', train_pages, train_annotations),
    ('/content/detection/validation', validation_pages, validation_annotations),
    ('/content/detection/test', test_pages, test_annotations),
):
    os.makedirs(out_dir, exist_ok=True)
    construct_data_for_yolo(split_pages, out_dir, split_annotations)

In [9]:
# Zip the contents of /content/detection; the call returns the archive path.
shutil.make_archive(
    base_name='/content/detection',
    format='zip',
    root_dir='/content/detection',
)

'/content/detection.zip'