In [None]:
import functools
import json
import os
import shutil
import zipfile

# Local directories that receive the unpacked SOURCE2 / SOURCE3 archives.
SOURCE2_DIR = '/content/SOURCE2/'
SOURCE3_DIR = '/content/SOURCE3/'

# Create both extraction targets up front; existing directories are reused.
for target_dir in (SOURCE2_DIR, SOURCE3_DIR):
    os.makedirs(target_dir, exist_ok=True)

# Unpack each detection archive into its local directory.
for archive_path, target_dir in (
    ('/content/drive/MyDrive/Datasets/SOURCE2/detection.zip', SOURCE2_DIR),
    ('/content/drive/MyDrive/Datasets/SOURCE3/detection.zip', SOURCE3_DIR),
):
    with zipfile.ZipFile(archive_path, 'r') as archive:
        archive.extractall(target_dir)

In [None]:
import numpy as np

def _list_split_pages(dataset_dir, split):
    """Return the joined paths of every entry found in ``dataset_dir/split``.

    The order is whatever ``os.listdir`` reports for that directory.
    """
    return [os.path.join(dataset_dir, split, entry)
            for entry in os.listdir(os.path.join(dataset_dir, split))]


# SOURCE3: page paths per split, then the per-split annotation files.
source3_train_pages = _list_split_pages(SOURCE3_DIR, 'train')
source3_test_pages = _list_split_pages(SOURCE3_DIR, 'test')
source3_validation_pages = _list_split_pages(SOURCE3_DIR, 'validation')

source3_test_annotations = os.path.join(SOURCE3_DIR, 'test_annotations.json')
source3_train_annotations = os.path.join(SOURCE3_DIR, 'train_annotations.json')
source3_validation_annotations = os.path.join(SOURCE3_DIR, 'validation_annotations.json')

# SOURCE2: same layout as above.
source2_train_pages = _list_split_pages(SOURCE2_DIR, 'train')
source2_test_pages = _list_split_pages(SOURCE2_DIR, 'test')
source2_validation_pages = _list_split_pages(SOURCE2_DIR, 'validation')

source2_test_annotations = os.path.join(SOURCE2_DIR, 'test_annotations.json')
source2_train_annotations = os.path.join(SOURCE2_DIR, 'train_annotations.json')
source2_validation_annotations = os.path.join(SOURCE2_DIR, 'validation_annotations.json')

# Combined splits: SOURCE3 pages first, followed by SOURCE2 pages.
train_pages = [*source3_train_pages, *source2_train_pages]
validation_pages = [*source3_validation_pages, *source2_validation_pages]
test_pages = [*source3_test_pages, *source2_test_pages]

In [None]:
@functools.lru_cache(maxsize=None)
def _load_annotation_file(json_path):
    """Parse an annotation JSON file once and memoize the result per path.

    The per-page getters below are called once for every page while always
    receiving the same split-level file, so caching avoids re-reading and
    re-parsing the whole file on each call. The returned object is shared
    between calls and must be treated as read-only. Use
    ``_load_annotation_file.cache_clear()`` if a file changes on disk.
    """
    # Read as UTF-8 explicitly instead of relying on the locale default.
    with open(json_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def get_source2_annotation(img_id, json_path):
    """Return the word and line annotations of one SOURCE2 page.

    Args:
        img_id: Key of the page inside the annotation JSON.
        json_path: Path of the annotation JSON file.

    Returns:
        A tuple ``(bboxes, labels, lines_bboxes, lines_labels, transcriptions)``:
        the ``bbox`` and ``transcription`` of every entry in ``"words"``, the
        page's ``"lines_bboxes"``, and one label ``0`` per word / per line.

    Raises:
        KeyError: If ``img_id`` or one of the expected fields is missing.
    """
    page = _load_annotation_file(json_path)[img_id]

    words = page["words"]
    # Copy the outer list so callers never receive the cached list itself.
    lines_bboxes = list(page["lines_bboxes"])

    bboxes = [w['bbox'] for w in words]
    transcriptions = [w['transcription'] for w in words]
    labels = [0] * len(words)
    lines_labels = [0] * len(lines_bboxes)
    return bboxes, labels, lines_bboxes, lines_labels, transcriptions


def get_source3_annotation(img_id, json_path):
    """Return the word and region annotations of one SOURCE3 page.

    Args:
        img_id: Key of the page inside the annotation JSON.
        json_path: Path of the annotation JSON file.

    Returns:
        A tuple ``(bboxes, labels, regions_bboxes, regions_labels, transcriptions)``:
        the ``bboxes`` and ``transcriptions`` of all ``"regions_contents"``
        entries concatenated in order, the ``region_bbox`` of every entry in
        ``"regions"``, and one label ``0`` per word box / per region.

    Raises:
        KeyError: If ``img_id`` or one of the expected fields is missing.
    """
    page = _load_annotation_file(json_path)[img_id]

    regions_bboxes = [region["region_bbox"] for region in page["regions"]]

    # Flatten the per-region word lists into page-level lists.
    bboxes = []
    transcriptions = []
    for rc in page["regions_contents"]:
        bboxes.extend(rc['bboxes'])
        transcriptions.extend(rc['transcriptions'])

    labels = [0] * len(bboxes)
    regions_labels = [0] * len(regions_bboxes)
    return bboxes, labels, regions_bboxes, regions_labels, transcriptions

In [None]:
# Transcriptions that consist solely of one of these symbols (after stripping
# whitespace) are dropped from the word boxes by parse_annotations.
symbol_list =[
    '.', ',', ';', ':', '!', '?', '-', '_', '(', ')', '[', ']', '{', '}', '<', '>',
    '@', '#', '$', '%', '^', '&', '*', '+', '=', '~', '`', '"', "'", '\\', '|', '/',
    '…', '“', '”', '‘', '’', '«', '»'
]

def parse_annotations(pages, source2_annotations, source3_annotations, annotations):
    """Fill ``annotations`` with the filtered annotations of every page.

    For each page the matching getter is chosen by checking whether
    ``SOURCE2_DIR`` or ``SOURCE3_DIR`` occurs in the page path; pages matching
    neither are skipped. Word boxes (and their labels) whose transcription is
    a single symbol from ``symbol_list`` are removed; line/region boxes are
    stored unchanged.

    Args:
        pages: Iterable of page image paths.
        source2_annotations: Annotation JSON path handed to get_source2_annotation.
        source3_annotations: Annotation JSON path handed to get_source3_annotation.
        annotations: Dict updated in place, keyed by page id, with the entries
            "bboxes", "labels", "lines_bboxes" and "lines_labels".

    Raises:
        ValueError: If a page id is already present in ``annotations``.
    """
    symbols = set(symbol_list)  # O(1) membership instead of scanning the list

    for page_path in pages:
        # NOTE(review): the id is the file name minus its last four characters,
        # i.e. this assumes a three-character extension such as ".png" - confirm.
        page_id = os.path.basename(page_path[:-4])

        # SOURCE2 is checked first, exactly like the original if/elif chain.
        if SOURCE2_DIR in page_path:
            loaded = get_source2_annotation(page_id, source2_annotations)
        elif SOURCE3_DIR in page_path:
            loaded = get_source3_annotation(page_id, source3_annotations)
        else:
            continue
        bboxes, labels, lines_bboxes, lines_labels, transcriptions = loaded

        # Indices of words that are nothing but a punctuation/symbol token.
        symbol_indices = {idx for idx, text in enumerate(transcriptions)
                          if text.strip() in symbols}
        bboxes = [bbox for idx, bbox in enumerate(bboxes) if idx not in symbol_indices]
        labels = [label for idx, label in enumerate(labels) if idx not in symbol_indices]

        if page_id in annotations:
            raise ValueError(f"Page with id: {page_id} is already in annotations")
        annotations[page_id] = {"bboxes": bboxes,
                                "labels": labels,
                                "lines_bboxes": lines_bboxes,
                                "lines_labels": lines_labels}

In [None]:
from PIL import Image, ImageDraw
def draw_bboxes(image_path, annotations, output_path, line_color="green", word_color="blue"):
    """Draw line and word bounding boxes on an image and save the result.

    Args:
        image_path: Path of the image to open.
        annotations: Mapping with "lines_bboxes" and "bboxes" entries; every
            box is passed unchanged to ``ImageDraw.rectangle``.
        output_path: Where the annotated image is written.
        line_color: Outline colour for the "lines_bboxes" boxes.
        word_color: Outline colour for the "bboxes" boxes.
    """
    # The context manager closes the image file even if drawing or saving fails.
    with Image.open(image_path) as image:
        draw = ImageDraw.Draw(image)

        # Line boxes are drawn first; word boxes are drawn afterwards, on top.
        for line_bbox in annotations["lines_bboxes"]:
            draw.rectangle(line_bbox, outline=line_color, width=2)

        for bbox in annotations["bboxes"]:
            draw.rectangle(bbox, outline=word_color, width=2)

        image.save(output_path)
    print(f"Annotated image saved at {output_path}")

In [None]:
# One annotation dictionary per split; parse_annotations fills each in place.
train_annotations, validation_annotations, test_annotations = {}, {}, {}

for split_pages, source2_json, source3_json, split_annotations in (
    (train_pages, source2_train_annotations,
     source3_train_annotations, train_annotations),
    (validation_pages, source2_validation_annotations,
     source3_validation_annotations, validation_annotations),
    (test_pages, source2_test_annotations,
     source3_test_annotations, test_annotations),
):
    parse_annotations(split_pages, source2_json, source3_json, split_annotations)

In [None]:
# Draw one example page per source. os.listdir order is arbitrary, so the
# first page is not a fixed file: look the annotations up by the id derived
# from the chosen path (same rule as parse_annotations) instead of a
# hard-coded id, so the boxes always belong to the image being drawn.
source3_example_page = source3_train_pages[0]
source2_example_page = source2_train_pages[0]

draw_bboxes(source3_example_page,
            train_annotations[os.path.basename(source3_example_page[:-4])],
            '/content/source3_example.jpg')
draw_bboxes(source2_example_page,
            train_annotations[os.path.basename(source2_example_page[:-4])],
            '/content/source2_example.jpg')

In [None]:
from PIL import Image

def construct_data_for_yolo(pages, out_dir,
                            annotations, bbox_naming,
                            label_naming):
    """Copy page images into ``out_dir`` and write one label file per page.

    Each page gets ``<page_id>.txt`` next to its copied image. Every box
    becomes a line ``label x_center y_center width height`` where the four
    geometry values are divided by the image width / height. Boxes are read
    as ``[x_min, y_min, x_max, y_max]`` (indices 0-3).

    Args:
        pages: Iterable of page image paths.
        out_dir: Existing directory receiving the images and label files.
        annotations: Mapping from page id to that page's annotation dict.
        bbox_naming: Key of the box list inside a page's annotation dict.
        label_naming: Key of the label list inside a page's annotation dict.

    Raises:
        KeyError: If a page id or one of the two keys is missing.
    """
    for page_path in pages:
        # NOTE(review): the id is the file name minus its last four characters,
        # i.e. this assumes a three-character extension such as ".png" - confirm.
        page_id = os.path.basename(page_path[:-4])

        # Only the size is needed, so close the image file right after reading it.
        with Image.open(page_path) as page_image:
            img_width, img_height = page_image.size
        shutil.copy(page_path, os.path.join(out_dir, os.path.basename(page_path)))

        an_dct = annotations[page_id]
        txt_name = os.path.join(out_dir, page_id + '.txt')
        with open(txt_name, 'w') as an_file:
            for idx, box in enumerate(an_dct[bbox_naming]):
                x_min, y_min, x_max, y_max = box[0], box[1], box[2], box[3]

                height = y_max - y_min
                width = x_max - x_min

                # Box centre, normalised by the image dimensions.
                x_center = (x_min + width / 2) / img_width
                y_center = (y_min + height / 2) / img_height

                new_an = [an_dct[label_naming][idx], x_center, y_center,
                          width / img_width, height / img_height]

                an_file.write(" ".join([str(e) for e in new_an]) + "\n")


In [None]:
# Export the word-level dataset: one output directory per split.
for out_dir, split_pages, split_annotations in (
    ('/content/words_detection/train', train_pages, train_annotations),
    ('/content/words_detection/validation', validation_pages, validation_annotations),
    ('/content/words_detection/test', test_pages, test_annotations),
):
    os.makedirs(out_dir, exist_ok=True)
    construct_data_for_yolo(split_pages, out_dir,
                            split_annotations,
                            bbox_naming="bboxes",
                            label_naming="labels")


In [None]:
# Export the line-level dataset: one output directory per split.
for out_dir, split_pages, split_annotations in (
    ('/content/lines_detection/train', train_pages, train_annotations),
    ('/content/lines_detection/validation', validation_pages, validation_annotations),
    ('/content/lines_detection/test', test_pages, test_annotations),
):
    os.makedirs(out_dir, exist_ok=True)
    construct_data_for_yolo(split_pages, out_dir,
                            split_annotations,
                            bbox_naming="lines_bboxes",
                            label_naming="lines_labels")

In [None]:
# Zip the words_detection directory into /content/words_detection.zip.
shutil.make_archive(base_name='/content/words_detection',
                    format='zip',
                    root_dir='/content/words_detection')

In [None]:
# Zip the lines_detection directory into /content/lines_detection.zip.
shutil.make_archive(base_name='/content/lines_detection',
                    format='zip',
                    root_dir='/content/lines_detection')