In [None]:
import os
import zipfile
import shutil

# Unpack the archive of page images from Drive into the Colab working directory.
pages_zip_path = '/content/drive/MyDrive/Datasets/SOURCE2/detection/pages.zip'
with zipfile.ZipFile(pages_zip_path, mode='r') as pages_archive:
    pages_archive.extractall(path='/content/')

In [None]:
# Unpack the archive of XML annotations from Drive into the Colab working directory.
xml_zip_path = '/content/drive/MyDrive/Datasets/SOURCE2/detection/xml.zip'
with zipfile.ZipFile(xml_zip_path, mode='r') as xml_archive:
    xml_archive.extractall(path='/content/')

In [None]:
def _read_split_ids(split_path):
    """Read one split list file and return the page ids it contains.

    Every line of the file is treated as one id. Surrounding whitespace
    (including the trailing newline) is stripped, and blank lines are skipped
    so that they do not turn into empty-string ids.

    Args:
        split_path: Path of the split list file to read.

    Returns:
        A list of id strings in file order.
    """
    with open(split_path, 'r') as split_f:
        stripped_lines = (line.strip() for line in split_f)
        # The list is built while the file is still open.
        return [page_id for page_id in stripped_lines if page_id]


# One list of page ids per dataset split.
test_pages_ids = _read_split_ids('/content/drive/MyDrive/Datasets/SOURCE2/splits/test.uttlist')
train_pages_ids = _read_split_ids('/content/drive/MyDrive/Datasets/SOURCE2/splits/train.uttlist')
validation_pages_ids = _read_split_ids('/content/drive/MyDrive/Datasets/SOURCE2/splits/validation.uttlist')

print(test_pages_ids)
print(train_pages_ids)
print(validation_pages_ids)

In [None]:
import os

def list_nonhidden_dirs(path):
    """Return the names of the non-hidden sub-directories of ``path``, sorted.

    Entries whose name starts with '.' are ignored. The result is sorted
    because ``os.listdir`` returns entries in arbitrary order, which would
    otherwise make everything derived from this listing vary between runs.

    Args:
        path: Directory to list.

    Returns:
        Sorted list of directory names (not full paths).
    """
    return sorted(
        entry for entry in os.listdir(path)
        if not entry.startswith('.') and os.path.isdir(os.path.join(path, entry))
    )

def list_png_files(path):
    """Return the names of the entries in ``path`` that end with '.png', sorted.

    The match is case-sensitive and purely name-based. The result is sorted
    because ``os.listdir`` returns entries in arbitrary order.

    Args:
        path: Directory to list.

    Returns:
        Sorted list of file names (not full paths).
    """
    return sorted(entry for entry in os.listdir(path) if entry.endswith('.png'))

def find_and_append_files(ids, files_paths, dir, filename, data_dir):
    """Append the image path for ``filename`` when it is listed in ``ids``.

    Args:
        ids: Ids belonging to one split.
        files_paths: List that receives the matching image path (mutated).
        dir: Name of the sub-directory of ``data_dir`` holding the image.
        filename: Image name without the '.png' extension.
        data_dir: Root directory of the page images.

    Returns:
        True when exactly one id matched and a path was appended, else False.

    Raises:
        ValueError: If ``filename`` occurs more than once in ``ids``.
    """
    hits = list(filter(lambda candidate: candidate == filename, ids))
    if len(hits) > 1:
        print(hits)
        raise ValueError(f'More than one match occurred for {filename}.')
    if not hits:
        return False
    files_paths.append(os.path.join(data_dir, dir, filename + '.png'))
    return True

def process_files(data_dir, test_ids, train_ids, validation_ids):
    """Assign every PNG found under ``data_dir`` to a dataset split.

    Each non-hidden sub-directory of ``data_dir`` is scanned for '.png' files.
    A file goes to every split whose id list contains its name without the
    last four characters; a file matched by no split is added to the train
    list and reported on stdout.

    Args:
        data_dir: Root directory containing one sub-directory per page group.
        test_ids: Ids of the test pages.
        train_ids: Ids of the train pages.
        validation_ids: Ids of the validation pages.

    Returns:
        Tuple ``(test_files, train_files, validation_files)`` of path lists.
    """
    test_files, train_files, validation_files = [], [], []
    split_targets = (
        (test_ids, test_files),
        (train_ids, train_files),
        (validation_ids, validation_files),
    )

    for subdir in list_nonhidden_dirs(data_dir):
        for filename in list_png_files(os.path.join(data_dir, subdir)):
            stem = filename[:-4]
            # Built as a list (not a generator) so every split is probed,
            # even after an earlier one has already matched.
            matched = [
                find_and_append_files(ids, files, subdir, stem, data_dir)
                for ids, files in split_targets
            ]
            if any(matched):
                continue
            train_files.append(os.path.join(data_dir, subdir, filename))
            print(f"No match found for {filename}, added to train set.")

    return test_files, train_files, validation_files

# Directory that holds the unpacked page images, one sub-directory per group.
pages_data_dir = '/content/data'

# Distribute the page images over the three splits.
split_file_lists = process_files(
    pages_data_dir, test_pages_ids, train_pages_ids, validation_pages_ids
)
test_files_paths, train_files_paths, validation_files_paths = split_file_lists


In [None]:
# For each split, show the number of listed ids next to the number of image paths collected.
for split_ids, split_paths in (
    (test_pages_ids, test_files_paths),
    (train_pages_ids, train_files_paths),
    (validation_pages_ids, validation_files_paths),
):
    print(len(split_ids), len(split_paths))

In [None]:
# Compare the number of entries in the XML directory with the total number of split ids.
xml_entry_count = len(os.listdir('/content/xml'))
listed_id_count = sum(
    len(split_ids)
    for split_ids in (test_pages_ids, train_pages_ids, validation_pages_ids)
)
print(xml_entry_count, listed_id_count)

In [None]:
# Output directories for the cropped images, one per split.
out_train_dir = '/content/SOURCE2/train'
out_test_dir = '/content/SOURCE2/test'
out_validation_dir = '/content/SOURCE2/validation'

# Create them in the same order as before; existing directories are left alone.
for split_out_dir in (out_train_dir, out_test_dir, out_validation_dir):
    os.makedirs(split_out_dir, exist_ok=True)


In [None]:

import xml.etree.ElementTree as ET

def _empty_extent():
    """Return a bounding-box accumulator that any real box will replace."""
    return {"min_x": float("inf"), "min_y": float("inf"),
            "max_x": float("-inf"), "max_y": float("-inf")}


def _grow_extent(extent, x, y, width, height):
    """Expand ``extent`` in place so that it covers the given component box."""
    extent['min_x'] = min(extent['min_x'], x)
    extent['min_y'] = min(extent['min_y'], y)
    extent['max_x'] = max(extent['max_x'], x + width)
    extent['max_y'] = max(extent['max_y'], y + height)


def _extent_as_list(extent):
    """Convert an accumulator into a ``[min_x, min_y, max_x, max_y]`` list."""
    return [extent['min_x'], extent['min_y'], extent['max_x'], extent['max_y']]


def parse_iam_annotation(xml_file):
    """Parse one form annotation XML file into word, line and page boxes.

    The root element must carry ``id`` and ``writer-id`` attributes. Words are
    read from ``handwritten-part/line/word`` elements; a word's box is the
    union of its ``cmp`` children (attributes ``x``, ``y``, ``width``,
    ``height``). Words without any ``cmp`` child are skipped with a message.

    Lines are collected across ALL ``handwritten-part`` elements, and a form
    without any such element yields an empty ``lines_bboxes`` list.

    Args:
        xml_file: Path of the XML annotation file.

    Returns:
        Dict with keys:
          * ``form_info``: ``{"id": ..., "writer_id": ...}``
          * ``words``: list of ``{"word_id", "transcription", "bbox"}`` dicts,
            ``bbox`` being ``[min_x, min_y, max_x, max_y]``
          * ``lines_bboxes``: one ``[min_x, min_y, max_x, max_y]`` per line; a
            line with no located word keeps the placeholder
            ``[inf, inf, -inf, -inf]``
          * ``handwriting_bbox``: union of all component boxes (same
            placeholder when the form has none)
    """
    root = ET.parse(xml_file).getroot()
    # Progress output: the form id is printed for every parsed file.
    print(root.attrib['id'])
    form_info = {
        "id": root.attrib['id'],
        "writer_id": root.attrib['writer-id']
    }

    word_info = []
    # Initialised once, outside the loops, so that lines from every
    # handwritten part are kept and the name always exists.
    lines_bboxes = []
    handwriting_bbox = _empty_extent()

    for handwritten_part in root.findall(".//handwritten-part"):
        for line in handwritten_part.findall("line"):
            line_bbox = _empty_extent()
            for word in line.findall("word"):
                word_id = word.attrib['id']
                transcription = word.attrib['text']
                components = word.findall("cmp")

                if not components:
                    print(f"Skipped word: {transcription}, word id: {word_id}, because no coords provided for bbox")
                    continue

                word_bbox = _empty_extent()
                for cmp in components:
                    x, y, width, height = (
                        int(cmp.attrib[key]) for key in ('x', 'y', 'width', 'height')
                    )
                    # Every component contributes to its word, its line and
                    # the overall handwriting region.
                    for extent in (word_bbox, line_bbox, handwriting_bbox):
                        _grow_extent(extent, x, y, width, height)

                word_info.append({
                    "word_id": word_id,
                    "transcription": transcription,
                    "bbox": _extent_as_list(word_bbox)
                })

            lines_bboxes.append(_extent_as_list(line_bbox))

    return {
        "form_info": form_info,
        "words": word_info,
        "lines_bboxes": lines_bboxes,
        "handwriting_bbox": _extent_as_list(handwriting_bbox)
    }

# Parse a single annotation file and print the result to inspect the parser output.
xml_file = "/content/xml/a01-000u.xml"
parsed_data = parse_iam_annotation(xml_file=xml_file)
print(parsed_data)



In [None]:
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt

def draw_bboxes(image_path, annotations,
                output_path="/content/annotated_image.jpg",
                draw_handwriting=True):
    """Draw the annotation boxes on an image and save the result.

    The handwriting region is outlined in red (optional), line boxes in green
    and word boxes in blue. Boxes that are empty or contain a non-finite
    coordinate (the ``[inf, inf, -inf, -inf]`` placeholder) are skipped.

    Args:
        image_path: Path of the image to annotate.
        annotations: Dict with ``handwriting_bbox``, ``lines_bboxes`` and
            ``words`` (each word having a ``bbox``), boxes given as
            ``[min_x, min_y, max_x, max_y]``.
        output_path: Where the annotated image is written.
        draw_handwriting: Whether to draw the overall handwriting box.
    """
    def _drawable(bbox):
        # Non-empty and every coordinate finite.
        return bool(bbox) and all(
            float("-inf") < value < float("inf") for value in bbox
        )

    # Work on an RGB copy: on a single-band (grayscale) image the named
    # outline colours would be reduced to grey levels and become hard to
    # tell apart.
    image = Image.open(image_path).convert("RGB")
    draw = ImageDraw.Draw(image)

    if draw_handwriting and _drawable(annotations["handwriting_bbox"]):
        draw.rectangle(annotations["handwriting_bbox"], outline="red", width=2)

    for line_bbox in annotations["lines_bboxes"]:
        if _drawable(line_bbox):
            draw.rectangle(line_bbox, outline="green", width=2)

    for word in annotations["words"]:
        if _drawable(word["bbox"]):
            draw.rectangle(word["bbox"], outline="blue", width=2)

    image.save(output_path)
    print(f"Annotated image saved at {output_path}")

# Visual check: draw the parsed boxes of the example form on its page image.
draw_bboxes(image_path='/content/data/000/a01-000u.png',
            annotations=parsed_data)


In [None]:
def crop_from_bbox(image_path, handwriting_bbox, out_path, annotation):
    """Crop an image to ``handwriting_bbox`` and shift the annotation boxes.

    The cropped image is saved to ``out_path`` as JPEG. All word and line
    boxes in ``annotation`` are translated into the coordinate system of the
    crop, and EVERY shifted box is validated against the crop size.

    Line boxes that are the ``[inf, inf, -inf, -inf]`` placeholder (a line
    without any located word) are passed through unchanged and not validated;
    shifting an infinite coordinate would leave it infinite anyway.

    Args:
        image_path: Path of the source image.
        handwriting_bbox: ``[min_x, min_y, max_x, max_y]`` crop rectangle.
        out_path: Destination path of the cropped JPEG.
        annotation: Dict with ``words`` (each having a ``bbox``) and
            ``lines_bboxes``. It is modified in place.

    Returns:
        The same ``annotation`` object, with shifted boxes.

    Raises:
        ValueError: If a shifted word or line box falls outside the crop.
    """
    offset_x, offset_y = handwriting_bbox[0], handwriting_bbox[1]
    image = Image.open(image_path).crop(handwriting_bbox)
    w, h = image.size

    def _shift_and_check(bbox):
        # Translate into crop coordinates, then make sure the box fits.
        x_min, y_min, x_max, y_max = bbox
        shifted = [
            x_min - offset_x,
            y_min - offset_y,
            x_max - offset_x,
            y_max - offset_y
        ]
        if min(shifted) < 0 or shifted[-2] > w or shifted[-1] > h:
            raise ValueError(f"bbox is out of bounds {shifted} size: {image.size}")
        return shifted

    for word in annotation['words']:
        word['bbox'] = _shift_and_check(word['bbox'])

    new_lines_bboxes = []
    for line_bbox in annotation['lines_bboxes']:
        if line_bbox[0] == float('inf'):
            # Placeholder of a line without words: nothing to shift or check.
            new_lines_bboxes.append(list(line_bbox))
        else:
            new_lines_bboxes.append(_shift_and_check(line_bbox))
    annotation['lines_bboxes'] = new_lines_bboxes

    image.save(out_path, "JPEG")
    return annotation

In [None]:
# Try the cropping on the first test image and draw the shifted boxes on the crop.
img_p = test_files_paths[0]
# Named `example_page_id` rather than `id` so the builtin `id` is not
# overwritten for every later cell of the notebook.
example_page_id = os.path.splitext(os.path.basename(img_p))[0]
xml_path = os.path.join('/content/xml', example_page_id + '.xml')
parsed_data = parse_iam_annotation(xml_path)
# crop_from_bbox modifies parsed_data in place and returns the same object.
annotation = crop_from_bbox(img_p, parsed_data['handwriting_bbox'],
                            '/content/cropping_example.jpg', parsed_data)
draw_bboxes('/content/cropping_example.jpg', annotation, draw_handwriting=False)

In [None]:
def extract_base_filename(file_path):
    """Return the file name of ``file_path`` with trailing '.png' removed.

    Repeated suffixes are all removed ('x.png.png' -> 'x'). Names that do not
    end in '.png' are returned unchanged. A name consisting only of leading
    dots plus 'png' (e.g. '.png') is returned as is: ``os.path.splitext``
    treats it as an extension-less dotfile, so there is nothing to strip.

    Args:
        file_path: Path or bare file name.

    Returns:
        The base name without its '.png' suffix(es).
    """
    base = os.path.basename(file_path)
    while base.endswith('.png'):
        stem = os.path.splitext(base)[0]
        if stem == base:
            # splitext made no progress; stop instead of looping forever.
            break
        base = stem
    return base
def crop_save_and_extract(files_paths, out_dir,
                          annotation_data,
                          annotations_json_path,
                          xml_dir='/content/xml'):
    """Crop every image of a split and write the split's annotations as JSON.

    For each image, the XML annotation with the same base name is parsed, the
    image is cropped to its handwriting box and saved as '<id>.jpg' in
    ``out_dir``, and the shifted boxes are stored in ``annotation_data``.
    Finally ``annotation_data`` is dumped to ``annotations_json_path``.

    NOTE: relies on the ``json`` module having been imported in the notebook
    before this function is called.

    Args:
        files_paths: Paths of the page images to process.
        out_dir: Directory receiving the cropped JPEG files.
        annotation_data: Dict filled in place, keyed by page id, with
            ``writer_id``, ``words`` and ``lines_bboxes``.
        annotations_json_path: Path of the JSON file to write.
        xml_dir: Directory holding the '<id>.xml' annotation files.

    Raises:
        ValueError: If the id stored in the XML differs from the file name.
    """
    for img_p in files_paths:
        page_id = extract_base_filename(img_p)
        xml_path = os.path.join(xml_dir, page_id + '.xml')
        parsed_data = parse_iam_annotation(xml_path)

        # Guard against an annotation file that describes a different form.
        if parsed_data['form_info']['id'] != page_id:
            raise ValueError("Id's don't match")

        out_img_path = os.path.join(out_dir, page_id + '.jpg')

        parsed_data = crop_from_bbox(img_p, parsed_data["handwriting_bbox"],
                                     out_img_path, parsed_data)

        annotation_data[page_id] = {
            "writer_id": parsed_data['form_info']['writer_id'],
            "words": parsed_data['words'],
            "lines_bboxes": parsed_data['lines_bboxes']
        }

    with open(annotations_json_path, 'w') as f:
        json.dump(annotation_data, f)

In [None]:
import json

# One annotation dict per split; each is filled in place by crop_save_and_extract.
test_annotation_data = {}
train_annotation_data = {}
validation_annotation_data = {}

# Process the splits in the order test, train, validation.
split_jobs = (
    (test_files_paths, out_test_dir, test_annotation_data,
     '/content/SOURCE2/test_annotations.json'),
    (train_files_paths, out_train_dir, train_annotation_data,
     '/content/SOURCE2/train_annotations.json'),
    (validation_files_paths, out_validation_dir, validation_annotation_data,
     '/content/SOURCE2/validation_annotations.json'),
)
for job_files, job_out_dir, job_annotations, job_json_path in split_jobs:
    crop_save_and_extract(job_files, job_out_dir, job_annotations, job_json_path)

In [None]:
# Pack the whole output directory into /content/detection.zip.
shutil.make_archive(base_name='/content/detection',
                    format="zip",
                    root_dir='/content/SOURCE2/')