In [19]:
import os
import zipfile
import shutil

# Unpack the page-image archive into /content/.
pages_zip_path = '/content/drive/MyDrive/Datasets/SOURCE2/detection/pages.zip'
with zipfile.ZipFile(pages_zip_path, mode='r') as f:
  f.extractall(path='/content/')

In [20]:
# Unpack the XML annotation archive into /content/.
xml_zip_path = '/content/drive/MyDrive/Datasets/SOURCE2/detection/xml.zip'
with zipfile.ZipFile(xml_zip_path, mode='r') as f:
  f.extractall(path='/content/')

In [21]:
def read_split_ids(split_path):
  """Return the page ids listed one per line in a split file.

  Surrounding whitespace (including the trailing newline) is stripped from
  every line. Lines that are empty after stripping are skipped so a stray
  blank line does not turn into an empty id.
  """
  with open(split_path, 'r') as split_f:
    stripped_lines = (line.strip() for line in split_f)
    return [page_id for page_id in stripped_lines if page_id]

test_pages_ids = read_split_ids('/content/drive/MyDrive/Datasets/SOURCE2/splits/test.uttlist')
train_pages_ids = read_split_ids('/content/drive/MyDrive/Datasets/SOURCE2/splits/train.uttlist')
validation_pages_ids = read_split_ids('/content/drive/MyDrive/Datasets/SOURCE2/splits/validation.uttlist')

print(test_pages_ids)
print(train_pages_ids)
print(validation_pages_ids)

['c04-110', 'c04-116', 'c04-134', 'c04-139', 'c04-144', 'c04-150', 'c04-165', 'c04-170', 'c06-011', 'd01-016', 'd01-019', 'd01-049', 'd01-052', 'd01-080', 'd01-085', 'd01-098', 'd01-104', 'd01-118', 'd01-123', 'd03-117', 'd04-012', 'd04-016', 'd04-021', 'd04-028', 'd04-032', 'd04-037', 'd04-047', 'd04-050', 'd04-053', 'd04-058', 'd04-062', 'd04-066', 'd04-086', 'd04-089', 'd04-096', 'd04-101', 'd04-111', 'd04-125', 'd04-131', 'd05-008', 'd05-013', 'd05-021', 'd05-025', 'd05-030', 'd05-040', 'd06-003', 'd06-008', 'd06-011', 'd06-020', 'd06-025', 'd06-027', 'd06-037', 'd06-041', 'd06-046', 'd06-056', 'd06-060', 'd06-063', 'd06-067', 'd06-072', 'd06-076', 'd06-086', 'd06-096', 'd06-100', 'd06-104', 'd06-107', 'd06-111', 'd06-113', 'd07-082', 'd07-085', 'd07-089', 'd07-093', 'd07-096', 'd07-100', 'd07-102', 'e01-055', 'e06-000', 'e06-003', 'e06-010', 'e06-015', 'e06-021', 'e06-026', 'e06-030', 'e06-033', 'e06-046', 'e06-049', 'e06-053', 'e06-070', 'f04-032', 'f04-035', 'f04-039', 'f04-043'

In [22]:
def list_nonhidden_dirs(path):
    """Return the names of the sub-directories of ``path`` that do not start with a dot.

    Names are returned in the order ``os.listdir`` yields them.
    """
    visible_dirs = []
    for entry in os.listdir(path):
        if entry.startswith('.'):
            continue
        if os.path.isdir(os.path.join(path, entry)):
            visible_dirs.append(entry)
    return visible_dirs

def list_png_files(path):
    """Return the entries of ``path`` whose names end with ``.png`` (case-sensitive)."""
    png_names = []
    for entry in os.listdir(path):
        if entry.endswith('.png'):
            png_names.append(entry)
    return png_names

def find_and_append_files(ids, files_paths, dir, filename, data_dir):
    """Append the image path for ``filename`` to ``files_paths`` if it is listed in ``ids``.

    Args:
        ids: Page ids belonging to one split.
        files_paths: List that is extended in place with the matching path.
        dir: Name of the sub-directory of ``data_dir`` that holds the image.
        filename: Image file name without the ``.png`` extension.
        data_dir: Root directory of the page images.

    Raises:
        ValueError: If ``filename`` occurs more than once in ``ids``.
    """
    hits = []
    for candidate in ids:
        if candidate == filename:
            hits.append(candidate)

    if not hits:
        return
    if len(hits) > 1:
        print(hits)
        raise ValueError(f'More than one match occurred for {filename}.')

    files_paths.append(os.path.join(data_dir, dir, filename + '.png'))

def process_files(data_dir, test_ids, train_ids, validation_ids):
    """Sort the PNG pages found under ``data_dir`` into test/train/validation path lists.

    Every non-hidden sub-directory of ``data_dir`` is scanned for ``.png``
    files; a file is assigned to a split when its name without the last four
    characters appears in that split's id list.

    Returns:
        A ``(test_files, train_files, validation_files)`` tuple of path lists.
    """
    test_files = []
    train_files = []
    validation_files = []

    # Each split's id list is paired with the list that collects its paths;
    # the order (test, train, validation) is the order the splits are checked.
    split_buckets = (
        (test_ids, test_files),
        (train_ids, train_files),
        (validation_ids, validation_files),
    )

    for subdir in list_nonhidden_dirs(data_dir):
        png_names = list_png_files(os.path.join(data_dir, subdir))
        for png_name in png_names:
            page_id = png_name[:-4]
            for split_ids, split_files in split_buckets:
                find_and_append_files(split_ids, split_files, subdir, page_id, data_dir)

    return test_files, train_files, validation_files

# Root directory of the extracted page images.
pages_data_dir = '/content/data'

# Resolve every split's page ids to the image files found on disk.
split_files_paths = process_files(
    pages_data_dir, test_pages_ids, train_pages_ids, validation_pages_ids)
test_files_paths, train_files_paths, validation_files_paths = split_files_paths


In [23]:
# For each split (test, train, validation) show: number of ids, number of images found.
for split_ids, split_paths in (
    (test_pages_ids, test_files_paths),
    (train_pages_ids, train_files_paths),
    (validation_pages_ids, validation_files_paths),
):
    print(len(split_ids), len(split_paths))

336 336
747 747
116 116


In [24]:
# Count the XML annotation files whose name (minus the last four characters)
# is listed in each split, in the order test, train, validation.
xml_file_names = os.listdir('/content/xml')
for split_ids in (test_pages_ids, train_pages_ids, validation_pages_ids):
    matching_xml = [name for name in xml_file_names if name[:-4] in split_ids]
    print(len(matching_xml))

336
747
116


The counts match, so we can proceed and extract the final results.

In [25]:
# Output directories for the cropped images, one per split.
out_train_dir = '/content/SOURCE2/train'
out_test_dir = '/content/SOURCE2/test'
out_validation_dir = '/content/SOURCE2/validation'

for split_out_dir in (out_train_dir, out_test_dir, out_validation_dir):
    os.makedirs(split_out_dir, exist_ok=True)

import xml.etree.ElementTree as ET

def parse_iam_annotation(xml_file):
    """Parse one form XML file into word boxes and an overall handwriting box.

    Words are read from ``handwritten-part/line/word`` elements. A word's box
    is the union of its ``cmp`` child rectangles; words without any ``cmp``
    child are reported on stdout and skipped.

    Args:
        xml_file: Path to the annotation XML file.

    Returns:
        A dict with three keys:
            ``form_info``: ``{"id": ..., "writer_id": ...}`` taken from the
                root element's ``id`` and ``writer-id`` attributes.
            ``words``: list of ``{"word_id", "transcription", "bbox"}`` dicts,
                where ``bbox`` is ``[min_x, min_y, max_x, max_y]``.
            ``handwriting_bbox``: ``[min_x, min_y, max_x, max_y]`` covering
                every kept word, or ``None`` when no word has coordinates
                (instead of a meaningless box made of infinities).

    Raises:
        KeyError: If a required XML attribute is missing.
        ValueError: If a coordinate attribute is not an integer.
    """
    root = ET.parse(xml_file).getroot()
    form_id = root.attrib['id']
    print(form_id)  # progress line: one per parsed form
    form_info = {
        "id": form_id,
        "writer_id": root.attrib['writer-id']
    }

    word_info = []
    for handwritten_part in root.findall(".//handwritten-part"):
        for line in handwritten_part.findall("line"):
            for word in line.findall("word"):
                word_id = word.attrib['id']
                transcription = word.attrib['text']

                coord_blocks = word.findall("cmp")
                if not coord_blocks:
                    print(f"Skipped word: {transcription}, word id: {word_id}, because no coords provided for bbox")
                    continue

                lefts, tops, rights, bottoms = [], [], [], []
                for cmp in coord_blocks:
                    x, y, width, height = int(cmp.attrib['x']), int(cmp.attrib['y']), int(cmp.attrib['width']), int(cmp.attrib['height'])
                    lefts.append(x)
                    tops.append(y)
                    rights.append(x + width)
                    bottoms.append(y + height)

                word_info.append({
                    "word_id": word_id,
                    "transcription": transcription,
                    "bbox": [min(lefts), min(tops), max(rights), max(bottoms)]
                })

    # The page-level box is the union of the word boxes, which in turn are the
    # unions of their components, so it can be derived from the words alone.
    if word_info:
        handwriting_bbox_final = [
            min(entry["bbox"][0] for entry in word_info),
            min(entry["bbox"][1] for entry in word_info),
            max(entry["bbox"][2] for entry in word_info),
            max(entry["bbox"][3] for entry in word_info)
        ]
    else:
        handwriting_bbox_final = None

    return {
        "form_info": form_info,
        "words": word_info,
        "handwriting_bbox": handwriting_bbox_final
    }

# Try the parser on a single form and show the parsed structure.
xml_file = "/content/xml/a01-000u.xml"
parsed_data = parse_iam_annotation(xml_file=xml_file)
print(parsed_data)



a01-000u
{'form_info': {'id': 'a01-000u', 'writer_id': '000'}, 'words': [{'word_id': 'a01-000u-00-00', 'transcription': 'A', 'bbox': [408, 768, 435, 819]}, {'word_id': 'a01-000u-00-01', 'transcription': 'MOVE', 'bbox': [507, 766, 720, 814]}, {'word_id': 'a01-000u-00-02', 'transcription': 'to', 'bbox': [796, 764, 866, 814]}, {'word_id': 'a01-000u-00-03', 'transcription': 'stop', 'bbox': [919, 757, 1085, 835]}, {'word_id': 'a01-000u-00-04', 'transcription': 'Mr.', 'bbox': [1185, 754, 1311, 815]}, {'word_id': 'a01-000u-00-05', 'transcription': 'Gaitskell', 'bbox': [1438, 746, 1820, 819]}, {'word_id': 'a01-000u-00-06', 'transcription': 'from', 'bbox': [1896, 757, 2069, 829]}, {'word_id': 'a01-000u-01-00', 'transcription': 'nominating', 'bbox': [395, 932, 836, 1032]}, {'word_id': 'a01-000u-01-01', 'transcription': 'any', 'bbox': [901, 958, 1048, 1037]}, {'word_id': 'a01-000u-01-02', 'transcription': 'more', 'bbox': [1112, 958, 1320, 1000]}, {'word_id': 'a01-000u-01-03', 'transcription': 'La

In [26]:
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt

def draw_bboxes(image_path, annotations, output_path="/content/annotated_image.jpg"):
    """Draw the handwriting box (red) and every word box (blue) on a page image.

    Args:
        image_path: Path of the page image to annotate.
        annotations: Dict with a ``"handwriting_bbox"`` entry and a ``"words"``
            list whose items carry a ``"bbox"``; each box is
            ``[min_x, min_y, max_x, max_y]``. Falsy boxes are skipped.
        output_path: Where the annotated copy is saved.
    """
    # Work on an RGB copy: on a single-band (e.g. grayscale) image the named
    # outline colours would be reduced to that image's mode and no longer be
    # distinguishable. RGB is also a mode the JPEG writer accepts.
    # The context manager closes the source file once the copy exists.
    with Image.open(image_path) as source:
        image = source.convert("RGB")
    draw = ImageDraw.Draw(image)

    if annotations["handwriting_bbox"]:
        bbox = annotations["handwriting_bbox"]
        draw.rectangle(bbox, outline="red", width=2)

    for word in annotations["words"]:
        if word["bbox"]:
            bbox = word["bbox"]
            draw.rectangle(bbox, outline="blue", width=2)

    image.save(output_path)
    print(f"Annotated image saved at {output_path}")

# Visual check: overlay the parsed boxes on the page they were parsed from.
draw_bboxes(image_path='/content/data/000/a01-000u.png', annotations=parsed_data)

Annotated image saved at /content/annotated_image.jpg


In [27]:
def crop_from_bbox(image_path, bbox, out_path, annotation):
  """Crop a page to ``bbox``, save it as JPEG and shift the word boxes to match.

  Args:
    image_path: Path of the source page image.
    bbox: Crop rectangle ``[min_x, min_y, max_x, max_y]`` in page coordinates.
    out_path: Destination path of the cropped JPEG.
    annotation: Dict with a ``"words"`` list; every word has a ``"bbox"``
      ``[min_x, min_y, max_x, max_y]`` in page coordinates.

  Returns:
    A new annotation dict whose word boxes are relative to the crop's top-left
    corner. The ``annotation`` argument is left unmodified, so a failed or
    repeated call cannot leave it partially or doubly shifted.

  Raises:
    ValueError: If a shifted word box falls outside the cropped image. Nothing
      is written to ``out_path`` in that case.
  """
  # Close the source file as soon as the cropped copy exists.
  with Image.open(image_path) as source:
    image = source.crop(bbox)
  w, h = image.size

  shifted_words = []
  for word in annotation['words']:
    x_min, y_min, x_max, y_max = word['bbox']
    shifted_bbox = [x_min - bbox[0], y_min - bbox[1], x_max - bbox[0], y_max - bbox[1]]
    if min(shifted_bbox) < 0 or shifted_bbox[-2] > w or shifted_bbox[-1] > h:
      raise ValueError(f"bbox is out of bounds {shifted_bbox} size: {image.size}")
    shifted_words.append({**word, 'bbox': shifted_bbox})

  image.save(out_path, "JPEG")
  return {**annotation, 'words': shifted_words}

In [28]:
# Crop every test page to its handwriting area and collect the shifted word boxes.
test_annotation_data = {}

for img_p in test_files_paths:
  # Page id = image file name without its extension. Named `page_id` so the
  # builtin `id` is not shadowed in the notebook namespace.
  page_id = os.path.splitext(os.path.basename(img_p))[0]

  xml_path = os.path.join('/content/xml', page_id + '.xml')
  # Loop-local name: the global `parsed_data` used by the drawing cell is left alone.
  page_annotation = parse_iam_annotation(xml_path)

  if page_annotation['form_info']['id'] != page_id:
    raise ValueError("Id's don't match")

  out_img_path = os.path.join(out_test_dir, page_id + '.jpg')

  cropped_annotation = crop_from_bbox(img_p, page_annotation["handwriting_bbox"],
                                      out_img_path, page_annotation)

  test_annotation_data[page_id] = {"writer_id": cropped_annotation['form_info']['writer_id'],
                                   "words": cropped_annotation['words']}



p03-181
p03-173
p03-027
m03-033
p02-027
g04-052
n03-126
p02-150
d06-104
e06-049
d06-067
d06-025
e06-026
g03-040
Skipped word: to, word id: g03-040-02-01, because no coords provided for bbox
p02-115
n01-036
d01-098
d01-104
g04-043
g04-048
p02-000
d04-131
d04-125
f04-039
f04-043
m01-125
f04-071
f04-068
n02-033
n02-037
p03-029
c06-011
d06-063
d06-046
d06-020
d06-008
m02-112
p03-012
n04-114
Skipped word: ", word id: n04-114-04-12, because no coords provided for bbox
n04-092
n04-107
Skipped word: ", word id: n04-107-06-10, because no coords provided for bbox
n04-084
n04-100
g03-000
n02-054
d04-016
d04-012
m02-080
p02-101
d05-030
d05-008
d05-025
d05-021
Skipped word: ., word id: d05-021-08-08, because no coords provided for bbox
m04-078
m04-100
m04-093
m04-081
Skipped word: ", word id: m04-081-10-07, because no coords provided for bbox
m04-072
d04-089
d04-086
m01-095
n01-052
d06-100
e06-033
d06-037
e06-070
Skipped word: ., word id: e06-070-09-06, because no coords provided for bbox
d06-072
S

In [29]:
import json

# Persist the test-split annotations next to the cropped images.
test_annotations_path = '/content/SOURCE2/test_annotations.json'
with open(test_annotations_path, mode='w') as annotations_file:
  json.dump(test_annotation_data, annotations_file)

In [30]:
# Crop every train page to its handwriting area and collect the shifted word boxes.
train_annotation_data = {}

for img_p in train_files_paths:
  # Page id = image file name without its extension. Named `page_id` so the
  # builtin `id` is not shadowed in the notebook namespace.
  page_id = os.path.splitext(os.path.basename(img_p))[0]

  xml_path = os.path.join('/content/xml', page_id + '.xml')
  # Loop-local name: the global `parsed_data` used by the drawing cell is left alone.
  page_annotation = parse_iam_annotation(xml_path)

  if page_annotation['form_info']['id'] != page_id:
    raise ValueError("Id's don't match")

  out_img_path = os.path.join(out_train_dir, page_id + '.jpg')

  cropped_annotation = crop_from_bbox(img_p, page_annotation["handwriting_bbox"],
                                      out_img_path, page_annotation)

  train_annotation_data[page_id] = {"writer_id": cropped_annotation['form_info']['writer_id'],
                                    "words": cropped_annotation['words']}

with open('/content/SOURCE2/train_annotations.json', 'w') as f:
  json.dump(train_annotation_data, f)


r02-054
c04-061
c04-066
a06-008
b01-136
b01-127
b01-132
g06-026k
g06-050k
g06-031k
g06-042k
g06-018k
g06-045k
g06-037k
g06-047k
g06-011k
a04-092
a04-096
a04-099
a04-103
a06-057
f02-044
r03-096
c04-004
c04-000
a01-043
a01-049
e02-025
f02-076
r02-081
r06-011
r06-027
r06-007
r06-022
r06-000
r06-143
r06-003
p06-248
r06-018
p06-242
e04-068
e04-062
b05-098
b05-083
e02-021
e01-029
e01-032
e01-035
e02-091
Skipped word: ., word id: e02-091-03-06, because no coords provided for bbox
a01-003
e02-000
r03-056
a03-017
c06-058
b06-068
c06-095
b06-042
e07-072
e07-061
e07-012
e07-066
a06-044
c03-081b
c03-003b
c03-000b
c03-007b
c03-021b
Skipped word: ., word id: c03-021b-08-04, because no coords provided for bbox
c03-094b
c03-084b
Skipped word: formal, word id: c03-084b-01-07, because no coords provided for bbox
c03-016b
c03-087b
c03-096b
c04-122
c04-075
b05-079
f01-085
f01-081
f02-038
a06-110
a02-062
a06-039
e07-112
c02-089
e04-114
e04-109
e04-103
b03-109
c03-087c
c03-021c
c03-096c
c03-081c
c03-094c
c0

As we can see, most of the skipped "words" are punctuation marks. Fixing these annotations could take time, and these marks will be added in postprocessing by the LM anyway, so I will do nothing about this discrepancy.

In [31]:
# Crop every validation page to its handwriting area and collect the shifted word boxes.
validation_annotation_data = {}

for img_p in validation_files_paths:
  # Page id = image file name without its extension. Named `page_id` so the
  # builtin `id` is not shadowed in the notebook namespace.
  page_id = os.path.splitext(os.path.basename(img_p))[0]

  xml_path = os.path.join('/content/xml', page_id + '.xml')
  # Loop-local name: the global `parsed_data` used by the drawing cell is left alone.
  page_annotation = parse_iam_annotation(xml_path)

  if page_annotation['form_info']['id'] != page_id:
    raise ValueError("Id's don't match")

  out_img_path = os.path.join(out_validation_dir, page_id + '.jpg')

  cropped_annotation = crop_from_bbox(img_p, page_annotation["handwriting_bbox"],
                                      out_img_path, page_annotation)

  validation_annotation_data[page_id] = {"writer_id": cropped_annotation['form_info']['writer_id'],
                                         "words": cropped_annotation['words']}

with open('/content/SOURCE2/validation_annotations.json', 'w') as f:
  json.dump(validation_annotation_data, f)

m03-095
d01-024
g04-055
m02-083
Skipped word: ,, word id: m02-083-11-08, because no coords provided for bbox
m01-090
f04-093
d04-081
Skipped word: ., word id: d04-081-03-02, because no coords provided for bbox
f04-096
f04-100
g03-016
m04-007
m04-012
m04-024
m04-019
m04-000
f07-046b
g04-060
g04-063
m04-209
m04-222
m04-190
m04-200
m04-216
Skipped word: ", word id: m04-216-06-05, because no coords provided for bbox
m03-062
p03-185
n02-098
p03-057
p03-087
p03-103
p03-096
p03-112
Skipped word: ., word id: p03-112-06-09, because no coords provided for bbox
p02-131
n06-182
n06-156
n06-148
n06-163
n06-201
n06-175
n06-169
n06-186
m06-019
n06-194
Skipped word: ", word id: n06-194-06-09, because no coords provided for bbox
Skipped word: ", word id: n06-194-07-13, because no coords provided for bbox
p03-151
m02-052
f07-039b
f07-028b
f07-036
f07-042b
f07-032b
g04-039
g04-036
d03-112
Skipped word: ., word id: d03-112-01-11, because no coords provided for bbox
n04-022
n04-015
n04-044
n04-039
n04-031


In [32]:
# Bundle the cropped images and annotation files into /content/detection.zip.
shutil.make_archive(base_name='/content/detection', format='zip', root_dir='/content/SOURCE2/')

'/content/detection.zip'