In [None]:
# Mount Google Drive at /content/drive; the dataset archives are read from
# /content/drive/MyDrive in a later cell.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import shutil
import numpy
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
import zipfile

# Unpack both dataset archives into /content (presumably the page images and
# the PAGE XML files that later cells read from /content/Pages and /content/PAGE).
for archive_path in (
    '/content/drive/MyDrive/Datasets/ICDAR 2015/batch1/batch1.zip',
    '/content/drive/MyDrive/Datasets/ICDAR 2015/batch1/xml.zip',
):
    with zipfile.ZipFile(archive_path, 'r') as archive:
        archive.extractall('/content')

In [None]:
from lxml import etree

def extract_bbox_coords(points):
    """Compute the axis-aligned bounding box enclosing a list of point elements.

    Args:
        points: Sequence of elements exposing ``get('x')``, ``get('y')`` and
            ``attrib`` (e.g. XML ``Point`` nodes).

    Returns:
        ``[x_min, y_min, x_max, y_max]`` as floats, or ``None`` (after printing
        a message) when ``points`` is empty or any point lacks ``x`` or ``y``.
    """
    if not points:
        print("No points found for bounding box.")
        return None

    pairs = []
    for point in points:
        x, y = point.get('x'), point.get('y')

        # A single incomplete point invalidates the whole polygon.
        if x is None or y is None:
            print(f"Point missing x or y attribute: {point.attrib}")
            return None

        pairs.append((float(x), float(y)))

    xs = [px for px, _ in pairs]
    ys = [py for _, py in pairs]
    return [min(xs), min(ys), max(xs), max(ys)]

def bbox_to_yolo(bbox):
    """Convert a corner-format box to centre-format.

    Args:
        bbox: Indexable ``[x_min, y_min, x_max, y_max]``.

    Returns:
        ``[x_center, y_center, width, height]`` in the same units as the input;
        no normalisation is applied here.
    """
    left, top = bbox[0], bbox[1]
    width = bbox[2] - left
    height = bbox[3] - top
    center_x = left + width / 2
    center_y = top + height / 2
    return [center_x, center_y, width, height]

def extract_line_bboxes(xml_filepath):
    """Read a PAGE XML file and return one centre-format box per text line.

    Every ``TextLine`` element in the PAGE 2010-03-19 namespace is visited; the
    ``Point`` children of its ``Coords`` element are reduced to a bounding box
    and converted with ``bbox_to_yolo``.

    Text lines that have no ``Coords`` element, or whose points cannot be
    turned into a box (``extract_bbox_coords`` returned ``None``), are skipped
    instead of aborting the whole file.

    Args:
        xml_filepath: Path to the PAGE XML file.

    Returns:
        List of ``[x_center, y_center, width, height]`` boxes, in the
        coordinate units used by the XML.
    """
    ns = {'ns': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2010-03-19'}

    with open(xml_filepath, 'rb') as file:
        tree = etree.parse(file)

    line_bboxes = []

    for textline in tree.findall('.//ns:TextLine', namespaces=ns):
        coords = textline.find('ns:Coords', namespaces=ns)
        if coords is None:
            # find() returns None when the child is absent; calling findall on
            # it would raise AttributeError.
            print(f"TextLine without Coords skipped in {xml_filepath}")
            continue

        min_max_coords = extract_bbox_coords(coords.findall('ns:Point', namespaces=ns))
        if min_max_coords is None:
            # extract_bbox_coords already printed the reason.
            continue

        line_bboxes.append(bbox_to_yolo(min_max_coords))

    return line_bboxes


In [None]:
def draw_bboxes(image_path, line_bboxes, output_path="/content/annotated_image.jpg"):
    """Draw centre-format boxes on an image and save the annotated copy.

    Args:
        image_path: Path of the image to annotate.
        line_bboxes: Iterable of ``[x_center, y_center, width, height]`` boxes
            in the image's pixel coordinates.
        output_path: Where the annotated image is written.
    """
    # The context manager closes the underlying file once the copy is saved.
    with Image.open(image_path) as image:
        draw = ImageDraw.Draw(image)

        for bbox in line_bboxes:
            half_width = bbox[2] / 2
            half_height = bbox[3] / 2
            # Both corners sit half a side away from the centre. Adding the
            # full width/height to the centre would draw an oversized box.
            min_max_box = [
                bbox[0] - half_width,
                bbox[1] - half_height,
                bbox[0] + half_width,
                bbox[1] + half_height,
            ]
            draw.rectangle(min_max_box, outline="green", width=2)

        image.save(output_path)

    print(f"Annotated image saved at {output_path}")

In [None]:
xml_folder = '/content/PAGE/'
img_folder = '/content/Pages/'
output_folder = '/content/output/'
os.makedirs(output_folder, exist_ok=True)

# os.listdir returns entries in arbitrary order, so sort them to make the
# preview below (and any later positional slicing of xml_files) reproducible.
# Non-XML entries are ignored.
xml_files = sorted(f for f in os.listdir(xml_folder) if f.lower().endswith('.xml'))

# Number of pages rendered with their boxes as a visual sanity check.
PREVIEW_COUNT = 10

for xml_file in xml_files[:PREVIEW_COUNT]:
  xml_filepath = os.path.join(xml_folder, xml_file)
  # The image is looked up under the XML file's stem with a .jpg extension.
  img_file = os.path.splitext(xml_file)[0] + '.jpg'
  img_filepath = os.path.join(img_folder, img_file)
  output_path = os.path.join(output_folder, img_file)

  line_bboxes = extract_line_bboxes(xml_filepath)
  draw_bboxes(img_filepath, line_bboxes, output_path)


In [None]:
# Zip the contents of /content/output/ into /content/annotated.zip.
shutil.make_archive(
    base_name='/content/annotated',
    format='zip',
    root_dir='/content/output/',
)

In [None]:
# Report how many entries were listed from the XML folder.
print(len(xml_files))

In [None]:
dataset_dir = '/content/dataset/'
os.makedirs(dataset_dir, exist_ok=True)

# The first 46 listed XML files form the validation split; the rest are training.
val_xml, train_xml = xml_files[:46], xml_files[46:]

# Record each split's file names, one per line.
for list_path, split_files in (
    ('/content/dataset/train_xml.txt', train_xml),
    ('/content/dataset/val_xml.txt', val_xml),
):
  with open(list_path, 'w') as f:
    f.writelines(name + '\n' for name in split_files)

In [None]:
def save_yolo_annotation(width, height, bboxes, filepath):
  """Write boxes to a text file, one ``0 xc yc w h`` row per box.

  Args:
    width: Divisor for the x-centre and box width (the image width).
    height: Divisor for the y-centre and box height (the image height).
    bboxes: Iterable of indexable ``[x_center, y_center, width, height]`` boxes.
    filepath: Destination text file; overwritten if it exists.
  """
  def _format_row(box):
    # Class id is always 0; the four values are scaled by the image size.
    xc, yc = box[0] / width, box[1] / height
    w, h = box[2] / width, box[3] / height
    return f"0 {xc} {yc} {w} {h}\n"

  with open(filepath, 'w') as out:
    out.writelines(_format_row(box) for box in bboxes)

def prepare_dataset(xml_files, output_folder):
  """Populate ``output_folder`` with images and matching label files.

  For each XML file name, the annotation is read from the module-level
  ``xml_folder``, the ``.jpg`` with the same stem is looked up in the
  module-level ``img_folder``, a ``<stem>.txt`` label file is written via
  ``save_yolo_annotation`` and the image is copied next to it.

  Args:
    xml_files: Iterable of XML file names (not full paths).
    output_folder: Destination directory; created if it does not exist.
  """
  os.makedirs(output_folder, exist_ok=True)

  for xml_file in xml_files:
    # splitext strips whatever the extension is, unlike a fixed-width slice.
    stem = os.path.splitext(os.path.basename(xml_file))[0]
    xml_filepath = os.path.join(xml_folder, xml_file)
    img_file = stem + '.jpg'
    img_filepath = os.path.join(img_folder, img_file)
    output_path = os.path.join(output_folder, img_file)

    line_bboxes = extract_line_bboxes(xml_filepath)

    # Only the size is needed; close the file right away so handles do not
    # pile up while iterating over the whole dataset.
    with Image.open(img_filepath) as img:
      img_width, img_height = img.size

    txt_filepath = os.path.join(output_folder, stem + ".txt")

    save_yolo_annotation(img_width, img_height, line_bboxes, txt_filepath)
    shutil.copy(img_filepath, output_path)

In [None]:
# Destination folders for the two splits.
val_dataset_dir = '/content/dataset/val'
train_dataset_dir = '/content/dataset/train'

for split_dir in (val_dataset_dir, train_dataset_dir):
  os.makedirs(split_dir, exist_ok=True)

# Fill validation first, then training.
for split_files, split_dir in (
    (val_xml, val_dataset_dir),
    (train_xml, train_dataset_dir),
):
  prepare_dataset(split_files, split_dir)

In [None]:
# Zip the contents of /content/dataset into /content/dataset.zip.
shutil.make_archive(
    base_name='/content/dataset',
    format='zip',
    root_dir='/content/dataset',
)