In [40]:
import os
import shutil
import zipfile

# Directory the recognition archive is unpacked into.
SOURCE2_DIR = '/content/source2-recognition'

# Paths of the per-split annotation files inside the unpacked archive.
(
    source2_test_annotations,
    source2_train_annotations,
    source2_validation_annotations,
) = (
    os.path.join(SOURCE2_DIR, split_name + '_annotations.json')
    for split_name in ('test', 'train', 'validation')
)

# Unpack the whole archive from Drive into SOURCE2_DIR.
with zipfile.ZipFile(file="/content/drive/MyDrive/Datasets/SOURCE2/recognition.zip", mode="r") as f:
  f.extractall(path=SOURCE2_DIR)

In [41]:
import json
from PIL import Image
import pandas as pd

# Module-level accumulators that process_images() appends to:
# one output filename and one transcription per converted word image.
filenames_list, transcriptions_list = [], []

def process_images(source_dir, out_dir, mode="train",
                   excluded_ids=("r06-022-03-05", "a01-117-05-02")):
    """Convert the word images of one dataset split to RGB JPEG files.

    Reads ``<source_dir>/<mode>_annotations.json`` and, for every annotated
    word whose image file exists under
    ``<source_dir>/words/<prefix>/<page_id>/``, saves an RGB copy to
    ``<out_dir>/<mode>/<word_id>.jpg``.

    Side effects:
        * Appends the output filename and the transcription of every
          converted word to the module-level lists ``filenames_list`` and
          ``transcriptions_list`` (looked up at call time).
        * Prints ``"<word_id> is not in data"`` for every annotated word
          that has no usable image file.
        * Creates ``<out_dir>/<mode>`` even when no word is converted, so
          the directory always exists for later archiving.

    Args:
        source_dir: Directory holding the annotation JSON files and the
            ``words`` image tree.
        out_dir: Root output directory; images go into ``<out_dir>/<mode>``.
        mode: Split name, used both as the annotation file prefix and as
            the output sub-directory name.
        excluded_ids: Word ids whose image files are ignored even when
            present on disk; such words are reported as "not in data".

    Raises:
        ValueError: If a page id does not consist of exactly two
            ``-``-separated parts.
    """
    json_path = os.path.join(source_dir, mode + "_annotations.json")
    with open(json_path, 'r') as f:
        annotations = json.load(f)

    words_dir = os.path.join(source_dir, "words")
    split_out_dir = os.path.join(out_dir, mode)
    excluded = set(excluded_ids)

    # Created once up front instead of once per converted word.
    os.makedirs(split_out_dir, exist_ok=True)

    for page_id, page in annotations.items():
        # Page images live under words/<first part of page id>/<page id>/.
        pages_dir, _ = page_id.split("-")
        page_words_dir = os.path.join(words_dir, pages_dir, page_id)

        # Map word id (file name without extension) -> image path so each
        # annotated word is matched with a single dictionary lookup.
        word_paths = {}
        for name in os.listdir(page_words_dir):
            path = os.path.join(page_words_dir, name)
            stem = os.path.splitext(name)[0]
            if os.path.isfile(path) and stem not in excluded:
                # setdefault keeps the first file seen for a given id.
                word_paths.setdefault(stem, path)

        for word_dict in page["words"]:
            word_id = word_dict["word_id"]
            src_path = word_paths.get(word_id)
            if src_path is None:
                print(f"{word_id} is not in data")
                continue

            out_name = word_id + '.jpg'
            # The context manager closes the source file handle promptly.
            with Image.open(src_path) as word_img:
                word_img.convert("RGB").save(os.path.join(split_out_dir, out_name))

            filenames_list.append(out_name)
            transcriptions_list.append(word_dict["transcription"])

def save_to_csv_and_archive(filenames_list, transcriptions_list, csv_path, archive_path, out_dir):
    """Write the filename/transcription table to CSV and zip a directory.

    Args:
        filenames_list: Values for the ``filename`` column.
        transcriptions_list: Values for the ``transcription`` column.
        csv_path: Destination of the CSV file (written without the index).
        archive_path: Archive base name; ``.zip`` is appended by
            ``shutil.make_archive``.
        out_dir: Directory whose contents are placed into the archive.
    """
    columns = {
        'filename': filenames_list,
        'transcription': transcriptions_list,
    }
    pd.DataFrame(columns).to_csv(csv_path, index=False)

    shutil.make_archive(base_name=archive_path, format="zip", root_dir=out_dir)


In [42]:

# process_images() appends to these module-level lists, so start from empty
# ones: this keeps test.csv free of rows left over from any previously run cell.
filenames_list = []
transcriptions_list = []

process_images(SOURCE2_DIR, "/content/SOURCE2/", "test")
save_to_csv_and_archive(filenames_list, transcriptions_list, "/content/SOURCE2/test.csv", "/content/SOURCE2/test", "/content/SOURCE2/test")

# Leave the accumulators empty for whichever split is processed next.
filenames_list = []
transcriptions_list = []


In [43]:
# Export the validation split first, then the train split.
for split in ("validation", "train"):
    # process_images() appends to these module-level lists; resetting them at
    # the start of each split keeps every CSV limited to its own rows, even
    # when this cell is re-run.
    filenames_list = []
    transcriptions_list = []

    process_images(SOURCE2_DIR, "/content/SOURCE2/", split)
    save_to_csv_and_archive(
        filenames_list,
        transcriptions_list,
        "/content/SOURCE2/" + split + ".csv",
        "/content/SOURCE2/" + split,
        "/content/SOURCE2/" + split,
    )


a01-117-05-02 is not in data
r06-022-03-05 is not in data


In [44]:
# Load the three annotation files through the path constants defined next to
# SOURCE2_DIR instead of repeating the literal paths.
with open(source2_test_annotations, 'r') as f:
    test_annotations = json.load(f)

with open(source2_train_annotations, 'r') as f:
    train_annotations = json.load(f)

with open(source2_validation_annotations, 'r') as f:
    validation_annotations = json.load(f)

# Top-level keys (page ids) of all three splits, in test/train/validation order.
all_annot_ids = [*test_annotations] + [*train_annotations] + [*validation_annotations]

In [45]:
# Collect the names of all second-level directories under <SOURCE2_DIR>/words.
words_dir = os.path.join(SOURCE2_DIR, "words")
word_dirs = [d for d in os.listdir(words_dir) if os.path.isdir(os.path.join(words_dir, d))]

all_words_data_ids = []

# The loop variable is deliberately not called `dir`, so the builtin dir()
# stays usable in the notebook namespace.
for word_dir in word_dirs:
    word_dir_path = os.path.join(words_dir, word_dir)
    word_subdirs = [
        subdir for subdir in os.listdir(word_dir_path)
        if os.path.isdir(os.path.join(word_dir_path, subdir))
    ]
    all_words_data_ids += word_subdirs

In [46]:
# Directories found on disk that have no entry in any annotation file.
# A set gives constant-time membership tests instead of scanning the list
# once per directory.
annotated_ids = set(all_annot_ids)
print([page_id for page_id in all_words_data_ids if page_id not in annotated_ids])

[]
