In [None]:
import os
import re

import cv2
import numpy as np
import pandas as pd

In [None]:
# Unpack input.zip into the current working directory.
# NOTE(review): presumably the archive expands to ./input/iam, the directory
# ROOT points at — confirm against the archive layout.
!unzip input.zip

In [None]:
# for i in $(find ../lines -type f -name "*.png"); do echo $i; cp $i ./;done

1. Download the dataset (ascii, lines, split-indexes) from https://fki.tic.heia-fr.ch/databases/iam-handwriting-database.
2. Create the root directory `./input/iam` (the path assigned to `ROOT` below).
3. Put the downloaded archives into the root directory and unzip them there.

In [None]:
# Root directory of the IAM files; the split lists and ascii/lines.txt are
# read relative to it. A plain literal is used because nothing is interpolated.
ROOT = './input/iam'

In [None]:
# train_ids = [line.strip() for line in open(f'{ROOT}/trainset.txt').readlines()]
# valid1_ids = [line.strip() for line in open(f'{ROOT}/validationset1.txt').readlines()]
# valid2_ids = [line.strip() for line in open(f'{ROOT}/validationset2.txt').readlines()]
# test_ids = [line.strip() for line in open(f'{ROOT}/testset.txt').readlines()]

def _read_ids(path):
    """Return every line of the text file at ``path``, whitespace-stripped.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the lines have been read.
    """
    with open(path) as id_file:
        return [line.strip() for line in id_file]


train_ids = _read_ids(f'{ROOT}/train.txt')
valid1_ids = _read_ids(f'{ROOT}/valid.txt')
# valid2_ids is read from the same valid.txt as valid1_ids.
valid2_ids = _read_ids(f'{ROOT}/valid.txt')
test_ids = _read_ids(f'{ROOT}/test.txt')

# Precomputed id -> stage table so each lookup is a single dict access instead
# of a scan through up to four lists. Splits are inserted in reverse priority
# order, so for an id listed in several files the earliest check
# (train, then valid, then test) wins.
_STAGE_BY_ID = {}
for _stage_name, _stage_ids in (
    ('test', test_ids),
    ('valid', valid2_ids),
    ('valid', valid1_ids),
    ('train', train_ids),
):
    _STAGE_BY_ID.update(dict.fromkeys(_stage_ids, _stage_name))


def get_stage(image_id):
    """Return the split that ``image_id`` belongs to.

    Args:
        image_id: Identifier to look up in the split lists loaded above.

    Returns:
        'train', 'valid' or 'test' when the id appears in the corresponding
        list (checked in that priority order), otherwise 'unknown'.
    """
    return _STAGE_BY_ID.get(image_id, 'unknown')

In [None]:
# Report how many ids each split list holds: train, valid1, valid2, test.
for split_ids in (train_ids, valid1_ids, valid2_ids, test_ids):
    print(len(split_ids))

In [None]:
# A data line, as matched here, is: a whitespace-free sample id, one word,
# six integers, then the transcription (everything after the last integer).
# Compiled once instead of on every loop iteration.
LINE_PATTERN = re.compile(
    r'([\S]+)\s\w+\s\d+\s\d+\s\d+\s\d+\s\d+\s\d+\s([\w\W]+)'
)

dataset = []

# The with-block guarantees the file handle is closed once parsing finishes.
with open(f'{ROOT}/ascii/lines.txt') as lines_file:
    for line in lines_file:
        line = line.strip()
        if not line:
            # Blank lines carry no record; skip them instead of crashing.
            continue
        if line.startswith('#'):
            # Header/comment lines are echoed for reference, not parsed.
            print(line)
            continue

        match = LINE_PATTERN.search(line)
        if match is None:
            raise ValueError(f'Unrecognized line in lines.txt: {line!r}')
        sample_id, text = match.groups()

        # The sample id has three dash-separated parts; the first two name
        # the folder and sub-folder used in the image path.
        folder, subfolder, _ = sample_id.split('-')
        subfolder = f'{folder}-{subfolder}'

        dataset.append({
            'sample_id': sample_id,
            # Words in the transcription are separated by '|'.
            'text': text.replace('|', ' ').strip(),
            'path': f'iam/images/{folder}/{subfolder}/{sample_id}.png',
            'stage': get_stage(sample_id),
        })

In [None]:
# Create the output directory. exist_ok=True makes this a no-op when the
# directory is already there, so re-running the notebook does not error.
os.makedirs('dataset', exist_ok=True)

In [None]:
# !zip -r dataset.zip dataset

In [None]:
# Build one table from the parsed records, keyed by sample_id, and preview
# the first three rows.
marking = pd.DataFrame(dataset)
marking = marking.set_index('sample_id')
marking.head(3)

In [None]:
# Number of samples assigned to each stage.
stage_counts = marking['stage'].value_counts()
stage_counts

In [None]:
train_dataset = []
eval_dataset = []
test_dataset = []
unkown_dataset = []

# Stage label -> list that collects the records for that stage.
records_by_stage = {
    'train': train_dataset,
    'valid': eval_dataset,
    'test': test_dataset,
    'unknown': unkown_dataset,
}

for _, sample in marking.iterrows():
    path_parts = sample['path'].split("/")
    # Last path component is the image file name; the one before it is the
    # containing folder, stored as font_type.
    record = {
        'image': path_parts[-1],
        'font_type': path_parts[-2],
        'text': sample['text'],
    }

    bucket = records_by_stage.get(sample['stage'])
    if bucket is not None:
        bucket.append(record)
    


In [None]:
# Record counts per split: train, eval, test, unknown.
for split_records in (train_dataset, eval_dataset, test_dataset, unkown_dataset):
    print(len(split_records))

In [None]:
# Make sure the ./dataset output directory exists before the CSV files are
# written. exist_ok=True avoids an error when the directory is already there.
os.makedirs('dataset', exist_ok=True)

In [None]:
# Column order of every per-split CSV. Passing it explicitly means an empty
# split still gets a header row instead of a column-less file.
SPLIT_COLUMNS = ['image', 'font_type', 'text']


def _export_split(records, csv_path):
    """Write ``records`` to ``csv_path`` and return the DataFrame that was written.

    Args:
        records: List of dicts with the keys listed in SPLIT_COLUMNS.
        csv_path: Destination CSV path; its directory must already exist.

    Returns:
        The DataFrame built from ``records``.
    """
    frame = pd.DataFrame(records, columns=SPLIT_COLUMNS)
    frame.to_csv(csv_path, index=False)
    return frame


train_df = _export_split(train_dataset, "./dataset/train.csv")
eval_df = _export_split(eval_dataset, "./dataset/eval.csv")
test_df = _export_split(test_dataset, "./dataset/test.csv")
unknown_df = _export_split(unkown_dataset, "./dataset/unknown.csv")

# The full table keeps its sample_id index in the CSV.
marking.to_csv('./dataset/marking.csv')

In [None]:
# Recursively archive the ./dataset directory (the CSV exports) into dataset.zip.
!zip -r dataset.zip dataset