In [None]:
import sys
sys.path.append("src")

import generator

from pathlib import Path

from PIL import Image
import cv2
from skimage import filters

import pytesseract
from pytesseract import Output
import numpy as np
from tqdm import tqdm

In [None]:
# Source material handed to the project generator. The values are file or
# folder locations relative to the notebook's working directory.
config = {
    "digits": "data/digits/semeion.data",
    "circles": "data/circles/full_numpy_bitmap_circle.npy",
    "arrows": "data/arrow_foreground",
    "lines": "data/lines/full_numpy_bitmap_line.npy",
    "words": "data/word_foreground",
    "sentences": "data/sentence_foreground",
}

gen = generator.generator(config)

# Folder that contains archive document images without handwriting.
# TODO: point this at your own background folder. It has to be a Path object
# because the recursive glob below is a pathlib method.
training_path = Path("data/backgrounds")

# Every PNG below training_path (searched recursively). Sorted so the list
# order does not depend on the file system's directory enumeration order.
training_background = sorted(training_path.glob('**/*.png'))

In [None]:
# Sanity check: display how many background images were collected.
len(training_background)

In [None]:
# Number of synthetic pages produced by the generation loop.
N_SAMPLES = 10
# Destination prefix for the generated PNG files. File names are appended to
# it by plain string concatenation, so the trailing slash is required.
OUTPUT_DIR = "data/train/"

In [None]:


# Tunable values of the generation loop.
TARGET_SIZE = (2048, 2048)   # every background is resized to this size
SKIP_PERCENT = 5             # share of samples that receive no annotation at all
MAX_WORD_PICKS = 10          # random OCR boxes tried per annotation type
MIN_WORD_LENGTH = 3          # shorter OCR tokens are not used as anchors


def _invert_mask(mask):
    """Swap the 0 and 1 entries of ``mask`` in place and return it.

    Entries equal to 2 are mapped to 0 as well; in the summed
    background + foreground array a 2 occurs where both summands are 1.
    Any other value is left untouched.
    """
    mask[mask == 1] = 2  # park the ones on a temporary value
    mask[mask == 0] = 1
    mask[mask == 2] = 0
    return mask


def _annotate_random_words(draw, page, ocr, wanted):
    """Call ``draw(page, bbox)`` on randomly chosen OCR words.

    Parameters:
        draw: callable taking (page, bbox) and returning a truthy value when
            the annotation was placed.
        page: image array forwarded to ``draw``.
        ocr: dictionary returned by pytesseract.image_to_data with
            Output.DICT (the 'text', 'left', 'top', 'width' and 'height'
            lists are read).
        wanted: number of annotations to place.

    Returns:
        The number of annotations actually placed. The search stops after
        MAX_WORD_PICKS random boxes have been examined, so fewer than
        ``wanted`` annotations may be placed. Nothing is attempted when the
        OCR result contains no boxes.
    """
    n_boxes = len(ocr['text'])
    placed = 0
    picks = 0
    while n_boxes and placed < wanted and picks < MAX_WORD_PICKS:
        picks += 1
        # randint samples every box with equal probability
        i = np.random.randint(n_boxes)
        if len(ocr['text'][i].strip()) < MIN_WORD_LENGTH:
            continue
        bbox = (ocr['left'][i], ocr['top'][i], ocr['width'][i], ocr['height'][i])
        if draw(page, bbox):
            placed += 1
    return placed


def _write_png(destination, mask):
    """Save a 0/1 mask as an 8-bit PNG and raise IOError if the write fails."""
    if not cv2.imwrite(destination, mask.astype('uint8') * 255):
        raise IOError("could not write image: " + destination)


# cv2.imwrite does not create missing folders, it only reports failure.
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

for index in tqdm(range(N_SAMPLES), total=N_SAMPLES):
    # choose random background
    path = training_background[np.random.randint(len(training_background))]

    # prepare background
    background = cv2.imread(path.as_posix(), cv2.IMREAD_COLOR)
    if background is None:
        # cv2.imread signals unreadable files by returning None
        raise IOError("could not read background image: " + path.as_posix())
    # interpolation must be passed by keyword: the third positional
    # parameter of cv2.resize is the output array, not the interpolation flag
    background = cv2.resize(background, TARGET_SIZE, interpolation=cv2.INTER_AREA)
    background = cv2.cvtColor(background, cv2.COLOR_BGR2GRAY)
    th = filters.threshold_sauvola(background, window_size=15, k=0.2)
    background = (background > th).astype('uint8')

    # OCR is needed to identify random text for circling, underlining or crossing
    tesseract = pytesseract.image_to_data(background, output_type=Output.DICT)

    # set the foreground: an all-ones canvas with the background's shape
    gen.set_foreground(np.ones(background.shape, dtype='float64'))

    # from here on the background is 1 where the thresholded image was 0
    background = _invert_mask(background.astype('float64'))

    # randomly skip annotations 5% of the time
    # (randint(100) yields 0..99, so exactly SKIP_PERCENT values skip)
    if np.random.randint(100) >= SKIP_PERCENT:
        # circle words
        _annotate_random_words(gen.draw_circles, background, tesseract,
                               round(np.random.uniform(0, 5)))

        # underline words
        _annotate_random_words(gen.underline_words, background, tesseract,
                               round(np.random.uniform(0, 5)))

        # cross words
        _annotate_random_words(gen.cross_words, background, tesseract,
                               round(np.random.uniform(0, 5)))

        # add sentences
        for _ in range(round(np.random.uniform(0, 7))):
            gen.add_words(background)

        # add PII
        for _ in range(round(np.random.uniform(0, 5))):
            gen.add_PII(background)

        # add arrows
        for _ in range(round(np.random.uniform(0, 5))):
            gen.add_arrows(background)

    foreground = _invert_mask(gen.foreground.copy())

    # create training masks: after the inversion the combined mask is 1 only
    # where background and foreground are both 0
    comb = _invert_mask(background + foreground)

    # reset foreground
    gen.reset_foreground()

    # The double underscore before "synthetic" is kept so that file names
    # stay identical to those of earlier runs.
    prefix = OUTPUT_DIR + path.stem + '_' + str(index) + "_" + '_synthetic'

    _write_png(prefix + '_gt.png', foreground)
    _write_png(prefix + '_orig.png', background)

    # save TR
    _write_png(prefix + '_tr.png', comb)