In [1]:
from glob import glob
import os

import cv2
import numpy as np
import torch
from PIL import Image
from pdf2image import convert_from_path
from imutils import resize
from tqdm import tqdm
from scoring import *
from border_and_title import *

# glob() already returns a list, but in arbitrary (filesystem-dependent) order;
# sort so the processing order is reproducible across runs and machines.
good = sorted(glob("../../data/original/Good/*.pdf"))
ugly = sorted(glob("../../data/original/Ugly/*.pdf"))

# Displayed as the cell result: number of PDFs found in each folder.
len(good), len(ugly)

(16, 25)

In [2]:
def process(files, output_dir):
    """Annotate the first page of each PDF with its detected boundaries.

    For every path in ``files`` the first page is rendered at 300 dpi,
    passed through the detection helpers, and a copy of the resized page
    with two rectangles drawn on it (``boundary`` and ``title_boundary``)
    is written to ``output_dir``.

    Args:
        files: Iterable of PDF file paths.
        output_dir: Directory receiving the annotated images; created if it
            does not exist. Each output is named ``<pdf basename>.jpg``
            (so ``a.pdf`` becomes ``a.pdf.jpg``).

    Raises:
        OSError: If OpenCV reports that an annotated image could not be
            written.
    """
    os.makedirs(output_dir, exist_ok=True)
    color = (255, 0, 0)
    thickness = 2

    # Iterating over tqdm(...) closes the bar when the loop ends or raises.
    for f in tqdm(files):
        # Only the first page is used, so do not rasterise the rest of the PDF.
        pages = convert_from_path(f, dpi=300, first_page=1, last_page=1)
        img = im_resize(np.array(pages[0]))
        # Draw on a copy taken before detection so the helpers see a clean page.
        img_comp = img.copy()
        im_h, im_w = img.shape[:2]

        # The third return value is not needed here.
        sorted_indices, contours, _ = detect_objects(img)
        title_contours, line_90, data = detect_probable_title_sections(img, return_states=True)
        ocr_result = data["ocr_result"]
        mask = data["mask"]
        inner_border_lines = data["inner_border_lines"]  #  0: Bottom, 1: Top, 2: Right, 3: Left
        words = process_text(ocr_result, im_h, im_w)

        border_1, border_2 = detect_borders(contours, sorted_indices, words)
        # Separate name: these are not the contours returned by detect_objects.
        table_contours = detect_text_tables(img, words, mask)
        title_contours = title_contours + table_contours

        boundary = get_boundary(border_1, border_2, inner_border_lines)
        title_boundary = get_title_boundary(boundary, line_90, title_contours, words, im_h)

        cv2.rectangle(img_comp, boundary[0], boundary[1], color, thickness)
        cv2.rectangle(img_comp, title_boundary[0], title_boundary[1], color, thickness)

        out_path = os.path.join(output_dir, f"{os.path.basename(f)}.jpg")
        # NOTE(review): pdf2image yields RGB data while cv2.imwrite expects BGR;
        # unless im_resize converts, the saved colours are channel-swapped -- confirm.
        if not cv2.imwrite(out_path, img_comp):
            # imwrite signals failure through its return value, not an exception.
            raise OSError(f"cv2.imwrite failed for {out_path}")

In [3]:
# Run boundary detection for both document sets, each into its own folder.
# NOTE(review): inputs are globbed from "../../data" but outputs are written
# under "../data" -- confirm the differing depth is intentional.
for pdf_paths, out_dir in (
    (ugly, "../data/boundary-detection/ugly"),
    (good, "../data/boundary-detection/good"),
):
    process(pdf_paths, out_dir)

100%|██████████| 25/25 [00:25<00:00,  1.01s/it]
100%|██████████| 16/16 [00:17<00:00,  1.10s/it]
