In [1]:
from glob import glob
import os

import cv2
import numpy as np
import torch
from PIL import Image
from pdf2image import convert_from_path
from imutils import resize
from tqdm import tqdm
from scoring import *

# Collect the input PDFs for the two document-quality groups.
good = [*glob("../../data/original/Good/*.pdf")]
ugly = [*glob("../../data/original/Ugly/*.pdf")]

# Cell output: number of PDFs found in each group.
len(good), len(ugly)

(16, 25)

In [2]:
from doctr.models import ocr_predictor


# Pretrained docTR pipeline: db_resnet50 for text detection, crnn_vgg16_bn
# for text recognition.
ocr_model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True)
# Run on the GPU when one is available, otherwise stay on the CPU.
ocr_model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))


def process(files, output_dir):
    """Annotate the first page of each PDF with OCR word boxes and candidate regions.

    For every path in ``files`` the first page is rendered at 300 dpi, passed
    through ``im_resize`` and analysed as follows:

    1. The page is binarised (inverted, threshold 200) and long vertical and
       horizontal strokes are extracted with morphological opening, then
       thickened and merged into a single mask.
    2. ``ocr_model`` is run on the page; every detected word is filled into
       that mask and outlined in green on the annotated page.
    3. The mask is smoothed with a dilate / erode / dilate sequence, its
       external contours are extracted, and every contour whose bounding box
       has an area above 5000 px and an aspect ratio strictly between 0.5 and
       10 is outlined on the annotated page.

    The annotated page is written to ``{output_dir}/{PDF basename}.jpg``.

    Args:
        files: Iterable of PDF file paths.
        output_dir: Directory for the annotated JPEGs; created if missing.

    Returns:
        None. The only results are the image files written to ``output_dir``.
    """
    os.makedirs(output_dir, exist_ok=True)

    scale = 40               # line kernels span 1/40 of the page height / width
    word_box_thickness = 2   # outline width of the per-word boxes

    # Iterating through tqdm keeps the bar in sync and closes it when done.
    for f in tqdm(files):
        # Only the first page is used, so do not rasterise the remaining ones.
        pages = convert_from_path(f, dpi=300, first_page=1, last_page=1)

        # pdf2image yields PIL images, i.e. RGB channel order.
        # NOTE(review): assumes im_resize keeps the 3-channel RGB layout - confirm.
        page_rgb = im_resize(np.array(pages[0]))

        # OpenCV drawing and imwrite use BGR, so annotate a BGR copy; writing
        # the RGB array directly would swap red and blue in the saved JPEG.
        annotated = cv2.cvtColor(page_rgb, cv2.COLOR_RGB2BGR)

        gray = cv2.cvtColor(page_rgb, cv2.COLOR_RGB2GRAY)
        # Inverted threshold: dark ink becomes 255, light background becomes 0.
        _, img_bin = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)
        image_height, image_width = img_bin.shape

        # Opening with a tall 1-px-wide kernel keeps only long vertical strokes;
        # the following dilation thickens and lengthens them.
        vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, image_height // scale))
        vertically_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
        vertical_lines = cv2.dilate(vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (2, 40)))

        # Same idea with a wide 1-px-tall kernel for horizontal strokes.
        horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (image_width // scale, 1))
        horizontally_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
        horizontal_lines = cv2.dilate(horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 2)))

        # bitwise_or instead of "+": uint8 addition wraps around where a
        # vertical and a horizontal line overlap (255 + 255 -> 254).
        region_mask = cv2.bitwise_or(vertical_lines, horizontal_lines)

        # The OCR model receives the untouched RGB page (no boxes drawn on it).
        ocr_result = ocr_model([page_rgb])
        page_h, page_w = annotated.shape[:2]

        for block in ocr_result.pages[0].blocks:
            for line in block.lines:
                for word in line.words:
                    # Word geometry is scaled by the page size to obtain pixels.
                    (x_min, y_min), (x_max, y_max) = word.geometry
                    top_left = (int(x_min * page_w), int(y_min * page_h))
                    bottom_right = (int(x_max * page_w), int(y_max * page_h))

                    # Fill the word into the mask and outline it (green) on the page.
                    cv2.rectangle(region_mask, top_left, bottom_right, 255, cv2.FILLED)
                    cv2.rectangle(annotated, top_left, bottom_right, (0, 255, 0), word_box_thickness)

        # Dilate to merge neighbouring words and lines, erode to drop thin
        # leftovers, then dilate again to grow the surviving regions back.
        region_mask = cv2.dilate(region_mask, np.ones((5, 5), np.uint8), iterations=5)
        region_mask = cv2.erode(region_mask, np.ones((8, 8), np.uint8), iterations=5)
        region_mask = cv2.dilate(region_mask, np.ones((10, 10), np.uint8), iterations=3)

        _, binary = cv2.threshold(region_mask, 128, 255, cv2.THRESH_BINARY)
        contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        for cnt in contours:
            # Distinct names so the page size (page_w / page_h) is not shadowed.
            box_x, box_y, box_w, box_h = cv2.boundingRect(cnt)
            aspect_ratio = box_w / float(box_h)
            area = box_w * box_h

            # Keep only reasonably large, not too elongated boxes.
            if area > 5000 and (0.5 < aspect_ratio < 10):
                # (255, 0, 0) is blue in BGR.
                cv2.rectangle(annotated, (box_x, box_y), (box_x + box_w, box_y + box_h), (255, 0, 0), 2)

        cv2.imwrite(f"{output_dir}/{os.path.basename(f)}.jpg", annotated)

In [3]:
# Write annotated first pages for both groups into separate output folders.
process(files=good, output_dir="../../data/title-section/good")
process(files=ugly, output_dir="../../data/title-section/ugly")

100%|██████████| 16/16 [00:16<00:00,  1.02s/it]
100%|██████████| 25/25 [00:21<00:00,  1.15it/s]
