In [1]:
from glob import glob
import os

import cv2
import numpy as np
from PIL import Image
from pdf2image import convert_from_path
from imutils import resize
from tqdm import tqdm
from scoring import *
from doctr.models import ocr_predictor
from doctr.io import DocumentFile

# glob() returns paths in arbitrary, filesystem-dependent order; sorting keeps
# the processing order (and therefore logs and outputs) reproducible across runs.
good = sorted(glob("../data/original/Good/*.pdf"))
ugly = sorted(glob("../data/original/Ugly/*.pdf"))

len(good), len(ugly)

(16, 25)

In [2]:
# Shared, pretrained docTR OCR pipeline; built once at module level because
# process() reads it as a global for every file.
ocr_model = ocr_predictor(
    det_arch='db_resnet50',
    reco_arch='crnn_vgg16_bn',
    pretrained=True,
)


def process(files, output_dir):
    """Annotate the first page of each PDF with detected lines and OCR word boxes.

    For every path in ``files`` the first page is rasterised at 300 DPI and
    passed through ``im_resize``. Straight segments found by a probabilistic
    Hough transform on the Canny edge map are drawn on the page, followed by
    the bounding box of every word the module-level ``ocr_model`` reports for
    the first page. The annotated image is written to
    ``{output_dir}/{pdf file name}.jpg``.

    Args:
        files: Sequence of PDF file paths to process.
        output_dir: Directory for the annotated JPEGs; created if missing.

    Returns:
        None. The function is used for its side effect of writing images.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Colours are in OpenCV's BGR order: green segments, blue word boxes.
    line_color = (0, 255, 0)
    box_color = (255, 0, 0)

    # Wrapping the iterable lets tqdm advance and close the bar by itself.
    for f in tqdm(files):
        # Only the first page is used, so do not rasterise the others.
        pages = convert_from_path(f, dpi=300, first_page=1, last_page=1)
        img = im_resize(np.array(pages[0]))

        # PIL pages are RGB while every cv2 call below (gray conversion,
        # drawing colours, imwrite) expects BGR, so convert once up front.
        # NOTE(review): assumes im_resize keeps the channel order - confirm.
        img_comp = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
        h, w = img_comp.shape[:2]

        gray = cv2.cvtColor(img_comp, cv2.COLOR_BGR2GRAY)
        blurred = cv2.GaussianBlur(gray, (3, 3), 0)

        edges = cv2.Canny(blurred, 50, 150)
        lines = cv2.HoughLinesP(edges, 1, np.pi / 180, threshold=100, minLineLength=50, maxLineGap=10)

        img_doc = DocumentFile.from_pdf(f)
        ocr_result = ocr_model(img_doc)

        # HoughLinesP returns None (not an empty array) when nothing is found.
        if lines is not None:
            for x1, y1, x2, y2 in lines.reshape(-1, 4).tolist():
                cv2.line(img_comp, (x1, y1), (x2, y2), line_color, 1)

        for block in ocr_result.pages[0].blocks:
            for line in block.lines:
                for word in line.words:
                    # Geometry is scaled by the image size here, i.e. it is
                    # treated as fractions of the page width and height.
                    (x_min, y_min), (x_max, y_max) = word.geometry
                    top_left = (int(x_min * w), int(y_min * h))
                    bottom_right = (int(x_max * w), int(y_max * h))
                    cv2.rectangle(img_comp, top_left, bottom_right, box_color, 2)

        cv2.imwrite(f"{output_dir}/{os.path.basename(f)}.jpg", img_comp)


In [3]:
# Annotate the "Good" PDFs and write the overlays next to the other results.
process(files=good, output_dir="../data/text-intersections/good")

100%|██████████| 16/16 [01:20<00:00,  5.03s/it]


In [4]:
# Annotate the "Ugly" PDFs and write the overlays next to the other results.
process(files=ugly, output_dir="../data/text-intersections/ugly")

100%|██████████| 25/25 [01:01<00:00,  2.47s/it]
