In [1]:
from glob import glob
import os

import cv2
import numpy as np
import torch
from PIL import Image
from pdf2image import convert_from_path
from imutils import resize
from tqdm import tqdm
from scoring import *
from border_and_title import *

# Gather the input PDFs of each quality class; glob() already hands back a list.
good = glob("../data/original/Good/*.pdf")
ugly = glob("../data/original/Ugly/*.pdf")

# Cell result: (number of Good files, number of Ugly files).
len(good), len(ugly)

(16, 25)

In [2]:
def _is_title_box(y, w, h, im_h):
    """Return True when a bounding box passes the size, shape and position filters.

    Args:
        y: Top edge of the box (row index, as returned by ``cv2.boundingRect``).
        w: Box width in pixels.
        h: Box height in pixels.
        im_h: Height of the page image in pixels.
    """
    # Size/shape filter: area above 7000 px and aspect ratio strictly between
    # 1.3 and 10. The area test runs first so a degenerate zero-height box is
    # rejected before the division.
    if w * h <= 7000 or not 1.3 < w / float(h) < 10:
        return False

    bottom = y + h
    # Position filter: the box must sit in the lower part of the page. Either
    # it starts below 70% of the height and reaches past 93%, or it starts
    # below 80% and reaches past 90%.
    return (
        (y > im_h * 70 // 100 and bottom > im_h * 93 // 100)
        or (y > im_h * 80 // 100 and bottom > im_h * 90 // 100)
    )


def process(files, output_dir):
    """Annotate the first page of each PDF and save the result as a JPEG.

    For every file the first page is rendered at 300 dpi, passed through
    ``im_resize``, and a copy of it is annotated with:

    * two rectangles for the boxes returned by ``detect_borders``;
    * one rectangle for every contour from ``detect_probable_title_sections``
      whose bounding box passes ``_is_title_box``.

    The annotated copy is written to ``{output_dir}/{basename}.jpg`` (the
    original ``.pdf`` extension is kept in the name).

    ``im_resize``, ``detect_objects``, ``detect_borders`` and
    ``detect_probable_title_sections`` come from the notebook's star imports;
    their exact contracts are not visible here.

    Args:
        files: Iterable of PDF paths.
        output_dir: Directory for the annotated JPEGs; created if missing.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Iterating the tqdm wrapper advances the bar and closes it once the loop
    # finishes, so no manual update()/close() bookkeeping is needed.
    for f in tqdm(files):
        # Only the first page is used, so render just that page instead of
        # rasterising the whole document at 300 dpi.
        pages = convert_from_path(f, dpi=300, first_page=1, last_page=1)
        img = im_resize(np.array(pages[0]))
        img_comp = img.copy()  # draw on a copy; detection runs on the clean image
        im_h = img.shape[0]

        sorted_indices, contours, _ = detect_objects(img)
        border_1, border_2 = detect_borders(contours, sorted_indices)
        for x, y, w, h in (border_1, border_2):
            cv2.rectangle(img_comp, (x, y), (x + w, y + h), (0, 255, 0), 10)

        for cnt in detect_probable_title_sections(img):
            x, y, w, h = cv2.boundingRect(cnt)
            if _is_title_box(y, w, h, im_h):
                cv2.rectangle(img_comp, (x, y), (x + w, y + h), (255, 0, 0), 10)

        # NOTE(review): the array built from the pdf2image page is presumably
        # RGB, while cv2.imwrite interprets 3-channel arrays as BGR. Unless
        # im_resize converts the channel order, red and blue are swapped in
        # the saved file -- confirm before relying on the colours.
        out_path = f"{output_dir}/{os.path.basename(f)}.jpg"
        if not cv2.imwrite(out_path, img_comp):
            # imwrite reports failure through its return value rather than by
            # raising; surface it without aborting the rest of the batch.
            tqdm.write(f"WARNING: could not write {out_path}")

In [3]:
# Annotate both document sets; each call shows its own progress bar.
process(files=ugly, output_dir="../data/border-and-title/ugly")
process(files=good, output_dir="../data/border-and-title/good")

100%|██████████| 25/25 [00:23<00:00,  1.08it/s]
100%|██████████| 16/16 [00:15<00:00,  1.04it/s]
