In [1]:
from glob import glob
import os

import cv2
import numpy as np
from PIL import Image
from pdf2image import convert_from_path
from imutils import resize
from tqdm import tqdm
from scoring import *

# Gather the sample PDFs of each class; glob() already hands back a list.
good = glob("../data/original/Good/*.pdf")
ugly = glob("../data/original/Ugly/*.pdf")

# Display how many documents were found per class.
len(good), len(ugly)

(16, 25)

In [2]:
def process(files, output_dir, *, min_area=10000, height_percent=50):
    """Annotate the first page of each PDF with content boxes and a coverage score.

    For every path in ``files`` the first page is rasterised at 300 dpi,
    resized with ``im_resize`` and handed to ``detect_objects``. Rows of the
    returned ``hierarchy_area`` table whose column 5 exceeds ``min_area`` are
    kept, except the two contours listed first in ``sorted_indices`` (named
    "borders" here; presumably the page frame -- confirm against
    ``scoring.detect_objects``). Kept contours that are not nested inside
    another kept contour get a green bounding box. The summed box area is
    divided by a reference area (``height_percent`` percent of the image
    height times the full image width) and capped at 1.

    The annotated image is written to ``<output_dir>/<pdf basename>.jpg``.

    Args:
        files: Sequence of PDF paths to process.
        output_dir: Directory for the annotated JPEGs; created if missing.
        min_area: Lower bound (exclusive) on column 5 of ``hierarchy_area``
            for a contour to be considered. Defaults to the original 10000.
        height_percent: Percentage of the image height used for the
            reference area of the score. Defaults to the original 50.

    Returns:
        None. The results are the image files written to ``output_dir``.
    """
    os.makedirs(output_dir, exist_ok=True)
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 1
    text_color = (255, 0, 0)
    text_thickness = 2
    box_color = (0, 255, 0)
    box_thickness = 10

    # Iterating through tqdm advances the bar once per file and closes it
    # when the loop finishes, instead of leaving a manually created bar open.
    for pdf_path in tqdm(files):
        pages = convert_from_path(pdf_path, dpi=300)
        img = im_resize(np.array(pages[0]))
        annotated = img.copy()

        sorted_indices, contours, hierarchy_area = detect_objects(img)
        border_1, border_2 = sorted_indices[0], sorted_indices[1]

        # Column 0 is used as the contour index and column 5 as the size
        # filter (presumably the contour area -- confirm in scoring).
        large = hierarchy_area[hierarchy_area[:, 5] > min_area]
        candidates = large[(large[:, 0] != border_1) & (large[:, 0] != border_2)]

        # Number of large contours, borders included.
        cv2.putText(annotated, f'{len(large)}', (100, 100), font, font_scale,
                    text_color, text_thickness, cv2.LINE_AA)

        # A candidate is nested when its column 4 (presumably the parent
        # index) refers to another candidate. Computing the mask up front
        # makes the exclusion independent of row order; the previous
        # incremental list missed children listed before their parent.
        nested = np.isin(candidates[:, 4], candidates[:, 0])

        area = 0
        for row in candidates[~nested]:
            x, y, w, h = cv2.boundingRect(contours[int(row[0])])
            area += w * h
            cv2.rectangle(annotated, (x, y), (x + w, y + h), box_color, box_thickness)

        reference_area = int((img.shape[0] * height_percent) / 100) * img.shape[1]
        score = min(area / reference_area, 1)
        cv2.putText(annotated, f'{score:.2f} {area:.2f}', (100, 70), font, font_scale,
                    text_color, text_thickness, cv2.LINE_AA)

        # NOTE(review): pdf2image pages are PIL images (typically RGB) while
        # cv2.imwrite assumes BGR; unless im_resize converts, the red and
        # blue channels are swapped in the saved file -- confirm.
        cv2.imwrite(f"{output_dir}/{os.path.basename(pdf_path)}.jpg", annotated)

In [3]:
# Write annotated first-page images for the "Good" PDFs.
process(good, "../data/white-space/good")

100%|██████████| 16/16 [00:08<00:00,  1.82it/s]


In [4]:
# Write annotated first-page images for the "Ugly" PDFs.
process(ugly, "../data/white-space/ugly")

100%|██████████| 25/25 [00:13<00:00,  1.82it/s]
