In [1]:
from glob import glob
import os

import cv2
import numpy as np
import torch
from PIL import Image
from pdf2image import convert_from_path
from imutils import resize
from tqdm import tqdm
from scoring import *
from border_and_title import *
from main import generate_output
from template_extraction import *

# Collect the input files of each folder, one glob per extension, keeping
# the pdf -> png -> tif ordering within each list.
good = [
    path
    for pattern in (
        "../data/original/Good/*.pdf",
        "../data/original/Good/*.png",
        "../data/original/Good/*.tif",
    )
    for path in glob(pattern)
]
ugly = [
    path
    for pattern in (
        "../data/original/Ugly/*.pdf",
        "../data/original/Ugly/*.png",
        "../data/original/Ugly/*.tif",
    )
    for path in glob(pattern)
]

# Displayed as the cell result: how many files were found in each folder.
len(good), len(ugly)

(20, 25)

In [2]:
def _draw_overlays(img, boundary, title_blocks, intersected_words, intersection_points):
    """Return a copy of ``img`` annotated with the detection results.

    Drawn with colour (0, 255, 0): the boundary rectangle and every title block.
    Drawn with colour (0, 0, 255): a rectangle per intersected word and a
    radius-10 circle per non-``None`` intersection point.
    """
    annotated = img.copy()
    cv2.rectangle(annotated, boundary[0], boundary[1], (0, 255, 0), 2)

    for bx1, by1, bx2, by2 in title_blocks:
        cv2.rectangle(annotated, (bx1, by1), (bx2, by2), (0, 255, 0), 2)

    for _, row in intersected_words.iterrows():
        # Convert to plain Python ints before handing the coordinates to OpenCV.
        bx1, by1, bx2, by2 = row[["x1", "y1", "x2", "y2"]].astype(int).tolist()
        cv2.rectangle(annotated, (bx1, by1), (bx2, by2), (0, 0, 255), 2)

    for point in intersection_points:
        if point is None:
            continue
        px, py = point
        cv2.circle(annotated, (int(px), int(py)), 10, (0, 0, 255), 2)

    return annotated


def process(files, output_dir):
    """Run the intersection detection on each file and save a comparison image.

    For every path in ``files`` the function calls ``generate_output``, builds
    the border lines with ``get_border_lines``, runs
    ``detect_intersection_with_template`` (tolerance 30) and writes
    ``<output_dir>/<basename of the input>.jpg``: the input image and its
    annotated copy stacked horizontally.

    Processing is best-effort: any exception raised for one file is reported
    as ``"<path>: <error>"`` and the loop moves on to the next file.

    Args:
        files: Sequence of input file paths.
        output_dir: Folder that receives the ``.jpg`` files; created if missing.

    Returns:
        None.
    """
    os.makedirs(output_dir, exist_ok=True)

    # The context manager closes the bar even if the loop is interrupted.
    with tqdm(total=len(files)) as progress:
        for f in files:
            try:
                data = generate_output(f)
                img = data["img"]

                borders = get_border_lines(data["border_lines"], data["im_h"])
                # NOTE(review): x_index_titles / y_index_titles are not defined in
                # this notebook; presumably they come from one of the star
                # imports -- confirm.
                # Only the last two results are drawn; the template and
                # intersection lines are not used here.
                _, _, intersection_points, intersected_words = detect_intersection_with_template(
                    img,
                    data["boundary"],
                    borders,
                    data["words"],
                    x_index_titles,
                    y_index_titles,
                    tolerance=30,
                )

                img_comp = _draw_overlays(
                    img,
                    data["boundary"],
                    data["title_blocks"],
                    intersected_words,
                    intersection_points,
                )

                out_path = os.path.join(output_dir, f"{os.path.basename(f)}.jpg")
                # cv2.imwrite signals failure through its return value rather
                # than an exception, so surface it explicitly.
                if not cv2.imwrite(out_path, np.hstack((img, img_comp))):
                    raise IOError(f"could not write {out_path}")
            except Exception as e:
                # tqdm.write prints the message without garbling the progress bar.
                tqdm.write(f"{f}: {e}")
            progress.update(1)

In [3]:
# Process the Ugly files first, then the Good files, each into its own
# output folder.
for batch, out_dir in (
    (ugly, "../data/intersection-with-boundary/ugly"),
    (good, "../data/intersection-with-boundary/good"),
):
    process(batch, out_dir)

100%|██████████| 25/25 [00:38<00:00,  1.56s/it]
  5%|▌         | 1/20 [00:01<00:24,  1.28s/it]

../data/original/Good/15316-100-D-040_Sheet1.pdf: too many indices for array: array is 1-dimensional, but 2 were indexed


 10%|█         | 2/20 [00:02<00:23,  1.32s/it]

../data/original/Good/15316-100-D-006_Sheet1.pdf: too many indices for array: array is 1-dimensional, but 2 were indexed


 20%|██        | 4/20 [00:07<00:30,  1.89s/it]

../data/original/Good/15316-100-D-041_Sheet1.pdf: too many indices for array: array is 1-dimensional, but 2 were indexed


 30%|███       | 6/20 [00:09<00:20,  1.46s/it]

../data/original/Good/Apple.pdf: too many indices for array: array is 1-dimensional, but 2 were indexed


 50%|█████     | 10/20 [00:18<00:18,  1.86s/it]

../data/original/Good/300-014833.pdf: too many indices for array: array is 1-dimensional, but 2 were indexed


 80%|████████  | 16/20 [00:33<00:08,  2.15s/it]

../data/original/Good/300-014821.pdf: too many indices for array: array is 1-dimensional, but 2 were indexed


100%|██████████| 20/20 [00:41<00:00,  2.08s/it]

../data/original/Good/M2037041020001.TIF.tif: not enough values to unpack (expected 3, got 2)
../data/original/Good/M1142248090001.TIF.tif: not enough values to unpack (expected 3, got 2)



