In [1]:
from glob import glob
import os

import cv2
import numpy as np
import torch
from PIL import Image
from pdf2image import convert_from_path
from imutils import resize
from tqdm import tqdm
from scoring import *
from border_and_title import *
from template_extraction import *

In [2]:

# files = [
#     "../../data/original/Good/300-014820.pdf",
#     "../../data/original/Good/300-014823.pdf",
#     "../../data/original/Good/300-014821.pdf",
#     "../../data/original/Good/300-014822.pdf"
# ]
# files = [
#     "../../data/original/Ugly/000.000.537.pdf",
#     "../../data/original/Ugly/000.001.296.pdf",
#     "../../data/original/Ugly/000.001.360.pdf",
#     "../../data/original/Ugly/000.001.389.pdf",
#     "../../data/original/Ugly/000.895.652.pdf",
#     # "../../data/original/Ugly/000.895.659.pdf",
#     # "../../data/original/Ugly/000.895.681.pdf",
#     # "../../data/original/Ugly/000.895.684.pdf",
#     # "../../data/original/Ugly/000.895.701 (1).pdf"
# ]
# files = [
#     "../../data/original/Ugly/000.001.024.pdf",
#     "../../data/original/Ugly/000.001.303.pdf",
#     "../../data/original/Ugly/000.001.373.pdf",
#     "../../data/original/Ugly/000.001.423.pdf",
#     "../../data/original/Ugly/000.895.692.pdf",
# ]
# files = [
#     "../../data/original/Ugly/1.pdf",
#     "../../data/original/Ugly/2.pdf",
#     "../../data/original/Ugly/3.pdf",
#     "../../data/original/Ugly/4.pdf",
#     "../../data/original/Ugly/5.pdf",
#     "../../data/original/Ugly/6.pdf",
#     "../../data/original/Ugly/7.pdf",
#     "../../data/original/Ugly/8.pdf",
#     "../../data/original/Ugly/9.pdf",
#     "../../data/original/Ugly/10.pdf",
#     "../../data/original/Ugly/11.pdf"
# ]
# Active input set: four sheets from the "Good" folder. The directory prefix is
# written once and joined onto each PDF name, producing the same path strings
# as a hand-written list.
files = [
    "../../data/original/Good/" + pdf_name
    for pdf_name in (
        "15316-100-D-006_Sheet1.pdf",
        "15316-100-D-020_Sheet2.pdf",
        "15316-100-D-040_Sheet1.pdf",
        "15316-100-D-041_Sheet1.pdf",
    )
]

In [7]:
def process(files, output_dir):
    """Derive shared template borders from a set of drawings and save previews.

    The work is done in two passes:

    1. Every file is read with ``read_pdf`` and analysed; its structure mask
       and the intermediate detections are collected.
    2. All masks are handed to ``get_template_borders_from_structures`` at
       once. The returned border lines are then drawn on a copy of each page,
       and the original page and the annotated copy are written side by side
       to ``<output_dir>/<basename of the input file>.jpg``.

    Args:
        files: Paths of the input documents to process.
        output_dir: Directory that receives the preview images. It is created
            if it does not exist.

    Raises:
        OSError: If OpenCV reports that a preview image could not be written.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Colour of the template border overlay. OpenCV drawing treats 3-tuples as
    # BGR, which makes this red -- TODO confirm the channel order read_pdf returns.
    border_color = (0, 0, 255)
    border_thickness = 2

    structures = []
    image_data = []
    # The context manager closes the bar as soon as the pass is done. Left
    # open, tqdm only finalises it when the object is discarded, which
    # re-renders the "Processing Input" line after later output.
    with tqdm(total=len(files), desc="Processing Input", leave=True, position=0) as progress:
        for f in files:
            img = read_pdf(f)

            sorted_indices, contours, hierarchy_area = detect_objects(img)
            title_contours, line_90, data = detect_probable_title_sections(img, return_states=True)
            ocr_result = data["ocr_result"]
            mask = data["mask"]
            inner_border_lines = data["inner_border_lines"]  # 0: Bottom, 1: Top, 2: Right, 3: Left
            structures.append(mask)
            image_data.append([f, img, ocr_result, inner_border_lines, title_contours, line_90, sorted_indices, contours, hierarchy_area])
            progress.update()

    # The template is computed from all masks together, hence the two passes.
    borders, _template = get_template_borders_from_structures(structures)

    with tqdm(total=len(files), desc="Processing Output", leave=True, position=0) as progress:
        for mask, (f, img, ocr_result, inner_border_lines, title_contours, line_90, sorted_indices, contours, hierarchy_area) in zip(structures, image_data):
            img_comp = img.copy()
            # Slice instead of 3-way unpacking so a 2-D (single channel) image
            # does not raise here.
            im_h, im_w = img.shape[:2]
            words = process_text(ocr_result, im_h, im_w)

            border_1, border_2 = detect_borders(contours, sorted_indices, words)
            # Separate name: the object contours unpacked above stay intact.
            table_contours = detect_text_tables(img, words, mask)
            title_contours = title_contours + table_contours

            # NOTE(review): boundary / title_boundary were only consumed by the
            # intersection overlay (detect_intersection_with_borders,
            # detect_intersected_texts), which is currently disabled. The calls
            # are kept so any error they raise still surfaces -- confirm
            # whether they can be dropped.
            boundary = get_boundary(border_1, border_2, inner_border_lines)
            title_boundary = get_title_boundary(boundary, line_90, title_contours, words, im_h)

            for bx1, by1, bx2, by2 in borders:
                cv2.line(img_comp, (bx1, by1), (bx2, by2), border_color, border_thickness)

            out_path = os.path.join(output_dir, f"{os.path.basename(f)}.jpg")
            # cv2.imwrite signals failure through its return value rather than
            # an exception; without this check a missing preview goes unnoticed.
            if not cv2.imwrite(out_path, np.hstack((img, img_comp))):
                raise OSError(f"Could not write preview image: {out_path}")
            progress.update(1)


# Run the pipeline over `files`; the side-by-side previews are written to the
# template_1 folder.
process(files, output_dir="../../data/templates/template_1")

Processing Input: 100%|██████████| 4/4 [00:03<00:00,  1.25it/s]

Image Similarity: 93.8496%
Image Similarity: 93.2276%
Image Similarity: 91.8845%


Processing Input: 100%|██████████| 4/4 [00:04<00:00,  1.03s/it]
Processing Output: 100%|██████████| 4/4 [00:00<00:00,  6.13it/s]
