In [None]:
import math
from glob import glob
import os

import cv2
import numpy as np
from PIL import Image
from pdf2image import convert_from_path
from imutils import resize
from sklearn.metrics import euclidean_distances
from tqdm import tqdm
from scoring import *
from border_and_title import *

# Input drawing to analyse. Other samples that have been used:
#   "../../data/original/Ugly/10.pdf"
#   "../../data/original/Ugly/2.pdf"
#   "../../data/original/Ugly/3.pdf"
#   "../../data/original/Ugly/000.895.652.pdf"
#   "../../data/original/Good/300-014823.pdf"
f = "../../data/original/Good/300-014822.pdf"

# Shared OpenCV drawing settings used by the preview cells.
font = cv2.FONT_HERSHEY_SIMPLEX
fontScale = 1
color = (255, 0, 0)
thickness = 2

# Rasterise the PDF at 300 dpi; only the first page is analysed.
pages = convert_from_path(f, dpi=300)
img = im_resize(np.array(pages[0]))
img_comp = img.copy()
im_h, im_w, _ = img.shape

# Project helpers (brought in by the wildcard imports at the top).
sorted_indices, contours, hierarchy_area = detect_objects(img)
title_contours, line_90, data = detect_probable_title_sections(
    img, return_states=True
)
ocr_result, mask = data["ocr_result"], data["mask"]
# Index order of inner_border_lines -> 0: Bottom, 1: Top, 2: Right, 3: Left
inner_border_lines = data["inner_border_lines"]
words = process_text(ocr_result, im_h, im_w)

border_1, border_2 = detect_borders(contours, sorted_indices, words)
# NOTE: `contours` is rebound here to the output of detect_text_tables.
contours = detect_text_tables(img, words, mask)
title_contours = title_contours + contours

boundary = get_boundary(border_1, border_2, inner_border_lines)

# Preview: outline every OCR word box on a copy of the page.
for bx1, by1, bx2, by2 in words[["x1", "y1", "x2", "y2"]].astype(int).to_numpy():
    cv2.rectangle(img_comp, (bx1, by1), (bx2, by2), (255, 0, 0), 2)

Image.fromarray(img_comp)

In [None]:
# Run the project's table detector and report how many rectangles came back.
tables = detect_table(img, mask, words)
print(len(tables))

# Outline each returned rectangle (x1, y1, x2, y2) on a fresh copy of the page.
img_comp = img.copy()
for tx1, ty1, tx2, ty2 in tables:
    cv2.rectangle(img_comp, (tx1, ty1), (tx2, ty2), color, thickness)

Image.fromarray(img_comp)

In [None]:
# Vertical extent of every word box; later cells threshold on this Series.
height = (words["y1"] - words["y2"]).abs()
height

In [None]:
# Colour-code the OCR word boxes by height on a blank canvas:
#   (0, 255, 0) for boxes shorter than 50 px, (255, 0, 0) for everything else.
#
# The aspect-ratio / area values that used to be computed here were only
# referenced by a commented-out condition, and the ratio (h / w) raised
# ZeroDivisionError for zero-width boxes, so they have been removed.
img_comp = np.zeros_like(img)

for bx1, by1, bx2, by2 in words[["x1", "y1", "x2", "y2"]].astype(int).to_numpy():
    h = abs(by1 - by2)
    box_color = (0, 255, 0) if h < 50.0 else (255, 0, 0)
    cv2.rectangle(img_comp, (bx1, by1), (bx2, by2), box_color, 2)

Image.fromarray(img_comp)

In [None]:
# Integer corner coordinates of the word boxes whose height is below 50.
word_boxes = (
    words.loc[height < 50.0, ["x1", "y1", "x2", "y2"]]
    .astype(int)
    .to_numpy()
)

# Outline the selected boxes on a copy of the page.
img_comp = img.copy()
for wx1, wy1, wx2, wy2 in word_boxes:
    cv2.rectangle(img_comp, (wx1, wy1), (wx2, wy2), (0, 255, 0), 2)

Image.fromarray(img_comp)

In [None]:
from shapely import unary_union
# Imported explicitly so this cell does not depend on `box` leaking in
# through one of the wildcard imports at the top of the notebook.
from shapely.geometry import box

# Word boxes shorter than 50 px, as integer (x1, y1, x2, y2) rows.
# (Drop the `.loc[...]` filter to merge every word box instead.)
word_boxes = words[["x1", "y1", "x2", "y2"]].loc[height < 50.0].astype(int).values

# Degenerate (zero-width or zero-height) boxes are skipped before the union.
boxes = [
    box(x1, y1, x2, y2)
    for x1, y1, x2, y2 in word_boxes
    if abs(x2 - x1) > 0 and abs(y2 - y1) > 0
]
merged = unary_union(boxes)

# Normalise the union result to a plain list of polygons. With no input boxes
# the union is an empty geometry, which is neither a Polygon nor a
# MultiPolygon and cannot be iterated directly.
if merged.is_empty:
    merged = []
elif merged.geom_type == "Polygon":
    merged = [merged]
else:
    merged = [g for g in getattr(merged, "geoms", ()) if g.geom_type == "Polygon"]

# Bounding rectangle of every merged polygon, ordered by (min x, min y).
merged_rects = [b.bounds for b in merged]
merged_rects_sorted = sorted(merged_rects, key=lambda r: (r[0], r[1]))

# Preview only the merged rectangles whose area exceeds 5000 px^2.
img_copy = img.copy()
for rect in merged_rects_sorted:
    x1, y1, x2, y2 = map(int, rect)
    w, h = abs(x1 - x2), abs(y1 - y2)
    if w * h > 5000:
        cv2.rectangle(img_copy, (x1, y1), (x2, y2), (0, 255, 0), 2)

Image.fromarray(img_copy)

In [None]:
def _normalised_box(xa, ya, xb, yb):
    """Build a shapely box from two opposite corners given in any order."""
    return box(min(xa, xb), min(ya, yb), max(xa, xb), max(ya, yb))


# One shapely box per detected table rectangle.
filter_boxes = [_normalised_box(tx1, ty1, tx2, ty2) for tx1, ty1, tx2, ty2 in np.array(tables)]

# Keep a word box unless some table rectangle contains it.
filtered_array2 = [
    (wx1, wy1, wx2, wy2)
    for wx1, wy1, wx2, wy2 in word_boxes
    if all(
        not table_box.contains(_normalised_box(wx1, wy1, wx2, wy2))
        for table_box in filter_boxes
    )
]

# Preview the surviving word boxes.
img_copy = img.copy()
for kept in filtered_array2:
    kx1, ky1, kx2, ky2 = map(int, kept)
    cv2.rectangle(img_copy, (kx1, ky1), (kx2, ky2), (0, 255, 0), 2)

Image.fromarray(img_copy)

In [None]:
# Merge the word boxes that are not inside any table into connected regions.
# Degenerate (zero-width or zero-height) boxes are skipped before the union.
boxes = [
    box(x1, y1, x2, y2)
    for x1, y1, x2, y2 in filtered_array2
    if abs(x2 - x1) > 0 and abs(y2 - y1) > 0
]
merged = unary_union(boxes)

# Normalise the union result to a plain list of polygons. If every word was
# filtered out, the union is an empty geometry, which is neither a Polygon
# nor a MultiPolygon and cannot be iterated directly.
if merged.is_empty:
    merged = []
elif merged.geom_type == "Polygon":
    merged = [merged]
else:
    merged = [g for g in getattr(merged, "geoms", ()) if g.geom_type == "Polygon"]

# Bounding rectangle of every merged polygon, ordered by (min x, min y).
merged_rects = [b.bounds for b in merged]
merged_rects_sorted = sorted(merged_rects, key=lambda r: (r[0], r[1]))

# Keep regions that are both large (area > 5000) and wide (w / h > 4).
# The area test runs first, so h is never zero when the ratio is evaluated.
text_areas = []
img_copy = img.copy()
for rect in merged_rects_sorted:
    x1, y1, x2, y2 = map(int, rect)
    w, h = abs(x1 - x2), abs(y1 - y2)
    if w * h > 5000 and w / float(h) > 4:
        cv2.rectangle(img_copy, (x1, y1), (x2, y2), (0, 255, 0), 2)
        text_areas.append((x1, y1, x2, y2))

Image.fromarray(img_copy)

In [None]:
# Tall word boxes (height of at least 50). `>=` is used so that, together with
# the `height < 50.0` selection used for `word_boxes`, every word falls into
# exactly one of the two groups; a strict `>` silently dropped words whose
# height is exactly 50.
name_boxes = words[["x1", "y1", "x2", "y2"]].loc[height >= 50.0].astype(int).values

# Preview the tall boxes on a copy of the page.
img_copy = img.copy()
for rect in name_boxes:
    x1, y1, x2, y2 = rect
    cv2.rectangle(img_copy, (x1, y1), (x2, y2), (0, 255, 0), 2)

Image.fromarray(img_copy)

In [None]:
# Stack every candidate region into one (N, 4) array of (x1, y1, x2, y2) rows.
# Each source is reshaped to (-1, 4) first: an empty list would otherwise
# become a (1, 0) array inside np.vstack and raise a shape-mismatch error.
template_area = np.vstack([
    np.asarray(regions).reshape(-1, 4)
    for regions in (text_areas, tables, name_boxes)
])

# Page boundary as two opposite corners, normalised to edge coordinates.
(x1, y1), (x2, y2) = boundary
left = min(x1, x2)
right = max(x1, x2)
top = min(y1, y2)
bottom = max(y1, y2)
# A region edge closer than this many pixels to the boundary is snapped onto it.
tolerance = 100

img_copy = img.copy()
cv2.rectangle(img_copy, boundary[0], boundary[1], color, thickness)
for rect in template_area:
    # Distinct names so the boundary corners unpacked above are not clobbered.
    rx1, ry1, rx2, ry2 = rect.astype(int)
    w, h = abs(rx1 - rx2), abs(ry1 - ry2)

    # Ignore tiny regions.
    if w * h > 1000:
        x_min = min(rx1, rx2)
        x_max = max(rx1, rx2)
        y_min = min(ry1, ry2)
        y_max = max(ry1, ry2)
        if y_min < top + tolerance:
            print("top")
            y_min = top
        if x_min < left + tolerance:
            print("left")
            x_min = left
        # Snapping to the right / bottom edges is intentionally disabled:
        # if x_max > right - tolerance:
        #     print("right")
        #     x_max = right
        # if y_max > bottom - tolerance:
        #     print("bottom")
        #     y_max = bottom
        cv2.rectangle(img_copy, (x_min, y_min), (x_max, y_max), (255, 0, 0), 2)

Image.fromarray(img_copy)