In [None]:
from sklearn.cluster import AgglomerativeClustering
from pytesseract import Output
from tabulate import tabulate
import pandas as pd
import numpy as np
import pytesseract
import argparse
import imutils
import cv2
from PIL import Image
from matplotlib import pyplot as plt

In [None]:
# PREPROCESS
def display(im_path):
    """Show the image stored at ``im_path`` in a borderless matplotlib figure.

    The figure size (in inches) is derived from the image's pixel
    dimensions divided by a fixed value of 80 dots per inch, and a single
    axes is stretched over the whole figure with its decorations hidden.

    Args:
        im_path: path of the image file, passed straight to ``plt.imread``.
    """
    pixels = plt.imread(im_path)
    rows, cols = pixels.shape[:2]

    # Inches needed per side when the image is laid out at 80 dots per inch.
    dots_per_inch = float(80)
    canvas = plt.figure(figsize=(cols / dots_per_inch, rows / dots_per_inch))

    # One axes covering the full figure, with spines/ticks switched off.
    frame = canvas.add_axes([0, 0, 1, 1])
    frame.axis('off')

    # The gray colormap only affects single-channel images.
    frame.imshow(pixels, cmap='gray')
    plt.show()

In [None]:
# Path of the scanned page used throughout this notebook.
original_file = 'Images/P00296_B014_page2.jpg'

# cv2.imread does not raise when the file is missing or unreadable; it
# returns None. Fail here with a clear message instead of letting a later
# OpenCV call fail with a cryptic assertion error.
orig = cv2.imread(original_file)
if orig is None:
    raise FileNotFoundError(
        "could not read image: {}".format(original_file))

## SECONDLY, OCR -- run every contour crop through tesseract OCR engine
Inspiration:
- https://nanonets.com/blog/ocr-with-tesseract/
- https://pyimagesearch.com/2022/02/28/multi-column-table-ocr/

In [None]:
# Every option is described once as (flags, keyword settings) and then
# registered on the parser in a single pass.
_ARGUMENT_SPECS = [
    (("-i", "--image"),
     {"required": True,
      "help": "path to input image to be OCR'd"}),
    (("-o", "--output"),
     {"required": True,
      "help": "path to output CSV file"}),
    (("-c", "--min-conf"),
     {"type": int, "default": 0,
      "help": "minimum confidence value to filter weak text detection"}),
    (("-d", "--dist-thresh"),
     {"type": float, "default": 25.0,
      "help": "distance threshold cutoff for clustering"}),
    (("-s", "--min-size"),
     {"type": int, "default": 2,
      "help": "minimum cluster size (i.e., # of entries in column)"}),
]

ap = argparse.ArgumentParser()
for _flags, _settings in _ARGUMENT_SPECS:
    ap.add_argument(*_flags, **_settings)

# The argument list is supplied explicitly here; calling ap.parse_args()
# with no arguments would read the real command line instead.
_notebook_argv = ["-i", "Images/P00296_B014_page2.jpg", "-o", "output_df.csv"]
args = vars(ap.parse_args(_notebook_argv))


In [None]:
# set a seed for our random number generator so the random values drawn
# later in the notebook are reproducible
np.random.seed(42)

# load the input image; cv2.imread returns None (it does not raise) when
# the path is missing or the file cannot be decoded, so check explicitly
# rather than letting cvtColor fail with an opaque OpenCV error
image = cv2.imread(args["image"])
if image is None:
    raise FileNotFoundError(
        "could not read input image: {}".format(args["image"]))

# convert the image to grayscale
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

In [None]:
# initialize a rectangular kernel that is ~5x wider than it is tall,
# then smooth the image using a 3x3 Gaussian blur and then apply a
# blackhat morphological operator to find dark regions on a light
# background
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (51, 11))
gray = cv2.GaussianBlur(gray, (3, 3), 0)
blackhat = cv2.morphologyEx(gray, cv2.MORPH_BLACKHAT, kernel)

# compute the Scharr gradient of the blackhat image (ksize=-1 selects the
# Scharr kernel) and scale the result into the range [0, 255]
grad = cv2.Sobel(blackhat, ddepth=cv2.CV_32F, dx=1, dy=0, ksize=-1)
grad = np.absolute(grad)
(minVal, maxVal) = (np.min(grad), np.max(grad))
gradRange = maxVal - minVal
if gradRange > 0:
    grad = (grad - minVal) / gradRange
else:
    # a constant gradient (e.g. a blank page) would divide by zero and
    # fill the array with NaN; treat it as "no edges" instead
    grad = np.zeros_like(grad)
grad = (grad * 255).astype("uint8")

# apply a closing operation using the rectangular kernel to close
# gaps in between characters, apply Otsu's thresholding method, and
# finally a dilation operation to enlarge foreground regions
grad = cv2.morphologyEx(grad, cv2.MORPH_CLOSE, kernel)
thresh = cv2.threshold(grad, 0, 255,
                       cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
thresh = cv2.dilate(thresh, None, iterations=3)

# show the thresholded mask inline with matplotlib; cv2.imshow opens a
# native GUI window that needs a cv2.waitKey event loop, which does not
# work well from a notebook kernel
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(thresh, cmap="gray")
ax.set_title("Thresh")
ax.axis("off")
plt.show()

In [None]:
# find contours in the thresholded image and grab the largest one,
# which we will assume is the stats table
cnts = cv2.findContours(thresh.copy(), cv2.RETR_EXTERNAL,
                        cv2.CHAIN_APPROX_SIMPLE)
cnts = imutils.grab_contours(cnts)
if len(cnts) == 0:
    # max() on an empty sequence raises an unhelpful
    # "max() arg is an empty sequence"; say what actually went wrong
    raise ValueError(
        "no contours found in the thresholded image; cannot locate the table")
tableCnt = max(cnts, key=cv2.contourArea)

# compute the bounding box coordinates of the stats table and extract
# the table from the input image (NumPy slicing: `table` shares memory
# with `image`, it is not a copy)
(x, y, w, h) = cv2.boundingRect(tableCnt)
table = image[y:y + h, x:x + w]

# show the original input image and the extracted table inline;
# matplotlib expects RGB while OpenCV images are BGR, hence the conversion
fig, (axInput, axTable) = plt.subplots(1, 2, figsize=(16, 10))
axInput.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
axInput.set_title("Input")
axInput.axis("off")
axTable.imshow(cv2.cvtColor(table, cv2.COLOR_BGR2RGB))
axTable.set_title("Table")
axTable.axis("off")
plt.show()

In [None]:
# Tesseract page segmentation mode 6 treats the input as a single
# uniform block of text (sparse-text detection would be mode 11)
options = "--psm 6"

# convert the OpenCV BGR crop to RGB, then ask Tesseract for word-level
# localizations returned as a dict of parallel lists
tableRGB = cv2.cvtColor(table, cv2.COLOR_BGR2RGB)
results = pytesseract.image_to_data(
    tableRGB, config=options, output_type=Output.DICT)

# accumulators for the (x, y, w, h) box of each kept detection and for
# the matching OCR'd text
coords, ocrText = [], []

In [None]:
# start from empty lists so that re-running this cell does not append the
# same detections a second time
coords = []
ocrText = []

# loop over each of the individual text localizations
for i in range(0, len(results["text"])):
    # extract the bounding box coordinates of the text region from the current result
    x = results["left"][i]
    y = results["top"][i]
    w = results["width"][i]
    h = results["height"][i]

    # extract the OCR text itself along with the confidence of the text localization
    text = results["text"][i]
    # depending on the Tesseract / pytesseract version the confidence may
    # arrive as an int, a float, or a string such as "96.5"; int() alone
    # rejects float-like strings, so go through float() first
    conf = int(float(results["conf"][i]))

    # filter out weak confidence text localizations
    if conf > args["min_conf"]:
        # update our text bounding box coordinates and OCR'd text, respectively
        coords.append((x, y, w, h))
        ocrText.append(text)

In [None]:
# extract all x-coordinates from the text bounding boxes, setting the y-coordinate value to zero
xCoords = [(c[0], 0) for c in coords]

# agglomerative clustering needs at least two samples; report that in
# terms of the OCR step instead of a generic scikit-learn validation error
if len(xCoords) < 2:
    raise ValueError(
        "need at least 2 text detections to cluster columns, got {}".format(
            len(xCoords)))

# apply hierarchical agglomerative clustering to the coordinates
clusterParams = dict(
    n_clusters=None,
    linkage="complete",
    distance_threshold=args["dist_thresh"])
try:
    # scikit-learn >= 1.2 names the distance parameter `metric`;
    # the old `affinity` keyword was removed in 1.4
    clustering = AgglomerativeClustering(metric="manhattan", **clusterParams)
except TypeError:
    # older scikit-learn releases only accept `affinity`
    clustering = AgglomerativeClustering(affinity="manhattan", **clusterParams)
clustering.fit(xCoords)

# initialize our list of sorted clusters
sortedClusters = []



In [None]:
# start from an empty list so that re-running this cell does not register
# every cluster a second time
sortedClusters = []

# loop over all clusters
for label in np.unique(clustering.labels_):
    # extract the indexes for the coordinates belonging to the current cluster
    idxs = np.where(clustering.labels_ == label)[0]

    # verify that the cluster is sufficiently large (strictly more
    # members than the configured min_size)
    if len(idxs) > args["min_size"]:
        # compute the average x-coordinate value of the cluster
        avg = np.average([coords[i][0] for i in idxs])
        # update our clusters list with the current label and the average x-coordinate
        sortedClusters.append((label, avg))

# sort the clusters by their average x-coordinate (left to right) and
# initialize our data frame
sortedClusters.sort(key=lambda x: x[1])
df = pd.DataFrame()

In [None]:
# collect one single-column data frame per cluster and concatenate them
# once at the end; this keeps the cell re-runnable (no columns are added
# twice to a `df` left over from a previous run) and avoids growing a
# data frame inside the loop
columnFrames = []

# loop over the clusters again, this time in sorted order
for (label, _) in sortedClusters:
    # extract the indexes for the coordinates belonging to the current cluster
    idxs = np.where(clustering.labels_ == label)[0]

    # extract the y-coordinates from the elements in the current cluster, then sort them from top-to-bottom
    yCoords = [coords[i][1] for i in idxs]
    sortedIdxs = idxs[np.argsort(yCoords)]

    # generate a random color for the cluster (plain ints, as needed by
    # the OpenCV drawing call below)
    color = np.random.randint(0, 255, size=(3,), dtype="int")
    color = [int(c) for c in color]

    # loop over the sorted indexes
    for i in sortedIdxs:
        # extract the text bounding box coordinates
        (x, y, w, h) = coords[i]
        # draw the bounding box surrounding the current element
        cv2.rectangle(table, (x, y), (x + w, y + h), color, 2)

    # extract the OCR'd text for the current column
    cols = [ocrText[i].strip() for i in sortedIdxs]
    # construct a df for the data where the first entry in our column serves as the header
    columnFrames.append(pd.DataFrame({cols[0]: cols[1:]}))

# concatenate all columns side by side; columns with fewer rows are padded
# with NaN. pd.concat rejects an empty list, so fall back to an empty frame
df = pd.concat(columnFrames, axis=1) if columnFrames else pd.DataFrame()

In [None]:
# replace NaN values with an empty string and then show a nicely
# formatted version of our multi-column OCR'd text
df = df.fillna("")
print(tabulate(df, headers="keys", tablefmt="psql"))

# write our table to disk as a CSV file
print("[INFO] saving CSV file to disk...")
df.to_csv(args["output"], index=False)

# show the output image after performing multi-column OCR; rendered inline
# with matplotlib because cv2.imshow followed by cv2.waitKey(0) blocks the
# notebook kernel until a key is pressed in a native window (and fails
# outright where no GUI is available)
fig, ax = plt.subplots(figsize=(12, 12))
ax.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
ax.set_title("Output")
ax.axis("off")
plt.show()

+----+--------+---------+--------+----------+------+--------+-------+--------+-------+------+
|    | Test   | care,   | FLOW   | s7s2ie   | #4   | Nor    | vor   | nue    | ELS   | y    |
|----+--------+---------+--------+----------+------+--------+-------+--------+-------+------|
|  0 | rime   | Sut     | ae     | PERIOD   | near | yout   | 5     | ERIE   | aan   | ‘ss, |
|  1 | wide   | SMa     | bce    | OEE      | URE  | 006.6, | EA    | ann    | i     | io   |
|  2 | Sang   | SH      | ante   | gage     | OU   | ina    |       | ten    | BE    | ag   |
|  3 | BEG    | EARS    |        | OBES     | oy   | OED    |       | HESS   | HEE   | dag  |
|  4 | RG     | care    |        | Bee      | aay  | 5      |       | BE     | EE    | ER   |
|  5 | esr    | Steir   |        | BR       | TEE  | vor    |       | woe    | BES   | Ege  |
|  6 | rg     | re      |        | BSS      | Pa   | gous   |       | tenets | EOD   | bad  |
|  7 | PEGE   | SORE    |        | Beg      | OER  | S00,   

# Assessing this result

- Don't want data from the entire image, only the part below "phase1: shutin period" -- use a regex to change where detection starts?
- No numbers are being detected, only words -- is there an OCR setting I need to use to improve detection of numbers over words?
- How to capture header names, given that they sometimes span 2-3 lines, so that not only the 1st line is taken as the header? -- maybe use the ****** row as the horizontal dividing line
- Need to assess whether everything is being detected (beyond quality, the quantity being detected)
- Need to detect every single value in each column so that rows stay aligned across all columns; make sure none are skipped