Task 1: Hough transform for document skew estimation

I’m estimating the skew angle of a document image using the Hough transform. I load the image, binarize it, and create a negative version. Then, I select candidate points, apply the Hough transform to detect lines, calculate the median angle, and deskew the image.

In [3]:
import cv2 as cv
import numpy as np

def load_and_binarize(image_path, threshold=200):
    img = cv.imread(image_path, 0)
    _, doc_bin = cv.threshold(img, threshold, 255, cv.THRESH_BINARY)
    return img, doc_bin

def get_negative(binary_image):
    return 255 - binary_image

def extract_connected_components(binary_image):
    return cv.connectedComponentsWithStats(binary_image)

if __name__ == "__main__":
    image_path = 'doc.jpg'
    original, binary = load_and_binarize(image_path)
    negative = get_negative(binary)
    num_labels, labels, stats, centroids = extract_connected_components(negative)

    cv.imwrite('original.jpg', original)
    cv.imwrite('binary.jpg', binary)
    cv.imwrite('negative.jpg', negative)

    print(f"Number of connected components: {num_labels}")

Number of connected components: 3972


In [5]:
import numpy as np
import cv2 as cv
import time

def select_candidate_points(strategy, negative, num_labels, labels, centroids):
    if strategy == 'a':
        return np.column_stack(np.where(negative > 0))
    elif strategy == 'b':
        return centroids[1:]
    elif strategy == 'c':
        candidate_points = []
        for label in range(1, num_labels):
            component = np.where(labels == label)
            max_y = np.max(component[0])
            x = component[1][np.argmax(component[0])]
            candidate_points.append([x, max_y])
        return np.array(candidate_points)
    else:
        raise ValueError("Invalid strategy. Choose 'a', 'b', or 'c'.")

def remove_non_candidates(shape, candidate_points):
    result = np.zeros(shape)
    for point in candidate_points:
        x, y = point
        if 0 <= x < shape[1] and 0 <= y < shape[0]:
            result[int(y), int(x)] = 255
    return result

if __name__ == "__main__":
    negative = cv.imread('negative.jpg', 0)
    num_labels, labels, stats, centroids = cv.connectedComponentsWithStats(negative)

    strategies = ['a', 'b', 'c']
    for strategy in strategies:
        start_time = time.time()
        candidate_points = select_candidate_points(strategy, negative, num_labels, labels, centroids)
        selection_time = time.time() - start_time

        result = remove_non_candidates(negative.shape, candidate_points)
        cv.imwrite(f'candidate_points_{strategy}.jpg', result)

        print(f"Strategy {strategy}:")
        print(f"  Number of candidate points: {len(candidate_points)}")
        print(f"  Selection time: {selection_time:.4f} seconds")

Strategy a:
  Number of candidate points: 414800
  Selection time: 0.0148 seconds
Strategy b:
  Number of candidate points: 6058
  Selection time: 0.0001 seconds
Strategy c:
  Number of candidate points: 6058
  Selection time: 26.6902 seconds


In [6]:
import cv2 as cv
import numpy as np
import statistics
import time

def apply_hough_transform(binary_image, distance_resolution=1, angular_resolution=np.pi/180, density_threshold=10):
    return cv.HoughLines(binary_image, distance_resolution, angular_resolution, density_threshold)

def calculate_document_angle(lines):
    if lines is not None:
        angles = [line[0][1] for line in lines]
        median_angle = statistics.median(angles)
        document_angle = np.pi/2 - median_angle
        return np.degrees(document_angle)
    return 0

def deskew_image(image, angle):
    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv.getRotationMatrix2D(center, -angle, 1.0)
    return cv.warpAffine(image, M, (w, h), flags=cv.INTER_CUBIC, borderMode=cv.BORDER_REPLICATE)

if __name__ == "__main__":
    original = cv.imread('original.jpg', 0)
    strategies = ['a', 'b', 'c']
    density_thresholds = [5, 10, 20]

    for strategy in strategies:
        for threshold in density_thresholds:
            candidate_points = cv.imread(f'candidate_points_{strategy}.jpg', 0)

            start_time = time.time()
            lines = apply_hough_transform(candidate_points, density_threshold=threshold)
            hough_time = time.time() - start_time

            angle = calculate_document_angle(lines)
            deskewed = deskew_image(original, angle)

            cv.imwrite(f'deskewed_{strategy}_{threshold}.jpg', deskewed)

            print(f"Strategy: {strategy}, Density Threshold: {threshold}")
            print(f"  Estimated angle: {angle:.2f} degrees")
            print(f"  Hough transform time: {hough_time:.4f} seconds")

Strategy: a, Density Threshold: 5
  Estimated angle: 5.00 degrees
  Hough transform time: 0.0558 seconds
Strategy: a, Density Threshold: 10
  Estimated angle: 5.00 degrees
  Hough transform time: 0.0530 seconds
Strategy: a, Density Threshold: 20
  Estimated angle: 5.00 degrees
  Hough transform time: 0.0534 seconds
Strategy: b, Density Threshold: 5
  Estimated angle: -0.00 degrees
  Hough transform time: 0.0187 seconds
Strategy: b, Density Threshold: 10
  Estimated angle: -0.00 degrees
  Hough transform time: 0.0181 seconds
Strategy: b, Density Threshold: 20
  Estimated angle: -0.00 degrees
  Hough transform time: 0.0176 seconds
Strategy: c, Density Threshold: 5
  Estimated angle: -0.00 degrees
  Hough transform time: 0.0194 seconds
Strategy: c, Density Threshold: 10
  Estimated angle: -0.00 degrees
  Hough transform time: 0.0195 seconds
Strategy: c, Density Threshold: 20
  Estimated angle: -0.00 degrees
  Hough transform time: 0.0188 seconds


In [7]:
import cv2 as cv
import pytesseract
import time

def perform_ocr(image):
    return pytesseract.image_to_string(image)

def generate_pdf(image, output_path):
    pdf = pytesseract.image_to_pdf_or_hocr(image, extension='pdf')
    with open(output_path, 'w+b') as f:
        f.write(pdf)

if __name__ == "__main__":
    original = cv.imread('original.jpg')
    strategies = ['a', 'b', 'c']
    density_thresholds = [5, 10, 20]

    # Process original image
    start_time = time.time()
    original_text = perform_ocr(original)
    ocr_time = time.time() - start_time

    print("Original Image OCR:")
    print(original_text)
    print(f"OCR Time: {ocr_time:.4f} seconds")

    generate_pdf(original, 'original.pdf')

    # Process deskewed images
    for strategy in strategies:
        for threshold in density_thresholds:
            deskewed = cv.imread(f'deskewed_{strategy}_{threshold}.jpg')
            
            start_time = time.time()
            deskewed_text = perform_ocr(deskewed)
            ocr_time = time.time() - start_time

            print(f"\nDeskewed Image OCR (Strategy: {strategy}, Threshold: {threshold}):")
            print(deskewed_text)
            print(f"OCR Time: {ocr_time:.4f} seconds")

            generate_pdf(deskewed, f'deskewed_{strategy}_{threshold}.pdf')

    # Process test images
    test_images = ['01.jpg', '02.jpg', '03.jpg', '04.jpg', '05.jpg']
    for test_image in test_images:
        img = cv.imread(test_image)
        
        start_time = time.time()
        test_text = perform_ocr(img)
        ocr_time = time.time() - start_time

        print(f"\nOCR Result for {test_image}:")
        print(test_text)
        print(f"OCR Time: {ocr_time:.4f} seconds")

        generate_pdf(img, f'{test_image[:-4]}.pdf')

Original Image OCR:
A Fielg Mode fo Pairing 3D Shapes«
Uc Thanh Neuyent2 Binh-s a", Mi Oi Tray 2) Quang. lieu m?, and Sai-Kit Yeung?
180 OO! of I, n logy, Deakin University Australia
Manor Univ, ty of hnolog id sign, M8apore
Abstract
This: pay, So Rel moges L ePairing 3p)
Shapes ¢ ‘Onstriy rom Multi-vioy, RGR data, pecificalty
we "Present q 3 Shape in @ Mar, ov aNdom, Mi i
Which the SeOMetric infoyy ation is en
oY Variables a
|

{7}, p, Vi.
by Perform in,
8, ), Ve enableg This Proce:
Scene data and this ‘aS raiseg be defi
Sented in an
When Duc Thanh Neuyen wag Working ap
mology ang Design,

OCR Time: 0.7699 seconds

Deskewed Image OCR (Strategy: a, Threshold: 5):

OCR Time: 0.4316 seconds

Deskewed Image OCR (Strategy: a, Threshold: 10):

OCR Time: 0.4129 seconds

Deskewed Image OCR (Strategy: a, Threshold: 20):

OCR Time: 0.4097 seconds

Deskewed Image OCR (Strategy: b, Threshold: 5):
2

> Quang. lieu m?, and Sai-Kit Yeung?
a logy, D, akin University, Australia
Manor, Univ, ty of hnol