# In [1]:
import cv2
import numpy as np
import json
from pdf2image import convert_from_path

# Detect text-region bounding boxes on every page of a PDF. For each region,
# record its four corner coordinates and save a cropped image; for each page,
# save an annotated copy with all boxes drawn. Finally dump every box to JSON.

# Convert the PDF file to a list of page images
pages = convert_from_path(r"C:\Users\sevan\Jupyter Projects\IIT-B\Sanskrit_Text.pdf")

# Maps "page<P>_line<L>" (both 1-based) to the four corner points of the box
box_dict = {}

# Loop through each page and extract the bounding boxes and region images
for i, page in enumerate(pages):
    # The page array is converted from RGB to OpenCV's BGR channel order,
    # then to grayscale for thresholding
    image = cv2.cvtColor(np.array(page), cv2.COLOR_RGB2BGR)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Draw on a separate copy so that the crops taken from `image` below never
    # contain the green box outlines (their own or those of overlapping boxes)
    annotated = image.copy()

    # Binarize with Otsu's threshold, inverted so dark ink becomes foreground
    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    # findContours returns (contours, hierarchy) on OpenCV 2.x/4.x but
    # (image, contours, hierarchy) on 3.x; [-2] selects the contours in both.
    # NOTE(review): RETR_TREE yields one contour per connected component
    # (including nested ones), so a "line" here is a component, not necessarily
    # a whole text line - confirm whether merging into lines is wanted.
    contours = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[-2]

    # Loop through each contour and extract the bounding box
    for j, contour in enumerate(contours):
        x, y, w, h = cv2.boundingRect(contour)
        box_dict[f"page{i+1}_line{j+1}"] = {
            "top_left": [x, y],
            "top_right": [x + w, y],
            "bottom_left": [x, y + h],
            "bottom_right": [x + w, y + h],
        }

        # Draw the bounding box on the annotated copy only
        cv2.rectangle(annotated, (x, y), (x + w, y + h), (0, 255, 0), 2)

        # Crop from the clean image and save the region
        line_img = image[y:y + h, x:x + w]
        cv2.imwrite(f"page{i+1}_line{j+1}.jpg", line_img)

    # Save the page image with all bounding boxes drawn
    cv2.imwrite(f"page{i+1}_with_boxes.jpg", annotated)

# Save the bounding box coordinates as JSON
with open("bounding_boxes.json", "w") as f:
    json.dump(box_dict, f, indent=4)
