In [6]:
import cv2
import numpy as np
import json

# read the image file
img = cv2.imread(r"C:\Users\sevan\Desktop\11.jpg")

# convert the image to grayscale
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

# apply thresholding to the grayscale image
thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

# find the contours of the lines in the image
contours, hierarchy = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

# create a dictionary to store the bounding box coordinates
bounding_boxes = {}

# loop through the contours to get the bounding box for each line
for i, contour in enumerate(contours):
    x, y, w, h = cv2.boundingRect(contour)
    bounding_boxes[f"box{i+1}"] = {
        "top_left": [x, y],
        "top_right": [x+w, y],
        "bottom_left": [x, y+h],
        "bottom_right": [x+w, y+h]
    }
    # crop the line from the image and save it as a separate file
    line = img[y:y+h, x:x+w]
    cv2.imwrite(f"line{i+1}.jpg", line)

    # draw the bounding box on the original image
    cv2.rectangle(img, (x,y), (x+w,y+h), (0, 255, 0), 2)

# save the bounding box coordinates as a JSON file
with open("bounding_boxes.json", "w") as f:
    json.dump(bounding_boxes, f)

# save the image with bounding boxes
cv2.imwrite("bounding_boxes.jpg", img)


True

In [None]:
import cv2

# read the image file
img = cv2.imread(r"C:\Users\sevan\Desktop\11.jpg")

# convert the image to grayscale
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

# save the grayscale image as a new file


In [5]:
import cv2
import numpy as np
import json

# read the image file


# convert the image to grayscale
# gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

# apply thresholding to the grayscale image
thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

# apply morphological closing to connect nearby text regions
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3,3))
closed = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)

# detect the lines in the image using the Hough transform
lines = cv2.HoughLinesP(closed, 1, np.pi/180, 100, minLineLength=100, maxLineGap=10)

# group the lines into text lines using heuristics
text_lines = []
for line in lines:
    x1, y1, x2, y2 = line[0]
    theta = np.arctan2(y2-y1, x2-x1)
    if np.abs(theta) < np.pi/4:
        # horizontal line
        if len(text_lines) == 0 or np.abs(y1 - text_lines[-1][-1]) > 20:
            text_lines.append([y1, y2])
        else:
            text_lines[-1][-1] = y2
    else:
        # vertical line, ignore
        pass

# create a dictionary to store the bounding box coordinates
bounding_boxes = {}

# loop through the text lines to get the bounding box for each sentence
for i, line in enumerate(text_lines):
    y1, y2 = line
    x1 = 0
    x2 = img.shape[1]
    bounding_boxes[f"sentence{i+1}"] = {
        "top_left": [x1, y1],
        "top_right": [x2, y1],
        "bottom_left": [x1, y2],
        "bottom_right": [x2, y2]
    }
    # crop the sentence from the image and save it as a separate file
    sentence = img[y1:y2, x1:x2]
    cv2.imwrite(f"sentence{i+1}.jpg", sentence)

    # draw the bounding box on the original image
    cv2.rectangle(img, (x1,y1), (x2,y2), (0, 255, 0), 2)

# save the bounding box coordinates as a JSON file
with open("bounding_boxes.json", "w") as f:
    json.dump(bounding_boxes, f)

# save the image with bounding boxes
cv2.imwrite('output_image.jpg', gray)



error: OpenCV(4.6.0) D:\a\opencv-python\opencv-python\opencv\modules\imgcodecs\src\loadsave.cpp:801: error: (-215:Assertion failed) !_img.empty() in function 'cv::imwrite'


In [3]:
import pytesseract
from PIL import Image
import json
import io
import PyPDF2

# Open the input PDF file
with open(r"C:\Users\sevan\Jupyter Projects\IIT-B\Sanskrit_Text.pdf") as f:
    # Read the PDF file using PyPDF2
    pdf = PyPDF2.PdfFileReader(f)
    # Loop through each page in the PDF
    for i in range(pdf.getNumPages()):
        # Get the page as an image using PIL
        page = pdf.getPage(i)
        img_bytes = io.BytesIO(page['/Resources']['/XObject'].getObject()['/Im0'].getData())
        image = Image.open(img_bytes)
        # Perform OCR on the image
        text = pytesseract.image_to_string(image, lang="san")
        # Split the text into lines
        lines = text.split("\n")
        # Remove empty lines
        lines = [line for line in lines if line.strip()]
        # Create a dictionary to store the bounding box coordinates
        bounding_boxes = {}
        # Loop through each line and create a bounding box
        for j, line in enumerate(lines):
            # Get the bounding box coordinates for the line
            bounding_box = pytesseract.image_to_boxes(image, lang="san", config=f"--psm 6 -c tessedit_char_whitelist=अआइईउऊऋएऐओऔकखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसह\n --oem 3 -c preserve_interword_spaces=1", boxes=True, output_type=pytesseract.Output.DICT)
            left = int(bounding_box['left'][j])
            top = int(bounding_box['top'][j])
            right = int(bounding_box['right'][j])
            bottom = int(bounding_box['bottom'][j])
            # Store the bounding box coordinates in the dictionary
            bounding_boxes[f"box{j+1}"] = {
                "top_left": [left, top],
                "top_right": [right, top],
                "bottom_left": [left, bottom],
                "bottom_right": [right, bottom]
            }
            # Crop the image to the bounding box and save as a separate file
            line_image = image.crop((left, top, right, bottom))
            line_image.save(f"output_{i+1}_{j+1}.jpg")
        # Save the bounding box coordinates as a JSON file
        with open(f"bounding_boxes_{i+1}.json", "w") as f:
            json.dump(bounding_boxes, f, indent=4)


ModuleNotFoundError: No module named 'PyPDF2'