In [1]:
import numpy as np
import cv2

def order_points(pts):
	"""Arrange four corner points as top-left, top-right, bottom-right, bottom-left.

	The ordering is derived from two per-point quantities (assuming each row
	of ``pts`` is an (x, y) pair in image coordinates):

	* column sum (x + y): smallest -> top-left, largest -> bottom-right
	* column difference (y - x): smallest -> top-right, largest -> bottom-left

	Args:
		pts: NumPy array with one point per row and two columns.

	Returns:
		A new (4, 2) ``float32`` array holding the selected points in the
		order top-left, top-right, bottom-right, bottom-left.
	"""
	coord_sums = pts.sum(axis = 1)
	coord_diffs = np.diff(pts, axis = 1)

	# one source-row index per output slot, in the documented order
	picks = (
		np.argmin(coord_sums),   # top-left
		np.argmin(coord_diffs),  # top-right
		np.argmax(coord_sums),   # bottom-right
		np.argmax(coord_diffs),  # bottom-left
	)

	ordered = np.zeros((4, 2), dtype = "float32")
	for slot, row in enumerate(picks):
		ordered[slot] = pts[row]
	return ordered


def perspective_transform(image, pts):
	"""Warp the quadrilateral described by ``pts`` into an upright rectangle.

	The corner points are ordered with ``order_points``. The output width is
	the longer of the bottom and top edges and the output height is the
	longer of the right and left edges (each length truncated to an int).

	Args:
		image: source image handed to ``cv2.warpPerspective``.
		pts: the four corner points of the region to extract.

	Returns:
		The warped image produced by ``cv2.warpPerspective`` with size
		(width, height) as computed above.
	"""
	def edge_length(start, end):
		# Euclidean distance between two corner points
		return np.sqrt(((start[0] - end[0]) ** 2) + ((start[1] - end[1]) ** 2))

	corners = order_points(pts)
	top_left, top_right, bottom_right, bottom_left = corners

	# width: longer of the bottom edge and the top edge
	out_width = max(
		int(edge_length(bottom_right, bottom_left)),
		int(edge_length(top_right, top_left)),
	)
	# height: longer of the right edge and the left edge
	out_height = max(
		int(edge_length(top_right, bottom_right)),
		int(edge_length(top_left, bottom_left)),
	)

	# destination corners, listed in the same order as `corners`
	right, bottom = out_width - 1, out_height - 1
	dst = np.array(
		[[0, 0], [right, 0], [right, bottom], [0, bottom]],
		dtype = "float32",
	)

	# map the source quadrilateral onto the destination rectangle
	matrix = cv2.getPerspectiveTransform(corners, dst)
	return cv2.warpPerspective(image, matrix, (out_width, out_height))

In [2]:
import os
import csv
import cv2
import imutils
from skimage.filters import threshold_local
from transform import perspective_transform

def empty_folder(folder_path):
    """Delete everything inside ``folder_path`` while keeping the folder itself.

    Real sub-directories are emptied recursively and then removed; every
    other entry (regular files, symbolic links, special files) is unlinked.
    Symbolic links are never followed, so a link that points at a directory
    elsewhere only loses the link itself, not the contents of its target.

    Args:
        folder_path: directory whose contents should be removed.

    Returns:
        None. Failures are reported on stdout instead of being raised
        (best-effort cleanup); the first error stops processing of the
        folder in which it occurred.
    """
    try:
        # Snapshot the listing first so nothing is deleted while the
        # directory iterator is still open.
        with os.scandir(folder_path) as iterator:
            entries = list(iterator)
        for entry in entries:
            if entry.is_dir(follow_symlinks=False):
                empty_folder(entry.path)
                os.rmdir(entry.path)
            else:
                os.remove(entry.path)
    except Exception as e:
        print(f"Error emptying folder: {e}")

def create_folder(folder_path):
    """Create ``folder_path`` (with any missing parents) unless it already exists.

    A log line is printed only when the folder is actually created. Any
    exception is reported on stdout rather than raised.
    """
    try:
        if os.path.exists(folder_path):
            return
        os.makedirs(folder_path)
        print(f"[Log] Created folder: {folder_path}")
    except Exception as err:
        print(f"Error creating folder: {err}")

def create_csv(csv_path):
    """Create a CSV file containing only the header row ``filename,text``.

    An existing file at ``csv_path`` is left untouched. Any exception is
    reported on stdout rather than raised.
    """
    try:
        if os.path.exists(csv_path):
            return
        with open(csv_path, 'w', newline='') as handle:
            csv.writer(handle).writerow(["filename", "text"])
    except Exception as err:
        print(f"Error creating csv: {err}")


# Prepare a clean output directory: create ./output if it is missing, then
# delete everything already inside it. The order matters (the folder must
# exist before it is emptied).
# NOTE: these calls run every time this cell is executed, so results left in
# ./output by a previous run are removed before any image is processed.
create_folder('./output')
empty_folder('./output')


def _find_document_quad(edged_img):
    """Return the largest four-cornered contour in an edge map, or None.

    Only the five contours with the greatest area are examined; the first
    one whose polygon approximation has exactly four vertices wins.
    """
    found = cv2.findContours(edged_img, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    # OpenCV 3.x returns (image, contours, hierarchy); other versions return
    # (contours, hierarchy). Pick the contour list in either case.
    contours = found[0] if len(found) == 2 else found[1]
    for contour in sorted(contours, key=cv2.contourArea, reverse=True)[:5]:
        perimeter = cv2.arcLength(contour, True)
        approx = cv2.approxPolyDP(contour, 0.02 * perimeter, True)
        if len(approx) == 4:
            return approx
    return None


def scanDoc(image):
    """Produce a "scanned" version of ``./input/<image>`` in ``./output/``.

    Steps: resize to a height of 500 px, detect edges, find the largest
    four-cornered contour, warp the full-resolution image to that
    quadrilateral, convert to grayscale and binarise with a local threshold.

    Fallbacks (the unmodified original image is written instead):
      * any preprocessing step fails or no four-cornered contour is found
        -> "[Success (preprocessing skipped)]" plus the reason;
      * the warped result differs from the original height or width by more
        than half the original height -> "[Success (warp skipped)]".

    Args:
        image: file name (not a path) of an image inside ``./input/``.

    Returns:
        None. If the input cannot be read at all, an error line is printed
        and nothing is written.
    """
    output_path = './output/' + image

    original_img = cv2.imread("./input/" + image)
    if original_img is None:
        # cv2.imread reports an unreadable or missing file by returning None;
        # there is nothing that could be written as a fallback.
        print("[Error (unreadable image)] ", image)
        return

    try:
        # Contours are detected on a 500 px high copy; `ratio` scales the
        # detected corners back to the full-resolution image.
        ratio = original_img.shape[0] / 500.0
        img_resize = imutils.resize(original_img, height=500)

        gray_image = cv2.cvtColor(img_resize, cv2.COLOR_BGR2GRAY)
        blurred_image = cv2.GaussianBlur(gray_image, (5, 5), 0)
        edged_img = cv2.Canny(blurred_image, 75, 200)

        doc = _find_document_quad(edged_img)
        if doc is None:
            raise ValueError("no four-cornered contour found")

        warped_image = perspective_transform(original_img.copy(), doc.reshape(4, 2) * ratio)
        warped_image = cv2.cvtColor(warped_image, cv2.COLOR_BGR2GRAY)

        # Local (adaptive) threshold -> black/white image with values 0 / 255
        T = threshold_local(warped_image, 11, offset=10, method="gaussian")
        warped = (warped_image > T).astype("uint8") * 255
    except Exception as e:
        cv2.imwrite(output_path, original_img)
        print("[Success (preprocessing skipped)] ", image)
        print(f"    reason: {e}")
        return

    # Reject warps whose size deviates too much from the original image
    max_allowed_difference = int(original_img.shape[0] * 0.5)
    height_diff = abs(original_img.shape[0] - warped.shape[0])
    width_diff = abs(original_img.shape[1] - warped.shape[1])

    if height_diff > max_allowed_difference or width_diff > max_allowed_difference:
        cv2.imwrite(output_path, original_img)
        print("[Success (warp skipped)] ", image)
    else:
        cv2.imwrite(output_path, warped)
        print("[Success] ", image)

    # cv2.imshow("Final Scanned image", imutils.resize(warped, height=650))
    # cv2.waitKey(0)
    # cv2.destroyAllWindows()


In [3]:
from PIL import Image
import pytesseract
import os
# from scan import scanDoc
from pytesseract import Output


import cv2

# NOTE(review): presumably a minimum OCR confidence threshold, but it is not
# referenced anywhere in the visible code — confirm whether a filter was intended.
min_conf=0
     
def _merge_group(group):
	"""Collapse a run of (word, x1, y1, x2, y2) tuples into one labelled bounding box."""
	return {
		"word": ' '.join(item[0] for item in group),
		"coordinates": (
			min(item[1] for item in group),
			min(item[2] for item in group),
			max(item[3] for item in group),
			max(item[4] for item in group),
		),
	}


def ocr(path, file_name):
	"""Run Tesseract on an image and save a copy annotated with the detected text.

	Words reported by ``pytesseract.image_to_data`` are collected with their
	bounding boxes, consecutive words that sit close together are merged into
	one phrase, every phrase is printed, and each phrase is drawn (green box
	plus label) on the image, which is then written to
	``./output_annoted/<file_name>`` (the directory is created if missing).

	Args:
		path: location of the image to read.
		file_name: name of the annotated file created in ``./output_annoted/``.

	Returns:
		None. An unreadable image or a failed write is reported on stdout.
	"""
	image = cv2.imread(path)
	# Validate before OCR and drawing: cv2.imread returns None for files it
	# cannot decode, and drawing on None would raise.
	if image is None or image.shape[0] <= 0 or image.shape[1] <= 0:
		print("Error: Invalid image dimensions")
		return

	# Context manager so the PIL file handle is released once OCR is done.
	with Image.open(path) as im:
		results = pytesseract.image_to_data(im, output_type=Output.DICT)

	extracted_data = []
	# Width of the first whitespace-only entry. It stays -1 when no such entry
	# is reported; the adjacency test below can then never succeed, so words
	# are left unmerged.
	space_width = -1
	for i, text in enumerate(results["text"]):
		if text in (" ", "  "):
			if space_width == -1:
				space_width = results["width"][i]
			continue
		if text == "":
			continue
		x = results["left"][i]
		y = results["top"][i]
		extracted_data.append({
			"word": text,
			"coordinates": (x, y, x + results["width"][i], y + results["height"][i]),
		})

	merged_keys = []
	current_key = []
	for word_info in extracted_data:
		x1, y1, x2, y2 = word_info["coordinates"]
		entry = (word_info["word"], x1, y1, x2, y2)

		# Adjacent = horizontal gap between the previous word's right edge and
		# this word's left edge is at most 3 space widths, and the two top
		# edges differ by at most one space width.
		adjacent = (
			current_key
			and abs(current_key[-1][3] - x1) <= 3 * space_width
			and abs(current_key[-1][2] - y1) <= space_width
		)
		if adjacent:
			current_key.append(entry)
		else:
			if current_key:
				merged_keys.append(_merge_group(current_key))
			current_key = [entry]

	# flush the final group (nothing to flush when no text was detected)
	if current_key:
		merged_keys.append(_merge_group(current_key))

	for merged in merged_keys:
		print(merged)

	for merged in merged_keys:
		x1, y1, x2, y2 = merged["coordinates"]
		cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)  # green box
		# text label slightly above the box
		cv2.putText(image, merged["word"], (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

	output_dir = "./output_annoted/"
	os.makedirs(output_dir, exist_ok=True)
	# cv2.imwrite returns a success flag; only report success when it is true
	if cv2.imwrite(output_dir + file_name, image):
		print("[Success] ", file_name)
	else:
		print("Error: could not write annotated image ", file_name)



In [4]:


# Driver: preprocess every file in <cwd>/input, then annotate every file that
# ended up in <cwd>/output.
cwd = os.getcwd()

input_folder_path = cwd + '/input'
# sorted(): os.listdir returns entries in arbitrary order; sorting keeps the
# run reproducible. Sub-directories are skipped because they are not images.
input_files = sorted(
    name for name in os.listdir(input_folder_path)
    if os.path.isfile(os.path.join(input_folder_path, name))
)
print("[Preprocessing]")
for name in input_files:
    scanDoc(name)

output_folder_path = cwd + '/output'
output_folder_contents = sorted(
    name for name in os.listdir(output_folder_path)
    if os.path.isfile(os.path.join(output_folder_path, name))
)
print("\n")
print("[Annotating preprocessed images]")
for name in output_folder_contents:
    ocr(output_folder_path + '/' + name, name)
    



[Preprocessing]
[Success (preprocessing skipped)]  example1.jpg
[Success (preprocessing skipped)]  example2.png
[Success]  fig3.jpg


[Annotating preprocessed images]
{'word': 'POFULAR', 'coordinates': (202, 71, 432, 120)}
{'word': 'BODK.', 'coordinates': (467, 74, 609, 123)}
{'word': 'CO.', 'coordinates': (133, 176, 221, 223)}
{'word': '(M)', 'coordinates': (275, 178, 354, 238)}
{'word': 'SDN', 'coordinates': (398, 180, 498, 227)}
{'word': 'BHD', 'coordinates': (531, 178, 629, 229)}
{'word': '{Compary', 'coordinates': (192, 285, 297, 317)}
{'word': 'No.', 'coordinates': (311, 288, 348, 311)}
{'word': '113825-4)', 'coordinates': (366, 288, 484, 319)}
{'word': '{B5T', 'coordinates': (165, 324, 215, 354)}
{'word': 'Reg', 'coordinates': (229, 324, 269, 356)}
{'word': 'No.', 'coordinates': (283, 326, 319, 350)}
{'word': '00149299,', 'coordinates': (336, 326, 444, 351)}
{'word': '008', 'coordinates': (445, 328, 508, 352)}
{'word': 'j', 'coordinates': (503, 336, 510, 359)}
{'word': 'No', 'co