In [1]:
import math
import os
import shutil
import sys

import cv2
import numpy as np
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
from scipy import ndimage

In [2]:
# Fallback location of the tesseract executable (a Windows install path).
# It is only applied when no `tesseract` executable can be found on PATH, so the
# notebook also runs unchanged on machines where tesseract is already reachable.
TESSERACT_FALLBACK_CMD = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
if shutil.which("tesseract") is None:
    pytesseract.pytesseract.tesseract_cmd = TESSERACT_FALLBACK_CMD

In [None]:
# Enter path of pdf file
PDF_file = "letter_of_offer.pdf"

# Part #1 : Converting PDF to images

# Render every page of the PDF into an in-memory image (second argument: 500 DPI)
pages = convert_from_path(PDF_file, 500)

# Write each rendered page to disk as a JPEG, numbering pages from 1:
#   PDF page 1 -> page_1.jpg
#   PDF page 2 -> page_2.jpg
#   PDF page n -> page_n.jpg
for page_number, page in enumerate(pages, start=1):
    filename = "page_{}.jpg".format(page_number)
    page.save(filename, 'JPEG')

# One past the number of the last saved page; the OCR step derives the page count from it
image_counter = len(pages) + 1

# Part #2 - Recognising text from the images using OCR

# Variable to get count of total number of pages
filelimit = image_counter-1

# Text file that receives the OCR output of every page
outfile = "output_text.txt"

# The file is opened once and stays open for the whole loop, so the text of all
# pages ends up in the same file. Write mode (instead of append) keeps the cell
# re-runnable: a second run replaces the previous output rather than duplicating it.
# UTF-8 is set explicitly because OCR output can contain characters that the
# platform default encoding cannot represent.
with open(outfile, "w", encoding="utf-8") as f:

    # Iterate from 1 to the total number of pages
    for i in range(1, filelimit + 1):

        # Set filename to recognize text from
        # page_1.jpg
        # page_2.jpg
        # page_n.jpg
        filename = "page_"+str(i)+".jpg"

        # Recognise the text in the image as a string using pytesseract;
        # the context manager releases the image file handle right after OCR
        with Image.open(filename) as page_image:
            text = pytesseract.image_to_string(page_image)

        # In many PDFs a word that does not fit at the end of a line is split with
        # a hyphen and continued on the next line. Dropping every '-\n' joins the
        # two halves again.
        text = text.replace('-\n', '')

        # Write the processed text to the file
        f.write(text)

In [3]:
# Read input image
IMAGE_FILE_LOCATION = "page_1.jpg"
input_img = cv2.imread(IMAGE_FILE_LOCATION)

# cv2.imread does not raise when the file is missing or unreadable, it returns None.
# Fail here with a clear message instead of an AttributeError on `.shape` below.
if input_img is None:
    raise OSError("Could not read image file: {!r}".format(IMAGE_FILE_LOCATION))

scale_percent = 21 # Percent of original size
width = int(input_img.shape[1] * scale_percent / 100)   # shape[1] is the column count
height = int(input_img.shape[0] * scale_percent / 100)  # shape[0] is the row count
dim = (width, height)  # cv2.resize expects (width, height)

# Resize input image
resized = cv2.resize(input_img, dim, interpolation = cv2.INTER_AREA)

In [4]:
# REGION OF INTEREST (ROI) SELECTION

# Corner points of the current selection:
#   []                      -> nothing selected yet
#   [(x1, y1)]              -> left button is down, drag in progress
#   [(x1, y1), (x2, y2)]    -> selection complete
coordinates = []

# Define callback function
def shape_selection(event, x, y, flags, param):
    """Mouse callback that records a rectangular selection in `coordinates`.

    Parameters follow the OpenCV mouse-callback signature: `event` is the mouse
    event code and `(x, y)` the pointer position; `flags` and `param` are unused.

    A left-button press starts a new selection at (x, y). The matching release
    stores the second corner, draws the rectangle onto the global `image` and
    refreshes the "image" window.
    """
    global coordinates # Rebound on every button press, so it must be declared global

    # Store the (x1,y1) coordinates when left mouse button is pressed
    if event == cv2.EVENT_LBUTTONDOWN:
        coordinates = [(x, y)]

    # Store the (x2,y2) coordinates when the left mouse button is released.
    # Only a release that follows a press is accepted: without exactly one stored
    # corner there is no rectangle to complete, and appending anyway would either
    # raise IndexError below or grow the list past two points.
    elif event == cv2.EVENT_LBUTTONUP and len(coordinates) == 1:
        coordinates.append((x, y))

        # Draw a rectangle around the region of interest
        cv2.rectangle(image, coordinates[0], coordinates[1], (0,0,255), 2)
        cv2.imshow("image", image)
        
# Make working copies of the image and set up the mouse callback function.
# `image` is the canvas the callback draws rectangles on; it is a copy so that
# the drawing never ends up in `resized`. `image_copy` is never drawn on and is
# the array the ROI is cut from.
image = resized.copy()
image_copy = image.copy()
cv2.namedWindow("image")
cv2.setMouseCallback("image", shape_selection)

# Keep looping until the Enter key is pressed
while True:
    # Display the image and wait for a keypress
    cv2.imshow("image", image)
    key = cv2.waitKey(1) & 0xFF

    if key==13: # If 'enter' is pressed, apply OCR
        break

    if key == ord("c"): # Clear the selection when 'c' is pressed
        image = image_copy.copy()
        # Forget the recorded corners too, otherwise Enter would still use the
        # selection that was just cleared from the screen
        coordinates = []

# None means "nothing usable was selected"; this also prevents a stale ROI from
# an earlier run of this cell from being used by the following cells.
image_roi = None

if len(coordinates) == 2:
    (x1, y1), (x2, y2) = coordinates

    # Order the corners so that dragging in any direction gives a valid slice,
    # and clamp at 0 because negative indices would wrap around in NumPy slicing.
    left, right = max(min(x1, x2), 0), max(x1, x2, 0)
    top, bottom = max(min(y1, y2), 0), max(y1, y2, 0)
    coordinates = [(left, top), (right, bottom)]

    roi = image_copy[top:bottom, left:right]

    # A click without a drag produces an empty slice, which cannot be displayed
    if roi.size > 0:
        image_roi = roi
        cv2.imshow("Selected Region of Interest - Press any key to proceed", image_roi)
        cv2.waitKey(0)

if image_roi is None:
    print("No region of interest was selected - re-run this cell and drag a rectangle before pressing Enter.")

# Close all open windows
cv2.destroyAllWindows()

In [5]:
# OPTICAL CHARACTER RECOGNITION (OCR) ON ROI

# Run Tesseract on the cropped region and show the result under a heading.
# A single print call with a newline separator emits the heading, a blank line
# and then the recognised text.
text = pytesseract.image_to_string(image_roi)
print("The text in the selected region is as follows:\n", text, sep="\n")

The text in the selected region is as follows:

16 June 2016



In [6]:
# Extraction of selected ROI coordinates
# Display the first stored corner of the ROI chosen in the interactive selection
# cell, as an (x, y) pair. The displayed numbers are the ones used by hand as the
# x1 / y1 bounds of the `lo_date` crop further down.
coordinates[0] # x1,y1

(83, 188)

In [7]:
# Display the second stored corner of the selected ROI as an (x, y) pair; these
# numbers are the x2 / y2 bounds of the `lo_date` crop further down.
coordinates[1] # x2,y2

(307, 204)

In [8]:
# Reload and downscale the page so the field crops below start from an
# unmodified image, independent of anything the interactive cell did.
IMAGE_FILE_LOCATION = "page_1.jpg"
input_img = cv2.imread(IMAGE_FILE_LOCATION) # read image

# cv2.imread returns None (it does not raise) when the file cannot be read;
# stop here with a clear message rather than failing on `.shape` below.
if input_img is None:
    raise OSError("Could not read image file: {!r}".format(IMAGE_FILE_LOCATION))

# The scale must stay identical to the one used during ROI selection: the
# hard-coded crop boxes in the next cell are expressed in resized-image pixels.
scale_percent = 21 # percent of original size
width = int(input_img.shape[1] * scale_percent / 100)
height = int(input_img.shape[0] * scale_percent / 100)
dim = (width, height)

# resize image
resized = cv2.resize(input_img, dim, interpolation = cv2.INTER_AREA)

In [9]:
# To automate the extraction of selected fields from a standard letter of offer,
# the pixel box of every field (measured on the resized page) is hard-coded here
# and the matching region is cut out of `resized`.
def _crop_field(top, bottom, left, right):
    """Return the slice of `resized` covering rows top:bottom and columns left:right."""
    return resized[top:bottom, left:right]

# Arguments are y1, y2, x1, x2
lo_date = _crop_field(188, 204, 83, 307)
tenant_name = _crop_field(238, 259, 81, 346)
unit_num = _crop_field(498, 512, 113, 154)
floor_area = _crop_field(557, 576, 112, 229)
lease_term = _crop_field(667, 686, 112, 219)
term_of_tenancy = _crop_field(728, 748, 113, 462)
rent = _crop_field(849, 866, 465, 601)
service_charge = _crop_field(865, 879, 467, 599)
total_monthly_gross_rent = _crop_field(884, 899, 467, 603)

In [10]:
# OPTICAL CHARACTER RECOGNITION (OCR) on selected fields

def _ocr_and_report(heading, field):
    """Recognise the text of one field, print it under `heading` and return it.

    `field` is normally the image crop of the field. Because each result is
    stored back under the same variable name as its crop, a second run of this
    cell passes in the text from the first run; in that case the text is reused
    as-is. Without this check pytesseract would interpret the string as the
    path of an image file and the re-run would fail.
    """
    if isinstance(field, str):
        text = field
    else:
        text = pytesseract.image_to_string(field)
    print(heading)
    print(text)
    return text

# Each variable holds the field's image crop before this cell and its
# recognised text afterwards.
lo_date = _ocr_and_report("Date of letter of offer:\n", lo_date)
tenant_name = _ocr_and_report("Name of tenant:\n", tenant_name)
unit_num = _ocr_and_report("Unit number:\n", unit_num)
floor_area = _ocr_and_report("Floor area:\n", floor_area)
lease_term = _ocr_and_report("Lease term:\n", lease_term)
term_of_tenancy = _ocr_and_report("Tenancy term:\n", term_of_tenancy)
rent = _ocr_and_report("Rent amount:\n", rent)
service_charge = _ocr_and_report("Service charge amount:\n", service_charge)
total_monthly_gross_rent = _ocr_and_report("Total monthly gross rent amount:\n", total_monthly_gross_rent)

Date of letter of offer:

16 June 2016

Name of tenant:

ASLAN PHARMACEUTICALS PTE.LTD.

Unit number:

#12-03

Floor area:

4,500.00 square feet

Lease term:

Three (03) years

Tenancy term:

1 October 2016—30 September 2019 (both dates inclusive)

Rent amount:

$$26,775.00 per month

Service charge amount:

SS 4.725.00 per month

Total monthly gross rent amount:

$$31,500.00 per month

