In [1]:
import cv2
import re
import numpy as np
import pandas as pd
from pdf2image import convert_from_path
import pytesseract
from scipy.ndimage import interpolation as inter
import os

# Location of the Tesseract executable.
# Resolution order:
#   1. the TESSERACT_CMD environment variable, when set and non-empty;
#   2. the stock Windows install path, when that file actually exists;
#   3. otherwise pytesseract's own setting is left untouched
#      (NOTE(review): presumably a lookup of "tesseract" on PATH - confirm).
_DEFAULT_TESSERACT_CMD = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
_tesseract_cmd = os.environ.get('TESSERACT_CMD')
if not _tesseract_cmd and os.path.isfile(_DEFAULT_TESSERACT_CMD):
    _tesseract_cmd = _DEFAULT_TESSERACT_CMD
if _tesseract_cmd:
    pytesseract.pytesseract.tesseract_cmd = _tesseract_cmd

In [2]:
#convert pdf to image
# Render the PDF pages; the second positional argument (500) is the DPI.
pages = convert_from_path(r'data_sample.pdf', 500)
if not pages:
    raise ValueError("data_sample.pdf did not render to any page image")
# The rest of the notebook works on a single image read back from 'out.jpg'.
# Saving every page to that one path leaves only the last page on disk, so
# write just that page instead of encoding all of them.
page = pages[-1]
page.save('out.jpg', 'JPEG')
img = cv2.imread(r'out.jpg')
# cv2.imread reports failure by returning None rather than raising.
if img is None:
    raise IOError("cv2.imread could not read 'out.jpg'")

In [3]:
def correct_skew(image, delta=2, limit=5):
    """Estimate the skew of a text image and return a deskewed copy.

    Each candidate angle is scored with a projection profile: the binarised
    image is rotated by the candidate, its rows are summed, and the score is
    the sum of squared differences between neighbouring row sums. The angle
    with the highest score wins (the first one on ties).

    Args:
        image: NumPy image array. Three-dimensional input is converted with
            ``cv2.COLOR_BGR2GRAY``; two-dimensional input is used as the
            grayscale image directly.
        delta: Step in degrees between candidate angles; must be positive.
        limit: Largest absolute angle, in degrees, that may be selected.

    Returns:
        Tuple ``(best_angle, corrected)``: the selected angle in degrees and
        ``image`` rotated by it about its centre, same width and height.

    Raises:
        ValueError: If ``delta`` is not positive.
    """
    # Use the public scipy.ndimage namespace; scipy.ndimage.interpolation is a
    # deprecated alias that emits a DeprecationWarning when used.
    from scipy import ndimage

    if delta <= 0:
        raise ValueError("delta must be a positive step in degrees")

    def determine_score(arr, angle):
        # order=0 (nearest neighbour) keeps the rotated image binary.
        data = ndimage.rotate(arr, angle, reshape=False, order=0)
        histogram = np.sum(data, axis=1, dtype=float)
        return np.sum((histogram[1:] - histogram[:-1]) ** 2, dtype=float)

    # Accept an already-grayscale (2-D) image as well as a colour one.
    gray = image if image.ndim == 2 else cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    angles = np.arange(-limit, limit + delta, delta)
    # The arange stop of limit + delta can produce a candidate beyond `limit`
    # (e.g. 7 for delta=3, limit=5); drop those. The tolerance absorbs float
    # round-off for fractional steps.
    angles = angles[np.abs(angles) <= limit + 1e-9]
    # Always evaluate "no rotation": a grid such as [-5, -3, -1, 1, 3, 5]
    # skips 0 and would force a rotation onto an already straight page.
    if not np.any(angles == 0):
        angles = np.sort(np.append(angles, 0))

    scores = [determine_score(thresh, angle) for angle in angles]
    best_angle = angles[int(np.argmax(scores))]

    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, best_angle, 1.0)
    # BORDER_REPLICATE fills the corners exposed by the rotation with edge pixels.
    corrected = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC,
                               borderMode=cv2.BORDER_REPLICATE)

    return best_angle, corrected

In [4]:
#skew correction of text in image
# correct_skew returns the selected rotation angle (degrees) and the rotated
# image; `corrected` is the image the extraction cell below works on.
best_angle, corrected = correct_skew(img)

  data = inter.rotate(arr, angle, reshape=False, order=0)


In [6]:
#extract field names, field types and Description
# OCR always reads from the untouched `corrected` image; rectangles are drawn
# only on the `corrected1` copy so annotations cannot bleed into later crops.
corrected1 = corrected.copy()
field_name = []
field_type = []
field_name_cordinates = []
field_type_cordinates = []
text_cordinates = []
description = []
code  = []
code_cord = []
# Field name marker: parenthesised text containing a hyphen.
pattern = re.compile(r"\(.*?-.*?\)")
# Field type marker: a single digit in parentheses.
pattern1 = re.compile(r"\(\d\)")
gray = cv2.cvtColor(corrected, cv2.COLOR_BGR2GRAY)
blur = cv2.GaussianBlur(gray, (7,7), 0)
thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
# Create rectangular structuring element and dilate
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5,5))
dilate = cv2.dilate(thresh, kernel, iterations=4)
# Find contours and draw rectangle
cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
# The returned tuple has either 2 or 3 items; pick the contour list in both cases.
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
for c in cnts:
    x,y,w,h = cv2.boundingRect(c)
    cropped = corrected[y:y + h, x:x + w]
    text = pytesseract.image_to_string(cropped)
    text_cordinates.append([x,y,w,h])
    #search for field name pattern
    if pattern.search(text):
        field_name.append(text.strip())
        cv2.rectangle(corrected1, (x, y), (x + w, y + h), (36,255,12), 2)
        field_name_cordinates.append([x,y,w,h])
    # search for field type pattern
    if pattern1.search(text):
        field_type.append(text.strip())
        field_type_cordinates.append([x,y,w,h])
        cv2.rectangle(corrected1, (x, y), (x + w, y + h), (36,255,12), 2)
    # search for the text area above field types as backup
    if 'SC' in text:
        code.append(text)
        code_cord.append([x,y,w,h])

# Everything below indexes into these lists; fail with a clear message
# instead of a bare IndexError when nothing was recognised.
if not field_name_cordinates or not field_type_cordinates:
    raise ValueError("no field name / field type blocks were recognised on the page")

#sort the items in reverse order
# NOTE(review): presumably contours arrive bottom-to-top, so reversing gives
# top-to-bottom order - confirm against the OpenCV version in use.
field_name_cordinates = field_name_cordinates[::-1]
field_type_cordinates = field_type_cordinates[::-1]
#identify last text area in the image to draw bounding box
last_block = field_name_cordinates[-1]
last_item = field_type_cordinates[-1]
# identify  and extract description from image
# zip pairs the boxes by position and stops at the shorter list. The region
# runs from 10 px below the name box to 100 px above the type box, and from
# the name box's left edge to the type box's right edge.
# NOTE(review): when a field type was missed above, the pairs are offset by
# one; the recovery of that type happens only after this loop - confirm the
# ordering is intended.
for name_box, type_box in zip(field_name_cordinates, field_type_cordinates):
    top = name_box[1] + name_box[3] + 10
    bottom = type_box[1] - 100
    left = name_box[0]
    right = type_box[0] + type_box[2]
    cropped = corrected[top:bottom, left:right]
    text = pytesseract.image_to_string(cropped)
    cv2.rectangle(corrected1, (left, top), (right, bottom), (255,0,0), 2)
    description.append(text.strip())

# Description after the last field name: down to the top of the first detected
# text block (text_cordinates[0]).
# NOTE(review): the right edge adds last_item[3] (height) where the rectangle
# drawn below uses last_item[2] (width); the crop and the rectangle also differ
# on the top and bottom edges. Left unchanged - confirm which is intended.
cropped = corrected[last_block[1]+50:text_cordinates[0][1], last_block[0] :last_item[0]+last_item[3] + 30 ]
text = pytesseract.image_to_string(cropped)
description.append(text.strip())
cv2.rectangle(corrected1,(last_block[0],last_block[1]+last_block[3]+10),(last_item[0]+last_item[2],text_cordinates[0][1]+text_cordinates[0][3]),(255,0,0), 2)

field_name = field_name[::-1]
field_type = field_type[::-1]
code_cord = code_cord[::-1]

# Backup: when the counts differ, OCR a region placed relative to the first
# 'SC' block and insert the result as the first field type. The region reuses
# the first detected type box's size, padded by 10 px / 20 px.
if len(field_name) != len(field_type):
    if not code_cord:
        raise ValueError("field_name and field_type counts differ and no 'SC' block was found to recover the missing field type")
    x = 110 + code_cord[0][0]
    y = code_cord[0][1] + 50
    w = field_type_cordinates[0][2] + 10
    h = 20 + field_type_cordinates[0][3]
    cropped = corrected[y:y + h, x:x + w]
    text = pytesseract.image_to_string(cropped)
    field_type.insert(0,text.strip())
    field_type_cordinates.insert(0,[x,y,w,h])
    cv2.rectangle(corrected1, (x, y), (x + w, y + h), (36,255,12), 2)
# Save the annotated copy.
cv2.imwrite("output2.jpg", corrected1)

True

In [7]:
# save the data to Dataframe
# One row per field. The three lists must have equal length, otherwise the
# DataFrame constructor raises ValueError.
# The description column key carries no trailing space, so the CSV header is
# exactly 'field_description'.
df = pd.DataFrame({'field_name': field_name,'field_type': field_type,'field_description': description})

In [9]:
# Write the extracted fields to 'output.csv' in the working directory.
# NOTE(review): `index` is not passed, so pandas' default applies and the row
# index is presumably written as an extra first column - confirm that is wanted.
df.to_csv('output.csv')