In [3]:
from PIL import Image
import cv2
import numpy as np
import pytesseract

In [4]:
def grayscale(image):
    """Return a single-channel grayscale version of `image`.

    Parameters
    ----------
    image : numpy.ndarray
        A 2-D array (already single-channel), a 3-channel BGR array, or a
        4-channel BGRA array.

    Returns
    -------
    numpy.ndarray
        The grayscale image. A 2-D input is returned as-is (not copied),
        because `cv2.COLOR_BGR2GRAY` rejects single-channel input.
    """
    if image.ndim == 2:
        # Already one channel per pixel; nothing to convert.
        return image
    if image.ndim == 3 and image.shape[2] == 4:
        # Images loaded with an alpha channel need the BGRA conversion code.
        return cv2.cvtColor(image, cv2.COLOR_BGRA2GRAY)
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

In [5]:
def noise_removal(image):
    """Run dilate -> erode -> morphological close -> 3x3 median blur on `image`.

    The three morphological steps share one 1x1 all-ones kernel.

    NOTE(review): a 1x1 structuring element should leave dilate/erode/close
    output unchanged, so in practice only the median blur alters the image.
    Confirm whether a larger kernel was intended.
    """
    unit_kernel = np.ones((1, 1), np.uint8)
    dilated = cv2.dilate(image, unit_kernel, iterations=1)
    eroded = cv2.erode(dilated, unit_kernel, iterations=1)
    closed = cv2.morphologyEx(eroded, cv2.MORPH_CLOSE, unit_kernel)
    return cv2.medianBlur(closed, 3)

In [6]:
def remove_borders(image):
    """Crop `image` to the bounding rectangle of its largest external contour.

    Parameters
    ----------
    image : numpy.ndarray
        Single-channel image accepted by `cv2.findContours`.

    Returns
    -------
    numpy.ndarray
        The cropped view of `image`. If no contour is found, `image` is
        returned unchanged instead of raising `IndexError`.
    """
    # findContours returns (contours, hierarchy) on OpenCV 2.x/4.x but
    # (image, contours, hierarchy) on 3.x; the contour list is always the
    # second-to-last element.
    contours = cv2.findContours(image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    if len(contours) == 0:
        return image

    # Iterate in reverse so that, on equal areas, the same contour wins as with
    # the previous "stable sort, take last" approach.
    largest = max(reversed(contours), key=cv2.contourArea)

    x, y, w, h = cv2.boundingRect(largest)
    return image[y:y + h, x:x + w]

In [7]:
def thin_font(image):
    """Invert `image`, erode it once with a 2x2 kernel, and invert it back.

    Presumably intended to make dark strokes on a light background thinner
    (going by the function name) -- confirm against the input images.
    """
    inverted = cv2.bitwise_not(image)
    eroded = cv2.erode(inverted, np.ones((2, 2), np.uint8), iterations=1)
    return cv2.bitwise_not(eroded)

In [8]:
def thick_font(image):
    """Invert `image`, dilate it once with a 2x2 kernel, and invert it back.

    Presumably intended to make dark strokes on a light background thicker
    (going by the function name) -- confirm against the input images.
    """
    inverted = cv2.bitwise_not(image)
    dilated = cv2.dilate(inverted, np.ones((2, 2), np.uint8), iterations=1)
    return cv2.bitwise_not(dilated)

In [9]:
# Border settings referenced by the (currently disabled) cv2.copyMakeBorder
# step in preprocess_image: a fill colour and one width per side.
color = [255, 255, 255]
top = bottom = left = right = 150


In [10]:
def preprocess_image(im_file, threshold=210, max_value=230):
    """Load an image and prepare it for OCR: grayscale -> binary threshold -> denoise.

    Parameters
    ----------
    im_file : str
        Path of the image to read with `cv2.imread`.
    threshold : int, optional
        Threshold passed to `cv2.threshold` (default 210).
    max_value : int, optional
        Value assigned to pixels above `threshold` (default 230).
        NOTE(review): 255 is the usual choice for a pure black/white image;
        230 is kept as the default to preserve existing output.

    Returns
    -------
    numpy.ndarray
        The thresholded, noise-filtered single-channel image.

    Raises
    ------
    OSError
        If `cv2.imread` cannot read `im_file` (it returns None rather than
        raising, which would otherwise surface as an obscure cvtColor error).
    """
    img = cv2.imread(im_file)
    if img is None:
        raise OSError(f"cv2.imread could not read image: {im_file!r}")

    gray_image = grayscale(img)
    _, im_bw = cv2.threshold(gray_image, threshold, max_value, cv2.THRESH_BINARY)

    # Font thinning, border removal and re-padding (thin_font, remove_borders,
    # cv2.copyMakeBorder) are intentionally not applied here.
    return noise_removal(im_bw)



In [11]:
#displaying-different-images-with-actual-size-in-matplotlib-subplot
from matplotlib import pyplot as plt
def display(im_path):
    """Show the image stored at `im_path` on a borderless, axis-free figure.

    The figure size in inches is the image's pixel width and height divided by
    a fixed DPI of 80.

    NOTE(review): the DPI is only used to compute `figsize`; it is not passed
    to `plt.figure`, so the on-screen size depends on matplotlib's own DPI.
    """
    dpi = 80
    pixels = plt.imread(im_path)
    rows, cols = pixels.shape[:2]

    # Inches needed to hold the image at `dpi` pixels per inch.
    size_in_inches = (cols / float(dpi), rows / float(dpi))

    figure = plt.figure(figsize=size_in_inches)
    # A single axes spanning the whole figure, with spines and ticks hidden.
    axes = figure.add_axes([0, 0, 1, 1])
    axes.axis('off')

    axes.imshow(pixels, cmap='gray')
    plt.show()
    

In [12]:
def ocr_image(image_path, output_path="output/final_test.jpg"):
    """Preprocess the image at `image_path` and return the text Tesseract reads from it.

    Parameters
    ----------
    image_path : str
        Path of the image to OCR. (Previously this argument was ignored and a
        hard-coded file was always processed.)
    output_path : str, optional
        Where the preprocessed image is written before being handed to
        Tesseract. Defaults to the location used so far.

    Returns
    -------
    str
        The text returned by `pytesseract.image_to_string`.

    Raises
    ------
    OSError
        If the preprocessed image cannot be written to `output_path`.
    """
    final_image = preprocess_image(image_path)

    # cv2.imwrite reports failure (e.g. missing directory) via its return value;
    # without this check Tesseract could silently read a stale file.
    if not cv2.imwrite(output_path, final_image):
        raise OSError(f"Could not write preprocessed image to {output_path!r}")

    return pytesseract.image_to_string(output_path)

In [13]:
# Run the OCR pipeline on the PNG at this path and keep the extracted text.
output = ocr_image("temp/Agreement AshokAvenue C 001-06.png")

In [14]:
# Show the raw OCR text, line breaks included.
print(output)

LEAVE AND LICENSE AGREEMENT

Particulars Amount Paid GRN/Transaction Id
Rs. 1590.00/- MHO011704487202122E 15/01/2022
Registration Fee Rs. 1000/- MHO011704487202122E 15/01/2022

LEAVE AND LICENSE AGREEMENT
This agreement is made and executed on 15/01/2022 at mumbai
Between,

1) Name: Mr.Rathore Mahendra singh , Age : About 45 Years, Occupation : Service, PAN :
AIGPR1118Q Residing at: Flat No:B - 204, Building Name: MANORA MLA HOSTEL, Block
Sector:NARIMAN POINT, Road: TULSHAMI CHAMBERS, MUMBAI, Mumbai, Maharashtra,
400002

HEREINAFTER called ‘the Licensor (which expression shall mean and include the Licensor above
named and also his/her/their respective heirs, successors, assigns, executors and administrators)

AND

1) Name: Mr.Nagori Sanjay , Age : About 51 Years, Occupation : Service Residing at: Flat No:D-
003, Building Name:ashok avenue chs ltd, Block Sector:off ashok nagar , Road:marol military road,
Marol, Mumbai, Maharashtra, 400059

HEREINAFTER called ‘the Licensee’ (which expres

In [16]:
import spacy

nlp = spacy.load("en_core_web_sm")

# Strip each OCR line once; splitting on "\n" already removes the newlines.
stripped_lines = [line.strip() for line in output.split("\n")]

# Join non-empty lines with a single space so the last word of one line is not
# fused with the first word of the next.
ocr_text = " ".join(item for item in stripped_lines if item)

# Run named-entity recognition line by line, skipping lines of two characters
# or fewer, and print each entity with a description of its label.
for item in stripped_lines:
    if len(item) > 2:
        for ent in nlp(item).ents:
            print(ent, "|", spacy.explain(ent.label_))

Particulars Amount Paid | People, including fictional
1590.00/- | Numerals that do not fall under another type
15/01/2022 | Companies, agencies, institutions, etc.
15/01/2022 | Absolute or relative dates or periods
15/01/2022 | Absolute or relative dates or periods
mumbai | Countries, cities, states
1 | Numerals that do not fall under another type
Rathore Mahendra | People, including fictional
About 45 Years | Absolute or relative dates or periods
Occupation : Service | Companies, agencies, institutions, etc.
PAN | Companies, agencies, institutions, etc.
B - 204 | Objects, vehicles, foods, etc. (not services)
NARIMAN POINT | People, including fictional
MUMBAI | Companies, agencies, institutions, etc.
Mumbai | Countries, cities, states
Maharashtra, | Companies, agencies, institutions, etc.
400002 | Absolute or relative dates or periods
1 | Numerals that do not fall under another type
Nagori Sanjay | People, including fictional
About 51 Years | Absolute or relative dates or periods
003 |

In [17]:
# Flatten the OCR output into one string: every stripped line longer than one
# character is appended with a leading space (so the result starts with " ").
final_text = "".join(
    " " + stripped
    for stripped in (raw.strip() for raw in output.split("\n"))
    if len(stripped) > 1
)

In [18]:
# Number of characters in the flattened OCR text.
len(final_text)

2638

In [25]:
# Echo the flattened OCR text as this cell's output.
final_text

' LEAVE AND LICENSE AGREEMENT Particulars Amount Paid GRN/Transaction Id Rs. 1590.00/- MHO011704487202122E 15/01/2022 Registration Fee Rs. 1000/- MHO011704487202122E 15/01/2022 LEAVE AND LICENSE AGREEMENT This agreement is made and executed on 15/01/2022 at mumbai Between, 1) Name: Mr.Rathore Mahendra singh , Age : About 45 Years, Occupation : Service, PAN : AIGPR1118Q Residing at: Flat No:B - 204, Building Name: MANORA MLA HOSTEL, Block Sector:NARIMAN POINT, Road: TULSHAMI CHAMBERS, MUMBAI, Mumbai, Maharashtra, 400002 HEREINAFTER called ‘the Licensor (which expression shall mean and include the Licensor above named and also his/her/their respective heirs, successors, assigns, executors and administrators) AND 1) Name: Mr.Nagori Sanjay , Age : About 51 Years, Occupation : Service Residing at: Flat No:D- 003, Building Name:ashok avenue chs ltd, Block Sector:off ashok nagar , Road:marol military road, Marol, Mumbai, Maharashtra, 400059 HEREINAFTER called ‘the Licensee’ (which expression 

In [31]:
import docx

def read_docx(file_path):
    """Return the text of every paragraph in the .docx at `file_path`.

    Each paragraph's text is followed by a newline, so a non-empty document
    yields a string ending in "\\n"; a document without paragraphs yields "".
    """
    document = docx.Document(file_path)
    return "".join(f"{para.text}\n" for para in document.paragraphs)

# Load the DOCX whose text is later compared against the OCR output.
# NOTE(review): this file name ends in "001-6" while the PNG passed to
# ocr_image ends in "001-06" -- confirm both refer to the same document.
docx_path = 'Agreement AshokAvenue C 001-6.docx'
docx_text = read_docx(docx_path)

In [28]:
# Flatten the DOCX text into one string: every stripped line longer than one
# character is appended with a leading space (so the result starts with " ").
test_text = "".join(
    " " + stripped
    for stripped in (raw.strip() for raw in docx_text.split("\n"))
    if len(stripped) > 1
)

In [30]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity(text1, text2):
    """Return the cosine similarity between the token-count vectors of two texts.

    Both texts are vectorised together with a default `CountVectorizer`; the
    value returned is the off-diagonal entry of the pairwise similarity matrix.
    """
    bag_of_words = CountVectorizer().fit_transform([text1, text2])
    pairwise = cosine_similarity(bag_of_words)
    return pairwise[0, 1]

# Compare the flattened DOCX text with the flattened OCR text using
# token-count cosine similarity.
similarity = calculate_similarity(test_text, final_text)
print(f"Cosine Similarity: {similarity}")


Cosine Similarity: 0.9932201608522585


In [34]:
import spacy

def calculate_similarity_with_spacy(text1, text2, nlp=None):
    """Return spaCy's `Doc.similarity` score between two texts.

    Parameters
    ----------
    text1, text2 : str
        The texts to compare.
    nlp : spacy.language.Language, optional
        An already-loaded pipeline to reuse. When omitted, "en_core_web_sm" is
        loaded, as before; passing a pipeline avoids reloading the model on
        every call.

    Returns
    -------
    float
        The value of `doc1.similarity(doc2)`.

    NOTE(review): spaCy's small ("sm") pipelines are documented as shipping
    without static word vectors, in which case `similarity` emits a warning and
    the score is less meaningful -- confirm whether a model with vectors
    should be used here.
    """
    if nlp is None:
        nlp = spacy.load("en_core_web_sm")
    doc1 = nlp(text1)
    doc2 = nlp(text2)
    return doc1.similarity(doc2)


# Compare the flattened OCR text with the flattened DOCX text using spaCy's
# document similarity.
# NOTE(review): the stray `similarity_score = doc1.similarity(doc2)` line in
# this cell's output looks like the tail of a spaCy warning -- confirm.
similarity = calculate_similarity_with_spacy(final_text, test_text)
print(f"Spacy Similarity: {similarity}")


Spacy Similarity: 0.9951845643038231


  similarity_score = doc1.similarity(doc2)
