# optical character recognition using pytesseract

## in this project we will be using computer vision technologies to extract text from images.

# import libraries

In [37]:
import cv2
import pytesseract
from PIL import Image
import numpy as np
from pytesseract import Output
import re
import matplotlib.pyplot as plt
from colorama import Fore, Style
import os

# test with an image

In [21]:
# Load the first sample book page with PIL.
image = Image.open('../input/book-pages/IMG_20220730_172154.jpg')
# Image.resize() returns a new image instead of resizing in place, so no
# resize is applied here: OCR and the saved copy use the full-resolution page.
# Save a PNG copy so the page can later be loaded from 'sample1.png'.
image.save('sample1.png')
image

In [22]:
# Tesseract command-line options: English language data (-l eng),
# OCR engine mode 3 and page segmentation mode 6.
custom_config = r'-l eng --oem 3 --psm 6'
text = pytesseract.image_to_string(
    image,
    config=custom_config,
)
print(text)

### it works pretty well

In [23]:
# Load the second sample book page with PIL.
# Image.resize() returns a new image instead of resizing in place, so no
# resize is applied here and OCR runs on the full-resolution page.
image1 = Image.open('../input/book-pages/IMG_20220730_172224.jpg')
image1

In [24]:
# Same Tesseract options as for the first page: English language data,
# OCR engine mode 3 and page segmentation mode 6.
custom_config = r'-l eng --oem 3 --psm 6'
text = pytesseract.image_to_string(
    image1,
    config=custom_config,
)
print(text)

### Now let's remove the unwanted characters from the text

In [25]:
# OCR the second page and drop punctuation/symbol noise from the result.
try:
    text = pytesseract.image_to_string(image1, lang='eng')
    characters_to_remove = "!()@—*“>+-/,'|£#%$&^_~"
    # One translation table deletes every listed character in a single pass.
    removal_table = str.maketrans('', '', characters_to_remove)
    new_string = text.translate(removal_table)
    print(new_string)
except IOError as e:
    print("Error (%s)." % e)

### we will be using cv2 to work with complex images

In [26]:
# Reload the saved sample as an OpenCV (NumPy, BGR) array. Note that this
# rebinds `image`, which previously held the PIL image.
image = cv2.imread('sample1.png')
if image is None:
    # cv2.imread reports an unreadable/missing file by returning None rather
    # than raising, so fail here instead of in a later, unrelated call.
    raise FileNotFoundError("could not read image 'sample1.png'")
image

## convert to grayscale
### This makes the image much easier to work with

In [27]:
def get_grayscale(image):
    """Return a single-channel grayscale version of a BGR image array."""
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return grayscale


# Grayscale copy of the sample page, shown inline via PIL.
gray = get_grayscale(image)
Image.fromarray(gray)


## remove the noises using medianBlur function 

In [28]:
def remove_noise(image):
    """Return *image* smoothed by a median filter with aperture size 5."""
    aperture_size = 5
    return cv2.medianBlur(image, aperture_size)


# Despite its name, `noise` holds the denoised image.
noise = remove_noise(gray)
Image.fromarray(noise)

## Thresholding
### Thresholding converts the image to binary: pixels brighter than the threshold are set to the maximum value (255 here) and all other pixels are set to 0. With Otsu's method, the threshold is chosen automatically from the image histogram.

In [29]:
def thresholding(image):
    """Binarise *image* to 0/255, letting Otsu's method pick the threshold."""
    mode = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    # cv2.threshold returns (threshold_used, result); only the image is needed.
    _, binary = cv2.threshold(image, 0, 255, mode)
    return binary


thresh = thresholding(gray)
Image.fromarray(thresh)

## erosion 
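### Erosion is a basic morphological operation: each pixel is replaced by the minimum value under the kernel, which shrinks bright regions. For dark text on a light background this thickens the strokes and can fill small gaps in the characters.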

In [30]:
def erode(image):
    """Erode *image* once with a 5x5 all-ones kernel.

    Args:
        image: Image array accepted by ``cv2.erode``.

    Returns:
        The eroded image array.
    """
    kernel = np.ones((5, 5), np.uint8)
    return cv2.erode(image, kernel, iterations=1)


# Store the result under its own name: assigning it to `erode` would replace
# the function with an array and break any later call or re-run of this cell.
eroded = erode(gray)
Image.fromarray(eroded)

## morphological transformation
### Morphological transformations are best suited to binary (thresholded) images: they work on the shape of pixel neighbourhoods rather than on individual intensity values. Opening (erosion followed by dilation) removes small bright specks while keeping larger structures.

In [31]:
def opening(image):
    """Apply a morphological opening to *image* with a 5x5 all-ones kernel.

    Args:
        image: Image array accepted by ``cv2.morphologyEx``.

    Returns:
        The opened image array.
    """
    kernel = np.ones((5, 5), np.uint8)
    return cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel)


# Store the result under its own name: assigning it to `opening` would replace
# the function with an array and break any later call or re-run of this cell.
opened = opening(gray)
Image.fromarray(opened)

## template matching
### template matching is a method for searching and finding the location of a template image in a larger image. 

In [32]:
def match_template(image, template):
    """Return the TM_CCOEFF_NORMED match map of *template* over *image*."""
    method = cv2.TM_CCOEFF_NORMED
    return cv2.matchTemplate(image, template, method)


# Demonstration only: the grayscale page is matched against itself.
match = match_template(gray, gray)
match

## drawing rectangles around text


In [33]:
# Draw a green rectangle around every character Tesseract reports.
img = cv2.imread('sample1.png')
if img is None:
    # cv2.imread returns None instead of raising when the file is unreadable.
    raise FileNotFoundError("could not read image 'sample1.png'")
h = img.shape[0]  # image height, needed to flip the y axis below
boxes = pytesseract.image_to_boxes(img)
for b in boxes.splitlines():
    # Fields 1-4 of each line are the box corners; field 0 is the character.
    left, bottom, right, top = (int(value) for value in b.split(' ')[1:5])
    # The y values are flipped with `h - y`: the boxes use a bottom-left
    # origin while OpenCV draws from the top-left.
    cv2.rectangle(img, (left, h - bottom), (right, h - top), (0, 255, 0), 2)
# OpenCV arrays are BGR, PIL expects RGB: convert so colours display correctly.
Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

### Now we will draw rectangle around a specific word. in this case "India".

In [34]:
# Draw a green rectangle around each confidently recognised word that starts
# with the search pattern.
img = cv2.imread('sample1.png')
if img is None:
    # cv2.imread returns None instead of raising when the file is unreadable.
    raise FileNotFoundError("could not read image 'sample1.png'")
d = pytesseract.image_to_data(img, output_type=Output.DICT)

# re.match anchors only at the start of the word, so e.g. "Indian" matches too.
word_pattern = 'India'
min_confidence = 60

word_boxes = zip(
    d['conf'], d['text'], d['left'], d['top'], d['width'], d['height']
)
for conf, word, x, y, w, h in word_boxes:
    if float(conf) > min_confidence and re.match(word_pattern, word):
        cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)
# OpenCV arrays are BGR, PIL expects RGB: convert so colours display correctly.
Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

# Aadhaar details extraction.

### Aadhaar is an identity document in India. It contains fingerprint, address, age and other useful information. Each Aadhaar card has a unique ID, which is a 12-digit number.
### In this section we will extract information from Aadhaar cards.
### The images used in this project were taken from Google (without the users' consent).

In [167]:
def extract_details(image_path):
    """Extract name, date of birth and ID digits from a preprocessed card image.

    The image is read with OpenCV, converted to grayscale and binarised with
    ``thresholding`` before being passed to Tesseract.

    Args:
        image_path: Path of the card image to read.

    Returns:
        A ``(name, dob, num)`` tuple where
        - ``name`` is the first match of the name pattern, or ``"No Name"``;
        - ``dob`` is the first ``dd-mm-yyyy`` / ``dd/mm/yyyy`` match, or
          ``"No DOB"``;
        - ``num`` is a list with the last three 4-digit groups found in the
          text, or ``"No num"`` when fewer than three groups were found.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``image_path``.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread returns None instead of raising on an unreadable file.
        raise FileNotFoundError(f"could not read image: {image_path!r}")
    image = Image.fromarray(thresholding(get_grayscale(image)))

    text = pytesseract.image_to_string(image, lang='eng')
    # Flatten the OCR output onto one line so the patterns can span line breaks.
    text = text.replace("\n", " ").replace("  ", " ")

    # Raw strings keep the regex escapes (\d, \s) out of Python's own
    # string-escape processing.
    regex_num = re.compile(r'\d{4}')
    regex_dob = re.compile(r'\d{2}[-/]\d{2}[-/]\d{4}')
    regex_name = re.compile(
        r'[A-Z][a-z]+,?\s+(?:[A-Z][a-z]*\.?\s*)?[A-Z][a-z]+'
    )

    # Run each pattern once and reuse the matches.
    names = regex_name.findall(text)
    dobs = regex_dob.findall(text)
    digit_groups = regex_num.findall(text)

    name = names[0] if names else "No Name"
    dob = dobs[0] if dobs else "No DOB"
    # NOTE(review): any 4-digit run counts (including the year of the date of
    # birth); the last three groups are assumed to be the card number.
    num = digit_groups[-3:] if len(digit_groups) >= 3 else "No num"
    return name, dob, num

### The above function extracts details from the grayscale, thresholded image

In [173]:
def extract_details1(image_path):
    """Extract name, date of birth and ID digits from the unprocessed image.

    The image is opened with PIL and passed to Tesseract as-is, without any
    grayscale conversion or thresholding.

    Args:
        image_path: Path of the card image to read.

    Returns:
        A ``(name, dob, num)`` tuple where
        - ``name`` is the first match of the name pattern, or ``"No Name"``;
        - ``dob`` is the first ``dd-mm-yyyy`` / ``dd/mm/yyyy`` match, or
          ``"No DOB"``;
        - ``num`` is a list with the last three 4-digit groups found in the
          text, or ``"No num"`` when fewer than three groups were found.
    """
    # The context manager closes the underlying file handle once OCR is done.
    with Image.open(image_path) as image:
        text = pytesseract.image_to_string(image, lang='eng')
    # Flatten the OCR output onto one line so the patterns can span line breaks.
    text = text.replace("\n", " ").replace("  ", " ")

    # Raw strings keep the regex escapes (\d, \s) out of Python's own
    # string-escape processing.
    regex_num = re.compile(r'\d{4}')
    regex_dob = re.compile(r'\d{2}[-/]\d{2}[-/]\d{4}')
    regex_name = re.compile(
        r'[A-Z][a-z]+,?\s+(?:[A-Z][a-z]*\.?\s*)?[A-Z][a-z]+'
    )

    # Run each pattern once and reuse the matches.
    names = regex_name.findall(text)
    dobs = regex_dob.findall(text)
    digit_groups = regex_num.findall(text)

    name = names[0] if names else "No Name"
    dob = dobs[0] if dobs else "No DOB"
    # NOTE(review): any 4-digit run counts (including the year of the date of
    # birth); the last three groups are assumed to be the card number.
    num = digit_groups[-3:] if len(digit_groups) >= 3 else "No num"
    return name, dob, num

### The above function extracts information from original images

### Now let's merge that information

In [159]:
def extract(image_path):
    """Show a card image and print the details extracted from it.

    Both extractors are run (``extract_details`` on the thresholded image,
    ``extract_details1`` on the original). For each field the thresholded
    result is preferred, the original-image result is the fallback, and a
    "Blurry Image" message is printed when neither found the field.

    Args:
        image_path: Path of the card image to process.

    Returns:
        None. The image is drawn with matplotlib and results are printed.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``image_path``.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread returns None instead of raising on an unreadable file.
        raise FileNotFoundError(f"could not read image: {image_path!r}")
    # matplotlib expects RGB while OpenCV loads BGR.
    plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    plt.axis("off")

    n1, d1, a1 = extract_details(image_path)
    n2, d2, a2 = extract_details1(image_path)

    # Prefer the first extractor's value; fall back to the second one. If both
    # failed, the fallback is itself the sentinel and the message is printed.
    name = n1 if n1 != 'No Name' else n2
    if name != 'No Name':
        print("Name :", name)
    else:
        print('Blurry Image for tesseract. Input new clear image for viewing name !!!')
    print("===================")

    dob = d1 if d1 != 'No DOB' else d2
    if dob != 'No DOB':
        print("Date of Birth :", dob)
    else:
        print('Blurry Image for tesseract. Input new clear image for viewing Date of Birth !!!')
    print("===================")

    number = a1 if a1 != 'No num' else a2
    if number != 'No num':
        # `number` is a list of three 4-digit strings.
        print("Adhaar card number :", " ".join(number))
    else:
        print('Blurry Image for tesseract. Input new clear image for viewing Adhaar card number !!!')
    

### Now let's try it out and see how it performs

In [177]:
# Show the card image and print the extracted name, date of birth and number.
extract('../input/adhaar/Aadhar_Card.jpg')

In [176]:
# Run the same extraction on card2.jpg.
extract("../input/adhaar/card2.jpg")

In [178]:
# Run the same extraction on thumb1.jpg.
extract("../input/adhaar/thumb1.jpg")

In [180]:
# Run the same extraction on astha.jpeg.
extract("../input/adhaar/astha.jpeg")

In [181]:
# Run the same extraction on abdul.jpg.
extract("../input/adhaar/abdul.jpg")

### pretty good actually

In [None]:
# NOTE(review): `end` is not defined anywhere in the visible notebook, so
# executing this cell would raise NameError; it looks like a stray
# end-of-notebook marker. Confirm before keeping or removing it.
end 