<a href="https://colab.research.google.com/github/rahiakela/computer-vision-research-and-practice/blob/main/opencv-projects-and-guide/ocr-works/04_image_pdf_preprocessing_with_opencv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Image PDF preprocessing with OpenCV

## Setup

In [None]:
%%shell

# Install the Tesseract OCR engine; -y answers apt's confirmation prompt so
# the cell can run unattended.
sudo apt-get install -y tesseract-ocr
# NOTE(review): the PyPI package named `tesseract` looks unrelated to the OCR
# engine and to pytesseract -- confirm whether it is actually needed.
pip install tesseract
pip install pytesseract
pip install Pillow==9.0.0

After the installation finishes, restart the Colab runtime so that the newly installed packages are picked up.

In [None]:
import re
import cv2 
import numpy as np
import pytesseract
from pytesseract import Output
from matplotlib import pyplot as plt

In [None]:
# Print the version of the installed Tesseract binary to confirm it is on PATH
!tesseract --version

In [None]:
# Point pytesseract at the Tesseract executable
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

In [None]:
# File name of the sample image that is loaded in the cells below
image_name = "sample1.png"

## Preprocessing using OpenCV

We will write basic functions for different preprocessing methods 
- grayscaling
- thresholding
- dilating
- eroding
- opening
- canny edge detection
- noise removal
- deskewing
- template matching. 

Different methods can come in handy with different kinds of images. 

In [None]:
# grayscale conversion
def get_grayscale(image):
    """Convert a BGR image to grayscale using cv2.cvtColor."""
    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return gray_image

# noise removal
def remove_noise(image, ksize=5):
    """Smooth `image` with a median blur.

    Args:
        image: Input image as accepted by cv2.medianBlur.
        ksize: Aperture size passed to cv2.medianBlur. Defaults to 5, the
            value that was previously hard-coded. OpenCV documents that it
            must be odd and greater than 1.

    Returns:
        The blurred image returned by cv2.medianBlur.
    """
    return cv2.medianBlur(image, ksize)
 
#thresholding
def thresholding(image):
    """Binarise `image` with cv2.threshold using THRESH_BINARY + THRESH_OTSU.

    Only the thresholded image (second element of cv2.threshold's result)
    is returned.
    """
    mode = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    result = cv2.threshold(image, 0, 255, mode)
    return result[1]

#dilation
def dilate(image, kernel_size=5, iterations=1):
    """Dilate `image` with a square all-ones structuring element.

    Args:
        image: Input image as accepted by cv2.dilate.
        kernel_size: Side length of the square kernel. Defaults to 5, the
            value that was previously hard-coded.
        iterations: Number of times the dilation is applied. Defaults to 1.

    Returns:
        The dilated image returned by cv2.dilate.
    """
    kernel = np.ones((kernel_size, kernel_size), np.uint8)
    return cv2.dilate(image, kernel, iterations=iterations)
    
#erosion
def erode(image, kernel_size=5, iterations=1):
    """Erode `image` with a square all-ones structuring element.

    Args:
        image: Input image as accepted by cv2.erode.
        kernel_size: Side length of the square kernel. Defaults to 5, the
            value that was previously hard-coded.
        iterations: Number of times the erosion is applied. Defaults to 1.

    Returns:
        The eroded image returned by cv2.erode.
    """
    kernel = np.ones((kernel_size, kernel_size), np.uint8)
    return cv2.erode(image, kernel, iterations=iterations)

#opening - erosion followed by dilation
def opening(image, kernel_size=5):
    """Apply a morphological opening (cv2.MORPH_OPEN) to `image`.

    Args:
        image: Input image as accepted by cv2.morphologyEx.
        kernel_size: Side length of the square all-ones kernel. Defaults to
            5, the value that was previously hard-coded.

    Returns:
        The opened image returned by cv2.morphologyEx.
    """
    kernel = np.ones((kernel_size, kernel_size), np.uint8)
    return cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel)

#canny edge detection
def canny(image, threshold1=100, threshold2=200):
    """Run Canny edge detection on `image`.

    Args:
        image: Input image as accepted by cv2.Canny.
        threshold1: First hysteresis threshold. Defaults to 100, the value
            that was previously hard-coded.
        threshold2: Second hysteresis threshold. Defaults to 200, the value
            that was previously hard-coded.

    Returns:
        The edge map returned by cv2.Canny.
    """
    return cv2.Canny(image, threshold1, threshold2)

#skew correction
def deskew(image):
    """Rotate `image` about its centre by an angle derived from cv2.minAreaRect.

    The angle is taken from the minimum-area rectangle fitted to the indices
    of all pixels greater than zero. The image is then rotated by that
    (adjusted) angle at scale 1.0 and returned at its original width/height.
    """
    # (row, col) index pairs of every pixel with a value above zero
    coords = np.column_stack(np.where(image > 0))
    # last element of the minAreaRect result is used as the rotation angle
    angle = cv2.minAreaRect(coords)[-1]
    # NOTE(review): this branch assumes minAreaRect reports angles in
    # [-90, 0). The reported range is understood to differ in newer OpenCV
    # releases -- confirm against the installed version before relying on it.
    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = -angle
    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    # cubic interpolation; border pixels are replicated to fill exposed areas
    rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return rotated

#template matching
def match_template(image, template):
    """Return cv2.matchTemplate's result for `template` in `image`.

    The comparison method is cv2.TM_CCOEFF_NORMED.
    """
    method = cv2.TM_CCOEFF_NORMED
    scores = cv2.matchTemplate(image, template, method)
    return scores

In [None]:
# Load the sample image. OpenCV returns the channels in BGR order.
image = cv2.imread(image_name)
if image is None:
    # cv2.imread returns None instead of raising when the file cannot be read
    raise FileNotFoundError('Could not read image file: ' + image_name)

# matplotlib expects RGB, so reorder the channels for display only;
# `image` itself stays BGR for the OpenCV helpers used later.
b,g,r = cv2.split(image)
rgb_img = cv2.merge([r,g,b])

plt.figure(figsize=(30,15))
plt.imshow(rgb_img)
plt.title('SAMPLE 1 ORIGINAL IMAGE')
plt.show()

In [None]:
# Preprocess image
gray = get_grayscale(image)
thresh = thresholding(gray)
# The results are stored under names that differ from the helper functions.
# Assigning to `opening` / `canny` would overwrite the functions themselves
# and make this cell fail when it is run a second time.
opened = opening(gray)
edges = canny(gray)
noise = remove_noise(gray)
dilated = dilate(gray)

# Dictionary keys are unchanged, so later cells that index `images` still work.
images = {'gray': gray, 
          'thresh': thresh, 
          'opening': opened, 
          'canny': edges}

In [None]:
# Show the preprocessed variants side by side on a 2x2 grid
fig = plt.figure(figsize=(16,16))
ax = []

rows = 2
columns = 2
keys = list(images.keys())
for i in range(rows*columns):
    subplot = fig.add_subplot(rows, columns, i+1)
    ax.append(subplot)
    key = keys[i]
    subplot.set_title('INVOICE - ' + key)
    # draws on the current axes, i.e. the subplot that was just added
    plt.imshow(images[key], cmap='gray')

In [None]:
# Tesseract options shared by every OCR cell in this notebook
custom_config = r'--oem 3 --psm 6'
# Banner first, then the text recognised in the unprocessed image
print('-----------------------------------------',
      'TESSERACT OUTPUT --> ORIGINAL IMAGE',
      '-----------------------------------------',
      sep='\n')
print(pytesseract.image_to_string(image, config=custom_config))

In [None]:
# Banner first, then the text recognised in the grayscale image
print('\n-----------------------------------------',
      'TESSERACT OUTPUT --> GRAY IMAGE',
      '-----------------------------------------',
      sep='\n')
print(pytesseract.image_to_string(images["gray"], config=custom_config))

In [None]:
# Banner first, then the text recognised in the thresholded image
print('\n-----------------------------------------',
      'TESSERACT OUTPUT --> THRESHOLDED IMAGE',
      '-----------------------------------------',
      sep='\n')
print(pytesseract.image_to_string(images["thresh"], config=custom_config))

In [None]:
# Banner first, then the text recognised in the morphologically opened image
print('\n-----------------------------------------',
      'TESSERACT OUTPUT --> OPENED IMAGE',
      '-----------------------------------------',
      sep='\n')
print(pytesseract.image_to_string(images["opening"], config=custom_config))

In [None]:
# Banner first, then the text recognised in the Canny edge map
print('TESSERACT OUTPUT --> CANNY IMAGE',
      '-----------------------------------------',
      sep='\n')
print(pytesseract.image_to_string(images["canny"], config=custom_config))

In [None]:
# Banner first, then the text recognised in the median-blurred image (`noise`)
print('TESSERACT OUTPUT --> NOISY IMAGE',
      '-----------------------------------------',
      sep='\n')
print(pytesseract.image_to_string(noise, config=custom_config))

In [None]:
# Banner first, then the text recognised in the dilated image
print('TESSERACT OUTPUT --> DILATED IMAGE',
      '-----------------------------------------',
      sep='\n')
print(pytesseract.image_to_string(dilated, config=custom_config))

## Removing noise using `fastNlMeansDenoising`

In [None]:
# Reload the sample image. OpenCV returns the channels in BGR order.
image = cv2.imread(image_name)
if image is None:
    # cv2.imread returns None instead of raising when the file cannot be read
    raise FileNotFoundError('Could not read image file: ' + image_name)

# matplotlib expects RGB, so reorder the channels for display only
b,g,r = cv2.split(image)
rgb_img = cv2.merge([r,g,b])

plt.figure(figsize=(30,15))
plt.imshow(rgb_img)
plt.title('SAMPLE 1 ORIGINAL IMAGE')
plt.show()

### Step 1

In [None]:
# Denoise the grayscale image and plot the result
image = cv2.imread(image_name)
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# NOTE(review): the documented Python signature is
# fastNlMeansDenoising(src[, dst[, h[, templateWindowSize[, searchWindowSize]]]]),
# so in this positional call 31 lands in the `dst` slot, 7 in `h` and 21 in
# `templateWindowSize`. The call is left unchanged because the following steps
# appear to explore the alternatives -- confirm that this is intentional.
noise_removed = cv2.fastNlMeansDenoising(gray, 31, 7, 21)
plt.figure(figsize=(30, 15))
# single-channel data: use a gray colormap instead of matplotlib's default
plt.imshow(noise_removed, cmap='gray')
plt.title('SAMPLE DENOISY IMAGE')
plt.show()

In [None]:
# Banner first, then the text recognised in the denoised image
print('TESSERACT OUTPUT --> DENOISE IMAGE',
      '-----------------------------------------',
      sep='\n')
print(pytesseract.image_to_string(noise_removed, config=custom_config))

### Step 2

In [None]:
# Denoise the grayscale image and plot the result.
# `None` is passed explicitly in the dst position, so 31, 7 and 21 are the
# third to fifth positional arguments.
image = cv2.imread(image_name)
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

noise_removed = cv2.fastNlMeansDenoising(gray, None, 31, 7, 21)
plt.figure(figsize=(30, 15))
# single-channel data: use a gray colormap instead of matplotlib's default
plt.imshow(noise_removed, cmap='gray')
plt.title('SAMPLE DENOISY IMAGE')
plt.show()

In [None]:
# Banner first, then the text recognised in the denoised image
print('TESSERACT OUTPUT --> DENOISE IMAGE',
      '-----------------------------------------',
      sep='\n')
print(pytesseract.image_to_string(noise_removed, config=custom_config))

### Step 3

In [None]:
# Denoise the grayscale image and plot the result.
# A pre-allocated uint8 array with the shape of `gray` is passed in the dst
# position.
image = cv2.imread(image_name)
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

empty_gray = np.empty(gray.shape, np.uint8)
noise_removed = cv2.fastNlMeansDenoising(gray, empty_gray, 31, 7, 21)
plt.figure(figsize=(30, 15))
# single-channel data: use a gray colormap instead of matplotlib's default
plt.imshow(noise_removed, cmap='gray')
plt.title('SAMPLE DENOISY IMAGE')
plt.show()

In [None]:
# Banner first, then the text recognised in the denoised image
print('TESSERACT OUTPUT --> DENOISE IMAGE',
      '-----------------------------------------',
      sep='\n')
print(pytesseract.image_to_string(noise_removed, config=custom_config))

### Step 4

In [None]:
# Denoise the grayscale image and plot the result.
# Keyword arguments make the parameter binding explicit.
image = cv2.imread(image_name)
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

noise_removed = cv2.fastNlMeansDenoising(gray, h=31, templateWindowSize=7, searchWindowSize=21)
plt.figure(figsize=(30, 15))
# single-channel data: use a gray colormap instead of matplotlib's default
plt.imshow(noise_removed, cmap='gray')
plt.title('SAMPLE DENOISY IMAGE')
plt.show()

In [None]:
# Banner first, then the text recognised in the denoised image
print('TESSERACT OUTPUT --> DENOISE IMAGE',
      '-----------------------------------------',
      sep='\n')
print(pytesseract.image_to_string(noise_removed, config=custom_config))

## Removing blur using `bilateralFilter`

### Step 1

In [None]:
# Apply a bilateral filter to the grayscale image and plot the result
image = cv2.imread(image_name)
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Positional arguments after the source are d=31, sigmaColor=7, sigmaSpace=21.
# NOTE(review): the extra `blur` alias is not used in the visible cells; it is
# kept in case a later cell relies on it.
blurred = blur = cv2.bilateralFilter(gray, 31, 7, 21)
plt.figure(figsize=(30, 15))
# single-channel data: use a gray colormap instead of matplotlib's default
plt.imshow(blurred, cmap='gray')
plt.title('SAMPLE BLURRED IMAGE')
plt.show()

In [None]:
# Banner first, then the text recognised in the bilateral-filtered image
print('TESSERACT OUTPUT --> BLURRED IMAGE',
      '-----------------------------------------',
      sep='\n')
print(pytesseract.image_to_string(blurred, config=custom_config))

### Step 2

In [None]:
# Apply a bilateral filter to the grayscale image and plot the result.
# `None` is passed explicitly in the trailing dst position.
image = cv2.imread(image_name)
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

blurred = cv2.bilateralFilter(gray, 31, 7, 21, None)
plt.figure(figsize=(30, 15))
# single-channel data: use a gray colormap instead of matplotlib's default
plt.imshow(blurred, cmap='gray')
plt.title('SAMPLE BLURRED IMAGE')
plt.show()

In [None]:
# Banner first, then the text recognised in the bilateral-filtered image
print('TESSERACT OUTPUT --> BLURRED IMAGE',
      '-----------------------------------------',
      sep='\n')
print(pytesseract.image_to_string(blurred, config=custom_config))

### Step 3

In [None]:
# Apply a bilateral filter to the grayscale image and plot the result.
# A pre-allocated uint8 array with the shape of `gray` is passed in the
# trailing dst position.
image = cv2.imread(image_name)
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

blur_ = np.empty(gray.shape, np.uint8)
blurred = cv2.bilateralFilter(gray, 31, 7, 21, blur_)
plt.figure(figsize=(30, 15))
# single-channel data: use a gray colormap instead of matplotlib's default
plt.imshow(blurred, cmap='gray')
plt.title('SAMPLE BLURRED IMAGE')
plt.show()

In [None]:
# Banner first, then the text recognised in the bilateral-filtered image
print('TESSERACT OUTPUT --> BLURRED IMAGE',
      '-----------------------------------------',
      sep='\n')
print(pytesseract.image_to_string(blurred, config=custom_config))