<a href="https://colab.research.google.com/github/rahiakela/computer-vision-research-and-practice/blob/main/opencv-projects-and-guide/ocr-works/08_ocr_image_preprocessing_and_text_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

**Reference**

https://stackoverflow.com/questions/62042172/how-to-remove-noise-in-image-opencv-python

https://medium.com/@sandun.amarathunga/extract-text-from-files-and-images-using-pytessaract-and-opencv-aa26b615a7fb



In [None]:
%%shell

# System packages: the Tesseract OCR engine, and Poppler (pdf2image shells out
# to its command-line tools). -y keeps the install non-interactive.
sudo apt install -y tesseract-ocr
sudo apt-get install -y poppler-utils

# Python packages.
# NOTE(review): pytesseract is the binding this notebook actually imports;
# confirm whether the separate "tesseract" PyPI package is needed at all.
pip install tesseract
pip install pytesseract
pip install Pillow==9.0.0
pip install pdf2image
# The notebook imports PdfFileReader/PdfFileWriter, which were removed in
# PyPDF2 3.0.0, so stay on the 2.x line.
pip install "PyPDF2<3.0.0"

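After the installation finishes, restart the Colab runtime so that the newly installed package versions (e.g. the pinned Pillow) are the ones that get imported.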

In [None]:
import os
import re
import cv2 
import numpy as np
import pytesseract
from pytesseract import Output

from pdf2image import convert_from_path
from PyPDF2 import PdfFileReader, PdfFileWriter
from io import BytesIO
from PIL import Image
from matplotlib import pyplot as plt

In [None]:
# Point pytesseract at the Tesseract executable.
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

## Define some functions

In [None]:
def pdf_splitter(path):
  """Split a PDF into one single-page PDF file per page.

  Every page is written to the current working directory as
  ``doc-page-<index>.pdf`` (zero-based index). Files that already exist
  under those names are overwritten.

  Args:
    path: Path of the PDF file to split.

  Returns:
    List of the written file names, in page order.
  """
  pdf_list = []
  # The context manager closes the source file even if writing a page fails.
  with open(path, "rb") as pdf_in_file:
    # Parse the document once and reuse the reader for every page instead of
    # re-reading the whole file on each iteration.
    pdf = PdfFileReader(pdf_in_file)
    for page in range(pdf.numPages):
      output = PdfFileWriter()
      output.addPage(pdf.getPage(page))
      page_name = f"doc-page-{page}.pdf"
      with open(page_name, "wb") as outputStream:
        output.write(outputStream)
      pdf_list.append(page_name)
  return pdf_list

In [None]:
def pdf_to_image(pdf_list):
  """Render the pages of the given PDF files to JPEG images.

  Each rendered page is saved in the current working directory as
  ``doc-page-<n>.jpg``, where ``n`` is a running zero-based counter across
  all input files.

  Args:
    pdf_list: Iterable of PDF file paths.

  Returns:
    List of the saved JPEG file names, in the order they were written.
  """
  img_list = []
  for pdf_file in pdf_list:
    for rendered_page in convert_from_path(pdf_file):
      # The number of images saved so far doubles as the running counter.
      jpg_name = f"doc-page-{len(img_list)}.jpg"
      rendered_page.save(jpg_name, "JPEG")
      img_list.append(jpg_name)
  return img_list

In [None]:
def img_display(im_data):
    """Show an image array in a borderless matplotlib figure.

    The figure size in inches is derived from the image's pixel dimensions
    divided by a fixed value of 80 dots per inch.

    Args:
        im_data: Image array; only the first two entries of ``shape``
            (height, width) are used for sizing.
    """
    dpi = 80
    rows, cols = im_data.shape[:2]

    # Figure dimensions (width, height) in inches for the chosen dpi value.
    fig = plt.figure(figsize=(cols / float(dpi), rows / float(dpi)))

    # A single axes spanning the whole figure, with spines and ticks hidden.
    canvas = fig.add_axes([0, 0, 1, 1])
    canvas.axis('off')

    canvas.imshow(im_data, cmap='gray')
    plt.show()

In [None]:
def thick_font(image):
  """Thicken the dark regions of an image via inverted dilation.

  The image is inverted, dilated once with a 2x2 kernel of ones, and
  inverted back, which grows the dark areas of the original.

  Args:
    image: Input image array (presumably dark text on a light
      background - confirm against the caller).

  Returns:
    The processed image.
  """
  inverted = cv2.bitwise_not(image)
  structuring_element = np.ones((2, 2), dtype=np.uint8)
  dilated = cv2.dilate(inverted, structuring_element, iterations=1)
  return cv2.bitwise_not(dilated)

## PDF to image conversion

In [None]:
# Split the source document into single-page PDFs (doc-page-<n>.pdf).
pdf_list = pdf_splitter("large2_orig.pdf")

In [None]:
# Number of single-page PDFs produced by pdf_splitter.
len(pdf_list)

529

In [None]:
# Spot-check the entry at index 18.
pdf_list[18:19]

['doc-page-18.pdf']

In [None]:
# Render every single-page PDF to a JPEG (doc-page-<n>.jpg).
img_list = pdf_to_image(pdf_list)

In [None]:
# Number of JPEG files produced by pdf_to_image.
len(img_list)

529

In [None]:
# Spot-check the entry at index 18.
img_list[18:19]

['doc-page-18.jpg']

In [None]:
# Remove only the intermediate single-page PDFs; a bare *.pdf glob would also
# delete the source document (large2_orig.pdf) and break a re-run.
!rm -f doc-page-*.pdf

## Image text extraction

In [None]:
# Tesseract options: --oem 3 selects the default OCR engine mode,
# --psm 12 selects sparse-text page segmentation with orientation/script detection.
custom_config = r'--oem 3 --psm 12'

# Index of the first page image to OCR; earlier pages are skipped.
START_PAGE = 18

# Open the output once for the whole run. Append mode is kept, so re-running
# this cell adds to an existing ocr-extracted.txt rather than replacing it.
with open("ocr-extracted.txt", "a") as f:
  # start=START_PAGE keeps the page label in the output aligned with the
  # doc-page-<n>.jpg file being processed (a plain enumerate would label
  # page 18 as Page-0).
  for index, image_file in enumerate(img_list[START_PAGE:], start=START_PAGE):
    print(f"Extracting text from {image_file}")
    image = cv2.imread(image_file)
    if image is None:
      # cv2.imread returns None instead of raising when a file cannot be read.
      print(f"Skipping {image_file}: image could not be read")
      continue

    # removing noise (h=40, templateWindowSize=7, searchWindowSize=21)
    noiseless_image_bw = cv2.fastNlMeansDenoising(image, None, 40, 7, 21)
    # font thickness
    image_smoothed = thick_font(noiseless_image_bw)
    # extract text from image
    txt_data = pytesseract.image_to_string(image_smoothed, config=custom_config)

    f.write(f"############################## Page-{index} #################\n\n\n {txt_data}")
    # Persist each page immediately so partial results survive an interruption.
    f.flush()

print("File is written")

In [26]:
# Remove only the page images generated by this notebook, not every JPEG in
# the working directory.
!rm -f doc-page-*.jpg