<a href="https://colab.research.google.com/github/rahiakela/computer-vision-research-and-practice/blob/main/opencv-projects-and-guide/ocr-works/08_ocr_image_preprocessing_and_text_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

**References**

https://stackoverflow.com/questions/62042172/how-to-remove-noise-in-image-opencv-python

https://medium.com/@sandun.amarathunga/extract-text-from-files-and-images-using-pytessaract-and-opencv-aa26b615a7fb



In [None]:
%%shell

# System packages.
# -y answers apt's confirmation prompt, which a notebook cell cannot answer interactively.
sudo apt-get install -y tesseract-ocr
sudo apt-get install -y poppler-utils

# Python packages.
pip install tesseract
pip install pytesseract
#pip install Pillow==9.0.0
pip install pdf2image
# This notebook uses the PdfFileReader / PdfFileWriter API, which PyPDF2 3.0 removed,
# so stay on the 2.x series. The quotes keep the shell from treating "<" as a redirect.
pip install "PyPDF2<3.0"

In [None]:
!pip install pillow

!sudo apt install build-essential libpoppler-cpp-dev pkg-config python3-dev
!pip install -U pdftotext

After the installation cells above have finished, restart the Colab runtime before running the rest of the notebook.

In [2]:
import os
import re
import cv2 
import numpy as np
import pytesseract
from pytesseract import Output

import pdftotext

from pdf2image import convert_from_path
from PyPDF2 import PdfFileReader, PdfFileWriter
from io import BytesIO
from PIL import Image
from matplotlib import pyplot as plt

In [3]:
# Point pytesseract at the tesseract executable.
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

## Define some functions

In [4]:
!mkdir pdf-files
!mkdir jpg-files

In [5]:
# Working directories: pdf_splitter writes the single-page PDFs into the first,
# pdf_to_image saves the rendered page images into the second.
pdf_files_path, jpg_files_path = "pdf-files", "jpg-files"

In [6]:
def pdf_splitter(path):
  """Split a PDF into one single-page PDF file per page.

  Each page is written to ``{pdf_files_path}/pdf-page-{n}.pdf``, where ``n`` is
  the zero-based page index.

  Args:
    path: Path of the PDF file to split.

  Returns:
    List of the written file names (names only, without the
    ``pdf_files_path`` directory), in page order.
  """
  pdf_list = []
  # The context manager closes the source file even if reading or writing fails.
  with open(path, "rb") as pdf_in_file:
    # Parse the document once and reuse the same reader for every page.
    pdf = PdfFileReader(pdf_in_file)
    for page in range(pdf.numPages):
      output = PdfFileWriter()
      output.addPage(pdf.getPage(page))
      page_name = f"pdf-page-{page}.pdf"
      with open(f"{pdf_files_path}/{page_name}", "wb") as outputStream:
        output.write(outputStream)
      pdf_list.append(page_name)
  return pdf_list

In [7]:
def pdf_to_image(pdf_list):
  """Render every page of the given PDFs to a JPEG file.

  Args:
    pdf_list: File names of PDFs located inside ``pdf_files_path``.

  Returns:
    Paths of the saved images, ``{jpg_files_path}/pdf-page-{k}.jpg``, where
    ``k`` counts rendered pages across all input files, starting at 0.
  """
  saved_paths = []
  for pdf_name in pdf_list:
    source_pdf = os.path.join(pdf_files_path, pdf_name)
    for rendered_page in convert_from_path(source_pdf):
      # Exactly one path is appended per page, so the list length is the running page counter.
      target = f"{jpg_files_path}/pdf-page-{len(saved_paths)}.jpg"
      rendered_page.save(target, "JPEG")
      saved_paths.append(target)
  return saved_paths

In [8]:
def img_display(im_data):
    """Show an image array in a matplotlib figure sized from its pixel dimensions.

    The figure size in inches is the image's width and height divided by a
    fixed value of 80; the single axes covers the whole figure and is hidden.

    Args:
        im_data: Image array. Only ``im_data.shape[:2]`` (height, width) is
            read here before the array is handed to ``imshow``.
    """
    dpi = 80
    rows, cols = im_data.shape[:2]

    # Inches needed to fit the image at the dpi above, as (width, height).
    size_inches = (cols / float(dpi), rows / float(dpi))

    # One axes spanning the entire figure, with spines, ticks, etc. turned off.
    canvas = plt.figure(figsize=size_inches)
    axes = canvas.add_axes([0, 0, 1, 1])
    axes.axis('off')

    # Draw the image and render the figure.
    axes.imshow(im_data, cmap='gray')
    plt.show()

In [9]:
def thick_font(image):
  """Invert the image, dilate it once with a 2x2 kernel, and invert it back.

  The inversion is presumably there so that dark text strokes become the
  bright regions that dilation grows (hence the function name) -- confirm.

  Args:
    image: Image array accepted by ``cv2.bitwise_not`` and ``cv2.dilate``.

  Returns:
    The processed image.
  """
  structuring_element = np.ones((2, 2), np.uint8)
  inverted = cv2.bitwise_not(image)
  dilated = cv2.dilate(inverted, structuring_element, iterations=1)
  return cv2.bitwise_not(dilated)

## PDF to Image conversion

In [18]:
# Split the source PDF into one single-page PDF per page; pdf_list holds the written file names.
pdf_list = pdf_splitter("file_large.pdf")

In [20]:
# Number of single-page PDF files that were written.
len(pdf_list)

144

In [20]:
# First entry: a bare file name, without the pdf_files_path directory prefix.
pdf_list[0]

'pdf-page-0.pdf'

In [28]:
# Render each single-page PDF to a JPEG; img_list holds the saved image paths.
img_list = pdf_to_image(pdf_list)

In [29]:
# Number of page images that were saved.
len(img_list)

144

In [30]:
# One-element slice: shows only the path stored at index 18.
img_list[18:19]

['jpg-files/pdf-page-18.jpg']

In [26]:
# Removes every .jpg inside jpg-files/.
# NOTE(review): the paths in img_list point into jpg-files/, and the text-extraction
# cell below reads them with cv2.imread. Running this cell in notebook order deletes
# those images first -- run it only after extraction, or before regenerating img_list.
!rm -rf jpg-files/*.jpg

## Image text extraction

In [31]:
%%time

custom_config = r'--oem 3 --psm 12'

# iterate from 18 to end 
for index, image_file in enumerate(img_list):
  # print(f"Extracting text from {image_file}")
  image = cv2.imread(image_file)

  # removing noise
  noiseless_image_bw = cv2.fastNlMeansDenoising(image, None, 40, 7, 21)
  # font thickness
  image_smoothed = thick_font(noiseless_image_bw) 
  # extract text from image
  txt_data = pytesseract.image_to_string(image_smoothed, config=custom_config)

  with open("ocr-extracted.txt", "a") as f:
    f.write(txt_data)

print("File is written")

File is written
CPU times: user 54min, sys: 5.95 s, total: 54min 6s
Wall time: 39min 10s


In [None]:
# Removes every .jpg in the current working directory.
# NOTE(review): pdf_to_image saves its images under jpg-files/, which this glob does not
# match -- confirm whether jpg-files/*.jpg was the intended target.
!rm -rf *.jpg

## Large PDF text extraction

In [11]:
%%time

# Load your PDF
with open("file_large.pdf", "rb") as f:
  pdf = pdftotext.PDF(f)

# How many pages?
print(len(pdf))

pdf_text = "\n\n".join(pdf)

# Iterate over all the pages
#for page in pdf:
  #print(page)
  #pdf_text = "\n\n".join(pdf)

# write text into file
with open("ocr-extracted3.txt", "a") as f:
  f.write(pdf_text)

print("File is written")
# Read all the text into one string
#print("\n\n".join(pdf))

144
File is written
CPU times: user 633 ms, sys: 9.18 ms, total: 642 ms
Wall time: 643 ms


In [12]:
# Keyword phrases, one per line; the order of the original list is preserved.
keywords_list = [
  "Consolidated",
  "Cigarettes",
  "Tobacco",
  "Ketoacidosis",
  "Diagnosis",
  "Medical",
  "Night Sweats",
  "Symptom Status",
  "Diagnosis Status",
  "Diabetes is under",
  "Proteinuria diagnosed",
  "Ketone levels",
  "Random Blood Sugar",
  "Serum Creatinine",
  "Albumin-Creatinine Ratio",
  "Elaborate abnormal findings",
  "Urine Creatinine",
  "Urine Albumin",
]

In [None]:
%%time

# Load your PDF
with open("file_large.pdf", "rb") as f:
  pdf = pdftotext.PDF(f)

# How many pages?
print(len(pdf))

pdf_text = "\n\n".join(pdf)

# Iterate over all the pages
for page in pdf:
  print(page)
  pdf_text = "\n\n".join(pdf)
