<a href="https://colab.research.google.com/github/rahiakela/computer-vision-research-and-practice/blob/main/opencv-projects-and-guide/ocr-works/08_ocr_image_preprocessing_and_text_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

**References**

https://stackoverflow.com/questions/62042172/how-to-remove-noise-in-image-opencv-python

https://medium.com/@sandun.amarathunga/extract-text-from-files-and-images-using-pytessaract-and-opencv-aa26b615a7fb



In [None]:
!sudo apt install build-essential libpoppler-cpp-dev pkg-config python3-dev
!sudo apt install tesseract-ocr
!sudo apt-get install poppler-utils

In [None]:
%%shell

# Python dependencies for PDF handling and OCR.
pip install -U pdftotext
pip install pillow
pip install tesseract
pip install pytesseract
pip install pdf2image
# PyPDF2 3.0 dropped the PdfFileReader/PdfFileWriter API (numPages, getPage, addPage)
# that this notebook imports and calls, so stay on the 2.x line or older.
pip install "PyPDF2<3.0"
pip install img2pdf==0.5.0

In [None]:
# Installs the pinned img2pdf release (same pin as in the %%shell install cell).
!pip install img2pdf==0.5.0

After the installs finish, restart the Colab runtime so that the newly installed packages are picked up.

In [1]:
import os
import re
import cv2
import numpy as np
import pytesseract
from pytesseract import Output

import pdftotext

from pdf2image import convert_from_path
from PyPDF2 import PdfFileReader, PdfFileWriter
from io import BytesIO
from PIL import Image
from matplotlib import pyplot as plt

In [None]:
# Tell pytesseract which Tesseract executable to launch (absolute path).
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

## Image Processing Utils

In [None]:
!pip install PyPDF2
!pip install pymupdf==1.22.0

In [1]:
%%shell

# Create the working folders under ocr_test_samples.
# -p creates the parent directory as needed and does not fail when a folder
# already exists, so the cell can be re-run (plain mkdir errors with "File exists").
mkdir -p ocr_test_samples/input_path
mkdir -p ocr_test_samples/out_path
mkdir -p ocr_test_samples/temp_path
mkdir -p ocr_test_samples/processed_path



In [2]:
# Copy every PDF in the current working directory into the input folder.
!cp *.pdf ocr_test_samples/input_path

In [2]:
from image_processor import process_image_main

In [3]:
# Entry point imported from the local image_processor module; its behaviour is defined there.
# NOTE(review): presumably it reads ocr_test_samples/input_path and writes
# ocr_test_samples/out_path — confirm in image_processor.py.
process_image_main()

In [4]:
# Add every file in the output folder whose name contains a dot to output.zip.
!zip output.zip ocr_test_samples/out_path/*.*

  adding: ocr_test_samples/out_path/TESTING_OCI_5381240.pdf (deflated 26%)
  adding: ocr_test_samples/out_path/TESTING_OCI_5381286.pdf (deflated 21%)


## Define some functions

In [None]:
!mkdir pdf-files
!mkdir jpg-files

In [None]:
# Folder that pdf_splitter writes single-page PDFs into and pdf_to_image reads them from.
pdf_files_path = "pdf-files"
# Folder that pdf_to_image writes the page JPEGs into.
jpg_files_path = "jpg-files"

In [None]:
def pdf_splitter(path):
  """Split a PDF into one single-page PDF file per page.

  Page ``n`` (zero-based) is written to ``{pdf_files_path}/pdf-page-{n}.pdf``.

  Args:
    path: Path of the PDF file to split.

  Returns:
    List of the written file names (names only, without the folder), in page order.
  """
  # Make sure the target folder exists before the first page is written.
  os.makedirs(pdf_files_path, exist_ok=True)

  pdf_list = []
  # The context manager closes the source file even if writing a page fails.
  with open(path, "rb") as pdf_in_file:
    # Parse the document once and reuse the reader for every page.
    pdf = PdfFileReader(pdf_in_file)
    for page in range(pdf.numPages):
      output = PdfFileWriter()
      output.addPage(pdf.getPage(page))
      page_name = f"pdf-page-{page}.pdf"
      with open(f"{pdf_files_path}/{page_name}", "wb") as outputStream:
        output.write(outputStream)
      pdf_list.append(page_name)
  return pdf_list

In [None]:
def pdf_to_image(pdf_list):
  """Render every page of the given PDFs to a JPEG file.

  Args:
    pdf_list: File names (relative to ``pdf_files_path``) of the PDFs to render.

  Returns:
    Paths of the saved JPEGs, ``{jpg_files_path}/pdf-page-{k}.jpg``, where ``k``
    counts rendered pages from 0 across all input files.
  """
  saved_paths = []
  for pdf_name in pdf_list:
    source_pdf = os.path.join(pdf_files_path, pdf_name)
    for rendered_page in convert_from_path(source_pdf):
      # The running page number equals the count of images saved so far.
      target_jpg = f"{jpg_files_path}/pdf-page-{len(saved_paths)}.jpg"
      rendered_page.save(target_jpg, "JPEG")
      saved_paths.append(target_jpg)
  return saved_paths

In [None]:
def img_display(im_path=None, im_data=None):
    """Show an image in a borderless matplotlib figure sized from the image.

    The figure size in inches is the image's pixel size divided by 80.

    Args:
        im_path: Path of an image file, loaded with ``plt.imread``. When given,
            it takes precedence over ``im_data``.
        im_data: Image array already in memory; used when ``im_path`` is None.

    Raises:
        ValueError: If neither ``im_path`` nor ``im_data`` is provided.
    """
    # Fail with a clear message instead of an AttributeError on None.shape below.
    if im_path is None and im_data is None:
        raise ValueError("img_display needs either im_path or im_data")

    dpi = 80
    if im_path is not None:
        im_data = plt.imread(im_path)

    height, width = im_data.shape[:2]

    # What size does the figure need to be in inches to fit the image?
    figsize = width / float(dpi), height / float(dpi)

    # Create a figure of the right size with one axes that takes up the full figure
    fig = plt.figure(figsize=figsize)
    ax = fig.add_axes([0, 0, 1, 1])

    # Hide spines, ticks, etc.
    ax.axis('off')

    # Display the image; the gray colormap only affects single-channel data.
    ax.imshow(im_data, cmap='gray')

    plt.show()

In [None]:
def thick_font(image):
  """Thicken the dark strokes of an image.

  The image is inverted, dilated once with a 2x2 kernel of ones, and inverted
  back, so regions that were dark in the input grow slightly.

  Args:
    image: Image array accepted by ``cv2.bitwise_not`` / ``cv2.dilate``.

  Returns:
    The processed image.
  """
  stroke_kernel = np.ones((2, 2), np.uint8)
  inverted = cv2.bitwise_not(image)
  thickened = cv2.dilate(inverted, stroke_kernel, iterations=1)
  return cv2.bitwise_not(thickened)

## PDF to Image conversion

In [None]:
# Split the source PDF into single-page PDFs; pdf_list holds the written file names.
pdf_list = pdf_splitter("-248363259958132913_CLD_Redacted.pdf")

In [None]:
# Number of single-page PDFs that were written.
len(pdf_list)

37

In [None]:
# Spot-check one entry; index 118 is hard-coded and must be smaller than len(pdf_list).
pdf_list[118]

'pdf-page-118.pdf'

In [None]:
# Render every single-page PDF to JPEG; img_list holds the saved image paths.
img_list = pdf_to_image(pdf_list)

In [None]:
# Number of page images that were written.
len(img_list)

14

In [None]:
# Spot-check one entry; index 118 is hard-coded and must be smaller than len(img_list).
img_list[118]

'jpg-files/pdf-page-118.jpg'

In [None]:
# Cleanup: deletes every generated page image in jpg-files.
# NOTE(review): the paths in img_list point at these files and later cells still read them,
# so run this only when starting over (then re-run pdf_to_image), not in a top-to-bottom run.
!rm -rf jpg-files/*.jpg

In [None]:
# Cleanup: deletes the single-page PDFs in pdf-files.
!rm -rf pdf-files/*.pdf

## Image preprocessing

In [None]:
# Show the unprocessed sample page (hard-coded index 118) for reference.
img_display(im_path=img_list[118])

In [None]:
# Paint over long horizontal and vertical ruled lines on one page image.
image = cv2.imread(img_list[118])
if image is None:
  # cv2.imread returns None instead of raising when the file is missing or unreadable.
  raise FileNotFoundError(f"Could not read image: {img_list[118]}")
result = image.copy()
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# Otsu chooses the threshold automatically; BINARY_INV turns pixels darker than it into 255.
thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

# Remove horizontal lines: opening with an 80x1 kernel keeps only long horizontal runs;
# iterations=4 repeats the erosion/dilation, so only considerably longer runs survive.
horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (80, 1))
remove_horizontal = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, horizontal_kernel, iterations=4)
cnts = cv2.findContours(remove_horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
# findContours returns 2 or 3 values depending on the OpenCV version; pick the contour list either way.
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
for c in cnts:
  # Draw over the detected line in white, 5 px thick, on the colour copy.
  cv2.drawContours(result, [c], -1, (255,255,255), 5)

# Remove vertical lines: same procedure with a 1x40 kernel.
vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 40))
remove_vertical = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, vertical_kernel, iterations=4)
cnts = cv2.findContours(remove_vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
for c in cnts:
  cv2.drawContours(result, [c], -1, (255,255,255), 5)

#img_display(im_data=thresh)
img_display(im_data=result)
# cv2.imwrite reports failure through its return value rather than an exception.
if not cv2.imwrite('result.png', result):
  raise IOError("Failed to write result.png")

In [None]:
# Second variant of the line removal: a single opening pass with 80-pixel kernels in both directions.
image = cv2.imread(img_list[118])
if image is None:
  # cv2.imread returns None instead of raising when the file is missing or unreadable.
  raise FileNotFoundError(f"Could not read image: {img_list[118]}")
result = image.copy()
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# Otsu chooses the threshold automatically; BINARY_INV turns pixels darker than it into 255.
thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]

# Remove horizontal lines: opening with an 80x1 kernel keeps only long horizontal runs.
horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (80, 1))
remove_horizontal = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, horizontal_kernel, iterations=1)
cnts = cv2.findContours(remove_horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
# findContours returns 2 or 3 values depending on the OpenCV version; pick the contour list either way.
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
for c in cnts:
  # Draw over the detected line in white, 5 px thick, on the colour copy.
  cv2.drawContours(result, [c], -1, (255,255,255), 5)

# Remove vertical lines: same procedure with a 1x80 kernel.
vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 80))
remove_vertical = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, vertical_kernel, iterations=1)
cnts = cv2.findContours(remove_vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
for c in cnts:
  cv2.drawContours(result, [c], -1, (255,255,255), 5)

#img_display(im_data=thresh)
img_display(im_data=result)
# cv2.imwrite reports failure through its return value rather than an exception.
if not cv2.imwrite('result2.png', result):
  raise IOError("Failed to write result2.png")

In [None]:
# Tesseract options: --oem 3 = default engine mode, --psm 12 = sparse text with OSD.
custom_config = r'--oem 3 --psm 12'

# OCR the cleaned page image written to result2.png.
txt_data = pytesseract.image_to_string("result2.png", config=custom_config)

# Append mode keeps text from earlier runs. UTF-8 is set explicitly because OCR output
# can contain non-ASCII characters that the platform default encoding may not handle.
with open("ocr-extracted2.txt", "a", encoding="utf-8") as f:
  f.write(txt_data)

print("File is written")

File is written


## Image text extraction

In [None]:
%%time

custom_config = r'--oem 3 --psm 12'

# iterate from 18 to end
for index, image_file in enumerate(img_list):
  # print(f"Extracting text from {image_file}")
  image = cv2.imread(image_file)

  # removing noise
  noiseless_image_bw = cv2.fastNlMeansDenoising(image, None, 40, 7, 21)
  # font thickness
  image_smoothed = thick_font(noiseless_image_bw)
  # extract text from image
  txt_data = pytesseract.image_to_string(image_smoothed, config=custom_config)

  with open("-248363259958132913_CLD_Redacted.txt", "a") as f:
    f.write(txt_data)

print("File is written")

File is written
CPU times: user 13min 32s, sys: 1.65 s, total: 13min 34s
Wall time: 11min 18s


In [None]:
# Cleanup: deletes any .jpg files in the current working directory.
!rm -rf *.jpg

## Large PDF text extraction

In [None]:
%%time

# Load your PDF
with open("Synodex_Sample_APS_Report.pdf", "rb") as f:
  pdf = pdftotext.PDF(f)

# How many pages?
print(len(pdf))

pdf_text = "\n\n".join(pdf)

# Iterate over all the pages
#for page in pdf:
  #print(page)
  #pdf_text = "\n\n".join(pdf)

# write text into file
with open("ocr-extracted.txt", "a") as f:
  f.write(pdf_text)

print("File is written")
# Read all the text into one string
#print("\n\n".join(pdf))

132
File is written
CPU times: user 136 ms, sys: 7.98 ms, total: 144 ms
Wall time: 150 ms


In [None]:
# Terms to look for in the extracted PDF text (case-sensitive as written).
keywords_list = [
  "Consolidated", "Cigarettes", "Tobacco", "Ketoacidosis", "Diagnosis", "Medical", "Night Sweats", "Symptom Status",
  "Diagnosis Status", "Diabetes is under", "Proteinuria diagnosed", "Ketone levels", "Random Blood Sugar", "Serum Creatinine",
  "Albumin-Creatinine Ratio", "Elaborate abnormal findings", "Urine Creatinine", "Urine Albumin"
]

In [None]:
%%time

# Load your PDF
with open("file_large.pdf", "rb") as f:
  pdf = pdftotext.PDF(f)

# How many pages?
print(len(pdf))

keyword_found_list = []
#keyword_found_dict = {}

# Iterate over all the keywords
for keyword in keywords_list:
  # Iterate over all the pages
  for page in pdf:
    if re.search(keyword, page) and keyword not in keyword_found_list:
      keyword_found_list.append(keyword)
      #keyword_found_dict[keyword] = True
      break

      #pdf_text = "\n\n".join(pdf)


144
CPU times: user 459 ms, sys: 448 µs, total: 460 ms
Wall time: 459 ms


In [None]:
# Keywords that were found on at least one page.
keyword_found_list

['Consolidated',
 'Cigarettes',
 'Tobacco',
 'Ketoacidosis',
 'Diagnosis',
 'Medical',
 'Night Sweats',
 'Symptom Status',
 'Diagnosis Status',
 'Diabetes is under',
 'Proteinuria diagnosed',
 'Ketone levels',
 'Random Blood Sugar',
 'Serum Creatinine',
 'Albumin-Creatinine Ratio',
 'Elaborate abnormal findings',
 'Urine Creatinine',
 'Urine Albumin']