In [7]:
# Note: importing whole libraries is fine for a prototype; otherwise import only the modules that are actually needed

import cv2 
import pytesseract
from pytesseract import Output
import os
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt


### Goals: 

- Create a prototype OCR pipeline

#### Minimal functions: 

- Parse a Folder with images
- Preprocess each image 
- Run OCR on each image 
- Store image name and text found in a DataFrame
- Convert DataFrame to CSV for further processing / upload 

In [37]:


#find only images with jpg, jpeg and png ending
def is_image(filename):
    """Tell whether ``filename`` has a supported image extension.

    The comparison is case-insensitive and accepts ``.jpg``, ``.jpeg`` and
    ``.png``.

    Args:
        filename: File name or path to check.

    Returns:
        ``filename`` itself (a truthy value) when the extension matches,
        otherwise ``None``, so the result can be used directly in an ``if``.
    """
    # Inspect the argument itself so the result never depends on caller globals.
    if filename.lower().endswith(('.jpg', '.jpeg', '.png')):
        return filename
    return None


#resize image
def scale_image(pct, img):
    """Resize ``img`` to ``pct`` percent of its original width and height.

    Args:
        pct: Target size as a percentage of the original (100 keeps the
            size). Must be greater than zero.
        img: Image array whose first two axes are (height, width).

    Returns:
        The resized image returned by ``cv2.resize``.

    Raises:
        ValueError: If ``pct`` is zero or negative.
    """
    if pct <= 0:
        raise ValueError("pct must be a positive percentage, got {!r}".format(pct))

    src_height, src_width = img.shape[:2]
    # Clamp to one pixel: cv2.resize rejects a zero-sized target.
    width = max(1, int(src_width * pct / 100))
    height = max(1, int(src_height * pct / 100))

    # INTER_AREA is the recommended choice for shrinking; when enlarging,
    # cubic interpolation gives smoother glyph edges for OCR.
    interpolation = cv2.INTER_AREA if pct <= 100 else cv2.INTER_CUBIC
    return cv2.resize(img, (width, height), interpolation=interpolation)
  


### Image corrections from: https://nanonets.com/blog/ocr-with-tesseract/

# get grayscale image
def get_grayscale(image):
    """Convert a BGR image to single-channel grayscale.

    Args:
        image: BGR image array, or an image that is already 2-D
            (single-channel).

    Returns:
        The grayscale image. A 2-D input is returned unchanged.
    """
    # An already single-channel image would be rejected by the BGR2GRAY
    # conversion, so hand it back as-is. getattr keeps non-ndarray inputs
    # on the original code path.
    if getattr(image, "ndim", None) == 2:
        return image
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# noise removal
def remove_noise(image):
    """Smooth ``image`` with a median blur using an aperture size of 3."""
    aperture = 3
    denoised = cv2.medianBlur(image, aperture)
    return denoised
 
#thresholding
def thresholding(image):
    """Binarize ``image`` with a binary threshold combined with the Otsu flag.

    Only the thresholded image is returned; the threshold value that
    ``cv2.threshold`` also reports is discarded.
    """
    mode = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    _, binary = cv2.threshold(image, 0, 255, mode)
    return binary

#dilation
def dilate(image):
    """Dilate ``image`` once with a 5x5 all-ones ``uint8`` kernel."""
    structuring_element = np.ones((5, 5), dtype=np.uint8)
    dilated = cv2.dilate(image, structuring_element, iterations=1)
    return dilated
    
#erosion
def erode(image):
    """Erode ``image`` once with a 5x5 all-ones ``uint8`` kernel."""
    structuring_element = np.ones((5, 5), dtype=np.uint8)
    eroded = cv2.erode(image, structuring_element, iterations=1)
    return eroded

#opening - erosion followed by dilation
def opening(image):
    """Apply a morphological opening to ``image`` with a 5x5 all-ones kernel."""
    structuring_element = np.ones((5, 5), dtype=np.uint8)
    opened = cv2.morphologyEx(image, cv2.MORPH_OPEN, structuring_element)
    return opened

#canny edge detection
def canny(image):
    """Run Canny edge detection on ``image`` with thresholds 100 and 200."""
    lower_threshold, upper_threshold = 100, 200
    edges = cv2.Canny(image, lower_threshold, upper_threshold)
    return edges


#skew correction
def deskew(image):
    """Rotate ``image`` so that its non-zero content is axis-aligned.

    The skew is estimated from the minimum-area rectangle enclosing the
    indices of all non-zero pixels; the image is then rotated about its
    centre by the correcting angle, keeping the original size.

    Args:
        image: Image array; pixels greater than zero are treated as content.
            Presumably a 2-D (single-channel) image — verify against callers.

    Returns:
        A new, rotated image of the same width and height. If the image has
        no non-zero pixels an unrotated copy is returned.
    """
    coords = np.column_stack(np.where(image > 0))
    if coords.shape[0] == 0:
        # Nothing to fit a rectangle to; minAreaRect cannot handle an empty set.
        return image.copy()

    raw_angle = cv2.minAreaRect(coords)[-1]
    # OpenCV releases disagree on the reported range: older ones use
    # [-90, 0), newer ones (reportedly 4.5.1+) use (0, 90]. Both describe the
    # same rectangle up to a 90-degree turn, so fold them onto [0, 90).
    normalized = raw_angle % 90
    # Rotate by the smaller of the two equivalent corrections (at most 45 degrees).
    if normalized < 45:
        angle = -normalized
    else:
        angle = 90 - normalized

    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return rotated


   

In [46]:
#Experiment with different components of the pipeline to get best results

def processing_pipeline(src_path):
    """Load an image from disk and run the pre-OCR processing steps on it.

    The steps run in this order: read, ``scale_image`` to 150 %,
    ``get_grayscale``, ``remove_noise``, ``thresholding``, ``canny``,
    ``deskew``. Steps can be swapped in and out to compare OCR results.

    Args:
        src_path: Path of the image file to process.

    Returns:
        The processed image.

    Raises:
        OSError: If ``cv2.imread`` cannot read or decode ``src_path``.
    """
    image = cv2.imread(src_path)
    if image is None:
        # cv2.imread reports failure by returning None rather than raising.
        raise OSError("Could not read image file: {!r}".format(src_path))

    image = scale_image(150, image)  # enlarge to 150 % of the original size
    image = get_grayscale(image)  # convert image to grayscale
    image = remove_noise(image)  # remove noise
    image = thresholding(image)  # thresholding
    # NOTE(review): an edge map may not be the best input for OCR — confirm
    # this step helps before keeping it.
    image = canny(image)  # canny edge detection
    image = deskew(image)  # deskew
    return image

In [90]:
#Process the images

image_filenames = [] #list for image filenames
extracted_text = [] #list for extracted text

image_folder = "./images" #where the images are stored for prototype

for root, directories, files in os.walk(image_folder): #iterate through file system
    directories.sort() #sorting in place fixes the order in which os.walk descends into subfolders
    for file in sorted(files): #sorted so the CSV rows come out in the same order on every run
        if not is_image(file):
            continue
        src_path = os.path.join(root, file) #root is the folder currently being visited, so nested folders work too
        processed_image = processing_pipeline(src_path) #process the images with image processing pipeline
        processed_data = pytesseract.image_to_string(processed_image, lang='eng') #extract english text from images
        image_filenames.append(src_path) #append filenames
        extracted_text.append(processed_data) #append text

raw_data = list(zip(image_filenames, extracted_text)) #(filename, text) pairs; a list so it can be reused after building the frame
# NOTE(review): 'image_filenamme' is misspelled; kept unchanged because it is the CSV header that downstream consumers read
df = pd.DataFrame(raw_data, columns=['image_filenamme','extracted_text']) #convert to dataframe
df.to_csv('extracted_information.csv') #save csv to disk (the DataFrame index is written as the first column)