# OCR WITH TESSERACT

Tesseract overview: https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/33418.pdf

## Install Library

In [None]:
!pip install pytesseract

## Import Library

In [None]:
import cv2
import pytesseract
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import sys, re
np.set_printoptions(threshold=sys.maxsize)

## 1. PREPROCESSING

### 1.1 Convert data to bitmap

In [None]:
# Load a sample image from disk and display it inline.
image_path = 'Data/A.jpeg'
image = cv2.imread(image_path)
# NOTE(review): the imread result is used unchecked — confirm the path exists.
# Channels are swapped with COLOR_BGR2RGB before the array is handed to matplotlib.
plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

### 1.2 Grayscale

In [None]:
# Collapse the colour image loaded above to a single grayscale channel.
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# `gray` has one channel, so it is expanded with COLOR_GRAY2RGB for display;
# COLOR_BGR2RGB only accepts 3- or 4-channel input.
plt.imshow(cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB))

### 1.3 Thresholding

Convert the grayscale image to black-and-white.

In [None]:
# Binarise the grayscale image; THRESH_OTSU is combined with THRESH_BINARY.
thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
# `thresh` is single-channel, so it is expanded with COLOR_GRAY2RGB for display;
# COLOR_BGR2RGB only accepts 3- or 4-channel input.
plt.imshow(cv2.cvtColor(thresh, cv2.COLOR_GRAY2RGB))

### Inspect the pixel array of the black-and-white image

In [None]:
# Shrink the image to 8 % of its original size, binarise it and print the
# resulting array so the individual pixel values can be inspected.
scale_percent = 8
width = int(image.shape[1] * scale_percent / 100)
height = int(image.shape[0] * scale_percent / 100)
dim = (width, height)  # target size, passed to cv2.resize as (width, height)

# resize image
resized = cv2.resize(image, dim, interpolation = cv2.INTER_AREA)
# Same grayscale + Otsu steps as above, applied to the downsized copy.
gray_r = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
thresh_r = cv2.threshold(gray_r, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
print(thresh_r)

#### Create Function

In [None]:
def preprocessing(image_path):
    """Load an image, binarise it with Otsu's threshold and plot every stage.

    Three figures are shown: the original image, its grayscale version and
    the thresholded (black-and-white) result.

    Args:
        image_path: Path of the image file handed to ``cv2.imread``.

    Returns:
        The thresholded single-channel image.

    Raises:
        FileNotFoundError: If ``cv2.imread`` could not load ``image_path``.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports failure by returning None rather than raising.
        raise FileNotFoundError(f"Could not read image: {image_path!r}")

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]

    # `gray` and `thresh` are single-channel, so they are expanded with
    # COLOR_GRAY2RGB; COLOR_BGR2RGB only accepts 3- or 4-channel input.
    stages = (
        cv2.cvtColor(image, cv2.COLOR_BGR2RGB),
        cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB),
        cv2.cvtColor(thresh, cv2.COLOR_GRAY2RGB),
    )
    for stage in stages:
        plt.imshow(stage)
        plt.show()
    return thresh

In [None]:
# Run the preprocessing function on a sample image (it plots each stage)
# and keep the thresholded result.
a = preprocessing('Data/2_1.png')

## 2. Pytesseract    

### Custom Configuration

Page segmentation modes:

0. Orientation and script detection (OSD) only.
1. Automatic page segmentation with OSD.
2. Automatic page segmentation, but no OSD, or OCR. (not implemented)
3. Fully automatic page segmentation, but no OSD. (Default)
4. Assume a single column of text of variable sizes.
5. Assume a single uniform block of vertically aligned text.
6. Assume a single uniform block of text.
7. Treat the image as a single text line.
8. Treat the image as a single word.
9. Treat the image as a single word in a circle.
10. Treat the image as a single character.
11. Sparse text. Find as much text as possible in no particular order.
12. Sparse text with OSD.
13. Raw line. Treat the image as a single text line, bypassing hacks that are Tesseract-specific.


OCR Engine modes:

0. Legacy engine only.
1. Neural nets LSTM engine only.
2. Legacy + LSTM engines.
3. Default, based on what is available.

## 2.1 Pytesseract Image to String

In [None]:
def image_to_string(file_path, oem=3, psm=3, lang="ind"):
    """Run OCR on an image file and return the recognised text.

    The file is first passed through ``preprocessing`` (which also plots the
    intermediate images) and the thresholded result is handed to
    ``pytesseract.image_to_string``.

    Args:
        file_path: Path of the image to read.
        oem: Value for Tesseract's ``--oem`` option (OCR engine mode).
        psm: Value for Tesseract's ``--psm`` option (page segmentation mode).
        lang: Tesseract language code. Defaults to ``"ind"``, the value that
            was previously hard-coded.

    Returns:
        The value returned by ``pytesseract.image_to_string``.
    """
    bnw = preprocessing(file_path)
    custom_config = f"--oem {oem} --psm {psm}"
    return pytesseract.image_to_string(bnw, config=custom_config, lang=lang)

In [None]:
# OCR a sample image with the default oem/psm values and print the text.
text = image_to_string('Data/2_2.png')
print(text)

## 2.2 Pytesseract Image to Data

In [None]:
def image_to_data(file_path, oem=3, psm=3, lang="ind"):
    """Run OCR on an image file and return Tesseract's per-box data as a dict.

    The file is first passed through ``preprocessing`` (which also plots the
    intermediate images) and the thresholded result is handed to
    ``pytesseract.image_to_data`` with ``output_type="dict"``.

    Args:
        file_path: Path of the image to read.
        oem: Value for Tesseract's ``--oem`` option (OCR engine mode).
        psm: Value for Tesseract's ``--psm`` option (page segmentation mode).
        lang: Tesseract language code. Defaults to ``"ind"``, the value that
            was previously hard-coded.

    Returns:
        The dict returned by ``pytesseract.image_to_data``.
    """
    bnw = preprocessing(file_path)
    custom_config = f"--oem {oem} --psm {psm}"
    return pytesseract.image_to_data(
        bnw, output_type="dict", config=custom_config, lang=lang
    )

In [None]:
# Run image_to_data on a sample image; the result is requested as a dict.
data = image_to_data('Data/2_4.png')

In [None]:
# Show which fields the returned dict contains.
data.keys()

# 3. Postprocessing

## 3.1 Simple Postprocessing

In [None]:
def simple_postprocessing(text):
    """Flatten the line breaks of OCR output.

    One trailing newline is dropped, every blank-line separator (two
    consecutive newlines) becomes a single newline, and every remaining
    single newline becomes a space.

    Args:
        text: Raw OCR text. An empty string is returned unchanged instead of
            raising ``IndexError``.

    Returns:
        The cleaned-up text.
    """
    if not text:
        # Nothing was recognised; indexing text[-1] would fail here.
        return text

    if text.endswith('\n'):
        text = text[:-1]

    # Splitting on the separator (instead of swapping it for a tab sentinel
    # and back) keeps tab characters that are genuinely part of the text.
    paragraphs = text.split('\n\n')
    return '\n'.join(part.replace('\n', ' ') for part in paragraphs)

In [None]:
# OCR a sample image, flatten its line breaks and print the result.
ocr_text = image_to_string('Data/2_3.png')
result = simple_postprocessing(ocr_text)
print(result)

In [None]:
# Same OCR + postprocessing steps on a second sample image.
ocr_text = image_to_string('Data/2_4.png')
result = simple_postprocessing(ocr_text)
print(result)

## 3.2 Text Localization

Find the text coordinates and draw bounding boxes on the original image.

In [None]:
def text_localization(tesseract_results, image_path, confidence_threshold = 50):
    """Draw a green rectangle around every confident OCR detection.

    Args:
        tesseract_results: Dict with parallel ``"left"``, ``"top"``,
            ``"width"``, ``"height"`` and ``"conf"`` lists, as produced by
            ``image_to_data``.
        image_path: Path of the image the boxes are drawn on.
        confidence_threshold: Only entries whose confidence is strictly
            greater than this value get a box.

    Returns:
        The image (as loaded by ``cv2.imread``) with the boxes drawn on it.
        The annotated image is also displayed with matplotlib.

    Raises:
        FileNotFoundError: If ``cv2.imread`` could not load ``image_path``.
    """
    images = cv2.imread(image_path)
    if images is None:
        # cv2.imread reports failure by returning None rather than raising.
        raise FileNotFoundError(f"Could not read image: {image_path!r}")

    boxes = zip(
        tesseract_results["left"],
        tesseract_results["top"],
        tesseract_results["width"],
        tesseract_results["height"],
        tesseract_results["conf"],
    )
    for x, y, w, h, conf in boxes:
        # filter out weak confidence text localizations
        if float(conf) > confidence_threshold:
            cv2.rectangle(images, (x, y), (x + w, y + h), (0, 255, 0), 2)

    plt.imshow(cv2.cvtColor(images, cv2.COLOR_BGR2RGB))
    plt.show()

    return images

In [None]:
# Get the OCR data for a sample image and draw boxes for entries with confidence above 50.
results = image_to_data('Data/2_4.png')
bounding_images = text_localization(results, 'Data/2_4.png', confidence_threshold=50)

# Noisy Image

In [None]:
# OCR + postprocessing on the first noisy sample image.
text = image_to_string("Data/noise_01.jpg")
result = simple_postprocessing(text)
print(result)

In [None]:
# OCR + postprocessing on the second noisy sample image.
text = image_to_string("Data/noise_02.jpg")
result = simple_postprocessing(text)
print(result)

In [None]:
# OCR + postprocessing on one more sample image (Data/1_1.png).
text = image_to_string("Data/1_1.png")
result = simple_postprocessing(text)
print(result)

# Handle Simple Table Format

Run OCR on a simple table and save the result as CSV/Excel.

In [None]:
# Get the OCR data for the table image and draw boxes for entries with confidence above 50.
results = image_to_data('Data/table.png')
bounding_images = text_localization(results, 'Data/table.png', confidence_threshold=50)

In [None]:
# Print the plain-text OCR output of the table image for comparison.
a = image_to_string('Data/table.png')
print(a)

In [None]:
# Show which fields the OCR data dict contains.
results.keys()

#### Image-to-data output has coordinates for each word; use block_num and line_num to recover the entries of each table row and column.

In [None]:

def simple_table(results, threshold_gap = 400, confidence_threshold=70):
    """Rebuild a table from OCR data using Tesseract's block and line numbers.

    Entries are grouped by ``block_num`` and then by ``line_num``; each group
    becomes one table row. Within a row, a new cell is started whenever the
    horizontal gap between the right edge of one word and the left edge of
    the next exceeds ``threshold_gap``.

    Args:
        results: Dict with parallel ``'block_num'``, ``'line_num'``,
            ``'text'``, ``'left'``, ``'width'`` and ``'conf'`` lists, as
            produced by ``image_to_data``.
        threshold_gap: Minimum horizontal gap that separates two cells.
        confidence_threshold: A word is kept only if its confidence is
            strictly greater than this value. The last entry of every row is
            always kept, without a confidence check.

    Returns:
        A ``pandas.DataFrame`` with one row per recovered table row.
    """
    # Convert the parallel lists once so they can be fancy-indexed below.
    block_nums = np.array(results['block_num'])
    all_text = np.array(results['text'])
    all_left = np.array(results['left'])
    all_width = np.array(results['width'])
    all_conf = np.array(results['conf'])
    all_line = np.array(results['line_num'])

    result_list = []
    for pn in np.unique(block_nums):
        par_index = np.where(block_nums == pn)[0]
        par_text = all_text[par_index]
        par_left = all_left[par_index]
        par_width = all_width[par_index]
        par_conf = all_conf[par_index]
        par_line = all_line[par_index]

        for ln in np.unique(par_line):
            line_index = np.where(par_line == ln)[0]
            line_text = par_text[line_index]
            line_left = par_left[line_index]
            # Widths must come from par_width; reading them from par_line
            # made the gap test use line numbers as word widths.
            line_width = par_width[line_index]
            line_conf = par_conf[line_index]

            temp_result = [[]]
            for k in range(len(line_text) - 1):
                # A word is skipped when it, or the entry after it, is empty.
                if line_text[k] == '' or line_text[k + 1] == '':
                    continue

                end_k = line_left[k] + line_width[k]

                if float(line_conf[k]) > confidence_threshold:
                    temp_result[-1].append(line_text[k])

                # Only open a new cell once the current one holds something.
                if (line_left[k + 1] - end_k) > threshold_gap and temp_result[-1] != []:
                    temp_result.append([])

            # The final entry of the row is appended unconditionally.
            temp_result[-1].append(line_text[-1])

            temp_result = [' '.join(tp) for tp in temp_result]

            if temp_result != [''] and temp_result != [' ']:
                result_list.append(temp_result)

    df = pd.DataFrame(result_list)
    return df
                

In [None]:
# Table pipeline on table.png: OCR data, box overlay, then table reconstruction.
image_path = 'Data/table.png'
results = image_to_data(image_path)
bounding_images = text_localization(results, image_path, confidence_threshold=70)
df = simple_table(results, threshold_gap=400, confidence_threshold=70)
df  # trailing expression so the notebook renders the DataFrame

In [None]:
# Same table pipeline, applied to table2.png.
image_path = 'Data/table2.png'
results = image_to_data(image_path)
bounding_images = text_localization(results, image_path, confidence_threshold=70)
df = simple_table(results, threshold_gap=400, confidence_threshold=70)
df  # trailing expression so the notebook renders the DataFrame

In [None]:
# Same table pipeline, applied to table6.png.
image_path = 'Data/table6.png'
results = image_to_data(image_path)
bounding_images = text_localization(results, image_path, confidence_threshold=70)
df = simple_table(results, threshold_gap=400, confidence_threshold=70)
df  # trailing expression so the notebook renders the DataFrame

#### The simple_table function fails to parse table6.png, so try another method:
#### use the coordinates from the Tesseract result instead of the block and line numbers.

In [None]:
def simple_table2(results, threshold_gap = 20, confidence_threshold=70):
    """Rebuild a table from OCR data using the pixel coordinates of each word.

    Words whose ``top`` lies within 10 pixels of the first word of an existing
    row are added to that row; otherwise they start a new row. Within a row,
    a new cell is started whenever the horizontal gap between the right edge
    of one word and the left edge of the next exceeds ``threshold_gap``.

    Args:
        results: Dict with parallel ``'text'``, ``'top'``, ``'left'``,
            ``'width'`` and ``'conf'`` lists, as produced by ``image_to_data``.
        threshold_gap: Minimum horizontal gap that separates two cells.
        confidence_threshold: A word is kept only if its confidence is
            strictly greater than this value. The last word of every row is
            always kept, without a confidence check.

    Returns:
        A ``pandas.DataFrame`` with one row per recovered table row.
    """
    row_anchors = []  # 'top' of the first word seen in each row
    rows = []         # per row: (texts, lefts, widths, confs)

    for pos, word in enumerate(results['text']):
        if word in ('', ' '):
            continue

        word_top = results['top'][pos]

        # Look for the first existing row whose anchor is close enough.
        row_idx = None
        for candidate, anchor in enumerate(row_anchors):
            if np.absolute(anchor - word_top) < 10:
                row_idx = candidate
                break

        if row_idx is None:
            row_anchors.append(word_top)
            rows.append(([], [], [], []))
            row_idx = len(rows) - 1

        texts, lefts, widths, confs = rows[row_idx]
        texts.append(word)
        lefts.append(results['left'][pos])
        widths.append(results['width'][pos])
        confs.append(results['conf'][pos])

    table_rows = []
    for texts, lefts, widths, confs in rows:
        cells = [[]]
        for k in range(len(texts) - 1):
            if texts[k] == '' or texts[k + 1] == '':
                continue

            right_edge = lefts[k] + widths[k]

            if float(confs[k]) > confidence_threshold:
                cells[-1].append(texts[k])

            # Only open a new cell once the current one holds something.
            if (lefts[k + 1] - right_edge) > threshold_gap and cells[-1]:
                cells.append([])

        # The final word of the row is appended unconditionally.
        cells[-1].append(texts[-1])

        joined = [' '.join(cell) for cell in cells]
        if joined not in ([''], [' ']):
            table_rows.append(joined)

    return pd.DataFrame(table_rows)
        
    

In [None]:
# Retry table6.png with the coordinate-based simple_table2 function.
image_path = 'Data/table6.png'
results = image_to_data(image_path)
bounding_images = text_localization(results, image_path, confidence_threshold=70)
df = simple_table2(results, threshold_gap=20, confidence_threshold=70)
df  # trailing expression so the notebook renders the DataFrame

In [None]:
import os

# Create the output folder first so the write does not fail when it is missing.
os.makedirs('Results', exist_ok=True)
df.to_csv('Results/table.csv', index=None)

# Extract text in any box

In [None]:
# Load the sample image containing boxes and display it inline.
image = cv2.imread("Data/boxes.png")
# Channels are swapped with COLOR_BGR2RGB before the array is handed to matplotlib.
plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

## Box detection

In [None]:
''' source https://pyimagesearch.com/2015/04/20/sorting-contours-using-python-and-opencv/'''

def sort_contours(cnts, method="left-to-right"):
    """Sort contours by the position of their bounding boxes.

    Args:
        cnts: Sequence of contours accepted by ``cv2.boundingRect``.
        method: One of ``"left-to-right"``, ``"right-to-left"``,
            ``"top-to-bottom"`` or ``"bottom-to-top"``. Any other value
            behaves like ``"left-to-right"``.

    Returns:
        A ``(contours, boundingBoxes)`` pair of tuples in sorted order. Both
        tuples are empty when ``cnts`` is empty.
    """
    if len(cnts) == 0:
        # zip(*[]) yields nothing, so the two-name unpacking below would raise.
        return ((), ())

    reverse = method in ("right-to-left", "bottom-to-top")
    # Index into the bounding box used as sort key: 0 for x, 1 for y.
    axis = 1 if method in ("top-to-bottom", "bottom-to-top") else 0

    boundingBoxes = [cv2.boundingRect(c) for c in cnts]
    ordered = sorted(
        zip(cnts, boundingBoxes),
        key=lambda pair: pair[1][axis],
        reverse=reverse,
    )
    (cnts, boundingBoxes) = zip(*ordered)

    return (cnts, boundingBoxes)


#### Simple box detection using opencv.

In [None]:
import os

# Find box-shaped contours in the image and save each one as a cropped PNG.
image = cv2.imread("Data/boxes2.png")
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
thresh = cv2.threshold(gray,128,255,cv2.THRESH_BINARY|cv2.THRESH_OTSU)[1]

H, W, C = image.shape
cropped_dir_path = 'crop_image/'
# Create the output folder up front so the crops below are actually written.
os.makedirs(cropped_dir_path, exist_ok=True)

contours, hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
(contours, boundingBoxes) = sort_contours(contours, method="top-to-bottom")

idx = 0
for c in contours:
    x, y, w, h = cv2.boundingRect(c)
    # Both accepted shapes need more than 50 px per side and a height below
    # 90 % of the image height.
    if not (w > 50 and h > 50 and h < 0.9*H):
        continue
    # Wide box: at least 2.5 times wider than tall.
    wide_box = w >= 2.5*h
    # Squarish box: width between h and 2.5*h, and below 90 % of the image width.
    squarish_box = h <= w < 2.5*h and w < 0.9*W
    if wide_box or squarish_box:
        idx += 1
        new_img = image[y:y+h, x:x+w]
        cv2.imwrite(cropped_dir_path+str(idx) + '.png', new_img)

#### For more advanced box detection, try using cv2.getStructuringElement, cv2.HoughLinesP, etc.

for example: https://levelup.gitconnected.com/text-extraction-from-a-table-image-using-pytesseract-and-opencv-3342870691ae

## Detect Text in Each Box

In [None]:
# Find box-shaped contours in the image and print the OCR text of each one.
image = cv2.imread("Data/boxes2.png")
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
thresh = cv2.threshold(gray,128,255,cv2.THRESH_BINARY|cv2.THRESH_OTSU)[1]

H, W, C = image.shape
custom_config = r"--oem 3 --psm 3"

contours, hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
(contours, boundingBoxes) = sort_contours(contours, method="top-to-bottom")

idx = 0
for c in contours:
    x, y, w, h = cv2.boundingRect(c)
    # Both accepted shapes need more than 50 px per side and a height below
    # 90 % of the image height.
    if not (w > 50 and h > 50 and h < 0.9*H):
        continue
    # Wide box: at least 2.5 times wider than tall.
    wide_box = w >= 2.5*h
    # Squarish box: width between h and 2.5*h, and below 90 % of the image width.
    squarish_box = h <= w < 2.5*h and w < 0.9*W
    if not (wide_box or squarish_box):
        continue
    idx += 1
    # OCR runs on the thresholded crop, not on the colour image.
    new_thresh = thresh[y:y+h, x:x+w]
    text = pytesseract.image_to_string(new_thresh, config=custom_config, lang="ind")
    print(simple_postprocessing(text))

# Handle PDF Data

In [None]:
!pip install pdf2image

In [None]:
from pdf2image import convert_from_path

# Convert the PDF with pdf2image; `doc` is iterated page by page below.
doc = convert_from_path('Data/table.pdf')

for page_number, page_data in enumerate(doc):
    # Each iteration overwrites `image`, so after the loop only the array of
    # the last page remains.
    # NOTE(review): presumably these arrays are RGB, unlike cv2.imread's BGR
    # output used elsewhere in this notebook — confirm before reusing them.
    image = np.array(page_data)

# Handwriting Recognition

data: https://www.kaggle.com/datasets/landlord/handwriting-recognition

some code for handwriting recognition : https://www.kaggle.com/datasets/landlord/handwriting-recognition/code