<a target="_blank" href="https://colab.research.google.com/github/ricocf/Leaflet-Product-Classification/blob/main/text_model/OCR_text_extraction.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

# **Library Import and Setup**

In [None]:
!pip install pytesseract
!sudo apt-get install tesseract-ocr
!wget https://github.com/tesseract-ocr/tessdata/raw/main/deu.traineddata
!apt-get install tesseract-ocr-deu
!wget https://zenodo.org/record/7869954/files/products_leaflets_512.zip
!unzip products_leaflets_512.zip
import csv
import os
import re
import shutil
from concurrent.futures import ThreadPoolExecutor

import cv2
import openpyxl
import pandas as pd
import pytesseract
import tqdm
from PIL import Image
from tqdm import tqdm

# **Configuration Setup for Tesseract**

In [None]:
# Directory that TESSDATA_PREFIX is pointed at so Tesseract looks for its
# language data there.
# NOTE(review): the path hard-codes the Tesseract 4.00 layout. If the installed
# version uses a different directory, this creates an empty folder and points
# Tesseract at it — confirm against the installed package.
temp_dir = '/usr/share/tesseract-ocr/4.00/tessdata'

# exist_ok=True replaces the separate exists() check, which was race-prone.
os.makedirs(temp_dir, exist_ok=True)
os.environ['TESSDATA_PREFIX'] = temp_dir
# Tesseract configurations for different page segmentation modes (PSM 3, 6, 11, 12) for RGB images
def textextraction_psm6_RGB(image_file):
  """Run German-language OCR on an image using page segmentation mode 6.

  The image is handed to pytesseract as given, without any colour
  conversion.

  Args:
    image_file: Image to recognise; callers in this file pass a file path.

  Returns:
    The text produced by ``pytesseract.image_to_string``.
  """
  # Extra command-line options: engine mode 1, PSM 6, German language data,
  # and a blacklist of punctuation characters to drop from the output.
  # NOTE(review): pytesseract is believed to shlex-split this string, in which
  # case the `""` pair and the trailing backslash act as shell quoting instead
  # of being blacklisted — confirm before relying on them.
  ocr_options = '''
  --oem 1
  --psm 6
  -l deu
  -c tessedit_char_blacklist=!@#$^""»«.&*->><<(^){}[]|\\
  '''
  return pytesseract.image_to_string(image_file, config=ocr_options, lang='deu')


# **Functions for Text Extraction with Different Configurations**

**textextraction_psm12_RGB**

In [None]:
def textextraction_psm12_RGB(image_file):
    """Run German-language OCR on an image using page segmentation mode 12.

    The image is handed to pytesseract as given, without any colour
    conversion. ``TESSDATA_PREFIX`` is (re)set to ``temp_dir`` on every call.

    Args:
        image_file: Image to recognise; callers in this file pass a file path.

    Returns:
        The text produced by ``pytesseract.image_to_string``.
    """
    # Extra command-line options: engine mode 1, PSM 12, German language data,
    # and a blacklist of punctuation characters to drop from the output.
    # NOTE(review): pytesseract is believed to shlex-split this string, in
    # which case the `""` pair and the trailing backslash act as shell quoting
    # instead of being blacklisted — confirm before relying on them.
    ocr_options = '''
    --oem 1
    --psm 12
    -l deu
    -c tessedit_char_blacklist=!@#$^""»«.&*->><<(^){}[]|\\
    '''
    os.environ['TESSDATA_PREFIX'] = temp_dir
    return pytesseract.image_to_string(image_file, config=ocr_options, lang='deu')

**textextraction_psm6_Gray**

In [None]:
def textextraction_psm6_Gray(image_file):
  """Run German-language OCR on a grayscale copy of an image, using PSM 6.

  The file is loaded with OpenCV, converted from BGR to grayscale, and the
  grayscale array is passed to pytesseract.

  Args:
    image_file: Path of the image file to load.

  Returns:
    The text produced by ``pytesseract.image_to_string``.

  Raises:
    OSError: If OpenCV cannot read or decode ``image_file``.
  """
  # Extra command-line options: engine mode 1, PSM 6, German language data,
  # and a blacklist of punctuation characters to drop from the output.
  custom_config ='''
  --oem 1
  --psm 6
  -l deu
  -c tessedit_char_blacklist=!@#$^""»«.&*->><<(^){}[]|\\
  '''
  image = cv2.imread(image_file)
  # cv2.imread signals failure by returning None rather than raising; without
  # this check the error surfaces as an opaque assertion inside cvtColor.
  if image is None:
    raise OSError(f"cv2.imread could not read or decode image: {image_file!r}")
  gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
  text_extracted=pytesseract.image_to_string(gray_image,lang='deu',config=custom_config)
  return text_extracted

**textextraction_psm12_Gray**

In [None]:
def textextraction_psm12_Gray(image_file):
    """Run German-language OCR on a grayscale copy of an image, using PSM 12.

    The file is loaded with OpenCV, converted from BGR to grayscale, and the
    grayscale array is passed to pytesseract. ``TESSDATA_PREFIX`` is (re)set
    to ``temp_dir`` on every call.

    Args:
        image_file: Path of the image file to load.

    Returns:
        The text produced by ``pytesseract.image_to_string``.

    Raises:
        OSError: If OpenCV cannot read or decode ``image_file``.
    """
    os.environ['TESSDATA_PREFIX'] = temp_dir
    # Extra command-line options: engine mode 1, PSM 12, German language data,
    # and a blacklist of punctuation characters to drop from the output.
    custom_config ='''
    --oem 1
    --psm 12
    -l deu
    -c tessedit_char_blacklist=!@#$^""»«.&*->><<(^){}[]|\\
    '''
    image = cv2.imread(image_file)
    # cv2.imread signals failure by returning None rather than raising; without
    # this check the error surfaces as an opaque assertion inside cvtColor.
    if image is None:
        raise OSError(f"cv2.imread could not read or decode image: {image_file!r}")
    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # OCR runs once: the original repeated this identical call and threw the
    # first result away, doubling the cost per image.
    text_extracted=pytesseract.image_to_string(gray_image,lang='deu',config=custom_config)
    return text_extracted

# **Multithreaded Image Processing and Results**

In [None]:
# Root folder of one dataset split; each sub-folder name is used as the class
# label of the images inside it.
main_folder = '/content/products_leaflets_512/test'
# The test and train splits have to be processed separately: point main_folder
# at the other split (and change csv_filename) for the second run.
csv_filename = "ocr_result_test.csv"
header = ["Image File", "PSM6 RGB","PSM6 Gray","PSM12 RGB","PSM 12 Gray","Class Name"]


def process_image(image_file, folder_name):
    """Run all four OCR variants on one image and build its CSV row.

    Args:
        image_file: File name of the image inside ``folder_name``.
        folder_name: Class sub-folder of ``main_folder`` holding the image.

    Returns:
        A list matching ``header``: the file name, the four OCR texts and the
        class name. Returns ``None`` if any OCR step fails, after printing
        the reason, so one bad image does not abort the whole run.
    """
    image_path = os.path.join(main_folder, folder_name, image_file)
    try:
        return [
            image_file,
            textextraction_psm6_RGB(image_path),
            textextraction_psm6_Gray(image_path),
            textextraction_psm12_RGB(image_path),
            textextraction_psm12_Gray(image_path),
            folder_name,
        ]
    except Exception as exc:
        # Worker boundary: report the failure instead of letting the executor
        # swallow it silently (the map results were previously never read).
        print(f"Skipping {image_path}: {exc!r}")
        return None


# utf-8 is set explicitly because the OCR output is German text and the
# platform default encoding may not be able to represent it.
with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(header)

    for folder_name in tqdm(os.listdir(main_folder)):
        folder_path = os.path.join(main_folder, folder_name)
        if not os.path.isdir(folder_path):
            continue

        image_files_to_process = [
            f for f in os.listdir(folder_path)
            if f.lower().endswith(('.png', '.jpg', '.jpeg'))
        ]

        with ThreadPoolExecutor() as executor:
            # OCR runs in worker threads; rows are written here in the main
            # thread so the csv writer is never shared between threads.
            rows = executor.map(
                process_image,
                image_files_to_process,
                [folder_name] * len(image_files_to_process),
            )
            for row in rows:
                if row is not None:
                    csv_writer.writerow(row)

print(f"Results written to {csv_filename}")
