<a target="_blank" href="https://colab.research.google.com/github/ricocf/Leaflet-Product-Classification/blob/main/text_model/OCR_pattern_extraction.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

# **Library Import and Setup**

In [None]:
!pip install openpyxl
!pip install pytesseract
!sudo apt-get install tesseract-ocr
!wget https://github.com/tesseract-ocr/tessdata/raw/main/deu.traineddata
!apt-get install tesseract-ocr-deu
!wget https://zenodo.org/record/7869954/files/products_leaflets_512.zip
!unzip products_leaflets_512.zip
import os
from pytesseract import Output
from tqdm import tqdm
import csv
import shutil
import string
import re
import openpyxl
from pytesseract import pytesseract as tess

# **Configuration Setup for Tesseract OCR**

In [None]:
# Directory Tesseract is told to load its language data from
temp_dir = '/usr/share/tesseract-ocr/4.00/tessdata'

# Make sure the directory exists (a no-op when it is already there)
os.makedirs(temp_dir, exist_ok=True)

# Set the TESSDATA_PREFIX environment variable
os.environ['TESSDATA_PREFIX'] = temp_dir

# Extra Tesseract options, passed to pytesseract through `config=`.
# Change the --psm value to try a different configuration.
# The string must start at column 0: an indented top-level assignment is an
# IndentationError.
custom_config ='''
  --oem 1
  --psm 11
  -l deu
  -c tessedit_char_blacklist=!@#$^""»«.&*->><<(^){}[]|\\
  '''

### **Regular Expression Patterns**

In [None]:
# Quantity followed by a unit: a number (optional ',' or '.' decimals), optional
# whitespace, then one of ml, W, L or l -- e.g. "250 ml", "1,5L".
pattern = r'(\d+(?:[,.]\d+)?\s*(?:ml|W|L|l))'
# Weight in g or kg (optional k/K before the g), optionally followed directly by
# "=<number>" -- e.g. "500g", "1kg=2,99".
pattern1 = r'(\d+(?:[,.]\d+)?[kK]?[g]=\d+(?:[,.]\d+)?|\d+(?:[,.]\d+)?[kK]?[g])'
# Captures "<digits>.g.<packaging word>". The surrounding .*? / .* consume the
# rest of the line, so at most one capture per line is returned.
# NOTE(review): this needs literal dots around the "g", but the OCR config
# blacklists '.', so the pattern may never fire on OCR text -- confirm.
pattern2 = r'.*?(\d+\d*\.g\.(?:pakung|pkg|pckg|pack|pkung|Flashe)).*'
# Percentage (optional comma decimals) immediately followed by letters,
# e.g. "30%Fett".
pattern3 = r'\b\d+(?:,\d+)?%[a-zA-Z]+\b'
# Comma-decimal amount, a euro sign, then letters -- e.g. "1,99€je".
pattern4 = r'(\d+,\d+€[a-zA-Z]+)'
# Captures the four characters in front of "fla" + one or two of s/h + "e".
# NOTE(review): [sh]{1,2} does not admit a 'c', so the spelling "flasche" is
# not matched -- confirm whether that is intended.
pattern5 = r'(.{4})fla[sh]{1,2}e'
# Stand-alone decimal that starts with "0," -- e.g. "0,33".
pattern6 = r'\b0,\d+\b'
# Multipack notation "<count>x<comma-decimal>" -- e.g. "6x0,33".
pattern7 = r'\d+x\d+,\d+'

### **OCR Processing and Pattern Matching**

In [None]:

# Root folder that holds one sub-folder per product class.
base_directory = '/content/products_leaflets_512/test'
# base_directory = '/content/products_leaflets_512/train'  # test and train have to be processed separately

# Create an Excel workbook
workbook = openpyxl.Workbook()
worksheet = workbook.active

# Write a header row to the Excel file
header = ["Image File", "Class Name", "Pattern 1 Result", "Pattern 2 Result", "Pattern 3 Result", "Pattern 4 Result", "Pattern 5 Result", "Pattern 6 Result","Pattern 7 Result"]
worksheet.append(header)

# Regexes compiled once, listed in the order of the result columns.
# Column naming is historical: "Pattern 1".."Pattern 5" hold the matches of
# `pattern`, `pattern1` .. `pattern4`, while "Pattern 6"/"Pattern 7" hold
# `pattern6`/`pattern7`. The first two are matched case-sensitively; the rest
# ignore case.
# NOTE(review): `pattern5` has never had a column in this sheet. It used to be
# scanned for every image and then discarded, so it is no longer evaluated.
# Confirm whether it should be exported before adding a column for it.
search_flags = re.IGNORECASE | re.MULTILINE
column_patterns = [
    re.compile(pattern),
    re.compile(pattern1),
    re.compile(pattern2, search_flags),
    re.compile(pattern3, search_flags),
    re.compile(pattern4, search_flags),
    re.compile(pattern6, search_flags),
    re.compile(pattern7, search_flags),
]

image_extensions = ('.png', '.jpg', '.jpeg')

# Walk every class folder; sorting makes the row order reproducible between runs
for class_folder in tqdm(sorted(os.listdir(base_directory))):
    class_path = os.path.join(base_directory, class_folder)

    # Anything that is not a class directory is ignored
    if not os.path.isdir(class_path):
        continue

    for image_file in sorted(os.listdir(class_path)):
        if not image_file.lower().endswith(image_extensions):
            continue

        image_path = os.path.join(class_path, image_file)
        ocr_result = tess.image_to_string(image_path, lang='deu', config=custom_config)

        # One cell per pattern: all matches joined by a space. Joining an empty
        # match list already yields "", so no separate "no match" branch is needed.
        pattern_results = [' '.join(regex.findall(ocr_result)) for regex in column_patterns]

        # Write the results to the Excel file
        worksheet.append([image_file, class_folder, *pattern_results])

# Save the Excel workbook to a file
excel_filename = "ocr_results_pattern_psm_test11rgb.xlsx"
workbook.save(excel_filename)

print(f"Results written to {excel_filename}")


In [None]:
# Saves the workbook object from the previous cell a second time, under a
# different file name.
# NOTE(review): the name says "psm_6 ... threaded", while custom_config sets
# --psm 11 -- confirm the file name matches the run it is meant to label.
excel_filename = "ocr_results_pattern_psm_6rgb_threaded.xlsx"
workbook.save(excel_filename)
print(f"Results written to {excel_filename}")
