In [1]:
!pip install pytesseract

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13



[notice] A new release of pip is available: 23.1.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
import pytesseract
from PIL import Image
import pandas as pd
import string
import re

In [16]:
img_path = 'tanpa_keyword.jpg'

# Run OCR inside a context manager so the image's file handle is released
# as soon as the text has been extracted.
with Image.open(img_path) as img:
    ocr_text = pytesseract.image_to_string(img)

# Break the OCR text into words. str.split() with no arguments splits on any
# run of whitespace (newlines included) and never yields empty strings, so
# blank lines and padding disappear without a per-line loop.
output = ocr_text.split()

# Show the OCR result as a single space-separated line
print(" ".join(output))

Aqua, Stearic Acid, Niacinamide, Cyclopentasiloxane, Ethylhexy! Methoxycinnamate, Glycerin, Cyclohexasiloxane, Dimethicone, Propanediol, Butylene Glycol, Cetyl Alcohol, Palmitic Acid, Heptyl Glucoside, Phenoxyethanol, Octocrylene, Butyl Methoxydibenzoylmethane, Titanium Dioxide, Polyacrylate-13, Chlorphenesin, Arachidic Acid, Polyisobutene, Potassium Hydroxide, Allantoin, Tocopheryl Acetate, Polysorbate 20, Fragrance, Triethoxycaprylyisi- lane, Arbutin, Citric Acid, Sodium Sulfite, Aluminum Hydroxide, Actinidia Polygama Fruit Extract, Heptanol, Acetyl Tyrosine, Saxifraga Sarmentosa Extract, BHT, Paeonia Suffruticosa Root Extract, Aminopropyl Ascorby| Phosphate, Scutellaria Baicalensis Root Extract, Glutathione, Saccharomyces Lysate, Disodium Succinate, Ethylhexylglycerin, Glutamic Acid, Glycine, Threonine Valine


In [17]:
# Levenshtein distance function
def levenshtein_distance(s1, s2):
    """Return the Levenshtein edit distance between ``s1`` and ``s2``.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn one sequence into the other.
    Only two rows of the dynamic-programming table are held at a time, and
    the shorter input always provides the row width.

    Args:
        s1: First sequence (typically a string).
        s2: Second sequence (typically a string).

    Returns:
        The edit distance as a non-negative integer.
    """
    # The distance is symmetric, so put the longer sequence on the outer loop.
    longer, shorter = (s2, s1) if len(s1) < len(s2) else (s1, s2)

    if len(shorter) == 0:
        # Every element of the longer sequence has to be inserted.
        return len(longer)

    prev_row = list(range(len(shorter) + 1))
    for row_number, ch_long in enumerate(longer, start=1):
        curr_row = [row_number]
        for col, ch_short in enumerate(shorter):
            cost_insert = prev_row[col + 1] + 1
            cost_delete = curr_row[col] + 1
            # Comparing the elements adds 0 on a match and 1 on a mismatch.
            cost_replace = prev_row[col] + (ch_long != ch_short)
            curr_row.append(min(cost_insert, cost_delete, cost_replace))
        prev_row = curr_row

    return prev_row[-1]

# Closest match function for ingredients
def find_closest_match(part_cleaned, ingredients, threshold):
    """Find the ingredient with the smallest edit distance to ``part_cleaned``.

    Candidates whose length differs from ``part_cleaned`` by more than
    ``threshold`` are skipped without computing a distance. When several
    candidates share the smallest distance, the one that appears first in
    ``ingredients`` is kept.

    Args:
        part_cleaned: The already-cleaned text fragment to look up.
        ingredients: Iterable of candidate ingredient names.
        threshold: Maximum length difference for a candidate to be compared.

    Returns:
        A ``(closest_match, closest_distance)`` tuple. It is
        ``(None, float("inf"))`` when no candidate was compared.
    """
    best_name, best_score = None, float("inf")
    target_len = len(part_cleaned)

    for name in ingredients:
        # Cheap pre-filter: skip names whose length is too far from the target.
        if abs(target_len - len(name)) > threshold:
            continue

        score = levenshtein_distance(part_cleaned, name)
        if score < best_score:
            best_name, best_score = name, score

    return best_name, best_score

# Text preprocessing function
def preprocess_text(text):
    """Strip punctuation from ``text`` and normalize its whitespace.

    Every character of ``string.punctuation`` (which already includes the
    hyphen) is deleted. Runs of whitespace are then collapsed to a single
    space, and leading/trailing whitespace is dropped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    unwanted = set(string.punctuation + '-')
    kept = "".join(ch for ch in text if ch not in unwanted)
    # split() with no arguments discards all surrounding/extra whitespace
    return " ".join(kept.split())

# String matching function with OCR results
def process_ocr_text(ocr_text, ingredients, threshold=1):
    """Extract known ingredient names from comma-separated OCR text.

    If one of the header keywords ('komposisi', 'ingredients', 'ingredient')
    occurs as a whole word, only the text after the first keyword found (tried
    in that order) is considered; otherwise the whole text is used. The text
    is split on commas. For a segment of the form "a/b" only the first
    non-empty alternative is used. Each segment is cleaned with
    ``preprocess_text`` and kept when it is an exact member of ``ingredients``
    or when ``find_closest_match`` returns a candidate within ``threshold``
    edits. Segments that are empty before or after cleaning are ignored.

    Args:
        ocr_text: Raw text produced by OCR.
        ingredients: Known ingredient names. Segments are lower-cased before
            comparison, so the names should be lower-case as well.
        threshold: Maximum edit distance (and length difference) accepted for
            a fuzzy match.

    Returns:
        List of matched ingredient names in the order their segments occur.
        Duplicates are not removed.
    """
    detected_ingredients = []
    ingredients_set = set(ingredients)

    # Lower-case once and use the SAME string for searching and slicing, so
    # the keyword offset always refers to the text it was found in.
    text = ocr_text.lower()

    # Keywords that introduce the ingredient list on a label
    keywords = ['komposisi', 'ingredients', 'ingredient']
    start = 0
    for keyword in keywords:
        match = re.search(r'\b' + re.escape(keyword) + r'\b', text)
        if match:
            start = match.end()
            break

    for part in text[start:].split(","):
        # Resolve "a/b" alternatives BEFORE cleaning: preprocess_text removes
        # "/" together with the rest of the punctuation.
        first_alt = next(
            (alt.strip() for alt in part.split("/") if alt.strip()), ""
        )
        if not first_alt:
            # Empty segment (empty text, trailing or doubled comma).
            continue

        part_cleaned = preprocess_text(first_alt)
        if not part_cleaned:
            # Segment consisted of punctuation only; an empty string would
            # otherwise fuzzy-match every name no longer than `threshold`.
            continue

        # NOTE: cleaned segments contain no punctuation, so names in
        # `ingredients` that contain punctuation (e.g. a hyphen) can only be
        # reached through the fuzzy path below.
        if part_cleaned in ingredients_set:
            detected_ingredients.append(part_cleaned)
        else:
            closest_match, closest_distance = find_closest_match(
                part_cleaned, ingredients, threshold
            )
            if closest_distance <= threshold:
                detected_ingredients.append(closest_match)

    return detected_ingredients

# Read the reference ingredient table from disk
ingredients_path = "./ingredients.csv"
df = pd.read_csv(ingredients_path)

# Matching vocabulary: non-null entries of the 'nama' column, lower-cased
ingredients = (
    df['nama']
    .dropna()
    .str.lower()
    .tolist()
)

# Rebuild the OCR text from the word list produced by the OCR cell
ocr_text = " ".join(output)

# Match the OCR text against the vocabulary and show the detected names
result = process_ocr_text(ocr_text, ingredients)
print(result)

['aqua', 'stearic acid', 'niacinamide', 'cyclopentasiloxane', 'ethylhexyl methoxycinnamate', 'glycerin', 'cyclohexasiloxane', 'dimethicone', 'propanediol', 'butylene glycol', 'cetyl alcohol', 'palmitic acid', 'heptyl glucoside', 'phenoxyethanol', 'octocrylene', 'butyl methoxydibenzoylmethane', 'titanium dioxide', 'polyacrylate-3', 'chlorphenesin', 'arachidic acid', 'polyisobutene', 'potassium hydroxide', 'allantoin', 'tocopheryl acetate', 'polysorbate 20', 'fragrance', 'arbutin', 'citric acid', 'sodium sulfite', 'aluminum hydroxide', 'actinidia polygama fruit extract', 'heptanol', 'acetyl tyrosine', 'saxifraga sarmentosa extract', 'bht', 'aminopropyl ascorbyl phosphate', 'scutellaria baicalensis root extract', 'glutathione', 'saccharomyces lysate', 'ethylhexylglycerin', 'glutamic acid', 'glycine']
