In [16]:
from PIL import Image, ImageOps

# Pixels darker than this gray level become black; everything else becomes white.
BINARIZE_THRESHOLD = 128

# Open the saved image (or use the image object directly from pdf2image)
img = Image.open('image1.jpg')

# Convert to grayscale, then threshold to a 1-bit black-and-white image.
gray_img = ImageOps.grayscale(img)
bw_img = gray_img.point(lambda x: 0 if x < BINARIZE_THRESHOLD else 255, '1')

# (Optional) Save the preprocessed image for verification.
# Written to a separate file so the source image is never overwritten:
# re-running this cell always starts from the untouched original instead of
# re-thresholding an already processed (and JPEG-recompressed) copy.
# PNG is lossless, so the saved file shows exactly the pixels passed to OCR.
bw_img.save('image1_bw.png')


In [17]:
import pytesseract

# Tesseract command-line options, assembled from their individual flags.
# NOTE(review): per Tesseract's CLI help, `--oem 3` selects the default OCR
# engine and `--psm 6` assumes a single uniform block of text — confirm this
# suits the label layout being scanned.
custom_config = ' '.join(('--oem 3', '--psm 6'))

# Run OCR on the black-and-white image produced by the preprocessing cell.
extracted_text = pytesseract.image_to_string(bw_img, config=custom_config)

# Show the raw OCR result so it can be inspected before further processing.
print(f"Extracted Text: {extracted_text}")


Extracted Text: Nutrition Facts
8 servings per container
Serving size 2/3 cup (55g)
ee
Amount per serving
Calories 230
% Daily Value*
Total Fat 8g 10%
Saturated Fat 1g 5%
Trang Fat 0g
Cholesterol Omg 0%
Sodium 160mg 7%
Total Carbohydrate 37g 13%
Dietary Fiber 4g 14%
Total Sugars 12g
Includes 10g Added Sugars 20%
Protein 3g
ee
Vitamin D 2meg 10%
Calcium 260mg 20%
(ron 8mg 45%
Potassium 240mg 6%
a serving of food contributes to a daily diet. 2,000 calories
a day is used for general nutrition advice
(For educationat purposes only. These labels do not meet
the labeling requirements described in 21 CFR 101.9.)



In [2]:
import re

def check_allergies_extended(product_text: str, user_allergies: list) -> str:
    """
    Checks if the product text contains any allergens based on user input,
    including synonyms for each allergen.

    Matching is case-insensitive and done on whole words, with an optional
    plural ending ("s"/"es") so that a keyword such as "peanut" also matches
    "peanuts" on the label. Substrings inside longer words are still ignored
    (e.g. "egg" does not match "eggplant").

    Args:
        product_text (str): The extracted nutrition label text.
        user_allergies (list of str): List of allergens selected by the user
                                      (e.g., ['dairy', 'peanuts', 'gluten']).
                                      Surrounding whitespace and case are
                                      ignored, blank entries are skipped, and
                                      repeated entries are reported once.
                                      Singular/plural forms of a known
                                      category ("peanut", "eggs") resolve to
                                      that category's synonym list; unknown
                                      allergens are searched for literally.

    Returns:
        str: A warning message if potential allergens are found; otherwise,
             a message indicating no allergens detected.
    """
    # Normalize the product text to lowercase.
    normalized_text = product_text.lower()

    # Mapping from generic allergen names to common synonyms/keywords.
    allergy_synonyms = {
        "dairy": ["milk", "cheese", "butter", "curd", "yogurt", "cream", "whey", "casein"],
        "peanuts": ["peanut", "groundnut", "arachis"],
        "gluten": ["wheat", "barley", "rye", "oats", "gluten"],
        "soy": ["soy", "soybean", "tofu", "edamame", "soymilk"],
        "tree nuts": ["almond", "cashew", "walnut", "pecan", "hazelnut", "pistachio", "macadamia"],
        "egg": ["egg", "albumin"],
        # Add additional allergens and synonyms as needed.
    }

    found_allergens = []
    seen_categories = set()

    for allergy in user_allergies:
        label = allergy.strip()
        key = label.lower()

        # A blank entry would build the pattern r'\b\b', which matches any
        # text and would produce a warning that names no allergen.
        if not key:
            continue

        # Resolve the entry to a table key, tolerating singular/plural
        # differences ("peanut" -> "peanuts", "eggs" -> "egg").
        candidates = [key, key + "s"]
        if key.endswith("s"):
            candidates.append(key[:-1])
        category = next((c for c in candidates if c in allergy_synonyms), key)

        # Report each allergen category at most once.
        if category in seen_categories:
            continue
        seen_categories.add(category)

        keywords = allergy_synonyms.get(category, [category])

        # One alternation per category. Word boundaries avoid partial matches;
        # the optional "s"/"es" suffix lets singular keywords match the plural
        # forms that ingredient lists normally use.
        pattern = (
            r'\b(?:'
            + '|'.join(re.escape(keyword) for keyword in keywords)
            + r')(?:e?s)?\b'
        )
        if re.search(pattern, normalized_text):
            found_allergens.append(label)

    if found_allergens:
        return f"WARNING: The product contains {', '.join(found_allergens)} which may cause an allergic reaction."
    else:
        return "No allergens detected in the product."
