In [3]:
import pytesseract
from PIL import Image
import cv2
import os
import re
import pandas as pd

# Point pytesseract at the Tesseract executable. This is a hard-coded Windows
# path; change it to match where Tesseract is installed on the machine that
# runs this notebook.
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

In [66]:
# Function for image preprocessing
def preprocess_image(image_path):
    """Load an image from disk and binarize it for OCR.

    The file is read with OpenCV, converted from BGR to grayscale, and
    thresholded with Otsu's method combined with inverted binary output.

    Args:
        image_path: Path of the image file to load.

    Returns:
        A PIL ``Image`` built from the thresholded array.

    Raises:
        ValueError: If OpenCV cannot read or decode ``image_path``.
    """
    image = cv2.imread(image_path)
    # cv2.imread reports failure by returning None rather than raising; without
    # this check the failure shows up later as an opaque error inside cvtColor.
    if image is None:
        raise ValueError(f"Could not read image file: {image_path!r}")

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Only the thresholded image is needed; the first return value is discarded.
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    return Image.fromarray(binary)

# Function to extract nutrient information from an image using OCR
def extract_nutrients_from_image(image_path):
    """Run OCR on a nutrition-label image and pull out nutrient readings.

    Args:
        image_path: Path of the image file to process.

    Returns:
        A dict mapping a normalized nutrient name (lowercase, whitespace
        replaced by underscores, e.g. ``"total_fat"``) to a string holding the
        value immediately followed by its unit (e.g. ``"160mg"``). If a
        nutrient is matched more than once, the readings are joined with a
        single space in the order they were found (e.g. ``"8g 3g"``).
    """
    # Preprocess the image
    preprocessed_image = preprocess_image(image_path)

    # Perform OCR on the preprocessed image with additional configuration options
    custom_config = r'--oem 3 --psm 6 -c lstm_choice_mode=2'
    ocr_result = pytesseract.image_to_string(preprocessed_image, lang='eng', config=custom_config)

    return _parse_nutrient_text(ocr_result)


def _parse_nutrient_text(ocr_text):
    """Extract ``{nutrient_name: "value+unit"}`` pairs from raw OCR text."""
    # Only horizontal whitespace is allowed between the number and its unit.
    # Allowing newlines there lets the case-insensitive unit group swallow the
    # label on the next line ("Calcium 10\nIron 0" would give unit "Iron" and
    # drop the Iron reading). A trailing "%" is not captured as a unit.
    nutrient_pattern = r"(?i)(Total\s+Fat|Protein|Iron|Calcium|Sodium|Cholesterol)\s+(\d+(?:\.\d+)?)[ \t]*([a-z]*)"

    nutrient_values = {}
    for header, value, unit in re.findall(nutrient_pattern, ocr_text):
        # The header may contain tabs, newlines or repeated spaces (it is
        # matched with \s+), so collapse any whitespace run into one underscore
        # to keep the key stable across images.
        key = re.sub(r"\s+", "_", header.lower())

        reading = f"{value}{unit}"
        # Consolidate the nutrient values if the header is already present
        if key in nutrient_values:
            reading = f"{nutrient_values[key]} {reading}"
        nutrient_values[key] = reading

    return nutrient_values


# Function to create an output DataFrame with nutrient information from all images in the directory
def create_output_dataframe(images_folder):
    """Build a table of OCR-extracted nutrient readings, one row per image.

    Every regular file directly inside ``images_folder`` is processed in sorted
    name order so the row order is reproducible (``os.listdir`` makes no
    ordering promise). Sub-directories are ignored.

    A file that fails with ``cv2.error`` or ``ValueError`` while being
    processed (for example a non-image file sitting in the folder) is skipped
    with a warning instead of aborting the whole batch.

    Args:
        images_folder: Path of the directory containing the label images.

    Returns:
        A ``pandas.DataFrame`` with a ``"File Name"`` column plus one column per
        nutrient key found in any image; nutrients missing for an image are NaN.
    """
    import warnings

    # Get a sorted list of regular files in the specified folder
    image_files = sorted(
        entry
        for entry in os.listdir(images_folder)
        if os.path.isfile(os.path.join(images_folder, entry))
    )

    # Collect one record per successfully processed image
    nutrient_info_list = []
    for image_file in image_files:
        image_path = os.path.join(images_folder, image_file)
        try:
            nutrients = extract_nutrients_from_image(image_path)
        except (cv2.error, ValueError) as exc:
            warnings.warn(f"Skipping {image_file!r}: could not process it as an image ({exc})")
            continue

        nutrient_info_list.append({"File Name": image_file, **nutrients})

    # Create a DataFrame from the list of nutrient information
    return pd.DataFrame(nutrient_info_list)

# Main function
if __name__ == "__main__":
    # Folder holding the food packet images to scan
    images_folder = r"G:\My Drive\GitHub\img"

    # Build the per-image nutrient table, then show it under a heading line
    output_df = create_output_dataframe(images_folder)
    print("Output DataFrame:", output_df, sep="\n")


Output DataFrame:
  File Name total_fat cholesterol sodium protein calcium iron
0     1.jpg         8         0mg  160mg      3g   260mg  8mg
1     2.jpg       12g         NaN   70mg      5g      10    0
2     3.jpg       15g        30mg  650mg      5g      15  NaN
3     4.jpg       129         8mg  210mg     11g   210mg  4mg
