In [21]:
!sudo apt install tesseract-ocr
!pip install pytesseract

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [23]:
from PIL import Image
import pytesseract
import re

# Function to clean and process OCR output
def clean_text(text):
    # Remove unwanted characters and symbols
    text = re.sub(r'[_\|\-—]', '', text)  # Remove underscores, pipes, dashes
    return text

# Function to dynamically group text into headings and lists
def extract_text_from_image(image_path):
    # Load the image
    img = Image.open(image_path)

    # Perform OCR to extract raw text
    raw_text = pytesseract.image_to_string(img)

    # Clean the raw text
    clean_raw_text = clean_text(raw_text)

    # Split the cleaned text into lines
    lines = [line.strip() for line in clean_raw_text.split('\n') if line.strip()]

    # Initialize dictionary to store organized text
    organized_output = {}

    # Heuristic patterns for headings and subheadings:
    # - Headings: Usually single line, capitalized, and short (e.g., "Hypothalamus")
    # - Subheadings: Typically follow the headings, often a list of items
    heading_pattern = re.compile(r'^[A-Z][a-z\s]+$|^[A-Z][a-z]+\s[A-Z][a-z]+$')  # Simple heuristic for headings
    current_key = None

    # Loop through lines to dynamically detect headings and associated subheadings (descriptions)
    for line in lines:
        # Check if the current line is a heading
        if heading_pattern.match(line) or re.match(r'^[A-Z ]+$', line):
            current_key = line
            organized_output[current_key] = []  # Initialize the heading with an empty list for subheadings
        else:
            # This is a subheading (description) related to the current heading
            if current_key:
                organized_output[current_key].append(line)

    return organized_output

# Example usage for the uploaded image
image_path = '/content/sample1.png'
organized_output = extract_text_from_image(image_path)

# Reorganizing the data to match a more structured dictionary format
# For example, we'll combine subheadings into a single string for each heading

final_output = {key: ', '.join(value) for key, value in organized_output.items()}

final_output


{'Represents strength and courage': 'Dharma Chakra (Wheel of Law)',
 'White': '',
 'Symbolizes peace and truth': '',
 'Green': 'Represents fertility, growth, and auspiciousness of the land'}