In [5]:
# System packages: the Tesseract OCR engine plus its development files.
# apt-get (rather than apt) is the interface intended for scripted use, and
# -y answers the confirmation prompt so the cell cannot stall waiting for input.
!apt-get install -y tesseract-ocr libtesseract-dev
# Python side: pytesseract wraps the tesseract binary, Pillow loads the images.
# %pip (not !pip) installs into the environment of the running kernel.
%pip install pytesseract pillow

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 9 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 1s (4,793 kB/s)
Selecting previously unselected package tesseract-ocr-eng.
(Reading database ... 120493 files and directories currently installed.)
Preparing to unpack .../tesseract-ocr-e

In [42]:
import pytesseract
from PIL import Image
import re
import os
import pandas as pd

# Mount Google Drive at /content/drive; the image folder and the Excel output
# used further down in this cell are both paths under that mount point.
from google.colab import drive
drive.mount('/content/drive')

# Header line: "WORD OF THE DAY", then the word, then the part of speech in parentheses.
_WORD_POS_RE = re.compile(r'WORD OF THE DAY\s*([^\n(]+)\s*\(([^)]+)\)')
# Definition: text following a parenthesised part of speech (any POS, not only
# "noun") up to the example marker or the footer line.
_DEFINITION_RE = re.compile(r'\([^)]+\)\s(.*?)(?=Eg:|WORD OF THE DAY FOR)', re.DOTALL)
# Example sentence: text between "Eg:" and the footer line.
_EXAMPLE_RE = re.compile(r'Eg:(.*?)(?=WORD OF THE DAY FOR)', re.DOTALL)
# Checked in this order; the first keyword found in the upper-cased text wins.
_LEVEL_KEYWORDS = (
    ("BEGINNER", "Beginner"),
    ("ADVANCE", "Advance"),
    ("INTERMEDIATE", "Intermediate"),
)


def _collapse_whitespace(value):
    """Join all whitespace runs in ``value`` into single spaces; pass ``None`` through."""
    if value is None:
        return None
    return " ".join(value.split())


def _extract_info_from_text(text):
    """Parse OCR text into ``(word, part_of_speech, definition, example, level)``.

    Any field that cannot be located in ``text`` is returned as ``None``.
    """
    word = part_of_speech = definition = example = level = None

    header_match = _WORD_POS_RE.search(text)
    definition_match = None
    if header_match:
        word = header_match.group(1).strip()
        part_of_speech = header_match.group(2).strip()
        # Anchor on the opening "(" of the part of speech found in the header,
        # so the definition is read from right after that same parenthesis.
        definition_match = _DEFINITION_RE.match(text, header_match.start(2) - 1)
    if definition_match is None:
        # Header missing or not directly followed by a definition: fall back
        # to the first parenthesised tag anywhere in the text.
        definition_match = _DEFINITION_RE.search(text)
    if definition_match:
        definition = definition_match.group(1).strip()

    example_match = _EXAMPLE_RE.search(text)
    if example_match:
        example = example_match.group(1).strip()

    upper_text = text.upper()
    for keyword, label in _LEVEL_KEYWORDS:
        if keyword in upper_text:
            level = label
            break

    return word, part_of_speech, definition, example, level


def process_image(image_path):
    """OCR one image and extract its vocabulary fields.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A tuple ``(word, part_of_speech, definition, example, level)``.
        ``definition`` and ``example`` have their whitespace collapsed to
        single spaces. Every field that could not be found in the recognised
        text is ``None`` (previously a missing definition or example raised
        ``AttributeError``).
    """
    # Context manager closes the underlying file handle once OCR is done.
    with Image.open(image_path) as img:
        extracted_text = pytesseract.image_to_string(img)

    word, part_of_speech, definition, example, level = _extract_info_from_text(extracted_text)

    return (
        word,
        part_of_speech,
        _collapse_whitespace(definition),
        _collapse_whitespace(example),
        level,
    )

# Folder on the mounted Drive that holds the PNG images to OCR.
images_directory = '/content/drive/MyDrive/Ed-Tech Automation'
# The workbook is written into the same folder; derive the path from
# images_directory so the two cannot drift apart.
output_excel_file = os.path.join(images_directory, 'output.xlsx')

# One list per output column, filled in lockstep (one entry per image).
words, parts_of_speech, definitions, examples, levels = [], [], [], [], []

# os.listdir() returns names in arbitrary order; sorting makes the row order
# of the resulting sheet reproducible from run to run.
for filename in sorted(os.listdir(images_directory)):
    # Only PNG files are processed (case-insensitive extension check); this
    # also skips the output.xlsx that lives in the same folder.
    if not filename.lower().endswith('.png'):
        continue
    image_path = os.path.join(images_directory, filename)
    word, pos, definition, example, level = process_image(image_path)
    words.append(word)
    parts_of_speech.append(pos)
    definitions.append(definition)
    examples.append(example)
    levels.append(level)

# Column header -> column values for the output sheet.
data = {
    'Word': words,
    'Part of Speech (POS)': parts_of_speech,
    'Definition': definitions,
    'Example': examples,
    'Level': levels
}

df = pd.DataFrame(data)

# index=False: do not write the DataFrame's row index as a column.
df.to_excel(output_excel_file, index=False)

print("Data saved to Excel successfully.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Data saved to Excel successfully.
