In [10]:
import pandas as pd
import pytesseract
from PIL import Image
import os
import re

In [11]:
# Path to the Tesseract executable used by pytesseract.
# Defaults to the standard Windows install location; set the TESSERACT_CMD
# environment variable to point at a different binary without editing this cell.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD", r"C:\Program Files\Tesseract-OCR\tesseract.exe"
)

In [12]:
def extract_text_from_image(image_path):
    """Run Tesseract OCR on an image file and return the recognised text.

    Parameters
    ----------
    image_path : str or path-like
        Location of the image to read.

    Returns
    -------
    str
        The text returned by ``pytesseract.image_to_string``. An empty string
        is returned (and a message printed) when the file does not exist or
        when any other error occurs while opening or processing the image.
    """
    try:
        # The context manager closes the image's file handle as soon as OCR
        # is done instead of leaving it open until garbage collection.
        with Image.open(image_path) as image:
            return pytesseract.image_to_string(image)
    except FileNotFoundError:
        print(f"File not found: {image_path}")
        return ""
    except Exception as e:
        # Any other failure is reported and mapped to an empty result.
        print(f"Error processing {image_path}: {e}")
        return ""

def is_image(file_path):
    """Return True when ``file_path`` ends with a recognised image extension.

    The check is case-insensitive and looks only at the end of the string;
    the file itself is never opened.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.gif', '.bmp')
    lowered = file_path.lower()
    return any(lowered.endswith(suffix) for suffix in image_suffixes)

In [13]:
def clean_ocr_text(text):
    """Normalise raw OCR output into a compact, ASCII-only string.

    Parameters
    ----------
    text : str
        Raw OCR text. Any non-string value (``None``, ``NaN``, ``pd.NA``, ...)
        is treated as "no text".

    Returns
    -------
    str
        The cleaned text; ``""`` when the input is empty or not a string.

    Notes
    -----
    Steps, in order:

    1. Keep only ASCII letters, digits, ``. , ! ? ' -`` and whitespace.
       This already strips emojis and every other non-ASCII symbol, so no
       separate emoji pass is needed afterwards.
    2. Replace a standalone ``I`` with ``1``.
    3. Remove whitespace before punctuation and collapse whitespace after it.
    4. Collapse all whitespace runs to a single space and trim the ends.
    5. Drop tokens of one or two characters.
    6. Remove hyphens that sit directly between two word characters.
    """
    # Cells that hold no text (e.g. NaN after a round-trip through Excel)
    # would make re.sub raise TypeError; map them to an empty result instead.
    if not isinstance(text, str):
        return ""

    # Whitelist pass: everything outside the allowed character set is removed.
    text = re.sub(r"[^a-zA-Z0-9.,!?'\s\-]", "", text)

    # Standalone 'I' -> '1'. Note this also rewrites the pronoun, so a
    # contraction such as "I'm" comes out as "1'm".
    text = re.sub(r"\bI\b", "1", text)

    # Normalise spacing around punctuation.
    text = re.sub(r"\s+([.,!?'])", r"\1", text)   # no whitespace before punctuation
    text = re.sub(r"([.,!?'])\s+", r"\1 ", text)  # exactly one space after it

    # Collapse spaces, tabs and newlines.
    text = re.sub(r"\s+", " ", text).strip()

    # Drop very short tokens. The length test alone covers single-character
    # tokens such as 'o', 'l' or 'i', so no extra stop-list is required.
    text = " ".join(word for word in text.split() if len(word) > 2)

    # Join words that were split with a hyphen ("test-case" -> "testcase").
    text = re.sub(r"(?<=\w)-(?=\w)", "", text)

    return text

In [16]:
# os.getcwd() is the process's current working directory (not necessarily the
# folder containing this notebook); the output folder is its sibling "output".
base_dir = os.getcwd()
output_dir = os.path.join(base_dir, os.pardir, "output")

# Read the preprocessed workbook into a DataFrame.
file_path = os.path.join(output_dir, "preprocessed_data.xlsx")
df = pd.read_excel(io=file_path)

In [6]:
# OCR every row whose 'Media File' value names an image; rows with a missing
# value or a non-image path get an empty string.
def _ocr_if_image(media_path):
    if pd.isna(media_path) or not is_image(media_path):
        return ""
    return extract_text_from_image(media_path)

df['Extracted_Text_from_Image'] = df['Media File'].apply(_ocr_if_image)

File not found: D:\media\ILTVnews_2254 (2).jpg


In [7]:
# Preview up to five random rows. The fixed seed makes the displayed sample
# reproducible across runs, and capping n avoids a ValueError when the frame
# holds fewer than five rows.
df.sample(n=min(5, len(df)), random_state=42)

Unnamed: 0,Message_ID,Date,Message,Text_Comments,Text_Positive_Reactions,Text_Negative_Reactions,Media File,Media_Comments,Media_Positive_Reactions,Media_Negative_Reactions,Extracted_Text_from_Image
1937,12766,2024-11-27 02:04:15,,,0,0,,,0,0,
412,13530,2025-01-28 18:58:54,Lebanese media report an Israeli airstrike in ...,,7,48,D:\media\IsraelWarLive_13530.mp4,,7,48,
1808,12615,2024-11-24 04:35:10,"IDF: A short while ago, sirens regarding a hos...",,1,7,,,0,0,
1719,12492,2024-11-17 06:15:04,Footage posted by Lebanese media shows one of ...,['its mins they know where is the hizbollah!'],97,9,,,0,0,
110,13242,2025-01-16 19:02:17,Appears following Netanyahu's call\nBen Gvir's...,"['also Trump togheter with biden', 'If he agre...",0,5,,,0,0,


In [8]:
# Clean the raw OCR text into 'Cleaned_Extracted_Text', then drop the raw column.
# The guard makes the cell safe to re-run: once the raw column has been
# removed, a second execution does nothing instead of raising KeyError.
if 'Extracted_Text_from_Image' in df.columns:
    df['Cleaned_Extracted_Text'] = df['Extracted_Text_from_Image'].apply(clean_ocr_text)
    df = df.drop(columns=['Extracted_Text_from_Image'])

In [9]:
# Write the DataFrame to an Excel workbook inside output_dir, without the row index.
file_path1 = os.path.join(output_dir, "data_with_OCR_text.xlsx")
df.to_excel(excel_writer=file_path1, index=False)