In [4]:
import pandas as pd
import pytesseract
from PIL import Image
import requests
from io import BytesIO
from tqdm import tqdm  # Import tqdm for progress bar

# Tesseract Path Setup (Windows only, uncomment below line if needed)
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Function to download image and extract text
def extract_text_from_image(image_url, timeout=30):
    """Download an image and return the text Tesseract recognizes in it.

    Args:
        image_url: URL of the image to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight through to ``requests.get``.

    Returns:
        The OCR text on success. On any failure (network error, non-2xx HTTP
        status, undecodable image, OCR failure) the exception's message is
        returned as a string instead of being raised, so a batch caller is
        never interrupted. Be aware that this makes an error message
        indistinguishable from recognized text.
    """
    try:
        # Without an explicit timeout, requests waits indefinitely on a
        # stalled server, which would hang the whole batch.
        response = requests.get(image_url, timeout=timeout)
        # Fail on 4xx/5xx here rather than handing an HTML error page to PIL,
        # which would only report an unhelpful "cannot identify image" error.
        response.raise_for_status()

        # The context manager releases the decoded image once OCR is done.
        with Image.open(BytesIO(response.content)) as img:
            return pytesseract.image_to_string(img)
    except Exception as e:
        # Deliberate best-effort: report the failure as the result text.
        return str(e)

# Path to the dataset CSV file
input_csv_path = 'dataset/train.csv'

# Path to the output CSV file
output_csv_path = 'ocr_text.csv'

# Number of leading rows to process. Defined once so the subset and the
# progress-bar total cannot drift apart.
max_rows = 10

# Read the dataset CSV file
df = pd.read_csv(input_csv_path)

# Only the 'image_link' column is needed, so take the subset once and iterate
# the column directly instead of materializing every row with iterrows().
image_links = df['image_link'].head(max_rows)

# Create a list to hold results
results_list = []

# Process the first `max_rows` entries with a progress bar
for image_url in tqdm(image_links, total=len(image_links), desc='Processing Images'):
    extracted_text = extract_text_from_image(image_url)

    # Append the result to the list
    results_list.append({'image_link': image_url, 'text': extracted_text})

# Convert the list to a DataFrame. Columns are pinned explicitly so the CSV
# still gets its header row when there are no results.
results_df = pd.DataFrame(results_list, columns=['image_link', 'text'])

# Save the results to a new CSV file
results_df.to_csv(output_csv_path, index=False)

print(f"OCR text extraction completed and saved to {output_csv_path}")


Processing Images: 100%|███████████████████████████████████████████████████████████████| 10/10 [00:07<00:00,  1.25it/s]

OCR text extraction completed and saved to ocr_text.csv



