In [None]:
import easyocr
import pandas as pd
import requests
from io import BytesIO
from PIL import Image
from tqdm import tqdm

# Build the shared EasyOCR reader for English text, requesting GPU acceleration
reader = easyocr.Reader(lang_list=['en'], gpu=True)

# Function to download image and extract text using EasyOCR
def extract_text_from_image(image_url, timeout=30):
    """Download an image and return the text EasyOCR recognizes in it.

    Args:
        image_url: URL of the image to fetch.
        timeout: Seconds to wait for the HTTP request before giving up, so
            one unresponsive host cannot stall a whole batch.

    Returns:
        The recognized text fragments joined by single spaces. If any step
        fails (download, HTTP error status, decoding, OCR), the string form
        of the exception is returned instead so a batch run keeps going.
    """
    # Local import: numpy is not among this file's visible top-level imports.
    import numpy as np

    try:
        # Download the image; fail fast on 4xx/5xx instead of handing an
        # HTML error page to the image decoder.
        response = requests.get(image_url, timeout=timeout)
        response.raise_for_status()

        # Decode with PIL and normalize to 3-channel RGB so every source
        # format (PNG, palette, RGBA, ...) reaches the OCR step in one shape.
        with Image.open(BytesIO(response.content)) as img:
            rgb_pixels = np.asarray(img.convert('RGB'))

        # Hand EasyOCR a numpy array rather than a PIL object. The channels
        # are reversed to BGR on the assumption that EasyOCR follows OpenCV
        # conventions for arrays -- NOTE(review): confirm for the installed
        # version.
        bgr_pixels = np.ascontiguousarray(rgb_pixels[:, :, ::-1])

        extracted_text = reader.readtext(bgr_pixels, detail=0)
        return ' '.join(extracted_text)  # Single string from the list of fragments
    except Exception as e:
        # Best-effort contract: report the failure as text, never raise.
        return str(e)

# Path to the dataset CSV file
input_csv_path = 'dataset/train.csv'

# Read the dataset CSV file
df = pd.read_csv(input_csv_path)

# Prompt the user for the number of entries to process
user_input = input("Enter the number of elements to process (e.g., 50, 100) or type 'all' to process all entries: ")

# Determine the number of entries to process.
# Surrounding whitespace and letter case are ignored so " All " works too.
total_rows = df.shape[0]
requested = user_input.strip().lower()

if requested == 'all':
    num_entries = total_rows
else:
    try:
        num_entries = int(requested)
        # A negative count is not a meaningful request; df.head(-k) would
        # silently select "all but the last k" rows, so treat it as invalid.
        if num_entries < 0:
            raise ValueError(f"negative entry count: {num_entries}")
    except ValueError:
        print("Invalid input. Processing the first 10 entries by default.")
        num_entries = 10

    # Limit to the number of available rows (this also covers the default
    # of 10 when the dataset has fewer rows than that).
    num_entries = min(num_entries, total_rows)

# Column order of the output CSV; passed explicitly so the header row is
# written even when no rows were processed.
output_columns = ['group_id', 'image_link', 'text_ext', 'entity_name', 'entity_value']

# Select the rows once so the progress-bar total always matches the number
# of rows actually iterated.
rows_to_process = df.head(num_entries)

# Create a list to hold results
results_list = []

# Process the selected entries with a progress bar
for _index, row in tqdm(rows_to_process.iterrows(), total=len(rows_to_process), desc='Processing Images'):
    image_url = row['image_link']
    extracted_text = extract_text_from_image(image_url)

    # Append the result to the list
    results_list.append({
        'group_id': row['group_id'],
        'image_link': image_url,
        'text_ext': extracted_text,
        'entity_name': row['entity_name'],
        'entity_value': row['entity_value'],
    })

# Convert the list to a DataFrame with a fixed column layout
results_df = pd.DataFrame(results_list, columns=output_columns)

# Path to the output CSV file
output_csv_path = 'easyocr_results.csv'

# Save the results to a new CSV file
results_df.to_csv(output_csv_path, index=False)

print(f"OCR text extraction completed and saved to {output_csv_path}")


  warn(
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Enter the number of elements to process (e.g., 50, 100) or type 'all' to process all entries:  10


Processing Images:  80%|███████████████████████████████████████████████████▏            | 8/10 [04:21<01:25, 42.70s/it]