# In [1]:
import easyocr
import pandas as pd
import requests
from io import BytesIO
from PIL import Image
from tqdm import tqdm
import os
from google.colab import files  # For file upload in Google Colab

# KMP_DUPLICATE_LIB_OK asks the OpenMP runtime to tolerate being loaded more
# than once in the same process instead of aborting.
os.environ.update(KMP_DUPLICATE_LIB_OK='True')

# English-only OCR reader, forced onto the CPU, using the standard
# recognition network.
reader = easyocr.Reader(['en'], gpu=False, recog_network='standard')

# Open the Colab upload dialog so the CSV can be supplied manually.
uploaded = files.upload()

# Location where the dataset is expected after upload (it must be named
# train.csv).
input_csv_path = '/content/train.csv'

# Load the dataset, or report the missing file and stop.
if os.path.exists(input_csv_path):
    df = pd.read_csv(input_csv_path)
else:
    missing_csv_message = f"CSV file not found at {input_csv_path}"
    print(f"Error: {missing_csv_message}")
    raise FileNotFoundError(missing_csv_message)

# Number of rows available in the dataset.
row_count = df.shape[0]

# Fallback window used when the input cannot be used: the first 10 rows,
# clamped so it never extends past the end of a smaller dataset.
default_range = (0, min(10, row_count))

# Prompt the user for the range or 'all'
user_input = input('Enter range (e.g., "1523-1725") or "all" for all images: ').strip()

# Determine the half-open positional range [start_index, end_index) to process
if user_input.lower() == 'all':
    start_index, end_index = 0, row_count
else:
    try:
        # Raises ValueError for non-numeric parts and for anything other than
        # exactly two dash-separated values.
        start_index, end_index = map(int, user_input.split('-'))
    except ValueError:
        print("Invalid input. Processing first 10 entries by default.")
        start_index, end_index = default_range
    else:
        if start_index < 0 or end_index > row_count or start_index >= end_index:
            print("Invalid range. Processing first 10 entries by default.")
            start_index, end_index = default_range

# Create a list to hold results
results_list = []

# Function to download image and extract text using EasyOCR
def extract_text_from_image(image_url, timeout=30):
    """Download the image at *image_url* and return the text EasyOCR reads from it.

    The image is converted to RGB and shrunk to fit within 512x512 pixels
    before OCR so that large images do not consume too much memory.

    Args:
        image_url: URL of the image to fetch.
        timeout: Seconds to wait on the HTTP request before giving up.

    Returns:
        The recognised text fragments joined with single spaces. If any step
        fails (download, decoding, OCR), the exception message is returned as
        the string instead; no exception propagates to the caller.
    """
    try:
        # Without a timeout a stalled server would block the whole run.
        response = requests.get(image_url, timeout=timeout)
        # Surface 4xx/5xx responses instead of trying to decode an error page
        # as an image.
        response.raise_for_status()

        with Image.open(BytesIO(response.content)) as source:
            # Normalise the mode so resizing and PNG encoding work for
            # palette, CMYK, etc. inputs alike.
            img = source.convert('RGB')

        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the filter it
        # used to alias.
        max_size = (512, 512)
        img.thumbnail(max_size, Image.LANCZOS)

        # readtext documents file paths/URLs, bytes and numpy arrays as
        # inputs, so hand it encoded image bytes rather than a PIL object.
        encoded = BytesIO()
        img.save(encoded, format='PNG')

        extracted_text = reader.readtext(encoded.getvalue(), detail=0)
        return ' '.join(extracted_text)
    except Exception as e:
        # Best-effort contract kept from the original: report the failure as
        # the returned text rather than raising.
        return str(e)

# Rows selected by the chosen range. The slice can be shorter than
# end_index - start_index when end_index lies past the end of the data,
# so the progress-bar total is taken from the slice itself.
selected_rows = df.iloc[start_index:end_index]

# Process the specified range of entries with a progress bar
for index, row in tqdm(selected_rows.iterrows(), total=len(selected_rows), desc='Processing Images'):
    image_url = row['image_link']
    try:
        extracted_text = extract_text_from_image(image_url)
    except Exception as e:
        # extract_text_from_image reports failures through its return value,
        # so this handler is only a safety net for anything that escapes it.
        print(f"Failed to process image at {image_url}: {e}")
        continue  # Skip problematic images

    # Keep the OCR text alongside the original columns of the row
    results_list.append({
        'group_id': row['group_id'],
        'image_link': image_url,
        'text_ext': extracted_text,
        'entity_name': row['entity_name'],
        'entity_value': row['entity_value'],
    })

# Convert the list to a DataFrame
results_df = pd.DataFrame(results_list)

# Path to the output CSV file (relative to the current working directory)
output_csv_path = 'easyocr_results.csv'

# Save the results to a new CSV file without the DataFrame index column
results_df.to_csv(output_csv_path, index=False)

print(f"OCR text extraction completed and saved to {output_csv_path}")


# Captured cell output:
#   warn(
#
# ModuleNotFoundError: No module named 'google.colab'