In [8]:
import pandas as pd
import os
from tqdm import tqdm
from PIL import Image
import pytesseract
import requests
from io import BytesIO
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading

# Lock used to serialise file writes made from worker threads
lock = threading.Lock()

# Tell pytesseract where the Tesseract-OCR binary lives (Tesseract-OCR must be installed)
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'  # Adjust this path if needed

# Hook tqdm into pandas
tqdm.pandas()

def ocr_from_image_url(image_url, timeout=30):
    """Download an image and return the text pytesseract extracts from it.

    Failures are reported in-band rather than raised: the function always
    returns a string, which is either the OCR text or an error description.

    Args:
        image_url: URL of the image to download.
        timeout: Value forwarded to ``requests.get`` as its ``timeout``
            argument (seconds). Without it a stalled server would block the
            calling thread indefinitely.

    Returns:
        The OCR text on success, ``"Error downloading image: ..."`` when the
        HTTP request fails (including a timeout), or
        ``"Error processing image: ..."`` for any other failure.
    """
    try:
        # Download the image; the timeout bounds how long a dead host can stall us
        response = requests.get(image_url, timeout=timeout)
        response.raise_for_status()  # Turn HTTP error statuses into exceptions

        # Decode the downloaded bytes with PIL
        image = Image.open(BytesIO(response.content))

        # Normalise to RGB before handing the image to pytesseract
        image = image.convert('RGB')

        # Perform OCR using pytesseract
        return pytesseract.image_to_string(image)

    except requests.exceptions.RequestException as e:
        return f"Error downloading image: {e}"
    except Exception as e:
        return f"Error processing image: {e}"

def process_row(row, output_file):
    """OCR the image referenced by one row and append the result to a CSV.

    Args:
        row: A pandas Series providing an ``'image_link'`` entry; all of its
            fields are copied into the output record.
        output_file: Path of the CSV file to append to. The header is written
            only when the file does not exist yet.
    """
    ocr_text = ocr_from_image_url(row['image_link'])

    # Single-record frame: the row's own fields followed by the OCR text
    record = pd.DataFrame([row.to_dict()])
    record['pytesseract_output'] = ocr_text

    # Hold the lock for both the existence check and the append so that
    # concurrent workers cannot interleave their writes
    with lock:
        needs_header = not os.path.exists(output_file)
        record.to_csv(output_file, mode='a', header=needs_header, index=False)

def process_rows_concurrently(dataframe, output_file, start_index, checkpoint_file, max_workers=5):
    """Run ``process_row`` over ``dataframe.iloc[start_index:]`` in a thread pool.

    The checkpoint file is updated as rows *finish*, not as they are
    submitted. It always holds the position (``iloc`` offset) of the first row
    that has not completed successfully, so the value can be passed back in as
    ``start_index`` without skipping unfinished rows.

    Args:
        dataframe: Frame whose rows are processed; only the rows from
            position ``start_index`` onward are used.
        output_file: CSV path handed through to ``process_row``.
        start_index: Positional offset of the first row to process.
        checkpoint_file: Path that receives the resume position as text.
        max_workers: Number of worker threads in the pool.
    """
    pending = dataframe.iloc[start_index:]

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Remember each future's row position so completions can be ordered
        position_by_future = {
            executor.submit(process_row, row, output_file): position
            for position, (_, row) in enumerate(pending.iterrows(), start=start_index)
        }

        completed = set()
        resume_position = start_index

        # Display progress with tqdm while collecting results
        for future in tqdm(as_completed(position_by_future), total=len(position_by_future)):
            position = position_by_future[future]

            error = future.exception()
            if error is not None:
                # Report the failure instead of dropping it. The position is
                # not marked complete, so the checkpoint never moves past it.
                tqdm.write(f"Row at position {position} failed: {error!r}")
                continue

            completed.add(position)

            # Advance the checkpoint over the contiguous run of finished rows;
            # rows that finished out of order wait in `completed` until the
            # gap before them closes
            advanced = False
            while resume_position in completed:
                completed.remove(resume_position)
                resume_position += 1
                advanced = True

            if advanced:
                # Only this (main) thread writes the checkpoint, so no lock is needed
                with open(checkpoint_file, 'w') as f:
                    f.write(str(resume_position))

# Load the dataset
dataframe = pd.read_csv('../dataset/parts/part_1/data.csv')

# Output file path
output_file = '../dataset/parts/part_1/tesseract_test_ocr.csv'

# File that process_rows_concurrently writes its checkpoint to
checkpoint_file = '../dataset/parts/part_1/tesseract_test_last_processed_index.txt'

# Load already processed data if the output file exists
processed_data = pd.DataFrame()
if os.path.exists(output_file):
    processed_data = pd.read_csv(output_file)

# Filter out already processed rows. On a first run there is no output file,
# so processed_data is an empty frame without an 'image_link' column and
# indexing it would raise KeyError; in that case nothing needs filtering.
if 'image_link' in processed_data.columns:
    dataframe = dataframe[~dataframe['image_link'].isin(processed_data['image_link'])]

# The filter above is the resume mechanism: every remaining row is unprocessed.
# Applying a saved checkpoint offset on top of the filtered frame would skip
# rows that were never processed, so always start from the first remaining row.
start_index = 0

# Process rows concurrently
process_rows_concurrently(dataframe, output_file, start_index, checkpoint_file)

print("Processing completed.")


  1%|          | 86/13115 [04:44<11:57:11,  3.30s/it]


KeyboardInterrupt: 

In [4]:
import pandas as pd
import os
from tqdm import tqdm
from PIL import Image
import pytesseract
import requests
from io import BytesIO

# Tell pytesseract where the Tesseract-OCR binary lives (Tesseract-OCR must be
# installed; adjust this path if needed)
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

# Hook tqdm into pandas
tqdm.pandas()

def ocr_from_image_url(image_url, timeout=30):
    """Download an image and return the text pytesseract extracts from it.

    Failures are reported in-band rather than raised: the function always
    returns a string, which is either the OCR text or an error description.

    Args:
        image_url: URL of the image to download.
        timeout: Value forwarded to ``requests.get`` as its ``timeout``
            argument (seconds). Without it a stalled server would block the
            loop indefinitely.

    Returns:
        The OCR text on success, ``"Error downloading image: ..."`` when the
        HTTP request fails (including a timeout), or
        ``"Error processing image: ..."`` for any other failure.
    """
    try:
        # Download the image; the timeout bounds how long a dead host can stall us
        response = requests.get(image_url, timeout=timeout)
        response.raise_for_status()  # Turn HTTP error statuses into exceptions

        # Decode the downloaded bytes with PIL
        image = Image.open(BytesIO(response.content))

        # Normalise to RGB before handing the image to pytesseract
        image = image.convert('RGB')

        # Perform OCR using pytesseract
        return pytesseract.image_to_string(image)

    except requests.exceptions.RequestException as e:
        return f"Error downloading image: {e}"
    except Exception as e:
        return f"Error processing image: {e}"

# Load the dataset
dataframe = pd.read_csv('../dataset/parts/part_1/data.csv')

# Output file path
output_file = '../dataset/parts/part_1/tesseract_test_ocr.csv'

# File to store the last processed index
checkpoint_file = '../dataset/parts/part_1/tesseract_test_last_processed_index.txt'

# Read the last processed index from the checkpoint file if it exists
start_index = 0
if os.path.exists(checkpoint_file):
    with open(checkpoint_file, 'r') as f:
        start_index = int(f.read().strip())

# Load already processed data if the output file exists
processed_data = pd.DataFrame()
if os.path.exists(output_file):
    processed_data = pd.read_csv(output_file)

# Build the lookup of finished image links once, instead of rescanning the
# whole processed column for every row. Missing values are dropped so that a
# missing link never counts as "already processed".
processed_links = set()
if not processed_data.empty:
    processed_links = set(processed_data['image_link'].dropna())

# Process each image link and append results to the CSV file
for index, row in tqdm(dataframe.iloc[start_index:].iterrows(), total=len(dataframe) - start_index):
    image_url = row['image_link']
    if image_url in processed_links:
        # Skip if this image link has already been processed
        continue

    ocr_output = ocr_from_image_url(image_url)

    # Create a DataFrame for the single row with the OCR output
    result_df = pd.DataFrame([row.to_dict()])  # Convert row to dictionary and wrap it in a DataFrame
    result_df['pytesseract_output'] = ocr_output  # Add the OCR output column

    # Append the row to the CSV file, writing the header only when creating it
    result_df.to_csv(output_file, mode='a', header=not os.path.exists(output_file), index=False)

    # Save the current index to the checkpoint file
    with open(checkpoint_file, 'w') as f:
        f.write(str(index))

print("Processing completed.")


  0%|          | 28/13118 [00:08<1:07:36,  3.23it/s]


KeyboardInterrupt: 