In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# The CSV, image and output paths used by the OCR cells in this notebook all
# live under /content/drive/MyDrive/, so this cell has to run first.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install paddleocr
!pip install paddlepaddle

In [None]:
from paddleocr import PaddleOCR
import os
import pandas as pd

# Batch OCR over approved expiry images (full run).
#
# Reads the image listing CSV, keeps rows whose image_type is 'ExpiryImages'
# and whose image_status is 'Approved', runs PaddleOCR on every listed image
# and writes one text report per batch of `batch_size` CSV rows. Inside each
# report the detections are grouped by job_no and then by line_id.

# Paths for the CSV file and image directory
temp_csv_path = '/content/drive/MyDrive/PhotoQA/newimage.csv'
image_base_path = '/content/drive/MyDrive/PhotoQA/newimage_cropped/approved/'

# Template for saving batch output files ({batch_idx} is the 1-based batch number)
batch_output_file_template = '/content/drive/MyDrive/PhotoQA/ocr_results_batch_{batch_idx}.txt'

# Load and filter data
data = pd.read_csv(temp_csv_path)
data = data[(data['image_type'] == 'ExpiryImages') & (data['image_status'] == 'Approved')]

# Initialize OCR
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# Set batch size (number of CSV rows, i.e. images, per output file)
batch_size = 100

# Split DataFrame into batches.
# NOTE: batches are cut by row count, so the rows of one job_no can straddle
# two batches and then show up in two consecutive output files.
batches = [data.iloc[i:i + batch_size] for i in range(0, len(data), batch_size)]

# Process each batch and save results
for batch_idx, batch in enumerate(batches):
    batch_number = batch_idx + 1  # output files and progress messages are 1-based
    batch_results = {}

    # Group by a scalar key rather than a one-element list: with a list key,
    # recent pandas versions yield 1-tuples such as (123,) as the group key,
    # which would end up verbatim in the "Job No:" lines of the report.
    for job_no, group in batch.groupby('job_no'):
        batch_results[job_no] = {}
        for line_id, line_group in group.groupby('line_id'):
            line_texts = []

            for image_name in line_group['image_name']:
                # A missing image_name is read from the CSV as NaN (a float);
                # os.path.join would raise on it and abort the whole run.
                if not isinstance(image_name, str):
                    print(f"Skipping row with invalid image_name: {image_name!r}")
                    continue

                image_path = os.path.join(image_base_path, image_name)
                if not os.path.isfile(image_path):
                    print(f"Error processing {image_path}: file not found")
                    continue

                try:
                    # Run OCR
                    results = ocr.ocr(image_path)
                    # The per-image entry is None when no text was detected;
                    # treat that as "no lines" instead of an error.
                    detections = results[0] if results else None
                    for detection in detections or []:
                        detected_text = detection[1][0]
                        confidence = detection[1][1]
                        line_texts.append(f"Text: {detected_text} (Confidence: {confidence:.2f})")
                except Exception as e:
                    # Best effort: report the failing image and keep going.
                    print(f"Error processing {image_path}: {e}")

            # Save OCR results
            batch_results[job_no][line_id] = line_texts

    # Save batch results to file (explicit UTF-8: OCR output may contain non-ASCII text)
    batch_output_file = batch_output_file_template.format(batch_idx=batch_number)
    with open(batch_output_file, 'w', encoding='utf-8') as f:
        for job_no, line_groups in batch_results.items():
            f.write(f"Job No: {job_no}\n")
            for line_id, texts in line_groups.items():
                f.write(f"  Line ID: {line_id}\n")
                f.write("\n".join(texts))
                f.write("\n\n")

    print(f"Batch {batch_number}/{len(batches)} results saved to {batch_output_file}.")

print("All batches processed and results saved.")



In [None]:
from paddleocr import PaddleOCR
import os
import pandas as pd

# Batch OCR over approved expiry images (resumable run).
#
# Same pipeline as the full-run cell of this notebook, but processing starts at
# the 1-based batch number `start_batch_idx`, so an interrupted run can be
# continued without redoing the batches whose output files already exist.

# Paths for the CSV file and image directory
temp_csv_path = '/content/drive/MyDrive/PhotoQA/newimage.csv'
image_base_path = '/content/drive/MyDrive/PhotoQA/newimage_cropped/approved/'

# Template for saving batch output files ({batch_idx} is the 1-based batch number)
batch_output_file_template = '/content/drive/MyDrive/PhotoQA/ocr_results_batch_{batch_idx}.txt'

# Load and filter data
data = pd.read_csv(temp_csv_path)
data = data[(data['image_type'] == 'ExpiryImages') & (data['image_status'] == 'Approved')]

# Initialize OCR
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# Set batch size (number of CSV rows, i.e. images, per output file)
batch_size = 100

# 1-based number of the first batch to process; change this to resume a run.
start_batch_idx = 1

# Batch numbers are 1-based (batch 1 covers rows 0..batch_size-1). A value
# below 1 would make the slice start negative and produce an empty "batch 0"
# output file, so clamp it.
first_batch_idx = max(1, start_batch_idx)

# Total number of batches in the full data set (ceiling division).
total_batches = (len(data) + batch_size - 1) // batch_size

# Slice the data starting from the requested batch.
# NOTE: batches are cut by row count, so the rows of one job_no can straddle
# two batches and then show up in two consecutive output files.
remaining_batches = [
    data.iloc[i:i + batch_size]
    for i in range((first_batch_idx - 1) * batch_size, len(data), batch_size)
]

# Process each remaining batch and save results
for batch_idx, batch in enumerate(remaining_batches, start=first_batch_idx):
    batch_results = {}

    # Group by a scalar key rather than a one-element list: with a list key,
    # recent pandas versions yield 1-tuples such as (123,) as the group key,
    # which would end up verbatim in the "Job No:" lines of the report.
    for job_no, group in batch.groupby('job_no'):
        batch_results[job_no] = {}
        for line_id, line_group in group.groupby('line_id'):
            line_texts = []

            for image_name in line_group['image_name']:
                # A missing image_name is read from the CSV as NaN (a float);
                # os.path.join would raise on it and abort the whole run.
                if not isinstance(image_name, str):
                    print(f"Skipping row with invalid image_name: {image_name!r}")
                    continue

                image_path = os.path.join(image_base_path, image_name)
                if not os.path.isfile(image_path):
                    print(f"Error processing {image_path}: file not found")
                    continue

                try:
                    # Run OCR
                    results = ocr.ocr(image_path)
                    # The per-image entry is None when no text was detected;
                    # treat that as "no lines" instead of an error.
                    detections = results[0] if results else None
                    for detection in detections or []:
                        detected_text = detection[1][0]
                        confidence = detection[1][1]
                        line_texts.append(f"Text: {detected_text} (Confidence: {confidence:.2f})")
                except Exception as e:
                    # Best effort: report the failing image and keep going.
                    print(f"Error processing {image_path}: {e}")

            # Save OCR results
            batch_results[job_no][line_id] = line_texts

    # Save batch results to file (explicit UTF-8: OCR output may contain non-ASCII text)
    batch_output_file = batch_output_file_template.format(batch_idx=batch_idx)
    with open(batch_output_file, 'w', encoding='utf-8') as f:
        for job_no, line_groups in batch_results.items():
            f.write(f"Job No: {job_no}\n")
            for line_id, texts in line_groups.items():
                f.write(f"  Line ID: {line_id}\n")
                f.write("\n".join(texts))
                f.write("\n\n")

    print(f"Batch {batch_idx}/{total_batches} results saved to {batch_output_file}.")

print("results saved.")

