In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that
# files stored on Drive can be read through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Path of the training-image archive on the mounted Drive (a shortcut to the
# shared file must have been added to Drive so that it appears at this location).
# This is the path the unzip cell opens.
zip_path = '/content/drive/MyDrive/train_images.zip'

In [None]:
# Verifying if the file is accessible
!ls '/content/drive/My Drive/train_images.zip'

In [None]:
import zipfile

# Open the archive read-only and unpack every member under /content/train_images
with zipfile.ZipFile(zip_path) as archive:
    archive.extractall(path='/content/train_images')

print("Files extracted successfully!")

Files extracted successfully!


In [None]:
!pip install paddlepaddle-gpu paddleocr

Collecting paddlepaddle-gpu
  Downloading paddlepaddle_gpu-2.6.2-cp310-cp310-manylinux1_x86_64.whl.metadata (8.6 kB)
Collecting paddleocr
  Downloading paddleocr-2.8.1-py3-none-any.whl.metadata (19 kB)
Collecting httpx (from paddlepaddle-gpu)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting astor (from paddlepaddle-gpu)
  Downloading astor-0.8.1-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting pyclipper (from paddleocr)
  Downloading pyclipper-1.3.0.post5-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (9.0 kB)
Collecting lmdb (from paddleocr)
  Downloading lmdb-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.1 kB)
Collecting rapidfuzz (from paddleocr)
  Downloading rapidfuzz-3.9.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting python-docx (from paddleocr)
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting fire>=0.3.0 (from paddleocr)
  Downloadin

In [None]:
import os
import pandas as pd
from paddleocr import PaddleOCR
from tqdm import tqdm

In [None]:
import os
import cv2
import pandas as pd
from paddleocr import PaddleOCR
from tqdm import tqdm  # For progress tracking

# Function to sanitize URL and get the filename
def get_image_filename(url):
    """Return the file name at the end of ``url`` without its extension.

    Only the final extension is removed, so a basename that itself contains
    dots (e.g. ``71gSRb._AC_SL1500_.jpg``) keeps everything before the last
    dot. Splitting on the first dot instead would truncate such names and the
    caller's reconstructed ``<name>.jpg`` would no longer match the file.

    Args:
        url: Image URL or path using ``/`` as the separator.

    Returns:
        The last path segment with its trailing extension stripped. A segment
        without any extension is returned unchanged.
    """
    basename = url.split('/')[-1]
    stem, _extension = os.path.splitext(basename)
    return stem

# Function to extract text from an image
def extract_text_from_image(image_path, ocr):
    """Run OCR on a single image file and return its text as one string.

    Args:
        image_path: Path of the image file on disk.
        ocr: Object exposing ``ocr(path)``. Its return value is read as a list
            whose first element is a list of detections, with the recognised
            text of each detection at ``detection[1][0]``.

    Returns:
        The recognised text fragments joined by single spaces, or ``""`` when
        the file is missing, OpenCV cannot decode it, the OCR call raises, or
        nothing is detected.
    """
    # Guard: the file has to exist on disk.
    if not os.path.exists(image_path):
        print(f"Warning: Image {image_path} does not exist.")
        return ""

    # Guard: decoding with OpenCV serves only as a validity check here;
    # the OCR call below is still given the path, not the decoded array.
    if cv2.imread(image_path) is None:
        print(f"Warning: Image {image_path} could not be loaded or is empty.")
        return ""

    # Best effort: a failure on one image is reported and yields empty text.
    try:
        ocr_output = ocr.ocr(image_path)
    except Exception as e:
        print(f"Error processing image {image_path}: {e}")
        return ""

    nothing_detected = (
        not ocr_output
        or not isinstance(ocr_output, list)
        or len(ocr_output) == 0
        or not ocr_output[0]
    )
    if nothing_detected:
        return ""

    # Collect the text component of every detection on the first (only) page.
    fragments = []
    for detection in ocr_output[0]:
        fragments.append(detection[1][0])
    return " ".join(fragments)

# Initialize PaddleOCR once and reuse the same instance for every image (logs suppressed)
ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False, use_gpu=True)

# Input CSV, output CSV and image directory
csv_file_path = 'sampled_train.csv'  # Replace with the actual path to your CSV file
output_csv_path = 'result.csv'  # Path to save the new CSV
image_folder = 'train_images'  # Replace with the actual path to your images folder

# Positional window of CSV rows handled by this run. The stop bound of iloc is
# exclusive, so 3000:6001 selects rows 3000..6000 inclusive (3,001 rows).
# NOTE(review): 6001 may be an unintended off-by-one for a 3,000-row chunk — confirm.
ROW_START = 3000
ROW_STOP = 6001

# Number of rows per progress-bar step. Each image is sent to OCR individually,
# so this value only controls how often the progress bar advances.
batch_size = 50

# Load the CSV data and keep only the configured row window
df = pd.read_csv(csv_file_path)
df = df.iloc[ROW_START:ROW_STOP]

# One output record per input row, in input order
all_new_rows = []

# Iterate through the DataFrame in batches
for start_index in tqdm(range(0, df.shape[0], batch_size), desc="Processing batches"):
    end_index = min(start_index + batch_size, df.shape[0])
    batch_df = df.iloc[start_index:end_index]

    # Process each row in the current batch
    for _, row in batch_df.iterrows():
        # Rebuild the local image path from the URL's file name
        image_filename = get_image_filename(row['image_link'])
        image_path = os.path.join(image_folder, f"{image_filename}.jpg")  # Assuming the images are saved as .jpg

        # Extract text from the image using PaddleOCR ("" when nothing could be read)
        extracted_text = extract_text_from_image(image_path, ocr)

        # Copy the original columns and attach the OCR text
        all_new_rows.append({
            'image_link': row['image_link'],
            'group_id': row['group_id'],
            'entity_name': row['entity_name'],
            'entity_value': row['entity_value'],
            'extracted_text': extracted_text
        })

# Build the result frame in one go from the collected records
new_df = pd.DataFrame(all_new_rows)

# Save the new DataFrame to a CSV file
new_df.to_csv(output_csv_path, index=False)

print(f"CSV with extracted text saved to {output_csv_path}")

Processing images: 100%|██████████| 100000/100000 [1:23:27<00:00, 19.97it/s]


CSV with extracted text saved to /content/output.csv
