In [None]:
! pip install pandas opencv-python pytesseract easyocr numpy requests pillow tqdm matplotlib

import pandas as pd
import cv2
import pytesseract
import easyocr
import numpy as np
import re
import os
import requests
from PIL import Image
from io import BytesIO
from tqdm import tqdm
import matplotlib.pyplot as plt

# Read the CSV that lists the images to scan
csv_path = "final_classification_pickup.csv"
df = pd.read_csv(csv_path)

# Pick the first column whose name mentions "image" or "url" (case-insensitive);
# the for/else raises when the scan finishes without finding one.
image_col = None
for col in df.columns:
    lowered = col.lower()
    if 'image' in lowered or 'url' in lowered:
        image_col = col
        break
else:
    raise ValueError("❌ Could not find image column in CSV.")

# Shared English-only EasyOCR reader, running on CPU
reader = easyocr.Reader(['en'], gpu=False)

# Utility: Load image
def load_image(path_or_url, timeout=10):
    """Load an image from an HTTP(S) URL or a local file as a BGR array.

    Args:
        path_or_url: URL or filesystem path of the image. Values that are not
            strings or path-like objects (for example the NaN pandas yields
            for an empty CSV cell) are treated as unloadable.
        timeout: Seconds to wait on the HTTP request before giving up, so a
            single stalled server cannot hang the whole batch.

    Returns:
        The image array in OpenCV's BGR channel order, or None when the image
        is missing, cannot be fetched, or cannot be decoded.
    """
    if isinstance(path_or_url, os.PathLike):
        path_or_url = os.fspath(path_or_url)
    if not isinstance(path_or_url, str):
        return None

    location = path_or_url.strip()
    try:
        # Require a real scheme so a local file merely named "http..." is
        # still read from disk.
        if location.lower().startswith(('http://', 'https://')):
            response = requests.get(location, timeout=timeout)
            # Fail on 4xx/5xx instead of trying to decode an error page.
            response.raise_for_status()
            with Image.open(BytesIO(response.content)) as fetched:
                rgb = fetched.convert("RGB")
            return cv2.cvtColor(np.array(rgb), cv2.COLOR_RGB2BGR)
        if os.path.exists(location):
            return cv2.imread(location)
        return None
    except Exception as e:
        # Best-effort loader: report and skip the image. Catching Exception
        # (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
        print(f"⚠ Could not load image {location!r}: {e}")
        return None

# Utility: Format weight
def format_weight(raw):
    """Turn an OCR digit string into a weight with three decimal places.

    All non-digit characters are removed from ``raw``; a decimal point is
    then inserted before the last three digits (``"12345"`` -> ``"12.345"``).

    Returns:
        The formatted string, or None when fewer than four digits remain.
    """
    numeric = re.sub(r'\D', '', raw)
    if len(numeric) < 4:
        return None
    whole, fraction = numeric[:-3], numeric[-3:]
    return f"{whole}.{fraction}"

# OCR detection
def detect_weight(image):
    """Run Tesseract and EasyOCR on a BGR image and report the weight found.

    Returns:
        A dict with keys ``"tesseract"``, ``"easyocr"`` and
        ``"detected_weight"``. Each value is a formatted weight string or
        ``"Not detected"``. The EasyOCR reading is preferred; the Tesseract
        reading is used only when EasyOCR finds nothing. Any exception is
        printed and the results gathered so far are returned.
    """
    missing = "Not detected"
    outcome = {"tesseract": missing, "easyocr": missing, "detected_weight": missing}
    try:
        # Preprocess for Tesseract: grayscale -> equalize -> blur -> binarize
        grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        smoothed = cv2.GaussianBlur(cv2.equalizeHist(grayscale), (3, 3), 0)
        binary = cv2.threshold(smoothed, 100, 255, cv2.THRESH_BINARY)[1]

        # Tesseract pass, restricted to digits; keep the first run of 4+ digits
        tess_options = r'--oem 3 --psm 11 -c tessedit_char_whitelist=0123456789'
        tess_output = pytesseract.image_to_string(binary, config=tess_options)
        digit_runs = re.findall(r'\d{4,}', tess_output)
        tess_weight = format_weight(digit_runs[0]) if digit_runs else None
        if tess_weight:
            outcome["tesseract"] = tess_weight

        # EasyOCR pass on the original image; take the first confident
        # all-digit reading
        for _box, candidate, confidence in reader.readtext(image):
            if not (confidence > 0.8 and re.fullmatch(r'\d{4,}', candidate)):
                continue
            easy_weight = format_weight(candidate)
            if not easy_weight:
                continue
            outcome["easyocr"] = easy_weight
            outcome["detected_weight"] = easy_weight
            break

        # Fall back to Tesseract's reading (which may itself be "Not detected")
        if outcome["detected_weight"] == missing:
            outcome["detected_weight"] = outcome["tesseract"]

    except Exception as e:
        print(f"⚠ Error: {e}")

    return outcome

# Output path
output_path = "pickup_weight_detection_results.csv"
processed = []

# Upper bound on how many CSV rows are processed in one run.
MAX_ROWS = 2600
rows_to_process = df.head(MAX_ROWS)

# Process up to MAX_ROWS rows. tqdm's total is taken from the actual number
# of rows so the progress bar and ETA are accurate.
for idx, row in tqdm(rows_to_process.iterrows(), total=len(rows_to_process)):
    img_path = row[image_col]
    image = load_image(img_path)

    if image is None:
        # Record the failure so the output still has one row per input image
        result = {
            "image": img_path,
            "tesseract": "Load error",
            "easyocr": "Load error",
            "detected_weight": "Load error"
        }
    else:
        result = detect_weight(image)
        result["image"] = img_path

        # Show the image only when a weight was found
        if result["detected_weight"] != "Not detected":
            print(f"\n Image: {img_path}")
            print(f" Detected: {result['detected_weight']} (EasyOCR: {result['easyocr']}, Tesseract: {result['tesseract']})")

            plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
            plt.title(f"Detected: {result['detected_weight']}")
            plt.axis('off')
            plt.show()

    processed.append(result)

    # Append the row to the CSV after each image so partial progress survives
    # an interrupted run; the header is written only when the file is created.
    pd.DataFrame([result]).to_csv(output_path, mode='a', header=not os.path.exists(output_path), index=False)

print(f"\nFinished {len(processed)} images.\nOutput saved to: {output_path}")