In [None]:
!pip -q install easyocr pillow pillow-heif

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m45.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m180.7/180.7 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m963.8/963.8 kB[0m [31m36.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m292.1/292.1 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from google.colab import files
import zipfile, io, os, shutil
import numpy as np, torch, easyocr, pandas as pd
from PIL import Image, ImageSequence
import pillow_heif
from pathlib import Path
from tqdm import tqdm


In [None]:
# Bind the Colab upload helper under a cell-local alias. A later cell rebinds
# the global name `files` to a list of image paths, so calling `files.upload()`
# here would fail whenever this cell is re-run after that one.
from google.colab import files as colab_files

# ---- 1) Upload ----
print("Upload your ZIP file containing images...")
uploaded = colab_files.upload()  # pick e.g., my_images.zip
if not uploaded:
    raise RuntimeError("No file uploaded. Please upload a .zip.")
zip_name = next(iter(uploaded.keys()))  # only the first uploaded file is used
zip_bytes = uploaded[zip_name]

# Validate before touching TARGET_DIR so that a bad upload cannot wipe out the
# images from a previous, successful extraction.
if not zipfile.is_zipfile(io.BytesIO(zip_bytes)):
    raise RuntimeError(f"{zip_name!r} is not a valid ZIP archive. Please upload a .zip.")

# ---- 2) Clean target & extract ----
TARGET_DIR = "/content/images"
shutil.rmtree(TARGET_DIR, ignore_errors=True)
os.makedirs(TARGET_DIR, exist_ok=True)

with zipfile.ZipFile(io.BytesIO(zip_bytes), "r") as zf:
    zf.extractall(TARGET_DIR)

# remove macOS junk folder if present
shutil.rmtree(os.path.join(TARGET_DIR, "__MACOSX"), ignore_errors=True)
print(f"Extracted to {TARGET_DIR}")


Upload your ZIP file containing images...


Saving sample01_02.zip to sample01_02.zip
Extracted to /content/images


In [None]:
def load_image_any(path: str) -> np.ndarray:
    """Load an image file and return it as an RGB NumPy array.

    HEIC/HEIF files are decoded with ``pillow_heif`` directly; if that fails,
    the HEIF plugin is registered with Pillow and the file is opened through
    ``Image.open`` instead. Every other format goes through Pillow, keeping
    only the first frame of an animated image.

    Args:
        path: Filesystem path of the image (a ``str`` or path-like object).

    Returns:
        The image converted to RGB mode, as produced by ``np.array``.

    Raises:
        Exception: Whatever Pillow / pillow_heif raise when the file cannot be
            opened or decoded by any of the attempted routes.
    """
    lowered = str(path).lower()
    if lowered.endswith((".heic", ".heif")):
        try:
            heif = pillow_heif.read_heif(path)
            img = Image.frombytes(heif.mode, heif.size, heif.data, "raw").convert("RGB")
        except Exception:
            # Pillow cannot identify HEIF files until the pillow_heif plugin is
            # registered; without this call the fallback could never succeed.
            pillow_heif.register_heif_opener()
            with Image.open(path) as im:
                img = im.convert("RGB")
    else:
        with Image.open(path) as im:
            if getattr(im, "is_animated", False):
                im = next(ImageSequence.Iterator(im))  # first frame
            img = im.convert("RGB")
    return np.array(img)



In [None]:
EXTS = (".png",".jpg",".jpeg",".webp",".tiff",".tif",".heic",".heif")
MAX_IMAGES = 100  # cap on how many images are OCR'd; set to None to process all

# NOTE: this rebinds the global name `files`, which the import cell bound to
# `google.colab.files`. The OCR loop reads the list under this name, so it is
# kept; any code that needs the Colab module afterwards must re-import it.
#
# The paths are sorted so that the subset selected by MAX_IMAGES is the same
# on every run, and directories that happen to carry an image suffix are
# skipped.
files = sorted(
    p for p in Path(TARGET_DIR).rglob("*")
    if p.is_file() and p.suffix.lower() in EXTS and not p.name.startswith("._")
)
print(f"Found {len(files)} images. Processing...")
# Limit to the first MAX_IMAGES images
files = files[:MAX_IMAGES]

Found 1050 images. Processing...


In [None]:
# Hand EasyOCR the GPU flag reported by torch and build an English reader.
use_gpu = torch.cuda.is_available()
reader = easyocr.Reader(lang_list=["en"], gpu=use_gpu)
print("GPU available:", use_gpu)



Progress: |██████████████████████████████████████████████████| 100.0% Complete



Progress: |--------------------------------------------------| 0.0% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.2% CompleteProgress: |--------------------------------------------------| 0.2% CompleteProgress: |--------------------------------------------------| 0.3% CompleteProgress: |--------------------------------------------------| 0.4% CompleteProgress: |--------------------------------------------------| 0.4% CompleteProgress: |--------------------------------------------------| 0.5% CompleteProgress: |--------------------------------------------------| 0.5% CompleteProgress: |--------------------------------------------------| 0.6% CompleteProgress: |--------------------------------------------------| 0.6% CompleteProgress: |--------------------------------------------------| 0.7% Complet

In [None]:
def _summarize_detections(file_path, detections):
    """Turn the EasyOCR detections of one image into a result row.

    Args:
        file_path: Path string stored in the row's "file" field.
        detections: List of ``(bbox, text, confidence)`` tuples, where ``bbox``
            is a sequence of ``(x, y)`` points. May be empty.

    Returns:
        Dict with the keys "file", "text", "word_count" and "mean_confidence".
    """
    if not detections:
        return {"file": file_path, "text": "", "word_count": 0, "mean_confidence": None}

    # Order by each box's top edge; ties are broken by its left edge.
    ordered = sorted(
        (
            (min(pt[1] for pt in bbox), min(pt[0] for pt in bbox), txt, float(conf))
            for bbox, txt, conf in detections
        ),
        key=lambda t: (t[0], t[1]),
    )

    text = "\n".join(t[2] for t in ordered if t[2])
    # The mean covers every detection, including ones whose text is empty.
    scores = [t[3] for t in ordered]
    mean_conf = round(float(np.mean(scores)), 4) if scores else None

    return {
        "file": file_path,
        "text": text,
        "word_count": len(text.split()),
        "mean_confidence": mean_conf,
    }


rows = []
failed = []  # (file, error repr) for every image that could not be processed
for p in tqdm(files, desc="EasyOCR", unit="image"):
    try:
        arr = load_image_any(str(p))
        res = reader.readtext(arr, detail=1, paragraph=False)  # [(bbox, text, conf), ...]
        rows.append(_summarize_detections(str(p), res))
    except Exception as e:
        # Still emit one row per input file, but report the failure so it is
        # not mistaken for an image that simply contains no text.
        failed.append((str(p), repr(e)))
        tqdm.write(f"[failed] {p}: {e!r}")
        rows.append({"file": str(p), "text": "", "word_count": 0, "mean_confidence": None})

if failed:
    print(f"{len(failed)} of {len(files)} images failed; see the `failed` list for details.")


EasyOCR: 100%|██████████| 100/100 [00:41<00:00,  2.44image/s]


In [None]:
# Write the collected OCR rows to a CSV file, with the columns listed
# explicitly so their order in the file is fixed.
CSV_PATH = "/content/ocr_results.csv"
result_columns = ["file", "text", "word_count", "mean_confidence"]
results_df = pd.DataFrame(rows, columns=result_columns)
results_df.to_csv(CSV_PATH, index=False)
print(f"Saved CSV → {CSV_PATH}  |  {len(rows)} rows")

Saved CSV → /content/ocr_results.csv  |  100 rows
