In [None]:
!pip install -q paddleocr paddlepaddle pillow pillow-heif pandas

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m80.5/80.5 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.4/40.4 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.0/76.0 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m189.0/189.0 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.5/65.5 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m77.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m46.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from google.colab import files
import zipfile, io, os, shutil

# Folder that receives the extracted images. It is wiped on every run so that
# files from a previous upload never leak into the next OCR pass.
TARGET_DIR = "/content/Sample_images"

print("Upload your ZIP file containing images...")
uploaded = files.upload()
if not uploaded:
    raise RuntimeError("No file uploaded. Please upload a .zip.")

# Only the first uploaded file is used; any additional uploads are ignored.
zip_name = next(iter(uploaded.keys()))
zip_bytes = uploaded[zip_name]

# Validate BEFORE deleting anything: a wrong upload should fail with a clear
# message and must not destroy the images extracted by an earlier run.
if not zipfile.is_zipfile(io.BytesIO(zip_bytes)):
    raise RuntimeError(
        f"{zip_name!r} is not a valid ZIP archive. Please upload a .zip."
    )

# Clean the target folder before extracting
shutil.rmtree(TARGET_DIR, ignore_errors=True)
os.makedirs(TARGET_DIR, exist_ok=True)

# Extract straight from memory (the upload is held as bytes, not as a file on disk)
with zipfile.ZipFile(io.BytesIO(zip_bytes), "r") as zf:
    zf.extractall(TARGET_DIR)

# Remove macOS junk (resource-fork folder added by Finder's "Compress")
shutil.rmtree(os.path.join(TARGET_DIR, "__MACOSX"), ignore_errors=True)

print(f"Extracted to {TARGET_DIR}")


Upload your ZIP file containing images...


Saving sample01_02.zip to sample01_02 (1).zip
Extracted to /content/Sample_images


In [None]:
from paddleocr import PaddleOCR
from PIL import Image, ImageSequence
import pillow_heif
import numpy as np
from pathlib import Path
import pandas as pd


In [None]:
# Folder that holds the input images -- change to your folder.
IMG_DIR = "/content/Sample_images"

# Destination of the OCR results table.
CSV_PATH = "/content/ocr_results.csv"

# Lower-case file suffixes that are treated as images.
EXTS = (
    ".png",
    ".jpg",
    ".jpeg",
    ".webp",
    ".tiff",
    ".tif",
    ".heic",
    ".heif",
)

In [None]:
def load_image_any(path: str) -> np.ndarray:
    """Load an image file of any supported format as an RGB NumPy array.

    HEIC/HEIF files are first decoded directly with ``pillow_heif``; if that
    fails for any reason the function falls back to ``PIL.Image.open``, which
    is also the path used for every other format. For animated / multi-frame
    images only the first frame is returned.

    Args:
        path: Location of the image file (``str`` or ``os.PathLike``).

    Returns:
        The pixels after conversion to RGB, as produced by ``np.array`` on the
        converted PIL image.

    Raises:
        Exception: Whatever ``PIL.Image.open`` / ``convert`` raises when the
            file cannot be read. If a HEIF decode was attempted first, its
            error is attached as ``__cause__`` so that it is not lost.
    """
    path = str(path)  # accept pathlib.Path as well as plain strings
    heif_error = None

    # HEIC/HEIF fast path
    if path.lower().endswith((".heic", ".heif")):
        try:
            heif = pillow_heif.read_heif(path)
            # Pass the raw mode and row stride explicitly: the decoder's buffer
            # may pad each row, and without the stride Pillow assumes tightly
            # packed rows and mis-decodes such a buffer.
            img = Image.frombytes(
                heif.mode, heif.size, heif.data, "raw", heif.mode, heif.stride
            )
            return np.array(img.convert("RGB"))
        except Exception as exc:
            # Best effort: remember the failure and let PIL try instead.
            heif_error = exc

    try:
        with Image.open(path) as im:
            if getattr(im, "is_animated", False):
                im.seek(0)  # use the first frame only
            return np.array(im.convert("RGB"))
    except Exception as exc:
        if heif_error is not None:
            # Both decoders failed: surface the HEIF error as the cause.
            raise exc from heif_error
        raise


In [None]:
# Build the OCR pipeline once and reuse it for every image: construction loads
# several models (see the log below), so it must stay out of the per-image loop.
ocr = PaddleOCR(
    use_textline_orientation=True,
    lang="en",
)

[32mCreating model: ('PP-LCNet_x1_0_doc_ori', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/root/.paddlex/official_models/PP-LCNet_x1_0_doc_ori`.[0m
[32mCreating model: ('UVDoc', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/root/.paddlex/official_models/UVDoc`.[0m
[32mCreating model: ('PP-LCNet_x1_0_textline_ori', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/root/.paddlex/official_models/PP-LCNet_x1_0_textline_ori`.[0m
[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/root/.paddlex/official_models/PP-OCRv5_server_det`.[0m
[32mCreating model: ('en_PP-OCRv5_mobile_rec', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete

In [None]:
from tqdm import tqdm

# Upper bound on how many images are OCR'd per run (the recorded run below
# needed roughly 17.5 s per image, so a full folder can take a long time).
MAX_IMAGES = 100


def _blank_row(path):
    """Return the result row used when an image yields no text or fails."""
    return {"file": str(path), "text": "", "word_count": 0, "mean_confidence": None}


# Collect candidate images. Sorting makes the MAX_IMAGES slice reproducible:
# rglob yields entries in filesystem order, which is not stable across runs.
all_images = sorted(
    p for p in Path(IMG_DIR).rglob("*")
    if p.is_file()                        # skip directories with image-like names
    and p.suffix.lower() in EXTS
    and not p.name.startswith("._")       # skip macOS resource-fork files
)
files_list = all_images[:MAX_IMAGES]

print(
    f"Found {len(all_images)} images; "
    f"processing {len(files_list)} (limit {MAX_IMAGES})."
)

rows = []
for p in tqdm(files_list, desc="OCR", unit="image"):
    # Start from the empty row; it is replaced only after the whole image has
    # been processed, so a failure part-way through still records a blank row.
    row = _blank_row(p)
    try:
        # NOTE(review): load_image_any returns RGB; confirm the channel order
        # PaddleOCR expects for ndarray input.
        arr = load_image_any(str(p))
        pred = ocr.predict(arr)
        rec = pred[0] if pred else None
        if rec:
            texts = rec.get("rec_texts")
            scores = rec.get("rec_scores")
            # Explicit None/length checks instead of `x or []` / `if scores`:
            # those raise on multi-element NumPy arrays.
            texts = [] if texts is None else list(texts)
            scores = [] if scores is None else list(scores)

            text = "\n".join(t for t in texts if t)
            conf = round(float(np.mean(scores)), 4) if len(scores) else None
            row = {
                "file": str(p),
                "text": text,
                "word_count": len(text.split()),
                "mean_confidence": conf,
            }
        else:
            print(f"(no text) {p}")
    except Exception as e:
        # Keep the batch going: one unreadable image must not abort the run.
        print(f"(error) {p}: {e}")
    rows.append(row)


Found 100 images (limited to 100).


OCR: 100%|██████████| 100/100 [29:11<00:00, 17.52s/image]


In [None]:
# Assemble the per-image records into a table with a fixed column order,
# then persist it without the index column.
df = pd.DataFrame(
    data=rows,
    columns=[
        "file",
        "text",
        "word_count",
        "mean_confidence",
    ],
)
df.to_csv(path_or_buf=CSV_PATH, index=False)
print(f"Saved CSV → {CSV_PATH}  |  {len(df)} rows")

Saved CSV → /content/ocr_results.csv  |  100 rows
