# In [None]:

import os, cv2
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from google.colab import drive
# Mount Google Drive in the Colab runtime at /content/drive.
drive.mount('/content/drive')
# Source and destination folders. Both are placeholders and must be replaced
# with real paths before running.
# NOTE(review): the two placeholders are identical; if they stay identical,
# process_all() writes "<stem>.jpg" into the folder it reads from, which
# overwrites any input that is already a .jpg with the same stem.
input_dir  = "path"
output_dir = "path"
# Create the destination folder up front; exist_ok avoids an error on re-runs.
os.makedirs(output_dir, exist_ok=True)

# ---------------- CONFIG ----------------
# Fractions of the image height/width that strict_crop() removes from each edge.
CROP_TOP    = 0.15   # remove 15% from top
CROP_BOTTOM = 0.20   # remove 20% from bottom
CROP_LEFT   = 0.10   # remove 10% from left
CROP_RIGHT  = 0.18   # remove 18% from right

# CLAHE step in preprocess(): enabled by DO_CLAHE; CLAHE_CLIP is passed as
# clipLimit and CLAHE_TILE as tileGridSize to cv2.createCLAHE.
DO_CLAHE = True
CLAHE_CLIP = 2.0
CLAHE_TILE = (8,8)

# Resize step in preprocess(): enabled by DO_RESIZE; RESIZE_TO is passed as the
# target size to cv2.resize, which interprets it as (width, height).
DO_RESIZE = True
RESIZE_TO = (3000,3000)

def is_image_file(name):
    """Return True when *name* ends with a supported image extension.

    The comparison is case-insensitive. Recognized suffixes are
    .png, .jpg, .jpeg, .bmp, .tif and .tiff.
    """
    image_suffixes = (".png", ".jpg", ".jpeg", ".bmp", ".tif", ".tiff")
    lowered = name.lower()
    return any(lowered.endswith(suffix) for suffix in image_suffixes)

def strict_crop(img):
    """Trim fixed fractional margins from every edge of *img*.

    The margins come from the module-level CROP_TOP, CROP_BOTTOM, CROP_LEFT
    and CROP_RIGHT fractions, each applied to the image height or width and
    truncated to whole pixels. The result is a slice of the input array.
    """
    height, width = img.shape[:2]
    # Row range: skip the top margin, stop before the bottom margin.
    row_span = slice(int(height * CROP_TOP), int(height * (1 - CROP_BOTTOM)))
    # Column range: skip the left margin, stop before the right margin.
    col_span = slice(int(width * CROP_LEFT), int(width * (1 - CROP_RIGHT)))
    return img[row_span, col_span]

def preprocess(img):
    """Crop, optionally contrast-equalize, and optionally resize one image.

    Steps, in order:
      1. ``strict_crop`` removes the configured margins.
      2. If ``DO_CLAHE`` is set, the crop is reduced to grayscale, CLAHE is
         applied with ``CLAHE_CLIP`` / ``CLAHE_TILE``, and the result is
         expanded back to three identical BGR channels.
      3. If ``DO_RESIZE`` is set, the image is resized to ``RESIZE_TO``.

    Args:
        img: Image array as returned by ``cv2.imread`` (3-channel BGR, or a
            2-D single-channel array).

    Returns:
        The processed image array.

    Raises:
        ValueError: If the crop margins leave no pixels in the image.
    """
    cropped = strict_crop(img)
    if cropped.size == 0:
        # Fail with a readable message instead of an opaque cv2.error later on.
        raise ValueError(
            f"Image of shape {img.shape} is too small for the configured crop margins"
        )
    if DO_CLAHE:
        # A 2-D array is already single-channel; BGR2GRAY would reject it.
        if cropped.ndim == 2:
            g = cropped
        else:
            g = cv2.cvtColor(cropped, cv2.COLOR_BGR2GRAY)
        clahe = cv2.createCLAHE(clipLimit=CLAHE_CLIP, tileGridSize=CLAHE_TILE)
        g = clahe.apply(g)
        cropped = cv2.cvtColor(g, cv2.COLOR_GRAY2BGR)
    if DO_RESIZE:
        src_h, src_w = cropped.shape[:2]
        dst_w, dst_h = RESIZE_TO  # cv2.resize takes (width, height)
        # INTER_AREA is intended for shrinking; when enlarging it degrades to
        # nearest-neighbour-like output, so use INTER_CUBIC in that case.
        shrinking = dst_w <= src_w and dst_h <= src_h
        interpolation = cv2.INTER_AREA if shrinking else cv2.INTER_CUBIC
        cropped = cv2.resize(cropped, RESIZE_TO, interpolation=interpolation)
    return cropped


def preview_one(path):
    """Show one image side by side with its preprocessed version.

    Args:
        path: Path of the image file to load with ``cv2.imread``.

    Raises:
        OSError: If ``cv2.imread`` cannot load the file (it returns ``None``
            for missing or undecodable files rather than raising).
    """
    img = cv2.imread(path, cv2.IMREAD_COLOR)
    if img is None:
        # Mirror the None check in process_all(); without it the failure
        # surfaces as an AttributeError deep inside strict_crop().
        raise OSError(f"Could not read image: {path!r}")
    processed = preprocess(img)

    plt.figure(figsize=(12, 6))

    # Matplotlib expects RGB, while OpenCV arrays are BGR.
    plt.subplot(1, 2, 1)
    plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    plt.title("Original")
    plt.axis("off")

    plt.subplot(1, 2, 2)
    plt.imshow(cv2.cvtColor(processed, cv2.COLOR_BGR2RGB))
    plt.title("Strict Cleaned (No Text)")
    plt.axis("off")

    plt.show()


def process_all():
    """Preprocess every image in ``input_dir`` and save it to ``output_dir``.

    Each readable image is passed through ``preprocess`` and written as
    ``<stem>.jpg`` (JPEG quality 95). Files that cannot be read or written
    are skipped and listed at the end instead of being dropped silently.

    Note: inputs that share a stem (e.g. ``a.png`` and ``a.jpg``) map to the
    same output file, so the one processed later overwrites the earlier one.
    Files are handled in sorted name order so that outcome is deterministic.
    """
    files = sorted(f for f in os.listdir(input_dir) if is_image_file(f))
    print(f"Found {len(files)} images.")

    unreadable = []  # cv2.imread returned None
    unsaved = []     # cv2.imwrite returned False

    for f in tqdm(files, desc="Cleaning"):
        in_path = os.path.join(input_dir, f)
        img = cv2.imread(in_path, cv2.IMREAD_COLOR)
        if img is None:
            unreadable.append(f)
            continue
        processed = preprocess(img)
        out_path = os.path.join(output_dir, os.path.splitext(f)[0] + ".jpg")
        # imwrite signals failure through its boolean result, not an exception.
        if not cv2.imwrite(out_path, processed, [cv2.IMWRITE_JPEG_QUALITY, 95]):
            unsaved.append(f)

    if unreadable:
        print(f"Skipped {len(unreadable)} unreadable file(s): {unreadable}")
    if unsaved:
        print(f"Failed to save {len(unsaved)} file(s): {unsaved}")
    print(" Done! Clean images saved to:", output_dir)

# Run full cleaning on every image in input_dir. This call is live: comment it
# out while tuning the config if the cell should not write any files.
process_all()