In [1]:
import os, sys, json, time, glob, traceback
from dataclasses import dataclass, asdict
from typing import List, Tuple, Optional, Dict
from collections import defaultdict

import numpy as np
import pandas as pd
import cv2, h5py, yaml
from tqdm import tqdm
from skimage.feature import local_binary_pattern

# ========= EDIT THESE TWO PATHS IF NEEDED =========
# ROOT_DIR: folder scanned for images (either class subfolders or a flat folder).
# OUT_DIR : folder that receives the generated index artifacts (.h5/.pkl/.yaml/.json).
ROOT_DIR = r"C:\Users\NXTWAVE\Downloads\Vision Care\archive\Odir5k preprocessed with CLAHE\training images"
OUT_DIR  = r"C:\Users\NXTWAVE\Downloads\Vision Care"
# ==================================================

# File extensions (lower-case, with leading dot) that is_image() accepts.
IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff", ".webp"}

# Feature params
# IMG_SIZE is the letterbox canvas every image is resized onto before features are computed.
IMG_SIZE   = (256, 256)     # (w,h)
# LBP_P = number of neighbour points, LBP_R = radius, LBP_METHOD = skimage LBP variant.
LBP_P, LBP_R, LBP_METHOD = 8, 1, "uniform"  # uniform-> P+2 bins

@dataclass
class Item:
    """Metadata record for one image whose features were extracted.

    main() creates one instance per processed image and serialises the
    records with ``asdict`` into the ``items`` list of the JSON index.
    """
    id: str  # position of the sample among the extracted samples, as a string
    path: str  # path of the image file on disk
    label_name: str  # class name (lower-cased folder name or CSV label)
    label_id: int  # integer id of label_name in the label map
    width: Optional[int] = None  # original image width in pixels, before resizing
    height: Optional[int] = None  # original image height in pixels, before resizing
    extra: Optional[dict] = None  # free-form slot; main() always passes None

def ensure_dir(p: str) -> None:
    """Create directory *p* (and any missing parents); do nothing if it already exists."""
    os.makedirs(p, exist_ok=True)

def list_subdirs(path: str) -> List[str]:
    """Return the full paths of the immediate subdirectories of *path*.

    Entries are returned in the order ``os.listdir`` yields them; plain files
    are ignored and nested directories are not descended into.
    """
    subdirs = []
    for entry in os.listdir(path):
        full_path = os.path.join(path, entry)
        if os.path.isdir(full_path):
            subdirs.append(full_path)
    return subdirs

def list_files_recursive(dir_path: str) -> List[str]:
    """Return the path of every file found anywhere below *dir_path*.

    The tree is traversed with ``os.walk``; directories themselves are not
    included in the result.
    """
    return [
        os.path.join(parent, fname)
        for parent, _dirs, fnames in os.walk(dir_path)
        for fname in fnames
    ]

def is_image(path: str) -> bool:
    """Tell whether *path* has one of the extensions listed in ``IMG_EXTS``.

    Only the file name is inspected (case-insensitively); the file is never
    opened, so the content is not validated.
    """
    _stem, extension = os.path.splitext(path)
    return extension.lower() in IMG_EXTS

def imread_color_unicode(path: str):
    """Load an image as a 3-channel colour array by decoding its raw bytes.

    The file is read with ``np.fromfile`` and decoded in memory with
    ``cv2.imdecode`` instead of being opened by OpenCV directly
    (presumably so that non-ASCII paths work on Windows — TODO confirm).

    Returns ``None`` for an empty file; otherwise returns whatever
    ``cv2.imdecode`` produces for the bytes.
    """
    raw_bytes = np.fromfile(path, dtype=np.uint8)
    if raw_bytes.size > 0:
        return cv2.imdecode(raw_bytes, cv2.IMREAD_COLOR)
    return None

def resize_letterbox(img, target_wh):
    """Resize *img* to fit inside *target_wh* keeping its aspect ratio, padded with zeros.

    Args:
        img: 3-channel image array of shape ``(h, w, 3)``.
        target_wh: ``(width, height)`` of the output canvas.

    Returns:
        A ``(height, width, 3)`` array of the same dtype as *img* with the
        scaled image centred and the remaining border filled with zeros.
    """
    tw, th = target_wh
    h, w = img.shape[:2]
    scale = min(tw / w, th / h)
    # Truncation can yield 0 for very elongated inputs (e.g. 1x1000 -> 0x256),
    # which cv2.resize rejects; keep every side within [1, target].
    nw = min(tw, max(1, int(w * scale)))
    nh = min(th, max(1, int(h * scale)))
    resized = cv2.resize(img, (nw, nh), interpolation=cv2.INTER_AREA)
    canvas = np.zeros((th, tw, 3), dtype=img.dtype)
    # Centre the resized image on the canvas.
    y0, x0 = (th - nh) // 2, (tw - nw) // 2
    canvas[y0:y0 + nh, x0:x0 + nw] = resized
    return canvas

def lbp_hist(gray):
    """Return the normalised histogram of local-binary-pattern codes of *gray*.

    The LBP is computed with the module-level ``LBP_P``, ``LBP_R`` and
    ``LBP_METHOD`` settings. For the "uniform" method the histogram has
    ``LBP_P + 2`` bins; for any other method the bin count is taken from the
    largest code present in this particular image.

    Returns:
        1-D ``float32`` array; bins have unit width, so with ``density=True``
        the entries sum to 1.
    """
    codes = local_binary_pattern(gray, LBP_P, LBP_R, LBP_METHOD)
    if LBP_METHOD == "uniform":
        n_bins = LBP_P + 2
    else:
        n_bins = int(codes.max() + 1)
    density, _edges = np.histogram(
        codes.ravel(), bins=n_bins, range=(0, n_bins), density=True
    )
    return density.astype("float32")

def extract_features(img_bgr):
    """Build the feature vector for one colour image.

    The image is letterboxed to ``IMG_SIZE`` and described by, in order:
    the LBP histogram of channel index 1 (green), the three per-channel
    means, and the three per-channel standard deviations (plus 1e-8).
    Channel statistics follow the channel order of the input array.

    Returns:
        1-D ``float32`` array: LBP bins followed by 6 colour statistics.
    """
    boxed = resize_letterbox(img_bgr, IMG_SIZE)
    # Per-channel statistics over all pixels, padding included.
    channel_means = boxed.mean(axis=(0, 1))
    channel_stds = boxed.std(axis=(0, 1)) + 1e-8
    color_stats = np.concatenate([channel_means, channel_stds], axis=0).astype("float32")
    texture = lbp_hist(boxed[:, :, 1])
    return np.concatenate([texture, color_stats], axis=0).astype("float32")

def collect_images_flat(root_dir: str) -> List[str]:
    """Return image paths for a dataset that has no class subfolders.

    Images sitting directly in *root_dir* win: if at least one exists, only
    those are returned. Otherwise the whole tree below *root_dir* is searched.
    """
    direct = []
    for entry in os.listdir(root_dir):
        candidate = os.path.join(root_dir, entry)
        if is_image(candidate):
            direct.append(candidate)
    if not direct:
        # Nothing at the top level: fall back to a recursive search.
        return [p for p in list_files_recursive(root_dir) if is_image(p)]
    return direct

# ODIR one-hot disease columns, in the priority order used to pick one label.
_ODIR_CODES = ("N", "D", "G", "C", "A", "H", "M", "O")

# Single-text-column label names accepted next to a "filename" column.
_TEXT_LABEL_COLUMNS = ("diagnosis", "labels", "label_text", "disease", "diseases")

# Non-capturing group: a capturing group makes Series.str.contains emit a UserWarning.
_IMAGE_NAME_PATTERN = r"\.(?:jpg|jpeg|png|bmp|tif|tiff)$"


def _read_csv_lenient(csv_path: str) -> Optional[pd.DataFrame]:
    """Read *csv_path* with the default encoding, then latin-1; None if both fail."""
    for kwargs in ({}, {"encoding": "latin-1"}):
        try:
            return pd.read_csv(csv_path, **kwargs)
        except Exception:
            # Deliberately best-effort: an unreadable CSV is skipped, not fatal.
            continue
    return None


def _primary_odir_code(row) -> Optional[str]:
    """Return the first ODIR code whose (lower-cased) column is set in *row*."""
    for code in _ODIR_CODES:
        key = code.lower()  # columns were lower-cased by the caller
        if key not in row.index:
            continue
        raw = row[key]
        if str(raw).strip() in ("", "0", "0.0", "False", "false"):
            continue
        try:
            if float(raw) > 0:
                return code
        except (TypeError, ValueError):
            # Non-numeric but non-empty marker: treat as set.
            return code
    return None


def _label_frame(df: pd.DataFrame, name_col: str, label_col: str, csv_path: str) -> pd.DataFrame:
    """Project *df* onto string columns ``filename``/``label`` plus the source path."""
    out = df[[name_col, label_col]].rename(
        columns={name_col: "filename", label_col: "label"}
    ).copy()
    out["filename"] = out["filename"].astype(str)
    out["label"] = out["label"].astype(str)
    out["__source_csv__"] = csv_path
    return out


def try_load_labels_from_csv(root_dir: str) -> Optional[pd.DataFrame]:
    """Look for a label CSV in *root_dir* and its parent and normalise it.

    CSVs in *root_dir* are tried first, then those in its parent directory.
    Column names are matched case-insensitively. The first CSV matching one
    of these layouts wins:

    1. ``filename`` + ``label``
    2. ``id`` + ``label``
    3. ``filename`` + one of diagnosis/labels/label_text/disease/diseases
    4. ODIR multi-hot columns N,D,G,C,A,H,M,O plus a name column
       (``filename``, ``id``, or the first column containing image file names);
       the label is the first code set in priority order, else "unknown".

    Label text is returned verbatim as a string (a cell holding ``['N']``
    yields exactly that text).

    Returns:
        DataFrame with string columns ``filename`` and ``label`` and a
        ``__source_csv__`` column holding the CSV path, or ``None`` when no
        usable CSV is found.
    """
    # glob.escape keeps directory names containing [, ], * or ? from being
    # interpreted as glob patterns.
    candidates = glob.glob(os.path.join(glob.escape(root_dir), "*.csv"))
    candidates += glob.glob(os.path.join(glob.escape(os.path.dirname(root_dir)), "*.csv"))

    for csv_path in candidates:
        df = _read_csv_lenient(csv_path)
        if df is None:
            continue

        df.columns = [str(c).lower() for c in df.columns]
        cols = set(df.columns)

        # 1) filename + label, 2) id + label
        if "label" in cols:
            for name_col in ("filename", "id"):
                if name_col in cols:
                    return _label_frame(df, name_col, "label", csv_path)

        # 3) filename + a single free-text label column
        if "filename" in cols:
            for txtcol in _TEXT_LABEL_COLUMNS:
                if txtcol in cols:
                    return _label_frame(df, "filename", txtcol, csv_path)

        # 4) ODIR multi-hot columns
        if all(code.lower() in cols for code in _ODIR_CODES):
            name_col = None
            if "filename" in cols:
                name_col = "filename"
            elif "id" in cols:
                name_col = "id"
            else:
                # Guess: first column in which some value ends like an image file.
                for c in df.columns:
                    looks_like_file = df[c].astype(str).str.contains(
                        _IMAGE_NAME_PATTERN, case=False, regex=True
                    )
                    if looks_like_file.any():
                        name_col = c
                        break
            if name_col is not None:
                # Row-wise loop instead of df.apply(axis=1) so an empty frame
                # still yields a well-formed (empty) result.
                primary = [
                    _primary_odir_code(row) or "unknown" for _, row in df.iterrows()
                ]
                out = pd.DataFrame({"filename": df[name_col].astype(str)})
                out["label"] = primary
                out["label"] = out["label"].astype(str)
                out["__source_csv__"] = csv_path
                return out

    return None

def _collect_records(root_dir: str) -> List[Tuple[str, str]]:
    """Return ``(image_path, label_name)`` pairs for the images under *root_dir*.

    If *root_dir* has subfolders, each (lower-cased) folder name is the label
    of the images below it. Otherwise labels come from a CSV found by
    ``try_load_labels_from_csv``; images without a match get ``"unknown"``.
    """
    class_dirs = sorted(list_subdirs(root_dir), key=lambda p: os.path.basename(p).lower())
    if class_dirs:
        records: List[Tuple[str, str]] = []
        for cdir in class_dirs:
            cname = os.path.basename(cdir).lower()
            records.extend((p, cname) for p in list_files_recursive(cdir) if is_image(p))
        print(f"[INFO] Using {len(class_dirs)} class subfolders as labels.")
        return records

    # Flat folder: try to map file names to labels through a CSV.
    print("[INFO] No class subfolders found. Searching for CSV labels...")
    df_lbl = try_load_labels_from_csv(root_dir)
    imgs = collect_images_flat(root_dir)
    if not imgs:
        raise AssertionError(f"No images found in {root_dir}")

    if df_lbl is None:
        # Last resort: everything is 'unknown'.
        print("[WARN] No CSV labels found. Assigning label 'unknown' to all images.")
        return [(p, "unknown") for p in imgs]

    # Index labels both by full file name and by stem (name without extension).
    lbl_map_full: Dict[str, str] = {}
    lbl_map_stem: Dict[str, str] = {}
    for fname, lab in zip(df_lbl["filename"].astype(str), df_lbl["label"]):
        base = os.path.basename(fname).lower()
        lbl_map_full[base] = str(lab)
        lbl_map_stem[os.path.splitext(base)[0]] = str(lab)

    records = []
    miss = 0
    for p in imgs:
        base = os.path.basename(p).lower()
        label = lbl_map_full.get(base)
        if label is None:
            label = lbl_map_stem.get(os.path.splitext(base)[0], "unknown")
        if label == "unknown":
            miss += 1
        # NOTE(review): labels are used verbatim, so a CSV column holding list
        # literals such as "['N']" produces class names like "['n']".
        records.append((p, str(label).strip().lower()))

    # Positional access: works for any index and for an empty label frame.
    has_source = "__source_csv__" in df_lbl.columns and len(df_lbl) > 0
    source = df_lbl["__source_csv__"].iloc[0] if has_source else "?"
    print(f"[INFO] Mapped {len(imgs)-miss} / {len(imgs)} images via CSV ({source})")
    if miss:
        print(f"[WARN] {miss} images without a CSV label -> 'unknown'")
    return records


def _extract_all(records: List[Tuple[str, str]], label_to_id: Dict[str, int]):
    """Extract features for every record; return ``(feature_list, item_list)``.

    Images that cannot be decoded or whose processing raises are reported
    with a ``[WARN]`` line and skipped, so both lists stay aligned.
    """
    feats, items = [], []
    for path, lname in tqdm(records):
        try:
            img = imread_color_unicode(path)
            if img is None:
                print(f"[WARN] {path} -> could not be decoded as an image; skipped")
                continue
            h, w = img.shape[:2]
            feat = extract_features(img)
        except Exception as e:
            # Best-effort per image: one bad file must not abort the whole run.
            print(f"[WARN] {path} -> {e}")
            continue
        items.append(Item(
            id=str(len(items)), path=path, label_name=lname, label_id=label_to_id[lname],
            width=w, height=h, extra=None
        ))
        feats.append(feat)
    return feats, items


def _write_artifacts(X, y, items, label_names, label_to_id, out_dir: str,
                     base: str = "visioncare_index") -> Dict[str, str]:
    """Write the HDF5, pickle, YAML and JSON artifacts; return their paths by kind."""
    ensure_dir(out_dir)
    paths = [it.path for it in items]
    total_dim = int(X.shape[1])
    # NOTE: the "rgb_*" key names are kept for compatibility; the values follow
    # the channel order of the decoded image (OpenCV decodes colour images as B, G, R).
    # The LBP width is derived from X (6 colour statistics follow the histogram),
    # so it stays correct if LBP_METHOD is not "uniform".
    feature_schema = {
        "order": ["lbp_green_hist", "rgb_means", "rgb_stds"],
        "dims": {"lbp_green_hist": total_dim - 6, "rgb_means": 3, "rgb_stds": 3},
        "total_dim": total_dim
    }

    # HDF5
    h5_path = os.path.join(out_dir, f"{base}.h5")
    with h5py.File(h5_path, "w") as f:
        f.create_dataset("X", data=X, compression="gzip", compression_opts=4)
        f.create_dataset("y", data=y, compression="gzip", compression_opts=4)
        dt = h5py.special_dtype(vlen=str)
        ds_paths = f.create_dataset("filenames", (len(paths),), dtype=dt)
        ds_paths[:] = paths
        f.attrs["img_size"] = json.dumps({"w": IMG_SIZE[0], "h": IMG_SIZE[1]})
        f.attrs["lbp"] = json.dumps({"P": LBP_P, "R": LBP_R, "method": LBP_METHOD})
        # Unlike the YAML/JSON files, this attribute maps id -> name.
        f.attrs["label_map"] = json.dumps({str(i): name for name, i in label_to_id.items()})
        f.attrs["feature_schema"] = json.dumps(feature_schema)

    # PKL
    pkl_path = os.path.join(out_dir, f"{base}.pkl")
    df = pd.DataFrame(X)
    df["label_id"] = y
    df["label_name"] = [it.label_name for it in items]
    df["path"] = paths
    df["width"] = [it.width for it in items]
    df["height"] = [it.height for it in items]
    df.to_pickle(pkl_path)

    # YAML
    yaml_path = os.path.join(out_dir, f"{base}.yaml")
    by_label_counts = defaultdict(int)
    for it in items:
        by_label_counts[it.label_name] += 1
    stats = {
        "total_items": int(len(items)),
        "classes": int(len(label_names)),
        "by_label": {name: int(by_label_counts[name]) for name in label_names},
        "img_size": {"width": IMG_SIZE[0], "height": IMG_SIZE[1]},
        "lbp": {"P": LBP_P, "R": LBP_R, "method": LBP_METHOD},
        "feature_schema": feature_schema,
        "label_map": {name: int(idx) for name, idx in label_to_id.items()}
    }
    with open(yaml_path, "w", encoding="utf-8") as fy:
        yaml.safe_dump(stats, fy, sort_keys=False, allow_unicode=True)

    # JSON
    json_path = os.path.join(out_dir, f"{base}.json")
    payload = {
        "version": "1.0",
        "created_at": time.strftime("%Y-%m-%d %H:%M:%S"),
        "label_map": {name: int(idx) for name, idx in label_to_id.items()},
        "items": [asdict(it) for it in items]
    }
    with open(json_path, "w", encoding="utf-8") as fj:
        json.dump(payload, fj, indent=2, ensure_ascii=False)

    return {"h5": h5_path, "pkl": pkl_path, "yaml": yaml_path, "json": json_path}


def main():
    """Index the images under ``ROOT_DIR`` and write feature artifacts to ``OUT_DIR``.

    Steps: discover ``(path, label)`` records, build the label map, extract
    one feature vector per readable image, then save the HDF5, pickle, YAML
    and JSON artifacts and print a summary.

    Raises:
        AssertionError: if ``ROOT_DIR`` is missing, no images/records are
            found, or no features could be extracted. Raised explicitly (not
            via ``assert``) so the checks also run under ``python -O``.
    """
    if not os.path.isdir(ROOT_DIR):
        raise AssertionError(f"Folder not found: {ROOT_DIR}")
    # Created up front so an unwritable output location fails before extraction.
    ensure_dir(OUT_DIR)

    # 1-2) Gather records: (path, label_name)
    records = _collect_records(ROOT_DIR)
    if not records:
        raise AssertionError("No labeled records found.")

    # 3) Build label map (ids follow the sorted label names)
    label_names = sorted({lab for _, lab in records})
    label_to_id = {name: i for i, name in enumerate(label_names)}
    print("[INFO] Label map:", label_to_id)

    # 4) Extract features
    print("[INFO] Extracting features...")
    feats, items = _extract_all(records, label_to_id)
    if not feats:
        raise AssertionError("No features extracted. Check images.")
    X = np.stack(feats, axis=0)
    y = np.array([it.label_id for it in items], dtype=np.int64)
    print(f"[INFO] X shape: {X.shape} | samples: {len(items)} | classes: {len(label_names)}")

    # 5) Save artifacts
    artifacts = _write_artifacts(X, y, items, label_names, label_to_id, OUT_DIR)

    print("\n[SUMMARY]")
    print("Artifacts created in:", OUT_DIR)
    print(" - HDF5:", artifacts["h5"])
    print(" - PKL :", artifacts["pkl"])
    print(" - YAML:", artifacts["yaml"])
    print(" - JSON:", artifacts["json"])

# Run the whole pipeline as soon as this cell/script is executed.
main()


[INFO] No class subfolders found. Searching for CSV labels...
[INFO] Mapped 6392 / 7000 images via CSV (C:\Users\NXTWAVE\Downloads\Vision Care\archive\Odir5k preprocessed with CLAHE\full_df.csv)
[WARN] 608 images without a CSV label -> 'unknown'
[INFO] Label map: {"['a']": 0, "['c']": 1, "['d']": 2, "['g']": 3, "['h']": 4, "['m']": 5, "['n']": 6, "['o']": 7, 'unknown': 8}
[INFO] Extracting features...


100%|███████████████████████████████████████████████████████████████████████████████████████████| 7000/7000 [03:07<00:00, 37.43it/s]

[INFO] X shape: (7000, 16) | samples: 7000 | classes: 9

[SUMMARY]
Artifacts created in: C:\Users\NXTWAVE\Downloads\Vision Care
 - HDF5: C:\Users\NXTWAVE\Downloads\Vision Care\visioncare_index.h5
 - PKL : C:\Users\NXTWAVE\Downloads\Vision Care\visioncare_index.pkl
 - YAML: C:\Users\NXTWAVE\Downloads\Vision Care\visioncare_index.yaml
 - JSON: C:\Users\NXTWAVE\Downloads\Vision Care\visioncare_index.json



