In [1]:
# ============================================================
# MediScan — Pseudo-Label Prediction + Heatmaps (Full Script)
# ============================================================
# Inputs:
#   - Images under C:\Users\sagni\Downloads\MediScan\archive\data  (recursive)
#
# Outputs in C:\Users\sagni\Downloads\MediScan:
#   - medi_scan_features.csv
#   - medi_scan_feature_corr_heatmap.png
#   - medi_scan_confusion_heatmap.png
#   - medi_scan_predictions.csv
#   - medi_scan_model.pkl  (StandardScaler + LogisticRegression pipeline)
#
# Notes:
#   - Uses KMeans to create pseudo-labels (no ground truth).
#   - Trains multinomial LogisticRegression to predict those labels.
#   - Confusion heatmap is w.r.t. pseudo-labels (not true classes).
# ============================================================

from pathlib import Path
from typing import List, Dict, Any

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
from skimage.color import rgb2gray
from skimage.feature import canny

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
import joblib

# ----------------------------
# Paths / Config
# ----------------------------
# Root folder that is scanned recursively for input images.
IN_DIR = Path(r"C:\Users\sagni\Downloads\MediScan\archive\data")

# Folder receiving every artifact written by this script (created if missing).
OUT_DIR = Path(r"C:\Users\sagni\Downloads\MediScan")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Output artifacts, all placed directly inside OUT_DIR.
FEATURES_CSV = OUT_DIR.joinpath("medi_scan_features.csv")            # per-image feature table
PRED_CSV = OUT_DIR.joinpath("medi_scan_predictions.csv")             # per-image predictions
CORR_PNG = OUT_DIR.joinpath("medi_scan_feature_corr_heatmap.png")    # feature correlation plot
CM_PNG = OUT_DIR.joinpath("medi_scan_confusion_heatmap.png")         # confusion-matrix plot
MODEL_PKL = OUT_DIR.joinpath("medi_scan_model.pkl")                  # serialized pipeline

# Lower-case file suffixes (with the dot) that count as images.
IMG_EXTS = {
    ".jpg",
    ".jpeg",
    ".png",
    ".tif",
    ".tiff",
    ".bmp",
    ".webp",
}

# ----------------------------
# Utilities
# ----------------------------
def list_images(root: Path, exts=None) -> List[Path]:
    """
    Recursively collect image files under ``root``.

    Args:
        root: Directory to scan; all sub-directories are searched as well.
        exts: Optional iterable of file suffixes (including the leading dot,
            matched case-insensitively) to accept. When omitted, the
            module-level ``IMG_EXTS`` set is used.

    Returns:
        The matching file paths in sorted order. Sorting matters because the
        directory walk yields entries in a filesystem-dependent order, and the
        row order of this list feeds the seeded clustering and train/test
        split further down.

    Raises:
        FileNotFoundError: If ``root`` does not exist.
    """
    if not root.exists():
        raise FileNotFoundError(f"Input directory does not exist: {root}")
    # Resolve the suffix filter only after the existence check so a bad root
    # is always reported first.
    allowed = IMG_EXTS if exts is None else {e.lower() for e in exts}
    return sorted(
        p for p in root.rglob("*")
        if p.is_file() and p.suffix.lower() in allowed
    )

def _relpath_for(path: Path) -> str:
    """Return ``path`` relative to ``IN_DIR``, or ``str(path)`` if it lies outside ``IN_DIR``."""
    try:
        return str(path.relative_to(IN_DIR))
    except ValueError:
        return str(path)


def _failed_feature_row(path: Path, relpath: str, hist_bins: int, error: Exception) -> Dict[str, Any]:
    """Build the all-NaN row for an unreadable image, using the same columns as a successful row."""
    row: Dict[str, Any] = {"filename": path.name, "relpath": relpath}
    for key in (
        "width", "height", "aspect_ratio",
        "mean_r", "mean_g", "mean_b",
        "std_r", "std_g", "std_b",
        "brightness", "contrast",
        "edge_density",
    ):
        row[key] = np.nan
    # Use the caller's bin count so failed rows line up with successful ones.
    for i in range(hist_bins):
        row[f"hist_{i:02d}"] = np.nan
    row["error"] = str(error)
    return row


def extract_features_one(path: Path, hist_bins: int = 16, edge_resize: int = 256) -> Dict[str, Any]:
    """
    Extract numeric features from an image:
      - width/height/aspect_ratio
      - mean/std of RGB channels
      - grayscale brightness & contrast
      - edge density (Canny on a grayscale copy whose shorter side is scaled
        to ``edge_resize``)
      - grayscale histogram (``hist_bins`` bins, L1-normalized)

    Args:
        path: Image file to read.
        hist_bins: Number of grayscale histogram bins (``hist_00`` ... columns).
        edge_resize: Target length of the shorter side before edge detection.

    Returns:
        A flat dict with ``filename``, ``relpath`` and the feature values.
        If anything goes wrong while reading or processing the image, the
        same keys are returned filled with NaN plus an ``error`` entry holding
        the exception text; the function itself does not raise for bad images.
    """
    # Resolved outside the try block: a path that is not under IN_DIR must not
    # turn an otherwise good image into a failure (or crash the NaN fallback).
    relpath = _relpath_for(path)

    try:
        with Image.open(path) as im:
            im = im.convert("RGB")
            w, h = im.size
            arr = np.asarray(im, dtype=np.float32) / 255.0

        width, height = float(w), float(h)
        aspect = float(w / h) if h else np.nan

        r, g, b = arr[..., 0], arr[..., 1], arr[..., 2]
        mean_r, mean_g, mean_b = float(r.mean()), float(g.mean()), float(b.mean())
        std_r,  std_g,  std_b  = float(r.std()),  float(g.std()),  float(b.std())

        gray = rgb2gray(arr)
        bright = float(gray.mean())
        contrast = float(gray.std())

        # Edge density on resized gray (keep aspect)
        if min(h, w) > 0 and min(h, w) != edge_resize:
            scale = edge_resize / min(h, w)
            new_w, new_h = int(w * scale), int(h * scale)
            gray_small = np.asarray(Image.fromarray((gray*255).astype(np.uint8)).resize((new_w, new_h))) / 255.0
        else:
            gray_small = gray
        edges = canny(gray_small, sigma=1.5)
        edge_density = float(edges.mean())  # fraction of pixels flagged as edges

        # Gray histogram
        hist, _ = np.histogram((gray * 255.0).astype(np.uint8), bins=hist_bins, range=(0, 255))
        hist = hist.astype(np.float32)
        hist = hist / (hist.sum() + 1e-9)  # epsilon guards against division by zero

        feats = {
            "filename": path.name,
            "relpath": relpath,
            "width": width, "height": height, "aspect_ratio": aspect,
            "mean_r": mean_r, "mean_g": mean_g, "mean_b": mean_b,
            "std_r": std_r, "std_g": std_g, "std_b": std_b,
            "brightness": bright, "contrast": contrast,
            "edge_density": edge_density,
        }
        for i, v in enumerate(hist):
            feats[f"hist_{i:02d}"] = float(v)

        return feats

    except Exception as e:
        # Deliberate best-effort: one unreadable file must not abort the whole
        # batch, so report it as a NaN row carrying the error text.
        return _failed_feature_row(path, relpath, hist_bins, e)

def plot_confusion_heatmap(cm: np.ndarray, labels: list, title: str, out_path: Path):
    """
    Render a confusion matrix as an annotated heatmap and save it to disk.

    Args:
        cm: Confusion matrix; rows are labelled "True (pseudo-label)",
            columns "Predicted".
        labels: Tick labels, applied to both axes.
        title: Figure title.
        out_path: Destination image file (written at 220 dpi).
    """
    fig, ax = plt.subplots(figsize=(6, 5))
    heat = ax.imshow(cm, aspect='equal')
    ax.set_title(title)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True (pseudo-label)")
    fig.colorbar(heat)

    tick_positions = range(len(labels))
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(labels)
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(labels)

    # Write each cell's count at its centre (x = column, y = row).
    for (row, col), count in np.ndenumerate(cm):
        ax.text(col, row, str(count), ha='center', va='center')

    fig.tight_layout()
    fig.savefig(out_path, dpi=220)
    plt.close(fig)

# ----------------------------
# 1) Load images & extract features
# ----------------------------
paths = list_images(IN_DIR)
if not paths:
    # Without this guard an empty list yields a column-less DataFrame and the
    # dropna() below fails with an unhelpful KeyError.
    raise RuntimeError(f"No images with extensions {sorted(IMG_EXTS)} found under: {IN_DIR}")

rows = [extract_features_one(p) for p in paths]
df = pd.DataFrame(rows)

# Drop rows missing core features (images that failed to parse come back as NaN rows)
core = ["width", "height", "aspect_ratio", "brightness", "contrast", "edge_density"]
df_clean = df.dropna(subset=core).reset_index(drop=True)

n_dropped = len(df) - len(df_clean)
if n_dropped:
    print(f"[WARN] Skipped {n_dropped} of {len(df)} images with missing core features")
if df_clean.empty:
    # Fail here with a clear message rather than later inside KMeans.
    raise RuntimeError(f"None of the {len(df)} images under {IN_DIR} could be processed")

# Save feature table for reference
df_clean.to_csv(FEATURES_CSV, index=False)
print("[SAVED] Features CSV ->", FEATURES_CSV)

# Numeric feature matrix: every numeric column except the bookkeeping ones
non_feat_cols = {"filename", "relpath", "error"}
feat_cols = [
    c for c in df_clean.columns
    if c not in non_feat_cols and pd.api.types.is_numeric_dtype(df_clean[c])
]
X_all = df_clean[feat_cols].astype(float).values

# ----------------------------
# 2) Correlation heatmap (features)
# ----------------------------
# Pairwise correlation between the numeric feature columns, drawn as a heatmap.
corr = df_clean[feat_cols].corr(numeric_only=True)

corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
im = corr_ax.imshow(corr.values, aspect='auto')

# One tick per feature on both axes; x labels are slanted to stay readable.
corr_ax.set_xticks(range(corr.shape[1]))
corr_ax.set_xticklabels(corr.columns, rotation=45, ha='right')
corr_ax.set_yticks(range(corr.shape[0]))
corr_ax.set_yticklabels(corr.index)

corr_ax.set_title("MediScan: Feature Correlation Heatmap")
corr_fig.colorbar(im)
corr_fig.tight_layout()
corr_fig.savefig(CORR_PNG, dpi=220)
plt.close(corr_fig)
print("[SAVED] Feature correlation heatmap ->", CORR_PNG)

# ----------------------------
# 3) Pseudo-labels with KMeans
# ----------------------------
# Three clusters once at least 60 usable images are available, otherwise two.
if len(df_clean) >= 60:
    k = 3
else:
    k = 2

kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
# NOTE(review): X_all is passed in unscaled, so columns with large values
# (e.g. width/height) presumably dominate the distances — confirm intended.
pseudo_labels = kmeans.fit(X_all).labels_

# ----------------------------
# 4) Train/test split & model training (multinomial Logistic)
# ----------------------------
# Split features, pseudo-labels and row indices together so the index arrays
# identify which rows of df_clean landed in each fold.
split_inputs = (X_all, pseudo_labels, np.arange(len(pseudo_labels)))
try:
    X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
        *split_inputs,
        test_size=0.2, random_state=42, stratify=pseudo_labels
    )
except ValueError as split_err:
    # Stratification is rejected when a cluster has too few members or the
    # test fold is smaller than the number of clusters; fall back to a plain
    # random split instead of aborting the run.
    print(f"[WARN] Stratified split failed ({split_err}); using an unstratified split")
    X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
        *split_inputs,
        test_size=0.2, random_state=42
    )

# Build a pipeline: Standardize -> LogisticRegression (multinomial)
# NOTE(review): `multi_class` is deprecated in scikit-learn >= 1.5 — confirm
# the installed version before upgrading.
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(
        multi_class="multinomial",
        solver="saga",       # good for larger feature spaces
        C=1.0,
        class_weight="balanced",
        max_iter=400,
        n_jobs=-1,
        random_state=42
    ))
])

pipe.fit(X_train, y_train)

# Confusion matrix heatmap on test set. Passing labels explicitly keeps the
# matrix k x k even if a cluster is absent from the test fold.
y_pred_test = pipe.predict(X_test)
cm = confusion_matrix(y_test, y_pred_test, labels=list(range(k)))
plot_confusion_heatmap(
    cm, labels=[f"C{i}" for i in range(k)],
    title="MediScan: Confusion Matrix (pseudo-label classifier)",
    out_path=CM_PNG
)
print("[SAVED] Confusion heatmap ->", CM_PNG)

# ----------------------------
# 5) Predict for ALL images & save CSV (with top-3 probabilities)
# ----------------------------
# Fit scaler on all, then refit LR on all (better final model) — optional but common
# Final model: same scaler + classifier configuration, fitted on every row.
final_lr_params = dict(
    multi_class="multinomial",
    solver="saga",
    C=1.0,
    class_weight="balanced",
    max_iter=400,
    n_jobs=-1,
    random_state=42,
)
pipe_all = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(**final_lr_params)),
])
pipe_all.fit(X_all, pseudo_labels)

# Per-cluster probabilities, shape (N, k), and the predicted cluster per image.
# These are predictions on the same rows the model was fitted on.
probs = pipe_all.predict_proba(X_all)
pred_labels = pipe_all.predict(X_all)

# For readability, compute top-3 (label, prob) pairs per image
def top3(prob_row):
    """Return up to three ``(index, probability)`` pairs, highest probability first."""
    # Ascending argsort read backwards from the end, stopping after three items.
    ranked = np.argsort(prob_row)[:-4:-1]
    pairs = []
    for pos in ranked:
        pairs.append((int(pos), float(prob_row[pos])))
    return pairs

top3_list = list(map(top3, probs))
max_prob = probs.max(axis=1)  # probability of the most likely cluster per image

# Assemble the output columns in their final order before building the frame:
# identifiers, winning cluster + its probability, one column per cluster,
# then a compact string form of the top-3 list.
prediction_columns = {
    "filename": df_clean["filename"],
    "relpath": df_clean["relpath"],
    "pred_cluster": pred_labels,
    "pred_prob": max_prob,
}
prediction_columns.update({f"prob_C{c}": probs[:, c] for c in range(k)})
prediction_columns["top3"] = [str(t) for t in top3_list]
out_df = pd.DataFrame(prediction_columns)

out_df.to_csv(PRED_CSV, index=False)
print("[SAVED] Predictions CSV ->", PRED_CSV)

# Persist the pipeline that was fitted on all rows.
joblib.dump(pipe_all, MODEL_PKL)
print("[SAVED] Model ->", MODEL_PKL)

# Closing summary: list every artifact written by this run.
print("\nDone.")
print("Outputs:")
for artifact in (FEATURES_CSV, CORR_PNG, CM_PNG, PRED_CSV, MODEL_PKL):
    print(" -", artifact)


[SAVED] Features CSV -> C:\Users\sagni\Downloads\MediScan\medi_scan_features.csv
[SAVED] Feature correlation heatmap -> C:\Users\sagni\Downloads\MediScan\medi_scan_feature_corr_heatmap.png




[SAVED] Confusion heatmap -> C:\Users\sagni\Downloads\MediScan\medi_scan_confusion_heatmap.png
[SAVED] Predictions CSV -> C:\Users\sagni\Downloads\MediScan\medi_scan_predictions.csv
[SAVED] Model -> C:\Users\sagni\Downloads\MediScan\medi_scan_model.pkl

Done.
Outputs:
 - C:\Users\sagni\Downloads\MediScan\medi_scan_features.csv
 - C:\Users\sagni\Downloads\MediScan\medi_scan_feature_corr_heatmap.png
 - C:\Users\sagni\Downloads\MediScan\medi_scan_confusion_heatmap.png
 - C:\Users\sagni\Downloads\MediScan\medi_scan_predictions.csv
 - C:\Users\sagni\Downloads\MediScan\medi_scan_model.pkl


