# In [1]:
# ==============================================
# MediScan — Accuracy Graph, CSV, and Heatmap
# ==============================================
# - Scans images under: C:\Users\sagni\Downloads\MediScan\archive\data
# - Extracts numeric features from each image
# - Builds correlation heatmap of features
# - Creates pseudo-labels with KMeans (unsupervised)
# - Trains SGDClassifier (logistic) with partial_fit to log accuracy over epochs
# - Saves:
#     * medi_scan_heatmap.png
#     * medi_scan_accuracy_over_epochs.png
#     * medi_scan_accuracy_over_epochs.csv
#     * medi_scan_features.csv (full feature table)
# ==============================================

import os
import math
from pathlib import Path
from typing import List, Dict, Any

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from PIL import Image, ImageOps
from skimage.color import rgb2gray
from skimage.feature import canny

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score

# ----------------------------
# Paths
# ----------------------------
# Images are read from IN_DIR; every generated artifact is written under OUT_DIR.
IN_DIR = Path(r"C:\Users\sagni\Downloads\MediScan\archive\data")
OUT_DIR = Path(r"C:\Users\sagni\Downloads\MediScan")
OUT_DIR.mkdir(exist_ok=True, parents=True)  # no error if it already exists

# Artifact locations, all directly inside OUT_DIR.
HEATMAP_PNG, ACC_PNG, ACC_CSV, FEATURES_CSV = (
    OUT_DIR / artifact_name
    for artifact_name in (
        "medi_scan_heatmap.png",
        "medi_scan_accuracy_over_epochs.png",
        "medi_scan_accuracy_over_epochs.csv",
        "medi_scan_features.csv",  # bonus: full features dump
    )
)

# ----------------------------
# Helper: list images
# ----------------------------
# Lower-case file suffixes (leading dot included) that list_images() accepts.
IMG_EXTS = {
    ".jpg", ".jpeg",
    ".png",
    ".tif", ".tiff",
    ".bmp",
    ".webp",
}

def list_images(root: Path) -> List[Path]:
    """
    Recursively collect the image files found below ``root``.

    A file is treated as an image when its suffix, lower-cased, is in IMG_EXTS.
    The result is sorted: ``Path.rglob`` yields entries in no particular order,
    and everything downstream (feature table rows, train/test split, SGD sample
    order) depends on the order of this list, so sorting keeps runs reproducible.

    Returns an empty list, after printing an error, when ``root`` does not exist.
    """
    if not root.exists():
        print(f"[ERROR] Input directory does not exist: {root}")
        return []
    return sorted(
        p for p in root.rglob("*")
        if p.is_file() and p.suffix.lower() in IMG_EXTS
    )

# ----------------------------
# Feature extraction
# ----------------------------
def _relpath_or_full(path: Path) -> str:
    """Return ``path`` relative to IN_DIR, or ``path`` itself when it is not under IN_DIR."""
    try:
        return str(path.relative_to(IN_DIR))
    except ValueError:
        # relative_to() raises ValueError for a path that is not inside IN_DIR
        return str(path)


def extract_features_one(path: Path, hist_bins: int = 16, edge_resize: int = 256) -> Dict[str, Any]:
    """
    Extract simple numeric features from one image file.

    Features:
      - width, height, aspect_ratio
      - mean/std per RGB channel
      - brightness (gray mean) & contrast (gray std)
      - edge density (Canny on a copy resized so the shorter side is edge_resize)
      - grayscale histogram (hist_bins bins, L1-normalized) as hist_00, hist_01, ...

    Every image is converted to RGB first, so paletted and other non-RGB modes
    are handled the same way.

    Parameters:
      path        -- image file to read
      hist_bins   -- number of grayscale histogram bins
      edge_resize -- target length (pixels) of the shorter side before Canny

    Returns a dict with "filename", "relpath" and the feature values. This
    function never raises for an unreadable image: on any failure the feature
    values are NaN and an extra "error" key holds the exception text, so the
    caller can drop or report the row.
    """
    # Resolved up front so the error path below cannot raise on its own when
    # ``path`` lies outside IN_DIR.
    relpath = _relpath_or_full(path)

    try:
        with Image.open(path) as im:
            # Some images could be paletted (mode "P"), convert to RGB
            im = im.convert("RGB")
            w, h = im.size
            arr = np.asarray(im, dtype=np.float32) / 255.0  # (H, W, 3) in [0,1]

        # Basic geometry
        width = float(w)
        height = float(h)
        aspect = float(w / h) if h else np.nan

        # Channel stats
        r, g, b = arr[..., 0], arr[..., 1], arr[..., 2]
        mean_r, mean_g, mean_b = float(r.mean()), float(g.mean()), float(b.mean())
        std_r, std_g, std_b = float(r.std()), float(g.std()), float(b.std())

        # Grayscale for brightness/contrast & edges
        gray = rgb2gray(arr)  # [0,1]
        bright = float(gray.mean())
        contrast = float(gray.std())

        # Edge density (resize shorter side to edge_resize for stability)
        shorter = min(h, w)
        if shorter > 0 and shorter != edge_resize:
            scale = edge_resize / shorter
            new_w, new_h = int(w * scale), int(h * scale)
            gray_u8 = Image.fromarray((gray * 255).astype(np.uint8))
            gray_small = np.asarray(gray_u8.resize((new_w, new_h), Image.BILINEAR)) / 255.0
        else:
            gray_small = gray

        edges = canny(gray_small, sigma=1.5)
        edge_density = float(edges.mean())  # fraction of edge pixels

        # Grayscale histogram
        hist, _ = np.histogram((gray * 255.0).astype(np.uint8), bins=hist_bins, range=(0, 255))
        hist = hist.astype(np.float32)
        hist = hist / (hist.sum() + 1e-9)  # L1 normalize; epsilon avoids 0/0

        # Pack features
        feats = {
            "filename": path.name,
            "relpath": relpath,
            "width": width, "height": height, "aspect_ratio": aspect,
            "mean_r": mean_r, "mean_g": mean_g, "mean_b": mean_b,
            "std_r": std_r, "std_g": std_g, "std_b": std_b,
            "brightness": bright, "contrast": contrast,
            "edge_density": edge_density,
        }
        # append histogram bins
        for i, v in enumerate(hist):
            feats[f"hist_{i:02d}"] = float(v)

        return feats

    except Exception as e:
        # Deliberately broad: one bad file must not abort the whole scan.
        # Return NaNs (plus the error text) so the row can be dropped later.
        return {
            "filename": path.name,
            "relpath": relpath,
            "width": np.nan, "height": np.nan, "aspect_ratio": np.nan,
            "mean_r": np.nan, "mean_g": np.nan, "mean_b": np.nan,
            "std_r": np.nan, "std_g": np.nan, "std_b": np.nan,
            "brightness": np.nan, "contrast": np.nan,
            "edge_density": np.nan,
            **{f"hist_{i:02d}": np.nan for i in range(hist_bins)},
            "error": str(e),
        }

# ----------------------------
# Build feature table
# ----------------------------
paths = list_images(IN_DIR)
if not paths:
    raise FileNotFoundError(f"No images found in {IN_DIR}")

# One feature dict per image; unreadable images come back as NaN rows
rows = [extract_features_one(p) for p in paths]
df = pd.DataFrame(rows)

# Drop rows where core fields are missing
core_cols = ["width", "height", "aspect_ratio", "brightness", "contrast", "edge_density"]
df_clean = df.dropna(subset=core_cols).reset_index(drop=True)

# Report what was dropped instead of discarding it silently: the "error" text
# only lives in df, and df_clean is what gets saved and modelled.
n_failed = len(df) - len(df_clean)
if n_failed:
    print(f"[WARN] Skipped {n_failed} of {len(df)} image(s) with missing core features")
    if "error" in df.columns:
        failed = df.loc[df["error"].notna(), ["relpath", "error"]]
        for bad_relpath, bad_error in failed.head(5).itertuples(index=False):
            print(f"       {bad_relpath}: {bad_error}")

# Fail here with a clear message rather than deep inside plotting/scaling
if df_clean.empty:
    raise ValueError(f"None of the {len(df)} image(s) under {IN_DIR} could be processed")

# Keep only numeric columns for modeling/heatmap
num_cols = [c for c in df_clean.columns if c not in ["filename", "relpath", "error"]]
num_cols = [c for c in num_cols if pd.api.types.is_numeric_dtype(df_clean[c])]
X_all = df_clean[num_cols].astype(float)

# ----------------------------
# Correlation heatmap
# ----------------------------
# Correlation matrix over all numeric feature columns
corr = X_all.corr(numeric_only=True)
n_rows, n_cols = corr.shape

# Draw the matrix as an image, one tick per feature on each axis
plt.figure(figsize=(10, 8))
im = plt.imshow(corr.values, aspect='auto')
plt.title("MediScan: Feature Correlation Heatmap")
plt.yticks(range(n_rows), corr.index)
plt.xticks(range(n_cols), corr.columns, rotation=45, ha='right')
plt.colorbar(im)
plt.tight_layout()
plt.savefig(HEATMAP_PNG, dpi=220)
plt.close()
print(f"[SAVED] Heatmap -> {HEATMAP_PNG}")

# Also dump the cleaned feature table so it can be inspected by hand
df_clean.to_csv(FEATURES_CSV, index=False)
print(f"[SAVED] Features CSV -> {FEATURES_CSV}")

# ----------------------------
# Pseudo-labels via KMeans (unsupervised)
# ----------------------------
# Standardize features for clustering/classification
feature_matrix = X_all.values
scaler = StandardScaler()
X_scaled = scaler.fit_transform(feature_matrix)

# Number of clusters: 3 once there are at least 60 usable images, otherwise 2
if len(df_clean) >= 60:
    k = 3
else:
    k = 2

# Cluster ids double as the pseudo-labels for the classifier below
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
pseudo_labels = kmeans.fit_predict(X_scaled)

# ----------------------------
# Accuracy over epochs (SGD on pseudo-labels)
# ----------------------------
# Stratified split on pseudo-labels
# Prefer a stratified split so every cluster appears in both folds. sklearn
# raises ValueError when that is impossible (a cluster with a single member, or
# a test fold smaller than the number of clusters), which is common on small
# image sets, so fall back to a plain random split instead of crashing.
split_kwargs = dict(test_size=0.2, random_state=42)
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, pseudo_labels, stratify=pseudo_labels, **split_kwargs
    )
except ValueError as split_err:
    print(f"[WARN] Stratified split not possible ({split_err}); using a plain random split")
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, pseudo_labels, **split_kwargs
    )

clf = SGDClassifier(
    loss="log_loss",   # logistic loss; SGDClassifier handles multiclass one-vs-all
    alpha=1e-4,
    max_iter=1,        # we will loop epochs manually
    learning_rate="optimal",
    random_state=42,
    warm_start=True
)

classes = np.unique(pseudo_labels)  # e.g., array([0,1,2])
epochs = 15
train_acc, test_acc = [], []

for epoch in range(epochs):
    # One pass over the training data per call. `classes` is required on the
    # first call and may be repeated afterwards, so no special first-epoch case.
    clf.partial_fit(X_train, y_train, classes=classes)

    yhat_tr = clf.predict(X_train)
    yhat_te = clf.predict(X_test)

    train_acc.append(accuracy_score(y_train, yhat_tr))
    test_acc.append(accuracy_score(y_test, yhat_te))

# Plot accuracy curve
epoch_numbers = range(1, epochs+1)  # epochs are reported 1-based

plt.figure(figsize=(8, 4.5))
for acc_series, marker_style, series_label in (
    (train_acc, 'o', "Train Accuracy"),
    (test_acc, 's', "Test Accuracy"),
):
    plt.plot(epoch_numbers, acc_series, marker=marker_style, label=series_label)
plt.title("MediScan: Accuracy over Epochs (pseudo-label classifier)")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig(ACC_PNG, dpi=220)
plt.close()
print(f"[SAVED] Accuracy graph -> {ACC_PNG}")

# Same numbers as the plot, one row per epoch
acc_table = pd.DataFrame({
    "epoch": list(epoch_numbers),
    "train_accuracy": train_acc,
    "test_accuracy": test_acc
})
acc_table.to_csv(ACC_CSV, index=False)
print(f"[SAVED] Accuracy CSV -> {ACC_CSV}")

# Final summary of everything written by this script
print("\nDone.")
print("Outputs:")
for saved_path in (HEATMAP_PNG, ACC_PNG, ACC_CSV, FEATURES_CSV):
    print(" -", saved_path)


# Captured output of the cell above:
# [SAVED] Heatmap -> C:\Users\sagni\Downloads\MediScan\medi_scan_heatmap.png
# [SAVED] Features CSV -> C:\Users\sagni\Downloads\MediScan\medi_scan_features.csv
# [SAVED] Accuracy graph -> C:\Users\sagni\Downloads\MediScan\medi_scan_accuracy_over_epochs.png
# [SAVED] Accuracy CSV -> C:\Users\sagni\Downloads\MediScan\medi_scan_accuracy_over_epochs.csv
#
# Done.
# Outputs:
#  - C:\Users\sagni\Downloads\MediScan\medi_scan_heatmap.png
#  - C:\Users\sagni\Downloads\MediScan\medi_scan_accuracy_over_epochs.png
#  - C:\Users\sagni\Downloads\MediScan\medi_scan_accuracy_over_epochs.csv
#  - C:\Users\sagni\Downloads\MediScan\medi_scan_features.csv
