In [6]:
# ==========================================================
# ArtifyAI — Heatmap + Accuracy Graph + CSV (All-in-One, fixed .str.strip)
# ==========================================================
from pathlib import Path
from typing import List, Dict, Any
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from PIL import Image, UnidentifiedImageError
from skimage.color import rgb2gray
from skimage.feature import canny

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# ----------------------------
# Paths / Config
# ----------------------------
# Root folder that is scanned recursively for input images.
IN_ROOT = Path(r"C:\Users\sagni\Downloads\Artify AI\archive")
# Folder that receives every generated artifact; created here if it is missing.
OUT_DIR = Path(r"C:\Users\sagni\Downloads\Artify AI")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Output artifact locations (all directly inside OUT_DIR).
FEATURES_CSV = OUT_DIR / "artify_features.csv"
HEATMAP_PNG  = OUT_DIR / "artify_feature_corr_heatmap.png"
ACC_PNG      = OUT_DIR / "artify_accuracy_over_epochs.png"
ACC_CSV      = OUT_DIR / "artify_accuracy_over_epochs.csv"
CM_PNG       = OUT_DIR / "artify_confusion_matrix.png"
REPORT_TXT   = OUT_DIR / "artify_classification_report.txt"

# File extensions (lower-case, with leading dot) accepted by list_images().
IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff", ".webp"}
# Number of grayscale histogram bins, i.e. number of hist_XX feature columns.
HIST_BINS = 16
# Target length (pixels) of the shorter image side before edge detection.
EDGE_RESIZE = 256
# Number of passes of the manual training loop.
EPOCHS = 12
# Test fraction handed to train_test_split.
TEST_SIZE = 0.2
# Seed shared by the train/test split and the SGD classifier.
RANDOM_STATE = 42
# Labels with fewer samples than this are dropped before training.
MIN_SAMPLES_PER_CLASS = 2

# ----------------------------
# Helpers
# ----------------------------
def list_images(root: Path):
    """Return every regular file below *root* (recursively) whose suffix,
    compared case-insensitively, is listed in IMG_EXTS."""
    found = []
    for candidate in root.rglob("*"):
        if not candidate.is_file():
            continue
        if candidate.suffix.lower() in IMG_EXTS:
            found.append(candidate)
    return found

def _failed_feature_row(reason: str) -> Dict[str, Any]:
    """Build the all-NaN placeholder row for an image that could not be read.

    The row has the same feature keys, in the same order, as a successful
    extraction, plus an ``"error"`` key holding *reason*.
    """
    scalar_keys = (
        "width", "height", "aspect_ratio",
        "mean_r", "mean_g", "mean_b",
        "std_r", "std_g", "std_b",
        "brightness", "contrast", "edge_density",
    )
    row: Dict[str, Any] = {key: np.nan for key in scalar_keys}
    for i in range(HIST_BINS):
        row[f"hist_{i:02d}"] = np.nan
    row["error"] = reason
    return row

def extract_features_one(path: Path) -> Dict[str, Any]:
    """Compute the engineered feature row for a single image file.

    Features: pixel width/height and aspect ratio, per-channel RGB mean and
    standard deviation, grayscale mean ("brightness") and standard deviation
    ("contrast"), the fraction of Canny edge pixels ("edge_density") and a
    normalised HIST_BINS-bin grayscale histogram ("hist_00", "hist_01", ...).

    Args:
        path: Location of the image file.

    Returns:
        A dict of float features. If the file cannot be opened or decoded,
        every feature is NaN and an extra ``"error"`` key is set to
        ``"unidentified_image"`` (format not recognised) or
        ``"unreadable_image"`` (any other OS / decoding error), so one bad
        file does not abort a whole scan.
    """
    # Only the loading step is guarded: this is where Pillow raises.
    try:
        with Image.open(path) as im:
            im = im.convert("RGB")
            w, h = im.size
            arr = np.asarray(im, dtype=np.float32) / 255.0
    except UnidentifiedImageError:
        return _failed_feature_row("unidentified_image")
    except OSError:
        # Image.open() only reads the header; pixel data is decoded in
        # convert(), where truncated/corrupt files raise a plain OSError.
        return _failed_feature_row("unreadable_image")

    r, g, b = arr[..., 0], arr[..., 1], arr[..., 2]
    gray = rgb2gray(arr)  # [0,1]

    # basic geometry + color stats
    feats = {
        "width": float(w),
        "height": float(h),
        "aspect_ratio": float(w / h) if h else np.nan,
        "mean_r": float(r.mean()), "mean_g": float(g.mean()), "mean_b": float(b.mean()),
        "std_r": float(r.std()),  "std_g": float(g.std()),  "std_b": float(b.std()),
        "brightness": float(gray.mean()),
        "contrast": float(gray.std()),
    }

    # edge density: rescale so the shorter side is about EDGE_RESIZE pixels,
    # which keeps the Canny sigma comparable across image resolutions
    if min(h, w) > 0 and min(h, w) != EDGE_RESIZE:
        scale = EDGE_RESIZE / min(h, w)
        gray_small = np.asarray(
            Image.fromarray((gray * 255).astype(np.uint8)).resize((int(w * scale), int(h * scale)))
        ) / 255.0
    else:
        gray_small = gray
    feats["edge_density"] = float(canny(gray_small, sigma=1.5).mean())

    # grayscale histogram, normalised to sum to 1 (epsilon guards a zero sum)
    hist, _ = np.histogram((gray * 255.0).astype(np.uint8), bins=HIST_BINS, range=(0, 255))
    hist = (hist.astype(np.float32) / (hist.sum() + 1e-9)).tolist()
    for i, v in enumerate(hist):
        feats[f"hist_{i:02d}"] = float(v)

    return feats

def derive_label_rel_first(root: Path, path: Path) -> str:
    """Label an image by the first directory component below *root*.

    Files located directly in *root* (no sub-folder) are labelled "misc".
    Raises ValueError (from Path.relative_to) if *path* is not under *root*.
    """
    rel_parts = path.relative_to(root).parts
    if len(rel_parts) < 2:
        return "misc"
    return rel_parts[0]

def derive_label_from_filename(name: str) -> str:
    """Derive a label from a file name: the part of the stem before the first
    "_" if the stem contains one, otherwise before the first "-", otherwise
    the whole stem."""
    base = Path(name).stem
    if "_" in base:
        return base.partition("_")[0]
    if "-" in base:
        return base.partition("-")[0]
    return base

def build_manifest(root: Path) -> pd.DataFrame:
    """Scan *root* for images and return one row per file.

    Each row holds the folder-derived label, file name, path relative to
    *root*, resolved absolute path and the features from
    extract_features_one(). Progress is printed every 500 files. Rows with
    missing width or height are removed; an empty frame is returned as-is.
    """
    image_paths = list_images(root)
    total = len(image_paths)
    records = []
    for idx, img_path in enumerate(image_paths, start=1):
        features = extract_features_one(img_path)
        record = {
            "label": derive_label_rel_first(root, img_path),
            "filename": img_path.name,
            "relpath": str(img_path.relative_to(root)),
            "abspath": str(img_path.resolve()),
        }
        record.update(features)
        records.append(record)
        if idx % 500 == 0:
            print(f"[SCAN] {idx}/{total}")

    manifest = pd.DataFrame(records)
    if manifest.empty:
        return manifest
    return manifest.dropna(subset=["width", "height"]).reset_index(drop=True)

# ----------------------------
# 1) Scan + features → CSV
# ----------------------------
# Build the per-image feature table; build_manifest() has already removed
# rows with missing width/height.
df = build_manifest(IN_ROOT)
if df.empty:
    raise SystemExit("No images found. Check IN_ROOT path and file types.")

# Fallback to filename-based labels if only one folder-label exists
# (a single folder label carries no class information to learn from).
labels_before = sorted(df["label"].unique().tolist())
if len(labels_before) == 1:
    print(f"[WARN] Only one label from folders: {labels_before[0]}")
    print("[INFO] Falling back to filename-based labels…")
    df["label"] = df["filename"].map(derive_label_from_filename)

# Normalize labels: cast to str and trim surrounding whitespace so labels
# that differ only by padding collapse into one class.
df["label"] = df["label"].astype(str).str.strip()

# Save features CSV (always) — written before any rare-class filtering, so it
# contains every successfully scanned image.
# FEATURES_CSV.parent is OUT_DIR; the mkdir is a safeguard in case it was removed.
FEATURES_CSV.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(FEATURES_CSV, index=False)
print(f"[SAVED] {FEATURES_CSV}")

# ----------------------------
# 2) Correlation heatmap (numeric-only)
# ----------------------------
# Numeric feature columns only: metadata/text columns are excluded by name and
# anything non-numeric by dtype. num_cols is reused later as the model's
# feature column list.
num_cols = [c for c in df.columns
            if c not in {"label", "filename", "relpath", "abspath", "error"}
            and pd.api.types.is_numeric_dtype(df[c])]
corr = df[num_cols].corr(numeric_only=True)

# Draw the correlation matrix as an image with feature names on both axes.
# NOTE(review): no vmin/vmax is passed, so the color scale presumably follows
# the data range rather than a fixed [-1, 1] — confirm this is intended.
plt.figure(figsize=(10, 7.5))
im = plt.imshow(corr.values, aspect='auto')
plt.xticks(range(corr.shape[1]), corr.columns, rotation=45, ha='right')
plt.yticks(range(corr.shape[0]), corr.index)
plt.title("ArtifyAI: Feature Correlation Heatmap")
plt.colorbar(im)
plt.tight_layout()
plt.savefig(HEATMAP_PNG, dpi=220)
plt.close()  # release the figure so later plots start from a clean state
print(f"[SAVED] {HEATMAP_PNG}")

# ----------------------------
# 3) Drop rare classes (< MIN_SAMPLES_PER_CLASS)
# ----------------------------
# Per-label sample counts; labels under the minimum are removed from df.
vc = df["label"].value_counts()
rare = vc.loc[vc < MIN_SAMPLES_PER_CLASS]
if len(rare) > 0:
    print(f"[WARN] Dropping rare classes (<{MIN_SAMPLES_PER_CLASS} samples): {list(rare.index)}")
    keep_mask = ~df["label"].isin(rare.index)
    df = df.loc[keep_mask].reset_index(drop=True)

# Remaining label set, sorted for stable reporting.
classes = sorted(df["label"].unique().tolist())
print(f"[INFO] Classes after filtering ({len(classes)}): {classes}")

# A classifier needs at least two classes; otherwise stop cleanly (exit code 0).
if len(classes) < 2:
    print("\n[SKIP] Not enough classes after filtering. "
          "Accuracy graph & CM require >= 2 classes with >= 2 samples each.")
    raise SystemExit(0)

# ----------------------------
# 4) Train/Test split (stratified → fallback)
# ----------------------------
# Encode string labels as integers and pull the numeric features as floats.
le = LabelEncoder()
le.fit(df["label"].astype(str))
y = le.transform(df["label"].astype(str))
X = df[num_cols].astype(float).to_numpy()

# Try a stratified split first; if scikit-learn rejects it (ValueError),
# repeat the split with the same size and seed but without stratification.
_split_kwargs = {"test_size": TEST_SIZE, "random_state": RANDOM_STATE}
try:
    _split = train_test_split(X, y, stratify=y, **_split_kwargs)
except ValueError as e:
    print(f"[WARN] Stratified split failed ({e}); using non-stratified split.")
    _split = train_test_split(X, y, stratify=None, **_split_kwargs)
X_train, X_test, y_train, y_test = _split

# ----------------------------
# 5) Train over epochs (SGD + Standardize)
# ----------------------------
# Standardize features, then a logistic-regression model trained with SGD.
pipe = Pipeline(steps=[
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("clf", SGDClassifier(
        loss="log_loss",
        alpha=1e-4,
        max_iter=1,          # manual epochs (only used by fit(); see loop below)
        learning_rate="optimal",
        random_state=RANDOM_STATE,
        warm_start=True
    ))
])

# Train the pipeline's own step objects directly so that `pipe` itself ends up
# fitted and can still be used for prediction afterwards.
scaler = pipe.named_steps["scaler"]
clf = pipe.named_steps["clf"]

# The training data never changes, so fit the scaler once instead of per epoch.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# partial_fit needs the full class list up front; use the classes present in
# the training split, which is what fit() would have learned.
train_classes = np.unique(y_train)

# One partial_fit call == one pass over the training data. Unlike repeated
# fit() calls with warm_start, partial_fit keeps the sample counter that drives
# the "optimal" learning-rate schedule, so the step size decays across epochs
# instead of restarting every epoch (and no per-epoch ConvergenceWarning).
train_acc, test_acc = [], []
for ep in range(EPOCHS):
    clf.partial_fit(X_train_scaled, y_train, classes=train_classes)
    y_tr_pred = clf.predict(X_train_scaled)
    y_te_pred = clf.predict(X_test_scaled)
    train_acc.append(accuracy_score(y_train, y_tr_pred))
    test_acc.append(accuracy_score(y_test, y_te_pred))

# Accuracy curve + CSV
# Plot train and test accuracy against the 1-based epoch number.
plt.figure(figsize=(8, 4.8))
plt.plot(range(1, EPOCHS+1), train_acc, marker='o', label="Train Accuracy")
plt.plot(range(1, EPOCHS+1), test_acc,  marker='s', label="Test Accuracy")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.title("ArtifyAI: Accuracy over Epochs (Engineered Features + SGD)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig(ACC_PNG, dpi=220)
plt.close()
print(f"[SAVED] {ACC_PNG}")

# Persist the same per-epoch numbers as a table (one row per epoch).
pd.DataFrame({"epoch": range(1, EPOCHS+1),
              "train_accuracy": train_acc,
              "test_accuracy": test_acc}).to_csv(ACC_CSV, index=False)
print(f"[SAVED] {ACC_CSV}")

# ----------------------------
# 6) Confusion matrix + robust report
# ----------------------------
# Final predictions on the held-out split.
y_pred = pipe.predict(X_test)
# Restrict to encoded classes that occur in the true or predicted test labels,
# so the matrix/report and their name list always have matching lengths.
labels_present = np.unique(np.concatenate([y_test, y_pred]))
# Map the encoded integers back to the original label strings.
class_names_present = [le.classes_[i] for i in labels_present]

cm = confusion_matrix(y_test, y_pred, labels=labels_present)
# Figure width grows with the number of classes, clamped to the range [7.5, 14].
plt.figure(figsize=(max(7.5, min(14, 0.45*len(class_names_present)+4)), 6.5))
im = plt.imshow(cm, aspect='auto')
plt.title("ArtifyAI: Confusion Matrix (Styles)")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.colorbar(im)
plt.xticks(range(len(class_names_present)), class_names_present, rotation=45, ha='right')
plt.yticks(range(len(class_names_present)), class_names_present)
plt.tight_layout()
plt.savefig(CM_PNG, dpi=220)
plt.close()
print(f"[SAVED] {CM_PNG}")

# Text report for the same label subset, with 4 decimal places.
# NOTE(review): zero_division=0 presumably reports 0 (without a warning) for
# metrics whose denominator is zero — confirm against scikit-learn docs.
report = classification_report(
    y_test, y_pred,
    labels=labels_present,
    target_names=class_names_present,
    digits=4,
    zero_division=0
)
with open(REPORT_TXT, "w", encoding="utf-8") as f:
    f.write("=== ArtifyAI: Engineered Features + SGDClassifier ===\n\n")
    f.write(report + "\n")
print(f"[SAVED] {REPORT_TXT}")

print("\n[DONE] All artifacts saved in:", OUT_DIR)


[WARN] Only one label from folders: misc
[INFO] Falling back to filename-based labels…
[SAVED] C:\Users\sagni\Downloads\Artify AI\artify_features.csv
[SAVED] C:\Users\sagni\Downloads\Artify AI\artify_feature_corr_heatmap.png
[WARN] Dropping rare classes (<2 samples): ['Camille Pissarro', 'Lubo Kristek', 'Oscar Florianus Bluemner', 'Paul Nash', 'Salvador Dali']
[INFO] Classes after filtering (4): ['Edvard Munch', 'Katsushika Hokusai', 'Leonardo Da Vinci', 'Vincent Van Gogh']
[WARN] Stratified split failed (The test_size = 3 should be greater or equal to the number of classes = 4); using non-stratified split.




[SAVED] C:\Users\sagni\Downloads\Artify AI\artify_accuracy_over_epochs.png
[SAVED] C:\Users\sagni\Downloads\Artify AI\artify_accuracy_over_epochs.csv
[SAVED] C:\Users\sagni\Downloads\Artify AI\artify_confusion_matrix.png
[SAVED] C:\Users\sagni\Downloads\Artify AI\artify_classification_report.txt

[DONE] All artifacts saved in: C:\Users\sagni\Downloads\Artify AI
