In [1]:
# ==========================================================
# DeepFakeShield — Accuracy Graph + Heatmaps (Full Script)
# ==========================================================
# Inputs (recursive):
#   Train/Real:
#     C:\Users\sagni\Downloads\DeepFakeShield\archive\train-20250112T065955Z-001\train\real
#   Train/Fake:
#     C:\Users\sagni\Downloads\DeepFakeShield\archive\train-20250112T065955Z-001\train\fake
#   Sample/Fake:
#     C:\Users\sagni\Downloads\DeepFakeShield\archive\Sample_fake_images\Sample_fake_images\fake
#   Test/Fake:
#     C:\Users\sagni\Downloads\DeepFakeShield\archive\test-20250112T065939Z-001\test\fake
#   Test/Real:
#     C:\Users\sagni\Downloads\DeepFakeShield\archive\test-20250112T065939Z-001\test\real
#
# Outputs (saved to C:\Users\sagni\Downloads\DeepFakeShield):
#   - deepfakeshield_features.csv
#   - deepfakeshield_feature_corr_heatmap.png
#   - deepfakeshield_confusion_heatmap.png
#   - deepfakeshield_accuracy_over_epochs.png
#   - deepfakeshield_accuracy_over_epochs.csv
#   - deepfakeshield_classification_report.txt
# ==========================================================

from pathlib import Path
from typing import List, Dict, Any, Tuple
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from PIL import Image
from skimage.color import rgb2gray
from skimage.feature import canny

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report
)

# ----------------------------
# Paths / Config
# ----------------------------
# Directory that receives every artifact written by this script; it is
# created (including parents) if it does not exist yet.
OUT_DIR = Path(r"C:\Users\sagni\Downloads\DeepFakeShield")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Input folders as (directory, split name, label name) triples. scan_dirs()
# unpacks them in that order and maps the label name "fake" to 1, anything
# else to 0.
DIRS: List[Tuple[Path, str, str]] = [
    (Path(r"C:\Users\sagni\Downloads\DeepFakeShield\archive\train-20250112T065955Z-001\train\real"), "train", "real"),
    (Path(r"C:\Users\sagni\Downloads\DeepFakeShield\archive\train-20250112T065955Z-001\train\fake"), "train", "fake"),
    (Path(r"C:\Users\sagni\Downloads\DeepFakeShield\archive\Sample_fake_images\Sample_fake_images\fake"), "sample", "fake"),
    (Path(r"C:\Users\sagni\Downloads\DeepFakeShield\archive\test-20250112T065939Z-001\test\fake"), "test", "fake"),
    (Path(r"C:\Users\sagni\Downloads\DeepFakeShield\archive\test-20250112T065939Z-001\test\real"), "test", "real"),
]

# Lower-case file suffixes treated as images; list_images() compares them
# against `suffix.lower()` of each file.
IMG_EXTS = {".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp", ".webp", ".gif"}

# Output artifact locations (all inside OUT_DIR).
FEATURES_CSV = OUT_DIR / "deepfakeshield_features.csv"
CORR_PNG     = OUT_DIR / "deepfakeshield_feature_corr_heatmap.png"
CM_PNG       = OUT_DIR / "deepfakeshield_confusion_heatmap.png"
ACC_PNG      = OUT_DIR / "deepfakeshield_accuracy_over_epochs.png"
ACC_CSV      = OUT_DIR / "deepfakeshield_accuracy_over_epochs.csv"
REPORT_TXT   = OUT_DIR / "deepfakeshield_classification_report.txt"

# ----------------------------
# Utilities
# ----------------------------
def list_images(root: Path) -> List[Path]:
    """
    Recursively collect the image files below `root`.

    A file counts as an image when its lower-cased suffix is in IMG_EXTS.
    A missing `root` is reported with a warning and yields an empty list.
    """
    if root.exists():
        matches: List[Path] = []
        for candidate in root.rglob("*"):
            if not candidate.is_file():
                continue
            if candidate.suffix.lower() in IMG_EXTS:
                matches.append(candidate)
        return matches

    print(f"[WARN] Missing directory (skipped): {root}")
    return []

def extract_features_one(path: Path, hist_bins: int = 16, edge_resize: int = 256) -> Dict[str, Any]:
    """
    Extract numeric features from a single image file.

    Features:
      - width / height / aspect_ratio
      - per-channel RGB mean and std (pixel values scaled to [0, 1])
      - grayscale brightness (mean) and contrast (std)
      - edge density: fraction of Canny edge pixels on the grayscale image
        after its shorter side is resized to `edge_resize` (aspect kept)
      - grayscale histogram, L1-normalized, stored as
        'hist_00' .. 'hist_<hist_bins - 1>'

    Args:
        path: Image file to read.
        hist_bins: Number of histogram bins, i.e. number of 'hist_XX' keys.
        edge_resize: Target length of the shorter side before edge detection.

    Returns:
        Dict mapping feature name to float. If anything goes wrong while
        reading or processing the image, every feature is NaN (same keys as
        a successful call with the same `hist_bins`) and an extra 'error'
        key holds the exception text, so the row can be dropped later.
    """
    try:
        with Image.open(path) as im:
            im = im.convert("RGB")
            w, h = im.size
            arr = np.asarray(im, dtype=np.float32) / 255.0  # HxWx3 in [0,1]

        width, height = float(w), float(h)
        aspect = float(w / h) if h else np.nan

        r, g, b = arr[..., 0], arr[..., 1], arr[..., 2]
        mean_r, mean_g, mean_b = float(r.mean()), float(g.mean()), float(b.mean())
        std_r,  std_g,  std_b  = float(r.std()),  float(g.std()),  float(b.std())

        gray = rgb2gray(arr)  # [0,1]
        brightness = float(gray.mean())
        contrast   = float(gray.std())

        # Edge density on resized gray (keep aspect ratio); skip the resize
        # when the shorter side already has the target length.
        shorter = min(h, w)
        if shorter > 0 and shorter != edge_resize:
            scale = edge_resize / shorter
            new_w, new_h = int(w * scale), int(h * scale)
            gray_small = np.asarray(Image.fromarray((gray*255).astype(np.uint8)).resize((new_w, new_h))) / 255.0
        else:
            gray_small = gray
        edges = canny(gray_small, sigma=1.5)
        edge_density = float(edges.mean())

        # Grayscale histogram over 8-bit intensities, normalized to sum to 1
        # (the epsilon avoids division by zero).
        hist, _ = np.histogram((gray*255.0).astype(np.uint8), bins=hist_bins, range=(0, 255))
        hist = hist.astype(np.float32)
        hist = hist / (hist.sum() + 1e-9)

        feats = {
            "width": width, "height": height, "aspect_ratio": aspect,
            "mean_r": mean_r, "mean_g": mean_g, "mean_b": mean_b,
            "std_r": std_r, "std_g": std_g, "std_b": std_b,
            "brightness": brightness, "contrast": contrast,
            "edge_density": edge_density,
        }
        for i, v in enumerate(hist):
            feats[f"hist_{i:02d}"] = float(v)
        return feats

    except Exception as e:
        # Deliberate best-effort: one unreadable file must not abort a whole
        # directory scan. Return NaNs so the row can be dropped later.
        base = {
            "width": np.nan, "height": np.nan, "aspect_ratio": np.nan,
            "mean_r": np.nan, "mean_g": np.nan, "mean_b": np.nan,
            "std_r": np.nan, "std_g": np.nan, "std_b": np.nan,
            "brightness": np.nan, "contrast": np.nan,
            "edge_density": np.nan,
        }
        # Use the requested bin count (not a fixed 16) so failed rows carry
        # the same histogram columns as successful ones.
        for i in range(hist_bins):
            base[f"hist_{i:02d}"] = np.nan
        base["error"] = str(e)
        return base

def scan_dirs(dirs: List[Tuple[Path, str, str]]) -> pd.DataFrame:
    """
    Build one feature row per image found under the given directories.

    Each entry of `dirs` is (root directory, split name, label name). The
    numeric label is 1 when the label name equals "fake" (case-insensitive)
    and 0 otherwise. Rows carry the split, label and file location columns
    followed by everything returned by extract_features_one(). A non-empty
    result is sorted by split, label name and filename with a fresh index.
    Per-directory image counts and the total row count are printed.
    """
    records: List[Dict[str, Any]] = []
    for root, split, label_name in dirs:
        target = int(label_name.lower() == "fake")
        image_paths = list_images(root)
        for img in image_paths:
            features = extract_features_one(img)
            record = dict(
                split=split,
                label_name=label_name,
                label=target,
                filename=img.name,
                relpath=str(img.relative_to(root)),
                abspath=str(img.resolve()),
            )
            # Feature keys come last, matching the original column order.
            record.update(features)
            records.append(record)
        print(f"[INFO] {split}/{label_name}: {len(image_paths)} images")

    table = pd.DataFrame(records)
    if not table.empty:
        table = table.sort_values(by=["split", "label_name", "filename"], ignore_index=True)
    print(f"[INFO] Total rows: {len(table)}")
    return table

def plot_confusion_heatmap(cm: np.ndarray, labels: list, title: str, out_path: Path):
    """
    Render a confusion matrix as an annotated heatmap and save it to disk.

    `cm` is drawn with rows on the "True" axis and columns on the
    "Predicted" axis, `labels` name the ticks on both axes, and every cell
    is annotated with its value. The figure is written to `out_path` at
    220 dpi and then closed.
    """
    fig, ax = plt.subplots(figsize=(6, 5))
    heat = ax.imshow(cm, aspect='equal')
    ax.set_title(title)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    fig.colorbar(heat)

    tick_positions = range(len(labels))
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(labels)
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(labels)

    # Text x is the column index, y is the row index.
    for (row, col), count in np.ndenumerate(cm):
        ax.text(col, row, str(count), ha='center', va='center')

    fig.tight_layout()
    fig.savefig(out_path, dpi=220)
    plt.close(fig)

# ----------------------------
# 1) Build feature table
# ----------------------------
# Scan every configured directory; stop right away when nothing was found.
df = scan_dirs(DIRS)
if df.empty:
    raise SystemExit("No images found. Please check your paths.")

# Keep only rows where all core features are present (a failed extraction
# leaves them as NaN), then persist the cleaned table.
core = ["width", "height", "aspect_ratio", "brightness", "contrast", "edge_density"]
df_clean = df[df[core].notna().all(axis=1)].reset_index(drop=True)
df_clean.to_csv(FEATURES_CSV, index=False)
print(f"[SAVED] Features CSV -> {FEATURES_CSV}")

# ----------------------------
# 2) Feature correlation heatmap (all splits together)
# ----------------------------
# Feature columns: every numeric column that is not bookkeeping metadata.
feat_cols = [
    col
    for col in df_clean.columns
    if col not in {"split", "label_name", "label", "filename", "relpath", "abspath", "error"}
    and pd.api.types.is_numeric_dtype(df_clean[col])
]
corr = df_clean[feat_cols].corr(numeric_only=True)

# Draw the correlation matrix with feature names on both axes.
plt.figure(figsize=(12, 9))
im = plt.imshow(corr.to_numpy(), aspect='auto')
plt.title("DeepFakeShield: Feature Correlation Heatmap")
plt.xticks(range(len(corr.columns)), corr.columns, rotation=45, ha='right', fontsize=8)
plt.yticks(range(len(corr.index)), corr.index, fontsize=8)
plt.colorbar(im)
plt.tight_layout()
plt.savefig(CORR_PNG, dpi=220)
plt.close()
print(f"[SAVED] Feature correlation heatmap -> {CORR_PNG}")

# ----------------------------
# 3) Train (train split) / Test (test + sample)
# ----------------------------
# Rows from the "train" split fit the models; "test" and "sample" rows are
# pooled into the evaluation set.
train_df = df_clean.loc[df_clean["split"].eq("train")].copy()
test_df  = df_clean.loc[df_clean["split"].isin(["test","sample"])].copy()

if train_df.empty or test_df.empty:
    raise SystemExit("Train or Test split is empty. Ensure your folders have images.")

# Feature matrices as float arrays, labels as int arrays.
X_train = train_df[feat_cols].to_numpy(dtype=float)
X_test  = test_df[feat_cols].to_numpy(dtype=float)
y_train = train_df["label"].to_numpy(dtype=int)
y_test  = test_df["label"].to_numpy(dtype=int)

# Standardize: statistics come from the training data only and are then
# applied unchanged to the evaluation data.
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train)
X_test_sc  = scaler.transform(X_test)

# ----------------------------
# 4) Accuracy over epochs (SGD logistic)
# ----------------------------
# Logistic-loss SGD model; the loop below drives the epochs manually with
# one partial_fit call per iteration.
clf = SGDClassifier(
    loss="log_loss",
    learning_rate="optimal",
    alpha=1e-4,
    max_iter=1,
    warm_start=True,
    random_state=42,
)
classes = np.unique(y_train)
epochs = 15
train_acc = []
test_acc = []

for epoch in range(epochs):
    # The class list is only handed over on the very first call.
    clf.partial_fit(X_train_sc, y_train, classes=classes if epoch == 0 else None)

    yhat_tr, yhat_te = clf.predict(X_train_sc), clf.predict(X_test_sc)

    train_acc.append(accuracy_score(y_train, yhat_tr))
    test_acc.append(accuracy_score(y_test, yhat_te))

# 1-based epoch numbers shared by the plot and the CSV.
epoch_numbers = list(range(1, epochs + 1))

# Accuracy curves for both sets on one figure.
plt.figure(figsize=(8, 4.5))
plt.plot(epoch_numbers, train_acc, marker='o', label="Train Accuracy")
plt.plot(epoch_numbers, test_acc, marker='s', label="Test Accuracy")
plt.title("DeepFakeShield: Accuracy over Epochs (SGD Logistic)")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.savefig(ACC_PNG, dpi=220)
plt.close()
print(f"[SAVED] Accuracy graph -> {ACC_PNG}")

# Same numbers as a table, one row per epoch.
pd.DataFrame(
    {
        "epoch": epoch_numbers,
        "train_accuracy": train_acc,
        "test_accuracy": test_acc,
    }
).to_csv(ACC_CSV, index=False)
print(f"[SAVED] Accuracy CSV -> {ACC_CSV}")

# ----------------------------
# 5) Final evaluation (balanced Logistic Regression) + Confusion heatmap
# ----------------------------
# Class-weight-balanced logistic regression fitted on the standardized
# training features and evaluated on the pooled test features.
final_lr = LogisticRegression(
    penalty="l2",
    solver="saga",
    class_weight="balanced",
    max_iter=600,
    random_state=42,
    n_jobs=-1,
)
y_pred = final_lr.fit(X_train_sc, y_train).predict(X_test_sc)

# Confusion matrix with a fixed label order: 0 = real, 1 = fake.
cm = confusion_matrix(y_test, y_pred, labels=[0,1])
plot_confusion_heatmap(
    cm,
    labels=["Real (0)", "Fake (1)"],
    title="DeepFakeShield: Confusion Matrix (Balanced LR)",
    out_path=CM_PNG,
)
print(f"[SAVED] Confusion matrix heatmap -> {CM_PNG}")

# Text report: a header line, a blank line, then the per-class metrics.
report = classification_report(y_test, y_pred, digits=4)
with REPORT_TXT.open("w", encoding="utf-8") as f:
    f.write("=== DeepFakeShield: Balanced Logistic Regression (Test) ===\n\n" + report + "\n")
print(f"[SAVED] Classification report -> {REPORT_TXT}")

# Final summary of everything that was written.
print("\n[DONE] Outputs in:", OUT_DIR)
for out_file in (FEATURES_CSV, CORR_PNG, ACC_PNG, ACC_CSV, CM_PNG, REPORT_TXT):
    print(" -", out_file)




[INFO] train/real: 326 images
[INFO] train/fake: 153 images
[INFO] sample/fake: 5 images
[INFO] test/fake: 389 images
[INFO] test/real: 110 images
[INFO] Total rows: 983
[SAVED] Features CSV -> C:\Users\sagni\Downloads\DeepFakeShield\deepfakeshield_features.csv
[SAVED] Feature correlation heatmap -> C:\Users\sagni\Downloads\DeepFakeShield\deepfakeshield_feature_corr_heatmap.png
[SAVED] Accuracy graph -> C:\Users\sagni\Downloads\DeepFakeShield\deepfakeshield_accuracy_over_epochs.png
[SAVED] Accuracy CSV -> C:\Users\sagni\Downloads\DeepFakeShield\deepfakeshield_accuracy_over_epochs.csv
[SAVED] Confusion matrix heatmap -> C:\Users\sagni\Downloads\DeepFakeShield\deepfakeshield_confusion_heatmap.png
[SAVED] Classification report -> C:\Users\sagni\Downloads\DeepFakeShield\deepfakeshield_classification_report.txt

[DONE] Outputs in: C:\Users\sagni\Downloads\DeepFakeShield
 - C:\Users\sagni\Downloads\DeepFakeShield\deepfakeshield_features.csv
 - C:\Users\sagni\Downloads\DeepFakeShield\deepfake