In [1]:
import os
import json
import yaml
import pickle
import random
import datetime
from pathlib import Path
from typing import List
import itertools

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report

import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.applications.efficientnet import preprocess_input
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

# ----------------------------------
# CONFIG (Windows raw-string paths)
# ----------------------------------
# Every class folder sits directly inside this dataset directory.
_DATASET_DIR = (
    r"C:\Users\sagni\Downloads\Docu Verify"
    r"\Forged Handwritten Document Database"
    r"\Forged Handwritten Document Database"
    r"\Handwritten Forged Document Dataset 2023"
)
# Folder names double as class labels; the listing order is kept as-is.
_CLASS_FOLDERS = (
    "Normal",
    "Insertion+Blur",
    "Insertion+Noise",
    "Noise",
    "CopyPaste+Insertion",
    "CopyPaste+Noise",
    "Insertion",
    "Copy Paste",
    "CopyPaste+Blur",
    "Blur",
)
# Plain string concatenation keeps the Windows separators exactly as written,
# regardless of the platform the script is parsed on.
DATA_DIRS: List[str] = [_DATASET_DIR + "\\" + folder for folder in _CLASS_FOLDERS]
# Parent directory for flow_from_directory:
DATA_ROOT = str(Path(DATA_DIRS[0]).parent)

OUTPUT_DIR = r"C:\Users\sagni\Downloads\Docu Verify"


def _artifact(filename: str) -> str:
    """Return the path of *filename* inside OUTPUT_DIR as a plain string."""
    return str(Path(OUTPUT_DIR) / filename)


# Core artifacts
MODEL_H5 = _artifact("model.h5")
CLASS_PKL = _artifact("class_indices.pkl")
RUN_YAML = _artifact("run_config.yaml")
METRICS_JSON = _artifact("metrics.json")
VAL_PRED_JSON = _artifact("val_predictions.json")
# Plot/report artifacts
ACC_COMBINED_PNG = _artifact("accuracy_loss.png")
ACC_ONLY_PNG = _artifact("accuracy_loss_acc.png")
LOSS_ONLY_PNG = _artifact("accuracy_loss_loss.png")
CM_PNG = _artifact("confusion_matrix.png")
CR_CSV = _artifact("classification_report.csv")
CM_CSV = _artifact("confusion_matrix.csv")

# Model / training params
# IMG_SIZE is used both as the generators' target_size and as the spatial part
# of the network's input_shape.
IMG_SIZE   = (256, 256)   # a bit larger than 224 suits documents
BATCH_SIZE = 16           # batch_size of both flow_from_directory iterators
EPOCHS     = 15           # upper bound passed to model.fit; EarlyStopping may end sooner
VAL_SPLIT  = 0.2          # validation_split fraction given to ImageDataGenerator
SEED       = 42           # seeds random / NumPy / TensorFlow and the directory iterators
LR         = 1e-3         # initial Adam learning rate; ReduceLROnPlateau may lower it
AUGMENT    = True         # False -> the training generator applies no augmentation

# ----------------------------------
# Reproducibility
# ----------------------------------
def set_seed(s=SEED):
    """Seed Python's `random`, NumPy and TensorFlow with the same value *s*."""
    # Same call order as before: stdlib first, then NumPy, then TensorFlow.
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(s)


set_seed()

# ----------------------------------
# Prep & sanity checks
# ----------------------------------
def _require_dir(path: str, missing_message: str) -> None:
    """Raise unless *path* exists and is a directory.

    Raises:
        FileNotFoundError: *path* does not exist (uses *missing_message*).
        NotADirectoryError: *path* exists but is not a directory.
    """
    target = Path(path)
    if not target.exists():
        raise FileNotFoundError(missing_message)
    if not target.is_dir():
        raise NotADirectoryError(f"Expected a directory: {path}")


# Create the artifact directory up front so later writes do not fail on a
# missing folder.
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

_require_dir(DATA_ROOT, f"DATA_ROOT not found: {DATA_ROOT}")
for p in DATA_DIRS:
    _require_dir(p, f"Class folder missing: {p}")
    # DATA_ROOT is derived from the first entry only. A class folder that lives
    # under a different parent would pass the existence check but then be
    # looked up as DATA_ROOT/<name> by flow_from_directory, so reject it here.
    if Path(p).parent != Path(DATA_ROOT):
        raise ValueError(f"Class folder is not directly under DATA_ROOT: {p}")

expected_classes = sorted(Path(p).name for p in DATA_DIRS)
# Class labels are the folder names, so they must be unique.
if len(set(expected_classes)) != len(expected_classes):
    raise ValueError(f"Duplicate class folder names: {expected_classes}")

print("[INFO] Data root:", DATA_ROOT)
print("[INFO] Classes:", expected_classes)

# ----------------------------------
# Data pipeline
# ----------------------------------
# Options every generator shares: the EfficientNet preprocess_input hook and
# the train/validation split fraction.
_common_gen_kwargs = dict(
    preprocessing_function=preprocess_input,
    validation_split=VAL_SPLIT,
)

# Augmentation options are only added for the training generator, and only
# when AUGMENT is switched on.
_augment_kwargs = {}
if AUGMENT:
    _augment_kwargs = dict(
        rotation_range=3,
        width_shift_range=0.03,
        height_shift_range=0.03,
        zoom_range=0.05,
        brightness_range=(0.9, 1.1),
        fill_mode="nearest",
    )

train_gen = ImageDataGenerator(**_common_gen_kwargs, **_augment_kwargs)
val_gen = ImageDataGenerator(**_common_gen_kwargs)

# Arguments common to both directory iterators; passing `classes` explicitly
# fixes the class order.
_flow_kwargs = dict(
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    classes=expected_classes,
    class_mode="categorical",
    seed=SEED,
)

train_flow = train_gen.flow_from_directory(
    DATA_ROOT,
    shuffle=True,
    subset="training",
    **_flow_kwargs,
)
# shuffle=False keeps prediction order aligned with val_flow.classes, which
# the confusion matrix depends on.
val_flow = val_gen.flow_from_directory(
    DATA_ROOT,
    shuffle=False,
    subset="validation",
    **_flow_kwargs,
)

num_classes = len(train_flow.class_indices)
print("[INFO] Class indices:", train_flow.class_indices)

# ----------------------------------
# Model (EfficientNetB0)
# ----------------------------------
# Use the first GPU when TensorFlow reports one, otherwise stay on the CPU.
if tf.config.list_physical_devices("GPU"):
    device = "/GPU:0"
else:
    device = "/CPU:0"

_input_shape = (*IMG_SIZE, 3)

with tf.device(device):
    # ImageNet-pretrained backbone without its classification head.
    base = EfficientNetB0(
        weights="imagenet",
        include_top=False,
        input_shape=_input_shape,
    )
    base.trainable = False  # freeze for initial training

    # Classification head: pooled backbone features -> dropout -> softmax.
    inputs = layers.Input(shape=_input_shape)
    x = base(inputs, training=False)
    x = layers.GlobalAveragePooling2D()(x)
    x = layers.Dropout(0.25)(x)
    outputs = layers.Dense(num_classes, activation="softmax")(x)
    model = models.Model(inputs=inputs, outputs=outputs)

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=LR),
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )
    model.summary()

# ----------------------------------
# Callbacks
# ----------------------------------
# EarlyStopping and ModelCheckpoint both track val_accuracy, so the checkpoint
# on disk and the early-stopping decision refer to the same "best" epoch.
callbacks = [
    EarlyStopping(monitor="val_accuracy", patience=4, restore_best_weights=True),
    ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=2, min_lr=1e-6, verbose=1),
    ModelCheckpoint(MODEL_H5, monitor="val_accuracy", save_best_only=True, verbose=1),
]

# ----------------------------------
# Train
# ----------------------------------
history = model.fit(
    train_flow,
    validation_data=val_flow,
    epochs=EPOCHS,
    callbacks=callbacks,
    verbose=1
)

# ModelCheckpoint has already written the best-val_accuracy model to MODEL_H5.
# Saving the in-memory model unconditionally would overwrite that checkpoint
# with last-epoch weights whenever EarlyStopping did not roll the weights back
# (depending on the Keras version, that rollback may only happen when training
# is actually stopped early). Keep the checkpoint and load it into the
# in-memory model instead, so the evaluation below describes the saved file.
if history.history.get("val_accuracy") and Path(MODEL_H5).exists():
    model.load_weights(MODEL_H5)
else:
    # No validation metric was recorded, so no checkpoint was produced by this
    # run; fall back to saving the current model.
    model.save(MODEL_H5)
print(f"[INFO] Saved model: {MODEL_H5}")

# ----------------------------------
# Save class indices (PKL)
# ----------------------------------
# Persist the class-name -> index mapping produced by the training iterator.
Path(CLASS_PKL).write_bytes(pickle.dumps(train_flow.class_indices))
print(f"[INFO] Saved class map: {CLASS_PKL}")

# ----------------------------------
# Save metrics (JSON)
# ----------------------------------
_hist = history.history

# (output key, history key) pairs for the last-epoch summary, in output order.
_final_keys = (
    ("train_accuracy", "accuracy"),
    ("train_loss", "loss"),
    ("val_accuracy", "val_accuracy"),
    ("val_loss", "val_loss"),
)

metrics_payload = {
    "timestamp": datetime.datetime.now().isoformat(),
    "device": device,
    "epochs_run": len(_hist["loss"]),
    # Values of the last epoch that ran.
    "final": {out_key: float(_hist[src_key][-1]) for out_key, src_key in _final_keys},
    # Full per-epoch curves, cast to plain floats so json can serialise them.
    "history": {name: list(map(float, series)) for name, series in _hist.items()},
}
Path(METRICS_JSON).write_text(json.dumps(metrics_payload, indent=2), encoding="utf-8")
print(f"[INFO] Saved metrics: {METRICS_JSON}")

# ----------------------------------
# Validation predictions (for CM/report)
# ----------------------------------
# Invert the class-name -> index mapping so predicted indices can be named.
idx_to_class = {index: name for name, index in train_flow.class_indices.items()}

val_flow.reset()
probs = model.predict(val_flow, verbose=1)  # (N, C)
y_pred = probs.argmax(axis=1)
y_true = val_flow.classes

# Save quick JSON with predictions (optional, handy): one record per
# validation file with its predicted class and the winning probability.
val_records = [
    {
        "file": rel_path.replace("\\", "/"),
        "pred_class": idx_to_class[int(pred_i)],
        "confidence": float(conf),
    }
    for rel_path, pred_i, conf in zip(val_flow.filenames, y_pred, probs.max(axis=1))
]
with open(VAL_PRED_JSON, "w", encoding="utf-8") as out_file:
    json.dump(val_records, out_file, indent=2)
print(f"[INFO] Saved val predictions: {VAL_PRED_JSON}")

# ----------------------------------
# PLOTS: Accuracy/Loss curves
# ----------------------------------
# Separate Acc
# Per-metric plot settings shared by the stand-alone and the combined figures.
_CURVE_SPECS = {
    "accuracy": {
        "labels": ("Train Acc", "Val Acc"),
        "ylabel": "Accuracy",
        "legend_loc": "lower right",
    },
    "loss": {
        "labels": ("Train Loss", "Val Loss"),
        "ylabel": "Loss",
        "legend_loc": "upper right",
    },
}


def _draw_curves(ax, metric, title):
    """Draw the train and validation series of *metric* from `history` on *ax*."""
    spec = _CURVE_SPECS[metric]
    train_label, val_label = spec["labels"]
    ax.plot(history.history[metric], label=train_label)
    ax.plot(history.history["val_" + metric], label=val_label)
    ax.set_xlabel("Epoch")
    ax.set_ylabel(spec["ylabel"])
    ax.set_title(title)
    ax.legend(loc=spec["legend_loc"])
    ax.grid(alpha=0.25)


# Separate Acc / Separate Loss: one figure per metric.
for metric, title, out_png in (
    ("accuracy", "Training vs Validation Accuracy", ACC_ONLY_PNG),
    ("loss", "Training vs Validation Loss", LOSS_ONLY_PNG),
):
    fig, ax = plt.subplots(figsize=(9, 6))
    _draw_curves(ax, metric, title)
    fig.tight_layout()
    fig.savefig(out_png, dpi=200)
    plt.close(fig)

# Combined: accuracy on top, loss underneath.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8))
_draw_curves(ax1, "accuracy", "Accuracy")
_draw_curves(ax2, "loss", "Loss")
fig.tight_layout()
fig.savefig(ACC_COMBINED_PNG, dpi=200)
plt.close(fig)
print(f"[INFO] Saved accuracy/loss plots: {ACC_COMBINED_PNG} (+ separate acc/loss PNGs)")

# ----------------------------------
# Confusion Matrix + Classification Report
# ----------------------------------
# Class names ordered by their integer index, plus the matching index list.
labels_order = [idx_to_class[i] for i in range(len(idx_to_class))]
label_ids = list(range(len(labels_order)))

# Passing `labels` keeps a row/column for every class, even ones that never
# occur in y_true or y_pred.
cm = confusion_matrix(y_true, y_pred, labels=label_ids)

# Save raw counts matrix
pd.DataFrame(cm, index=labels_order, columns=labels_order).to_csv(CM_CSV, index=True)

# Save classification report CSV.
# `labels` must be given here as well: without it the report infers the label
# set from the data, and a class missing from both y_true and y_pred makes
# `target_names` disagree with the inferred labels.
cr_dict = classification_report(
    y_true,
    y_pred,
    labels=label_ids,
    target_names=labels_order,
    output_dict=True,
    zero_division=0,
)
pd.DataFrame(cr_dict).to_csv(CR_CSV)
print(f"[INFO] Saved classification report CSV: {CR_CSV}")
print(f"[INFO] Saved confusion matrix CSV: {CM_CSV}")

# Normalized CM for heatmap display
# Normalized CM for heatmap display: each row is divided by its own total.
# Rows with no true samples are left at 0 through a masked divide, instead of
# dividing by zero (RuntimeWarning) and cleaning up the NaNs afterwards.
row_totals = cm.sum(axis=1, keepdims=True)
cm_norm = np.divide(
    cm,
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)

# Heatmap with counts + %
fig = plt.figure(figsize=(10, 8))
ax = plt.gca()
im = ax.imshow(cm_norm, interpolation="nearest", cmap="viridis")
plt.title("Confusion Matrix (Normalized)")
cbar = plt.colorbar(im, fraction=0.046, pad=0.04)
cbar.ax.set_ylabel("Proportion", rotation=90)

tick_marks = np.arange(len(labels_order))
plt.xticks(tick_marks, labels_order, rotation=45, ha="right")
plt.yticks(tick_marks, labels_order)

# Cells above half of the largest proportion get white text, the rest black.
thresh = cm_norm.max() / 2.0
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    count = cm[i, j]
    perc = cm_norm[i, j] * 100.0
    txt = f"{count}\n{perc:.1f}%"
    ax.text(j, i, txt,
            ha="center", va="center",
            color="white" if cm_norm[i, j] > thresh else "black",
            fontsize=9)

plt.ylabel("True label")
plt.xlabel("Predicted label")
plt.tight_layout()
plt.savefig(CM_PNG, dpi=220)
plt.close(fig)
print(f"[INFO] Saved confusion matrix heatmap: {CM_PNG}")

# ----------------------------------
# Save run config (YAML)
# ----------------------------------
# Each section of the run description is built on its own and then assembled;
# key order is preserved in the YAML because sort_keys=False is used below.
_run_section = {
    "timestamp": datetime.datetime.now().isoformat(),
    "seed": SEED,
    "device": device,
}
_data_section = {
    "data_root": DATA_ROOT,
    "class_dirs": DATA_DIRS,
    "classes": expected_classes,
    "val_split": VAL_SPLIT,
    "image_size": list(IMG_SIZE),
    "batch_size": BATCH_SIZE,
    "augment": AUGMENT,
}
_model_section = {
    "architecture": "EfficientNetB0",
    "transfer_learning": True,
    "frozen_base": True,
    "optimizer": "Adam",
    "learning_rate": LR,
    "epochs": EPOCHS,
    "num_classes": num_classes,
}
_artifact_section = {
    "model_h5": MODEL_H5,
    "class_indices_pkl": CLASS_PKL,
    "metrics_json": METRICS_JSON,
    "val_predictions_json": VAL_PRED_JSON,
    "accuracy_loss_png": ACC_COMBINED_PNG,
    "accuracy_only_png": ACC_ONLY_PNG,
    "loss_only_png": LOSS_ONLY_PNG,
    "confusion_matrix_png": CM_PNG,
    "classification_report_csv": CR_CSV,
    "confusion_matrix_csv": CM_CSV,
}

run_cfg = {
    "run": _run_section,
    "data": _data_section,
    "model": _model_section,
    "artifacts": _artifact_section,
}
with open(RUN_YAML, "w", encoding="utf-8") as cfg_file:
    yaml.safe_dump(run_cfg, cfg_file, sort_keys=False, allow_unicode=True)
print(f"[INFO] Saved run config: {RUN_YAML}")

print("\n[DONE] All artifacts saved to:", OUTPUT_DIR)


[INFO] Data root: C:\Users\sagni\Downloads\Docu Verify\Forged Handwritten Document Database\Forged Handwritten Document Database\Handwritten Forged Document Dataset 2023
[INFO] Classes: ['Blur', 'Copy Paste', 'CopyPaste+Blur', 'CopyPaste+Insertion', 'CopyPaste+Noise', 'Insertion', 'Insertion+Blur', 'Insertion+Noise', 'Noise', 'Normal']
Found 400 images belonging to 10 classes.
Found 100 images belonging to 10 classes.
[INFO] Class indices: {'Blur': 0, 'Copy Paste': 1, 'CopyPaste+Blur': 2, 'CopyPaste+Insertion': 3, 'CopyPaste+Noise': 4, 'Insertion': 5, 'Insertion+Blur': 6, 'Insertion+Noise': 7, 'Noise': 8, 'Normal': 9}


  self._warn_if_super_not_called()


Epoch 1/15
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 593ms/step - accuracy: 0.1073 - loss: 2.3699
Epoch 1: val_accuracy improved from -inf to 0.10000, saving model to C:\Users\sagni\Downloads\Docu Verify\model.h5




[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 912ms/step - accuracy: 0.1067 - loss: 2.3714 - val_accuracy: 0.1000 - val_loss: 2.3524 - learning_rate: 0.0010
Epoch 2/15
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 573ms/step - accuracy: 0.1034 - loss: 2.3682
Epoch 2: val_accuracy improved from 0.10000 to 0.11000, saving model to C:\Users\sagni\Downloads\Docu Verify\model.h5




[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 691ms/step - accuracy: 0.1036 - loss: 2.3683 - val_accuracy: 0.1100 - val_loss: 2.3473 - learning_rate: 0.0010
Epoch 3/15
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 593ms/step - accuracy: 0.0906 - loss: 2.3625
Epoch 3: val_accuracy did not improve from 0.11000
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 696ms/step - accuracy: 0.0904 - loss: 2.3633 - val_accuracy: 0.1000 - val_loss: 2.3378 - learning_rate: 0.0010
Epoch 4/15
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 583ms/step - accuracy: 0.0888 - loss: 2.3627
Epoch 4: val_accuracy did not improve from 0.11000
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 688ms/step - accuracy: 0.0888 - loss: 2.3626 - val_accuracy: 0.1000 - val_loss: 2.3156 - learning_rate: 0.0010
Epoch 5/15
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 556ms/step - accuracy: 0.1165 - loss: 2.3457
Epoch 5: v



[INFO] Saved model: C:\Users\sagni\Downloads\Docu Verify\model.h5
[INFO] Saved class map: C:\Users\sagni\Downloads\Docu Verify\class_indices.pkl
[INFO] Saved metrics: C:\Users\sagni\Downloads\Docu Verify\metrics.json
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 804ms/step
[INFO] Saved val predictions: C:\Users\sagni\Downloads\Docu Verify\val_predictions.json
[INFO] Saved accuracy/loss plots: C:\Users\sagni\Downloads\Docu Verify\accuracy_loss.png (+ separate acc/loss PNGs)
[INFO] Saved classification report CSV: C:\Users\sagni\Downloads\Docu Verify\classification_report.csv
[INFO] Saved confusion matrix CSV: C:\Users\sagni\Downloads\Docu Verify\confusion_matrix.csv
[INFO] Saved confusion matrix heatmap: C:\Users\sagni\Downloads\Docu Verify\confusion_matrix.png
[INFO] Saved run config: C:\Users\sagni\Downloads\Docu Verify\run_config.yaml

[DONE] All artifacts saved to: C:\Users\sagni\Downloads\Docu Verify
