In [9]:
# Create "valid" (metadata table)
import re
from pathlib import Path
import numpy as np
import pandas as pd

DATA_ROOT = Path(r"W:\SRH study\Case Study 2\Offline Signature Verification\Datasets\signatures")
ORG_DIR  = DATA_ROOT / "full_org"
FORG_DIR = DATA_ROOT / "full_forg"

PAT_ORG  = re.compile(r"^original_(\d+)_(\d+)\.png$", re.IGNORECASE)
PAT_FORG = re.compile(r"^forgeries_(\d+)_(\d+)\.png$", re.IGNORECASE)

rows = []

def scan(folder: Path, label: str, pat: re.Pattern):
    for fp in folder.iterdir():
        if not fp.is_file():
            continue
        if fp.suffix.lower() != ".png":
            continue

        m = pat.match(fp.name)
        if not m:
            rows.append({"writer_id": None, "sample_id": None, "label": label, "path": str(fp), "filename_ok": False})
            continue

        rows.append({"writer_id": int(m.group(1)), "sample_id": int(m.group(2)), "label": label, "path": str(fp), "filename_ok": True})

scan(ORG_DIR,  "genuine", PAT_ORG)
scan(FORG_DIR, "forgery", PAT_FORG)

df = pd.DataFrame(rows)
valid = df[df["filename_ok"]].copy()

print("Total images:", len(valid))
print(valid["label"].value_counts())
print("Unique writers:", valid["writer_id"].nunique())
display(valid.head())


Total images: 2640
label
genuine    1320
forgery    1320
Name: count, dtype: int64
Unique writers: 55


Unnamed: 0,writer_id,sample_id,label,path,filename_ok
0,10,1,genuine,W:\SRH study\Case Study 2\Offline Signature Ve...,True
1,10,10,genuine,W:\SRH study\Case Study 2\Offline Signature Ve...,True
2,10,11,genuine,W:\SRH study\Case Study 2\Offline Signature Ve...,True
3,10,12,genuine,W:\SRH study\Case Study 2\Offline Signature Ve...,True
4,10,13,genuine,W:\SRH study\Case Study 2\Offline Signature Ve...,True


In [10]:
# writer independant and air generation
def split_writers(valid_df, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15, seed=42):
    writers = np.array(sorted(valid_df["writer_id"].unique()))
    rng = np.random.default_rng(seed)
    rng.shuffle(writers)

    n = len(writers)
    n_train = int(round(n * train_ratio))
    n_val   = int(round(n * val_ratio))
    train_w = writers[:n_train]
    val_w   = writers[n_train:n_train+n_val]
    test_w  = writers[n_train+n_val:]
    return set(train_w), set(val_w), set(test_w)

def build_pools(df_subset):
    genuine_by_writer = {}
    forgery_by_writer = {}
    for wid, group in df_subset.groupby("writer_id"):
        g = group[group["label"] == "genuine"]["path"].tolist()
        f = group[group["label"] == "forgery"]["path"].tolist()
        if g: genuine_by_writer[wid] = g
        if f: forgery_by_writer[wid] = f
    return genuine_by_writer, forgery_by_writer

def generate_pairs_for_writers(valid_df, writer_set, n_pairs=20000, seed=42, neg_mix=0.5):
    df_subset = valid_df[valid_df["writer_id"].isin(writer_set)].copy()
    genuine_by_writer, forgery_by_writer = build_pools(df_subset)

    writers = sorted(genuine_by_writer.keys())
    rng = np.random.default_rng(seed)

    n_pos = n_pairs // 2
    n_neg = n_pairs - n_pos
    n_neg_same  = int(round(n_neg * neg_mix))
    n_neg_cross = n_neg - n_neg_same

    writers_with_forg = sorted(set(genuine_by_writer) & set(forgery_by_writer))
    if len(writers) < 2:
        raise ValueError("Need >=2 writers for cross-writer negatives.")
    if len(writers_with_forg) == 0:
        raise ValueError("Need writers with both genuine and forgery samples.")

    pairs = []

    # positives: genuine-genuine same writer
    for _ in range(n_pos):
        w = rng.choice(writers)
        g = genuine_by_writer[w]
        a, b = rng.choice(len(g), size=2, replace=False)
        pairs.append({"path_a": g[a], "path_b": g[b], "label": 1, "pair_type": "pos", "writer_a": w, "writer_b": w})

    # negatives: genuine-forgery same writer
    for _ in range(n_neg_same):
        w = rng.choice(writers_with_forg)
        g = genuine_by_writer[w]
        f = forgery_by_writer[w]
        a = rng.integers(0, len(g))
        b = rng.integers(0, len(f))
        pairs.append({"path_a": g[a], "path_b": f[b], "label": 0, "pair_type": "neg_same_writer", "writer_a": w, "writer_b": w})

    # negatives: genuine-genuine cross writer
    for _ in range(n_neg_cross):
        w1, w2 = rng.choice(writers, size=2, replace=False)
        g1 = genuine_by_writer[w1]
        g2 = genuine_by_writer[w2]
        a = rng.integers(0, len(g1))
        b = rng.integers(0, len(g2))
        pairs.append({"path_a": g1[a], "path_b": g2[b], "label": 0, "pair_type": "neg_cross_writer", "writer_a": w1, "writer_b": w2})

    pairs_df = pd.DataFrame(pairs).sample(frac=1.0, random_state=seed).reset_index(drop=True)
    return pairs_df

train_writers, val_writers, test_writers = split_writers(valid, seed=42)

train_pairs = generate_pairs_for_writers(valid, train_writers, n_pairs=40000, seed=1)
val_pairs   = generate_pairs_for_writers(valid, val_writers,   n_pairs=10000, seed=2)
test_pairs  = generate_pairs_for_writers(valid, test_writers,  n_pairs=10000, seed=3)

print("Train pairs:", len(train_pairs), "| Val pairs:", len(val_pairs), "| Test pairs:", len(test_pairs))
print("Train label balance:\n", train_pairs["label"].value_counts(normalize=True))


Train pairs: 40000 | Val pairs: 10000 | Test pairs: 10000
Train label balance:
 label
1    0.5
0    0.5
Name: proportion, dtype: float64


In [11]:
# sanity check
pos_bad = train_pairs[(train_pairs["label"] == 1) & (train_pairs["writer_a"] != train_pairs["writer_b"])]
cross_bad = train_pairs[(train_pairs["pair_type"] == "neg_cross_writer") & (train_pairs["writer_a"] == train_pairs["writer_b"])]
print("Positive pairs wrong:", len(pos_bad))
print("Cross-writer negatives wrong:", len(cross_bad))


Positive pairs wrong: 0
Cross-writer negatives wrong: 0


In [12]:
# shared CNN embedding network
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

IMG_SIZE = (224, 224)
EMB_DIM  = 128

def build_embedding_network(input_shape=(224,224,1), emb_dim=128):
    inp = keras.Input(shape=input_shape)

    x = layers.Conv2D(32, 3, activation="relu")(inp)
    x = layers.MaxPooling2D()(x)

    x = layers.Conv2D(64, 3, activation="relu")(x)
    x = layers.MaxPooling2D()(x)

    x = layers.Conv2D(128, 3, activation="relu")(x)
    x = layers.MaxPooling2D()(x)

    x = layers.Conv2D(256, 3, activation="relu")(x)
    x = layers.GlobalAveragePooling2D()(x)

    x = layers.Dense(256, activation="relu")(x)
    x = layers.Dropout(0.2)(x)

    emb = layers.Dense(emb_dim)(x)
    emb = layers.Lambda(lambda t: tf.nn.l2_normalize(t, axis=1), name="l2_norm")(emb)

    return keras.Model(inp, emb, name="embedding_net")

embedding_net = build_embedding_network(input_shape=(IMG_SIZE[0], IMG_SIZE[1], 1), emb_dim=EMB_DIM)
embedding_net.summary()



In [13]:
# Siamese model that outputs a distance
class EuclideanDistance(layers.Layer):
    def call(self, inputs):
        emb_a, emb_b = inputs
        return tf.sqrt(tf.reduce_sum(tf.square(emb_a - emb_b), axis=1, keepdims=True) + 1e-9)

def build_siamese_model(embedding_model, input_shape=(224,224,1)):
    a = keras.Input(shape=input_shape, name="img_a")
    b = keras.Input(shape=input_shape, name="img_b")

    emb_a = embedding_model(a)
    emb_b = embedding_model(b)

    dist = EuclideanDistance(name="euclidean_distance")([emb_a, emb_b])

    return keras.Model(inputs=[a, b], outputs=dist, name="siamese_network")

siamese_model = build_siamese_model(embedding_net, input_shape=(IMG_SIZE[0], IMG_SIZE[1], 1))
siamese_model.summary()


In [14]:
# Contrastive loss (the standard Siamese loss)
class ContrastiveLoss(keras.losses.Loss):
    def __init__(self, margin=1.0, name="contrastive_loss"):
        super().__init__(name=name)
        self.margin = margin

    def call(self, y_true, y_pred):
        y_true = tf.cast(y_true, tf.float32)
        y_pred = tf.cast(y_pred, tf.float32)

        # y_true=1 => positive pairs: minimize distance^2
        pos = y_true * tf.square(y_pred)

        # y_true=0 => negative pairs: minimize max(margin - distance, 0)^2
        neg = (1.0 - y_true) * tf.square(tf.maximum(self.margin - y_pred, 0.0))

        return tf.reduce_mean(pos + neg)

loss_fn = ContrastiveLoss(margin=1.0)


In [15]:
import tensorflow as tf
import cv2
import numpy as np

IMG_SIZE = (224, 224)

def load_preprocess(path):
    path = path.numpy().decode("utf-8")
    img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    img = cv2.resize(img, IMG_SIZE)
    img = img.astype(np.float32) / 255.0
    img = np.expand_dims(img, axis=-1)
    return img

def tf_load_preprocess(path):
    img = tf.py_function(load_preprocess, [path], Tout=tf.float32)
    img.set_shape([IMG_SIZE[0], IMG_SIZE[1], 1])
    return img

def make_pair_dataset(pairs_df, batch_size=32, shuffle=True):
    a_paths = pairs_df["path_a"].astype(str).values
    b_paths = pairs_df["path_b"].astype(str).values
    labels  = pairs_df["label"].astype(np.float32).values

    ds = tf.data.Dataset.from_tensor_slices((a_paths, b_paths, labels))

    def map_fn(a, b, y):
        img_a = tf_load_preprocess(a)
        img_b = tf_load_preprocess(b)
        return (img_a, img_b), y

    ds = ds.map(map_fn, num_parallel_calls=tf.data.AUTOTUNE)

    if shuffle:
        ds = ds.shuffle(2000)

    ds = ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return ds

train_ds = make_pair_dataset(train_pairs, batch_size=32, shuffle=True)
val_ds   = make_pair_dataset(val_pairs, batch_size=32, shuffle=False)

# Sanity check
(batch_imgs, batch_y) = next(iter(train_ds))
print(batch_imgs[0].shape, batch_imgs[1].shape, batch_y.shape)


(32, 224, 224, 1) (32, 224, 224, 1) (32,)


In [16]:
# compile and train
@tf.function
def mean_pos_dist(y_true, y_pred):
    y_true = tf.cast(y_true, tf.float32)
    mask = tf.where(tf.equal(y_true, 1.0), 1.0, 0.0)
    num = tf.reduce_sum(mask * tf.squeeze(y_pred))
    den = tf.reduce_sum(mask) + 1e-9
    return num / den

@tf.function
def mean_neg_dist(y_true, y_pred):
    y_true = tf.cast(y_true, tf.float32)
    mask = tf.where(tf.equal(y_true, 0.0), 1.0, 0.0)
    num = tf.reduce_sum(mask * tf.squeeze(y_pred))
    den = tf.reduce_sum(mask) + 1e-9
    return num / den

siamese_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss=loss_fn,
    metrics=[mean_pos_dist, mean_neg_dist]
)

callbacks = [
    keras.callbacks.EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True),
    keras.callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=2, min_lr=1e-6)
]

history = siamese_model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=15,
    callbacks=callbacks
)


Epoch 1/15
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m929s[0m 740ms/step - loss: 0.2541 - mean_neg_dist: 0.4910 - mean_pos_dist: 0.4918 - val_loss: 0.5000 - val_mean_neg_dist: 5.7844e-05 - val_mean_pos_dist: 3.7340e-05 - learning_rate: 0.0010
Epoch 2/15
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m915s[0m 730ms/step - loss: 0.2536 - mean_neg_dist: 0.4927 - mean_pos_dist: 0.4932 - val_loss: 0.5000 - val_mean_neg_dist: 4.2331e-05 - val_mean_pos_dist: 3.3560e-05 - learning_rate: 0.0010
Epoch 3/15
[1m 252/1250[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m11:29[0m 691ms/step - loss: 0.2534 - mean_neg_dist: 0.4926 - mean_pos_dist: 0.4908

KeyboardInterrupt: 

In [None]:
# sanity check after training
print("Final train metrics:", {k: v[-1] for k, v in history.history.items() if isinstance(v, list)})
