In [2]:
import re
from pathlib import Path
import pandas as pd

# 1) Dataset root; the two sub-folders scanned by this notebook hang off it
DATA_ROOT = Path(r"W:\SRH study\Case Study 2\Offline Signature Verification\Datasets\signatures")
ORG_DIR, FORG_DIR = DATA_ROOT / "full_org", DATA_ROOT / "full_forg"

# Report up front whether both folders are reachable from this machine
print("ORG exists:", ORG_DIR.exists(), ORG_DIR)
print("FORG exists:", FORG_DIR.exists(), FORG_DIR)

# 2) Expected filename layouts: a fixed prefix followed by two integer groups
#    and a .png extension, matched case-insensitively
PAT_ORG = re.compile(r"^original_(\d+)_(\d+)\.png$", flags=re.IGNORECASE)
PAT_FORG = re.compile(r"^forgeries_(\d+)_(\d+)\.png$", flags=re.IGNORECASE)

# Accumulator that scan() appends one record per PNG file to
rows = list()

def scan(folder: Path, label: str, pat: re.Pattern, out=None):
    """
    Record one row per PNG file found directly inside `folder`.

    Args:
      folder: directory to list (not recursive).
      label:  value stored in each row's "label" field.
      pat:    compiled regex with two integer groups; group 1 is stored as
              "writer_id" and group 2 as "sample_id".
      out:    list to append rows to. Defaults to the notebook-level `rows`
              list, which is what the original two-step usage relies on.

    Each row is a dict with keys writer_id, sample_id, label, path, filename_ok.
    Files whose name does not match `pat` are still recorded, with
    writer_id/sample_id set to None and filename_ok=False.
    """
    if out is None:
        out = rows  # notebook-level accumulator

    # Path.iterdir() yields entries in arbitrary, OS-dependent order. Sorting by
    # name keeps the row order (and any seeded sampling built on top of it)
    # reproducible across machines and file systems.
    for fp in sorted(folder.iterdir(), key=lambda entry: entry.name):
        if not fp.is_file():
            continue
        if fp.suffix.lower() != ".png":     # handles .PNG as well
            continue

        m = pat.match(fp.name)
        out.append({
            "writer_id": int(m.group(1)) if m else None,
            "sample_id": int(m.group(2)) if m else None,
            "label": label,
            "path": str(fp),
            # False means: file exists but naming doesn't match expected pattern
            "filename_ok": m is not None,
        })

# Collect one row per PNG from both folders into the shared `rows` list
scan(ORG_DIR,  "genuine", PAT_ORG)
scan(FORG_DIR, "forgery", PAT_FORG)

df = pd.DataFrame(rows)

print("\nTotal rows:", len(df))
print("Bad filenames:", (~df["filename_ok"]).sum())
print("\nLabel counts:")
print(df["label"].value_counts())

# 3) This is the DataFrame you need for pair generation
# Rows with an unparsable filename carry None ids, which makes pandas store the
# id columns as float in `df`. Casting after filtering guarantees integer ids in
# `valid` regardless of whether any bad filenames were present.
valid = df[df["filename_ok"]].astype({"writer_id": "int64", "sample_id": "int64"})

# Optional: quick check
print("\nvalid columns:", list(valid.columns))
display(valid.head())


ORG exists: True W:\SRH study\Case Study 2\Offline Signature Verification\Datasets\signatures\full_org
FORG exists: True W:\SRH study\Case Study 2\Offline Signature Verification\Datasets\signatures\full_forg

Total rows: 2640
Bad filenames: 0

Label counts:
label
genuine    1320
forgery    1320
Name: count, dtype: int64

valid columns: ['writer_id', 'sample_id', 'label', 'path', 'filename_ok']


Unnamed: 0,writer_id,sample_id,label,path,filename_ok
0,10,1,genuine,W:\SRH study\Case Study 2\Offline Signature Ve...,True
1,10,10,genuine,W:\SRH study\Case Study 2\Offline Signature Ve...,True
2,10,11,genuine,W:\SRH study\Case Study 2\Offline Signature Ve...,True
3,10,12,genuine,W:\SRH study\Case Study 2\Offline Signature Ve...,True
4,10,13,genuine,W:\SRH study\Case Study 2\Offline Signature Ve...,True


In [3]:
# Writer-independent split (train/val/test)
import numpy as np
import pandas as pd

def split_writers(valid_df, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15, seed=42):
    """
    Partition the distinct writer ids of `valid_df` into train/val/test sets.

    The unique "writer_id" values are sorted, shuffled with a generator seeded
    by `seed`, and cut at round(n * train_ratio) and round(n * val_ratio);
    whatever is left after the first two cuts becomes the test set.

    Returns:
      (train_writers, val_writers, test_writers) as three sets.

    Raises:
      AssertionError if the three ratios do not sum to 1.
    """
    ratio_sum = train_ratio + val_ratio + test_ratio
    assert abs(ratio_sum - 1.0) < 1e-9

    # Sort first so the shuffle result depends only on the ids and the seed
    shuffled = np.array(sorted(valid_df["writer_id"].unique()))
    np.random.default_rng(seed).shuffle(shuffled)

    total = len(shuffled)
    train_end = int(round(total * train_ratio))
    val_end = train_end + int(round(total * val_ratio))

    train_part = shuffled[:train_end]
    val_part = shuffled[train_end:val_end]
    test_part = shuffled[val_end:]  # remainder
    return set(train_part), set(val_part), set(test_part)

# Assign every writer id in `valid` to exactly one of the three splits
train_writers, val_writers, test_writers = split_writers(valid_df=valid, seed=42)

print("Writers:", len(valid["writer_id"].unique()))
print("Train writers:", len(train_writers))
print("Val writers:", len(val_writers))
print("Test writers:", len(test_writers))

# sanity: every pairwise intersection should be empty
print("Overlap train-val:", len(train_writers.intersection(val_writers)))
print("Overlap train-test:", len(train_writers.intersection(test_writers)))
print("Overlap val-test:", len(val_writers.intersection(test_writers)))


Writers: 55
Train writers: 38
Val writers: 8
Test writers: 9
Overlap train-val: 0
Overlap train-test: 0
Overlap val-test: 0


In [4]:
# Helper: build per-writer pools (genuine / forgery lists)
def build_pools(df_subset):
    """
    Group image paths by writer, separately for each label.

    Returns:
      genuine_by_writer: {writer_id: [paths...]}
      forgery_by_writer: {writer_id: [paths...]}

    A writer only appears in a dict when it has at least one path with the
    corresponding label.
    """
    pools = {"genuine": {}, "forgery": {}}

    for wid, writer_rows in df_subset.groupby("writer_id"):
        for kind, pool in pools.items():
            paths = writer_rows.loc[writer_rows["label"] == kind, "path"].tolist()
            if paths:
                pool[wid] = paths

    return pools["genuine"], pools["forgery"]


In [5]:
# Generate balanced pairs
def generate_pairs_for_writers(valid_df, writer_set, n_pairs=20000, seed=42, neg_mix=0.5):
    """
    Sample a label-balanced table of signature pairs restricted to `writer_set`.

    Returns a shuffled DataFrame with columns:
      path_a, path_b, label, pair_type, writer_a, writer_b

    Half of the pairs (n_pairs // 2) are positives (label 1): two different
    genuine samples of the same writer. The rest are negatives (label 0).

    neg_mix: fraction of negatives that are same-writer (genuine vs forgery)
             remaining negatives are cross-writer (genuine vs genuine from different writers)

    Every pair is drawn independently, so the same pair can occur more than once.

    Raises:
      ValueError if neg_mix is outside [0, 1], if fewer than 2 writers have
      genuine samples, if positives are requested but no writer has 2+ genuine
      samples, or if same-writer negatives are requested but no writer has both
      genuine and forgery samples.
    """
    if not 0.0 <= neg_mix <= 1.0:
        # Outside this range the per-type counts below would no longer add up to n_pairs
        raise ValueError(f"neg_mix must be within [0, 1], got {neg_mix!r}.")

    df_subset = valid_df[valid_df["writer_id"].isin(writer_set)]
    genuine_by_writer, forgery_by_writer = build_pools(df_subset)

    writers = sorted(genuine_by_writer)
    if len(writers) < 2:
        raise ValueError("Need at least 2 writers with genuine samples to form cross-writer negatives.")

    n_pos = n_pairs // 2
    n_neg = n_pairs - n_pos
    n_neg_same = int(round(n_neg * neg_mix))
    n_neg_cross = n_neg - n_neg_same

    # A positive pair needs two DIFFERENT genuine samples, so writers with a
    # single genuine sample cannot contribute (sampling 2 without replacement
    # from 1 item would raise inside numpy).
    pos_writers = [w for w in writers if len(genuine_by_writer[w]) >= 2]
    if n_pos > 0 and not pos_writers:
        raise ValueError("No writer has at least 2 genuine samples to form positive pairs.")

    # Only require forgeries when same-writer negatives were actually requested
    writers_with_forg = sorted(genuine_by_writer.keys() & forgery_by_writer.keys())
    if n_neg_same > 0 and not writers_with_forg:
        raise ValueError("No writers have both genuine and forgery samples in this split.")

    rng = np.random.default_rng(seed)
    pairs = []

    # -------- Positive pairs: genuine-genuine same writer --------
    for _ in range(n_pos):
        w = rng.choice(pos_writers)
        g_list = genuine_by_writer[w]
        a, b = rng.choice(len(g_list), size=2, replace=False)
        pairs.append({
            "path_a": g_list[a],
            "path_b": g_list[b],
            "label": 1,
            "pair_type": "pos_genuine_genuine",
            "writer_a": w,
            "writer_b": w
        })

    # -------- Negative pairs 1: genuine-forgery same writer --------
    for _ in range(n_neg_same):
        w = rng.choice(writers_with_forg)
        g_list = genuine_by_writer[w]
        f_list = forgery_by_writer[w]
        a = rng.integers(0, len(g_list))
        b = rng.integers(0, len(f_list))
        pairs.append({
            "path_a": g_list[a],
            "path_b": f_list[b],
            "label": 0,
            "pair_type": "neg_genuine_forgery_same_writer",
            "writer_a": w,
            "writer_b": w
        })

    # -------- Negative pairs 2: genuine-genuine different writers --------
    for _ in range(n_neg_cross):
        w1, w2 = rng.choice(writers, size=2, replace=False)
        g1 = genuine_by_writer[w1]
        g2 = genuine_by_writer[w2]
        a = rng.integers(0, len(g1))
        b = rng.integers(0, len(g2))
        pairs.append({
            "path_a": g1[a],
            "path_b": g2[b],
            "label": 0,
            "pair_type": "neg_genuine_genuine_cross_writer",
            "writer_a": w1,
            "writer_b": w2
        })

    # Explicit columns keep the schema stable even when no pairs were requested
    columns = ["path_a", "path_b", "label", "pair_type", "writer_a", "writer_b"]
    pairs_df = pd.DataFrame(pairs, columns=columns)
    # shuffle rows so the three pair types are interleaved
    pairs_df = pairs_df.sample(frac=1.0, random_state=seed).reset_index(drop=True)
    return pairs_df


In [6]:
# Create train/val/test pairs
# One pair table per split, each drawn with its own seed and pair budget
train_pairs, val_pairs, test_pairs = (
    generate_pairs_for_writers(valid, split_writer_ids, n_pairs=pair_budget, seed=split_seed)
    for split_writer_ids, pair_budget, split_seed in (
        (train_writers, 40000, 1),
        (val_writers, 10000, 2),
        (test_writers, 10000, 3),
    )
)

print("Train pairs:", len(train_pairs))
print("Val pairs:", len(val_pairs))
print("Test pairs:", len(test_pairs))

# Share of each label among the training pairs
print("\nTrain label balance:")
display(
    train_pairs["label"]
    .value_counts(normalize=True)
    .rename("fraction")
)

# Absolute count of each pair type among the training pairs
print("\nTrain pair types:")
display(train_pairs["pair_type"].value_counts())


Train pairs: 40000
Val pairs: 10000
Test pairs: 10000

Train label balance:


label
1    0.5
0    0.5
Name: fraction, dtype: float64


Train pair types:


pair_type
pos_genuine_genuine                 20000
neg_genuine_genuine_cross_writer    10000
neg_genuine_forgery_same_writer     10000
Name: count, dtype: int64

In [7]:
# sanity checks
# 1) Are positives truly same-writer?
# Positives must have the same writer on both sides
pos_bad = train_pairs.loc[
    train_pairs["label"].eq(1)
    & train_pairs["writer_a"].ne(train_pairs["writer_b"])
]
print("Positive pairs with different writers (should be 0):", len(pos_bad))

# 2) Are cross-writer negatives truly different-writer?
cross_bad = train_pairs.loc[
    train_pairs["pair_type"].eq("neg_genuine_genuine_cross_writer")
    & train_pairs["writer_a"].eq(train_pairs["writer_b"])
]
print("Cross-writer negatives with same writer (should be 0):", len(cross_bad))

# 3) Ensure writer sets are disjoint (already checked earlier)


Positive pairs with different writers (should be 0): 0
Cross-writer negatives with same writer (should be 0): 0


In [10]:
# check whether writer-independent split is truly disjoint
# Writers that actually occur (on either side of a pair) in each pair table
train_ws = set(train_pairs["writer_a"]) | set(train_pairs["writer_b"])
val_ws = set(val_pairs["writer_a"]) | set(val_pairs["writer_b"])
test_ws = set(test_pairs["writer_a"]) | set(test_pairs["writer_b"])

# Each count is the number of writers shared between two splits
print("Train-Val overlap writers:", len(train_ws.intersection(val_ws)))
print("Train-Test overlap writers:", len(train_ws.intersection(test_ws)))
print("Val-Test overlap writers:", len(val_ws.intersection(test_ws)))


Train-Val overlap writers: 0
Train-Test overlap writers: 0
Val-Test overlap writers: 0


In [9]:
# Preprocessing function
import tensorflow as tf
import cv2
import numpy as np

IMG_SIZE = (224, 224)  # (height, width); you can change to 128x128 if training is slow

def load_preprocess(path):
    """
    Read one image from disk and return it as a float32 array of shape (H, W, 1).

    Args:
      path: object exposing `.numpy()` that returns UTF-8 encoded bytes
            (this is what tf.py_function passes for a string tensor).

    Returns:
      Grayscale image resized to IMG_SIZE (height, width), scaled to [0, 1],
      with a trailing channel axis.

    Raises:
      FileNotFoundError if OpenCV cannot read the file.
    """
    path = path.numpy().decode("utf-8")
    img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than inside cv2.resize.
        raise FileNotFoundError(f"cv2.imread could not read image: {path!r}")

    # cv2.resize expects dsize as (width, height), while IMG_SIZE is (height, width)
    height, width = IMG_SIZE
    img = cv2.resize(img, (width, height))
    img = img.astype(np.float32) / 255.0
    img = np.expand_dims(img, axis=-1)  # (H,W,1)
    return img

def tf_load_preprocess(path):
    """
    Graph-compatible wrapper around load_preprocess.

    Runs the Python loader through tf.py_function and then pins the static
    shape to (IMG_SIZE[0], IMG_SIZE[1], 1), since py_function outputs carry no
    shape information on their own.
    """
    tensor = tf.py_function(func=load_preprocess, inp=[path], Tout=tf.float32)
    static_shape = [IMG_SIZE[0], IMG_SIZE[1], 1]
    tensor.set_shape(static_shape)
    return tensor


In [11]:
def make_pair_dataset(pairs_df, batch_size=32, shuffle=True, shuffle_buffer=None):
    """
    Build a batched tf.data pipeline yielding ((img_a, img_b), label).

    Args:
      pairs_df:       DataFrame with "path_a", "path_b" and "label" columns.
      batch_size:     number of pairs per batch.
      shuffle:        whether to reshuffle the pairs on every iteration.
      shuffle_buffer: shuffle buffer size; defaults to len(pairs_df), i.e. a
                      full uniform shuffle.

    Returns:
      A tf.data.Dataset of ((img_a, img_b), label) batches, where the images
      come from tf_load_preprocess and the labels are float32.
    """
    a_paths = pairs_df["path_a"].astype(str).to_numpy()
    b_paths = pairs_df["path_b"].astype(str).to_numpy()
    labels = pairs_df["label"].astype(np.float32).to_numpy()

    ds = tf.data.Dataset.from_tensor_slices((a_paths, b_paths, labels))

    if shuffle:
        # Shuffle BEFORE decoding: the buffer then holds only path strings and a
        # label, so it can span the whole dataset cheaply. Shuffling after the
        # map would keep every buffered pair as two decoded float32 images.
        if shuffle_buffer is None:
            shuffle_buffer = len(pairs_df)
        ds = ds.shuffle(max(1, int(shuffle_buffer)), reshuffle_each_iteration=True)

    def map_fn(a, b, y):
        img_a = tf_load_preprocess(a)
        img_b = tf_load_preprocess(b)
        return (img_a, img_b), y

    ds = ds.map(map_fn, num_parallel_calls=tf.data.AUTOTUNE)

    ds = ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return ds

# Training data is reshuffled each epoch; validation keeps a fixed order
train_ds = make_pair_dataset(pairs_df=train_pairs, batch_size=32, shuffle=True)
val_ds = make_pair_dataset(pairs_df=val_pairs, batch_size=32, shuffle=False)

# sanity: pull a single batch and show the shapes of both image tensors and the labels
batch_imgs, batch_y = next(iter(train_ds))
print(*(image_batch.shape for image_batch in batch_imgs), batch_y.shape)


(32, 224, 224, 1) (32, 224, 224, 1) (32,)
