## Signature Verification using Siamese Networks

#### work flow
input - 2 images and a label  
get embeddings from a basic model (EfficientNet overfits)  
calculate similarity - how close these 2 vectors are in space  
The label (1 or 0) supervises this learning:
If two images are labeled as similar (1) but embeddings are far → loss is high → update weights.
If labeled as different (0) but embeddings are close → loss is high → update weights.  
higher similarity -> Genuine, lower -> forged

In [1]:
import tensorflow as tf
import os

2025-07-03 05:13:26.376727: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751519606.576695      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751519606.636844      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Root folder of the training split of the signature dataset.
path = '/kaggle/input/signature-verification-dataset/sign_data/train'
# Every entry directly under the training root (no filtering is applied here).
ls = os.listdir(path)
# Sanity check: report how many entries the training root contains.
print('training data directories',len(ls))

training data directories 128


#### Preprocess & Pair the signatures  
input format: ( (img1, img2), label)  
label = 1: Two genuine signatures from the same person.  
label = 0: One genuine and one forged signature from the same person.  

In [3]:
import random
from PIL import Image
import numpy as np

def load_signature_pairs(data_dir):
    """Build labelled image-path pairs for training/evaluating the Siamese model.

    ``data_dir`` is expected to contain one folder per user (a name without
    an underscore) holding genuine signatures, and optionally a sibling
    folder named ``<user>_forg`` holding forgeries of that user.

    For every user:
      * each genuine image is paired with a randomly chosen *different*
        genuine image of the same user (label ``1``);
      * the i-th genuine image is paired with the i-th forged image
        (label ``0``), up to the shorter of the two lists.

    Args:
        data_dir: Path of the split directory to scan.

    Returns:
        A tuple ``(pairs, labels)`` where ``pairs`` is a list of
        ``(img1_path, img2_path)`` tuples and ``labels`` is the parallel
        list of ``1`` / ``0`` integers.

    Raises:
        OSError: If ``data_dir`` itself cannot be listed.
    """
    pairs = []
    labels = []

    # User folders are the entries without an underscore; stray files are
    # skipped so they cannot break the directory listing below.
    users = sorted(
        name for name in os.listdir(data_dir)
        if '_' not in name and os.path.isdir(os.path.join(data_dir, name))
    )

    for user in users:
        genuine_dir = os.path.join(data_dir, user)
        forg_dir = genuine_dir + '_forg'

        # os.listdir returns entries in arbitrary order; sorting keeps the
        # genuine/forgery index pairing reproducible across runs and machines.
        genuine_imgs = sorted(os.listdir(genuine_dir))
        # A user without a forgery folder simply contributes no negative pairs.
        forgery_imgs = sorted(os.listdir(forg_dir)) if os.path.isdir(forg_dir) else []

        num_genuine = len(genuine_imgs)

        # Random positive pairs (genuine vs different genuine).
        # With fewer than two genuine images there is no "different" partner,
        # so no positive pair can be formed for this user.
        if num_genuine > 1:
            for i, name in enumerate(genuine_imgs):
                # Draw uniformly from the num_genuine - 1 other indices:
                # sample in [0, num_genuine - 1) and step over index i.
                j = random.randrange(num_genuine - 1)
                if j >= i:
                    j += 1
                pairs.append((
                    os.path.join(genuine_dir, name),
                    os.path.join(genuine_dir, genuine_imgs[j]),
                ))
                labels.append(1)

        # Negative pairs (genuine vs forgery); zip stops at the shorter list.
        for genuine_name, forged_name in zip(genuine_imgs, forgery_imgs):
            pairs.append((
                os.path.join(genuine_dir, genuine_name),
                os.path.join(forg_dir, forged_name),
            ))
            labels.append(0)

    return pairs, labels


In [4]:
# Build the (img1, img2) path pairs and their 1/0 labels from the training split.
path = '/kaggle/input/signature-verification-dataset/sign_data/train'
pairs, labels = load_signature_pairs(path)
# Both lists are filled in lockstep by the loader, so the two counts should match.
print(len(pairs), len(labels))

1606 1606


### backbone
calculate embeddings for each image to compare similarity (how close they are in space)  
Input: 224×224 grayscale image  
Output: 256-dimensional embedding for input image.

In [5]:
from tensorflow.keras import layers, Model
# from tensorflow.keras.applications import EfficientNetB0

IMG_SIZE = 224

def build_backbone():
    """Build the small CNN that maps one signature image to an embedding.

    The input shape is derived from the module-level ``IMG_SIZE`` so the
    backbone stays in sync with the input layers created by
    ``build_siamese_network``.

    Returns:
        A Keras ``Model`` named ``"SimpleCNNBackbone"`` taking a
        ``(IMG_SIZE, IMG_SIZE, 1)`` image and producing a 256-dimensional
        embedding.
    """
    inputs = tf.keras.Input(shape=(IMG_SIZE, IMG_SIZE, 1))
    x = layers.Conv2D(64, 3, activation='relu')(inputs)  # basic feature extraction
    x = layers.MaxPooling2D()(x)  # reduces spatial size
    x = layers.Conv2D(128, 3, activation='relu')(x)  # deeper pattern detection
    x = layers.GlobalAveragePooling2D()(x)  # collapses spatial dims into one vector per image
    x = layers.Dense(256, activation='relu')(x)  # embedding layer
    return Model(inputs, x, name="SimpleCNNBackbone")

Feature Extraction:
Pass both images through the backbone (shared weights) to get embeddings.  
Similarity Calculation:
Concatenate the two embeddings and pass them through a Dense layer that learns the comparison (the code below uses concatenation, not an L1-distance Lambda layer).  
Output Layer:
sigmoid activation gives a similarity score (between 0 and 1).  


### siamese neural network

In [6]:
import tensorflow as tf
from tensorflow.keras import layers, Model

def build_siamese_network(backbone):
    """Assemble the two-input Siamese classifier around a shared backbone.

    Args:
        backbone: Model applied to each of the two input images. The same
            instance is called on both inputs, so both branches use the
            same weights.

    Returns:
        A Keras ``Model`` taking ``[image_a, image_b]`` (each of shape
        ``(IMG_SIZE, IMG_SIZE, 1)``) and producing one sigmoid score.
    """
    image_shape = (IMG_SIZE, IMG_SIZE, 1)
    first_input = layers.Input(image_shape)
    second_input = layers.Input(image_shape)

    # One backbone instance, called twice: both branches embed with shared weights.
    first_embedding = backbone(first_input)
    second_embedding = backbone(second_input)

    # Plain concatenation (no Lambda layer) followed by a small dense head.
    joined = layers.Concatenate()([first_embedding, second_embedding])
    hidden = layers.Dense(128, activation="relu")(joined)
    score = layers.Dense(1, activation="sigmoid")(hidden)

    return Model([first_input, second_input], score)

# Build the shared embedding network, then wrap it into the two-input Siamese model.
backbone = build_backbone()
siamese_model = build_siamese_network(backbone)
# Binary cross-entropy matches the single sigmoid output and the 1/0 pair labels.
siamese_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
siamese_model.summary()

I0000 00:00:1751519622.331884      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


input image -> grayscale, [224, 224] px, normalized

In [7]:
def preprocess_image(path):
    """Load one PNG signature image as a normalized grayscale tensor.

    The target size comes from the module-level ``IMG_SIZE`` so that the
    preprocessing always matches the input shape the model was built with.

    Args:
        path: Path of a PNG file (a Python string or a scalar string tensor).

    Returns:
        A float32 tensor of shape ``(IMG_SIZE, IMG_SIZE, 1)`` with values
        scaled by 1/255.
    """
    image = tf.io.read_file(path)
    image = tf.image.decode_png(image, channels=1)  # grayscale
    # The default (bilinear) resize already returns float32, so no extra cast is needed.
    image = tf.image.resize(image, [IMG_SIZE, IMG_SIZE])
    return image / 255.0  # shape: (IMG_SIZE, IMG_SIZE, 1)
    
def make_tf_dataset(pairs, labels, batch_size=32, shuffle=True):
    """Turn path pairs and labels into a batched ``tf.data`` pipeline.

    Args:
        pairs: Sequence of ``(img1_path, img2_path)`` pairs.
        labels: Sequence of labels parallel to ``pairs``.
        batch_size: Number of pairs per batch.
        shuffle: When true, the pairs are reshuffled on every pass over the
            dataset. When false, batches follow the order of ``pairs``
            (previously this flag was accepted but ignored, so the data was
            always shuffled).

    Returns:
        A ``tf.data.Dataset`` yielding ``((img1_batch, img2_batch), label_batch)``.
    """
    path_ds = tf.data.Dataset.from_tensor_slices((pairs, labels))

    def load_images(pair, label):
        img1 = preprocess_image(pair[0])
        img2 = preprocess_image(pair[1])
        return (img1, img2), label

    if shuffle:
        # Shuffle the cheap path strings rather than decoded images, with a
        # buffer covering the whole dataset so the shuffle is uniform (the
        # pairs arrive grouped by user and by label).
        path_ds = path_ds.shuffle(max(len(pairs), 1))

    # num_parallel_calls=tf.data.AUTOTUNE lets TensorFlow load multiple images in parallel = faster.
    dataset = path_ds.map(load_images, num_parallel_calls=tf.data.AUTOTUNE)
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    
# Training pipeline built from the training pairs loaded earlier.
train_dataset = make_tf_dataset(pairs, labels, batch_size=32)
# Number of batches per epoch.
print(len(train_dataset))

51


In [8]:
# Train for 10 epochs on the training pairs; no validation data is passed here.
siamese_model.fit(train_dataset, epochs=10)

Epoch 1/10


I0000 00:00:1751519629.440104      59 service.cc:148] XLA service 0x7a3d40085b60 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1751519629.440924      59 service.cc:156]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1751519629.829018      59 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m 1/51[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m8:21[0m 10s/step - accuracy: 0.5000 - loss: 0.6932

I0000 00:00:1751519633.610618      59 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 109ms/step - accuracy: 0.5430 - loss: 0.6895
Epoch 2/10
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 67ms/step - accuracy: 0.5558 - loss: 0.6831
Epoch 3/10
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 68ms/step - accuracy: 0.7800 - loss: 0.5252
Epoch 4/10
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 69ms/step - accuracy: 0.8827 - loss: 0.3093
Epoch 5/10
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 68ms/step - accuracy: 0.9131 - loss: 0.2600
Epoch 6/10
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 68ms/step - accuracy: 0.9074 - loss: 0.2298
Epoch 7/10
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 67ms/step - accuracy: 0.9453 - loss: 0.1786
Epoch 8/10
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 67ms/step - accuracy: 0.9355 - loss: 0.1733
Epoch 9/10
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

<keras.src.callbacks.history.History at 0x7a3e1c10e210>

In [9]:
# Load test data
# NOTE: this rebinds the globals `path`, `pairs` and `labels`, which held the
# training split until now; later cells that read them see the test split.
path = '/kaggle/input/signature-verification-dataset/sign_data/test'
pairs, labels = load_signature_pairs(path)

print("Total samples:", len(pairs))

test_dataset = make_tf_dataset(pairs, labels, batch_size=32)
# Number of batches in the test pipeline.
print("Batches:", tf.data.experimental.cardinality(test_dataset).numpy())

# Evaluate model
# evaluate() returns the loss followed by the compiled metrics (accuracy only).
loss, accuracy = siamese_model.evaluate(test_dataset)
print(f"\nTest Accuracy: {accuracy:.4f}")

Total samples: 476
Batches: 15
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 101ms/step - accuracy: 0.9528 - loss: 0.1231

Test Accuracy: 0.9601


In [10]:
# Sample images for a manual spot check: one file from the test `049` folder
# and one from its `049_forg` counterpart.
img_1 = '/kaggle/input/signature-verification-dataset/sign_data/test/049/01_049.png'
img_1_forg = '/kaggle/input/signature-verification-dataset/sign_data/test/049_forg/01_0114049.PNG'
def predict_similarity(model, img_path1, img_path2, threshold=0.5):
    """Score one pair of signature images and print the verdict.

    Args:
        model: Two-input model whose ``predict`` returns one sigmoid score
            per pair.
        img_path1: Path of the first image.
        img_path2: Path of the second image.
        threshold: Scores greater than or equal to this value are reported
            as "Genuine", lower scores as "Forged". Defaults to 0.5, the
            previously hard-coded cut-off.

    Returns:
        None. The score and the verdict are printed.
    """
    img1 = preprocess_image(img_path1)
    img2 = preprocess_image(img_path2)

    # Add batch dimension: (H, W, 1) -> (1, H, W, 1)
    img1 = tf.expand_dims(img1, axis=0)
    img2 = tf.expand_dims(img2, axis=0)

    prediction = model.predict([img1, img2])[0][0]  # sigmoid output

    print(f"Similarity Score: {prediction:.4f}")
    if prediction >= threshold:
        print("Genuine")
    else:
        print("Forged ")

# Spot check: score the `049` sample against the sample from `049_forg`.
predict_similarity(siamese_model, img_1, img_1_forg)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 498ms/step
Similarity Score: 0.0305
Forged 


#### save the model and some signatures to test

In [11]:
import zipfile

# Map custom names to original paths
# For each of the three test users there is a reference image (`img_N`), a
# second file from the same user folder (`img_N_gen`) and one file from the
# matching `_forg` folder (`img_N_forg`).
files_to_download = {
    'img_1.png': '/kaggle/input/signature-verification-dataset/sign_data/test/049/01_049.png',
    'img_1_gen.png': '/kaggle/input/signature-verification-dataset/sign_data/test/049/05_049.png',
    'img_1_forg.png': '/kaggle/input/signature-verification-dataset/sign_data/test/049_forg/01_0114049.PNG',
    'img_2.png': '/kaggle/input/signature-verification-dataset/sign_data/test/050/01_050.png',
    'img_2_gen.png': '/kaggle/input/signature-verification-dataset/sign_data/test/050/05_050.png',
    'img_2_forg.png': '/kaggle/input/signature-verification-dataset/sign_data/test/050_forg/01_0125050.PNG',
    'img_3.png': '/kaggle/input/signature-verification-dataset/sign_data/test/063/01_063.png',
    'img_3_gen.png': '/kaggle/input/signature-verification-dataset/sign_data/test/063/04_063.png',
    'img_3_forg.png': '/kaggle/input/signature-verification-dataset/sign_data/test/063_forg/01_0104063.PNG'
}

# Create a zip file with custom names
# `arcname` stores each file under its short custom name instead of its full path.
with zipfile.ZipFile('/kaggle/working/signatures.zip', 'w') as zipf:
    for custom_name, file_path in files_to_download.items():
        zipf.write(file_path, arcname=custom_name)


In [12]:
# Earlier save targets, kept for reference:
# siamese_model.save("siamese_signature.keras")
# siamese_model.save('siamese_signature_noNorm.h5')
# Save the full model to a relative path (resolved against the current working directory).
siamese_model.save("siamese_no_lambda.h5")
# Also save the weights alone, to a separate file under /kaggle/working.
siamese_model.save_weights("/kaggle/working/siamese.weights.h5")

In [13]:
import numpy as np

# NOTE(review): `path` is the global left behind by the evaluation cell (the
# test split). Running this cell in a different order would silently score
# another split. The threshold is also chosen on the same data used to report
# test accuracy; a held-out validation split would be safer.
pairs_test, labels_test = load_signature_pairs(path)
ds = make_tf_dataset(pairs_test, labels_test, batch_size=128, shuffle=False)

# Gather all scores and labels
# Scores and labels are collected from the same batch, so they stay aligned
# regardless of the order in which the dataset yields its batches.
all_scores = []
all_labels = []
for (x1, x2), y in ds:
    # predict_on_batch scores one already-formed batch directly, avoiding the
    # per-call setup and progress bar of Model.predict inside a loop.
    scores = siamese_model.predict_on_batch([x1, x2]).flatten()
    all_scores.append(scores)
    all_labels.append(y.numpy())
all_scores = np.concatenate(all_scores)
all_labels = np.concatenate(all_labels)

# Compute means
genuine_scores = all_scores[all_labels == 1]
forged_scores  = all_scores[all_labels == 0]
print("Genuine: mean=%.3f, std=%.3f" % (genuine_scores.mean(), genuine_scores.std()))
print("Forged : mean=%.3f, std=%.3f" % (forged_scores.mean(), forged_scores.std()))

# Pick threshold in between the means
threshold = (genuine_scores.mean() + forged_scores.mean()) / 2
print("Recommended threshold:", threshold)


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 121ms/step
Genuine: mean=0.984, std=0.002
Forged : mean=0.124, std=0.221
Recommended threshold: 0.5543854236602783
