In [None]:

import os
import csv
import cv2
import numpy as np
import random
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
from sklearn.metrics import f1_score
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras.models import load_model

# === CONFIG ===
# Kaggle input locations: the validation image root (scanned per identity
# sub-folder by load_all_images) and the saved Keras embedding model.
val_dir = r"/kaggle/input/sunnycomsys-new/Comys_Hackathon5/Task_B/val"
model_path = "/kaggle/input/finalh5/keras/default/1/taskb_siamese_embedding.h5"
# Size passed to cv2.resize when each image is preprocessed in get_embedding.
image_size = (100, 100)
# NOTE(review): embedding_dim is not referenced by any cell visible here —
# confirm whether it is still needed.
embedding_dim = 128  # change only if your model returns something else

# Destination of the per-pair distance table written by evaluate_threshold_and_save.
csv_path = "/kaggle/working/distances_output.csv"

# === Load model ===
# The loaded model is stored in the global `model`, which get_embedding reads.
print("🔄 Loading model...")
model = load_model(model_path)
print("✅ Model loaded.")


2025-07-03 12:25:56.383308: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751545556.640380      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751545556.713641      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


🔄 Loading model...


2025-07-03 12:26:12.140046: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


✅ Model loaded.


# 1 -> Compute the embeddings
# 2 -> Save the image names and pair distances to a CSV file
# 3 -> Find the threshold with the best score using a for loop

In [2]:
def get_embedding(image_path):
    """Return the model's embedding for a single image file.

    The image is read as grayscale, resized to the global ``image_size``,
    scaled by 1/255 and passed to the global ``model`` as a batch of one.

    Args:
        image_path: Path to an image file readable by OpenCV.

    Returns:
        The first (and only) row of ``model.predict`` for this image.

    Raises:
        ValueError: If OpenCV cannot read or decode ``image_path``.
    """
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising,
        # which would otherwise surface as an opaque error inside cv2.resize.
        raise ValueError(f"Could not read or decode image: {image_path}")
    img = cv2.resize(img, image_size)
    img = img / 255.0
    # cv2.resize takes (width, height) but yields an array of shape
    # (height, width); derive the shape from image_size instead of hard-coding.
    img = img.reshape((image_size[1], image_size[0], 1))
    img = np.expand_dims(img, axis=0)
    return model.predict(img, verbose=0)[0]

# === Collect images
def load_all_images(val_dir):
    """Map each identity folder under ``val_dir`` to its image paths.

    For every sub-directory of ``val_dir`` the images directly inside it are
    collected, followed by the images inside its optional ``distortion``
    sub-directory. Identities with fewer than two images are dropped.

    Directory listings are sorted so that the returned mapping (and anything
    sampled from it later) does not depend on the arbitrary order in which
    ``os.listdir`` returns entries.

    Args:
        val_dir: Root directory containing one sub-directory per identity.

    Returns:
        dict mapping identity folder name -> list of image file paths.
    """
    image_exts = ('.jpg', '.jpeg', '.png')

    def _list_images(folder):
        # Image paths directly inside `folder`, in sorted name order.
        return [
            os.path.join(folder, name)
            for name in sorted(os.listdir(folder))
            if name.lower().endswith(image_exts)
        ]

    identity_to_images = {}
    for identity in sorted(os.listdir(val_dir)):
        person_path = os.path.join(val_dir, identity)
        if not os.path.isdir(person_path):
            continue

        imgs = _list_images(person_path)

        dist_path = os.path.join(person_path, "distortion")
        # isdir (not exists): a plain file named "distortion" must not be listed.
        if os.path.isdir(dist_path):
            imgs.extend(_list_images(dist_path))

        if len(imgs) >= 2:
            identity_to_images[identity] = imgs
    return identity_to_images

# === Embedding all
def compute_all_embeddings(identity_to_images):
    """Embed every image of every identity.

    Args:
        identity_to_images: dict mapping identity -> list of image paths.

    Returns:
        A pair ``(img_to_id, img_to_emb)``: the first maps each image path to
        its identity, the second maps each image path to the value returned
        by ``get_embedding`` for that path.
    """
    print("🔄 Computing embeddings...")
    img_to_id, img_to_emb = {}, {}
    # One progress-bar tick per identity, not per image.
    for person, paths in tqdm(identity_to_images.items()):
        img_to_id.update(dict.fromkeys(paths, person))
        img_to_emb.update({path: get_embedding(path) for path in paths})
    return img_to_id, img_to_emb

# === Pair sampling
def generate_all_pairs(img_to_id, seed=None):
    """Build labelled image pairs for threshold evaluation.

    Matching pairs (label 1) are every unordered pair of images that share an
    identity. Non-matching pairs (label 0) contain one randomly chosen image
    from each side of every unordered pair of distinct identities.

    Args:
        img_to_id: dict mapping image path -> identity.
        seed: Optional seed for the negative-pair sampling. When given, a
            private ``random.Random(seed)`` is used so the result is
            reproducible; when ``None`` the global ``random`` module state is
            used, as before.

    Returns:
        List of ``(img1, img2, label)`` tuples: all matching pairs followed by
        all non-matching pairs.
    """
    from itertools import combinations

    # Group image paths by identity, preserving first-seen order.
    id_to_imgs = {}
    for img_path, identity in img_to_id.items():
        id_to_imgs.setdefault(identity, []).append(img_path)

    rng = random if seed is None else random.Random(seed)

    # === Matching pairs: all (i, j) where i < j for each identity
    matching_pairs = [
        (first, second, 1)
        for img_list in id_to_imgs.values()
        for first, second in combinations(img_list, 2)
    ]

    # === Different pairs: one image from each different identity pair
    different_pairs = [
        (rng.choice(id_to_imgs[id1]), rng.choice(id_to_imgs[id2]), 0)
        for id1, id2 in combinations(id_to_imgs, 2)
    ]

    print(f"✅ Matching pairs: {len(matching_pairs)}")
    print(f"✅ Non-matching pairs: {len(different_pairs)}")
    return matching_pairs + different_pairs


# === Evaluate and Save CSV
def evaluate_threshold_and_save(pairs, img_to_emb, csv_path):
    """Write per-pair distances to a CSV and pick the F1-optimal threshold.

    For every pair the Euclidean distance between the two embeddings is
    computed. A pair is predicted as matching (1) when its distance is
    strictly below the threshold. 200 evenly spaced thresholds between the
    smallest and largest observed distance are tried.

    Args:
        pairs: Iterable of ``(img1, img2, label)`` tuples.
        img_to_emb: dict mapping image path -> embedding array.
        csv_path: Destination file; overwritten if it exists.

    Returns:
        The threshold with the highest F1 score (the first one on ties), or
        ``0`` if no threshold achieves an F1 above zero.

    Raises:
        ValueError: If ``pairs`` is empty; nothing is written in that case.
    """
    distances = []
    labels = []
    rows = []

    print("🔍 Calculating distances and saving to CSV...")
    for img1, img2, label in tqdm(pairs):
        dist = np.linalg.norm(img_to_emb[img1] - img_to_emb[img2])
        distances.append(dist)
        labels.append(label)
        # NOTE(review): only basenames are stored, so files with the same name
        # under different identity folders are indistinguishable in the CSV.
        rows.append([os.path.basename(img1), os.path.basename(img2), label, dist])

    if not distances:
        # Fail before touching the file rather than leaving a header-only CSV
        # and crashing later on min() of an empty list.
        raise ValueError("No pairs to evaluate: `pairs` is empty.")

    # Save to CSV
    with open(csv_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["image1", "image2", "label", "distance"])
        writer.writerows(rows)
    print(f"📁 CSV saved to: {csv_path}")

    # Find best threshold
    best_f1 = 0
    best_threshold = 0
    thresholds = np.linspace(min(distances), max(distances), 200)
    # Compare in float64 on both sides so the vectorised test gives the same
    # result as comparing the individual scalars one by one.
    dist_arr = np.asarray(distances, dtype=np.float64)
    for t in thresholds:
        preds = (dist_arr < np.float64(t)).astype(int)
        f1 = f1_score(labels, preds)
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = t

    print(f"\n✅ Best threshold: {best_threshold:.5f} with F1 Score: {best_f1:.4f}")
    return best_threshold

In [3]:
# End-to-end run: collect images, embed them, build pairs, then search the
# distance threshold and write the per-pair CSV.
identity_to_images = load_all_images(val_dir)   #load images 
img_to_id, img_to_emb = compute_all_embeddings(identity_to_images)  #calculate embeddings
# Non-matching pairs are drawn with random.choice; seed the generator so the
# sampled pairs (and therefore the reported threshold) can be reproduced.
random.seed(42)
pairs = generate_all_pairs(img_to_id)   #generating pairs
best_threshold = evaluate_threshold_and_save(pairs, img_to_emb, csv_path)

🔄 Computing embeddings...


100%|██████████| 250/250 [06:17<00:00,  1.51s/it]


✅ Matching pairs: 51432
✅ Non-matching pairs: 31125
🔍 Calculating distances and saving to CSV...


100%|██████████| 82557/82557 [00:00<00:00, 152099.32it/s]


📁 CSV saved to: /kaggle/working/distances_output.csv

✅ Best threshold: 95.99980 with F1 Score: 0.7712


# Finding the best threshold manually

In [6]:
import pandas as pd
csv_path = "/kaggle/working/distances_output.csv"
# Values tried by hand earlier: 71.9999, 95.99980, 64.3333

# Load the CSV
df = pd.read_csv(csv_path)

# Accuracy-based search over integer thresholds. A pair is predicted as
# matching (1) when its distance is strictly below the threshold.
best_correct = 0   # highest number of correct predictions seen so far
best_threshold = 0
score = 0.0        # accuracy achieved at best_threshold
for threshold in range(50,100):   #as observed in the csv, threshold must be between 50-100
    predictions = (df['distance'] < threshold).astype(int)
    correct = int((predictions == df['label']).sum())
    accuracy = correct / len(df) if len(df) else 0.0
    if correct > best_correct:
        best_correct = correct
        score = accuracy
        best_threshold = threshold
    # Report this threshold's own result, not the running best.
    print(f"Threshold: {threshold}, Score: {accuracy:.4f}, total correct: {correct} out of {len(df)}")


print(f"Best threshold found: {best_threshold} with score: {score:.4f} and total correct= {best_correct} out of {len(df)}")
# Persist the predictions made with the winning threshold. This overwrites the
# input CSV in place, adding the 'prediction' and 'score' columns.
df['prediction'] = (df['distance'] < best_threshold).astype(int)
df['score'] = (df['label'] == df['prediction']).astype(int)
df.to_csv(csv_path, index=False)

print("✅ Updated CSV saved with 'prediction' column at:")
print(csv_path)

Threshold: 50, Score: 0.5350, total correct: 44171 out of 82557
Threshold: 51, Score: 0.5408, total correct: 44646 out of 82557
Threshold: 52, Score: 0.5475, total correct: 45198 out of 82557
Threshold: 53, Score: 0.5542, total correct: 45752 out of 82557
Threshold: 54, Score: 0.5609, total correct: 46307 out of 82557
Threshold: 55, Score: 0.5663, total correct: 46755 out of 82557
Threshold: 56, Score: 0.5723, total correct: 47245 out of 82557
Threshold: 57, Score: 0.5775, total correct: 47680 out of 82557
Threshold: 58, Score: 0.5828, total correct: 48112 out of 82557
Threshold: 59, Score: 0.5878, total correct: 48524 out of 82557
Threshold: 60, Score: 0.5921, total correct: 48884 out of 82557
Threshold: 61, Score: 0.5967, total correct: 49263 out of 82557
Threshold: 62, Score: 0.6017, total correct: 49671 out of 82557
Threshold: 63, Score: 0.6075, total correct: 50151 out of 82557
Threshold: 64, Score: 0.6119, total correct: 50518 out of 82557
Threshold: 65, Score: 0.6159, total corr

# best threshold found = 82