In [1]:
# Make sure GPU is on: Runtime > Change runtime type > T4 or L4 GPU
!nvidia-smi

# Minimal, stable deps (no albumentations)
!pip -q install facenet-pytorch==2.5.3 torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu121
!pip -q install opencv-python pandas scikit-learn tqdm matplotlib

Mon Sep 15 19:23:53 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   48C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
from google.colab import drive
# Attach Google Drive so the dataset and the saved outputs live under /content/drive.
drive.mount('/content/drive')

# TODO: adjust these two paths to your own Drive layout.
# DATA_ROOT is the folder holding the per-person image subfolders (105 expected);
# CSV_PATH is the Excel sheet with the per-person metadata.
DATA_ROOT = "/content/drive/MyDrive/data"
CSV_PATH = "/content/drive/MyDrive/data/dataset.xlsx"

# Destination for saved outputs (prototypes, threshold, lookup tables).
ARTIFACTS_DIR = "/content/drive/MyDrive/aiweek_artifacts"

import os

# exist_ok=True keeps this cell safe to re-run.
os.makedirs(ARTIFACTS_DIR, exist_ok=True)


Mounted at /content/drive


In [3]:
import os

# Same call as in the Drive-mount cell; exist_ok=True makes repeating it harmless.
os.makedirs(ARTIFACTS_DIR, exist_ok=True)

In [4]:
import os

# Report whether the configured dataset folder and Excel file are visible from this runtime.
for label, location in (("DATA_ROOT", DATA_ROOT), ("CSV_PATH", CSV_PATH)):
    print(f"{label} exists?", os.path.exists(location))

DATA_ROOT exists? True
CSV_PATH exists? True


In [5]:
!pip -q install openpyxl

In [6]:
import os, pandas as pd

assert os.path.exists(CSV_PATH), f"Excel not found at {CSV_PATH}"

# Expected columns: ID, Name, Status and (optionally) Age.
df = pd.read_excel(CSV_PATH)
# Normalize header spelling. str() guards against non-text headers (for
# example a numeric column label), which have no .strip() method.
df.columns = [str(c).strip().lower() for c in df.columns]

required = {"id","name","status"}
missing = required - set(df.columns)
assert not missing, f"Missing columns in Excel: {missing}. Found: {df.columns}"

# The age column is optional; without it every id maps to None.
has_age = "age" in df.columns

# Build all four lookup maps in a single pass over the sheet.
# Keys and values are stripped strings; status is additionally lowercased.
# When an id or a name occurs more than once, the last row wins.
id2name, name2id, id2status, id2age = {}, {}, {}, {}
for _, r in df.iterrows():
    uid = str(r["id"]).strip()
    person = str(r["name"]).strip()
    id2name[uid] = person
    name2id[person] = uid
    id2status[uid] = str(r["status"]).strip().lower()
    id2age[uid] = int(r["age"]) if has_age and pd.notna(r["age"]) else None

print(f"Rows in Excel: {len(df)}")
print("Sample:", list(id2name.items())[:3])


Rows in Excel: 105
Sample: [('bf723908-8899-11f0-bbe4-0242ac1c000c', 'Adriana Lima'), ('bf723a48-8899-11f0-bbe4-0242ac1c000c', 'Alex Lawther'), ('bf723aca-8899-11f0-bbe4-0242ac1c000c', 'Alexandra Daddario')]


In [7]:
import os, re, glob
from collections import defaultdict

# ---------- helpers ----------
# Folder-name prefixes that are dropped before comparing against Excel names.
PREFIXES = ("pins_", "imgs_", "images_", "photos_", "pics_", "train_", "val_", "set_", "folder_")

def norm_name(s: str) -> str:
    """Return a canonical lowercase form of a folder or person name.

    At most one leading prefix from ``PREFIXES`` is removed (matched
    case-insensitively), underscores and hyphens become spaces, and every run
    of whitespace collapses to a single space with both ends trimmed.
    """
    text = str(s)
    lowered = text.lower()
    # Only the first matching prefix is stripped, never a second one.
    matched = next((p for p in PREFIXES if lowered.startswith(p)), None)
    if matched is not None:
        text = text[len(matched):]
    # Treat "_" and "-" as word separators.
    spaced = text.translate(str.maketrans("_-", "  "))
    return re.sub(r"\s+", " ", spaced).strip().lower()

# Excel maps (MUST already exist from the Excel-loading cell):
# id2name, name2id, id2status, id2age
assert "id2name" in globals() and "name2id" in globals() and "id2status" in globals()

# Build normalized name map from Excel
name2id_norm = {norm_name(name): uid for name, uid in name2id.items()}

# ---------- scan folders ----------
assert os.path.exists(DATA_ROOT), f"DATA_ROOT not found: {DATA_ROOT}"
image_exts = ("*.jpg","*.jpeg","*.png","*.bmp","*.webp")
# Lowercase suffixes derived from the patterns above (".jpg", ".jpeg", ...).
image_suffixes = tuple(pattern.lstrip("*").lower() for pattern in image_exts)

folder_map = {}                           # folder name  -> canonical id
canonical_to_images = defaultdict(list)   # canonical id -> image paths
canonical_meta = {}                       # canonical id -> {"name", "status", "age"}

matched_by_id = matched_by_name = 0
unmatched = []

for folder in sorted(os.listdir(DATA_ROOT)):
    fpath = os.path.join(DATA_ROOT, folder)
    if not os.path.isdir(fpath):
        continue

    # glob.escape keeps folder names that contain glob metacharacters
    # ([, ], *, ?) literal instead of treating them as a pattern. The suffix
    # is compared case-insensitively so files such as "IMG_1.JPG" are not
    # silently dropped on case-sensitive filesystems.
    files = [
        p for p in glob.glob(os.path.join(glob.escape(fpath), "*"))
        if p.lower().endswith(image_suffixes)
    ]
    if not files:
        continue

    # Defaults for folders that match nothing in the Excel sheet.
    name = folder
    status, age = "unknown", None

    if folder in id2name:
        # The folder is named exactly like an Excel ID.
        canonical_id = folder
        matched_by_id += 1
    else:
        # Otherwise try the normalized folder name against normalized Excel names.
        canonical_id = name2id_norm.get(norm_name(folder))
        if canonical_id is not None:
            matched_by_name += 1

    if canonical_id is not None:
        name   = id2name[canonical_id]
        status = id2status.get(canonical_id, "unknown")
        age    = id2age.get(canonical_id, None)
    else:
        # Fallback: the folder name itself becomes the identity key.
        canonical_id = folder
        unmatched.append(folder)

    folder_map[folder] = canonical_id
    canonical_to_images[canonical_id].extend(sorted(files))
    canonical_meta[canonical_id] = {
        "name": name,
        "status": str(status).strip().lower(),
        "age": age
    }


In [8]:
USE_APPROVED_ONLY = True  # set to False to keep every identity regardless of status

# Keep identities that pass the status filter and have at least one image.
selected_ids = []
for cid, meta in canonical_meta.items():
    raw_status = meta.get("status", "unknown")
    status_text = raw_status.strip().lower() if isinstance(raw_status, str) else "unknown"
    # NOTE(review): this is a substring test, so a status such as
    # "not approved" would also pass — confirm the status vocabulary.
    if USE_APPROVED_ONLY and "approved" not in status_text:
        continue
    if len(canonical_to_images[cid]) > 0:
        selected_ids.append(cid)

print(f"✅ Selected identities: {len(selected_ids)} (USE_APPROVED_ONLY={USE_APPROVED_ONLY})")
if USE_APPROVED_ONLY and not selected_ids:
    print("⚠️ Still no approved users. Try USE_APPROVED_ONLY=False to proceed.")


✅ Selected identities: 48 (USE_APPROVED_ONLY=True)


In [9]:
!pip install rapidfuzz

Collecting rapidfuzz
  Downloading rapidfuzz-3.14.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.14.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.7/3.2 MB[0m [31m21.1 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.2/3.2 MB[0m [31m58.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m44.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.14.1


In [10]:
import random

# Fixed seed so the per-identity shuffles below are repeatable for the same input order.
random.seed(42)

train_index, val_index = [], []

for cid in selected_ids:
    shuffled = list(canonical_to_images[cid])  # copy, so the source list keeps its order
    random.shuffle(shuffled)

    if len(shuffled) == 1:
        # A lone image cannot be split; it goes to training.
        train_index.append((cid, shuffled[0]))
        continue

    # 80/20 split; max(1, ...) guarantees at least one training image.
    n_train = max(1, int(0.8 * len(shuffled)))
    train_index.extend((cid, p) for p in shuffled[:n_train])
    val_index.extend((cid, p) for p in shuffled[n_train:])

print("✅ Train images:", len(train_index))
print("✅ Val images:",   len(val_index))
print("Example train sample:", train_index[0] if train_index else "—")


✅ Train images: 5700
✅ Val images: 1445
Example train sample: ('bf723aca-8899-11f0-bbe4-0242ac1c000c', '/content/drive/MyDrive/data/pins_Alexandra Daddario/Alexandra Daddario233_345.jpg')


In [11]:
import torch
from facenet_pytorch import MTCNN, InceptionResnetV1
from PIL import Image

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print("Using device:", device)

# Face detector (MTCNN), configured for 160-pixel output with a 14-pixel margin.
mtcnn = MTCNN(
    image_size=160,
    margin=14,
    post_process=True,
    keep_all=False,
    device=device,
)

# Face embedding model (FaceNet, pretrained on VGGFace2), in eval mode on the chosen device.
embedder = InceptionResnetV1(pretrained='vggface2')
embedder = embedder.eval().to(device)

# Helper function: image -> 512-D embedding
def image_to_embedding(path):
    """Detect the face in an image file and return its L2-normalized embedding.

    Args:
        path: Location of the image file to read.

    Returns:
        ``(embedding, True)`` with the embedding as a CPU tensor when a face
        was found, or ``(None, False)`` when the file could not be opened or
        ``mtcnn`` returned no face.
    """
    try:
        # The context manager closes the underlying file as soon as the RGB
        # copy exists, instead of leaving the handle to the garbage collector
        # while thousands of images are processed in a loop.
        with Image.open(path) as src:
            img = src.convert('RGB')
    except Exception as e:
        # Best-effort: an unreadable file is reported and counted as a miss.
        print(f"⚠️ Could not open image {path}: {e}")
        return None, False

    # Detect and crop face
    face = mtcnn(img)
    if face is None:
        return None, False

    # Generate embedding (no gradients needed for inference)
    with torch.no_grad():
        emb = embedder(face.unsqueeze(0).to(device))
        emb = torch.nn.functional.normalize(emb, dim=1)  # normalize for cosine similarity

    return emb.squeeze(0).cpu(), True


Using device: cuda


  0%|          | 0.00/107M [00:00<?, ?B/s]

In [12]:
from tqdm import tqdm

def batch_embed(pairs):
    """Embed every image in a sequence of (identity, image_path) pairs.

    Returns:
        A tuple ``(embedded, n_missed)``: ``embedded`` is a list of
        ``(identity, image_path, embedding)`` triples for the images that
        produced an embedding, and ``n_missed`` counts the ones that did not.
    """
    embedded = []
    n_missed = 0
    for identity, img_path in tqdm(pairs):
        vector, found = image_to_embedding(img_path)
        if not found:
            n_missed += 1
            continue
        embedded.append((identity, img_path, vector))
    return embedded, n_missed

# Embed both splits; the second value of each pair is the number of skipped images.
train_embs, train_miss = batch_embed(train_index)
val_embs,   val_miss   = batch_embed(val_index)

print("✅ Got embeddings:")
for split_label, split_embs, split_miss in (
    ("Train", train_embs, train_miss),
    ("Val  ", val_embs, val_miss),
):
    print(f"   • {split_label} = {len(split_embs)} (missed {split_miss})")


100%|██████████| 5700/5700 [35:17<00:00,  2.69it/s]
100%|██████████| 1445/1445 [08:57<00:00,  2.69it/s]

✅ Got embeddings:
   • Train = 5680 (missed 20)
   • Val   = 1438 (missed 7)





In [13]:
import torch
from collections import defaultdict

# Collect the training embeddings of each identity.
proto = defaultdict(list)
for identity, _path, vector in train_embs:
    proto[identity].append(vector)

# Replace each list with a single prototype: the mean embedding, L2-normalized.
for identity, vectors in list(proto.items()):
    centroid = torch.stack(vectors, dim=0).mean(dim=0, keepdim=True)
    proto[identity] = torch.nn.functional.normalize(centroid, dim=1).squeeze(0)

print(f"✅ Built prototypes for {len(proto)} identities")


✅ Built prototypes for 48 identities


In [14]:
import random, json, os

def cosine(a, b):
    """Return the cosine similarity of two embedding tensors as a Python float."""
    # A leading batch axis is added because cosine_similarity reduces over dim=1 by default.
    similarity = torch.nn.functional.cosine_similarity(a[None], b[None])
    return similarity.item()

# Genuine scores: every validation embedding against its own identity's prototype.
pos = [cosine(emb, proto[cid]) for cid, _, emb in val_embs if cid in proto]

# Impostor scores: every validation embedding against one randomly chosen other identity.
neg = []
candidate_ids = list(proto.keys())
for cid, _, emb in val_embs:
    others = [x for x in candidate_ids if x != cid]
    if others:
        neg.append(cosine(emb, proto[random.choice(others)]))

# Try every distinct score (rounded to 4 decimals) as a threshold and keep
# the first one that reaches the highest accuracy.
scores = sorted({round(s, 4) for s in pos + neg})
best_t, best_acc = 0.0, 0.0
for t in scores:
    tp = sum(1 for s in pos if s >= t)
    fn = sum(1 for s in pos if s < t)
    tn = sum(1 for s in neg if s < t)
    fp = sum(1 for s in neg if s >= t)
    total = tp + tn + fp + fn
    acc = (tp + tn) / max(1, total)
    if acc > best_acc:
        best_acc, best_t = acc, t

print(f"✅ Threshold selected: {best_t:.4f}")
print(f"   Validation Accuracy: {best_acc*100:.2f}%")
print(f"   Npos={len(pos)} | Nneg={len(neg)}")

# Persist everything inference needs: prototypes, display names, statuses, threshold.
# dict(proto) stores a plain dict rather than the defaultdict.
torch.save(dict(proto), os.path.join(ARTIFACTS_DIR, "prototypes.pt"))

with open(os.path.join(ARTIFACTS_DIR, "id2name.json"), "w") as f:
    json.dump({k: canonical_meta[k]["name"] for k in proto}, f)

with open(os.path.join(ARTIFACTS_DIR, "id2status.json"), "w") as f:
    json.dump({k: canonical_meta[k]["status"] for k in proto}, f)

with open(os.path.join(ARTIFACTS_DIR, "threshold.json"), "w") as f:
    json.dump({"threshold": float(best_t)}, f)

print("✅ Artifacts saved to:", ARTIFACTS_DIR)


✅ Threshold selected: 0.4660
   Validation Accuracy: 99.48%
   Npos=1438 | Nneg=1438
✅ Artifacts saved to: /content/drive/MyDrive/aiweek_artifacts


In [15]:
import os, json, time, torch

def _read_json_artifact(filename):
    """Load one JSON file from ARTIFACTS_DIR, closing the file handle afterwards."""
    with open(os.path.join(ARTIFACTS_DIR, filename)) as fh:
        return json.load(fh)

# Reload the saved artifacts from disk. This needs ARTIFACTS_DIR to be
# defined, and the inference helpers below still rely on the model cell.
# NOTE(security): torch.load is pickle-based; depending on the installed torch
# version's `weights_only` default it can execute code embedded in the file.
# Only load prototype files that this notebook produced.
prototypes = torch.load(os.path.join(ARTIFACTS_DIR, "prototypes.pt"), map_location='cpu')
threshold  = _read_json_artifact("threshold.json")["threshold"]
id2name_art   = _read_json_artifact("id2name.json")
id2status_art = _read_json_artifact("id2status.json")

def cosine(a, b):
    """Cosine similarity between two embedding tensors, returned as a float."""
    lhs = a.unsqueeze(dim=0)
    rhs = b.unsqueeze(dim=0)
    return torch.nn.functional.cosine_similarity(lhs, rhs).item()

def verify_claim(claimed_id: str, image_path: str):
    """Check whether the face in ``image_path`` matches the claimed identity.

    Returns a dict whose ``"status"`` is one of ``"face_not_found"``,
    ``"unknown_user"``, ``"access_granted"`` or ``"access_denied"``. The two
    access outcomes also carry the score, the threshold, the claimed name and
    status, the elapsed time in milliseconds and the model tag.
    """
    started = time.time()

    embedding, face_found = image_to_embedding(image_path)
    if not face_found:
        return {"status":"face_not_found","claimed_id":claimed_id}
    if claimed_id not in prototypes:
        return {"status":"unknown_user","claimed_id":claimed_id}

    score = cosine(embedding, prototypes[claimed_id])
    if score >= threshold:
        decision = "ACCESS_GRANTED"
    else:
        decision = "ACCESS_DENIED"

    report = {
        "status": decision.lower(),
        "score": float(score),
        "threshold": float(threshold),
        "claimed_id": claimed_id,
        "claimed_name": id2name_art.get(claimed_id, claimed_id),
        "claimed_status": id2status_art.get(claimed_id, "unknown"),
        "time_ms": int((time.time() - started) * 1000),
        "model": "facenet_vggface2",
    }
    return report

# --- Smoke test: verify the first validation image against its own identity ---
if not val_embs:
    print("⚠️ No validation images available to test")
else:
    cid, path, _ = val_embs[0]
    result = verify_claim(cid, path)
    print("✅ Smoke test result:\n", result)


✅ Smoke test result:
 {'status': 'access_granted', 'score': 0.7191963195800781, 'threshold': 0.466, 'claimed_id': 'bf723aca-8899-11f0-bbe4-0242ac1c000c', 'claimed_name': 'Alexandra Daddario', 'claimed_status': 'approved', 'time_ms': 47, 'model': 'facenet_vggface2'}


In [16]:
import random

# Take the first validation image together with its true identity.
cid_true, path, _ = val_embs[0]

# Deliberately claim some other enrolled identity for that image.
other_ids = [k for k in prototypes if k != cid_true]
claimed_wrong = random.choice(other_ids)

print("True ID:", cid_true, "| Wrong claim:", claimed_wrong)
neg_result = verify_claim(claimed_wrong, path)
print("NEG example:", neg_result)


True ID: bf723aca-8899-11f0-bbe4-0242ac1c000c | Wrong claim: bf724916-8899-11f0-bbe4-0242ac1c000c
NEG example: {'status': 'access_denied', 'score': -0.23200075328350067, 'threshold': 0.466, 'claimed_id': 'bf724916-8899-11f0-bbe4-0242ac1c000c', 'claimed_name': 'Eliza Taylor', 'claimed_status': 'approved', 'time_ms': 220, 'model': 'facenet_vggface2'}


In [17]:
import random, torch

cos_sim = torch.nn.functional.cosine_similarity

tp = tn = fp = fn = 0

# Genuine trials: each validation embedding claims its own identity.
for cid, _, emb in val_embs:
    if cid in prototypes:
        s = cos_sim(emb.unsqueeze(0), prototypes[cid].unsqueeze(0)).item()
        if s >= threshold:
            tp += 1
        else:
            fn += 1

# Impostor trials: each validation embedding claims one random other identity.
proto_ids = list(prototypes.keys())
for cid, _, emb in val_embs:
    impostors = [x for x in proto_ids if x != cid]
    if impostors:
        wrong = random.choice(impostors)
        s = cos_sim(emb.unsqueeze(0), prototypes[wrong].unsqueeze(0)).item()
        if s >= threshold:
            fp += 1  # an accepted impostor is a false accept
        else:
            tn += 1

# max(1, ...) avoids division by zero when a trial set is empty.
n_trials = tp + tn + fp + fn
acc = (tp + tn) / max(1, n_trials)
far = fp / max(1, (fp + tn))  # share of impostor trials that were accepted
frr = fn / max(1, (fn + tp))  # share of genuine trials that were rejected

print(f"Accuracy: {acc*100:.2f}% | FAR: {far*100:.2f}% | FRR: {frr*100:.2f}%")
print(f"Confusion -> TP:{tp} TN:{tn} FP:{fp} FN:{fn}")


Accuracy: 99.17% | FAR: 1.04% | FRR: 0.63%
Confusion -> TP:1429 TN:1423 FP:15 FN:9


In [19]:
!pip -q install gradio

import gradio as gr
import torch, time

# Relies on objects created by earlier cells:
# prototypes, threshold, id2name_art, image_to_embedding()

proto_ids = list(prototypes)
# (label, value) pairs: the display name, falling back to the id itself, then the id.
id_choices = [(id2name_art.get(pid, pid), pid) for pid in proto_ids]

def cosine(a,b):
    """Cosine similarity of two embedding tensors as a plain float."""
    batched_a = torch.unsqueeze(a, 0)
    batched_b = torch.unsqueeze(b, 0)
    return torch.nn.functional.cosine_similarity(batched_a, batched_b).item()

def verify_ui(claimed_id, image):
    """Gradio callback: score an uploaded face image against the claimed identity.

    Returns a dict whose ``"status"`` is ``"no_file"``, ``"face_not_found"``,
    ``"unknown_user"``, ``"access_granted"`` or ``"access_denied"``. The two
    access outcomes also include the score, threshold, claimed id and name.
    """
    if image is None:
        return {"status":"no_file"}

    # The image component is configured with type="pil", so a PIL.Image arrives here.
    rgb_image = image.convert("RGB")
    face = mtcnn(rgb_image)
    if face is None:
        return {"status":"face_not_found"}

    with torch.no_grad():
        batch = face.unsqueeze(0).to(device)
        emb = torch.nn.functional.normalize(embedder(batch), dim=1)
    emb = emb.squeeze(0).cpu()

    if claimed_id not in prototypes:
        return {"status":"unknown_user"}

    score = cosine(emb, prototypes[claimed_id])
    if score >= threshold:
        decision = "ACCESS_GRANTED"
    else:
        decision = "ACCESS_DENIED"

    return {
        "status": decision.lower(),
        "score": float(score),
        "threshold": float(threshold),
        "claimed_id": claimed_id,
        "claimed_name": id2name_art.get(claimed_id, claimed_id),
    }

# Demo layout: a claimed-user dropdown and an image upload side by side,
# with a button that sends both to verify_ui and shows the result as JSON.
with gr.Blocks() as demo:
    gr.Markdown("## 🔐 Face Verification Demo\nUpload a face image and select the claimed user.")
    with gr.Row():
        # Pass the (label, value) pairs themselves: the dropdown then shows
        # the person's name while verify_ui still receives the id. Passing
        # only the values made users pick from raw UUIDs.
        claimed = gr.Dropdown(choices=id_choices, label="Claimed ID")
        img     = gr.Image(type="pil", label="Face image")
    btn = gr.Button("Verify")
    out = gr.JSON(label="Result")

    btn.click(fn=verify_ui, inputs=[claimed, img], outputs=out)

# share=False does not create a public link.
demo.launch(share=False)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.
* To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

