# Extract Embeddings - Kaggle

Notebook tao embeddings database tren Kaggle voi GPU mien phi.

## Chuan bi:
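1. (Tuy chon) Upload dataset `CelebA_Aligned_Balanced` len Kaggle Datasets - cac cell trong notebook nay khong doc dataset nay, chi dung dataset anh celebrities va dataset checkpoint ben duoi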
2. Upload folder `data/celeb` (chua anh celebrities) len Kaggle Datasets
3. Upload checkpoint model (arcface_best.pth/facenet_best.pth) len Kaggle Datasets
4. Add tat ca datasets vao notebook nay
5. Bat GPU: Settings > Accelerator > GPU P100/T4

In [None]:
import os
import sys

# The presence of the KAGGLE_KERNEL_RUN_TYPE environment variable is used as
# the signal that this notebook is running inside a Kaggle kernel.
IS_KAGGLE = os.environ.get('KAGGLE_KERNEL_RUN_TYPE') is not None
print(f"Kaggle environment: {IS_KAGGLE}")

# The cells below use /kaggle/... paths, so warn early when running elsewhere.
if IS_KAGGLE is False:
    print("WARNING: Notebook nay duoc thiet ke cho Kaggle!")

In [None]:
# Select protobuf's pure-Python implementation and pin protobuf to the 3.20 series.
# NOTE(review): presumably a workaround for protobuf version conflicts among the
# packages preinstalled on Kaggle - confirm it is still required.
os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
!pip install protobuf==3.20.* --quiet

In [None]:
# === PATH CONFIGURATION ===
# Adjust the dataset names below to match the names used when the datasets
# were uploaded to Kaggle.

# Where the project repository is cloned, and where the embeddings are written.
ROOT = "/kaggle/working/FaceRecognition"
OUTPUT_DIR = "/kaggle/working/embeddings"

# Dataset holding the celebrity images to embed (change if needed).
CELEB_DATASET_NAME = "celeb-dataset"
CELEB_DATA_DIR = "/kaggle/input/" + CELEB_DATASET_NAME

# Dataset holding the model checkpoint (change if needed).
CHECKPOINT_DATASET_NAME = "arcface-checkpoints"
CHECKPOINT_DIR = "/kaggle/input/" + CHECKPOINT_DATASET_NAME

# Model type: 'arcface' or 'facenet' (change if needed).
MODEL_TYPE = "arcface"

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Echo the effective configuration, one "NAME: value" line per setting.
print("\n".join(
    f"{setting}: {setting_value}"
    for setting, setting_value in (
        ("ROOT", ROOT),
        ("CELEB_DATA_DIR", CELEB_DATA_DIR),
        ("CHECKPOINT_DIR", CHECKPOINT_DIR),
        ("OUTPUT_DIR", OUTPUT_DIR),
        ("MODEL_TYPE", MODEL_TYPE),
    )
))

In [None]:
# Sanity check: confirm both Kaggle input datasets are attached before doing any work.
print("=== KIEM TRA DATASETS ===\n")

print("Tat ca datasets:")
# List everything under /kaggle/input.
!ls -la /kaggle/input/

print(f"\n--- Celeb Dataset ({CELEB_DATA_DIR}) ---")
if os.path.exists(CELEB_DATA_DIR):
    # Show only the first 20 lines of the listing.
    !ls -la {CELEB_DATA_DIR} | head -20
    # Count the immediate sub-directories; each one is reported as a celebrity folder.
    num_persons = len([d for d in os.listdir(CELEB_DATA_DIR) if os.path.isdir(os.path.join(CELEB_DATA_DIR, d))])
    print(f"\n[OK] Tim thay {num_persons} thu muc celebrity")
else:
    # Only reports the problem; execution continues with the next cell.
    print(f"[ERROR] Khong tim thay: {CELEB_DATA_DIR}")
    print("        Hay add dataset celeb vao notebook")

print(f"\n--- Checkpoint Dataset ({CHECKPOINT_DIR}) ---")
if os.path.exists(CHECKPOINT_DIR):
    !ls -la {CHECKPOINT_DIR}
else:
    # Only reports the problem; execution continues with the next cell.
    print(f"[ERROR] Khong tim thay: {CHECKPOINT_DIR}")
    print("        Hay add dataset checkpoint vao notebook")

In [None]:
# Read an optional GitHub token from Kaggle Secrets. Any failure (module not
# importable, secret not defined, ...) is reported and treated as "no token".
try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    GITHUB_TOKEN = user_secrets.get_secret("GITHUB_TOKEN")
    print("[OK] Da lay GITHUB_TOKEN tu Kaggle Secrets")
except Exception as err:
    GITHUB_TOKEN = None
    print("[WARN] Khong lay duoc token tu Kaggle Secrets\n" + f"       Loi: {err}")

# With a token the clone URL carries it as credentials; otherwise the plain
# public URL is used.
# NOTE(review): a token embedded in the URL may appear wherever the URL is
# stored or echoed - confirm that is acceptable for this token's scope.
REPO_URL = (
    f"https://{GITHUB_TOKEN}@github.com/sin0235/FaceRecognition.git"
    if GITHUB_TOKEN
    else "https://github.com/sin0235/FaceRecognition.git"
)
print(
    "[OK] GitHub token da duoc cau hinh"
    if GITHUB_TOKEN
    else "[INFO] Su dung public URL (khong can token)"
)

In [None]:
# Get the project sources into ROOT: update an existing checkout, otherwise clone.
if os.path.exists(ROOT):
    print("Repository da ton tai, dang pull updates...")
    %cd {ROOT}
    if GITHUB_TOKEN:
        # Point origin at the token-bearing URL before pulling.
        # NOTE(review): this writes the token into the checkout's git config.
        !git remote set-url origin {REPO_URL}
    !git pull
else:
    print(f"Dang clone repository...")
    !git clone {REPO_URL} {ROOT}
    %cd {ROOT}

# Both branches change into ROOT via %cd; show where we ended up and what is there.
print(f"\nWorking directory: {os.getcwd()}")
!ls -la

In [None]:
# Put ROOT at the front of the import search path (once), so modules inside
# the cloned repository can be imported by later cells.
if sys.path.count(ROOT) == 0:
    sys.path[:0] = [ROOT]
    print(f"Da them {ROOT} vao Python path")

In [None]:
# Install the extra Python packages for this notebook (-q keeps pip output short).
print("Cai dat dependencies...")
!pip install -q opencv-python-headless Pillow scikit-learn tqdm pyyaml facenet-pytorch
# scikit-image is installed in a separate pip call; a later cell treats it as optional.
!pip install -q scikit-image
print("\nHoan tat cai dat!")

In [None]:
import torch

# Choose the compute device once; the report below is derived from that choice.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

print("=== GPU INFO ===")
print(f"CUDA available: {DEVICE == 'cuda'}")
if DEVICE == 'cuda':
    # Details for GPU index 0; memory is reported in units of 1e9 bytes.
    print(
        f"  CUDA version: {torch.version.cuda}\n"
        f"  Device: {torch.cuda.get_device_name(0)}\n"
        f"  Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB"
    )

print(f"\nSu dung device: {DEVICE}")

In [None]:
import glob

print("=== TIM CHECKPOINT ===\n")


def _checkpoint_score(path, keyword):
    """Rank a checkpoint path for ``keyword``; a larger tuple is a better match.

    The first element is 2 when the *file name* contains ``keyword``, 1 when
    only a parent directory does, and 0 otherwise. The second element is
    True when the file name contains 'best'.

    The file name is checked before the directories because the dataset
    folder itself may carry a model name (e.g.
    '.../arcface-checkpoints/facenet_best.pth'); a plain substring test on
    the full path would then match checkpoints of the wrong model.
    """
    file_name = os.path.basename(path).lower()
    if keyword in file_name:
        name_level = 2
    elif keyword in path.lower():
        name_level = 1
    else:
        name_level = 0
    return (name_level, 'best' in file_name)


# Sorted so the selection does not depend on filesystem enumeration order.
pth_files = sorted(glob.glob(os.path.join(CHECKPOINT_DIR, "**/*.pth"), recursive=True))

if pth_files:
    print(f"Tim thay {len(pth_files)} file checkpoint:")
    for f in pth_files:
        print(f"  - {f}")

    # Any MODEL_TYPE other than "arcface" searches for FaceNet checkpoints.
    keyword = "arcface" if MODEL_TYPE == "arcface" else "facenet"

    # max() returns the first best-scoring entry, so ties follow sorted order.
    model_file = max(pth_files, key=lambda path: _checkpoint_score(path, keyword))
    if _checkpoint_score(model_file, keyword)[0] == 0:
        # Nothing mentions the requested model: still fall back to a file, but
        # say so, because it may belong to a different architecture.
        print(f"\n[WARN] No checkpoint path mentions '{keyword}'; falling back to {model_file}")

    MODEL_PATH = model_file
    print(f"\n[OK] Se su dung: {MODEL_PATH}")
else:
    print(f"[ERROR] Khong tim thay file .pth trong {CHECKPOINT_DIR}")
    MODEL_PATH = None

In [None]:
import numpy as np
import cv2
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union
from PIL import Image
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
from tqdm import tqdm

# scikit-image is optional: record whether SimilarityTransform could be
# imported instead of failing the cell.
try:
    from skimage.transform import SimilarityTransform
except ImportError:
    HAS_SKIMAGE = False
    print("[WARN] scikit-image not available")
else:
    HAS_SKIMAGE = True
    print("[OK] scikit-image available")

In [None]:
# Five (x, y) reference points stored as a (5, 2) float32 array.
# NOTE(review): presumably the usual ArcFace 5-landmark alignment template for
# 112x112 crops (eyes, nose, mouth corners) - confirm against the alignment code.
ARCFACE_TEMPLATE = np.asarray(
    (
        (38.2946, 51.6963),
        (73.5318, 51.5014),
        (56.0252, 71.7366),
        (41.5493, 92.3655),
        (70.7299, 92.2041),
    ),
    dtype=np.float32,
)


def get_transform(image_size: int = 112):
    """Build the preprocessing pipeline used with the ArcFace model.

    The pipeline resizes the image to ``image_size`` x ``image_size``,
    converts it to a tensor, and normalizes every channel with mean 0.5 and
    standard deviation 0.5.

    Args:
        image_size: Side length of the square output, 112 by default.

    Returns:
        A ``transforms.Compose`` applying the three steps in order.
    """
    target_size = (image_size, image_size)
    steps = [
        transforms.Resize(target_size),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5] * 3, std=[0.5] * 3),
    ]
    return transforms.Compose(steps)


def get_facenet_transform(image_size: int = 160):
    """Build the preprocessing pipeline used with the FaceNet model.

    Same three steps as the ArcFace pipeline (square resize, tensor
    conversion, per-channel normalization with mean 0.5 / std 0.5); only the
    default output size differs.

    Args:
        image_size: Side length of the square output, 160 by default.

    Returns:
        A ``transforms.Compose`` applying the three steps in order.
    """
    resize = transforms.Resize((image_size, image_size))
    to_tensor = transforms.ToTensor()
    normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
    return transforms.Compose([resize, to_tensor, normalize])

In [None]:
def load_arcface_model(model_path: str, device: str = 'cpu'):
    """Load an ArcFace checkpoint and return the model ready for inference.

    Args:
        model_path: Path to the ``.pth`` checkpoint file.
        device: Device the checkpoint is mapped to and the model is moved to.

    Returns:
        A ``(model, embedding_size)`` tuple; the model is in eval mode.

    Raises:
        KeyError: If the checkpoint contains neither 'model_state_dict' nor
            'state_dict'.
    """
    from models.arcface.arcface_model import ArcFaceModel

    # NOTE(security): weights_only=False unpickles arbitrary Python objects,
    # so only load checkpoints that come from a trusted source.
    checkpoint = torch.load(model_path, map_location=device, weights_only=False)

    # Hyper-parameters come from the saved config when present; otherwise the
    # top-level checkpoint entry, then the hard-coded defaults, are used.
    config = checkpoint.get('config', {})
    num_classes = config.get('num_classes', checkpoint.get('num_classes', 100))
    embedding_size = config.get('model', {}).get('embedding_size', 512)

    # Accept the same two key names as load_facenet_model so checkpoints saved
    # under either convention load the same way.
    state_key = next(
        (key for key in ('model_state_dict', 'state_dict') if key in checkpoint),
        None,
    )
    if state_key is None:
        raise KeyError(
            "Checkpoint has neither 'model_state_dict' nor 'state_dict': "
            f"{model_path}"
        )

    model = ArcFaceModel(num_classes=num_classes, embedding_size=embedding_size)
    model.load_state_dict(checkpoint[state_key])
    model.to(device)
    model.eval()

    print(f"Loaded ArcFace model:")
    print(f"  - Num classes: {num_classes}")
    print(f"  - Embedding size: {embedding_size}")
    print(f"  - Epoch: {checkpoint.get('epoch', 'N/A')}")

    return model, embedding_size


def load_facenet_model(model_path: str, device: str = 'cpu'):
    """Load a FaceNet checkpoint and return the model ready for inference.

    The model is first constructed with ``pretrained='vggface2'``; weights
    from the checkpoint are then loaded non-strictly on top of it.

    Args:
        model_path: Path to the ``.pth`` checkpoint file.
        device: Device the checkpoint is mapped to and the model is moved to.

    Returns:
        A ``(model, embedding_size)`` tuple; the model is in eval mode.
    """
    from models.facenet.facenet_model import FaceNetModel

    # NOTE(security): weights_only=False unpickles arbitrary Python objects,
    # so only load checkpoints that come from a trusted source.
    checkpoint = torch.load(model_path, map_location=device, weights_only=False)

    config = checkpoint.get('config', {})
    embedding_size = config.get('model', {}).get('embedding_size', 512)

    model = FaceNetModel(embedding_size=embedding_size, pretrained='vggface2', device=device)

    state_key = next(
        (key for key in ('model_state_dict', 'state_dict') if key in checkpoint),
        None,
    )
    if state_key is None:
        # Nothing to load: the model keeps the weights it was constructed
        # with. Say so instead of silently reporting a successful load.
        print("[WARN] Checkpoint has neither 'model_state_dict' nor 'state_dict'; "
              "no fine-tuned weights were loaded")
    else:
        # strict=False tolerates key mismatches; surface them so a partial
        # load is not mistaken for a complete one.
        load_result = model.load_state_dict(checkpoint[state_key], strict=False)
        missing = list(getattr(load_result, 'missing_keys', ()))
        unexpected = list(getattr(load_result, 'unexpected_keys', ()))
        if missing or unexpected:
            print(f"[WARN] Partial weight load: {len(missing)} missing key(s), "
                  f"{len(unexpected)} unexpected key(s)")

    model.to(device)
    model.eval()

    print(f"Loaded FaceNet model:")
    print(f"  - Embedding size: {embedding_size}")
    print(f"  - Epoch: {checkpoint.get('epoch', 'N/A')}")

    return model, embedding_size

In [None]:
def extract_embedding_single(img_input, model, transform, device, model_type='arcface'):
    """Compute one L2-normalized embedding for a single image.

    Args:
        img_input: A file path (``str`` or ``os.PathLike``) or an object with
            a ``convert('RGB')`` method such as a PIL image.
        model: Callable producing embeddings. It is called as
            ``model(x)`` when ``model_type == 'facenet'`` and as
            ``model(x, labels=None)`` otherwise.
        transform: Callable turning the RGB image into a tensor.
        device: Device the input tensor is moved to.
        model_type: ``'facenet'`` or anything else (treated as ArcFace).

    Returns:
        A flattened NumPy array, or ``None`` when the image could not be
        processed. Failures are reported on stdout, not raised, so one bad
        image does not abort a whole batch.
    """
    try:
        if isinstance(img_input, (str, os.PathLike)):
            # Close the file handle promptly; convert() returns a separate
            # in-memory copy, so the image stays usable after the `with`.
            with Image.open(img_input) as opened:
                img = opened.convert('RGB')
        else:
            img = img_input.convert('RGB')

        # Add a leading batch dimension of size 1.
        img_tensor = transform(img).unsqueeze(0).to(device)

        with torch.no_grad():
            if model_type == 'facenet':
                embedding = model(img_tensor)
            else:
                embedding = model(img_tensor, labels=None)
            embedding = F.normalize(embedding, p=2, dim=1)

        return embedding.cpu().numpy().flatten()
    except Exception as exc:
        # Best effort by design: report the cause and let the caller skip it.
        if isinstance(img_input, (str, os.PathLike)):
            label = img_input
        else:
            label = type(img_input).__name__
        print(f"[WARN] Skipping {label}: {type(exc).__name__}: {exc}")
        return None


def extract_embedding_for_folder(folder, model, transform, device, model_type='arcface'):
    """Average the embeddings of every image in ``folder`` into one vector.

    Files ending in .jpg, .jpeg, .png or .webp (case-insensitive) directly
    inside ``folder`` are embedded with ``extract_embedding_single``; images
    that fail are skipped.

    Args:
        folder: Directory containing the images of one person.
        model, transform, device, model_type: Passed through unchanged to
            ``extract_embedding_single``.

    Returns:
        The mean embedding rescaled to unit L2 norm, or ``None`` when
        ``folder`` is not a directory or yields no usable embedding.
    """
    # isdir rather than exists: a regular file would make os.listdir raise.
    if not os.path.isdir(folder):
        return None

    image_exts = ('.jpg', '.jpeg', '.png', '.webp')
    # Sorted so the processing order (and hence the floating-point mean) is
    # reproducible across runs and filesystems.
    image_names = sorted(
        name for name in os.listdir(folder) if name.lower().endswith(image_exts)
    )

    embeddings = []
    for name in image_names:
        emb = extract_embedding_single(
            os.path.join(folder, name), model, transform, device, model_type
        )
        if emb is not None:
            embeddings.append(emb)

    if not embeddings:
        return None

    mean_emb = np.mean(np.stack(embeddings, axis=0), axis=0)
    # The epsilon guards against division by zero for an all-zero mean.
    return mean_emb / (np.linalg.norm(mean_emb) + 1e-8)

In [None]:
# Load the selected checkpoint together with its matching preprocessing.
# Only MODEL_TYPE == "facenet" selects FaceNet; every other value uses ArcFace.
if MODEL_PATH is not None:
    print("=" * 60 + f"\nLOADING MODEL: {MODEL_TYPE.upper()}\n" + "=" * 60)

    if MODEL_TYPE != "facenet":
        model, embedding_size = load_arcface_model(MODEL_PATH, DEVICE)
        transform = get_transform()
    else:
        model, embedding_size = load_facenet_model(MODEL_PATH, DEVICE)
        transform = get_facenet_transform()

    print("\n[OK] Model loaded successfully!")
else:
    # No checkpoint was found earlier; nothing is loaded in this cell.
    print("[ERROR] Khong co model checkpoint!")

In [None]:
print("="*60)
print("EXTRACT EMBEDDINGS DATABASE")
print("="*60)


def _resolve_celeb_dir(base_dir, image_exts=('.jpg', '.jpeg', '.png', '.webp')):
    """Return the directory whose immediate sub-folders are the person folders.

    If some sub-folder of ``base_dir`` directly contains image files,
    ``base_dir`` already has the ``<person>/<image>`` layout and is returned
    unchanged. Otherwise the first sub-folder (in sorted order) that itself
    contains directories is returned, which covers uploads wrapped in one
    extra top-level folder. If neither applies, ``base_dir`` is returned.

    Raises:
        FileNotFoundError: If ``base_dir`` is not an existing directory.
    """
    if not os.path.isdir(base_dir):
        raise FileNotFoundError(f"Celebrity dataset directory not found: {base_dir}")

    subdirs = sorted(
        os.path.join(base_dir, entry)
        for entry in os.listdir(base_dir)
        if os.path.isdir(os.path.join(base_dir, entry))
    )

    def has_images(folder):
        return any(name.lower().endswith(image_exts) for name in os.listdir(folder))

    if any(has_images(sub) for sub in subdirs):
        return base_dir

    for sub in subdirs:
        if any(os.path.isdir(os.path.join(sub, entry)) for entry in os.listdir(sub)):
            return sub

    return base_dir


# The nested-folder fallback used to run only when CELEB_DATA_DIR was missing,
# where os.listdir fails immediately; it is now applied to an existing
# directory, and a missing directory raises a clear error instead.
actual_celeb_dir = _resolve_celeb_dir(CELEB_DATA_DIR)

print(f"Data directory: {actual_celeb_dir}")

# Sorted so the processing order and the saved database order are reproducible.
persons = sorted(
    p for p in os.listdir(actual_celeb_dir)
    if os.path.isdir(os.path.join(actual_celeb_dir, p))
)

print(f"Tim thay {len(persons)} celebrities")
print(f"Model type: {MODEL_TYPE}")
print(f"Device: {DEVICE}")
print("\nDang extract embeddings...\n")

In [None]:
# Build the database: one averaged embedding per person folder.
db = {}
success_count = 0
failed_persons = []

for person in tqdm(persons, desc="Extracting embeddings"):
    person_folder = os.path.join(actual_celeb_dir, person)
    emb = extract_embedding_for_folder(person_folder, model, transform, DEVICE, MODEL_TYPE)
    if emb is None:
        # No usable embedding for this folder; remember it for the report.
        failed_persons.append(person)
        continue
    db[person] = emb
    success_count += 1

# Summary of the run.
print("\n=== KET QUA ===")
print(f"Thanh cong: {success_count}/{len(persons)}")
print(f"That bai: {len(failed_persons)}")

if failed_persons:
    print("\nCac thu muc that bai:")
    # List at most the first ten failures, then report how many were left out.
    for p in failed_persons[:10]:
        print(f"  - {p}")
    if len(failed_persons[10:]) > 0:
        print(f"  ... va {len(failed_persons[10:])} thu muc khac")

In [None]:
# Persist the database as <MODEL_TYPE>_embeddings_db.npy inside OUTPUT_DIR.
if not db:
    print("[ERROR] Khong co embeddings nao duoc tao!")
else:
    output_filename = "{}_embeddings_db.npy".format(MODEL_TYPE)
    output_path = os.path.join(OUTPUT_DIR, output_filename)

    # NOTE: db is a plain dict, so np.save stores it as a pickled object;
    # readers presumably need np.load(..., allow_pickle=True) - confirm
    # against the code that consumes this file.
    np.save(output_path, db)

    print("\n[OK] Da luu embeddings database!")
    print(f"     File: {output_path}")
    print(f"     So celebrities: {len(db)}")
    print(f"     Embedding size: {embedding_size}")

    # Size on disk, converted from bytes to MiB.
    file_size = os.path.getsize(output_path) / 1024 / 1024
    print(f"     File size: {file_size:.2f} MB")

In [None]:
# Quick sanity check of the in-memory database.
print("=== KIEM TRA DATABASE ===\n")

if db:
    # Look at no more than the first five entries.
    sample_names = list(db)[:5]
    print("Sample celebrities:")
    for name in sample_names:
        emb = db[name]
        print(f"  - {name}: shape={emb.shape}, norm={np.linalg.norm(emb):.4f}")

    print("\nTinh similarity giua 2 nguoi dau tien:")
    if len(sample_names) > 1:
        # Dot product of the two stored vectors.
        emb1 = db[sample_names[0]]
        emb2 = db[sample_names[1]]
        similarity = emb1.dot(emb2)
        print(f"  {sample_names[0]} vs {sample_names[1]}: {similarity:.4f}")

In [None]:
# Final cell: list what was written to OUTPUT_DIR and tell the user what to do with it.
print("=== OUTPUT FILES ===\n")
!ls -la {OUTPUT_DIR}

print("\n" + "="*60)
print("HOAN TAT!")
print("="*60)
# The two messages below point the user at OUTPUT_DIR and at the project's data/ folder.
print(f"\nDownload file embeddings tu: {OUTPUT_DIR}")
print("Copy file nay vao thu muc data/ cua project de su dung.")

## Huong dan su dung

1. **Chuan bi datasets tren Kaggle:**
   - Upload folder `data/celeb` (chua anh celebrities) thanh 1 dataset
   - Upload file checkpoint (arcface_best.pth hoac facenet_best.pth) thanh 1 dataset

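2. **Cau hinh notebook:**
   - Sua ten dataset trong cell cau hinh (CELEB_DATASET_NAME, CHECKPOINT_DATASET_NAME)
   - Chon MODEL_TYPE: 'arcface' hoac 'facenet'
   - (Tuy chon) Them secret `GITHUB_TOKEN` vao Kaggle Secrets neu repository la private; neu khong co token, notebook se clone bang public URL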

3. **Chay notebook:**
   - Bat GPU trong Settings
   - Run All cells

4. **Download output:**
   - File embeddings se o trong /kaggle/working/embeddings/
   - Download va copy vao thu muc data/ cua project