# FaceNet Training - Kaggle

Notebook huấn luyện FaceNet trên Kaggle với GPU miễn phí.

## Chuẩn bị:
1. Upload dataset `CelebA_Aligned_Balanced` lên Kaggle Datasets
2. Add dataset vào notebook này
3. Bật GPU: Settings > Accelerator > GPU P100/T4

In [None]:
# Detect môi trường
import os
import sys

# The presence of the KAGGLE_KERNEL_RUN_TYPE environment variable is used as
# the signal that this notebook is running inside a Kaggle kernel.
IS_KAGGLE = os.environ.get('KAGGLE_KERNEL_RUN_TYPE') is not None
print(f"Kaggle environment: {IS_KAGGLE}")

# Outside Kaggle the hard-coded /kaggle/... paths used later will not exist,
# so warn instead of failing here.
if not IS_KAGGLE:
    print("WARNING: Notebook này được thiết kế cho Kaggle!")

In [None]:
# Kaggle path configuration
# Dataset path - change the slug below to match your dataset's name on Kaggle.
KAGGLE_DATASET_NAME = "celeba-aligned-balanced"
DATA_DIR = f"/kaggle/input/{KAGGLE_DATASET_NAME}"

# Writable locations: the cloned repository and the checkpoint output folder.
ROOT = "/kaggle/working/FaceRecognition"
CHECKPOINT_DIR = "/kaggle/working/checkpoints/facenet"

# Create the checkpoint folder up front; a pre-existing folder is not an error.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

for path_label, path_value in (
    ("ROOT", ROOT),
    ("DATA_DIR", DATA_DIR),
    ("CHECKPOINT_DIR", CHECKPOINT_DIR),
):
    print(f"{path_label}: {path_value}")

In [None]:
# === CHECKPOINT DATASET CONFIGURATION ===
# Slug of an attached Kaggle dataset that contains previously saved *.pth
# files. Leave it empty to skip the copy step.
# NOTE(review): no cell visible in this notebook loads the copied files back
# into the model/optimizer - confirm where resuming is expected to happen.
CHECKPOINT_DATASET_NAME = ""

import shutil
import glob


def restore_checkpoints(dataset_name, checkpoint_dir, input_root="/kaggle/input"):
    """Copy every ``*.pth`` file of an attached dataset into ``checkpoint_dir``.

    The dataset folder ``<input_root>/<dataset_name>`` is searched recursively.
    Files are copied flat (by base name); a file whose base name already exists
    in ``checkpoint_dir`` is left untouched and reported as skipped.

    Args:
        dataset_name: folder name of the dataset below ``input_root``.
        checkpoint_dir: destination folder; created only when there is
            something to copy.
        input_root: folder under which datasets are mounted.

    Returns:
        List of destination paths that were copied by this call (empty when
        the dataset is missing, has no ``*.pth`` files, or everything already
        exists at the destination).
    """
    checkpoint_input_dir = f"{input_root}/{dataset_name}"
    if not os.path.exists(checkpoint_input_dir):
        # Previously this case was silent, which made a mistyped slug look
        # like a successful restore.
        print(f"[WARN] Checkpoint dataset not found: {checkpoint_input_dir}")
        return []

    print("[OK] Tim thay checkpoint dataset")
    # Sorted so that, when two files share a base name, the one that wins is
    # deterministic rather than dependent on directory listing order.
    pth_files = sorted(
        glob.glob(os.path.join(checkpoint_input_dir, "**/*.pth"), recursive=True)
    )
    if not pth_files:
        print(f"[WARN] No .pth files found in: {checkpoint_input_dir}")
        return []

    os.makedirs(checkpoint_dir, exist_ok=True)
    copied = []
    for pth_file in pth_files:
        file_name = os.path.basename(pth_file)
        dest_path = os.path.join(checkpoint_dir, file_name)
        if os.path.exists(dest_path):
            # Never overwrite: the destination may hold a newer checkpoint.
            print(f"[SKIP] {file_name} (already exists)")
            continue
        shutil.copy(pth_file, dest_path)
        print(f"[COPY] {file_name}")
        copied.append(dest_path)
    return copied


if CHECKPOINT_DATASET_NAME:
    restore_checkpoints(CHECKPOINT_DATASET_NAME, CHECKPOINT_DIR)
else:
    print("[INFO] Training tu dau (khong co checkpoint)")

In [None]:
# Kiểm tra Kaggle dataset
print("=== KAGGLE INPUT DATASETS ===")
!ls -la /kaggle/input/

if os.path.exists(DATA_DIR):
    print(f"\n[OK] Dataset found at: {DATA_DIR}")
    !ls -la {DATA_DIR}
else:
    print(f"\n[ERROR] Dataset not found at: {DATA_DIR}")

In [None]:
# GitHub token configuration
# Try to read GITHUB_TOKEN from Kaggle secrets. Any failure (module missing,
# secret missing, ...) falls back to anonymous access.
try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    GITHUB_TOKEN = user_secrets.get_secret("GITHUB_TOKEN")
    print("[OK] Da lay GITHUB_TOKEN")
except Exception:
    GITHUB_TOKEN = None
    print("[INFO] Su dung public URL")

# With a token the URL carries the credential; otherwise use the public URL.
REPO_URL = (
    f"https://{GITHUB_TOKEN}@github.com/sin0235/FaceRecognition.git"
    if GITHUB_TOKEN
    else "https://github.com/sin0235/FaceRecognition.git"
)

In [None]:
# Clone the repository into ROOT, or update it when ROOT already exists.
# Either branch ends with the notebook's working directory set to ROOT.
if os.path.exists(ROOT):
    print("Repository da ton tai, dang pull updates...")
    %cd {ROOT}
    # Only rewrite the remote when a token is available, so the pull below
    # uses the authenticated URL.
    if GITHUB_TOKEN:
        !git remote set-url origin {REPO_URL}
    !git pull
else:
    print(f"Dang clone repository...")
    !git clone {REPO_URL} {ROOT}
    %cd {ROOT}

# Show where we ended up and what the checkout contains.
print(f"\nWorking directory: {os.getcwd()}")
!ls -la

In [None]:
# Put ROOT at the front of the Python import path (only once), so modules
# under ROOT take precedence when imported.
root_missing_from_path = ROOT not in sys.path
if root_missing_from_path:
    sys.path[:0] = [ROOT]
    print(f"Da them {ROOT} vao Python path")

In [None]:
# Install dependencies (do NOT reinstall torch)
print("Cai dat dependencies...")
# --no-deps stops pip from installing facenet-pytorch's own requirements
# (presumably to leave the preinstalled torch untouched, per the note above).
!pip install -q facenet-pytorch --no-deps
# Remaining packages are installed normally, in quiet mode.
!pip install -q opencv-python-headless Pillow scikit-learn tqdm pyyaml
print("\nHoan tat cai dat!")

In [None]:
# Kiểm tra GPU
import torch

print("=== GPU INFO ===")
# Query CUDA availability once and reuse the answer below.
has_cuda = torch.cuda.is_available()
print(f"CUDA available: {has_cuda}")
if has_cuda:
    # Device index 0 is the only GPU inspected here.
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"  Device: {torch.cuda.get_device_name(0)}")
    print(f"  Memory: {gpu_props.total_memory / 1e9:.1f} GB")

In [None]:
# Verify the training data layout
def _list_identity_dirs(split_dir):
    """Return the names of the directories located directly under split_dir."""
    return [
        entry for entry in os.listdir(split_dir)
        if os.path.isdir(os.path.join(split_dir, entry))
    ]


train_img_dir = os.path.join(DATA_DIR, "train")
val_img_dir = os.path.join(DATA_DIR, "val")

# Fallback layout: the splits may sit one level deeper, inside a
# "CelebA_Aligned_Balanced" folder. Only the train folder decides this.
if not os.path.exists(train_img_dir):
    nested_root = os.path.join(DATA_DIR, "CelebA_Aligned_Balanced")
    train_img_dir = os.path.join(nested_root, "train")
    val_img_dir = os.path.join(nested_root, "val")

print("=== KIEM TRA DU LIEU ===")

# Each sub-directory of a split is counted as one identity.
if not os.path.exists(train_img_dir):
    print("[ERROR] Train folder not found")
else:
    train_identities = _list_identity_dirs(train_img_dir)
    print(f"[OK] Train: {len(train_identities)} identities")

if not os.path.exists(val_img_dir):
    print("[ERROR] Val folder not found")
else:
    val_identities = _list_identity_dirs(val_img_dir)
    print(f"[OK] Val: {len(val_identities)} identities")

## Training FaceNet

In [None]:
# Import modules
import yaml
import time
import json
from tqdm import tqdm
from torch.utils.data import DataLoader
from torch import optim

from models.facenet.facenet_model import FaceNetModel, TripletLoss
from models.facenet.facenet_dataloader import FaceNetTripletDataset

print("[OK] Imports successful!")

In [None]:
# Load the training configuration from the YAML file
CONFIG_PATH = os.path.join(ROOT, "configs/facenet_kaggle.yaml")

with open(CONFIG_PATH, 'r', encoding='utf-8') as config_file:
    config = yaml.safe_load(config_file)

print(f"Loaded config from: {CONFIG_PATH}")

# Pull out the three sections this notebook reads.
model_cfg, train_cfg, data_cfg = (
    config[section] for section in ('model', 'training', 'dataset')
)

# Required settings: a missing key raises KeyError right here.
BATCH_SIZE = train_cfg['batch_size']
NUM_EPOCHS = train_cfg['num_epochs']
LEARNING_RATE = train_cfg['learning_rate']
MARGIN = model_cfg['margin']
EMBEDDING_SIZE = model_cfg['embedding_size']
IMAGE_SIZE = data_cfg['image_size']
PATIENCE = train_cfg['patience']
# Optional setting: defaults to 4 worker processes when not configured.
NUM_WORKERS = train_cfg.get('num_workers', 4)

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")

print("\n=== CONFIG ===")
for setting_label, setting_value in (
    ("Batch size", BATCH_SIZE),
    ("Epochs", NUM_EPOCHS),
    ("Learning rate", LEARNING_RATE),
    ("Margin", MARGIN),
    ("Embedding size", EMBEDDING_SIZE),
    ("Image size", IMAGE_SIZE),
    ("Patience", PATIENCE),
    ("Num workers", NUM_WORKERS),
):
    print(f"{setting_label}: {setting_value}")

In [None]:
# Build the datasets and their DataLoaders
print("Loading datasets...")

# Settings shared by the train and validation loaders.
shared_loader_kwargs = dict(
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=True,
)

# Training split: augmentation on, batches shuffled.
train_dataset = FaceNetTripletDataset(
    root_dir=train_img_dir, image_size=IMAGE_SIZE, augment=True
)
train_loader = DataLoader(train_dataset, shuffle=True, **shared_loader_kwargs)

# Validation split: no augmentation, fixed order.
val_dataset = FaceNetTripletDataset(
    root_dir=val_img_dir, image_size=IMAGE_SIZE, augment=False
)
val_loader = DataLoader(val_dataset, shuffle=False, **shared_loader_kwargs)

for split_label, split_dataset in (("Train", train_dataset), ("Val", val_dataset)):
    print(f"{split_label} identities: {len(split_dataset.identities)}")
for split_label, split_loader in (("Train", train_loader), ("Val", val_loader)):
    print(f"{split_label} batches: {len(split_loader)}")

In [None]:
# Build the model, loss, optimizer and learning-rate schedule
print("Creating model...")

model = FaceNetModel(embedding_size=EMBEDDING_SIZE, pretrained='vggface2', device=device)

# Parameter count is printed for information only.
total_params = sum(param.numel() for param in model.parameters())
print(f"Total params: {total_params:,}")

criterion = TripletLoss(margin=MARGIN)

# Optional hyper-parameters fall back to these defaults when the training
# section of the config does not define them.
weight_decay = train_cfg.get('weight_decay', 0.0001)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=weight_decay)

scheduler_step = train_cfg.get('scheduler_step', 10)
scheduler_gamma = train_cfg.get('scheduler_gamma', 0.1)
# StepLR multiplies the learning rate by gamma every step_size scheduler steps.
scheduler = optim.lr_scheduler.StepLR(
    optimizer, step_size=scheduler_step, gamma=scheduler_gamma
)

print("Model created!")

In [None]:
# Helper functions
def compute_triplet_metrics(anchor_emb, positive_emb, negative_emb):
    """Summarise how well a batch of triplets is separated.

    Distances are L2 norms taken along dim 1 of the embedding differences.

    Returns a dict with:
        accuracy: fraction of rows whose anchor-positive distance is strictly
            smaller than the anchor-negative distance.
        pos_dist: mean anchor-positive distance.
        neg_dist: mean anchor-negative distance.
    """
    d_pos = (anchor_emb - positive_emb).norm(p=2, dim=1)
    d_neg = (anchor_emb - negative_emb).norm(p=2, dim=1)

    # A triplet counts as correct only when the positive is strictly closer.
    n_correct = (d_pos < d_neg).float().sum()
    batch_size = anchor_emb.size(0)

    return dict(
        accuracy=(n_correct / batch_size).item(),
        pos_dist=d_pos.mean().item(),
        neg_dist=d_neg.mean().item(),
    )

def train_one_epoch(model, loader, criterion, optimizer, device, epoch):
    """Run one optimisation pass over ``loader``.

    Args:
        model: callable module mapping an image batch to embeddings; it is
            switched to training mode.
        loader: iterable yielding ``(anchor, positive, negative)`` batches.
        criterion: loss called as ``criterion(emb_a, emb_p, emb_n)``.
        optimizer: optimizer stepped once per batch.
        device: device the batches are moved to.
        epoch: epoch number, used only for the progress-bar label.

    Returns:
        ``(mean_loss, mean_accuracy)``, each averaged over the batches.

    Raises:
        ValueError: if ``loader`` yields no batches (previously this surfaced
            as a bare ZeroDivisionError at the final division).
    """
    model.train()
    total_loss, total_acc = 0.0, 0.0
    num_batches = 0

    pbar = tqdm(loader, desc=f"Epoch {epoch} [Train]")
    for anchor, positive, negative in pbar:
        anchor = anchor.to(device)
        positive = positive.to(device)
        negative = negative.to(device)

        optimizer.zero_grad()
        emb_a = model(anchor)
        emb_p = model(positive)
        emb_n = model(negative)

        loss = criterion(emb_a, emb_p, emb_n)
        loss.backward()
        optimizer.step()

        # Metrics are for reporting only: detach so no autograd graph is
        # built for them after the backward pass.
        metrics = compute_triplet_metrics(
            emb_a.detach(), emb_p.detach(), emb_n.detach()
        )
        # Read the scalar once; each .item() forces a host sync.
        batch_loss = loss.item()
        total_loss += batch_loss
        total_acc += metrics['accuracy']
        num_batches += 1

        pbar.set_postfix({'loss': f'{batch_loss:.4f}', 'acc': f"{metrics['accuracy']:.4f}"})

    if num_batches == 0:
        raise ValueError(
            f"Training loader produced no batches in epoch {epoch}; "
            "check the dataset path and batch size."
        )

    return total_loss / num_batches, total_acc / num_batches

def validate(model, loader, criterion, device, epoch):
    """Evaluate ``model`` on ``loader`` without updating any weights.

    Args:
        model: callable module mapping an image batch to embeddings; it is
            switched to evaluation mode and left in that mode.
        loader: iterable yielding ``(anchor, positive, negative)`` batches.
        criterion: loss called as ``criterion(emb_a, emb_p, emb_n)``.
        device: device the batches are moved to.
        epoch: epoch number, used only for the progress-bar label.

    Returns:
        ``(mean_loss, mean_accuracy)``, each averaged over the batches.

    Raises:
        ValueError: if ``loader`` yields no batches (previously this surfaced
            as a bare ZeroDivisionError at the final division).
    """
    model.eval()
    total_loss, total_acc = 0.0, 0.0
    num_batches = 0

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        pbar = tqdm(loader, desc=f"Epoch {epoch} [Val]")
        for anchor, positive, negative in pbar:
            anchor = anchor.to(device)
            positive = positive.to(device)
            negative = negative.to(device)

            emb_a = model(anchor)
            emb_p = model(positive)
            emb_n = model(negative)

            loss = criterion(emb_a, emb_p, emb_n)
            metrics = compute_triplet_metrics(emb_a, emb_p, emb_n)
            # Read the scalar once; each .item() forces a host sync.
            batch_loss = loss.item()
            total_loss += batch_loss
            total_acc += metrics['accuracy']
            num_batches += 1

            pbar.set_postfix({'val_loss': f'{batch_loss:.4f}', 'val_acc': f"{metrics['accuracy']:.4f}"})

    if num_batches == 0:
        raise ValueError(
            f"Validation loader produced no batches in epoch {epoch}; "
            "check the dataset path and batch size."
        )

    return total_loss / num_batches, total_acc / num_batches

print("Helper functions defined!")

In [None]:
# Training loop with best-model checkpointing and early stopping
banner = "=" * 60
print(banner)
print("BAT DAU TRAINING FACENET")
print(banner)

best_val_acc = 0.0
patience_counter = 0
# One list per tracked quantity, appended to once per epoch.
history = {key: [] for key in ('train_loss', 'train_acc', 'val_loss', 'val_acc', 'lr')}

training_start = time.time()

for epoch in range(1, NUM_EPOCHS + 1):
    train_loss, train_acc = train_one_epoch(
        model, train_loader, criterion, optimizer, device, epoch
    )
    val_loss, val_acc = validate(model, val_loader, criterion, device, epoch)

    # Learning rate in effect during this epoch (read before scheduler.step()).
    current_lr = optimizer.param_groups[0]['lr']

    epoch_stats = {
        'train_loss': train_loss,
        'train_acc': train_acc,
        'val_loss': val_loss,
        'val_acc': val_acc,
        'lr': current_lr,
    }
    for stat_name, stat_value in epoch_stats.items():
        history[stat_name].append(float(stat_value))

    print(f"\nEpoch {epoch}/{NUM_EPOCHS}")
    print(f"  Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
    print(f"  Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")
    print(f"  LR: {current_lr:.6f}")

    # Only a strict improvement in validation accuracy resets the patience.
    improved = val_acc > best_val_acc
    if not improved:
        patience_counter += 1
        print(f"  Patience: {patience_counter}/{PATIENCE}")
    else:
        best_val_acc = val_acc
        patience_counter = 0

        best_path = os.path.join(CHECKPOINT_DIR, "facenet_best.pth")
        best_checkpoint = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'val_acc': val_acc,
            'val_loss': val_loss,
            'config': config,
        }
        torch.save(best_checkpoint, best_path)
        print(f"  [SAVED] Best model (val_acc: {val_acc:.4f})")

    # Stop before stepping the scheduler once patience is exhausted.
    if patience_counter >= PATIENCE:
        print(f"\n[EARLY STOPPING] Triggered at epoch {epoch}")
        break

    scheduler.step()

total_time = time.time() - training_start
print(f"\n{banner}")
print("TRAINING COMPLETE")
print(banner)
print(f"Total time: {total_time/60:.2f} minutes")
print(f"Best validation accuracy: {best_val_acc:.4f}")

In [None]:
# Lưu checkpoint cuối và history
last_path = os.path.join(CHECKPOINT_DIR, "facenet_last.pth")
torch.save({
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'val_acc': val_acc,
    'val_loss': val_loss,
    'config': config
}, last_path)
print(f"Last checkpoint: {last_path}")

history_path = os.path.join(CHECKPOINT_DIR, "training_history.json")
with open(history_path, 'w') as f:
    json.dump(history, f, indent=2)
print(f"History: {history_path}")

print(f"\nCheckpoint files:")
!ls -la {CHECKPOINT_DIR}

In [None]:
# Visualize training history
import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# One entry per panel: (axis, y-label, title, [(history key, legend label)]).
# A legend label of None means the curve is drawn without a label and the
# panel gets no legend.
panel_specs = [
    (axes[0], 'Loss', 'Loss', [('train_loss', 'Train'), ('val_loss', 'Val')]),
    (axes[1], 'Accuracy', 'Accuracy', [('train_acc', 'Train'), ('val_acc', 'Val')]),
    (axes[2], 'LR', 'Learning Rate', [('lr', None)]),
]

for panel, y_label, panel_title, curves in panel_specs:
    has_legend = False
    for history_key, legend_label in curves:
        if legend_label is None:
            panel.plot(history[history_key])
        else:
            panel.plot(history[history_key], label=legend_label)
            has_legend = True
    panel.set_xlabel('Epoch')
    panel.set_ylabel(y_label)
    panel.set_title(panel_title)
    if has_legend:
        panel.legend()
    panel.grid(True)

plt.tight_layout()
# Save the figure alongside the checkpoints before displaying it.
curves_path = os.path.join(CHECKPOINT_DIR, 'training_curves.png')
plt.savefig(curves_path, dpi=150)
plt.show()

## Download Checkpoint

In [None]:
# Hiển thị checkpoint files
print("=== CHECKPOINT FILES ===")
!ls -lh {CHECKPOINT_DIR}

print("\n=== Download ===")
print(f"Best model: {CHECKPOINT_DIR}/facenet_best.pth")
print(f"Last model: {CHECKPOINT_DIR}/facenet_last.pth")

In [None]:
# Zip checkpoint folder de tai ve
import shutil

zip_name = "facenet_checkpoints"
zip_path = f"/kaggle/working/{zip_name}"

shutil.make_archive(zip_path, "zip", CHECKPOINT_DIR)

print(f"[OK] Da tao file zip: {zip_path}.zip")
print(f"\nDownload file nay tu panel Output ben phai.")
!ls -lh /kaggle/working/*.zip