# ArcFace Training - Kaggle

Notebook huấn luyện ArcFace trên Kaggle với GPU miễn phí.

## Chuẩn bị:
1. Upload dataset `CelebA_Aligned_Balanced` lên Kaggle Datasets
2. Add dataset vào notebook này
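3. Bật GPU: Settings > Accelerator > GPU P100/T4
4. Bật Internet: Settings > Internet > On (cần để clone repository từ GitHub và cài dependencies bằng pip)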

In [None]:
# Detect the runtime environment.
import os
import sys

# The notebook treats the presence of the KAGGLE_KERNEL_RUN_TYPE environment
# variable as the marker of a Kaggle kernel.
IS_KAGGLE = os.environ.get('KAGGLE_KERNEL_RUN_TYPE') is not None
print(f"Kaggle environment: {IS_KAGGLE}")

# Outside Kaggle the notebook only warns; it does not stop.
if IS_KAGGLE is False:
    print("WARNING: Notebook này được thiết kế cho Kaggle!")

In [None]:
# Kaggle path configuration.
#   ROOT:           source-code checkout (cloned from GitHub)
#   DATA_DIR:       dataset directory (provided by Kaggle Datasets)
#   CHECKPOINT_DIR: where model checkpoints are written

_WORKING_DIR = "/kaggle/working"
ROOT = f"{_WORKING_DIR}/FaceRecognition"
CHECKPOINT_DIR = f"{_WORKING_DIR}/checkpoints/arcface"

# Dataset path - depends on the name of your dataset on Kaggle.
# After adding the dataset, verify the path with: !ls /kaggle/input/
KAGGLE_DATASET_NAME = "celeba-aligned-balanced"  # change if needed
DATA_DIR = "/kaggle/input/" + KAGGLE_DATASET_NAME

# Only the checkpoint directory is created here.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

for _label, _location in (
    ("ROOT", ROOT),
    ("DATA_DIR", DATA_DIR),
    ("CHECKPOINT_DIR", CHECKPOINT_DIR),
):
    print(f"{_label}: {_location}")

In [None]:
# Check whether the Kaggle dataset has been attached to this notebook.
print("=== KAGGLE INPUT DATASETS ===")
# IPython shell escape: list everything under /kaggle/input.
!ls -la /kaggle/input/

if os.path.exists(DATA_DIR):
    print(f"\n[OK] Dataset found at: {DATA_DIR}")
    # IPython substitutes {DATA_DIR} with the value of the Python variable.
    !ls -la {DATA_DIR}
else:
    # The messages below (in Vietnamese) tell the user how to attach the
    # dataset and to update KAGGLE_DATASET_NAME in the configuration cell.
    print(f"\n[ERROR] Dataset not found at: {DATA_DIR}")
    print("Hãy add dataset vào notebook:")
    print("  1. Click 'Add data' ở sidebar bên phải")
    print("  2. Tìm và add dataset của bạn")
    print("  3. Cập nhật KAGGLE_DATASET_NAME ở cell trên")

In [None]:
# Clone the repository from GitHub, or update the checkout if it already exists.
REPO_URL = "https://github.com/sin0235/FaceRecognition.git"

if os.path.exists(ROOT):
    # ROOT already exists: switch into it and pull the latest changes.
    print("Repository đã tồn tại, đang pull updates...")
    %cd {ROOT}
    !git pull
else:
    # ROOT does not exist yet: clone into it, then switch into it.
    print(f"Đang clone repository...")
    !git clone {REPO_URL} {ROOT}
    %cd {ROOT}

# Both branches end with `%cd {ROOT}`, so this is expected to print ROOT
# (assuming the clone / cd succeeded).
print(f"\nWorking directory: {os.getcwd()}")
!ls -la

In [None]:
# Put ROOT at the front of the import search path, once.
if sys.path.count(ROOT) == 0:
    sys.path.insert(0, ROOT)
    print(f"Đã thêm {ROOT} vào Python path")

# Show only the first three entries.
print(f"Python path: {sys.path[:3]}...")

In [None]:
# Install dependencies.
print("Cai dat dependencies...")

# Pin NumPy below 2.0 to work around a version conflict with matplotlib.
!pip install -q "numpy<2.0"

# PyTorch is usually preinstalled on Kaggle, so only the missing packages are installed.
!pip install -q opencv-python-headless Pillow scikit-learn tqdm pyyaml

# Upgrade matplotlib for compatibility.
!pip install -q --upgrade matplotlib

print("\nHoan tat cai dat!")

In [None]:
# Report GPU details and which optional dependencies can be imported.
import torch

print("=== GPU INFO ===")
cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {cuda_ok}")
if cuda_ok:
    print(f"  CUDA version: {torch.version.cuda}")
    print(f"  Device: {torch.cuda.get_device_name(0)}")
    # total_memory is divided by 1e9 to display it in GB.
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"  Memory: {gpu_props.total_memory / 1e9:.1f} GB")

print("\n=== DEPENDENCIES ===")

# Each probe is independent: a missing package is reported, not fatal.
try:
    import onnxruntime as ort
    providers = ort.get_available_providers()
    print(f"onnxruntime: OK ({providers})")
except ImportError:
    print("onnxruntime: NOT INSTALLED")

try:
    import insightface
    print(f"insightface: OK (v{insightface.__version__})")
except ImportError:
    print("insightface: NOT INSTALLED")

try:
    import cv2
    print("opencv: OK")
except ImportError:
    print("opencv: NOT INSTALLED")

In [None]:
# Resolve the config file, the training script and the train/val folders.
# The Kaggle-specific config is preferred (per the original note: smaller
# batch size, TensorBoard disabled).
_config_dir = os.path.join(ROOT, "configs")
CONFIG_PATH = os.path.join(_config_dir, "arcface_kaggle.yaml")
TRAIN_SCRIPT = os.path.join(ROOT, "models", "arcface", "train_arcface.py")

# Fall back to the default config when the Kaggle one is missing.
if not os.path.exists(CONFIG_PATH):
    CONFIG_PATH = os.path.join(_config_dir, "arcface_config.yaml")
    print("[WARN] arcface_kaggle.yaml not found, using default config")

# Locate train/val (folder mode: no CSV needed). Try DATA_DIR itself first,
# then the nested "CelebA_Aligned_Balanced" layout; the last candidate is
# kept even when its train folder does not exist.
for _dataset_root in (DATA_DIR, os.path.join(DATA_DIR, "CelebA_Aligned_Balanced")):
    train_img_dir = os.path.join(_dataset_root, "train")
    val_img_dir = os.path.join(_dataset_root, "val")
    if os.path.exists(train_img_dir):
        break

print("=== FILE PATHS ===")
print(f"CONFIG: {CONFIG_PATH}")
print(f"  Exists: {os.path.exists(CONFIG_PATH)}")
for _label, _path in (
    ("TRAIN_SCRIPT", TRAIN_SCRIPT),
    ("Train folder", train_img_dir),
    ("Val folder", val_img_dir),
):
    print(f"\n{_label}: {_path}")
    print(f"  Exists: {os.path.exists(_path)}")

In [None]:
# Validate the training data (folder mode - no CSV metadata required).
# Sets `data_ready`, which the training cell checks before launching.
print("=== KIEM TRA DU LIEU ===")
print("Mode: folder (khong can file CSV metadata)\n")

IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')
SAMPLE_IDENTITY_COUNT = 5  # only the first few identities are counted, to keep this check fast

data_ready = True

# --- Train folder ---
# isdir (rather than exists) so a stray file at this path is reported as
# missing instead of crashing os.listdir.
if os.path.isdir(train_img_dir):
    # Sorted so the reported sample is stable between runs.
    train_identities = sorted(
        d for d in os.listdir(train_img_dir)
        if os.path.isdir(os.path.join(train_img_dir, d))
    )
    if train_identities:
        print(f"[OK] Train folder: {len(train_identities)} identities")

        # Count images for a small sample of identities and report the result.
        sampled_identities = train_identities[:SAMPLE_IDENTITY_COUNT]
        total_train_images = sum(
            1
            for identity in sampled_identities
            for f in os.listdir(os.path.join(train_img_dir, identity))
            if f.lower().endswith(IMAGE_EXTENSIONS)
        )
        print(f"     Sample: {train_identities[:3]}...")
        print(f"     Images in first {len(sampled_identities)} identities: {total_train_images}")
    else:
        # A folder without identity sub-folders cannot be trained on.
        print(f"[ERROR] Train folder has no identity sub-folders: {train_img_dir}")
        data_ready = False
else:
    print(f"[ERROR] Train folder not found: {train_img_dir}")
    data_ready = False

# --- Val folder ---
if os.path.isdir(val_img_dir):
    val_identities = sorted(
        d for d in os.listdir(val_img_dir)
        if os.path.isdir(os.path.join(val_img_dir, d))
    )
    if val_identities:
        print(f"[OK] Val folder: {len(val_identities)} identities")
    else:
        print(f"[ERROR] Val folder has no identity sub-folders: {val_img_dir}")
        data_ready = False
else:
    print(f"[ERROR] Val folder not found: {val_img_dir}")
    data_ready = False

if data_ready:
    print("\n[OK] Du lieu san sang cho training!")
else:
    # Show the expected directory layout to help the user fix the dataset.
    print("\n[ERROR] Thieu du lieu. Kiem tra lai dataset.")
    print("Cau truc thu muc can co:")
    print("  DATA_DIR/")
    print("    train/")
    print("      identity_1/")
    print("        img1.jpg, img2.jpg, ...")
    print("      identity_2/")
    print("        ...")
    print("    val/")
    print("      ...")

In [None]:
# Chay training
RESUME_FROM_LAST = True

if not os.path.exists(TRAIN_SCRIPT):
    print(f"[ERROR] Training script not found: {TRAIN_SCRIPT}")
elif not os.path.exists(CONFIG_PATH):
    print(f"[ERROR] Config not found: {CONFIG_PATH}")
elif not data_ready:
    print("[ERROR] Du lieu chua san sang!")
    print("Chay lai cell 'Kiem tra du lieu' o tren")
else:
    print("="*60)
    print("BAT DAU TRAINING ARCFACE")
    print("="*60)
    print(f"Config: {CONFIG_PATH}")
    print(f"Data: {DATA_DIR}")
    print(f"Train: {train_img_dir}")
    print(f"Checkpoints: {CHECKPOINT_DIR}")
    
    # Kiem tra checkpoint de resume
    resume_arg = ""
    last_checkpoint = os.path.join(CHECKPOINT_DIR, "arcface_last.pth")
    if RESUME_FROM_LAST and os.path.exists(last_checkpoint):
        resume_arg = f"--resume {last_checkpoint}"
        print(f"\n[RESUME] Found checkpoint: {last_checkpoint}")
        ckpt = torch.load(last_checkpoint, map_location='cpu', weights_only=False)
        print(f"  Epoch: {ckpt['epoch']+1}")
        print(f"  Best val acc: {ckpt['best_val_acc']:.2f}%")
    else:
        print("\n[NEW] Training tu dau")
    
    print("="*60 + "\n")
    
    cmd = f"python {TRAIN_SCRIPT} --config {CONFIG_PATH} --data_dir {DATA_DIR} --checkpoint_dir {CHECKPOINT_DIR} {resume_arg}"
    !{cmd}

In [None]:
# List the checkpoints after training.
print("=== CHECKPOINTS ===")
if os.path.exists(CHECKPOINT_DIR):
    # IPython shell escape; {CHECKPOINT_DIR} is replaced by the variable's value.
    !ls -lah {CHECKPOINT_DIR}
else:
    print("Chưa có checkpoint nào.")

In [None]:
# Smoke-test the best checkpoint: load it and extract one embedding.
checkpoint_path = os.path.join(CHECKPOINT_DIR, "arcface_best.pth")

if not os.path.exists(checkpoint_path):
    print(f"[WAIT] Chưa có checkpoint: {checkpoint_path}")
    print("Chạy cell training trước.")
else:
    print(f"Testing model: {checkpoint_path}")

    # Imported here so the cell does not need the project package when
    # there is no checkpoint to test.
    from models.arcface.arcface_model import ArcFaceModel

    checkpoint = torch.load(checkpoint_path, map_location='cpu', weights_only=False)
    # Falls back to 100 classes when the checkpoint does not record the count.
    num_classes = checkpoint.get('num_classes', 100)

    model = ArcFaceModel(num_classes=num_classes, embedding_size=512)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    loaded_epoch = checkpoint.get('epoch', 'N/A')
    print(f"\n[OK] Loaded model - Epoch {loaded_epoch}")
    if 'val_acc' in checkpoint:
        val_acc = checkpoint['val_acc']
        print(f"Validation accuracy: {val_acc:.2f}%")

    # One random 3x112x112 input, run without gradient tracking.
    dummy_input = torch.randn(1, 3, 112, 112)
    with torch.no_grad():
        embedding = model.extract_features(dummy_input)

    print(f"Embedding shape: {embedding.shape}")
    print("[OK] Model sẵn sàng!")

In [None]:
# Download checkpoints (IMPORTANT - run before the session ends).
# Per the original note, Kaggle does not keep files after the session ends.

from IPython.display import FileLink, display
import shutil

for _line in (
    "=== DOWNLOAD CHECKPOINTS ===",
    "Quan trọng: Kaggle sẽ xóa files khi session kết thúc!",
    "Hãy download các checkpoints bên dưới:\n",
):
    print(_line)

# Copy the checkpoints into a dedicated download folder and link to the copies.
download_dir = "/kaggle/working/download"
os.makedirs(download_dir, exist_ok=True)

checkpoints = ["arcface_best.pth", "arcface_last.pth"]
for ckpt_name in checkpoints:
    ckpt_path = os.path.join(CHECKPOINT_DIR, ckpt_name)
    if not os.path.exists(ckpt_path):
        print(f"[SKIP] {ckpt_name} - không tồn tại")
        continue

    dest_path = os.path.join(download_dir, ckpt_name)
    shutil.copy(ckpt_path, dest_path)
    print(f"[{ckpt_name}]")
    display(FileLink(dest_path))
    print()

In [None]:
# Bundle every checkpoint into a single zip file for download.
# FileLink/display are imported here so this cell also works when it is run
# on its own (e.g. after a kernel restart) without the per-file download cell.
import shutil

from IPython.display import FileLink, display

# Base name only: shutil.make_archive appends the ".zip" extension itself.
zip_path = "/kaggle/working/arcface_checkpoints"
if os.path.exists(CHECKPOINT_DIR) and os.listdir(CHECKPOINT_DIR):
    # make_archive returns the path of the archive it actually wrote.
    archive_path = shutil.make_archive(zip_path, 'zip', CHECKPOINT_DIR)
    print(f"Đã tạo: {archive_path}")
    print("\nClick link bên dưới để download:")
    display(FileLink(archive_path))
else:
    print("Chưa có checkpoints để zip.")