# ArcFace Training - Kaggle

Notebook huấn luyện ArcFace trên Kaggle với GPU miễn phí.

## Chuẩn bị:
1. Upload dataset `CelebA_Aligned_Balanced` lên Kaggle Datasets
2. Add dataset vào notebook này
3. Bật GPU: Settings > Accelerator > GPU P100/T4

In [1]:
# Detect môi trường
import os
import sys

# The notebook treats the presence of KAGGLE_KERNEL_RUN_TYPE in the
# environment as the marker for "running on Kaggle".
IS_KAGGLE = os.environ.get('KAGGLE_KERNEL_RUN_TYPE') is not None
print("Kaggle environment: {}".format(IS_KAGGLE))

# Warn (but keep going) when the notebook runs anywhere else.
if IS_KAGGLE is False:
    print("WARNING: Notebook này được thiết kế cho Kaggle!")

Kaggle environment: True


In [2]:
# Work around the protobuf compatibility issue by selecting the pure-Python
# protobuf implementation through this environment variable.
import os
os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'

# Additionally pin protobuf to the 3.20 series (the original note calls this
# the more thorough fix). Both steps run when this cell is executed.
# NOTE(review): the recorded pip output lists several packages that require a
# newer protobuf than 3.20.3 - confirm this pin is still wanted.
!pip install protobuf==3.20.* --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.1/162.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
opentelemetry-proto 1.37.0 requires protobuf<7.0,>=5.0, but you have protobuf 3.20.3 which is incompatible.
onnx 1.18.0 requires protobuf>=4.25.1, but you have protobuf 3.20.3 which is incompatible.
a2a-sdk 0.3.10 requires protobuf>=5.29.5, but you have protobuf 3.20.3 which is incompatible.
ray 2.51.1 requires click!=8.3.0,>=7.0, but you have click 8.3.0 which is incompatible.
bigframes 2.12.0 requires rich<14,>=12.4.4, but you have rich 14.2.0 which is incompatible.
tensorflow-metadata 1.17.2 requires protobuf>=4.25.2; python_version >= "3.11", but you have protobuf 3.20.3 which is incompa

In [3]:
# Kaggle path configuration
# ROOT: folder holding the source code (cloned from GitHub)
# DATA_DIR: folder holding the dataset (from Kaggle Datasets)
# CHECKPOINT_DIR: folder where model checkpoints are saved

CHECKPOINT_DIR = "/kaggle/working/checkpoints/arcface"
ROOT = "/kaggle/working/FaceRecognition"

# Dataset path - change it to match the name of your dataset on Kaggle.
# After adding the dataset, check the path with: !ls /kaggle/input/
KAGGLE_DATASET_NAME = "celeba-aligned-balanced"  # Change if needed
DATA_DIR = "/kaggle/input/{}".format(KAGGLE_DATASET_NAME)

# Make sure the checkpoint folder exists before anything tries to write to it.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Echo the resolved locations, one per line.
for _name, _value in (
    ("ROOT", ROOT),
    ("DATA_DIR", DATA_DIR),
    ("CHECKPOINT_DIR", CHECKPOINT_DIR),
):
    print(f"{_name}: {_value}")

ROOT: /kaggle/working/FaceRecognition
DATA_DIR: /kaggle/input/celeba-aligned-balanced
CHECKPOINT_DIR: /kaggle/working/checkpoints/arcface


In [4]:
# === CHECKPOINT DATASET CONFIGURATION ===
# If you uploaded checkpoints (.pth) as a Kaggle Dataset in order to resume
# training, put the dataset name here.
# Example: for a dataset named 'arcface-checkpoints' that holds arcface_last.pth,
#          set CHECKPOINT_DATASET_NAME = 'arcface-checkpoints'

CHECKPOINT_DATASET_NAME = "arcface-checkpoints"  # Change if you have a checkpoint dataset
# Example: CHECKPOINT_DATASET_NAME = "arcface-checkpoints"

import shutil
import glob

if not CHECKPOINT_DATASET_NAME:
    # No dataset configured: explain how to enable resuming.
    print("[INFO] Khong co CHECKPOINT_DATASET_NAME - Training tu dau")
    print("       Neu muon resume, hay:")
    print("       1. Upload file .pth len Kaggle Dataset")
    print("       2. Add dataset vao notebook")
    print("       3. Dat CHECKPOINT_DATASET_NAME = 'ten-dataset-cua-ban'")
else:
    checkpoint_input_dir = f"/kaggle/input/{CHECKPOINT_DATASET_NAME}"

    if not os.path.exists(checkpoint_input_dir):
        print(f"[ERROR] Khong tim thay checkpoint dataset: {checkpoint_input_dir}")
        print("        Kiem tra lai ten dataset hoac add dataset vao notebook")
    else:
        print(f"[OK] Tim thay checkpoint dataset: {checkpoint_input_dir}")

        # Collect every .pth file anywhere below the dataset folder.
        pth_files = glob.glob(os.path.join(checkpoint_input_dir, "**/*.pth"), recursive=True)

        if not pth_files:
            print(f"[WARN] Khong tim thay file .pth trong {checkpoint_input_dir}")
        else:
            pth_names = [os.path.basename(found) for found in pth_files]

            print(f"    Tim thay {len(pth_files)} file checkpoint:")
            for pth_name in pth_names:
                print(f"      - {pth_name}")

            # Copy the files into CHECKPOINT_DIR so training can resume from
            # them; a file that is already there is left untouched.
            os.makedirs(CHECKPOINT_DIR, exist_ok=True)
            for source_path, pth_name in zip(pth_files, pth_names):
                dest_path = os.path.join(CHECKPOINT_DIR, pth_name)
                if os.path.exists(dest_path):
                    print(f"[SKIP] {pth_name} - da ton tai")
                    continue
                shutil.copy(source_path, dest_path)
                print(f"[COPY] {pth_name} -> {CHECKPOINT_DIR}")

            # Show a short summary of the "last" checkpoint when it exists.
            last_ckpt = os.path.join(CHECKPOINT_DIR, "arcface_last.pth")
            if os.path.exists(last_ckpt):
                import torch
                ckpt = torch.load(last_ckpt, map_location='cpu', weights_only=False)
                print(f"\n[INFO] Checkpoint info:")
                print(f"       Epoch: {ckpt.get('epoch', 0) + 1}")
                print(f"       Best val acc: {ckpt.get('best_val_acc', 0):.2f}%")

[ERROR] Khong tim thay checkpoint dataset: /kaggle/input/arcface-checkpoints
        Kiem tra lai ten dataset hoac add dataset vao notebook


In [5]:
# Check whether the Kaggle dataset has been added to the notebook:
# list /kaggle/input/, then either list DATA_DIR or print how to add the dataset.
print("=== KAGGLE INPUT DATASETS ===")
!ls -la /kaggle/input/

if os.path.exists(DATA_DIR):
    print(f"\n[OK] Dataset found at: {DATA_DIR}")
    !ls -la {DATA_DIR}
else:
    print(f"\n[ERROR] Dataset not found at: {DATA_DIR}")
    print("Hãy add dataset vào notebook:")
    print("  1. Click 'Add data' ở sidebar bên phải")
    print("  2. Tìm và add dataset của bạn")
    print("  3. Cập nhật KAGGLE_DATASET_NAME ở cell trên")

=== KAGGLE INPUT DATASETS ===
total 8
drwxr-xr-x 3 root   root    4096 Dec 13 01:51 .
drwxr-xr-x 5 root   root    4096 Dec 13 01:51 ..
drwxr-xr-x 3 nobody nogroup    0 Dec 13 01:49 celeba-aligned-balanced

[OK] Dataset found at: /kaggle/input/celeba-aligned-balanced
total 4
drwxr-xr-x 3 nobody nogroup    0 Dec 13 01:49 .
drwxr-xr-x 3 root   root    4096 Dec 13 01:51 ..
drwxr-xr-x 6 nobody nogroup    0 Dec 13 01:50 CelebA_Aligned_Balanced


In [None]:
# GitHub token configuration (needed when the repository is private or
# requires authentication)
#
# INSTRUCTIONS:
# 1. Go to: https://github.com/settings/tokens
# 2. Click "Generate new token" > "Generate new token (classic)"
# 3. Select the scope: "repo" (if PRIVATE) or "public_repo" (if PUBLIC)
# 4. Copy the token
# 5. In the Kaggle notebook: Settings > Add-ons > Secrets
# 6. Add a secret: name="GITHUB_TOKEN", value="your_token_here"

# Read the token from Kaggle Secrets so it is not written into the notebook.
try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    GITHUB_TOKEN = user_secrets.get_secret("GITHUB_TOKEN")
except Exception as e:
    # Any failure (module missing, secret missing, ...) falls back to no token.
    GITHUB_TOKEN = None
    print("[WARN] Khong lay duoc token tu Kaggle Secrets")
    print(f"       Loi: {e}")
    print("       Neu repo la private, hay them secret 'GITHUB_TOKEN' trong Settings > Secrets")
else:
    print("[OK] Da lay GITHUB_TOKEN tu Kaggle Secrets")

# Build the clone URL: token-bearing when a token is available, public otherwise.
if not GITHUB_TOKEN:
    REPO_URL = "https://github.com/sin0235/FaceRecognition.git"
    print("[INFO] Su dung public URL (khong can token)")
else:
    REPO_URL = "https://{}@github.com/sin0235/FaceRecognition.git".format(GITHUB_TOKEN)
    # Only a masked form of the URL is printed.
    print("     Repository: https://[TOKEN]@github.com/sin0235/FaceRecognition.git")

[OK] GitHub token da duoc cau hinh
     Repository URL: https://[TOKEN]@github.com/sin0235/FaceRecognition.git


In [None]:
# Clone the repository from GitHub, or pull updates when ROOT already exists.
# Either way the notebook's working directory ends up inside ROOT.

if os.path.exists(ROOT):
    print("Repository da ton tai, dang pull updates...")
    %cd {ROOT}
    if GITHUB_TOKEN:
        # Point origin at REPO_URL, which embeds the token when one is set.
        # NOTE(review): this presumably leaves the token in the clone's git
        # config - confirm that is acceptable for this workspace.
        !git remote set-url origin {REPO_URL}
    !git pull
else:
    print(f"Dang clone repository...")
    !git clone {REPO_URL} {ROOT}
    %cd {ROOT}

print(f"\nWorking directory: {os.getcwd()}")
!ls -la

Repository da ton tai, dang pull updates...
/kaggle/working/FaceRecognition
remote: Enumerating objects: 7, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (1/1), done.[K
remote: Total 4 (delta 3), reused 4 (delta 3), pack-reused 0 (from 0)[K
Unpacking objects: 100% (4/4), 1021 bytes | 1021.00 KiB/s, done.
From https://github.com/sin0235/FaceRecognition
   bb28cec..1f47230  main       -> origin/main
Updating bb28cec..1f47230
Fast-forward
 configs/arcface_kaggle.yaml | 1 [32m+[m
 1 file changed, 1 insertion(+)

Working directory: /kaggle/working/FaceRecognition
total 116
drwxr-xr-x 16 root root  4096 Dec 13 01:51 .
drwxr-xr-x  8 root root  4096 Dec 13 01:54 ..
drwxr-xr-x  2 root root  4096 Dec 13 01:51 app
drwxr-xr-x  2 root root  4096 Dec 13 02:00 configs
drwxr-xr-x  8 root root  4096 Dec 13 02:00 .git
drwxr-xr-x  3 root root  4096 Dec 13 01:51 .github
-rw-r--r--  1 root root  1219 Dec 13 01:51 .gitignore
drwxr-xr-x  2 root root  4096 Dec 1

In [8]:
# Put ROOT at the front of the Python import path, unless it is already listed.
_root_missing = ROOT not in sys.path
if _root_missing:
    sys.path[0:0] = [ROOT]
    print(f"Đã thêm {ROOT} vào Python path")

# Show only the first three entries to keep the output short.
print("Python path: {}...".format(sys.path[:3]))

Đã thêm /kaggle/working/FaceRecognition vào Python path
Python path: ['/kaggle/working/FaceRecognition', '/kaggle/working', '/kaggle/lib/kagglegym']...


In [9]:
# Cai dat dependencies
print("Cai dat dependencies...")

# Fix NumPy version conflict voi matplotlib
!pip install -q "numpy<2.0"

# PyTorch thuong da co san tren Kaggle, chi cai them packages con thieu
!pip install -q opencv-python-headless Pillow scikit-learn tqdm pyyaml

# Upgrade matplotlib de tuong thich
!pip install -q --upgrade matplotlib

print("\nHoan tat cai dat!")

Cai dat dependencies...
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.8/16.8 MB[0m [31m91.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
mkl-umath 0.1.1 requires numpy<1.27.0,>=1.26.4, but you have numpy 2.2.6 which is incompatible.
mkl-random 1.2.4 requires numpy<1.27.0,>=1.26.4, but you have numpy 2.2.6 which is incompatible.
mkl-fft 1.3.8 requires numpy<1.27.0,>=1.26.4, but you have numpy 2.2.6 which is incompatible.
numba 0.60.0 requires numpy<2.1,>=1.22, but you have numpy 2.2.6 which is incompatible.
datasets 4.4.1 requires pyarrow>=21.0.0, but you have pyarrow

In [10]:
# Report GPU availability and whether the optional dependencies can be imported
import torch

print("=== GPU INFO ===")
_cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {_cuda_ok}")
if _cuda_ok:
    # Device 0 is the only GPU inspected here.
    _gpu_props = torch.cuda.get_device_properties(0)
    print(f"  CUDA version: {torch.version.cuda}")
    print(f"  Device: {torch.cuda.get_device_name(0)}")
    print(f"  Memory: {_gpu_props.total_memory / 1e9:.1f} GB")

print("\n=== DEPENDENCIES ===")
# Each package is probed on its own so one missing package does not hide the rest.
try:
    import onnxruntime as ort
except ImportError:
    print("onnxruntime: NOT INSTALLED")
else:
    print(f"onnxruntime: OK ({ort.get_available_providers()})")

try:
    import insightface
except ImportError:
    print("insightface: NOT INSTALLED")
else:
    print(f"insightface: OK (v{insightface.__version__})")

try:
    import cv2
except ImportError:
    print("opencv: NOT INSTALLED")
else:
    print("opencv: OK")

=== GPU INFO ===
CUDA available: True
  CUDA version: 12.4
  Device: Tesla T4
  Memory: 15.8 GB

=== DEPENDENCIES ===
onnxruntime: NOT INSTALLED
insightface: NOT INSTALLED
opencv: OK


In [11]:
# File path configuration
# Prefer the Kaggle-specific config (described in the original note as using a
# smaller batch_size and disabling TensorBoard).
TRAIN_SCRIPT = os.path.join(ROOT, "models", "arcface", "train_arcface.py")
CONFIG_PATH = os.path.join(ROOT, "configs", "arcface_kaggle.yaml")

# Fall back to the default config when the Kaggle one is missing.
if not os.path.exists(CONFIG_PATH):
    CONFIG_PATH = os.path.join(ROOT, "configs", "arcface_config.yaml")
    print("[WARN] arcface_kaggle.yaml not found, using default config")

# Locate the train/val folders (folder mode needs no CSV). If DATA_DIR/train
# does not exist, use the alternative layout nested one level deeper.
_split_root = DATA_DIR
if not os.path.exists(os.path.join(_split_root, "train")):
    _split_root = os.path.join(DATA_DIR, "CelebA_Aligned_Balanced")
train_img_dir = os.path.join(_split_root, "train")
val_img_dir = os.path.join(_split_root, "val")

# Print every resolved path together with whether it exists.
print("=== FILE PATHS ===")
for _lead, _label, _path in (
    ("", "CONFIG", CONFIG_PATH),
    ("\n", "TRAIN_SCRIPT", TRAIN_SCRIPT),
    ("\n", "Train folder", train_img_dir),
    ("\n", "Val folder", val_img_dir),
):
    print(f"{_lead}{_label}: {_path}")
    print(f"  Exists: {os.path.exists(_path)}")

=== FILE PATHS ===
CONFIG: /kaggle/working/FaceRecognition/configs/arcface_kaggle.yaml
  Exists: True

TRAIN_SCRIPT: /kaggle/working/FaceRecognition/models/arcface/train_arcface.py
  Exists: True

Train folder: /kaggle/input/celeba-aligned-balanced/CelebA_Aligned_Balanced/train
  Exists: True

Val folder: /kaggle/input/celeba-aligned-balanced/CelebA_Aligned_Balanced/val
  Exists: True


In [12]:
# Verify the training data (folder mode - no CSV metadata needed).
# Sets data_ready, which the training cell checks before launching.
print("=== KIEM TRA DU LIEU ===")
print("Mode: folder (khong can file CSV metadata)\n")

data_ready = True

# Train folder: every sub-directory is counted as one identity.
if os.path.exists(train_img_dir):
    train_identities = [d for d in os.listdir(train_img_dir)
                        if os.path.isdir(os.path.join(train_img_dir, d))]
    if train_identities:
        print(f"[OK] Train folder: {len(train_identities)} identities")

        # Count images for the first 5 identities only, to keep the check
        # quick, and report the result (it used to be computed and discarded).
        sampled_identities = train_identities[:5]
        total_train_images = sum(
            1
            for identity in sampled_identities
            for f in os.listdir(os.path.join(train_img_dir, identity))
            if f.lower().endswith(('.jpg', '.jpeg', '.png'))
        )
        print(f"     Sample: {train_identities[:3]}...")
        print(f"     Images in first {len(sampled_identities)} identities: {total_train_images}")
    else:
        # An existing but empty folder cannot be trained on.
        print(f"[ERROR] Train folder has no identity sub-folders: {train_img_dir}")
        data_ready = False
else:
    print(f"[ERROR] Train folder not found: {train_img_dir}")
    data_ready = False

# Val folder: same layout as the train folder.
if os.path.exists(val_img_dir):
    val_identities = [d for d in os.listdir(val_img_dir)
                      if os.path.isdir(os.path.join(val_img_dir, d))]
    if val_identities:
        print(f"[OK] Val folder: {len(val_identities)} identities")
    else:
        print(f"[ERROR] Val folder has no identity sub-folders: {val_img_dir}")
        data_ready = False
else:
    print(f"[ERROR] Val folder not found: {val_img_dir}")
    data_ready = False

if data_ready:
    print("\n[OK] Du lieu san sang cho training!")
else:
    # Remind the user of the expected folder layout.
    print("\n[ERROR] Thieu du lieu. Kiem tra lai dataset.")
    print("Cau truc thu muc can co:")
    print("  DATA_DIR/")
    print("    train/")
    print("      identity_1/")
    print("        img1.jpg, img2.jpg, ...")
    print("      identity_2/")
    print("        ...")
    print("    val/")
    print("      ...")

=== KIEM TRA DU LIEU ===
Mode: folder (khong can file CSV metadata)

[OK] Train folder: 9343 identities
     Sample: ['5550', '3347', '3531']...
[OK] Val folder: 9343 identities

[OK] Du lieu san sang cho training!


In [None]:
# Chay training
RESUME_FROM_LAST = True

if not os.path.exists(TRAIN_SCRIPT):
    print(f"[ERROR] Training script not found: {TRAIN_SCRIPT}")
elif not os.path.exists(CONFIG_PATH):
    print(f"[ERROR] Config not found: {CONFIG_PATH}")
elif not data_ready:
    print("[ERROR] Du lieu chua san sang!")
    print("Chay lai cell 'Kiem tra du lieu' o tren")
else:
    print("="*60)
    print("BAT DAU TRAINING ARCFACE")
    print("="*60)
    print(f"Config: {CONFIG_PATH}")
    print(f"Data: {DATA_DIR}")
    print(f"Train: {train_img_dir}")
    print(f"Checkpoints: {CHECKPOINT_DIR}")
    
    # Kiem tra checkpoint de resume
    resume_arg = ""
    last_checkpoint = os.path.join(CHECKPOINT_DIR, "arcface_last.pth")
    if RESUME_FROM_LAST and os.path.exists(last_checkpoint):
        resume_arg = f"--resume {last_checkpoint}"
        print(f"\n[RESUME] Found checkpoint: {last_checkpoint}")
        ckpt = torch.load(last_checkpoint, map_location='cpu', weights_only=False)
        print(f"  Epoch: {ckpt['epoch']+1}")
        print(f"  Best val acc: {ckpt['best_val_acc']:.2f}%")
    else:
        print("\n[NEW] Training tu dau")
    
    print("="*60 + "\n")
    
    cmd = f"python {TRAIN_SCRIPT} --config {CONFIG_PATH} --data_dir {DATA_DIR} --checkpoint_dir {CHECKPOINT_DIR} {resume_arg}"
    !{cmd}

BAT DAU TRAINING ARCFACE
Config: /kaggle/working/FaceRecognition/configs/arcface_kaggle.yaml
Data: /kaggle/input/celeba-aligned-balanced
Train: /kaggle/input/celeba-aligned-balanced/CelebA_Aligned_Balanced/train
Checkpoints: /kaggle/working/checkpoints/arcface

[NEW] Training tu dau

E0000 00:00:1765591249.486560     207 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765591249.493383     207 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[WARN] TensorBoard not available: ValueError
[INFO] TensorBoard available
ARCFACE TRAINING
Config: /kaggle/working/FaceRecognition/configs/arcface_kaggle.yaml
Pretrained backbone: None
Data dir: /kaggle/input/celeba-aligned-balanced
Checkpoint dir: /kaggle/working/checkpoints/arcface
Resume: None
Su dung device: cuda
Data directory: /kaggle/input/celeba-aligne

In [None]:
# Inspect the checkpoints after training: list CHECKPOINT_DIR when it exists,
# otherwise say that there are none yet.
print("=== CHECKPOINTS ===")
if os.path.exists(CHECKPOINT_DIR):
    !ls -lah {CHECKPOINT_DIR}
else:
    print("Chưa có checkpoint nào.")

In [None]:
import sys
import os

# Show the current working directory
print(f"Current working directory: {os.getcwd()}")

# List what is inside /kaggle/working
print("\nContent of /kaggle/working:")
for entry in os.listdir('/kaggle/working'):
    print("  - {}".format(entry))

# Put the repository on the import path (only when the folder exists)
repo_path = '/kaggle/working/FaceRecognition'
if not os.path.exists(repo_path):
    print(f"\nPath not found: {repo_path}")
    print("Please check your repo path!")
else:
    sys.path.insert(0, repo_path)
    print(f"\nAdded to sys.path: {repo_path}")

# Print the first five import-path entries for inspection
print("\nCurrent sys.path:")
for path_entry in sys.path[:5]:
    print("  - {}".format(path_entry))

In [None]:
# Import smoke test: check that the ArcFace model class can be imported.
try:
    from models.arcface.arcface_model import ArcFaceModel
    print("Import thành công!")
    print(f"ArcFaceModel: {ArcFaceModel}")
except Exception as e:
    print(f"Lỗi: {type(e).__name__}: {e}")

    # Extra diagnostics: show whether the package folders exist and what
    # they contain.
    import os
    models_path = '/kaggle/working/FaceRecognition/models'
    arcface_path = '/kaggle/working/FaceRecognition/models/arcface'
    for _folder_label, _folder in (("models", models_path), ("arcface", arcface_path)):
        print(f"\nKiểm tra thư mục {_folder_label}:")
        _folder_found = os.path.exists(_folder)
        print(f"  Tồn tại: {_folder_found}")
        if _folder_found:
            print(f"  Nội dung: {os.listdir(_folder)}")

In [None]:
# Test model sau training
import sys
import os
import importlib.util

checkpoint_path = os.path.join(CHECKPOINT_DIR, "arcface_best.pth")

if not os.path.exists(checkpoint_path):
    # Nothing to test until training has produced the best checkpoint.
    print(f"[WAIT] Chưa có checkpoint: {checkpoint_path}")
    print("Chạy cell training trước.")
else:
    print(f"Testing model: {checkpoint_path}")

    # Load the model module directly from its source file instead of going
    # through a package import.
    spec = importlib.util.spec_from_file_location(
        "arcface_model",
        "/kaggle/working/FaceRecognition/models/arcface/arcface_model.py",
    )
    arcface_module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(arcface_module)
    ArcFaceModel = arcface_module.ArcFaceModel

    # Rebuild the model with the class count stored in the checkpoint
    # (100 is used when the checkpoint does not record it).
    checkpoint = torch.load(checkpoint_path, map_location='cpu', weights_only=False)
    num_classes = checkpoint.get('num_classes', 100)
    model = ArcFaceModel(num_classes=num_classes, embedding_size=512)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    print(f"\n[OK] Loaded model - Epoch {checkpoint.get('epoch', 'N/A')}")
    if 'val_acc' in checkpoint:
        print(f"Validation accuracy: {checkpoint['val_acc']:.2f}%")

    # Push one random 1x3x112x112 tensor through the feature extractor as a
    # forward-pass sanity check.
    dummy_input = torch.randn(1, 3, 112, 112)
    with torch.no_grad():
        embedding = model.extract_features(dummy_input)

    print(f"Embedding shape: {embedding.shape}")
    print("[OK] Model sẵn sàng!")

In [None]:
# Download checkpoints (QUAN TRỌNG - chạy trước khi session kết thúc)
# Kaggle không lưu files sau khi session kết thúc!

from IPython.display import FileLink, display
import shutil

print("=== DOWNLOAD CHECKPOINTS ===")
print("Quan trọng: Kaggle sẽ xóa files khi session kết thúc!")
print("Hãy download các checkpoints bên dưới:\n")

# Copy the checkpoints into a dedicated download folder and show a link to
# each copy.
download_dir = "/kaggle/working/download"
os.makedirs(download_dir, exist_ok=True)

checkpoints = ["arcface_best.pth", "arcface_last.pth"]
for ckpt_name in checkpoints:
    ckpt_path = os.path.join(CHECKPOINT_DIR, ckpt_name)
    if not os.path.exists(ckpt_path):
        print(f"[SKIP] {ckpt_name} - không tồn tại")
        continue

    dest_path = os.path.join(download_dir, ckpt_name)
    shutil.copy(ckpt_path, dest_path)
    print(f"[{ckpt_name}]")
    display(FileLink(dest_path))
    print()

In [None]:
# Tạo file zip để download tất cả checkpoints
import shutil

# Archive base name; shutil.make_archive is asked for the 'zip' format.
zip_path = "/kaggle/working/arcface_checkpoints"
_has_checkpoints = os.path.exists(CHECKPOINT_DIR) and bool(os.listdir(CHECKPOINT_DIR))
if not _has_checkpoints:
    print("Chưa có checkpoints để zip.")
else:
    shutil.make_archive(zip_path, 'zip', CHECKPOINT_DIR)
    _zip_file = f"{zip_path}.zip"
    print(f"Đã tạo: {_zip_file}")
    print("\nClick link bên dưới để download:")
    display(FileLink(_zip_file))

In [None]:
# Zip the training logs (when the log folder exists) and show a download link
log_dir = '/kaggle/working/FaceRecognition/logs/arcface'
if os.path.exists(log_dir):
    _logs_base = '/kaggle/working/arcface_logs'
    shutil.make_archive(_logs_base, 'zip', log_dir)
    display(FileLink(_logs_base + '.zip'))
    print("Click link trên để tải logs")

In [None]:
# Ve bieu do training (Loss va Accuracy)
import matplotlib.pyplot as plt
import json

def load_training_history():
    """Load the recorded training history, preferring JSON over the checkpoint.

    Two sources inside ``CHECKPOINT_DIR`` are tried in order:

    1. ``training_history.json`` - its top-level ``"history"`` entry.
    2. ``arcface_last.pth`` - the ``"history"`` entry of the loaded checkpoint.

    A source is reported as loaded only when it actually yields a history. An
    unreadable or malformed JSON file is reported and skipped, so the
    checkpoint can still be used.

    Returns:
        The history object from the first source that provides one, or
        ``None`` when neither does.
    """
    history = None

    # Try the JSON file first.
    json_path = os.path.join(CHECKPOINT_DIR, 'training_history.json')
    if os.path.exists(json_path):
        try:
            with open(json_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as err:
            # json.JSONDecodeError and UnicodeDecodeError are ValueErrors.
            print(f"[WARN] Could not read {json_path}: {err}")
        else:
            if isinstance(data, dict):
                history = data.get('history')
            if history is not None:
                print(f"[OK] Loaded from: {json_path}")

    # No usable JSON: fall back to the history stored in the checkpoint.
    if history is None:
        ckpt_path = os.path.join(CHECKPOINT_DIR, 'arcface_last.pth')
        if os.path.exists(ckpt_path):
            # Imported here so this cell does not depend on an earlier cell
            # having imported torch.
            import torch
            # NOTE(review): weights_only=False unpickles arbitrary objects;
            # only load checkpoints from a trusted source.
            ckpt = torch.load(ckpt_path, map_location='cpu', weights_only=False)
            history = ckpt.get('history', None)
            if history is not None:
                print(f"[OK] Loaded from checkpoint: {ckpt_path}")

    return history

def plot_training_history(history):
    """Draw the loss and accuracy curves side by side and print a summary.

    ``history`` is indexed with the keys 'epoch', 'train_loss', 'val_loss',
    'train_acc' and 'val_acc'. When it is empty or has no epochs, an error
    line is printed and nothing is drawn. Otherwise the figure is saved as
    ``training_plot.png`` inside ``CHECKPOINT_DIR``, shown, and followed by a
    printed summary.
    """
    if not (history and len(history.get('epoch', [])) > 0):
        print("[ERROR] Khong co du lieu training history")
        return

    epochs = history['epoch']

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    def _draw_panel(ax, train_key, val_key, train_label, val_label,
                    y_label, title, legend_loc, pick_best):
        """Plot one train/val pair on ``ax``, mark the best validation epoch
        and return its index."""
        ax.plot(epochs, history[train_key], 'b-', label=train_label, linewidth=2)
        ax.plot(epochs, history[val_key], 'r-', label=val_label, linewidth=2)
        ax.set_xlabel('Epoch', fontsize=12)
        ax.set_ylabel(y_label, fontsize=12)
        ax.set_title(title, fontsize=14)
        ax.legend(loc=legend_loc)
        ax.grid(True, alpha=0.3)

        # Dashed vertical line plus a star on the best validation value.
        best_idx = history[val_key].index(pick_best(history[val_key]))
        ax.axvline(x=epochs[best_idx], color='g', linestyle='--', alpha=0.7)
        ax.scatter([epochs[best_idx]], [history[val_key][best_idx]],
                   color='g', s=100, zorder=5, marker='*')
        return best_idx

    # Left panel: loss, best epoch = lowest validation loss.
    best_loss_idx = _draw_panel(
        axes[0], 'train_loss', 'val_loss', 'Train Loss', 'Val Loss',
        'Loss', 'Training & Validation Loss', 'upper right', min,
    )
    # Right panel: accuracy, best epoch = highest validation accuracy.
    best_acc_idx = _draw_panel(
        axes[1], 'train_acc', 'val_acc', 'Train Acc', 'Val Acc',
        'Accuracy (%)', 'Training & Validation Accuracy', 'lower right', max,
    )

    plt.tight_layout()

    # Save the figure before showing it.
    plot_path = os.path.join(CHECKPOINT_DIR, 'training_plot.png')
    plt.savefig(plot_path, dpi=150, bbox_inches='tight')
    print(f"\n[OK] Saved plot: {plot_path}")

    plt.show()

    # Summary figures
    final_train_acc = history['train_acc'][-1]
    final_val_acc = history['val_acc'][-1]
    print("\n=== THONG SO TRAINING ===")
    print(f"Tong so epochs: {len(epochs)}")
    print(f"Best Val Loss: {min(history['val_loss']):.4f} (epoch {epochs[best_loss_idx]})")
    print(f"Best Val Acc: {max(history['val_acc']):.2f}% (epoch {epochs[best_acc_idx]})")
    print(f"Final Train Acc: {final_train_acc:.2f}%")
    print(f"Final Val Acc: {final_val_acc:.2f}%")
    print(f"Gap (Train - Val): {final_train_acc - final_val_acc:.2f}%")

# Load the history and draw the plots (or ask the user to train first)
print("=== VE BIEU DO TRAINING ===")
history = load_training_history()
if not history:
    print("[WAIT] Chua co du lieu training. Chay cell training truoc.")
else:
    plot_training_history(history)