# CelebA Dataset Preprocessing - Xu ly mat can bang

**THU TU XU LY DUNG:**
1. **Loc bo** identity co 1-4 anh (834 identity)
2. **Gom anh** theo identity
3. **Alignment TRUOC** - Su dung landmarks goc (QUAN TRONG!)
4. **Augmentation SAU** - Ap dung tren anh da align (landmarks da chuan hoa)
5. **Chia dataset** theo IDENTITY hoac theo ANH
6. **Tao metadata** cho training

**Tai sao phai Alignment TRUOC Augmentation?**
- Landmarks chi dung cho anh GOC
- Sau flip/rotate, vi tri landmarks BI SAI
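- Align truoc -> augment tren anh 112x112 da chuan -> DUNG!

**Luu y:** Cac cell ben duoi duoc danh so theo thu tu chay thuc te (Buoc 3: Augmentation, Buoc 4: Chia dataset, Buoc 5: Alignment), khac voi thu tu liet ke o tren.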

**Files metadata goc:**
- `data/meta_origin/identity_CelebA.txt` - Mapping anh -> identity
- `data/meta_origin/list_landmarks_align_celeba.csv` - 5 diem landmark (chi dung cho anh goc)
- `data/meta_origin/list_attr_celeba.csv` - 40 thuoc tinh
- `data/meta_origin/list_bbox_celeba.csv` - Bounding box


In [None]:
# Install the third-party libraries used by this notebook
%pip install kaggle tqdm opencv-python-headless numpy albumentations scikit-image


In [None]:
# Mount Google Drive (Colab only)
from google.colab import drive
drive.mount('/content/drive')

import os
# Drive folder that receives the final ZIP archive; created if it does not exist yet
DRIVE_OUTPUT = "/content/drive/MyDrive/FaceRecognition"
os.makedirs(DRIVE_OUTPUT, exist_ok=True)
print(f"Output: {DRIVE_OUTPUT}")


In [None]:
# Set up the Kaggle API credentials.
#
# SECURITY: an API key must never be hard-coded in a notebook, because every copy
# of the notebook (git history, shared Drive link, exported HTML) leaks it.
# NOTE(review): the key that used to be embedded in this cell has to be treated
# as compromised and should be revoked in the Kaggle account settings.
import json
import os
from getpass import getpass

# Prefer the standard Kaggle environment variables; otherwise ask interactively
# so the secret is never stored in the notebook source or its output.
data = {
    "username": os.environ.get("KAGGLE_USERNAME") or input("Kaggle username: ").strip(),
    "key": os.environ.get("KAGGLE_KEY") or getpass("Kaggle API key: ").strip(),
}
if not data["username"] or not data["key"]:
    raise ValueError("Kaggle username and API key are both required")

kaggle_dir = os.path.expanduser("~/.kaggle")
os.makedirs(kaggle_dir, exist_ok=True)
kaggle_json = os.path.join(kaggle_dir, "kaggle.json")

# Write the credentials straight to ~/.kaggle/kaggle.json (no extra copy in the
# working directory) and create the file with owner-only permissions.
fd = os.open(kaggle_json, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
with os.fdopen(fd, "w") as f:
    json.dump(data, f)
# The mode passed to os.open only applies to newly created files; enforce it
# as well when the file already existed with looser permissions.
os.chmod(kaggle_json, 0o600)


In [None]:
# Download the CelebA dataset from Kaggle and extract it into /content/celeba
!kaggle datasets download -d jessicali9530/celeba-dataset -p /content/celeba
!unzip -q /content/celeba/celeba-dataset.zip -d /content/celeba


In [None]:
# Upload identity_CelebA.txt (this file is not shipped with the Kaggle dataset)
from google.colab import files
print("Upload file identity_CelebA.txt:")
uploaded = files.upload()
# Move the uploaded file next to the extracted dataset; the command expects the
# uploaded file to be named exactly identity_CelebA.txt
!mv identity_CelebA.txt /content/celeba/


## Cau hinh xu ly

| Tham so | Gia tri | Mo ta |
|---------|---------|-------|
| MIN_IMAGES | 5 | Loai bo identity co < 5 anh |
| AUGMENT_THRESHOLD | 10 | Augment identity co 5-9 anh |
| TARGET_MIN_IMAGES | 10 | Muc tieu toi thieu sau augment |


In [None]:
# CONFIG - Chon che do COLAB hoac LOCAL
import os
import json
import random
import shutil
import numpy as np
import pandas as pd
import cv2
from tqdm import tqdm
from collections import defaultdict
import albumentations as A
from skimage.transform import SimilarityTransform

# ============================================================
# CHOOSE THE MODE: 'colab' or 'local'
# ============================================================
MODE = 'colab'  # Change to 'local' when running on a personal machine

if MODE == 'colab':
    # Paths for Google Colab - everything is staged on the Colab local disk (fast)
    CELEBA_DIR = "/content/celeba"
    IMG_DIR = "/content/celeba/img_align_celeba/img_align_celeba"
    IDENTITY_FILE = "/content/celeba/identity_CelebA.txt"
    LANDMARK_FILE = "/content/celeba/list_landmarks_align_celeba.csv"
    TEMP_BY_ID = "/content/celeba_by_id"
    TEMP_SPLIT = "/content/celeba_split"
    FINAL_OUTPUT = "/content/CelebA_Aligned_Balanced"  # Colab local disk - fast
else:
    # Paths for a local run (uses the meta_origin files that are already available)
    # NOTE(review): PROJECT_ROOT is not referenced anywhere else in the visible notebook
    PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(".")))
    CELEBA_DIR = "data"
    IMG_DIR = "data/img_align_celeba"  # The images have to be downloaded separately
    IDENTITY_FILE = "data/meta_origin/identity_CelebA.txt"
    LANDMARK_FILE = "data/meta_origin/list_landmarks_align_celeba.csv"
    TEMP_BY_ID = "data/celeba_by_id"
    TEMP_SPLIT = "data/celeba_split"
    FINAL_OUTPUT = "data/CelebA_Aligned_Balanced"

# Filtering parameters
MIN_IMAGES = 5          # Drop identities with fewer than 5 images
AUGMENT_THRESHOLD = 10  # Identities with 5-9 images get augmented
TARGET_MIN_IMAGES = 10  # Minimum number of images per identity after augmentation

# Train / val / test split ratios
TRAIN_RATIO = 0.8
VAL_RATIO = 0.1
TEST_RATIO = 0.1

# Seed for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# ArcFace template 112x112 (5 landmarks), one (x, y) row per point
ARCFACE_TEMPLATE = np.array([
    [38.2946, 51.6963],   # left_eye
    [73.5318, 51.5014],   # right_eye
    [56.0252, 71.7366],   # nose
    [41.5493, 92.3655],   # left_mouth
    [70.7299, 92.2041],   # right_mouth
], dtype=np.float32)

print(f"Mode: {MODE}")
print(f"Identity file: {IDENTITY_FILE}")
print(f"Output: {FINAL_OUTPUT}")
if MODE == 'colab':
    # DRIVE_OUTPUT is defined by the Drive-mount cell, which must have been run first
    print(f"ZIP se luu vao Drive: {DRIVE_OUTPUT}")
print("Config loaded!")


## Buoc 1: Phan tich va loc identity


In [None]:
# Read the image -> identity mapping and report the raw dataset size
identity_df = pd.read_csv(
    IDENTITY_FILE,
    sep=" ",
    header=None,
    names=["image", "identity_id"],
)
print(f"Tong so anh: {len(identity_df):,}")
print(f"Tong so identity: {identity_df['identity_id'].nunique():,}")

# Number of images available for every identity
identity_counts = identity_df['identity_id'].value_counts()

# Bucket the identities by how many images they have:
#   below MIN_IMAGES            -> dropped
#   MIN_IMAGES..threshold - 1   -> kept and augmented later
#   AUGMENT_THRESHOLD and above -> kept as they are
_too_few = identity_counts < MIN_IMAGES
_enough = identity_counts >= AUGMENT_THRESHOLD
ids_to_remove = identity_counts[_too_few].index.tolist()
ids_to_augment = identity_counts[~_too_few & ~_enough].index.tolist()
ids_normal = identity_counts[_enough].index.tolist()

print("\nPhan loai identity:")
print(f"  LOAI BO (< {MIN_IMAGES} anh): {len(ids_to_remove):,}")
print(f"  CAN AUGMENT ({MIN_IMAGES}-{AUGMENT_THRESHOLD-1} anh): {len(ids_to_augment):,}")
print(f"  BINH THUONG (>= {AUGMENT_THRESHOLD} anh): {len(ids_normal):,}")

# How many images disappear together with the dropped identities
removed_images = identity_counts[_too_few].sum()
_removed_pct = 100*removed_images/len(identity_df)
print(f"\nSo anh se bi loai: {removed_images:,} ({_removed_pct:.1f}%)")

# Keep only the rows whose identity survived the filter
valid_ids = set(ids_to_augment) | set(ids_normal)
_keep_row = identity_df['identity_id'].isin(valid_ids)
filtered_df = identity_df[_keep_row].copy()

print("\nSau khi loc:")
print(f"  - So anh: {len(filtered_df):,}")
print(f"  - So identity: {filtered_df['identity_id'].nunique():,}")


## Buoc 2: Gom anh theo identity


In [None]:
# Group the filtered images into one folder per identity:
#   TEMP_BY_ID/<identity_id>/<image file>
os.makedirs(TEMP_BY_ID, exist_ok=True)

copied_count = 0
missing_count = 0
created_dirs = set()  # avoids one makedirs() system call per image

# Iterate over the two columns directly: DataFrame.iterrows() builds a Series
# object for every row, which is needlessly slow for ~200k images.
image_identity_pairs = zip(filtered_df['image'], filtered_df['identity_id'])
for img_file, identity_id in tqdm(image_identity_pairs, total=len(filtered_df), desc="Gom anh theo ID"):
    pid = str(identity_id)

    dst_dir = f"{TEMP_BY_ID}/{pid}"
    if dst_dir not in created_dirs:
        os.makedirs(dst_dir, exist_ok=True)
        created_dirs.add(dst_dir)

    src_path = f"{IMG_DIR}/{img_file}"
    dst_path = f"{dst_dir}/{img_file}"

    if os.path.exists(src_path):
        shutil.copy(src_path, dst_path)
        copied_count += 1
    else:
        # Individual missing files are tolerated, but they are counted so that
        # a wrong IMG_DIR does not go unnoticed.
        missing_count += 1

print(f"Copied {copied_count:,} images into {len(created_dirs):,} identity folders")
if missing_count:
    print(f"[WARNING] {missing_count:,} images listed in the identity file were not found in {IMG_DIR}")
if len(filtered_df) > 0 and copied_count == 0:
    # Every later step needs at least some images; stop here with a clear message
    # instead of failing later on empty identity folders.
    raise FileNotFoundError(f"No source image found in IMG_DIR={IMG_DIR!r}; check the dataset path")


## Buoc 3: Offline Augmentation cho identity it anh (5-9 anh)


In [None]:
# Augmentation pipeline: each transform is applied independently with its own probability p
augment_transform = A.Compose([
    A.HorizontalFlip(p=0.5),
    # Rotation of up to +/-15 degrees; uncovered border pixels replicate the image edge
    A.Rotate(limit=15, p=0.7, border_mode=cv2.BORDER_REPLICATE),
    A.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.15, hue=0.05, p=0.8),
    # With probability 0.3, apply exactly one of: Gaussian noise or Gaussian blur
    # NOTE(review): `var_limit` may not be accepted by newer albumentations releases -
    # confirm against the installed version
    A.OneOf([
        A.GaussNoise(var_limit=(10.0, 50.0), p=1.0),
        A.GaussianBlur(blur_limit=(3, 5), p=1.0),
    ], p=0.3),
])

def augment_identity_images(person_dir, target_count):
    """Top up one identity folder with augmented copies of its images.

    Randomly chosen source images are passed through ``augment_transform`` and
    saved next to the source as ``<source>_aug<k>.jpg`` until the folder holds
    ``target_count`` ``.jpg`` files.

    Args:
        person_dir: Folder containing the ``.jpg`` images of one identity.
        target_count: Number of ``.jpg`` files the folder should contain afterwards.

    Returns:
        The number of augmented images written. This is 0 when the folder
        already holds enough images or contains no usable source image, and it
        can be lower than requested when the readable sources run out.

    Raises:
        OSError: If an augmented image cannot be written to disk.
    """
    images = [f for f in os.listdir(person_dir) if f.endswith('.jpg')]
    current_count = len(images)

    if current_count >= target_count:
        return 0

    # Only real photos are used as sources, so that a re-run never augments an
    # already augmented file (which would produce "<x>_aug1_aug2.jpg").
    sources = [f for f in images if '_aug' not in f]
    taken_names = set(images)

    needed = target_count - current_count
    augmented = 0
    suffix = 0

    # `and sources` ends the loop when no readable source image is left; without
    # it a folder of unreadable files would loop forever and an empty folder
    # would make random.choice() raise IndexError.
    while augmented < needed and sources:
        src_img_name = random.choice(sources)
        src_path = os.path.join(person_dir, src_img_name)

        img = cv2.imread(src_path)
        if img is None:
            # Unreadable file: never pick it again
            sources.remove(src_img_name)
            continue

        augmented_img = augment_transform(image=img)['image']

        base_name = os.path.splitext(src_img_name)[0]
        # On a clean folder this yields the same names as before
        # (_aug1, _aug2, ...); names that already exist are skipped instead of
        # being overwritten.
        suffix += 1
        new_name = f"{base_name}_aug{suffix}.jpg"
        while new_name in taken_names:
            suffix += 1
            new_name = f"{base_name}_aug{suffix}.jpg"
        new_path = os.path.join(person_dir, new_name)

        # cv2.imwrite reports failure through its return value, not an exception
        if not cv2.imwrite(new_path, augmented_img):
            raise OSError(f"Could not write augmented image: {new_path}")
        taken_names.add(new_name)
        augmented += 1

    return augmented

# Run the augmentation for every identity that has too few images
_augment_dirs = [os.path.join(TEMP_BY_ID, str(pid)) for pid in ids_to_augment]

total_augmented = 0
for person_dir in tqdm(_augment_dirs, desc="Augmenting"):
    # Identities whose folder was never created are skipped
    if not os.path.exists(person_dir):
        continue
    total_augmented += augment_identity_images(person_dir, TARGET_MIN_IMAGES)

print("\n" + f"Tong so anh da augment: {total_augmented:,}")


## Buoc 4: Chia Train/Val/Test

**Chon 1 trong 2 cach:**

| Cach | Mo ta | Dung cho |
|------|-------|----------|
| `by_identity` | Moi identity CHI o 1 tap | Face Identification |
| `by_image` | Moi identity co anh o CA 3 tap | Face Verification |

**Mac dinh: `SPLIT_METHOD = 'by_image'`** - Phu hop cho du an nhan dien khuon mat nguoi noi tieng


In [None]:
# ============================================================
# CHOOSE THE SPLIT METHOD: 'by_image' or 'by_identity'
# ============================================================
SPLIT_METHOD = 'by_image'  # 'by_image': every identity has images in all 3 splits
                           # 'by_identity': every identity lives in exactly 1 split

all_ids = sorted([d for d in os.listdir(TEMP_BY_ID) if os.path.isdir(os.path.join(TEMP_BY_ID, d))])
print(f"Tong so identity: {len(all_ids)}")
print(f"Phuong phap chia: {SPLIT_METHOD}")

# Start from an empty split folder: files left over from a previous run of this
# cell would otherwise stay in their old split and leak between train/val/test.
if os.path.isdir(TEMP_SPLIT):
    shutil.rmtree(TEMP_SPLIT)
for split in ["train", "val", "test"]:
    os.makedirs(os.path.join(TEMP_SPLIT, split), exist_ok=True)

stats = {'train': 0, 'val': 0, 'test': 0}
identity_stats = {'train': set(), 'val': set(), 'test': set()}


def _source_image_name(img_name):
    """Return the file name of the original photo an (augmented) image was made from."""
    return img_name.split('_aug')[0] + '.jpg' if '_aug' in img_name else img_name


def _copy_images(src_dir, split, pid, names):
    """Copy ``names`` from ``src_dir`` to TEMP_SPLIT/<split>/<pid> and update the counters."""
    if not names:
        return
    dst_dir = os.path.join(TEMP_SPLIT, split, pid)
    os.makedirs(dst_dir, exist_ok=True)
    for name in names:
        shutil.copy(os.path.join(src_dir, name), os.path.join(dst_dir, name))
    stats[split] += len(names)
    identity_stats[split].add(pid)


if SPLIT_METHOD == 'by_image':
    # Distribute the images of every identity over all three splits.
    # An original photo and the augmented copies derived from it form one group
    # that always stays in the same split; otherwise near-duplicates of a
    # training image would end up in val/test and inflate the metrics.
    MIN_IMAGES_FOR_SPLIT = 3  # at least 3 groups are needed to fill 3 splits

    for pid in tqdm(all_ids, desc="Chia anh theo identity"):
        src_dir = os.path.join(TEMP_BY_ID, pid)
        # Sorted so that the result depends only on the random seed, not on the
        # order in which the file system lists the files
        images = sorted(f for f in os.listdir(src_dir) if f.endswith('.jpg'))

        groups = defaultdict(list)
        for img in images:
            groups[_source_image_name(img)].append(img)
        group_keys = list(groups)

        if len(group_keys) < MIN_IMAGES_FOR_SPLIT:
            # Too few independent photos: everything goes to train
            _copy_images(src_dir, "train", pid, images)
            continue

        random.shuffle(group_keys)
        # Stable sort: smallest groups first, random order among equally sized
        # groups. Val/test therefore receive photos with few or no augmented
        # copies, and overshooting their quota is kept to a minimum.
        group_keys.sort(key=lambda key: len(groups[key]))

        n = len(images)
        # Quotas follow the configured ratios, with at least 1 image per split
        n_val = max(1, int(VAL_RATIO * n))
        n_test = max(1, int(TEST_RATIO * n))

        splits_imgs = {'train': [], 'val': [], 'test': []}
        cursor = 0
        # `reserve` keeps enough groups back so that the splits filled later
        # (test, then train) always get at least one group.
        for split_name, quota, reserve in (('val', n_val, 2), ('test', n_test, 1)):
            while len(splits_imgs[split_name]) < quota and len(group_keys) - cursor > reserve:
                splits_imgs[split_name].extend(groups[group_keys[cursor]])
                cursor += 1
        for key in group_keys[cursor:]:
            splits_imgs['train'].extend(groups[key])

        for split, split_imgs in splits_imgs.items():
            _copy_images(src_dir, split, pid, split_imgs)

else:
    # Split by IDENTITY - every identity lives in exactly one split
    random.shuffle(all_ids)
    n_total = len(all_ids)
    n_val = int(VAL_RATIO * n_total)
    n_test = int(TEST_RATIO * n_total)
    n_train = n_total - n_val - n_test

    split_ids = {
        'train': all_ids[:n_train],
        'val': all_ids[n_train:n_train + n_val],
        'test': all_ids[n_train + n_val:]
    }

    for split, ids_list in split_ids.items():
        for pid in tqdm(ids_list, desc=f"Copying {split}"):
            src_dir = os.path.join(TEMP_BY_ID, pid)
            if os.path.exists(src_dir):
                names = [f for f in os.listdir(src_dir) if f.endswith('.jpg')]
                _copy_images(src_dir, split, pid, names)

# Print the result
print(f"\n{'='*50}")
print(f"KET QUA CHIA DATASET ({SPLIT_METHOD})")
print(f"{'='*50}")
total = sum(stats.values())
for split in ['train', 'val', 'test']:
    n_imgs = stats[split]
    n_ids = len(identity_stats[split])
    pct = 100*n_imgs/total if total else 0.0  # nothing copied -> avoid ZeroDivisionError
    print(f"  {split:5}: {n_imgs:,} anh ({pct:.1f}%), {n_ids} identities")

# Check the identity overlap between the splits
if SPLIT_METHOD == 'by_image':
    all_have_3_splits = identity_stats['train'] & identity_stats['val'] & identity_stats['test']
    print(f"\nIdentity co anh trong CA 3 tap: {len(all_have_3_splits)}")
else:
    overlap_tv = identity_stats['train'] & identity_stats['val']
    overlap_tt = identity_stats['train'] & identity_stats['test']
    overlap_vt = identity_stats['val'] & identity_stats['test']
    print(f"\nTrain & Val overlap: {len(overlap_tv)}")
    print(f"Train & Test overlap: {len(overlap_tt)}")
    print(f"Val & Test overlap: {len(overlap_vt)}")


## Buoc 5: Alignment theo chuan ArcFace (112x112)


In [None]:
# Read the landmark table and index it by image file name
df_landmark = pd.read_csv(LANDMARK_FILE)

# Landmark name -> (x column, y column) in the CSV
_LANDMARK_COLUMNS = {
    "left_eye": ("lefteye_x", "lefteye_y"),
    "right_eye": ("righteye_x", "righteye_y"),
    "nose": ("nose_x", "nose_y"),
    "left_mouth": ("leftmouth_x", "leftmouth_y"),
    "right_mouth": ("rightmouth_x", "rightmouth_y"),
}

landmarks = {}
for _, row in df_landmark.iterrows():
    landmarks[row['image_id']] = {
        point: (row[x_col], row[y_col])
        for point, (x_col, y_col) in _LANDMARK_COLUMNS.items()
    }
print(f"Loaded {len(landmarks)} landmarks")

def align_face(img, landmark):
    """Warp ``img`` so that its five landmarks land on ``ARCFACE_TEMPLATE``.

    Args:
        img: Image array as accepted by ``cv2.warpAffine``.
        landmark: Dict with the keys ``left_eye``, ``right_eye``, ``nose``,
            ``left_mouth`` and ``right_mouth``, each holding an ``(x, y)`` pair.

    Returns:
        The 112x112 warped image; pixels outside the source are filled with 0.
    """
    point_order = ("left_eye", "right_eye", "nose", "left_mouth", "right_mouth")
    src_points = np.array([landmark[name] for name in point_order], dtype=np.float32)

    # Similarity transform estimated from the landmarks to the template
    similarity = SimilarityTransform()
    similarity.estimate(src_points, ARCFACE_TEMPLATE)

    # cv2.warpAffine takes a 2x3 matrix: the first two rows of the 3x3 parameters
    affine_2x3 = similarity.params[:2, :]
    return cv2.warpAffine(img, affine_2x3, (112, 112), borderValue=0)

def align_face_center_crop(img):
    """Crop the largest centered square out of ``img`` and resize it to 112x112.

    Args:
        img: Image array whose first two dimensions are height and width.

    Returns:
        The 112x112 image produced by ``cv2.resize`` with linear interpolation.
    """
    height, width = img.shape[:2]
    side = min(height, width)

    # Offsets that center the square; the offset along the shorter axis is 0
    top = (height - side) // 2
    left = (width - side) // 2

    square = img[top:top + side, left:left + side]
    return cv2.resize(square, (112, 112), interpolation=cv2.INTER_LINEAR)


In [None]:
# Run the alignment for every split.
#
# Original photos are aligned with their own landmarks. Augmented files
# ("<source>_aug<k>.jpg") were flipped/rotated, so the landmarks of their source
# photo no longer describe their pixels. They are therefore rebuilt in the
# correct order: align the untouched source photo first, then apply the
# augmentation to the aligned 112x112 crop.
os.makedirs(FINAL_OUTPUT, exist_ok=True)
align_stats = {'aligned': 0, 'augmented_after_align': 0, 'center_crop': 0, 'failed': 0}

for split in ["train", "val", "test"]:
    split_dir = os.path.join(TEMP_SPLIT, split)
    out_split_dir = os.path.join(FINAL_OUTPUT, split)
    os.makedirs(out_split_dir, exist_ok=True)

    # Sorted so that the random augmentations depend only on the seed; stray
    # files inside the split folder are ignored.
    persons = sorted(p for p in os.listdir(split_dir) if os.path.isdir(os.path.join(split_dir, p)))

    for person in tqdm(persons, desc=f"Aligning {split}"):
        src_person_dir = os.path.join(split_dir, person)
        dst_person_dir = os.path.join(out_split_dir, person)
        os.makedirs(dst_person_dir, exist_ok=True)

        for img_name in sorted(os.listdir(src_person_dir)):
            if not img_name.endswith('.jpg'):
                continue

            is_augmented = '_aug' in img_name
            original_name = img_name.split('_aug')[0] + '.jpg' if is_augmented else img_name

            aligned = None
            outcome = None

            if is_augmented and original_name in landmarks:
                # The untouched source photo is stored in the per-identity folder
                source_img = cv2.imread(os.path.join(TEMP_BY_ID, person, original_name))
                if source_img is not None:
                    aligned_source = align_face(source_img, landmarks[original_name])
                    aligned = augment_transform(image=aligned_source)['image']
                    outcome = 'augmented_after_align'

            if aligned is None:
                img = cv2.imread(os.path.join(src_person_dir, img_name))
                if img is None:
                    align_stats['failed'] += 1
                    continue

                if not is_augmented and original_name in landmarks:
                    aligned = align_face(img, landmarks[original_name])
                    outcome = 'aligned'
                else:
                    # No usable landmarks: fall back to a plain center crop
                    aligned = align_face_center_crop(img)
                    outcome = 'center_crop'

            # cv2.imwrite reports failure through its return value
            if cv2.imwrite(os.path.join(dst_person_dir, img_name), aligned):
                align_stats[outcome] += 1
            else:
                align_stats['failed'] += 1

print(f"\nAlignment stats:")
print(f"  Aligned with landmarks: {align_stats['aligned']:,}")
print(f"  Augmented after alignment: {align_stats['augmented_after_align']:,}")
print(f"  Center crop (no landmarks): {align_stats['center_crop']:,}")
print(f"  Failed: {align_stats['failed']}")


## Buoc 6: Tao Metadata cho training


In [None]:
# Create the metadata files used for training
META_OUTPUT = os.path.join(FINAL_OUTPUT, "metadata")
os.makedirs(META_OUTPUT, exist_ok=True)

# Build the GLOBAL label mapping from the train set: identities are numbered in
# sorted (string) order of their folder names. Only folders count as identities.
train_dir = os.path.join(FINAL_OUTPUT, "train")
all_train_ids = sorted(d for d in os.listdir(train_dir) if os.path.isdir(os.path.join(train_dir, d)))
global_id_to_label = {pid: idx for idx, pid in enumerate(all_train_ids)}
print(f"Total training identities: {len(global_id_to_label)}")

# Save the global mapping
global_mapping_df = pd.DataFrame(
    [{"identity_id": pid, "label": label} for pid, label in global_id_to_label.items()],
    columns=["identity_id", "label"],
)
global_mapping_df.to_csv(os.path.join(META_OUTPUT, "global_id_mapping.csv"), index=False)

# Explicit column list: an empty split still produces a CSV with a header
# instead of raising KeyError in the summary below.
LABEL_COLUMNS = ["image", "identity_id", "label", "is_augmented"]

# Create one labels file per split
for split in ["train", "val", "test"]:
    split_dir = os.path.join(FINAL_OUTPUT, split)
    records = []

    # Sorted listings make the CSV row order reproducible across file systems
    for pid in sorted(os.listdir(split_dir)):
        person_dir = os.path.join(split_dir, pid)
        if not os.path.isdir(person_dir):
            continue

        # Identities that do not occur in the train set get the label -1
        label = global_id_to_label.get(pid, -1)

        for img_name in sorted(os.listdir(person_dir)):
            if img_name.endswith('.jpg'):
                records.append({
                    "image": f"{pid}/{img_name}",
                    "identity_id": pid,
                    "label": label,
                    "is_augmented": 1 if '_aug' in img_name else 0
                })

    df = pd.DataFrame(records, columns=LABEL_COLUMNS)
    df.to_csv(os.path.join(META_OUTPUT, f"{split}_labels.csv"), index=False)

    n_ids = df['identity_id'].nunique()
    n_imgs = len(df)
    n_aug = int(df['is_augmented'].sum())
    print(f"{split}: {n_imgs:,} images, {n_ids} identities, {n_aug:,} augmented")


In [None]:
# Create the dataset config describing how this dataset was produced
dataset_config = {
    "dataset_name": "CelebA_Aligned_Balanced",
    "preprocessing": {
        "min_images_per_identity": MIN_IMAGES,
        "augment_threshold": AUGMENT_THRESHOLD,
        "target_min_images": TARGET_MIN_IMAGES
    },
    "image_size": [112, 112],
    "arcface_landmarks": {
        "left_eye": [38.2946, 51.6963],
        "right_eye": [73.5318, 51.5014],
        "nose": [56.0252, 71.7366],
        "left_mouth": [41.5493, 92.3655],
        "right_mouth": [70.7299, 92.2041]
    },
    "split_method": SPLIT_METHOD,  # 'by_image' or 'by_identity'
    # Taken from the configuration constants so the recorded ratios cannot
    # drift away from the values the notebook was actually configured with
    "split_ratio": {"train": TRAIN_RATIO, "val": VAL_RATIO, "test": TEST_RATIO},
    "splits": {}
}

# Per-split statistics, read back from the label files written by the metadata step
for split in ["train", "val", "test"]:
    labels_df = pd.read_csv(os.path.join(META_OUTPUT, f"{split}_labels.csv"))
    dataset_config["splits"][split] = {
        "num_identities": int(labels_df['identity_id'].nunique()),
        "num_images": int(len(labels_df)),
        "num_augmented": int(labels_df['is_augmented'].sum())
    }

with open(os.path.join(META_OUTPUT, "dataset_config.json"), "w", encoding="utf-8") as f:
    json.dump(dataset_config, f, indent=2)

print(json.dumps(dataset_config, indent=2))


## Kiem tra va hoan thanh


In [None]:
# Check how the identities are distributed over the final splits
def _identity_dirs(split_name):
    """Return the set of identity folder names inside FINAL_OUTPUT/<split_name>."""
    split_dir = os.path.join(FINAL_OUTPUT, split_name)
    return {d for d in os.listdir(split_dir) if os.path.isdir(os.path.join(split_dir, d))}


train_ids_final = _identity_dirs("train")
val_ids_final = _identity_dirs("val")
test_ids_final = _identity_dirs("test")

print(f"Phuong phap chia: {SPLIT_METHOD}")
print(f"\nSo identity moi split:")
print(f"  Train: {len(train_ids_final)}")
print(f"  Val: {len(val_ids_final)}")
print(f"  Test: {len(test_ids_final)}")

# Identity overlap between every pair of splits and between all three
overlap_tv = train_ids_final & val_ids_final
overlap_tt = train_ids_final & test_ids_final
overlap_vt = val_ids_final & test_ids_final
overlap_all = train_ids_final & val_ids_final & test_ids_final

print(f"\nIdentity overlap giua cac split:")
print(f"  Train & Val: {len(overlap_tv)}")
print(f"  Train & Test: {len(overlap_tt)}")
print(f"  Val & Test: {len(overlap_vt)}")
print(f"  CA 3 tap: {len(overlap_all)}")

if SPLIT_METHOD == 'by_image':
    # by_image: identities are expected to appear in all three splits
    if len(overlap_all) > 0:
        print(f"\n[OK] Co {len(overlap_all)} identity xuat hien trong CA 3 tap - Dung cho Face Verification!")
    else:
        print("\n[WARNING] Khong co identity nao xuat hien trong ca 3 tap!")
else:
    # by_identity: NO pair of splits may share an identity, val/test included
    if not (overlap_tv or overlap_tt or overlap_vt):
        print("\n[OK] KHONG CO CHONG CHEO - Dung cho Face Identification!")
    else:
        print("\n[ERROR] CO CHONG CHEO - Can kiem tra lai!")


In [None]:
# Tao file ZIP va luu vao Drive
print("Tao file ZIP tu thu muc local...")
zip_filename = "CelebA_Aligned_Balanced.zip"

if MODE == 'colab':
    # Tren Colab: Tao ZIP tren local roi copy len Drive
    local_zip = f"/content/{zip_filename}"
    drive_zip = f"{DRIVE_OUTPUT}/{zip_filename}"
    
    # Tao ZIP tu FINAL_OUTPUT (local Colab)
    !cd /content && zip -r "{local_zip}" "CelebA_Aligned_Balanced/" -x "*.DS_Store"
    
    # Copy ZIP len Drive
    print(f"Upload ZIP len Drive: {drive_zip}")
    shutil.copy(local_zip, drive_zip)
    
    print(f"\n[OK] File ZIP da luu: {drive_zip}")
    !ls -lh "{drive_zip}"
    
    # Xoa cac thu muc tam va ZIP local
    print("\nXoa cac thu muc tam va file ZIP local...")
    !rm -rf "{TEMP_BY_ID}"
    !rm -rf "{TEMP_SPLIT}"
    !rm -rf "{FINAL_OUTPUT}"
    !rm -f "{local_zip}"
    
    final_location = drive_zip
else:
    # Tren Local: Chi tao ZIP
    local_zip = f"{os.path.dirname(FINAL_OUTPUT)}/{zip_filename}"
    !cd "{os.path.dirname(FINAL_OUTPUT)}" && zip -r "{zip_filename}" "{os.path.basename(FINAL_OUTPUT)}/" -x "*.DS_Store"
    
    print(f"\n[OK] File ZIP da tao: {local_zip}")
    final_location = local_zip

print("\n" + "="*60)
print("HOAN THANH XU LY DATASET")
print("="*60)
print(f"\nFile ZIP: {final_location}")
print(f"Kich thuoc: ", end="")
!du -h "{final_location}"
print("\nChi co file ZIP duoc luu, cac thu muc tam da bi xoa.")
print("\nCac buoc tiep theo:")
print("1. Download file ZIP ve local (neu dang tren Colab)")
print("2. Giai nen va su dung FolderBasedDataset/ArcFaceDataset de load du lieu")
print("3. Training voi class_balanced_sampling=True")
