# LBPH Training - Kaggle

Notebook huấn luyện LBPH (Local Binary Patterns Histograms) trên Kaggle.

## Chuẩn bị:

1. Upload dataset `celeba-aligned-balanced` lên Kaggle Datasets
2. Add dataset vào notebook này
3. Accelerator: CPU (LBPH không cần GPU)

In [None]:
# Detect the runtime environment.
# The notebook counts as running on Kaggle when the KAGGLE_KERNEL_RUN_TYPE
# environment variable is present.
import os
import sys

IS_KAGGLE = os.environ.get('KAGGLE_KERNEL_RUN_TYPE') is not None
print(f"Kaggle environment: {IS_KAGGLE}")

# Only warn; the notebook keeps running outside Kaggle.
if IS_KAGGLE is False:
    print("WARNING: Notebook này được thiết kế cho Kaggle!")

In [None]:
# Kaggle path configuration.
# ROOT is the folder the project repository is cloned into; CHECKPOINT_DIR
# receives the trained model and its metadata.
ROOT = "/kaggle/working/FaceRecognition"
CHECKPOINT_DIR = "/kaggle/working/checkpoints/lbph"

# Input dataset: DATA_DIR is derived from the dataset name.
KAGGLE_DATASET_NAME = "celeba-aligned-balanced"
DATA_DIR = "/kaggle/input/" + KAGGLE_DATASET_NAME

# Create the checkpoint folder up front (no error if it already exists).
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Echo the resolved configuration.
for _label, _path in (
    ("ROOT", ROOT),
    ("DATA_DIR", DATA_DIR),
    ("CHECKPOINT_DIR", CHECKPOINT_DIR),
):
    print(f"{_label}: {_path}")

In [None]:
# === CHECKPOINT DATASET CONFIGURATION (optional) ===
# Name of a Kaggle dataset that holds previously saved model files (*.xml).
# Leave it empty to skip restoring anything.
CHECKPOINT_DATASET_NAME = ""

import glob
import os
import shutil


def restore_checkpoint_files(input_dir, dest_dir):
    """Copy every ``*.xml`` file found under *input_dir* into *dest_dir*.

    The search is recursive and the copies are flattened into *dest_dir*
    (only the base name is kept). A file whose name already exists in
    *dest_dir* is left untouched, so an existing checkpoint is never
    overwritten.

    Args:
        input_dir: Directory searched for ``*.xml`` files.
        dest_dir: Directory receiving the copies; created only when there is
            at least one ``*.xml`` file to consider.

    Returns:
        List of the base names that were actually copied (possibly empty).
    """
    # Sorted so that, when two sources share a base name, the one that wins
    # is the same on every run (glob order is otherwise arbitrary).
    xml_files = sorted(
        glob.glob(os.path.join(input_dir, "**/*.xml"), recursive=True)
    )
    if not xml_files:
        return []

    os.makedirs(dest_dir, exist_ok=True)
    copied = []
    for xml_file in xml_files:
        file_name = os.path.basename(xml_file)
        dest_path = os.path.join(dest_dir, file_name)
        if os.path.exists(dest_path):
            continue
        shutil.copy(xml_file, dest_path)
        print(f"[COPY] {file_name}")
        copied.append(file_name)
    return copied


if CHECKPOINT_DATASET_NAME:
    checkpoint_input_dir = f"/kaggle/input/{CHECKPOINT_DATASET_NAME}"
    if os.path.exists(checkpoint_input_dir):
        print("[OK] Tim thay checkpoint dataset")
        if not restore_checkpoint_files(checkpoint_input_dir, CHECKPOINT_DIR):
            # Either the dataset has no *.xml files or all of them were
            # already present in CHECKPOINT_DIR.
            print("[INFO] No new *.xml checkpoint files were copied")
    else:
        # A configured but missing dataset used to pass silently.
        print(f"[WARN] Checkpoint dataset not found at: {checkpoint_input_dir}")
else:
    print("[INFO] Training tu dau (khong co checkpoint)")

In [None]:
# Kiểm tra Kaggle datasetprint("=== KAGGLE INPUT DATASETS ===")!ls -la /kaggle/input/if os.path.exists(DATA_DIR):    print(f"\n[OK] Dataset found at: {DATA_DIR}")    !ls -la {DATA_DIR}else:    print(f"\n[ERROR] Dataset not found at: {DATA_DIR}")

In [None]:
# GitHub access configuration.
# Look up a GITHUB_TOKEN secret through Kaggle's secrets client. Any failure
# (the module cannot be imported, the lookup raises, ...) is treated as
# "no token" and the public repository URL is used instead.
try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    GITHUB_TOKEN = user_secrets.get_secret("GITHUB_TOKEN")
    print("[OK] Da lay GITHUB_TOKEN")
except Exception as e:
    # Deliberate best-effort fallback, but report why it happened so that a
    # misconfigured secret does not go unnoticed. Only the exception type is
    # printed, to keep any sensitive detail out of the notebook output.
    GITHUB_TOKEN = None
    print(f"[INFO] Khong lay duoc GITHUB_TOKEN ({type(e).__name__})")
    print("[INFO] Su dung public URL")

if GITHUB_TOKEN:
    # The token is embedded in the URL so git can authenticate with it.
    REPO_URL = f"https://{GITHUB_TOKEN}@github.com/sin0235/FaceRecognition.git"
else:
    REPO_URL = "https://github.com/sin0235/FaceRecognition.git"

In [None]:
# Clone repositoryif os.path.exists(ROOT):    print("Repository da ton tai, dang pull updates...")    %cd {ROOT}    if GITHUB_TOKEN:        !git remote set-url origin {REPO_URL}    !git pullelse:    print(f"Dang clone repository...")    !git clone {REPO_URL} {ROOT}    %cd {ROOT}print(f"\nWorking directory: {os.getcwd()}")!ls -la

In [None]:
# Put the repository root at the front of sys.path so the project's packages
# can be imported; nothing happens if it is already listed.
root_already_listed = ROOT in sys.path
if not root_already_listed:
    sys.path[:0] = [ROOT]
    print(f"Da them {ROOT} vao Python path")

In [None]:
# Cài đặt dependenciesprint("Cai dat dependencies...")!pip install -q opencv-python-headless Pillow scikit-learn tqdm pyyaml matplotlib seabornprint("\nHoan tat cai dat!")

In [None]:
# Resolve the train/val/test folders and report how many identity
# sub-folders each of them contains.
dataset_root = DATA_DIR
if not os.path.exists(os.path.join(dataset_root, "train")):
    # Fallback layout: the splits sit one level down, inside a
    # "CelebA_Aligned_Balanced" folder.
    dataset_root = os.path.join(DATA_DIR, "CelebA_Aligned_Balanced")

train_img_dir = os.path.join(dataset_root, "train")
val_img_dir = os.path.join(dataset_root, "val")
test_img_dir = os.path.join(dataset_root, "test")


def _identity_folders(split_dir):
    """Return the names of the sub-directories directly inside *split_dir*."""
    return [
        entry for entry in os.listdir(split_dir)
        if os.path.isdir(os.path.join(split_dir, entry))
    ]


print("=== KIEM TRA DU LIEU ===")

# Each *_identities list is only defined when its folder exists.
if not os.path.exists(train_img_dir):
    print("[ERROR] Train folder not found")
else:
    train_identities = _identity_folders(train_img_dir)
    print(f"[OK] Train: {len(train_identities)} identities")

if not os.path.exists(val_img_dir):
    print("[ERROR] Val folder not found")
else:
    val_identities = _identity_folders(val_img_dir)
    print(f"[OK] Val: {len(val_identities)} identities")

if not os.path.exists(test_img_dir):
    print("[ERROR] Test folder not found")
else:
    test_identities = _identity_folders(test_img_dir)
    print(f"[OK] Test: {len(test_identities)} identities")

## Training LBPH

LBPH (Local Binary Patterns Histograms) là traditional computer vision method:

- Không cần GPU (CPU-only)
- Fast training
- Interpretable features
- Good baseline cho face recognition

In [None]:
# Data loading helper for LBPH.
import os

import cv2
import numpy as np
from tqdm import tqdm


def load_lbph_data(base_dir, max_per_identity=None):
    """Load one dataset split as grayscale images with integer labels.

    *base_dir* is expected to contain one sub-folder per identity, each named
    with the identity's integer id. Entries that are not directories, or
    whose name is not an integer, are skipped and reported instead of
    aborting the whole load. Files that OpenCV cannot decode are skipped.

    Args:
        base_dir: Folder of one split (train/val/test).
        max_per_identity: Maximum number of files considered per identity;
            ``None`` (or 0) means no limit. The limit is applied to the
            name-sorted file list, before decoding, so the selection is the
            same on every run.

    Returns:
        A ``(faces, labels)`` tuple: ``faces`` is a list of grayscale images
        and ``labels`` is a ``numpy.int32`` array holding one identity id
        per face, in the same order.
    """
    # Keep only directories whose name parses as an integer. Sorting the raw
    # listing with key=int would raise ValueError on any stray entry.
    identities = []
    ignored = []
    for name in os.listdir(base_dir):
        if not os.path.isdir(os.path.join(base_dir, name)):
            ignored.append(name)
            continue
        try:
            identities.append((int(name), name))
        except ValueError:
            ignored.append(name)
    identities.sort()

    if ignored:
        print(f"[WARN] Ignoring {len(ignored)} non-identity entries in {base_dir}")

    faces = []
    labels = []
    split_name = os.path.basename(base_dir)
    for identity, folder in tqdm(identities, desc=f"Loading from {split_name}"):
        label_dir = os.path.join(base_dir, folder)

        # os.listdir order is arbitrary; sort so that truncating to
        # max_per_identity picks the same files every time.
        img_files = sorted(os.listdir(label_dir))
        if max_per_identity:
            img_files = img_files[:max_per_identity]

        for img_name in img_files:
            # cv2.imread returns a BGR image, or None if the file is unreadable.
            img = cv2.imread(os.path.join(label_dir, img_name))
            if img is None:
                continue

            # LBPH works on single-channel images.
            faces.append(cv2.cvtColor(img, cv2.COLOR_BGR2GRAY))
            labels.append(identity)

    return faces, np.array(labels, dtype=np.int32)


print("Data loading function ready!")

In [None]:
# Load the training and validation splits.
separator = "=" * 60

print(separator, "Loading training data...", separator, sep="\n")
train_faces, train_labels = load_lbph_data(train_img_dir)

train_identity_count = np.unique(train_labels).size
print(f"\nTrain: {len(train_faces)} images")
print(f"Unique train identities: {train_identity_count}")

print("\n" + separator, "Loading validation data...", separator, sep="\n")
val_faces, val_labels = load_lbph_data(val_img_dir)

val_identity_count = np.unique(val_labels).size
print(f"\nVal: {len(val_faces)} images")
print(f"Unique val identities: {val_identity_count}")

In [None]:
# Train the LBPH model on the training split.
from models.lbphmodel.train_lbph import train_lbph_model

separator = "=" * 60
print(separator, "BAT DAU TRAINING LBPH", separator, sep="\n")

# LBPH hyper-parameters, forwarded to the trainer as keyword arguments.
lbph_params = {"radius": 1, "neighbors": 8, "grid_x": 8, "grid_y": 8}
model = train_lbph_model(faces=train_faces, labels=train_labels, **lbph_params)

print("\n[OK] LBPH trained successfully!")

## Threshold Tuning

LBPH sử dụng confidence score (càng **THẤP** càng tốt).
Cần tìm threshold tối ưu trên validation set.

In [None]:
# Search the validation set for the optimal threshold.
from models.lbphmodel.threshold_lbph import find_optimal_threshold

separator = "=" * 60
print(separator, "FINDING OPTIMAL THRESHOLD ON VALIDATION SET", separator, sep="\n")

candidate_thresholds = range(40, 121, 5)  # 40, 45, ..., 120
best_threshold, best_score, threshold_results = find_optimal_threshold(
    model=model,
    faces=val_faces,
    labels=val_labels,
    threshold_range=candidate_thresholds,
    min_coverage=0.3,  # at least 30% of the samples must be accepted
)

print(f"\n[RESULT] Best threshold: {best_threshold}")
print(f"[RESULT] Best score: {best_score:.4f}")

# Show a few of the results.
# NOTE(review): this prints the first five entries exactly as returned; the
# "Top 5" label presumably relies on find_optimal_threshold ordering them
# best-first - confirm.
print("\nTop 5 thresholds:")
for rank, (thr, acc, cov, score) in zip(range(1, 6), threshold_results):
    print(f"{rank}. Threshold={thr:3d}: acc={acc:.3f}, coverage={cov:.3f}, score={score:.4f}")

## Evaluation on Test Set

Đánh giá model với threshold tìm được trên test set.

In [None]:
# Load the test split.
separator = "=" * 60
print(separator, "Loading test data...", separator, sep="\n")

test_faces, test_labels = load_lbph_data(test_img_dir)

test_identity_count = np.unique(test_labels).size
print(f"\nTest: {len(test_faces)} images")
print(f"Unique test identities: {test_identity_count}")

In [None]:
# Evaluate the trained model on the test split with the selected threshold.
from models.lbphmodel.evaluate_lbph import evaluate_lbph

separator = "=" * 60
print(separator, "EVALUATING ON TEST SET", separator, sep="\n")

test_acc, test_cov, test_used, test_confidences = evaluate_lbph(
    model=model,
    faces=test_faces,
    labels=test_labels,
    threshold=best_threshold,
)

# Final report, printed as one block.
summary_lines = [
    "\n" + separator,
    "FINAL TEST RESULTS",
    separator,
    f"Threshold: {best_threshold}",
    f"Accuracy: {test_acc:.2%}",
    f"Coverage: {test_cov:.2%}",
    f"Used: {test_used} / {len(test_faces)}",
    separator,
]
print("\n".join(summary_lines))

## Save Model & Metadata

In [None]:
# Persist the trained model and a JSON summary of this run.
import json

MODEL_PATH = os.path.join(CHECKPOINT_DIR, "lbph_model.xml")
METADATA_PATH = os.path.join(CHECKPOINT_DIR, "metadata.json")

# Model file.
model.save(MODEL_PATH)
print(f"[OK] Model saved: {MODEL_PATH}")

# Run metadata. Values are cast to built-in int/float so they serialize as
# plain JSON numbers.
# NOTE(review): model_params repeats the values passed at training time;
# keep the two in sync by hand.
metadata = dict(
    threshold=int(best_threshold),
    val_score=float(best_score),
    test_accuracy=float(test_acc),
    test_coverage=float(test_cov),
    num_classes=len(np.unique(train_labels)),
    train_images=len(train_faces),
    val_images=len(val_faces),
    test_images=len(test_faces),
    model_params=dict(radius=1, neighbors=8, grid_x=8, grid_y=8),
)

# Serialize once; the same text is written to disk and echoed below.
metadata_text = json.dumps(metadata, indent=2)
with open(METADATA_PATH, "w") as metadata_file:
    metadata_file.write(metadata_text)

print(f"[OK] Metadata saved: {METADATA_PATH}")
print("\nMetadata:")
print(metadata_text)

## Download Checkpoint

Zip checkpoint folder để download.

In [None]:
# Zip checkpoint folder để downloadimport shutilzip_name = "lbph_checkpoints"zip_path = f"/kaggle/working/{zip_name}"shutil.make_archive(zip_path, "zip", CHECKPOINT_DIR)print(f"[OK] Da tao file zip: {zip_path}.zip")print(f"\nDownload file nay tu panel Output ben phai.")!ls -lh /kaggle/working/*.zip