In [None]:
# ---
# LBPH Evaluation - Kaggle (Optimized)
# 
# Phiên bản tối ưu với:
# - Giới hạn max 3 ảnh/identity để giảm thời gian từ ~10h xuống vài phút
# - Lưu intermediate results để phục hồi khi kernel restart
# - File zip đầy đủ dữ liệu để trực quan hóa
# ---


# LBPH Evaluation - Kaggle (Optimized)

Evaluation notebook cho LBPH model sử dụng threshold-based classification.

## Optimization:
- **MAX_IDENTITIES = 200, MAX_IMAGES_PER_IDENTITY = 3**: Giới hạn khoảng 600 ảnh cho mỗi tập (val/test), giảm thời gian đánh giá từ ~10 tiếng xuống vài phút
- **Intermediate logging**: Lưu kết quả sau mỗi bước để phục hồi khi session crash
- **Extended zip file**: Bao gồm logs chi tiết để visualize offline

## Approach:
- Load LBPH model từ checkpoint (XML file)
- Load validation và test datasets (giới hạn số identity và số ảnh mỗi identity)
- Tìm threshold tối ưu trên validation set
- Evaluate trên test set với threshold đã chọn
- Metrics: Accuracy, Coverage, Threshold analysis


In [None]:
  # Set the global git author identity (user.email / user.name) for this session.
  !git config --global user.email "phuctoan235@gmail.com"
  !git config --global user.name "sin0235"

In [None]:
import os, sys, time, json
import shutil, glob
from datetime import datetime
from tqdm import tqdm
from collections import defaultdict

# Notebook-wide paths and dataset names (Kaggle filesystem layout).
ROOT = "/kaggle/working/FaceRecognition"
CHECKPOINT_DIR = "/kaggle/working/checkpoints/lbph"
KAGGLE_DATASET_NAME = "celeba-aligned-balanced"
DATA_DIR = f"/kaggle/input/{KAGGLE_DATASET_NAME}"
CHECKPOINT_DATASET_NAME = "lbph-checkpoints"
OUTPUT_DIR = "/kaggle/working"
LOG_DIR = f"{OUTPUT_DIR}/logs/lbph"

# Cap the number of identities and images per identity so the evaluation runs quickly.
# 200 identities * 3 images = 600 samples -> ~2-3 minutes instead of 1+ hour
MAX_IDENTITIES = 200  # Maximum number of identities
MAX_IMAGES_PER_IDENTITY = 3  # Maximum number of images per identity

# Create the checkpoint and log directories up front (no-op when they already exist).
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
os.makedirs(LOG_DIR, exist_ok=True)

# Echo the effective configuration.
print(f"[CONFIG] MAX_IDENTITIES = {MAX_IDENTITIES}")
print(f"[CONFIG] MAX_IMAGES_PER_IDENTITY = {MAX_IMAGES_PER_IDENTITY}")
print(f"[CONFIG] Expected samples: ~{MAX_IDENTITIES * MAX_IMAGES_PER_IDENTITY}")
print(f"[CONFIG] LOG_DIR = {LOG_DIR}")


In [None]:
# Copy LBPH checkpoint XML files from the attached Kaggle input dataset (if any)
# into CHECKPOINT_DIR.
#
# `checkpoint_found` ends up True when at least one XML file is present in
# CHECKPOINT_DIR with the same size as its source, either freshly copied or
# left over from an earlier run.
checkpoint_input = f"/kaggle/input/{CHECKPOINT_DATASET_NAME}"
checkpoint_found = False

if os.path.exists(checkpoint_input):
    # sorted() makes the copy order deterministic across runs.
    xml_files = sorted(glob.glob(os.path.join(checkpoint_input, "**/*.xml"), recursive=True))
    for xml_file in xml_files:
        dest = os.path.join(CHECKPOINT_DIR, os.path.basename(xml_file))
        src_size = os.path.getsize(xml_file)

        # An identically sized file is already in place: nothing to copy.
        if os.path.exists(dest) and os.path.getsize(dest) == src_size:
            checkpoint_found = True
            continue

        print(f"Copying {os.path.basename(xml_file)} from Kaggle input...")
        shutil.copy(xml_file, dest)
        # Size comparison is a cheap sanity check against a truncated copy.
        if os.path.getsize(dest) == src_size:
            print(f"  [OK] Copied successfully")
            checkpoint_found = True
        else:
            print(f"  [WARNING] Size mismatch after copy!")

    if checkpoint_found:
        print(f"Checkpoints from Kaggle input: {os.listdir(CHECKPOINT_DIR)}")
    else:
        # Previously this case was silent: the dataset is attached but contains
        # no usable .xml file (none found, or every copy failed the size check).
        print(f"[WARNING] No usable .xml checkpoint found in {checkpoint_input}")
else:
    print(f"[INFO] Không tìm thấy checkpoint trong Kaggle input: {checkpoint_input}")
    print(f"      Sẽ thử copy từ repo sau khi clone...")


In [None]:
import subprocess

# Configure the GitHub token.
# Best effort: when the kaggle_secrets module or the secret itself is not
# available, fall back to the public (unauthenticated) repository URL.
try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    GITHUB_TOKEN = user_secrets.get_secret("GITHUB_TOKEN")
    print("[OK] Da lay GITHUB_TOKEN")
except Exception:
    GITHUB_TOKEN = None
    print("[INFO] Su dung public URL")

if GITHUB_TOKEN:
    REPO_URL = f"https://{GITHUB_TOKEN}@github.com/sin0235/FaceRecognition.git"
else:
    REPO_URL = "https://github.com/sin0235/FaceRecognition.git"


def _run_git(args, cwd=None):
    """Run `git <args>` without a shell and return its exit code.

    The command is passed as an argument list, so the token-bearing URL is
    never interpolated into a shell string. Captured output is printed with
    the token masked, because git error messages can echo the remote URL.
    """
    proc = subprocess.run(["git", *args], cwd=cwd, capture_output=True, text=True)
    output = ((proc.stdout or "") + (proc.stderr or "")).strip()
    if GITHUB_TOKEN:
        output = output.replace(GITHUB_TOKEN, "***")
    if output:
        print(output)
    return proc.returncode


# Clone the repository, or update it when a checkout already exists.
if os.path.exists(ROOT):
    print("Repository da ton tai, dang pull updates...")
    os.chdir(ROOT)
    if GITHUB_TOKEN:
        # NOTE(review): this stores the token in the checkout's .git/config.
        _run_git(["remote", "set-url", "origin", REPO_URL])
    if _run_git(["pull", "--no-rebase", "origin", "fix/lbph-module"]) != 0:
        # Best effort, as before: keep going with whatever is checked out.
        print("[WARNING] git pull failed; continuing with the existing checkout")
else:
    print(f"Dang clone repository...")
    clone_status = _run_git(["clone", REPO_URL, ROOT])
    if clone_status != 0 or not os.path.isdir(ROOT):
        # Fail here with a clear message instead of a confusing
        # FileNotFoundError from os.chdir(ROOT) below.
        raise RuntimeError(
            f"git clone failed (exit code {clone_status}); repository not available at {ROOT}"
        )
    os.chdir(ROOT)

print(f"\nWorking directory: {os.getcwd()}")

# Add ROOT to sys.path so the project's modules can be imported.
if ROOT not in sys.path:
    sys.path.insert(0, ROOT)
    print(f"\n[OK] Added {ROOT} to sys.path")

# Copy the checkpoint from the repository when the Kaggle input did not provide one.
model_path_check = os.path.join(CHECKPOINT_DIR, "lbph_model.xml")
if not os.path.exists(model_path_check):
    repo_checkpoint_path = os.path.join(ROOT, "models", "checkpoints", "LBHP", "lbph_model.xml")
    if os.path.exists(repo_checkpoint_path):
        print(f"\nCopying checkpoint from repo: {repo_checkpoint_path}")
        shutil.copy(repo_checkpoint_path, model_path_check)
        if os.path.exists(model_path_check):
            file_size = os.path.getsize(model_path_check)
            print(f"  [OK] Copied successfully ({file_size / 1024 / 1024:.2f} MB)")
        else:
            print(f"  [ERROR] Failed to copy checkpoint")
    else:
        print(f"[WARNING] Không tìm thấy checkpoint trong repo: {repo_checkpoint_path}")
else:
    print(f"\n[OK] Checkpoint đã có sẵn: {model_path_check}")


In [None]:
# Force the checkout to match the remote fix/lbph-module branch.
# These shell commands run in the kernel's current working directory
# (assumed to be the repository checkout — TODO confirm before running).

# Abort any merge that is still in progress
!git merge --abort

# Fetch the branch from the remote
!git fetch origin fix/lbph-module

# Reset the local checkout to the remote branch (discards ALL local changes)
!git reset --hard origin/fix/lbph-module
!git pull --no-rebase origin fix/lbph-module


In [None]:
# ONLY install opencv-contrib-python-headless
# Do NOT upgrade numpy/scipy: that would break Kaggle's pre-installed packages
!pip install -q opencv-contrib-python-headless


In [None]:
# Suppress warnings
import os
# TensorFlow native log-level setting; must be set before TensorFlow is imported to take effect.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import warnings
warnings.filterwarnings('ignore')

# Import libraries that are pre-installed on Kaggle (do NOT reinstall them)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
import cv2
from PIL import Image

# Print the versions actually loaded, as a quick environment sanity check.
print(f"NumPy version: {np.__version__}")
print(f"OpenCV version: {cv2.__version__}")
print("[OK] All imports successful")


---
## 1. Load Model


In [None]:
# Load the trained LBPH recognizer from its XML checkpoint.
model_path = os.path.join(CHECKPOINT_DIR, "lbph_model.xml")

# Validate the checkpoint file before loading it.
if not os.path.exists(model_path):
    # Each line ends with an explicit newline; adjacent f-strings are
    # concatenated as-is, so without "\n" the checklist collapses onto one line.
    raise FileNotFoundError(
        f"Model không tồn tại: {model_path}\n"
        f"Vui lòng kiểm tra:\n"
        f"  1. Dataset checkpoint đã được add vào Kaggle input chưa?\n"
        f"  2. Tên dataset có đúng '{CHECKPOINT_DATASET_NAME}' không?\n"
        f"  3. File lbph_model.xml có trong dataset không?"
    )

file_size = os.path.getsize(model_path)
print(f"Model file: {model_path}")
print(f"File size: {file_size / 1024 / 1024:.2f} MB")

# A file under 1 KiB is treated as a truncated or corrupt checkpoint.
if file_size < 1024:
    raise ValueError(f"Model file quá nhỏ ({file_size} bytes), có thể bị hỏng")

# Load LBPH model
model = cv2.face.LBPHFaceRecognizer_create()
model.read(model_path)

print(f"\n[OK] LBPH model loaded successfully")


---
## 2. Load Data (với giới hạn số ảnh)


In [None]:
from models.lbphmodel.dataset_lbph import load_data_no_haar

# Locate the split directories: prefer the nested "CelebA_Aligned_Balanced"
# layout and fall back to splits sitting directly under DATA_DIR.
dataset_root = os.path.join(DATA_DIR, "CelebA_Aligned_Balanced")
if not os.path.exists(os.path.join(dataset_root, "train")):
    dataset_root = DATA_DIR
train_dir, val_dir, test_dir = (
    os.path.join(dataset_root, split_name) for split_name in ("train", "val", "test")
)

print(f"Train dir: {train_dir}")
print(f"Val dir: {val_dir}")
print(f"Test dir: {test_dir}")

# Both splits are loaded with the same caps from the config cell.
load_limits = dict(
    max_images_per_identity=MAX_IMAGES_PER_IDENTITY,
    max_identities=MAX_IDENTITIES,
)

print("\n" + "="*50)
print("Loading VALIDATION data...")
val_faces, val_labels = load_data_no_haar(val_dir, **load_limits)

print("\n" + "="*50)
print("Loading TEST data...")
test_faces, test_labels = load_data_no_haar(test_dir, **load_limits)

# Distinct identity counts, computed once and reused for printing and logging.
n_val_identities = len(set(val_labels))
n_test_identities = len(set(test_labels))

print(f"\n{'='*50}")
print(f"DATA SUMMARY")
print(f"{'='*50}")
print(f"Val samples: {len(val_faces)}")
print(f"Test samples: {len(test_faces)}")
print(f"Val identities: {n_val_identities}")
print(f"Test identities: {n_test_identities}")

# Persist the load summary so it survives a kernel restart.
data_info = {
    'timestamp': datetime.now().isoformat(),
    'max_identities': MAX_IDENTITIES,
    'max_images_per_identity': MAX_IMAGES_PER_IDENTITY,
    'val_samples': len(val_faces),
    'test_samples': len(test_faces),
    'val_identities': n_val_identities,
    'test_identities': n_test_identities
}
data_info_path = f'{LOG_DIR}/data_info.json'
with open(data_info_path, 'w') as info_file:
    json.dump(data_info, info_file, indent=2)
print(f"\n[OK] Data info saved to {data_info_path}")


---
## 3. Find Optimal Threshold


In [None]:
from models.lbphmodel.threshold_lbph import find_optimal_threshold

# Search for the best rejection threshold on the validation set and time it.
print("Finding optimal threshold on validation set...")
start_time = time.time()
best_threshold, best_score, threshold_results = find_optimal_threshold(
    model, val_faces, val_labels, min_coverage=0.3
)
threshold_search_time = time.time() - start_time

print(f"\n{'='*50}")
print(f"OPTIMAL THRESHOLD")
print(f"{'='*50}")
print(f"Search time: {threshold_search_time:.2f}s ({threshold_search_time/60:.2f} min)")
print(f"Best threshold: {best_threshold}")
print(f"Best score (acc * coverage): {best_score:.4f}")
print(f"\nTop 5 thresholds:")

# Each result row is (threshold, accuracy, coverage, score); rank by score, highest first.
ranked_results = sorted(threshold_results, key=lambda row: row[3], reverse=True)
for thr, acc, cov, score in ranked_results[:5]:
    print(f"  Threshold={thr:3d}: Accuracy={acc:.3f}, Coverage={cov:.3f}, Score={score:.4f}")

# Save the search results right away so they are not lost if the session dies.
all_results = [
    {'threshold': int(t), 'accuracy': float(a), 'coverage': float(c), 'score': float(s)}
    for t, a, c, s in threshold_results
]
threshold_data = {
    'timestamp': datetime.now().isoformat(),
    'search_time_seconds': threshold_search_time,
    'best_threshold': int(best_threshold),
    'best_score': float(best_score),
    'all_results': all_results
}
threshold_log_path = f'{LOG_DIR}/threshold_search.json'
with open(threshold_log_path, 'w') as log_file:
    json.dump(threshold_data, log_file, indent=2)
print(f"\n[OK] Threshold results saved to {threshold_log_path}")


---
## 4. Evaluate on Test Set


In [None]:
from models.lbphmodel.evaluate_lbph import evaluate_lbph

# Evaluate on the test set with the threshold chosen on the validation set.
start_time = time.time()
test_acc, test_cov, test_used, test_confidences = evaluate_lbph(
    model, test_faces, test_labels, best_threshold
)
eval_time = time.time() - start_time

# Plain Python ints: json.dump cannot serialise NumPy integer scalars, and
# the other fields below are already cast for the same reason.
test_total = len(test_labels)
test_rejected = test_total - int(test_used)

print(f"\n{'='*50}")
print(f"TEST SET EVALUATION")
print(f"{'='*50}")
print(f"Evaluation time: {eval_time:.2f}s ({eval_time/60:.2f} min)")
print(f"Threshold: {best_threshold}")
print(f"Accuracy: {test_acc:.4f} ({test_acc*100:.2f}%)")
print(f"Coverage: {test_cov:.4f} ({test_cov*100:.2f}%)")
print(f"Used samples: {test_used} / {test_total}")
print(f"Rejected samples: {test_rejected}")

# min()/max() raise on an empty array, so summary statistics are only
# computed when there is at least one confidence value.
conf_values = np.asarray(test_confidences, dtype=float)
if conf_values.size > 0:
    confidence_stats = {
        'min': float(conf_values.min()),
        'max': float(conf_values.max()),
        'mean': float(conf_values.mean()),
        'std': float(conf_values.std())
    }
else:
    confidence_stats = None

# Save the test results right away so they survive a session crash.
test_data = {
    'timestamp': datetime.now().isoformat(),
    'eval_time_seconds': eval_time,
    'threshold': int(best_threshold),
    'accuracy': float(test_acc),
    'coverage': float(test_cov),
    'used_samples': int(test_used),
    'total_samples': test_total,
    'rejected_samples': test_rejected,
    'confidence_stats': confidence_stats
}
with open(f'{LOG_DIR}/test_evaluation.json', 'w') as f:
    json.dump(test_data, f, indent=2)
print(f"\n[OK] Test results saved to {LOG_DIR}/test_evaluation.json")


---
## 5. Visualizations


In [None]:
# Threshold analysis: accuracy, coverage and combined score as functions of
# the threshold, one panel each, with the chosen threshold marked.
thresholds, accuracies, coverages, scores = (
    [row[col] for row in threshold_results] for col in range(4)
)

# (y-values, line format, legend label, y-axis label, panel title)
panels = [
    (accuracies, 'b-o', 'Accuracy', 'Accuracy', 'Accuracy vs Threshold'),
    (coverages, 'g-o', 'Coverage', 'Coverage', 'Coverage vs Threshold'),
    (scores, 'm-o', 'Score (acc * cov)', 'Score', 'Score (Accuracy * Coverage) vs Threshold'),
]

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for ax, (values, line_fmt, curve_label, y_label, panel_title) in zip(axes, panels):
    ax.plot(thresholds, values, line_fmt, label=curve_label)
    # Vertical marker at the selected threshold.
    ax.axvline(best_threshold, color='r', linestyle='--', label=f'Best threshold={best_threshold}')
    ax.set_xlabel('Threshold')
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)
    ax.grid(True)
    ax.legend()

plt.tight_layout()
plt.savefig(f'{OUTPUT_DIR}/lbph_threshold_analysis.png', dpi=150)
plt.show()


In [None]:
# Left panel: histogram of test-set confidences with the threshold marked.
# Right panel: confusion matrix over the accepted predictions only
# (a prediction is accepted when its confidence is below the threshold).

# Above this many classes the per-cell counts and per-class tick labels
# become unreadable and slow to render, so they are switched off.
MAX_ANNOTATED_CLASSES = 30

plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.hist(test_confidences, bins=50, edgecolor='black')
plt.axvline(best_threshold, color='r', linestyle='--', linewidth=2, label=f'Threshold={best_threshold}')
plt.xlabel('Confidence (lower is better)')
plt.ylabel('Count')
plt.title('Test Set Confidence Distribution')
plt.legend()
plt.grid(True, alpha=0.3)

plt.subplot(1, 2, 2)
# Re-run prediction to collect (true, predicted) pairs for accepted samples.
test_predictions = []
test_true_labels_filtered = []
for img, true_label in zip(test_faces, test_labels):
    pred, conf = model.predict(img)
    if conf < best_threshold:
        test_predictions.append(pred)
        test_true_labels_filtered.append(true_label)

if len(test_predictions) > 0:
    # Same label set and order confusion_matrix would infer by default, made
    # explicit so the ticks can show real label ids instead of matrix indices.
    cm_labels = sorted(set(test_true_labels_filtered) | set(test_predictions))
    cm = confusion_matrix(test_true_labels_filtered, test_predictions, labels=cm_labels)
    show_details = len(cm_labels) <= MAX_ANNOTATED_CLASSES
    # For large matrices seaborn's default ticks are kept; those are matrix
    # indices, not label ids.
    tick_labels = cm_labels if show_details else 'auto'
    sns.heatmap(
        cm,
        annot=show_details,
        fmt='d',
        cmap='Blues',
        cbar=True,
        xticklabels=tick_labels,
        yticklabels=tick_labels,
    )
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix (Threshold={best_threshold})')
else:
    # Accepted predictions are those BELOW the threshold, so that is what is missing here.
    plt.text(0.5, 0.5, 'No predictions below threshold', ha='center', va='center')
    plt.title('Confusion Matrix (No data)')

plt.tight_layout()
plt.savefig(f'{OUTPUT_DIR}/lbph_confusion_matrix.png', dpi=150)
plt.show()


---
## 6. Export Results


In [None]:
# Export per-sample predictions (CSV), the threshold sweep (CSV) and a
# summary report (JSON) into OUTPUT_DIR.
try:
    import pandas as pd
except ImportError:
    os.system("pip install -q pandas")
    import pandas as pd

# 1. Export predictions CSV
predictions_data = []
for img, true_label in zip(test_faces, test_labels):
    pred, conf = model.predict(img)
    # A prediction is accepted when its confidence is below the threshold;
    # rejected predictions are never counted as correct.
    accepted = bool(conf < best_threshold)
    predictions_data.append({
        'true_label': int(true_label),
        'pred_label': int(pred),
        'confidence': float(conf),
        'accepted': accepted,
        'is_correct': bool(accepted and pred == true_label)
    })

df_predictions = pd.DataFrame(predictions_data)
df_predictions.to_csv(f'{OUTPUT_DIR}/lbph_predictions.csv', index=False)
print(f"[OK] Exported predictions CSV: {len(df_predictions)} samples")

# 2. Export threshold results CSV
df_thresholds = pd.DataFrame(threshold_results, columns=['threshold', 'accuracy', 'coverage', 'score'])
df_thresholds.to_csv(f'{OUTPUT_DIR}/lbph_threshold_results.csv', index=False)
print(f"[OK] Exported threshold results CSV")

# 3. Export evaluation report JSON
report = {
    'timestamp': datetime.now().isoformat(),
    'model': 'LBPH',
    'method': 'threshold-based classification',
    'optimal_threshold': int(best_threshold),
    # Both sampling caps are recorded, matching data_info.json, so the report
    # states on its own which subset the metrics refer to.
    'max_identities': MAX_IDENTITIES,
    'max_images_per_identity': MAX_IMAGES_PER_IDENTITY,
    'timing': {
        'threshold_search_seconds': threshold_search_time,
        'test_eval_seconds': eval_time
    },
    'metrics': {
        'test_accuracy': float(test_acc),
        'test_coverage': float(test_cov),
        'test_used_samples': int(test_used),
        'test_total_samples': int(len(test_labels))
    },
    'threshold_results': [
        {'threshold': int(t), 'accuracy': float(a), 'coverage': float(c), 'score': float(s)}
        for t, a, c, s in threshold_results
    ]
}

report_path = f'{OUTPUT_DIR}/lbph_evaluation_report.json'
with open(report_path, 'w') as f:
    json.dump(report, f, indent=2)

print(f"[OK] Exported evaluation report JSON")

print(f"\n{'='*50}")
print(f"LBPH FINAL REPORT")
print(f"{'='*50}")
print(f"Optimal Threshold: {best_threshold}")
print(f"Test Accuracy: {test_acc:.4f} ({test_acc*100:.2f}%)")
print(f"Test Coverage: {test_cov:.4f} ({test_cov*100:.2f}%)")
print(f"Used Samples: {test_used} / {len(test_labels)}")
# Print the full path: the file lives in OUTPUT_DIR, which is not the
# working directory once the notebook has chdir'ed into the repository.
print(f"\nReport saved to: {report_path}")


In [None]:
# Bundle every result file into a single zip archive for download.
import zipfile
from pathlib import Path

output_dir = Path(OUTPUT_DIR)
log_dir = Path(LOG_DIR)
zip_path = output_dir / 'lbph_evaluation_results.zip'

# Top-level result files: main report, CSV exports and plots.
files_to_zip = [
    'lbph_evaluation_report.json',
    'lbph_predictions.csv',
    'lbph_threshold_results.csv',
    'lbph_threshold_analysis.png',
    'lbph_confusion_matrix.png'
]

# Intermediate logs, stored under logs/ inside the archive.
log_files = [
    'data_info.json',
    'threshold_search.json',
    'test_evaluation.json'
]

# One table drives the archive: (source path, archive name, size decimals).
zip_entries = [(output_dir / name, name, 2) for name in files_to_zip]
zip_entries += [(log_dir / name, f'logs/{name}', 4) for name in log_files]

added_files = []
with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for src_path, arc_name, decimals in zip_entries:
        # Missing files are reported and skipped rather than aborting the zip.
        if not src_path.exists():
            print(f"[WARNING] {arc_name} not found, skipping")
            continue
        size_mb = src_path.stat().st_size / (1024 * 1024)
        zipf.write(src_path, arc_name)
        print(f"[OK] Added {arc_name} ({size_mb:.{decimals}f} MB)")
        added_files.append(arc_name)

    # Also dump the raw confidence values so the histogram can be rebuilt offline.
    conf_data = {'confidences': test_confidences.tolist()}
    conf_path = output_dir / 'confidence_distribution.json'
    with open(conf_path, 'w') as conf_file:
        json.dump(conf_data, conf_file)
    zipf.write(conf_path, 'confidence_distribution.json')
    added_files.append('confidence_distribution.json')
    print(f"[OK] Added confidence_distribution.json")

# Summary of what went into the archive.
if not zip_path.exists():
    print("[ERROR] Failed to create zip file")
else:
    zip_size_mb = zip_path.stat().st_size / (1024 * 1024)
    print(f"\n{'='*50}")
    print(f"ZIP FILE CREATED: {zip_path.name}")
    print(f"Size: {zip_size_mb:.2f} MB")
    print(f"Files included: {len(added_files)}")
    print(f"{'='*50}")
    print("\nFiles trong zip:")
    for entry_name in added_files:
        print(f"  - {entry_name}")
    print("\nLưu ý: Zip file bao gồm intermediate logs để phục hồi kết quả nếu session bị ngắt.")
    print("\nĐể download:")
    print("1. Click vào file 'lbph_evaluation_results.zip' trong panel bên phải")
    print("2. Hoặc chạy: !cp lbph_evaluation_results.zip /kaggle/working/")
