In [30]:
# Setup: install/import packages and define paths
import sys
import subprocess

try:
    import cv2
except Exception:
    # Fallback install for environments missing OpenCV
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "opencv-python-headless", "pandas", "numpy"])  # noqa: E402
    import cv2  # noqa: E402

import os
from pathlib import Path
import numpy as np
import pandas as pd

# Project paths
PROJECT_ROOT = Path("/Users/bellalu/DS6012/ML1_Project")
IMAGES_DIR = PROJECT_ROOT / "faces_database"
PHI = (1 + 5 ** 0.5) / 2  # golden ratio

# Haar cascade files (bundled with OpenCV)
HAAR_DIR = Path(cv2.data.haarcascades)
CASCADE_FACE = str(HAAR_DIR / "haarcascade_frontalface_default.xml")
CASCADE_EYES = str(HAAR_DIR / "haarcascade_eye.xml")
CASCADE_NOSE = str(HAAR_DIR / "haarcascade_mcs_nose.xml") if (HAAR_DIR / "haarcascade_mcs_nose.xml").exists() else None
CASCADE_MOUTH = str(HAAR_DIR / "haarcascade_smile.xml")  # smile is a common proxy for mouth region

face_cascade = cv2.CascadeClassifier(CASCADE_FACE)
eyes_cascade = cv2.CascadeClassifier(CASCADE_EYES)
nose_cascade = cv2.CascadeClassifier(CASCADE_NOSE) if CASCADE_NOSE else None
mouth_cascade = cv2.CascadeClassifier(CASCADE_MOUTH)

assert IMAGES_DIR.exists(), f"Images directory not found: {IMAGES_DIR}"


In [31]:
# Filename parsing for categorical labels and small helpers
from typing import Dict, Tuple, Optional

AGE_MAP = {"y": "young", "m": "middle", "o": "old"}
GENDER_MAP = {"m": "male", "f": "female"}
EXPR_MAP = {"h": "happy"}  # extend if more expressions exist


def parse_labels(file_name: str) -> Dict[str, str]:
    """Parse ID, age, gender, expression, variant from file name like '140_y_f_h_a.jpg'."""
    stem = Path(file_name).stem
    parts = stem.split("_")
    # Expected: [id, age, gender, expr, variant]
    out = {
        "subject_id": parts[0] if len(parts) > 0 else "",
        "age_group": AGE_MAP.get(parts[1], parts[1]) if len(parts) > 1 else "",
        "gender": GENDER_MAP.get(parts[2], parts[2]) if len(parts) > 2 else "",
        "expression": EXPR_MAP.get(parts[3], parts[3]) if len(parts) > 3 else "",
        "variant": parts[4] if len(parts) > 4 else "",
    }
    return out


def euclidean_distance(p1: Tuple[int, int], p2: Tuple[int, int]) -> float:
    return float(np.hypot(p1[0] - p2[0], p1[1] - p2[1]))


def golden_deviation(a: float, b: float) -> Optional[float]:
    if a is None or b is None or a <= 0 or b <= 0:
        return None
    r = max(a, b) / min(a, b)
    return abs(r - PHI)


In [32]:
# Detection utilities using OpenCV Haar cascades

def detect_face_regions(img_bgr):
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(80, 80))
    if len(faces) == 0:
        return None, None
    # choose the largest face
    x, y, w, h = sorted(faces, key=lambda r: r[2] * r[3], reverse=True)[0]
    face_roi = img_bgr[y:y+h, x:x+w]
    face_gray = gray[y:y+h, x:x+w]
    return (x, y, w, h), (face_roi, face_gray)


def detect_eyes(face_gray):
    eyes = eyes_cascade.detectMultiScale(face_gray, scaleFactor=1.1, minNeighbors=8, minSize=(20, 20))
    if len(eyes) < 2:
        return None
    # pick two eyes with left/right order based on x
    eyes_sorted = sorted(eyes, key=lambda r: r[0])[:2]
    centers = []
    for (ex, ey, ew, eh) in eyes_sorted:
        centers.append((int(ex + ew / 2), int(ey + eh / 2)))
    # ensure left-right order
    centers = sorted(centers, key=lambda c: c[0])
    return centers  # [(x_left, y_left), (x_right, y_right)]


def detect_nose(face_gray):
    if not nose_cascade:
        return None
    noses = nose_cascade.detectMultiScale(face_gray, scaleFactor=1.1, minNeighbors=5, minSize=(24, 24))
    if len(noses) == 0:
        return None
    x, y, w, h = sorted(noses, key=lambda r: r[2] * r[3], reverse=True)[0]
    center = (int(x + w / 2), int(y + h / 2))
    return center


def detect_mouth(face_gray):
    # Using smile cascade as a proxy for mouth region; not perfect but works reasonably for smiles
    mouths = mouth_cascade.detectMultiScale(face_gray, scaleFactor=1.1, minNeighbors=40, minSize=(30, 15))
    if len(mouths) == 0:
        return None, None
    x, y, w, h = sorted(mouths, key=lambda r: r[2] * r[3], reverse=True)[0]
    center = (int(x + w / 2), int(y + h / 2))
    width = float(w)
    return center, width


def compute_measurements(img_bgr) -> dict:
    results = {
        "face_width": None,
        "face_height": None,
        "eye_distance": None,
        "mouth_width": None,
        "nose_to_mouth": None,
        # golden ratio deviations (lower is closer to phi)
        "phi_dev_face_h_w": None,          # face height vs width
        "phi_dev_eye_to_mouth": None,      # eye distance vs mouth width
        "phi_dev_nose_mouth_to_eye": None  # nose-to-mouth vs eye distance
    }

    face_rect, face_pair = detect_face_regions(img_bgr)
    if face_rect is None:
        return results

    (fx, fy, fw, fh) = face_rect
    face_roi, face_gray = face_pair

    results["face_width"] = float(fw)
    results["face_height"] = float(fh)

    # features within face
    eye_centers = detect_eyes(face_gray)
    nose_center = detect_nose(face_gray)
    mouth_center, mouth_width = detect_mouth(face_gray)

    if eye_centers and len(eye_centers) == 2:
        results["eye_distance"] = euclidean_distance(eye_centers[0], eye_centers[1])
    if mouth_width is not None:
        results["mouth_width"] = float(mouth_width)
    if nose_center is not None and mouth_center is not None:
        results["nose_to_mouth"] = euclidean_distance(nose_center, mouth_center)

    # golden ratio deviations
    results["phi_dev_face_h_w"] = golden_deviation(results["face_height"], results["face_width"]) if results["face_height"] and results["face_width"] else None
    if results["eye_distance"] and results["mouth_width"]:
        results["phi_dev_eye_to_mouth"] = golden_deviation(results["eye_distance"], results["mouth_width"])
    if results["nose_to_mouth"] and results["eye_distance"]:
        results["phi_dev_nose_mouth_to_eye"] = golden_deviation(results["nose_to_mouth"], results["eye_distance"])

    return results


In [33]:
# Build dataset from all images in faces_database
rows = []
image_paths = sorted([p for p in IMAGES_DIR.glob("*.jpg")])

for img_path in image_paths:
    img = cv2.imread(str(img_path))
    if img is None:
        continue
    labels = parse_labels(img_path.name)
    meas = compute_measurements(img)

    row = {
        # categoricals
        "file_name": img_path.name,
        "subject_id": labels.get("subject_id"),
        "age_group": labels.get("age_group"),
        "gender": labels.get("gender"),
        "expression": labels.get("expression"),
        "variant": labels.get("variant"),
        # numericals
        **{k: (None if v is None else float(v)) for k, v in meas.items()},
    }
    rows.append(row)

faces_df = pd.DataFrame(rows)

# Display summary
print(f"Processed {len(rows)} images from: {IMAGES_DIR}")
faces_df.head()


Processed 12 images from: /Users/bellalu/DS6012/ML1_Project/faces_database


Unnamed: 0,file_name,subject_id,age_group,gender,expression,variant,face_width,face_height,eye_distance,mouth_width,nose_to_mouth,phi_dev_face_h_w,phi_dev_eye_to_mouth,phi_dev_nose_mouth_to_eye
0,004_o_m_h_a.jpg,4,old,male,happy,a,1859.0,1859.0,682.354746,767.0,,0.618034,0.493985,
1,004_o_m_h_b.jpg,4,old,male,happy,b,1908.0,1908.0,435.491676,558.0,,0.618034,0.336724,
2,066_y_m_h_a.jpg,66,young,male,happy,a,1842.0,1842.0,,709.0,,0.618034,,
3,066_y_m_h_b.jpg,66,young,male,happy,b,1824.0,1824.0,1025.914226,725.0,,0.618034,0.20298,
4,079_o_f_h_a.jpg,79,old,female,happy,a,2193.0,2193.0,239.384628,509.0,,0.618034,0.508251,


In [34]:
# Inspect completeness and save results
print(faces_df.isna().mean().sort_values())
output_csv = PROJECT_ROOT / "faces_features_golden_ratio.csv"
faces_df.to_csv(output_csv, index=False)
print(f"Saved: {output_csv}")


file_name                    0.000000
subject_id                   0.000000
age_group                    0.000000
gender                       0.000000
expression                   0.000000
variant                      0.000000
face_width                   0.000000
face_height                  0.000000
mouth_width                  0.000000
phi_dev_face_h_w             0.000000
eye_distance                 0.333333
phi_dev_eye_to_mouth         0.333333
nose_to_mouth                1.000000
phi_dev_nose_mouth_to_eye    1.000000
dtype: float64
Saved: /Users/bellalu/DS6012/ML1_Project/faces_features_golden_ratio.csv


In [35]:
# Define binary targets for golden ratio "closeness"
import numpy as np

# Choose which phi deviation to target primarily
PRIMARY_TARGET = "phi_dev_face_h_w"  # alternatives: 'phi_dev_eye_to_mouth', 'phi_dev_nose_mouth_to_eye'

# Strategy: use median split for balanced classes (50th percentile)
# With small datasets, median ensures roughly 50/50 split
MEDIAN_SPLIT = True
PERCENTILE = 50  # median split for balanced classes

# Compute percentile-based target using median (lower deviation is better/closer to golden ratio)
valid_devs = faces_df[PRIMARY_TARGET].dropna().values
if valid_devs.size > 0:
    if MEDIAN_SPLIT:
        perc_cut = np.median(valid_devs)  # median split for balance
    else:
        perc_cut = np.percentile(valid_devs, PERCENTILE)
else:
    perc_cut = None

faces_df["is_golden_pct"] = faces_df[PRIMARY_TARGET].apply(
    lambda v: bool(perc_cut is not None and v is not None and v <= perc_cut)
)

# Alternative: absolute threshold (adjust if needed based on your data range)
ABS_THRESHOLD = np.median(valid_devs) if valid_devs.size > 0 else 0.1
faces_df["is_golden_abs"] = faces_df[PRIMARY_TARGET].apply(
    lambda v: bool(v is not None and v < ABS_THRESHOLD)
)

# Use median-based target (should be more balanced)
TARGET_COL = "is_golden_pct"

print("Target column:", TARGET_COL)
print("Median threshold value:", perc_cut)
print("Class balance (value counts):")
print(faces_df[TARGET_COL].value_counts(dropna=False))

# Check if we have both classes
class_counts = faces_df[TARGET_COL].value_counts()
if len(class_counts) < 2:
    print("\n⚠️ WARNING: Only one class in target! Using median split to create balance...")
    # Force a more balanced split by using quantiles
    if valid_devs.size > 0:
        median_val = np.median(valid_devs)
        faces_df[TARGET_COL] = faces_df[PRIMARY_TARGET].apply(
            lambda v: bool(v is not None and v <= median_val)
        )
        print("Updated class balance:")
        print(faces_df[TARGET_COL].value_counts(dropna=False))

faces_df[["file_name", PRIMARY_TARGET, TARGET_COL]].head()


Target column: is_golden_pct
Median threshold value: 0.6180339887498949
Class balance (value counts):
is_golden_pct
True    12
Name: count, dtype: int64

Updated class balance:
is_golden_pct
True    12
Name: count, dtype: int64


Unnamed: 0,file_name,phi_dev_face_h_w,is_golden_pct
0,004_o_m_h_a.jpg,0.618034,True
1,004_o_m_h_b.jpg,0.618034,True
2,066_y_m_h_a.jpg,0.618034,True
3,066_y_m_h_b.jpg,0.618034,True
4,079_o_f_h_a.jpg,0.618034,True


In [36]:
# Feature preparation: numerical + one-hot categoricals with imputation
import sys, subprocess
try:
    from sklearn.model_selection import train_test_split
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import OneHotEncoder, StandardScaler
    from sklearn.impute import SimpleImputer
    from sklearn.pipeline import Pipeline
except Exception:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "scikit-learn"])  # noqa: E402
    from sklearn.model_selection import train_test_split
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import OneHotEncoder, StandardScaler
    from sklearn.impute import SimpleImputer
    from sklearn.pipeline import Pipeline

# Select features
num_features = [
    "face_width", "face_height", "eye_distance", "mouth_width", "nose_to_mouth",
    "phi_dev_face_h_w", "phi_dev_eye_to_mouth", "phi_dev_nose_mouth_to_eye",
]
cat_features = ["age_group", "gender", "expression", "variant"]

df_model = faces_df.dropna(subset=[TARGET_COL]).copy()
X = df_model[num_features + cat_features]
y = df_model[TARGET_COL].astype(int)

# Safety check: ensure both classes exist
unique_classes = y.unique()
if len(unique_classes) < 2:
    print(f"⚠️ ERROR: Target has only {len(unique_classes)} class(es): {unique_classes}")
    print("Cannot train binary classifier. Please adjust the target threshold in the previous cell.")
    raise ValueError(f"Need at least 2 classes, found: {unique_classes}")

print(f"✓ Target has {len(unique_classes)} classes: {dict(y.value_counts())}")

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_features),
        ("cat", categorical_transformer, cat_features),
    ]
)

# Use stratify only if both classes have enough samples for stratification
min_class_count = y.value_counts().min()
stratify_param = y if min_class_count >= 2 else None

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=stratify_param
)

print("Train size:", X_train.shape, "Test size:", X_test.shape)
print("Positive rate (train/test):", y_train.mean().round(3), y_test.mean().round(3))
print(f"Train classes: {dict(y_train.value_counts())}")
print(f"Test classes: {dict(y_test.value_counts())}")


⚠️ ERROR: Target has only 1 class(es): [1]
Cannot train binary classifier. Please adjust the target threshold in the previous cell.


ValueError: Need at least 2 classes, found: [1]

In [None]:
# Train baseline Logistic Regression and Random Forest
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

log_reg_clf = Pipeline(steps=[
    ("preprocess", preprocess),
    ("clf", LogisticRegression(max_iter=1000, class_weight="balanced", solver="lbfgs")),
])

rf_clf = Pipeline(steps=[
    ("preprocess", preprocess),
    ("clf", RandomForestClassifier(n_estimators=300, max_depth=None, random_state=42, class_weight="balanced_subsample")),
])

log_reg_clf.fit(X_train, y_train)
rf_clf.fit(X_train, y_train)

print("Models trained.")




ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)

In [None]:
# Evaluate models: metrics + cross-validation
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, classification_report
from sklearn.model_selection import cross_val_score, StratifiedKFold

models = {
    "log_reg": log_reg_clf,
    "random_forest": rf_clf,
}

for name, model in models.items():
    y_prob = model.predict_proba(X_test)[:, 1]
    y_pred = (y_prob >= 0.5).astype(int)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred)
    try:
        auc = roc_auc_score(y_test, y_prob)
    except ValueError:
        auc = float('nan')

    print(f"\n=== {name} (test) ===")
    print(f"Accuracy: {acc:.3f}  F1: {f1:.3f}  Precision: {prec:.3f}  Recall: {rec:.3f}  ROC AUC: {auc:.3f}")
    print(classification_report(y_test, y_pred, digits=3))

    # cross-validated ROC AUC on training set (only if both classes present in train)
    train_class_counts = y_train.value_counts()
    if len(train_class_counts) >= 2 and train_class_counts.min() >= 1:
        n_splits = min(5, max(2, train_class_counts.min()))
        cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
        try:
            cv_auc = cross_val_score(model, X_train, y_train, cv=cv, scoring="roc_auc")
            print(f"CV ROC AUC (mean±std): {cv_auc.mean():.3f} ± {cv_auc.std():.3f}")
        except ValueError as e:
            print(f"CV skipped: {e}")
    else:
        print(f"CV skipped: insufficient class balance in training set ({dict(train_class_counts)})")
