<a href="https://colab.research.google.com/github/priscilla254/synthetic_dataset_for_age_estimation/blob/main/clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# Colab-only: attach Google Drive to the runtime so that files stored under
# /content/drive are reachable from this notebook.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [11]:
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN

# --- paths ---
EMB_NPY = "/content/drive/MyDrive/ai-workspace/outputs/sdxl_passport_25yo_embeddings/embeddings.npy"
EMB_CSV = "/content/drive/MyDrive/ai-workspace/outputs/sdxl_passport_25yo_embeddings/embeddings.csv"
OUT_CSV = "/content/drive/MyDrive/ai-workspace/outputs/sdxl_passport_25yo_embeddings/embeddings_clustered.csv"

# --- clustering hyperparameters (tune here) ---
# NOTE(review): the recorded run of this cell placed all 182 faces in a single
# cluster, which suggests eps=0.7 is too loose for these embeddings under the
# cosine metric. Consider lowering it; the value is left unchanged here.
DBSCAN_EPS = 0.7
DBSCAN_MIN_SAMPLES = 3

# --- load ---
X = np.load(EMB_NPY)             # shape (num_faces_detected, 512)
df = pd.read_csv(EMB_CSV)        # shape (num_images, ...), includes has_face column (0/1)

# Normalize/clean has_face: coerce anything non-numeric or missing to 0
# ("no face") so the integer cast below cannot fail on NaN.
df["has_face"] = (
    pd.to_numeric(df["has_face"], errors="coerce")
    .fillna(0)
    .astype(int)
)

# Subset rows with faces (keeps df's original index, which is used further
# down to align the labels back onto the full manifest).
df_faces = df[df["has_face"] == 1].copy()

# Sanity check: make sure row-count matches embeddings.
# Assumes row i of X is the embedding of the i-th face row of the CSV, in
# file order -- TODO confirm against the script that wrote both files.
if len(df_faces) != X.shape[0]:
    raise ValueError(f"Mismatch: df_faces={len(df_faces)} vs embeddings={X.shape[0]}. "
                     f"Check that EMB_CSV matches the EMB_NPY you loaded.")

# ---- choose a clustering method ----
# Option 1: DBSCAN (no k needed). Good defaults; tune eps if needed.
labels = DBSCAN(
    eps=DBSCAN_EPS,
    min_samples=DBSCAN_MIN_SAMPLES,
    metric="cosine",
).fit_predict(X)

# Option 2: KMeans (uncomment to use)
# from sklearn.cluster import KMeans
# k = 8
# labels = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(X)

# Attach labels to the face rows only
df_faces["cluster"] = labels

# Attach clusters to the full manifest by index alignment rather than by
# merging on img_path: a merge would silently duplicate rows (and could
# mis-assign clusters) if img_path is not unique. Rows without a face get NaN,
# and the original row order and row count are preserved exactly.
df_out = df.assign(cluster=df_faces["cluster"])
assert len(df_out) == len(df), "attaching clusters must not change the row count"

# Save
df_out.to_csv(OUT_CSV, index=False)
print("Saved clustered manifest to:", OUT_CSV)

# Quick peek
print(df_out["cluster"].value_counts(dropna=False).sort_index())
df_out.head()


Saved clustered manifest to: /content/drive/MyDrive/ai-workspace/outputs/sdxl_passport_25yo_embeddings/embeddings_clustered.csv
cluster
0.0    182
NaN     18
Name: count, dtype: int64


Unnamed: 0,img_path,has_face,bbox,aligned_path,cluster
0,/content/drive/MyDrive/ai-workspace/outputs/sd...,1,"(343, 211, 717, 744)",/content/drive/MyDrive/ai-workspace/outputs/sd...,0.0
1,/content/drive/MyDrive/ai-workspace/outputs/sd...,1,"(337, 141, 669, 626)",/content/drive/MyDrive/ai-workspace/outputs/sd...,0.0
2,/content/drive/MyDrive/ai-workspace/outputs/sd...,1,"(324, 156, 696, 705)",/content/drive/MyDrive/ai-workspace/outputs/sd...,0.0
3,/content/drive/MyDrive/ai-workspace/outputs/sd...,1,"(356, 260, 649, 681)",/content/drive/MyDrive/ai-workspace/outputs/sd...,0.0
4,/content/drive/MyDrive/ai-workspace/outputs/sd...,1,"(321, 143, 708, 730)",/content/drive/MyDrive/ai-workspace/outputs/sd...,0.0
