<a href="https://colab.research.google.com/github/priscilla254/synthetic_dataset_for_age_estimation/blob/main/clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive into the Colab runtime; the data paths used by the
# following cells live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np, pandas as pd

# Locations of the embedding matrix and of its per-image metadata table.
EMB_NPY = "/content/drive/MyDrive/ai-workspace/outputs/sdxl_passport_25yo_embeddings/embeddings.npy"
EMB_CSV = "/content/drive/MyDrive/ai-workspace/outputs/sdxl_passport_25yo_embeddings/embeddings.csv"

# Embedding matrix; expected shape is (num_faces_detected, 512).
X = np.load(EMB_NPY)

# Metadata table; `has_face` is coerced to int so it can be compared against 1.
df = pd.read_csv(EMB_CSV)
df["has_face"] = df["has_face"].astype(int)

# Boolean selector for rows flagged as containing a face, plus an independent
# copy of just those rows.
mask = df["has_face"].eq(1)
df_faces = df[mask].copy()

# The face-flagged metadata rows and the embedding rows must match in count.
n_face_rows, n_embeddings = len(df_faces), X.shape[0]
assert n_face_rows == n_embeddings, (n_face_rows, n_embeddings)


In [3]:
from numpy.linalg import norm

# Scale every embedding row to (approximately) unit L2 length. The small
# constant keeps the division finite if a row happens to be all zeros.
row_norms = norm(X, axis=1, keepdims=True)
Xn = X / (row_norms + 1e-12)

from sklearn.cluster import DBSCAN
def run_dbscan(eps, min_samples=3, X=None):
    """Cluster embeddings with DBSCAN using cosine distance.

    Parameters
    ----------
    eps : float
        Maximum cosine distance between two samples for them to be treated
        as neighbours.
    min_samples : int, default 3
        Number of neighbours required for a point to be a core point.
        The default reproduces the value that used to be hard-coded.
    X : array-like or None, default None
        Rows to cluster. When ``None`` the notebook-level ``Xn`` is used,
        so existing ``run_dbscan(eps)`` calls behave exactly as before.

    Returns
    -------
    numpy.ndarray
        One cluster label per row; ``-1`` marks noise points.
    """
    # Resolve the data at call time so a re-computed `Xn` is picked up.
    data = Xn if X is None else X
    return DBSCAN(eps=eps, min_samples=min_samples, metric="cosine").fit_predict(data)

# Sweep the neighbourhood radius and report, for each value, how many clusters
# DBSCAN finds and how many points it leaves as noise (label -1).
for eps in (0.25, 0.3, 0.35, 0.4, 0.45, 0.5):
    labs = run_dbscan(eps)
    is_noise = labs == -1
    # Distinct labels among the non-noise points == number of clusters.
    n_clusters = np.unique(labs[~is_noise]).size
    noise = is_noise.sum()
    print(f"eps={eps:.2f}: clusters={n_clusters}, noise={noise}")


eps=0.25: clusters=0, noise=182
eps=0.30: clusters=0, noise=182
eps=0.35: clusters=0, noise=182
eps=0.40: clusters=2, noise=175
eps=0.45: clusters=4, noise=124
eps=0.50: clusters=2, noise=45


In [4]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Try every k in [2, 14] and keep the fit with the highest cosine silhouette.
# `best` holds (silhouette, k, fitted_model); the sentinel score of -1 is only
# replaced by a strictly larger score, so ties keep the smaller k.
best = (-1, None, None)
for k in range(2, 15):
    km = KMeans(n_clusters=k, n_init=10, random_state=42).fit(Xn)
    s = silhouette_score(Xn, km.labels_, metric="cosine")
    if s > best[0]:
        best = (s, k, km)
    print(f"k={k:2d}  silhouette={s:.3f}")

sil_score, k_star, km_model = best
print("Chosen k:", k_star, "silhouette:", sil_score)

# Attach the chosen model's labels to the metadata table. Rows selected by
# `mask` receive their cluster id; every other row stays NaN, so the column
# is floating point.
labels = km_model.labels_
cluster_values = np.full(len(df), np.nan)
cluster_values[mask.to_numpy()] = labels
df["cluster"] = cluster_values


k= 2  silhouette=0.248
k= 3  silhouette=0.160
k= 4  silhouette=0.040
k= 5  silhouette=0.035
k= 6  silhouette=0.034
k= 7  silhouette=0.031
k= 8  silhouette=0.026
k= 9  silhouette=0.023
k=10  silhouette=0.023
k=11  silhouette=0.019
k=12  silhouette=0.021
k=13  silhouette=0.018
k=14  silhouette=0.008
Chosen k: 2 silhouette: 0.24750741
