## Determine Optimal Clusters 

In [None]:
# Sweep candidate cluster counts 3, 4, 5 and 6, scoring each K-Means
# partition of `embeddings` with two internal validity metrics.
cluster_range = range(3, 7)
silhouette_scores, db_scores = [], []

for n_clusters in cluster_range:
    # Fixed seed so repeated runs give the same partition.
    # NOTE(review): n_init is left at the library default, which may differ
    # between scikit-learn releases -- confirm whether it should be pinned.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(embeddings)

    # Silhouette is maximised below; Davies-Bouldin is plotted as
    # "lower is better".
    silhouette_scores.append(silhouette_score(embeddings, labels))
    db_scores.append(davies_bouldin_score(embeddings, labels))

# Side-by-side line plots of both metrics against the cluster count.
fig, (silhouette_ax, db_ax) = plt.subplots(1, 2, figsize=(12, 5))

silhouette_ax.plot(cluster_range, silhouette_scores, 'bo-')
silhouette_ax.set_xlabel('Number of clusters')
silhouette_ax.set_ylabel('Silhouette Score')
silhouette_ax.set_title('Optimal Cluster Count')

db_ax.plot(cluster_range, db_scores, 'go-')
db_ax.set_xlabel('Number of clusters')
db_ax.set_ylabel('Davies-Bouldin Score')
db_ax.set_title('Lower is Better')

# Write the figure to disk and release it instead of displaying it inline.
fig.savefig('cluster_metrics.png', bbox_inches='tight')
plt.close(fig)

# Pick the cluster count whose silhouette score is highest
# (np.argmax returns the first position on ties).
best_position = np.argmax(silhouette_scores)
optimal_clusters = cluster_range[best_position]
best_silhouette = max(silhouette_scores)
print(f"Optimal clusters: {optimal_clusters} (Silhouette: {best_silhouette:.3f})")

## Final Clustering 

# Fit the final K-Means model using the cluster count selected above
# (same fixed seed as the sweep), then read each sample's cluster
# assignment from the fitted estimator.
final_kmeans = KMeans(n_clusters=optimal_clusters, random_state=42).fit(embeddings)
cluster_labels = final_kmeans.labels_

# Optional swap-in: cluster with a Gaussian Mixture Model instead.
# gmm = GaussianMixture(n_components=optimal_clusters, random_state=42)
# cluster_labels = gmm.fit_predict(embeddings)