In [34]:
from pathlib import Path
import sys
import os

# Project root is taken to be the parent of the current working directory,
# i.e. the notebook is expected to be launched from a sub-folder of the repo.
root = Path.cwd().parent

# Make project-local packages importable. The membership check keeps the cell
# idempotent: re-running it no longer stacks duplicate entries on sys.path.
if str(root) not in sys.path:
    sys.path.append(str(root))

In [35]:
# Import under an alias so the full project configuration stays reachable
# instead of being overwritten by one of its own sections.
from src.config import config as project_config

# Settings under the 'cluster' key; falls back to an empty dict when the key is absent.
cluster_config = project_config.get('cluster', {})

# Kept for backward compatibility: `config` is still bound to the 'cluster' section.
config = cluster_config

In [36]:
import pandas as pd
import numpy as np
import ast

def _parse_embedding(serialized):
    """Convert one CSV cell holding a Python-literal sequence into a NumPy array."""
    return np.array(ast.literal_eval(serialized))


# The 'embedding' column is stored as text in the CSV, so each cell is parsed
# back into a numeric array right after loading.
icl_examples_csv = root / "data" / "bsc_dataset_regplans" / "embedded_icl_examples_1536d.csv"
df = pd.read_csv(icl_examples_csv)
df['embedding'] = df['embedding'].apply(_parse_embedding)

In [37]:
# Stack the per-row arrays into one 2-D matrix with one row per DataFrame row.
embeddings = np.vstack(df['embedding'].to_numpy())

In [38]:
import umap

# Reduce the embeddings to 30 dimensions before clustering.
# The fixed random_state makes the projection repeatable; per the warning this
# cell emits, seeding also means UMAP does not run in parallel.
umap_params = {"n_components": 30, "random_state": 42}
umap_reducer = umap.UMAP(**umap_params)

embeddings_umap = umap_reducer.fit_transform(embeddings)


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [39]:
import hdbscan

# Cluster the UMAP projection; the label -1 is treated as noise below.
clusterer = hdbscan.HDBSCAN(min_cluster_size=15)
cluster_labels = clusterer.fit_predict(embeddings_umap)

# Summarise the result: distinct non-noise labels and the number of noise points.
is_noise = cluster_labels == -1
n_noise = int(np.count_nonzero(is_noise))
n_clusters = len(set(cluster_labels[~is_noise].tolist()))

print(f"Number of clusters: {n_clusters}")
print(f"Number of noise points: {n_noise}")
print(f"Percentage of noise: {n_noise/len(cluster_labels)*100:.1f}%")

Number of clusters: 34
Number of noise points: 745
Percentage of noise: 31.4%



'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.



In [40]:
from sklearn.manifold import TSNE

# 2-D projection used for the scatter plot. Note that it is fit on the original
# embeddings, not on the UMAP output that HDBSCAN clustered.
tsne_params = {"n_components": 2, "random_state": 42}
tsne = TSNE(**tsne_params)

embeddings_tsne = tsne.fit_transform(embeddings)

In [41]:
import plotly.express as px

# Create the outputs directory if it doesn't exist. parents=True so that a
# missing intermediate directory does not abort the cell.
outputs_dir = root / "outputs"
outputs_dir.mkdir(parents=True, exist_ok=True)

# Labels are cast to strings so Plotly colours them as discrete categories
# (one legend entry per cluster) rather than on a continuous scale.
cluster_names = cluster_labels.astype(str)

# Order the legend numerically (the noise label -1 comes first) instead of by
# the order in which labels happen to appear in the data.
legend_order = [str(label) for label in sorted(set(cluster_labels.tolist()))]

fig = px.scatter(
    x=embeddings_tsne[:, 0],
    y=embeddings_tsne[:, 1],
    color=cluster_names,
    title="t-SNE visualization of embeddings",
    # Array inputs are named after their argument, so these keys replace the
    # default "x" / "y" / "color" axis and legend titles.
    labels={"x": "t-SNE 1", "y": "t-SNE 2", "color": "HDBSCAN cluster"},
    category_orders={"color": legend_order},
)

fig.show()
# Save to dedicated outputs directory with descriptive filename
fig.write_html(outputs_dir / "hdbscan_tsne_clusters.html")