In [None]:
# Add project root to path so that `src.*` modules can be imported below.
# "../" is resolved against the current working directory, so this assumes the
# notebook is run from its own folder (one level below the project root).
import sys
from pathlib import Path

PROJECT_ROOT = str(Path("../").resolve())
# Guard against piling up duplicate entries when this cell is re-run in the same kernel.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [None]:
# Imports
from src.clustering_utils_hdbscan import (
    fetch_embeddings,
    reduce_dimensions,
    cluster_embeddings,
    save_clusters,
    soft_assign_noise,
    get_cluster_representatives,
    label_clusters_llm,
    save_cluster_labels,
    save_cluster_labels_to_table
)

import numpy as np
import pandas as pd



In [None]:
# Fetch embeddings
# NOTE(review): `limit=None` presumably means "no row cap" — confirm against
# fetch_embeddings in src.clustering_utils_hdbscan.
# Later cells read the "embedding" and "ticket_id" columns of `df`.
df = fetch_embeddings(limit=None)
print(f"Fetched {len(df)} ticket embeddings for clustering")

In [None]:
# Collect the per-ticket embedding vectors into a single numpy array, which is
# the input handed to the UMAP reduction step in the next cell.
# (assumes every embedding has the same length, giving a 2-D array — TODO confirm)
X = np.asarray(df["embedding"].tolist())

In [None]:
# Dimensionality reduction with UMAP.
# Hyper-parameters are gathered in one place so they are easy to tune:
#   n_neighbors  - controls local structure sensitivity (typically 10–30; using 30)
#   n_components - size of the reduced embedding (typically 10–50; trying 50)
umap_params = {"n_components": 50, "n_neighbors": 30}

# The helper returns both the reduced matrix and the fitted reducer; the reducer
# is pickled to disk in a later cell.
X_reduced, reducer = reduce_dimensions(X, **umap_params)
print(f"Reduced embeddings shape: {X_reduced.shape}")



In [None]:
# Cluster reduced embeddings using HDBSCAN
# min_cluster_size: smaller = more fine-grained clusters; 50 is used here.
labels, clusterer = cluster_embeddings(X_reduced, min_cluster_size=50)

# Soft-assign noise points that are above a threshold of similarity to the nearest cluster centers
labels_soft = soft_assign_noise(X_reduced, labels, similarity_threshold=0.999)
df["cluster_id"] = labels_soft

# Count real clusters from the *soft* labels only, dropping the noise id (-1).
# Checking `-1 in labels` (the pre-soft-assignment labels) would under-count by
# one whenever soft assignment re-homes every noise point.
n_clusters = len(set(labels_soft) - {-1})
print(f"Found {n_clusters} clusters")



In [None]:
# Persist the ticket -> cluster mapping to Postgres.
# Only the two columns that make up the mapping are handed to the saver.
cluster_assignments = df[["ticket_id", "cluster_id"]]
save_clusters(cluster_assignments)
print("Cluster labels saved to Postgres")



In [None]:
from sqlalchemy import create_engine, text
import pandas as pd
from src.config import DB_URL

# Inspect the tickets stored under one cluster id: join the preprocessed tickets
# with the embeddings table and keep only rows that have keywords.
cluster_id = -1  # noise points
engine = create_engine(DB_URL)

# The cluster id is bound as a named parameter rather than formatted into the SQL.
query = text("""
    SELECT 
        p.ticket_id,
        p.keywords
    FROM 
        ticket_preprocessed p
    JOIN 
        ticket_embeddings e 
    ON 
        p.ticket_id = e.ticket_id
    WHERE 
        e.cluster_id = :cluster_id
        AND p.keywords IS NOT NULL;
""")

query_params = {"cluster_id": cluster_id}
with engine.connect() as conn:
    df_interested = pd.read_sql(query, conn, params=query_params)

# Number of tickets found for the selected cluster id
print(len(df_interested))


In [None]:
# Display the tickets (ticket_id, keywords) returned by the cluster query above.
df_interested

In [None]:
# Preview the first rows of the embeddings frame, which now carries the
# `cluster_id` column assigned in the clustering cell.
df.head()

In [None]:
# Find representatives of each cluster
# Inputs: the clustered frame `df` (with its `cluster_id` column) and the reduced embeddings.
# NOTE(review): `top_n=3` presumably caps the representatives kept per cluster —
# confirm against get_cluster_representatives.
representatives = get_cluster_representatives(df, X_reduced, top_n=3)


In [None]:
import os
import pickle

# Make sure the target folder is there before writing (no error if it already exists).
os.makedirs("../models", exist_ok=True)  # adjust path relative to your notebook

# Pickle the fitted HDBSCAN clusterer first, then the UMAP reducer, each to its
# own file under the models folder.
for model_path, model_obj in (
    ("../models/hdbscan_model.pkl", clusterer),
    ("../models/umap_reducer.pkl", reducer),
):
    with open(model_path, "wb") as f:
        pickle.dump(model_obj, f)

print("Models saved successfully under '../models/'")


In [None]:
# Generate natural language cluster labels
# Inputs: the clustered frame `df` and the per-cluster `representatives` found earlier.
# NOTE(review): `max_tickets=3` presumably limits how many tickets per cluster are
# shown to the LLM — confirm against label_clusters_llm.
cluster_labels = label_clusters_llm(df, representatives, max_tickets=3)


In [None]:
# Save cluster labels to DB
# NOTE(review): a later cell also passes `cluster_labels` to save_cluster_labels_to_table;
# how the two savers differ is not visible from this notebook — confirm both are needed.
save_cluster_labels(cluster_labels)

In [None]:
# Save to Postgres
# Hands the generated `cluster_labels` to the table-writing helper.
# NOTE(review): the target table and write semantics live in
# src.clustering_utils_hdbscan — confirm whether re-running this cell is safe.
save_cluster_labels_to_table(cluster_labels)
print("Cluster labels saved to Postgres successfully.")