In [1]:
# Add the project root to sys.path so that `src.*` modules can be imported.
# The root is resolved as the parent of the current working directory, so this
# assumes the kernel was started from the notebook's own folder.
import sys
from pathlib import Path

project_root = str(Path("../").resolve())
# Only append once: re-running this cell must not keep growing sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
# Imports
from src.clustering_utils_hdbscan import (
    fetch_embeddings,
    reduce_dimensions,
    cluster_embeddings,
    save_clusters,
    soft_assign_noise,
    get_cluster_representatives,
    label_clusters_llm,
    save_cluster_labels,
    save_cluster_labels_to_table
)

import numpy as np
import pandas as pd



In [3]:
# Load the ticket embeddings to cluster.
# NOTE(review): limit=None presumably means "no row cap" — confirm in fetch_embeddings.
df = fetch_embeddings(limit=None)
n_fetched = len(df)
print(f"Fetched {n_fetched} ticket embeddings for clustering")

Fetched 49800 ticket embeddings for clustering


In [4]:
# Turn the "embedding" column into a single numpy array, the input for UMAP.
embedding_rows = df["embedding"].tolist()
X = np.array(embedding_rows)

In [5]:
# Dimensionality reduction with UMAP ahead of clustering.
#   n_components: size of the reduced embedding (10–50 is typical; 50 used here)
#   n_neighbors:  local-structure sensitivity (10–30 is typical; 30 used here)
# NOTE(review): no seed is passed from this cell — confirm whether
# reduce_dimensions fixes a random_state internally; otherwise reruns may differ.
umap_params = {"n_components": 50, "n_neighbors": 30}
X_reduced, reducer = reduce_dimensions(X, **umap_params)
print(f"Reduced embeddings shape: {X_reduced.shape}")



Reduced embeddings shape: (49800, 50)


In [6]:
# Cluster the reduced embeddings using HDBSCAN.
# min_cluster_size: smaller = more fine-grained clusters; 50 is used here.
labels, clusterer = cluster_embeddings(X_reduced, min_cluster_size=50)

# Soft-assign noise points that are above a similarity threshold to the
# nearest cluster centers, then store the resulting labels on the frame.
labels_soft = soft_assign_noise(X_reduced, labels, similarity_threshold=0.999)
df["cluster_id"] = labels_soft

# Count clusters on the labels that are actually stored (labels_soft), and
# exclude the noise label -1 based on those same labels. Checking the
# pre-assignment `labels` instead would undercount by one whenever soft
# assignment absorbs every noise point.
n_clusters = len(set(labels_soft) - {-1})
print(f"Found {n_clusters} clusters")





Found 113 clusters


In [7]:
# Write each ticket's cluster assignment back to Postgres.
ticket_clusters = df[["ticket_id", "cluster_id"]]
save_clusters(ticket_clusters)
print("Cluster labels saved to Postgres")



Cluster labels saved to Postgres


In [8]:
# Inspect which tickets are still labelled as noise after saving.
from sqlalchemy import create_engine, text
import pandas as pd
from src.config import DB_URL

# Tickets (with non-null keywords) whose stored cluster_id matches the bound
# :cluster_id parameter.
query = text("""
    SELECT 
        p.ticket_id,
        p.keywords
    FROM 
        ticket_preprocessed p
    JOIN 
        ticket_embeddings e 
    ON 
        p.ticket_id = e.ticket_id
    WHERE 
        e.cluster_id = :cluster_id
        AND p.keywords IS NOT NULL;
""")

cluster_id = -1  # noise points
query_params = {"cluster_id": cluster_id}

engine = create_engine(DB_URL)
with engine.connect() as connection:
    df_interested = pd.read_sql(query, connection, params=query_params)

# Number of matching tickets
print(len(df_interested))


2


In [9]:
# Display the tickets returned by the noise-cluster (cluster_id = -1) query above.
df_interested

Unnamed: 0,ticket_id,keywords
0,TKT-530739,"pleånbok verification fails card, pleånbok ver..."
1,TKT-546365,"vrifming iwallet fails card, iwallet fails car..."


In [10]:
# Preview the first rows of the working frame, which now carries the cluster_id column.
df.head()

Unnamed: 0,ticket_id,embedding,combined_text,cluster_id
0,TKT-527279,"[-0.047022976, -0.028998813, -0.030356936, -0....","Requsting new card — chip not working|| , I'm ...",28
1,TKT-519142,"[-0.046972826, 0.006419635, 0.0408308, -0.0183...",Requesting new card — chip not working (URGENT...,28
2,TKT-501921,"[-0.053356387, 0.016575871, 0.021287508, -0.05...","New card — chip not working|| , , I'm [PERSON]...",28
3,TKT-501704,"[-0.082989596, -0.012467751, 0.028746905, -0.0...","Requesting new card — chip not working|| , , I...",28
4,TKT-501783,"[-0.061868295, 0.0070742546, 0.009118566, -0.0...","Requesting new card — chip not working|| , I'm...",28


In [11]:
# Select representative tickets for every cluster (top_n=3).
top_n_representatives = 3
representatives = get_cluster_representatives(
    df, X_reduced, top_n=top_n_representatives
)


In [12]:
import os
import pickle

# Folder for the fitted models, relative to the notebook's working directory.
models_dir = "../models"
os.makedirs(models_dir, exist_ok=True)

# Pickle the fitted HDBSCAN clusterer and the UMAP reducer, one file each.
model_files = {
    "hdbscan_model.pkl": clusterer,
    "umap_reducer.pkl": reducer,
}
for file_name, fitted_model in model_files.items():
    with open(f"{models_dir}/{file_name}", "wb") as f:
        pickle.dump(fitted_model, f)

print("Models saved successfully under '../models/'")


Models saved successfully under '../models/'


In [13]:
# Produce a natural-language label for each cluster from its representatives.
label_max_tickets = 3
cluster_labels = label_clusters_llm(
    df, representatives, max_tickets=label_max_tickets
)


Cluster 28:  Chip Card Not Working (Blocking Checkout)
Cluster 5:  Spending Limit Update Requests
Cluster 45:  Urgent Fee Breakdown Requests
Cluster 44:  Fee Breakdown/Invoice Issue
Cluster 3:  SDK Error after App Update
Cluster 11:  Blocked Transactions (MCC Categories)
Cluster 26:  Tokenization Failure in Wallet
Cluster 18:  Webhook Delivery Issues (Urgent)
Cluster 49:  Declined Transactions (Urgent)
Cluster 48:  Urgent Payment Rejections (No Cause)
Cluster 33:  Urgent Card Issue
Cluster 68:  Card Replacement Requests (Urgent)
Cluster 89:  Card Payment Error (Urgent)
Cluster 91:  Card Payment Error (Urgent)
Cluster 63:  Wallet Verification Failure
Cluster 55:  Wallet Verification Failure (Urgent)
Cluster 64:  Wallet Verification Failure (Urgent)
Cluster 100:  Card Loss and Urgent Assistance
Cluster 99:  Lost Card Issue
Cluster 67:  Payment Issue (Compensation Required)
Cluster 47:  Stolen Subway Card Issues
Cluster 108:  3DS Code Delivery Issue
Cluster 109:  3DS Code Delivery Issue (

In [14]:
# Save cluster labels to DB
# NOTE(review): the exact destination is not visible from this notebook; the
# following cell also persists the same labels via save_cluster_labels_to_table —
# confirm that both writes are intended.
save_cluster_labels(cluster_labels)

In [15]:
# Save the generated cluster labels to Postgres via the table-writing helper,
# then print a confirmation message once the call returns.
save_cluster_labels_to_table(cluster_labels)
print("Cluster labels saved to Postgres successfully.")

Cluster labels saved to Postgres successfully.
