In [11]:
from stream_router import StreamRouter

# Import training updates and test prompts from the external file.
from ideation import TRAIN_UPDATES, TEST_PROMPTS

router = StreamRouter([], embedding_dim=8, learning_rate=0.1, min_samples=12)

for i in range(0, len(TRAIN_UPDATES), 9):
    batch = TRAIN_UPDATES[i:i+9]
    if batch:  # Only process if we have examples
        agent = batch[0][1]  # Get agent name from first example
        prompts = [p[0] for p in batch]  # Extract just the prompts
        # print(agent, prompts)
        router.add_agent(agent, prompts)

for prompt, agent in TRAIN_UPDATES:
    router.update(prompt, agent)
print(f"Total clusters formed after training update: {len(router.clusters)}")

# Debug: Print cluster details.
# router.debug_clusters()

# Execute test inferences using the imported test prompts.
print("\nTest Inference Results:")
for i, prompt in enumerate(TEST_PROMPTS, start=1):
    predicted_agent = router.inference(prompt)
    print(f"Test Prompt {i}: \"{prompt}\"")
    print(f"Predicted Agent: {predicted_agent}\n")


Total clusters formed after training update: 81

Test Inference Results:
Test Prompt 1: "I have an issue with Workday. My hours weren't updated properly."
Predicted Agent: HR Agent

Test Prompt 2: "Generate a Python script to fetch data from a REST API and store it in a database."
Predicted Agent: Code Generation Agent

Test Prompt 3: "Find the latest security vulnerabilities in Python libraries we use."
Predicted Agent: Web Search Agent

Test Prompt 4: "Help me troubleshoot why my SaaS account is locked."
Predicted Agent: Customer Service Agent

Test Prompt 5: "Retrieve the contact details of all employees in the marketing department."
Predicted Agent: Database Agent

Test Prompt 6: "Schedule a meeting with the engineering and marketing teams next Wednesday at 10 AM and send out invites."
Predicted Agent: Executive Assistant Agent

Test Prompt 7: "Review the terms of service for our new SaaS product and ensure compliance with GDPR and CCPA."
Predicted Agent: Legal Agent

Test Prompt 8

In [12]:
import pickle
# Save essential components of the router
router_state = {
    'agents': router.agents,
    'clusters': router.clusters,
    'agent_embeddings': router.agent_embeddings,
    'embedding_dim': router.embedding_dim,
    'learning_rate': router.learning_rate,
    'min_samples': router.min_samples,
}

with open('router_state_12.pkl', 'wb') as f:
    pickle.dump(router_state, f)

print("Router state saved successfully to router_state.pkl")

Router state saved successfully to router_state.pkl


In [9]:
# Measure cosine similarity between cluster embeddings and agent embeddings
print("\nCluster-Agent Embedding Similarities:")
print("-" * 80)

for i, cluster in enumerate(router.clusters):
    if cluster["embedding"] is not None:  # Only check clusters with learned embeddings
        # Get all agents used in this cluster
        agents_in_cluster = set(agent for _, agent in cluster["data"])
        
        for agent in agents_in_cluster:
            similarity = router._similarity(
                cluster["embedding"],
                router.agent_embeddings[agent]
            )
            print(f"Cluster {i} <-> {agent}: {similarity:.4f}")



Cluster-Agent Embedding Similarities:
--------------------------------------------------------------------------------
Cluster 0 <-> HR Agent: 0.9869
Cluster 1 <-> Code Generation Agent: 0.9348
Cluster 2 <-> Web Search Agent: 0.9969
Cluster 3 <-> Customer Service Agent: 0.9990
Cluster 4 <-> Code Generation Agent: 0.9867
Cluster 4 <-> Database Agent: 0.7033
Cluster 5 <-> Executive Assistant Agent: 0.9994
Cluster 6 <-> Legal Agent: 0.9951
Cluster 7 <-> Software QA Agent: 0.9974
Cluster 8 <-> Web Automation Agent: 0.9996


In [8]:
# Measure cosine similarity between cluster embeddings and agent embeddings
print("\nCluster-Agent Embedding Similarities:")
print("-" * 80)

for i, cluster in enumerate(router.clusters):
    if cluster["embedding"] is not None:  # Only check clusters with learned embeddings
        # Get all agents used in this cluster
        agents_in_cluster = set(agent for _, agent in cluster["data"])
        
        for agent in agents_in_cluster:
            similarity = router._similarity(
                cluster["embedding"],
                router.agent_embeddings[agent]
            )
            print(f"Cluster {i} <-> {agent}: {similarity:.4f}")



Cluster-Agent Embedding Similarities:
--------------------------------------------------------------------------------
Cluster 0 <-> HR Agent: 0.9869
Cluster 1 <-> Code Generation Agent: 0.9348
Cluster 2 <-> Web Search Agent: 0.9969
Cluster 3 <-> Customer Service Agent: 0.9990
Cluster 4 <-> Code Generation Agent: 0.9867
Cluster 4 <-> Database Agent: 0.7033
Cluster 5 <-> Executive Assistant Agent: 0.9994
Cluster 6 <-> Legal Agent: 0.9951
Cluster 7 <-> Software QA Agent: 0.9974
Cluster 8 <-> Web Automation Agent: 0.9996


In [5]:
# Sample a few random prompts and find their nearest neighbors
import random
from sklearn.neighbors import NearestNeighbors
import numpy as np
# Get embeddings for all training prompts
all_embeddings = []
all_prompts = []
for prompt, _ in TRAIN_UPDATES:
    emb = router._compute_prompt_embedding(prompt)
    all_embeddings.append(emb.numpy())
    all_prompts.append(prompt)
    
all_embeddings = np.array(all_embeddings)

# Initialize KNN
knn = NearestNeighbors(n_neighbors=10, metric="cosine")  # 3 neighbors (including self)
knn.fit(all_embeddings)

# Sample 5 random prompts
sample_indices = random.sample(range(len(all_prompts)), 5)

print("Nearest neighbors analysis:")
print("-" * 80)
for idx in sample_indices:
    query_embedding = all_embeddings[idx].reshape(1, -1)
    distances, indices = knn.kneighbors(query_embedding)
    
    print(f"\nQuery prompt: \"{all_prompts[idx]}\"")
    print("\nNearest neighbors:")
    # Skip first neighbor (self) and show next 2
    for d, i in zip(distances[0][1:], indices[0][1:]):
        print(f"Distance: {d:.3f} - \"{all_prompts[i]}\"")
    print("-" * 80)


Nearest neighbors analysis:
--------------------------------------------------------------------------------

Query prompt: "Explain the security measures in place for protecting my account data."

Nearest neighbors:
Distance: 0.504 - "How do I set up multi-factor authentication for better security?"
Distance: 0.610 - "Identify possible security vulnerabilities and generate security test cases for user authentication."
Distance: 0.639 - "Backup all my important documents and pictures to an external drive."
Distance: 0.680 - "Generate a data privacy policy that aligns with global data protection regulations."
Distance: 0.692 - "Can you help me migrate my data from another platform to your service?"
Distance: 0.696 - "Provide steps to integrate your API with my existing application."
Distance: 0.722 - "How do I update my direct deposit information?"
Distance: 0.725 - "Draft a terms of service and privacy policy for a mobile app, ensuring compliance with App Store and Google Play guidelin

In [4]:
distances, indices = knn.kneighbors(query_embedding)

NameError: name 'knn' is not defined

In [10]:
# Let's explore why we don't have 1 cluster despite alpha < 0
import numpy as np
# 1. Print the actual distances between points to understand clustering behavior
print("Sample distances between points:")
sample_embeddings = []
sample_prompts = TRAIN_UPDATES[5:15]  # Take first 3 training examples
for prompt, _ in sample_prompts:
    emb = router._compute_prompt_embedding(prompt)
    sample_embeddings.append(emb.numpy())

sample_embeddings = np.array(sample_embeddings)
distances = np.linalg.norm(sample_embeddings[:, None] - sample_embeddings, axis=2)
print(distances)

# 2. Print the actual clustering threshold being used
print(f"\nClustering threshold (min_samples * exp(alpha)): {router.min_samples * np.exp(router.alpha)}")

# 3. Check the distribution of pairwise distances
all_embeddings = []
for prompt, _ in TRAIN_UPDATES:
    emb = router._compute_prompt_embedding(prompt)
    all_embeddings.append(emb.numpy())
    
all_embeddings = np.array(all_embeddings)
all_distances = np.linalg.norm(all_embeddings[:, None] - all_embeddings, axis=2)
flat_distances = all_distances[np.triu_indices_from(all_distances, k=1)]

print(f"\nDistance statistics:")
print(f"Min distance: {np.min(flat_distances):.3f}")
print(f"Max distance: {np.max(flat_distances):.3f}")
print(f"Mean distance: {np.mean(flat_distances):.3f}")
print(f"Median distance: {np.median(flat_distances):.3f}")

# This shows that even with alpha < 0, if the actual distances between points
# are larger than the threshold, new clusters will still form


Sample distances between points:
[[0.        1.1991975 1.2099881 1.2253543 1.3701342 1.3525289 1.3651464
  1.3733066 1.3476781 1.3160906]
 [1.1991975 0.        1.2105585 1.2254704 1.3729197 1.2827737 1.3788327
  1.4081657 1.397967  1.3709254]
 [1.2099881 1.2105585 0.        1.2459263 1.2617028 1.2807477 1.3374196
  1.3829098 1.2719167 1.3132303]
 [1.2253543 1.2254704 1.2459263 0.        1.351324  1.3308349 1.3988179
  1.342134  1.3715174 1.3097007]
 [1.3701342 1.3729197 1.2617028 1.351324  0.        1.353681  1.3089576
  1.2183383 1.2172592 1.306563 ]
 [1.3525289 1.2827737 1.2807477 1.3308349 1.353681  0.        1.3540425
  1.3116673 1.3378683 1.3409972]
 [1.3651464 1.3788327 1.3374196 1.3988179 1.3089576 1.3540425 0.
  1.3277911 1.2880034 1.2177488]
 [1.3733066 1.4081657 1.3829098 1.342134  1.2183383 1.3116673 1.3277911
  0.        1.2655433 1.2194403]
 [1.3476781 1.397967  1.2719167 1.3715174 1.2172592 1.3378683 1.2880034
  1.2655433 0.        1.3081747]
 [1.3160906 1.3709254 1.31323

In [4]:
# Calculate average intra-cluster and cross-cluster distances using cosine similarity
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Get embeddings for all training prompts
all_embeddings = []
for prompt, _ in TRAIN_UPDATES:
    emb = router._compute_prompt_embedding(prompt)
    all_embeddings.append(emb.numpy())
all_embeddings = np.array(all_embeddings)

# Split into 5 clusters of 10 prompts each
clusters = []
for i in range(0, 50, 10):
    clusters.append(all_embeddings[i:i+10])

# Calculate average intra-cluster similarities
intra_cluster_sims = []
for cluster in clusters:
    # Get all pairwise similarities within cluster
    sims = cosine_similarity(cluster)
    # Get upper triangle only (excluding diagonal)
    upper_tri = sims[np.triu_indices_from(sims, k=1)]
    if len(upper_tri) > 0:
        intra_cluster_sims.append(np.mean(upper_tri))

avg_intra_sim = np.mean(intra_cluster_sims)
print(f"Average intra-cluster cosine similarity: {avg_intra_sim:.3f}")

# Calculate average cross-cluster similarities
cross_cluster_sims = []
for i in range(len(clusters)):
    for j in range(i+1, len(clusters)):
        sims = cosine_similarity(clusters[i], clusters[j])
        cross_cluster_sims.append(np.mean(sims))

avg_cross_sim = np.mean(cross_cluster_sims)
print(f"Average cross-cluster cosine similarity: {avg_cross_sim:.3f}")


Average intra-cluster cosine similarity: 0.270
Average cross-cluster cosine similarity: 0.174


In [5]:
from sklearn.cluster import KMeans
import numpy as np

# Get embeddings for all training prompts
train_embeddings = []
train_prompts = []
for prompt, _ in TRAIN_UPDATES:
    emb = router._compute_prompt_embedding(prompt)
    train_embeddings.append(emb.numpy())
    train_prompts.append(prompt)

# Convert to numpy array
train_embeddings = np.array(train_embeddings)

# Perform KMeans clustering
kmeans = KMeans(n_clusters=8, random_state=42)
clusters = kmeans.fit_predict(train_embeddings)

# Print prompts in each cluster
for i in range(8):
    print(f"\nCluster {i}:")
    cluster_prompts = [prompt for j, prompt in enumerate(train_prompts) if clusters[j] == i]
    for prompt in cluster_prompts:
        print(f"  - {prompt}")



Cluster 0:
  - Write a Dockerfile for a Node.js application with Express and PostgreSQL.
  - Create a Terraform script to provision an AWS EC2 instance and configure security groups.
  - Generate a Kubernetes deployment YAML file for a Flask web application.
  - Find documentation for the latest version of Kubernetes.

Cluster 1:
  - Create a SQL query to retrieve the top 10 highest-paying customers from our database.
  - Find the total number of employees working in the company and provide a breakdown by department.
  - List all employees who joined the company after January 1, 2023.
  - Get the email addresses of all team leads in the engineering department.
  - Show me the employee with the highest salary in the company.
  - Provide a list of employees along with their job titles and phone numbers.
  - Fetch all employees who report to [Manager Name].
  - Generate a list of employees whose work anniversary is this month.
  - Find employees who have been with the company for more th

In [2]:
len(router.clusters)

32