In this notebook, we'll generate a synthetic dataset of questions which we'll use `kura` to reconstruct. This way we can know with confidence that our hierachal topic modelling is working as intended

In [1]:
# First let's load in our categories
import yaml

with open("kura_categories.yml", "r") as f:
    categories = yaml.safe_load(f)

# Now let's do a quick count of the categoreis and subcategories
print(len(categories))
for category in categories:
    print(f"{category}: {len(categories[category])}")

8
Account & Eligibility: 4
Payment Scheduling & Management: 4
Fees, Interest & Penalties: 4
Order & Merchant Issues: 3
Disputes & Fraud: 3
Technical Support: 4
General Inquiries: 4
Emotional/Urgent Cases: 4


With approximately 8 categories, we'll generate 20 synthetic conversations for each and then see how our model is able to reconstruct them

In [6]:
import instructor
from typing import Literal
from pydantic import BaseModel

client = instructor.from_provider("openai/gpt-4.1-mini",async_client=True)


# Now let's define our data model
class Message(BaseModel):
    role: Literal["user", "assistant"]
    content: str

class GeneratedConversation(BaseModel):
    conversation: list[Message]

class Conversation(BaseModel):
    messages: list[Message]
    category: str
    subcategory: str


Now let's generate our first few conversations for our synthetic dataset. We'll sample from the top level categories and then the subcategories with equal probability.

We'll generate around 5-10 per category first before using these generated examples as few shot examples down the line.

In [3]:
import random
from rich import print


class Subcategory(BaseModel):
    title: str
    description: str
    sample_questions: list[str]


class Config(BaseModel):
    category: str
    subcategory: Subcategory
    sample_conversations: list[Conversation]

    @classmethod
    def get_random_config(cls, config_dict: dict):
        category = random.choice(list(config_dict.keys()))
        subcategory = random.choice(list(config_dict[category].keys()))
        return cls(
            category=category,
            subcategory=Subcategory(**config_dict[category][subcategory]),
            sample_conversations=[],
        )


print(Config.get_random_config(categories))

In [7]:
async def generate_conversation(client:instructor.AsyncInstructor,config:Config):
    conversation = await client.chat.completions.create(
        messages=[
            {
                "role":"system",
                "content":f"""
                Generate a conversation between a customer and a BNPL service representative that covers the following category and subcategory:
                Category: {config.category}
                Subcategory: {config.subcategory.title}

                Here are some examples of questions that the customer might ask:
                {config.subcategory.sample_questions}

                make sure to start your conversation with an appropriate greeting by the assistant and then have the user start the conversation up with the questions. Simulate the conversation with a natural flow and make sure the user's question is resolved in the conversation.

                Make sure to resolve the conversation in {random.randint(5,12)} messages and have the conversation end with the user's query being resolved or the user thanking the assistant and ending the conversation.
                """
            }
        ],
        response_model=GeneratedConversation,
    )

    return Conversation(
        messages=conversation.conversation,
        category=config.category,
        subcategory=config.subcategory.title,
    )

Now let's generate around 10 initial conversations like this

In [8]:
from tqdm.asyncio import tqdm_asyncio

coros = [generate_conversation(client, Config.get_random_config(categories)) for _ in range(2)]

conversations = await tqdm_asyncio.gather(*coros)


100%|██████████| 2/2 [00:17<00:00,  8.91s/it]


We'll also do a quick count of the categories and subcategories breakdown too

In [9]:
from collections import Counter

category_counter = Counter([conversation.category for conversation in conversations])
print(category_counter)

Now let's see the breakdown for each subcategory under each category

In [10]:
conversations[0].category,conversations[0].subcategory

('Emotional/Urgent Cases', 'Urgent Fraud or Security Alert')

In [11]:
category_to_subcategory = {}
for category in categories:
    category_to_subcategory[category] = []
    for subcategory in categories[category]:
        category_to_subcategory[category].append(categories[category][subcategory]["title"])


subcategory_counter = Counter()
for conversation in conversations:
    subcategory_counter[conversation.subcategory] += 1

subcategory_counter
for category in category_to_subcategory:
    print(category)
    for subcategory in category_to_subcategory[category]:
        if subcategory in subcategory_counter:
            print(f"  {subcategory}: {subcategory_counter[subcategory]}")

In [24]:
import os

os.makedirs("./kura_data", exist_ok=True)
with open("./kura_data/synthetic_conversations.json", "w") as f:
    for conversation in conversations:
        f.write(conversation.model_dump_json() + "\n")


Now let's generate 100 of these conversations, providing sample conversations from the same subcategory category combination as examples

In [43]:
with open("./kura_data/synthetic_conversations.json", "r") as f:
    conversations = [Conversation.model_validate_json(line) for line in f]

In [50]:
async def generate_conversation(client:instructor.AsyncInstructor,config:Config, examples:list[Conversation]):
    valid_examples = [example for example in examples if example.subcategory == config.subcategory.title and example.category == config.category]
    conversation = await client.chat.completions.create(
        messages=[
            {
                "role":"system",
                "content":"""
                Generate a conversation between a customer and a BNPL service representative that covers the following category and subcategory:
                Category: {{ config.category }}
                Subcategory: {{ config.subcategory.title }}

                Here are some examples of questions that the customer might ask:
                {{ config.subcategory.sample_questions }}

                make sure to start your conversation with an appropriate greeting by the assistant and then have the user start the conversation up with the questions. Simulate the conversation with a natural flow and make sure the user's question is resolved in the conversation.

                Make sure to resolve the conversation in {{ num_messages }} messages and have the conversation end with the user's query being resolved or the user thanking the assistant and ending the conversation.

                {% if examples | length > 0 %}
                Here are some examples of conversations that cover the same category and subcategory. Use them as reference but do not copy the exact wording or structure of the conversation. Add extra details, use fake names, etc.
                {% for example in examples %}
                {{ example.messages | join("\n") }}
                {% endfor %}
                {% endif %}
                """
            }
        ],
        context={
            "config":config,
            "examples":valid_examples,
            "num_messages":random.randint(5,15)
        },
        response_model=GeneratedConversation,
    )

    return Conversation(
        messages=conversation.conversation,
        category=config.category,
        subcategory=config.subcategory.title,
    )

In [52]:
print(await generate_conversation(client, Config.get_random_config(categories), conversations))

Now let's run this and generate 150 conversations and we can see what kura returns. We'll generate 10 on each iteration and update our list of examples on each round

In [54]:
total_conversations = 150
for i in range(total_conversations // 10):
    with open("./kura_data/synthetic_conversations.json", "r") as f:
        examples = [Conversation.model_validate_json(line) for line in f]
    coros = [generate_conversation(client, Config.get_random_config(categories), examples) for _ in range(10)]
    conversations = await tqdm_asyncio.gather(*coros)
    
    with open("./kura_data/synthetic_conversations.json", "a") as f:
        for conversation in conversations:
            f.write(conversation.model_dump_json() + "\n")

100%|██████████| 10/10 [00:16<00:00,  1.65s/it]
100%|██████████| 10/10 [00:08<00:00,  1.13it/s]
100%|██████████| 10/10 [00:10<00:00,  1.01s/it]
100%|██████████| 10/10 [00:13<00:00,  1.30s/it]
100%|██████████| 10/10 [00:20<00:00,  2.07s/it]
100%|██████████| 10/10 [00:17<00:00,  1.75s/it]
100%|██████████| 10/10 [00:14<00:00,  1.47s/it]
100%|██████████| 10/10 [00:11<00:00,  1.17s/it]
100%|██████████| 10/10 [00:17<00:00,  1.73s/it]
100%|██████████| 10/10 [00:22<00:00,  2.25s/it]
100%|██████████| 10/10 [00:26<00:00,  2.67s/it]
100%|██████████| 10/10 [00:13<00:00,  1.40s/it]
100%|██████████| 10/10 [00:10<00:00,  1.08s/it]
100%|██████████| 10/10 [00:19<00:00,  1.91s/it]
100%|██████████| 10/10 [00:13<00:00,  1.37s/it]


Generating Clusters

In [14]:
from kura import Kura

kura = Kura()

In [12]:
from kura.types import Conversation as KuraConversation
from datetime import timedelta, datetime
import uuid
import json

def process_messages(row: Conversation):
    return [
        {
            "role": message.role,
            "content": message.content,
            "created_at": datetime.now() + timedelta(minutes=5 * i),
        }
        for i, message in enumerate(row.messages)
    ]

conversations = [Conversation(**json.loads(row)) for row in open("./kura_data/synthetic_conversations.json")]
conversations = [
    KuraConversation(
        chat_id=str(uuid.uuid4()),
        messages=process_messages(conversation),
        created_at=datetime.now(),
        metadata={
            "turns": len(conversation.messages),
            "category":conversation.category,
            "subcategory":conversation.subcategory,
        }
    )
    for conversation in conversations
]

In [3]:
import instructor
from pydantic import BaseModel

client = instructor.from_provider("google/gemini-2.0-flash")

class User(BaseModel):
    name: str
    age: int

resp = client.chat.completions.create(
    messages=[
        {
            "role":"user",
            "content":"Ivan is 27"
        }
    ],
    response_model=User
)

print(resp)



name='Ivan' age=27


In [15]:
await kura.cluster_conversations(conversations)

  0%|          | 0/2 [01:27<?, ?it/s]
Task exception was never retrieved
future: <Task finished name='Task-7' coro=<tqdm_asyncio.gather.<locals>.wrap_awaitable() done, defined at /Users/ivanleo/Documents/coding/systematically-improving-rag/cohort_2/.venv/lib/python3.9/site-packages/tqdm/asyncio.py:75> exception=TypeError("object GeneratedConversation can't be used in 'await' expression")>
Traceback (most recent call last):
  File "/Users/ivanleo/Documents/coding/systematically-improving-rag/cohort_2/.venv/lib/python3.9/site-packages/tqdm/asyncio.py", line 76, in wrap_awaitable
    return i, await f
  File "/var/folders/ws/q_m6c6qs3n553603dk_zvrgc0000gn/T/ipykernel_7695/1021393176.py", line 2, in generate_conversation
    conversation = await client.chat.completions.create(
TypeError: object GeneratedConversation can't be used in 'await' expression
Summarising 180 conversations: 100%|██████████| 180/180 [00:06<00:00, 28.76it/s]
Embedding Summaries: 100%|██████████| 180/180 [00:10<00:00,

Starting with 18 clusters


Embedding Clusters: 100%|██████████| 18/18 [00:00<00:00, 18.93it/s]
Generating Meta Clusters: 100%|██████████| 2/2 [00:04<00:00,  2.26s/it]


Reduced to 9 clusters




[ProjectedCluster(id='04de04f64260462eb3ba51ac02d0a632', name='Troubleshoot crashing app; escalate to tech support', description='Users experienced app crashes or loading issues, seeking troubleshooting, reinstallation, and escalation to the tech team for resolution and updates', chat_ids=['b00fc2ae-9f3e-49a1-9681-dcd96e8dccf5', '6be7501a-4800-40d5-b7a4-c6a4f833bb2b', '1ea30058-e254-4142-ad91-9787ea330230', '146d107c-72b3-45b0-8055-af88115a5f5b', 'dc0679be-affc-4db8-8e2f-d76ea663c838', '1236ad71-219f-49cd-a526-fefc02247dd5', 'd12aee27-60c1-4209-825a-3f88431a8cc8', 'a31ecdc3-3140-444e-a13e-29188ebb467a'], parent_id=None, x_coord=11.178121566772461, y_coord=6.572209358215332, level=0, count=8),
 ProjectedCluster(id='f7f2454c36fc4ae9bb0d36630145d95c', name='Troubleshoot a crashing app and escalate to tech team', description='Users experienced an app that was crashing or stuck on the loading screen. They sought troubleshooting assistance, tried reinstalling the app, and requested escalatio

In [16]:
kura.visualise_clusters()

Clusters (180 conversations)
╠══ Troubleshoot crashing app; escalate to tech support (8 conversations)
║   ╚══ Troubleshoot a crashing app and escalate to tech team (8 conversations)
╠══ Help me troubleshoot and secure my account (16 conversations)
║   ╠══ Troubleshoot locked account and password resets (3 conversations)
║   ╠══ Troubleshoot and receive one-time passcodes via alternative methods. (6 conversations)
║   ╚══ Secure a potentially compromised account immediately (7 conversations)
╠══ Manage Buy Now, Pay Later Payment Options (15 conversations)
║   ╠══ Inquire about Payment Schedules and Payment Date Changes (7 conversations)
║   ╚══ Manage payment methods for a buy now, pay later service. (8 conversations)
╠══ Troubleshoot buy now, pay later order and payment issues (16 conversations)
║   ╠══ Troubleshoot issues with BNPL orders and payments (10 conversations)
║   ╚══ Troubleshoot buy now pay later checkout failures (6 conversations)
╠══ Determine eligibility and raise limi

use the kura cli to visualise

In [19]:
conversations = [Conversation(**json.loads(row)) for row in open("./kura_data/synthetic_conversations.json")]

In [None]:
category_to_subcategory = {}
for category in categories:
    category_to_subcategory[category] = []
    for subcategory in categories[category]:
        category_to_subcategory[category].append(categories[category][subcategory]["title"])


subcategory_counter = Counter()
for conversation in conversations:
    subcategory_counter[conversation.subcategory] += 1

subcategory_counter
for category in category_to_subcategory:
    print(category)
    for subcategory in category_to_subcategory[category]:
        if subcategory in subcategory_counter:
            subcategory_data = [item for item in categories[category].items() if item[1]["title"] == subcategory][0]
            print(f"  {subcategory}: {subcategory_counter[subcategory]} ({subcategory_data[1]['description']})")

TypeError: tuple indices must be integers or slices, not str