In [5]:
import asyncio
import json
import random
from datetime import datetime
from typing import Any, Dict, List, Optional

import httpx
from tqdm.asyncio import tqdm_asyncio

In [8]:
def read_jsonl(filepath: str) -> List[dict]:
    """Load a JSON Lines file into a list of parsed records.

    The file is always decoded as UTF-8 instead of the platform's default
    encoding, so records containing non-ASCII text load the same way on every
    machine. Lines that contain only whitespace (for example a trailing blank
    line) are skipped rather than handed to the JSON parser.

    Args:
        filepath: Path of the JSON Lines file to read.

    Returns:
        One parsed JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(filepath, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

def write_jsonl(data: List[dict], filepath: str):
    """Write records to a file in JSON Lines format, one JSON document per line.

    Records are serialized with ``ensure_ascii=False``, so non-ASCII characters
    are written verbatim. The file is therefore opened explicitly as UTF-8;
    relying on the platform default encoding could raise ``UnicodeEncodeError``
    or produce a file that other tools cannot decode.

    Args:
        data: Records to serialize; each must be JSON-serializable.
        filepath: Destination path. An existing file is overwritten.
    """
    with open(filepath, "w", encoding="utf-8") as f:
        for record in data:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")

In [9]:
# Location of the exported traces file (JSON Lines) that the rest of the notebook works from.
LOGS_EXPORT_PATH = "/Users/vinhnguyen/Projects/ext-chatbot/resources/logs/exports_1766646264633-lf-traces-export-cmio1vjfs000ynv0795mpsunw.jsonl"

logs_data = read_jsonl(LOGS_EXPORT_PATH)

In [10]:

def _trace_time(trace):
    """Parse a trace's ISO-8601 ``timestamp``, treating a trailing ``Z`` as UTC."""
    return datetime.fromisoformat(trace['timestamp'].replace('Z', '+00:00'))


# Bucket the "guru"-tagged traces by session. Every session id seen in the
# export gets a bucket, even when none of its traces carries the tag.
group_by_sessionId = {}
for trace in logs_data:
    bucket = group_by_sessionId.setdefault(trace["sessionId"], [])
    if "guru" in trace["tags"]:
        bucket.append(trace)

# Order each session's traces chronologically (stable sort, oldest first).
for session_traces in group_by_sessionId.values():
    session_traces.sort(key=_trace_time)

print(len(group_by_sessionId))


23


In [11]:
def _rebuild_history(traces):
    """Stitch together the message history recorded on a session's traces.

    Walks backwards from the newest trace. Each visited trace's
    ``metadata["history"]`` (a JSON-encoded list of messages) is prepended to
    the result, then the walk jumps back ``len(history) // 2`` traces.
    NOTE(review): this presumes a history holds user/assistant pairs, one pair
    per earlier trace - confirm against the service that writes the traces.

    The walk ends when it reaches the first trace, or as soon as a history is
    too short (fewer than two messages) to move any further back. In the
    latter case the index where it got stuck is printed as a diagnostic.
    """
    messages = []
    current_ind = len(traces) - 1
    while True:
        current_history = json.loads(traces[current_ind]["metadata"]["history"])
        messages = current_history + messages
        step = len(current_history) // 2
        if step == 0:
            # No progress is possible from here; stop instead of re-reading
            # the same trace over and over.
            if current_ind > 0:
                print(current_ind)
            break
        current_ind -= step
        if current_ind <= 0:
            break
    return messages


def _merge_consecutive_turns(messages):
    """Collapse runs of same-role messages into a single message per run.

    Within a run, a message whose content equals the message right before it
    is dropped as a repeat; any other message has its content appended to the
    turn accumulated so far (plain concatenation, no separator). Input message
    dicts are never mutated.
    """
    merged = []
    for ind, message in enumerate(messages):
        if ind == 0 or message["role"] != messages[ind - 1]["role"]:
            merged.append(message)
        elif message["content"] != messages[ind - 1]["content"]:
            # Extend the turn built so far (not just the previous raw
            # message) so runs of three or more keep all of their content.
            merged[-1] = {
                "role": message["role"],
                "content": merged[-1]["content"] + message["content"],
            }
    return merged


conversations = []
for sessionId in list(group_by_sessionId.keys()):
    traces = group_by_sessionId[sessionId]
    if len(traces) == 0:
        continue
    conversation = _rebuild_history(traces)

    # The newest trace's own exchange is not part of any recorded history,
    # so add it explicitly.
    last_trace = traces[-1]
    conversation.append({"role": "user", "content": last_trace["input"]})
    try:
        reply = json.loads(last_trace["output"])["user_msg"]
    except (TypeError, ValueError, KeyError):
        # Output missing, not valid JSON, or without "user_msg": keep only
        # the user message.
        pass
    else:
        conversation.append({"role": "assistant", "content": reply})

    conversations.append(_merge_consecutive_turns(conversation))

1
1


In [12]:
# One sample per user message: the message itself plus every message that
# precedes it in the same conversation.
test_dataset = [
    {
        "conversation_history": turns[:position],
        "current_message": message["content"],
    }
    for turns in conversations
    for position, message in enumerate(turns)
    if message["role"] == "user"
]

print(len(test_dataset))

181


In [13]:
async def call_route_api(
    current_message: str,
    conversation_history: Optional[List[Dict[str, str]]] = None,
    base_url: str = "http://localhost:8000",
    timeout: float = 3000.0,
    client: Optional["httpx.AsyncClient"] = None,
) -> str:
    """
    Asynchronously call the route API endpoint.

    Args:
        current_message: Current user message to process
        conversation_history: Optional conversation history in format
                            [{'role': 'user'|'assistant', 'content': '...'}]
        base_url: Base URL of the API server (default: http://localhost:8000).
                  A trailing slash is tolerated.
        timeout: Timeout in seconds for the client this function creates
                 (default: 3000.0). Ignored when ``client`` is supplied; a
                 caller-provided client keeps its own configuration.
        client: Optional existing ``httpx.AsyncClient`` to send the request
                with, so many calls can share one connection pool. It is not
                closed by this function. When omitted, a client is created for
                this single call and closed afterwards.

    Returns:
        The data_source string returned by the API (e.g., 'sql', 'vector', or 'none')

    Raises:
        httpx.HTTPError: If the API request fails
        KeyError: If the response JSON has no "data_source" field
    """
    # Strip a trailing slash so "http://host/" does not produce "http://host//route".
    url = f"{base_url.rstrip('/')}/route"
    payload = {
        "current_message": current_message,
        "conversation_history": conversation_history
    }

    async def _post(http_client) -> str:
        response = await http_client.post(url, json=payload)
        response.raise_for_status()
        result = response.json()
        return result["data_source"]

    if client is not None:
        return await _post(client)

    async with httpx.AsyncClient(timeout=timeout) as own_client:
        return await _post(own_client)


async def call_route_api_batch(
    requests: List[Dict[str, Any]],
    base_url: str = "http://localhost:8000",
    max_concurrent: int = 10
) -> List[str]:
    """
    Asynchronously call the route API endpoint for multiple requests in batch.

    Args:
        requests: List of dicts with 'current_message' and optional 'conversation_history'
        base_url: Base URL of the API server (default: http://localhost:8000)
        max_concurrent: Maximum number of concurrent requests (default: 10).
                        Must be at least 1.

    Returns:
        List of data_source strings in the same order as input requests

    Raises:
        ValueError: If max_concurrent is smaller than 1
    """
    # A semaphore created with 0 permits can never be acquired, so every
    # request would wait forever; reject that up front.
    if max_concurrent < 1:
        raise ValueError(f"max_concurrent must be >= 1, got {max_concurrent}")

    semaphore = asyncio.Semaphore(max_concurrent)

    async def call_with_semaphore(request_data):
        # Hold a permit for the duration of the request to cap concurrency.
        async with semaphore:
            return await call_route_api(
                current_message=request_data["current_message"],
                conversation_history=request_data.get("conversation_history"),
                base_url=base_url
            )

    tasks = [call_with_semaphore(req) for req in requests]
    return await tqdm_asyncio.gather(*tasks)

In [None]:
route_results = await call_route_api_batch(
    test_dataset,
    base_url="http://10.164.84.47:8321",
    max_concurrent=10
)

filter_test_dataset = []
for route_result, sample in zip(route_results, test_dataset):
    if route_result == "sql":
        filter_test_dataset.append(sample)
print(len(filter_test_dataset))

100%|██████████| 181/181 [03:21<00:00,  1.11s/it]


In [134]:
# Export at most 200 of the samples routed to "sql". A dedicated RNG with a
# fixed seed keeps the selection (and its order) identical from run to run
# for the same input, without touching the global random state.
sample_rng = random.Random(42)
write_jsonl(
    sample_rng.sample(filter_test_dataset, min(200, len(filter_test_dataset))),
    "/Users/vinhnguyen/Projects/ext-chatbot/resources/logs/batdongsan_test.jsonl"
)