In [None]:
from langchain_together import ChatTogether
from langchain_core.messages import AnyMessage, SystemMessage, HumanMessage, ToolMessage, AIMessage
from langgraph.graph import StateGraph, END
from langgraph.checkpoint.memory import MemorySaver
from langchain_mcp_adapters.client import MultiServerMCPClient
from typing import TypedDict, Annotated
import operator
import asyncio
import pandas as pd
import json
import ast
import re
from fuzzywuzzy import fuzz
from langchain_fireworks import ChatFireworks

In [None]:
# Define the agent state
class AgentState(TypedDict):
    messages: Annotated[list[AnyMessage], operator.add]
    step: str  # Tracks current step: 'init', 'models', 'domains', 'subdomains', 'metrics', 'done'
    models: list[str]
    domains: list[str]
    subdomains: dict[str, list[str]]
    query: str
    metrics: dict
    intent: str
    context: str
    selected_llm: str

# Initialize memory and MCP client
memory = MemorySaver()
print(f"Checkpointer type: {type(memory)}")

In [None]:
#MCP client
client = MultiServerMCPClient(
    {
        "csv_r": {
            "command": "uv",
            "args": [
                "--directory",
                "/home/sesi/testing/agents/ER/agentic-ai/experiments/HeteroLLMs/mcp_server/",
                "run",
                "main.py"
            ],
            "transport": "stdio",
        }
    }
)

In [None]:
prompt = """You are an AI assistant in a Multi-Agent System (MAS) framework, tasked with dynamically selecting the most suitable LLM for a given task using only the provided CSV dataset tools (Model, Parameters, Domain, Sub-domain, Accuracy, Latency_ms, Memory_mb, etc.). The user query can be in any text format. Follow this workflow:
1. Classify the query intent (e.g., list models, compare models, retrieve metrics).
2. Check available domains and subdomains from the CSV at the start.
3. Dynamically map the query to the closest domain-subdomain pair by comparing query terms to available domains/subdomains, using keyword overlap or similarity, without hardcoded mappings.
4. Use cached data to avoid redundant tool calls.
5. Evaluate metrics (e.g., accuracy for knowledge tasks) to select the best LLM at each step.
6. Normalize accuracy to 0-1 scale if >1.
7. Provide a clear explanation of the selected LLM, including metrics, why it was chosen, and the domain-subdomain mapping, referencing context if relevant.
8. If no data is found, respond: "No specific data found for this query in the CSV. Try a related domain/subdomain or check the query."

Context: {context}

Use only the "csv_r" MCP server tools. Log decisions for debugging. Select the best LLM at each step where metrics are available.
"""

In [23]:
class Agent:
    def __init__(self, model, tools, checkpointer):
        self.tools = {t.name: t for t in tools}
        self.model = model.bind_tools(tools)
        self.graph = self._build_graph(checkpointer)

    def _build_graph(self, checkpointer):
        graph = StateGraph(AgentState)
        graph.add_node("init", self.init_step)
        graph.add_node("list_models", self.list_models_step)
        graph.add_node("list_domains", self.list_domains_step)
        graph.add_node("list_subdomains", self.list_subdomains_step)
        graph.add_node("collect_metrics", self.collect_metrics_step)
        graph.add_node("respond", self.respond_step)
        graph.add_conditional_edges(
            "init",
            lambda state: self.route_step(state),
            {"list_models": "list_models", "list_domains": "list_domains", "collect_metrics": "collect_metrics", "respond": "respond"}
        )
        graph.add_conditional_edges(
            "list_models",
            lambda state: "list_domains" if state["intent"] in ["compare", "metrics"] and not state["domains"] else "collect_metrics",
            {"list_domains": "list_domains", "collect_metrics": "collect_metrics"}
        )
        graph.add_edge("list_domains", "list_subdomains")
        graph.add_edge("list_subdomains", "collect_metrics")
        graph.add_edge("collect_metrics", "respond")
        graph.set_entry_point("init")
        return graph.compile(checkpointer=checkpointer)

    def route_step(self, state: AgentState):
        """Dynamically route based on intent and cached data."""
        intent = state["intent"]
        print(f"Routing: Intent={intent}, Models={state['models']}, Domains={state['domains']}, Subdomains={state['subdomains']}")
        if intent == "list_models" and state["models"]:
            print("Routing: Using cached models, skipping list_models")
            return "collect_metrics" if state["intent"] in ["compare", "metrics"] else "respond"
        elif intent == "list_models":
            return "list_models"
        elif intent in ["compare", "metrics"] and state["models"] and state["domains"] and state["subdomains"]:
            print("Routing: Using cached models, domains, and subdomains")
            return "collect_metrics"
        elif intent in ["compare", "metrics"]:
            return "list_models"
        return "respond"

    async def init_step(self, state: AgentState):
        query = state["messages"][-1].content
        print(f"Initializing with query: {query}")
        intent = self.classify_intent(query)
        print(f"Classified intent: {intent}")

        # Initialize with cached data or fetch from tools
        models = state.get("models", [])
        domains = state.get("domains", [])
        subdomains = state.get("subdomains", {})
        if not models and intent in ["list_models", "compare", "metrics"]:
            models_result = await self.tools["list_models"].ainvoke({})
            models = models_result if isinstance(models_result, list) else []
            print(f"Models retrieved: {models}")
        if not domains and intent in ["compare", "metrics"]:
            domains_result = await self.tools["list_domains"].ainvoke({})
            domains = domains_result if isinstance(domains_result, list) else []
            print(f"Domains retrieved: {domains}")
        if not subdomains and intent in ["compare", "metrics"]:
            subdomains = {}
            for domain in domains:
                subdomains_result = await self.tools["list_sub_domains"].ainvoke({"domain": domain})
                subdomains[domain] = subdomains_result if isinstance(subdomains_result, list) else []
                print(f"Subdomains for {domain}: {subdomains[domain]}")
            print(f"Collected subdomains: {subdomains}")

        context = await self.summarize_context(state["messages"], models, domains, subdomains)
        print(f"Context: {context}")
        return {
            "step": "init",
            "query": query,
            "intent": intent,
            "context": context,
            "models": models,
            "domains": domains,
            "subdomains": subdomains,
            "metrics": {},
            "selected_llm": None,
            "messages": []
        }

    def classify_intent(self, query: str) -> str:
        q = query.lower()
        if re.search(r"\b(models|list|available)\b", q):
            return "list_models"
        if re.search(r"\b(compare|better|best|prefer|efficient|memory|performance)\b", q):
            return "compare"
        if re.search(r"\b(metrics|performance)\b", q):
            return "metrics"
        return "unknown"


    async def summarize_context(self, messages: list[AnyMessage], models: list, domains: list, subdomains: dict) -> str:
        """Summarize previous responses and available data for context."""
        context_lines = [f"Available models: {', '.join(models) if models else 'None'}",
                         f"Available domains: {', '.join(domains) if domains else 'None'}",
                         f"Available subdomains: {subdomains if subdomains else 'None'}"]
        seen_outputs = set()
        for msg in messages[-5:]:
            if isinstance(msg, AIMessage):
                if "Available models" in msg.content:
                    models_match = re.search(r"Available models: (.+)", msg.content)
                    if models_match and models_match.group(1) not in seen_outputs:
                        context_lines.append(f"Models: {models_match.group(1)}")
                        seen_outputs.add(models_match.group(1))
                elif "The best model" in msg.content:
                    best_model = re.search(r"The best model for the query is (\S+) with an average accuracy of (\d+\.\d+)", msg.content)
                    if best_model and best_model.group(0) not in seen_outputs:
                        context_lines.append(f"Best model: {best_model.group(1)} (accuracy: {best_model.group(2)})")
                        seen_outputs.add(best_model.group(0))
                elif "Metrics for" in msg.content:
                    metrics = re.findall(r"Metrics for (\S+):\n(.*?)(?=\n\n|$)", msg.content, re.DOTALL)
                    for metric in metrics:
                        key = metric[0]
                        if key not in seen_outputs:
                            context_lines.append(f"Metrics for {key}: {metric[1].strip()}")
                            seen_outputs.add(key)
            elif isinstance(msg, ToolMessage) and msg.name in ["list_models", "compare_models_domain_subdomain", "get_metric_domain_subdomain"]:
                if msg.content not in seen_outputs:
                    context_lines.append(f"{msg.name}: {msg.content}")
                    seen_outputs.add(msg.content)
        context = "\n".join(context_lines)
        summary_prompt = f"Summarize this context in 100 words or less, focusing on key information (models, best models, metrics, domains, subdomains):\n{context}"
        try:
            summary = await self.model.ainvoke([HumanMessage(content=summary_prompt)])
            return summary.content
        except Exception as e:
            print(f"Error summarizing context: {e}")
            return context[:500] or "No previous context available."

    async def list_models_step(self, state: AgentState):
        try:
            result = await self.tools["list_models"].ainvoke({})
            print(f"Models retrieved: {result}, Type: {type(result)}")
            return {
                "step": "models",
                "models": result if isinstance(result, list) else [],
                "messages": [ToolMessage(
                    tool_call_id="list_models_1",
                    name="list_models",
                    content=str(result)
                )]
            }
        except Exception as e:
            print(f"Error in list_models: {e}")
            return {
                "step": "models",
                "models": [],
                "messages": [ToolMessage(
                    tool_call_id="list_models_1",
                    name="list_models",
                    content=f"Error: {e}"
                )]
            }

    async def list_domains_step(self, state: AgentState):
        try:
            result = await self.tools["list_domains"].ainvoke({})
            print(f"Domains retrieved: {result}, Type: {type(result)}")
            return {
                "step": "domains",
                "domains": result if isinstance(result, list) else [],
                "messages": [ToolMessage(
                    tool_call_id="list_domains_1",
                    name="list_domains",
                    content=str(result)
                )]
            }
        except Exception as e:
            print(f"Error in list_domains: {e}")
            return {
                "step": "domains",
                "domains": [],
                "messages": [ToolMessage(
                    tool_call_id="list_domains_1",
                    name="list_domains",
                    content=f"Error: {e}"
                )]
            }

    async def list_subdomains_step(self, state: AgentState):
        subdomains = {}
        try:
            for domain in state["domains"]:
                result = await self.tools["list_sub_domains"].ainvoke({"domain": domain})
                print(f"Sub-domains for {domain}: {result}, Type: {type(result)}")
                subdomains[domain] = result if isinstance(result, list) else []
            print(f"Collected subdomains: {subdomains}")
            return {
                "step": "subdomains",
                "subdomains": subdomains,
                "messages": [ToolMessage(
                    tool_call_id=f"list_subdomains_{domain}",
                    name="list_sub_domains",
                    content=str(subdomains.get(domain, []))
                ) for domain in subdomains]
            }
        except Exception as e:
            print(f"Error in list_subdomains: {e}")
            return {
                "step": "subdomains",
                "subdomains": {},
                "messages": [ToolMessage(
                    tool_call_id="list_subdomains_error",
                    name="list_sub_domains",
                    content=f"Error: {e}"
                )]
            }

    def normalize_accuracy(self, value):
        """Normalize accuracy to 0-1 scale if needed."""
        if isinstance(value, (int, float)) and value > 1:
            return value / 100
        return value

    async def map_query_to_domain_subdomain_llm(self, query, domains, subdomains):
        # build prompt listing each domain + its subdomains
        dom_list = []
        for d in domains:
            subs = subdomains.get(d, []) or ["None"]
            dom_list.append(f"- {d}: {', '.join(subs)}")
        prompt = f"""
    You are given these domains/subdomains:
    {chr(10).join(dom_list)}
    User query: "{query}"
    Reply with exactly:
    Domain: <one of the domains above>
    Subdomain: <one of that domain’s subdomains, or None>
    """
        resp = await self.model.ainvoke([HumanMessage(content=prompt)])
        dm = re.search(r"Domain:\s*(\S+)", resp.content)
        sd = re.search(r"Subdomain:\s*(\S+)", resp.content)
        domain = dm.group(1) if dm else None
        subdomain = sd.group(1) if sd and sd.group(1) != 'None' else None
        reason = f"LLM mapped to {domain}/{subdomain or 'None'}"
        return domain, subdomain, reason


    def select_best_llm(self, metrics: dict, task_type: str) -> tuple[str, str]:
        """Select the best LLM based on task type and metrics."""
        if not metrics or "comparison" not in metrics:
            return None, "No comparison data available."
        comparison = metrics["comparison"]
        if not isinstance(comparison, dict) or not comparison:
            return None, f"Invalid comparison data: {comparison}"
        best_model = max(comparison.items(), key=lambda x: x[1], default=(None, 0))[0]
        reason = f"Selected {best_model} due to highest accuracy ({comparison[best_model]:.2f}) for {task_type}."
        return best_model, reason

    async def collect_metrics_step(self, state: AgentState):
        query = state["query"].lower()
        intent = state["intent"]
        metrics = {}
        messages = []
        selected_llm = None
        selection_reason = ""
    
# inside collect_metrics_step
        relevant_domain, relevant_subdomain, mapping_reason = await \
            self.map_query_to_domain_subdomain_llm(query, state["domains"], state["subdomains"])

        messages.append(ToolMessage(
            tool_call_id="mapping_decision",
            name="mapping",
            content=mapping_reason
        ))
    
        # 2) Helper to parse & normalize
        def parse_tool_result(res):
            try:
                data = res if isinstance(res, dict) else json.loads(res)
            except Exception:
                try:
                    data = ast.literal_eval(res)
                except Exception:
                    return {}
            if isinstance(data, dict) and "accuracy" in data:
                data["accuracy"] = self.normalize_accuracy(data["accuracy"])
            return data
    
        # 3) Invoke the correct tool based on intent
        try:
            if intent == "list_models":
                metrics["models"] = state["models"]
                messages.append(ToolMessage(
                    tool_call_id="list_models_response",
                    name="list_models",
                    content=str(state["models"])
                ))
    
            elif intent in ["compare", "metrics"]:
                # comparison request
                if re.search(r"\b(better|best|prefer|efficient|memory)\b", query) and domain:
                    if subdomain:
                        cmp_res = await self.tools["compare_models_domain_subdomain"].ainvoke(
                            {"domain": domain, "sub_domain": subdomain}
                        )
                    else:
                        cmp_res = await self.tools["compare_models_domain"].ainvoke(
                            {"domain": domain}
                        )
                    comp = parse_tool_result(cmp_res)
                    if isinstance(comp, dict):
                        comp = {m: self.normalize_accuracy(a) for m, a in comp.items()}
                    metrics["comparison"] = comp
                    messages.append(ToolMessage(
                        tool_call_id=f"compare_{domain}_{subdomain or 'all'}",
                        name=("compare_models_domain_subdomain"
                              if subdomain else "compare_models_domain"),
                        content=str(comp)
                    ))
                    # pick best
                    selected_llm, selection_reason = self.select_best_llm(metrics, subdomain or domain)
    
                # detailed metrics request
                if re.search(r"\bmetrics\b", query) and domain:
                    for m in state["models"]:
                        if subdomain:
                            res = await self.tools["get_metric_domain_subdomain"].ainvoke(
                                {"model": m, "domain": domain, "sub_domain": subdomain}
                            )
                        else:
                            res = await self.tools["get_metric_domain"].ainvoke(
                                {"model": m, "domain": domain}
                            )
                        parsed = parse_tool_result(res)
                        key = f"{m}_{domain}_{subdomain}" if subdomain else f"{m}_{domain}"
                        metrics[key] = parsed
                        messages.append(ToolMessage(
                            tool_call_id=f"metrics_{key}",
                            name=("get_metric_domain_subdomain"
                                  if subdomain else "get_metric_domain"),
                            content=str(parsed)
                        ))
        except Exception as e:
            messages.append(ToolMessage(
                tool_call_id="metrics_error",
                name="metrics",
                content=f"Error: {e}"
            ))
    
        # 4) Build the next state
        return {
            "step": "metrics",
            "metrics": metrics,
            "messages": messages,
            "selected_llm": selected_llm,
            "context": (
                state["context"] +
                (f"\nSelected LLM: {selected_llm} ({selection_reason})" if selected_llm else "")
            )
        }


        def parse_tool_result(result):
            if isinstance(result, dict):
                if "accuracy" in result:
                    result["accuracy"] = self.normalize_accuracy(result["accuracy"])
                return result
            elif isinstance(result, str):
                try:
                    parsed = json.loads(result)
                    if isinstance(parsed, dict) and "accuracy" in parsed:
                        parsed["accuracy"] = self.normalize_accuracy(parsed["accuracy"])
                    return parsed
                except json.JSONDecodeError:
                    try:
                        parsed = ast.literal_eval(result)
                        if isinstance(parsed, dict) and "accuracy" in parsed:
                            parsed["accuracy"] = self.normalize_accuracy(parsed["accuracy"])
                        return parsed
                    except (ValueError, SyntaxError):
                        print(f"Warning: Failed to parse string result: {result}")
                        return {}
            elif isinstance(result, list):
                print(f"Warning: Tool returned a list: {result}")
                return {}
            else:
                print(f"Warning: Unexpected result type: {type(result)}, value: {result}")
                return {}

        try:
            if intent == "list_models":
                metrics["models"] = state["models"]
                messages.append(ToolMessage(
                    tool_call_id="list_models_response",
                    name="list_models",
                    content=str(state["models"])
                ))
            elif intent in ["compare", "metrics"]:
                if "better" in query or "prefer" in query or "best" in query:
                    if relevant_domain and relevant_subdomain:
                        result = await self.tools["compare_models_domain_subdomain"].ainvoke({
                            "domain": relevant_domain,
                            "sub_domain": relevant_subdomain
                        })
                        result = parse_tool_result(result)
                        if isinstance(result, dict):
                            result = {k: self.normalize_accuracy(v) for k, v in result.items()}
                        print(f"Comparison result for {relevant_domain}/{relevant_subdomain}: {result}")
                        metrics["comparison"] = result
                        messages.append(ToolMessage(
                            tool_call_id=f"compare_models_{relevant_domain}_{relevant_subdomain}",
                            name="compare_models_domain_subdomain",
                            content=str(result)
                        ))
                        task_type = relevant_subdomain or relevant_domain or query
                        selected_llm, selection_reason = self.select_best_llm(metrics, task_type)
                    elif relevant_domain:
                        result = await self.tools["compare_models_domain"].ainvoke({"domain": relevant_domain})
                        result = parse_tool_result(result)
                        if isinstance(result, dict):
                            result = {k: self.normalize_accuracy(v) for k, v in result.items()}
                        print(f"Comparison result for {relevant_domain}: {result}")
                        metrics["comparison"] = result
                        messages.append(ToolMessage(
                            tool_call_id=f"compare_models_{relevant_domain}",
                            name="compare_models_domain",
                            content=str(result)
                        ))
                        task_type = relevant_domain or query
                        selected_llm, selection_reason = self.select_best_llm(metrics, task_type)
                if "metrics" in query and relevant_domain:
                    for model in state["models"]:
                        if relevant_subdomain:
                            result = await self.tools["get_metric_domain_subdomain"].ainvoke({
                                "model": model,
                                "domain": relevant_domain,
                                "sub_domain": relevant_subdomain
                            })
                            result = parse_tool_result(result)
                            print(f"Metrics for {model}/{relevant_domain}/{relevant_subdomain}: {result}")
                            metrics[f"{model}_{relevant_domain}_{relevant_subdomain}"] = result
                            messages.append(ToolMessage(
                                tool_call_id=f"metrics_{model}_{relevant_domain}_{relevant_subdomain}",
                                name="get_metric_domain_subdomain",
                                content=str(result)
                            ))
                        else:
                            result = await self.tools["get_metric_domain"].ainvoke({
                                "model": model,
                                "domain": relevant_domain
                            })
                            result = parse_tool_result(result)
                            print(f"Metrics for {model}/{relevant_domain}: {result}")
                            metrics[f"{model}_{relevant_domain}"] = result
                            messages.append(ToolMessage(
                                tool_call_id=f"metrics_{model}_{relevant_domain}",
                                name="get_metric_domain",
                                content=str(result)
                            ))
            else:
                print("Decision: Unknown intent, no metrics collected")
                metrics["unknown"] = {}

        except Exception as e:
            print(f"Error in collect_metrics: {e}")
            messages.append(ToolMessage(
                tool_call_id="metrics_error",
                name="metrics",
                content=f"Error: {e}"
            ))

        print(f"Collected metrics: {metrics}")
        return {
            "step": "metrics",
            "metrics": metrics,
            "messages": messages,
            "selected_llm": selected_llm,
            "context": state["context"] + f"\nSelected LLM: {selected_llm} ({selection_reason})" if selected_llm else state["context"]
        }

    async def respond_step(self, state: AgentState):
        query = state["query"].lower()
        intent = state["intent"]
        metrics = state["metrics"]
        context = state["context"]
        selected_llm = state["selected_llm"]
        print(f"Responding with metrics: {metrics}, Intent: {intent}, Context: {context}, Selected LLM: {selected_llm}")

        response = f"Based on the analysis and context:\nContext: {context}\n\n"
        if intent == "list_models" and "models" in metrics:
            if metrics["models"]:
                response += f"Available models: {', '.join(metrics['models'])}"
            else:
                response += "No models found in the CSV file."
        elif intent in ["compare", "metrics"] and not metrics:
            response += "No specific data found for this query in the CSV. Try a related domain/subdomain or check the query."
        elif intent == "compare" or (intent == "metrics" and ("better" in query or "prefer" in query or "best" in query)):
            comparison = metrics.get("comparison", {})
            if isinstance(comparison, dict) and comparison:
                best_model = max(comparison.items(), key=lambda x: x[1], default=(None, 0))[0]
                if best_model:
                    response += f"The best model for the query is {best_model} with an average accuracy of {comparison[best_model]:.2f}.\n"
                    response += "Model accuracies:\n" + "\n".join(f"{model}: {acc:.2f}" for model, acc in comparison.items()) + "\n"
                    if selected_llm:
                        response += f"Selected LLM: {selected_llm} (chosen for highest accuracy in {query}).\n"
                else:
                    response += f"No valid comparison data found. Received: {comparison}\n"
            # Include metrics if requested
            if "metrics" in query:
                response += "Detailed metrics:\n"
                for key, metric in metrics.items():
                    if "metrics" in key and isinstance(metric, dict) and metric:
                        response += f"{key}:\n" + "\n".join(f"{k}: {v:.2f}" for k, v in metric.items() if isinstance(v, (int, float))) + "\n"
                        if "accuracy" in metric and metric["accuracy"] < 0.1:
                            response += f"Warning: Low accuracy ({metric['accuracy']:.2f}) for {key}. Verify CSV data.\n"
                    elif "metrics" in key:
                        response += f"No metrics data found for {key}. Received: {metric}\n"
            # Generate chart for comparison
            
        elif intent == "metrics":
            response += "Metrics for the requested models:\n"
            accuracies = {}
            for key, metric in metrics.items():
                if isinstance(metric, dict) and metric:
                    response += f"{key}:\n" + "\n".join(f"{k}: {v:.2f}" for k, v in metric.items() if isinstance(v, (int, float))) + "\n"
                    if "accuracy" in metric:
                        model_name = key.split("_")[0]
                        accuracies[model_name] = metric["accuracy"]
                        if metric["accuracy"] < 0.1:
                            response += f"Warning: Low accuracy ({metric['accuracy']:.2f}) for {key}. Verify CSV data.\n"
                else:
                    response += f"No metrics data found for {key}. Received: {metric}\n"
            if "better" in query or "prefer" in query or "best" in query:
                if accuracies:
                    best_model = max(accuracies.items(), key=lambda x: x[1], default=(None, 0))[0]
                    if best_model:
                        response += f"The best model is {best_model} with an accuracy of {accuracies[best_model]:.2f}.\n"
                        if selected_llm:
                            response += f"Selected LLM: {selected_llm} (chosen for highest accuracy in {query}).\n"
                        
                else:
                    response += "No valid accuracy data found for comparison.\n"
        else:
            response += "No specific data found for this query in the CSV. Try a related domain/subdomain or check the query."

        return {
            "step": "done",
            "messages": [AIMessage(content=response)]
        }
from sentence_transformers import SentenceTransformer

In [24]:
async def main():
    # 1) Load your CSV tools via MCP
    tools = await client.get_tools()

    # 2) Initialize your LLM wrapper
    model = ChatTogether(
        model_name="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
        temperature=0.0,
    )

    # 3) Build the Agent with memory checkpointer
    agent = Agent(model, tools, checkpointer=memory)

    # 4) Prepare test queries
    queries = [
        "How do metrics compare for Combinatorics across both models and which one is more efficient in memory usage?"
    ]

    # 5) Initialize shared state
    state = {
        "messages": [],
        "step": "init",
        "models": [],
        "domains": [],
        "subdomains": {},
        "query": "",
        "metrics": {},
        "intent": "",
        "context": "",
        "selected_llm": None
    }

    # 6) Iterate through queries
    for query in queries:
        state["query"] = query
        state["messages"] = [HumanMessage(content=query)]

        # Use the same thread config if required by your MCP setup
        thread = {"configurable": {"thread_id": "3"}}

        # Invoke your agent's state graph
        result = await agent.graph.ainvoke(state, thread)

        # Print the final LLM response
        print(f"User query: {query}")
        print(f"Agent response:\n{result['messages'][-1].content}\n")

        # Update cached fields for potential multi-turn
        for field in ["models", "domains", "subdomains", "context", "selected_llm"]:
            if field in result:
                state[field] = result[field]

        # Append tool & assistant messages to history
        state["messages"].extend(result.get("messages", []))

# Run the async main (in Jupyter or asyncio)
await main()


Initializing with query: How do metrics compare for Combinatorics across both models and which one is more efficient in memory usage?
Classified intent: list_models
Models retrieved: ['Llama3.2_1B', 'Qwen2.5_1.5B']
Context: 
Routing: Intent=list_models, Models=['Llama3.2_1B', 'Qwen2.5_1.5B'], Domains=[], Subdomains={}
Routing: Using cached models, skipping list_models
Responding with metrics: {}, Intent: list_models, Context: , Selected LLM: None
User query: How do metrics compare for Combinatorics across both models and which one is more efficient in memory usage?
Agent response:
Based on the analysis and context:
Context: 

No specific data found for this query in the CSV. Try a related domain/subdomain or check the query.

