diff --git a/.env.example b/.env.example index 1b11e21..c5bf0bc 100644 --- a/.env.example +++ b/.env.example @@ -73,6 +73,8 @@ AGENT_MAX_STEPS=30 AGENT_DEFAULT_ENABLED=true # Agent mode availability (renamed to align with other FEATURE_* flags) FEATURE_AGENT_MODE_AVAILABLE=true +# Agent loop strategy: react (structured reasoning) or think-act (faster, concise) +AGENT_LOOP_STRATEGY=think-act # (Adjust above to stage rollouts. For a bare-bones chat set them all to false.) APP_LOG_DIR=/workspaces/atlas-ui-3-11/logs diff --git a/CLAUDE.md b/CLAUDE.md index f6a66df..ab86e49 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -14,7 +14,9 @@ Atlas UI 3 is a full-stack LLM chat interface with Model Context Protocol (MCP) # Style note -No Emojis should ever be added in this repo. If you find one, then remove it. +No Emojis should ever be added in this repo. If you find one, then remove it. + +**File Naming**: Do not use generic names like `main.py`, `cli.py`, `utils.py`, or `helpers.py`. Use descriptive names that reflect the file's purpose (e.g., `chat_service.py`, `mcp_tool_manager.py`, `websocket_handler.py`). Exception: top-level entry points like `backend/main.py` are acceptable. # Tests @@ -166,9 +168,10 @@ backend/ 1. **Protocol-Based Dependency Injection**: Uses Python `Protocol` (structural subtyping) instead of ABC inheritance for loose coupling -2. **Agent Loop Strategy Pattern**: Two implementations selectable via `APP_AGENT_LOOP_STRATEGY`: - - `ReActAgentLoop`: Reasoning-Act (faster, better for tools) - - `ThinkActAgentLoop`: Extended thinking (slower, complex reasoning) +2. **Agent Loop Strategy Pattern**: Three implementations selectable via `APP_AGENT_LOOP_STRATEGY`: + - `react`: Reason-Act-Observe cycle (structured reasoning) + - `think-act`: Extended thinking (slower, complex reasoning) + - `act`: Pure action loop (fastest, minimal overhead) 3. **MCP Transport Auto-Detection**: Automatically detects stdio, HTTP, or SSE based on config @@ -232,9 +235,10 @@ MCP servers defined in `config/defaults/mcp.json`. The backend: 4. Supports group-based access control ### Agent Modes -Two agent loop strategies implement different reasoning patterns: -- **ReAct** (`backend/application/chat/agent/react_agent_loop.py`): Fast iteration, good for tool-heavy tasks -- **Think-Act** (`backend/application/chat/agent/think_act_agent_loop.py`): Deep reasoning, slower but more thoughtful +Three agent loop strategies implement different reasoning patterns: +- **ReAct** (`backend/application/chat/agent/react_loop.py`): Reason-Act-Observe cycle, good for tool-heavy tasks with structured reasoning +- **Think-Act** (`backend/application/chat/agent/think_act_loop.py`): Deep reasoning with explicit thinking steps, slower but more thoughtful +- **Act** (`backend/application/chat/agent/act_loop.py`): Pure action loop without explicit reasoning steps, fastest with minimal overhead. LLM calls tools directly and signals completion via the "finished" tool ### File Storage S3-compatible storage via `backend/modules/file_storage/s3_client.py`: diff --git a/backend/application/chat/agent/__init__.py b/backend/application/chat/agent/__init__.py index 688cac1..042fd11 100644 --- a/backend/application/chat/agent/__init__.py +++ b/backend/application/chat/agent/__init__.py @@ -3,3 +3,4 @@ from .protocols import AgentLoopProtocol, AgentContext, AgentEvent, AgentResult, AgentEventHandler from .react_loop import ReActAgentLoop from .think_act_loop import ThinkActAgentLoop +from .factory import AgentLoopFactory diff --git a/backend/application/chat/agent/act_loop.py b/backend/application/chat/agent/act_loop.py new file mode 100644 index 0000000..f74c818 --- /dev/null +++ b/backend/application/chat/agent/act_loop.py @@ -0,0 +1,174 @@ +from __future__ import annotations + +import json +from typing import Any, Dict, List, Optional + +from interfaces.llm import LLMProtocol +from interfaces.tools import ToolManagerProtocol +from modules.prompts.prompt_provider import PromptProvider + +from .protocols import AgentContext, AgentEvent, AgentEventHandler, AgentLoopProtocol, AgentResult +from ..utilities import error_utils, tool_utils + + +class ActAgentLoop(AgentLoopProtocol): + """Pure action agent loop - just execute tools in a loop until done. + + No explicit reasoning or observation steps. The LLM directly decides which + tools to call and when to finish. Fastest strategy with minimal overhead. + + Exit conditions: + - LLM calls the "finished" tool with a final_answer + - No tool calls returned (LLM provides text response) + - Max steps reached + """ + + def __init__( + self, + *, + llm: LLMProtocol, + tool_manager: Optional[ToolManagerProtocol], + prompt_provider: Optional[PromptProvider], + connection: Any = None, + ) -> None: + self.llm = llm + self.tool_manager = tool_manager + self.prompt_provider = prompt_provider + self.connection = connection + + def _extract_finished_args(self, tool_calls: List[Dict[str, Any]]) -> Optional[str]: + """Extract final_answer from finished tool call if present.""" + try: + for tc in tool_calls: + f = tc.get("function") if isinstance(tc, dict) else None + if f and f.get("name") == "finished": + raw_args = f.get("arguments") + if isinstance(raw_args, str): + try: + args = json.loads(raw_args) + return args.get("final_answer") + except Exception: + return None + if isinstance(raw_args, dict): + return raw_args.get("final_answer") + return None + except Exception: + return None + + async def run( + self, + *, + model: str, + messages: List[Dict[str, Any]], + context: AgentContext, + selected_tools: Optional[List[str]], + data_sources: Optional[List[str]], + max_steps: int, + temperature: float, + event_handler: AgentEventHandler, + ) -> AgentResult: + await event_handler(AgentEvent(type="agent_start", payload={"max_steps": max_steps, "strategy": "act"})) + + steps = 0 + final_answer: Optional[str] = None + + # Define the "finished" control tool + finished_tool_schema = { + "type": "function", + "function": { + "name": "finished", + "description": "Call this when you have completed the task and are ready to provide a final answer to the user.", + "parameters": { + "type": "object", + "properties": { + "final_answer": { + "type": "string", + "description": "The final response to provide to the user", + }, + }, + "required": ["final_answer"], + "additionalProperties": False, + }, + }, + } + + while steps < max_steps and final_answer is None: + steps += 1 + await event_handler(AgentEvent(type="agent_turn_start", payload={"step": steps})) + + # Build tools schema: user tools + finished tool + tools_schema: List[Dict[str, Any]] = [finished_tool_schema] + if selected_tools and self.tool_manager: + user_tools = await error_utils.safe_get_tools_schema(self.tool_manager, selected_tools) + tools_schema.extend(user_tools) + + # Call LLM with tools - using "required" to force tool calling during Act phase + # The LiteLLM caller has fallback logic to "auto" if "required" is not supported + if data_sources and context.user_email: + llm_response = await self.llm.call_with_rag_and_tools( + model, messages, data_sources, tools_schema, context.user_email, "required", temperature=temperature + ) + else: + llm_response = await self.llm.call_with_tools( + model, messages, tools_schema, "required", temperature=temperature + ) + + # Process response + if llm_response.has_tool_calls(): + tool_calls = llm_response.tool_calls or [] + + # Check if finished tool was called + final_answer = self._extract_finished_args(tool_calls) + if final_answer: + break + + # Execute first non-finished tool call + first_call = None + for tc in tool_calls: + f = tc.get("function") if isinstance(tc, dict) else None + if f and f.get("name") != "finished": + first_call = tc + break + + if first_call is None: + # Only finished tool or no valid tools + final_answer = llm_response.content or "Task completed." + break + + # Execute the tool + messages.append({ + "role": "assistant", + "content": llm_response.content, + "tool_calls": [first_call], + }) + + result = await tool_utils.execute_single_tool( + tool_call=first_call, + session_context={ + "session_id": context.session_id, + "user_email": context.user_email, + "files": context.files, + }, + tool_manager=self.tool_manager, + update_callback=(self.connection.send_json if self.connection else None), + ) + + messages.append({ + "role": "tool", + "content": result.content, + "tool_call_id": result.tool_call_id, + }) + + # Emit tool results for artifact ingestion + await event_handler(AgentEvent(type="agent_tool_results", payload={"results": [result]})) + else: + # No tool calls - treat content as final answer + final_answer = llm_response.content or "Task completed." + break + + # Fallback if no final answer after max steps + if not final_answer: + final_answer = await self.llm.call_plain(model, messages, temperature=temperature) + + await event_handler(AgentEvent(type="agent_completion", payload={"steps": steps})) + return AgentResult(final_answer=final_answer, steps=steps, metadata={"agent_mode": True, "strategy": "act"}) diff --git a/backend/application/chat/agent/factory.py b/backend/application/chat/agent/factory.py new file mode 100644 index 0000000..c7538a5 --- /dev/null +++ b/backend/application/chat/agent/factory.py @@ -0,0 +1,135 @@ +"""Factory for creating agent loop instances based on strategy.""" + +import logging +from typing import Optional + +from interfaces.llm import LLMProtocol +from interfaces.tools import ToolManagerProtocol +from interfaces.transport import ChatConnectionProtocol +from modules.prompts.prompt_provider import PromptProvider + +from .protocols import AgentLoopProtocol +from .react_loop import ReActAgentLoop +from .think_act_loop import ThinkActAgentLoop +from .act_loop import ActAgentLoop + +logger = logging.getLogger(__name__) + + +class AgentLoopFactory: + """ + Factory for creating agent loop instances. + + This factory pattern allows for easy addition of new agent loop strategies + without modifying existing code. Simply add a new strategy to the registry. + """ + + def __init__( + self, + llm: LLMProtocol, + tool_manager: Optional[ToolManagerProtocol] = None, + prompt_provider: Optional[PromptProvider] = None, + connection: Optional[ChatConnectionProtocol] = None, + ): + """ + Initialize factory with shared dependencies. + + Args: + llm: LLM protocol implementation + tool_manager: Optional tool manager + prompt_provider: Optional prompt provider + connection: Optional connection for sending updates + """ + self.llm = llm + self.tool_manager = tool_manager + self.prompt_provider = prompt_provider + self.connection = connection + + # Registry of available strategies + self._strategy_registry = { + "react": ReActAgentLoop, + "think-act": ThinkActAgentLoop, + "think_act": ThinkActAgentLoop, + "thinkact": ThinkActAgentLoop, + "act": ActAgentLoop, + } + + # Cache of instantiated loops for performance + self._loop_cache: dict[str, AgentLoopProtocol] = {} + + def create(self, strategy: str = "think-act") -> AgentLoopProtocol: + """ + Create an agent loop instance for the given strategy. + + Args: + strategy: Strategy name (react, think-act, act, etc.) + + Returns: + AgentLoopProtocol instance + + Note: + If the strategy is not recognized, falls back to 'react' with a warning. + """ + strategy_normalized = strategy.lower().strip() + + # Check cache first + if strategy_normalized in self._loop_cache: + logger.info(f"Using agent loop strategy: {strategy_normalized}") + return self._loop_cache[strategy_normalized] + + # Look up strategy in registry + loop_class = self._strategy_registry.get(strategy_normalized) + + if loop_class is None: + logger.warning( + f"Unknown agent loop strategy '{strategy}', falling back to 'react'" + ) + loop_class = self._strategy_registry["react"] + strategy_normalized = "react" + + # Instantiate the loop + loop_instance = loop_class( + llm=self.llm, + tool_manager=self.tool_manager, + prompt_provider=self.prompt_provider, + connection=self.connection, + ) + + # Cache for future use + self._loop_cache[strategy_normalized] = loop_instance + + logger.info(f"Created and using agent loop strategy: {strategy_normalized}") + return loop_instance + + def get_available_strategies(self) -> list[str]: + """ + Get list of available strategy names. + + Returns: + List of strategy identifiers + """ + # Return unique strategy names (deduplicated) + unique_strategies = set() + for strategy in self._strategy_registry.keys(): + # Normalize to primary name + if strategy in ("react",): + unique_strategies.add("react") + elif strategy in ("think-act", "think_act", "thinkact"): + unique_strategies.add("think-act") + elif strategy in ("act",): + unique_strategies.add("act") + return sorted(unique_strategies) + + def register_strategy(self, name: str, loop_class: type[AgentLoopProtocol]) -> None: + """ + Register a new agent loop strategy. + + This allows for dynamic extension of available strategies. + + Args: + name: Strategy identifier + loop_class: Agent loop class to instantiate + """ + name_normalized = name.lower().strip() + self._strategy_registry[name_normalized] = loop_class + logger.info(f"Registered new agent loop strategy: {name_normalized}") diff --git a/backend/application/chat/agent/react_loop.py b/backend/application/chat/agent/react_loop.py index 4e04095..016ee3f 100644 --- a/backend/application/chat/agent/react_loop.py +++ b/backend/application/chat/agent/react_loop.py @@ -100,7 +100,7 @@ async def run( event_handler: AgentEventHandler, ) -> AgentResult: # Agent start - await event_handler(AgentEvent(type="agent_start", payload={"max_steps": max_steps})) + await event_handler(AgentEvent(type="agent_start", payload={"max_steps": max_steps, "strategy": "react"})) steps = 0 final_response: Optional[str] = None @@ -210,14 +210,16 @@ async def run( tools_schema = await error_utils.safe_get_tools_schema(self.tool_manager, selected_tools) tool_results: List[ToolResult] = [] + # Use "required" to force tool calling during Act phase + # The LiteLLM caller has fallback logic to "auto" if "required" is not supported if tools_schema: if data_sources and context.user_email: llm_response = await self.llm.call_with_rag_and_tools( - model, messages, data_sources, tools_schema, context.user_email, "auto", temperature=temperature + model, messages, data_sources, tools_schema, context.user_email, "required", temperature=temperature ) else: llm_response = await self.llm.call_with_tools( - model, messages, tools_schema, "auto", temperature=temperature + model, messages, tools_schema, "required", temperature=temperature ) if llm_response.has_tool_calls(): diff --git a/backend/application/chat/agent/think_act_loop.py b/backend/application/chat/agent/think_act_loop.py index 7c254b5..3447dc3 100644 --- a/backend/application/chat/agent/think_act_loop.py +++ b/backend/application/chat/agent/think_act_loop.py @@ -45,7 +45,7 @@ async def run( temperature: float, event_handler: AgentEventHandler, ) -> AgentResult: - await event_handler(AgentEvent(type="agent_start", payload={"max_steps": max_steps})) + await event_handler(AgentEvent(type="agent_start", payload={"max_steps": max_steps, "strategy": "think-act"})) steps = 0 final_answer: Optional[str] = None @@ -96,65 +96,69 @@ def parse_args(resp: LLMResponse) -> Dict[str, Any]: async def emit_think(text: str, step: int) -> None: await event_handler(AgentEvent(type="agent_reason", payload={"message": text, "step": step})) - # First think + # First think - ALWAYS happens before entering the loop steps += 1 await event_handler(AgentEvent(type="agent_turn_start", payload={"step": steps})) first_think = await self.llm.call_with_tools(model, messages, think_tools_schema, "required", temperature=temperature) think_args = parse_args(first_think) await emit_think(first_think.content or "", steps) + + # Check if we can finish immediately after first think if think_args.get("finish"): final_answer = think_args.get("final_answer") or first_think.content - else: - # Action loop - while steps < max_steps and final_answer is None: - # Act: single tool selection and execution - tools_schema: List[Dict[str, Any]] = [] - if selected_tools and self.tool_manager: - tools_schema = await error_utils.safe_get_tools_schema(self.tool_manager, selected_tools) - - if tools_schema: - if data_sources and context.user_email: - llm_response = await self.llm.call_with_rag_and_tools( - model, messages, data_sources, tools_schema, context.user_email, "auto", temperature=temperature - ) - else: - llm_response = await self.llm.call_with_tools( - model, messages, tools_schema, "auto", temperature=temperature - ) - - if llm_response.has_tool_calls(): - first_call = (llm_response.tool_calls or [None])[0] - if first_call is None: - final_answer = llm_response.content or "" - break - messages.append({"role": "assistant", "content": llm_response.content, "tool_calls": [first_call]}) - result = await tool_utils.execute_single_tool( - tool_call=first_call, - session_context={ - "session_id": context.session_id, - "user_email": context.user_email, - "files": context.files, - }, - tool_manager=self.tool_manager, - update_callback=(self.connection.send_json if self.connection else None), - ) - messages.append({"role": "tool", "content": result.content, "tool_call_id": result.tool_call_id}) - # Notify service to ingest artifacts - await event_handler(AgentEvent(type="agent_tool_results", payload={"results": [result]})) - else: - if llm_response.content: - final_answer = llm_response.content - break - - # Think after action - steps += 1 - await event_handler(AgentEvent(type="agent_turn_start", payload={"step": steps})) - think_resp = await self.llm.call_with_tools(model, messages, think_tools_schema, "required", temperature=temperature) - think_args = parse_args(think_resp) - await emit_think(think_resp.content or "", steps) - if think_args.get("finish"): - final_answer = think_args.get("final_answer") or think_resp.content - break + + # Action loop - entered after first think + while steps < max_steps and final_answer is None: + # Act: single tool selection and execution + tools_schema: List[Dict[str, Any]] = [] + if selected_tools and self.tool_manager: + tools_schema = await error_utils.safe_get_tools_schema(self.tool_manager, selected_tools) + + # Use "required" to force tool calling during Act phase + # The LiteLLM caller has fallback logic to "auto" if "required" is not supported + if tools_schema: + if data_sources and context.user_email: + llm_response = await self.llm.call_with_rag_and_tools( + model, messages, data_sources, tools_schema, context.user_email, "required", temperature=temperature + ) + else: + llm_response = await self.llm.call_with_tools( + model, messages, tools_schema, "required", temperature=temperature + ) + + if llm_response.has_tool_calls(): + first_call = (llm_response.tool_calls or [None])[0] + if first_call is None: + final_answer = llm_response.content or "" + break + messages.append({"role": "assistant", "content": llm_response.content, "tool_calls": [first_call]}) + result = await tool_utils.execute_single_tool( + tool_call=first_call, + session_context={ + "session_id": context.session_id, + "user_email": context.user_email, + "files": context.files, + }, + tool_manager=self.tool_manager, + update_callback=(self.connection.send_json if self.connection else None), + ) + messages.append({"role": "tool", "content": result.content, "tool_call_id": result.tool_call_id}) + # Notify service to ingest artifacts + await event_handler(AgentEvent(type="agent_tool_results", payload={"results": [result]})) + else: + if llm_response.content: + final_answer = llm_response.content + break + + # Think after action + steps += 1 + await event_handler(AgentEvent(type="agent_turn_start", payload={"step": steps})) + think_resp = await self.llm.call_with_tools(model, messages, think_tools_schema, "required", temperature=temperature) + think_args = parse_args(think_resp) + await emit_think(think_resp.content or "", steps) + if think_args.get("finish"): + final_answer = think_args.get("final_answer") or think_resp.content + break if not final_answer: final_answer = await self.llm.call_plain(model, messages, temperature=temperature) diff --git a/backend/application/chat/service.py b/backend/application/chat/service.py index 54cca5a..299ae00 100644 --- a/backend/application/chat/service.py +++ b/backend/application/chat/service.py @@ -24,7 +24,7 @@ # Import utilities from .utilities import tool_utils, file_utils, notification_utils, error_utils -from .agent import AgentLoopProtocol, ReActAgentLoop, ThinkActAgentLoop +from .agent import AgentLoopFactory from .agent.protocols import AgentContext, AgentEvent from core.prompt_risk import calculate_prompt_injection_risk, log_high_risk_event from core.auth_utils import create_authorization_manager @@ -48,17 +48,18 @@ def __init__( connection: Optional[ChatConnectionProtocol] = None, config_manager: Optional[ConfigManager] = None, file_manager: Optional[Any] = None, - agent_loop: Optional[AgentLoopProtocol] = None, + agent_loop_factory: Optional[AgentLoopFactory] = None, ): """ Initialize chat service with dependencies. - + Args: llm: LLM protocol implementation tool_manager: Optional tool manager connection: Optional connection for sending updates config_manager: Configuration manager file_manager: File manager for S3 operations + agent_loop_factory: Factory for creating agent loops (optional) """ self.llm = llm self.tool_manager = tool_manager @@ -69,31 +70,28 @@ def __init__( PromptProvider(self.config_manager) if self.config_manager else None ) self.file_manager = file_manager - # Agent loop DI (default to ReActAgentLoop). Allow override via config/env. - if agent_loop is not None: - self.agent_loop = agent_loop + + # Agent loop factory - create if not provided + if agent_loop_factory is not None: + self.agent_loop_factory = agent_loop_factory else: - strategy = None - try: - if self.config_manager: - strategy = self.config_manager.app_settings.agent_loop_strategy - except Exception: - strategy = None - strategy = (strategy or "react").lower() - if strategy in ("think-act", "think_act", "thinkact"): - self.agent_loop = ThinkActAgentLoop( - llm=self.llm, - tool_manager=self.tool_manager, - prompt_provider=self.prompt_provider, - connection=self.connection, - ) - else: - self.agent_loop = ReActAgentLoop( - llm=self.llm, - tool_manager=self.tool_manager, - prompt_provider=self.prompt_provider, - connection=self.connection, - ) + self.agent_loop_factory = AgentLoopFactory( + llm=self.llm, + tool_manager=self.tool_manager, + prompt_provider=self.prompt_provider, + connection=self.connection, + ) + + # Get default strategy from config + self.default_agent_strategy = "think-act" + try: + if self.config_manager: + config_strategy = self.config_manager.app_settings.agent_loop_strategy + if config_strategy: + self.default_agent_strategy = config_strategy.lower() + except Exception: + # Ignore config errors - fall back to default strategy + pass async def create_session( self, @@ -243,6 +241,7 @@ async def handle_chat_message( max_steps=kwargs.get("agent_max_steps", 30), update_callback=update_callback, temperature=temperature, + agent_loop_strategy=kwargs.get("agent_loop_strategy"), ) elif selected_tools and not only_rag: # Enforce MCP tool ACLs: filter tools to authorized servers only @@ -663,12 +662,20 @@ async def _handle_agent_mode_via_loop( max_steps: int, update_callback: Optional[UpdateCallback] = None, temperature: float = 0.7, + agent_loop_strategy: Optional[str] = None, ) -> Dict[str, Any]: - """Handle agent mode using the injected AgentLoopProtocol with event streaming. + """Handle agent mode using the factory-created AgentLoopProtocol with event streaming. Translates AgentEvents to UI notifications and persists artifacts; appends final assistant message to history and returns a chat response. + + Args: + agent_loop_strategy: Strategy name (react, think-act). Falls back to config default. """ + # Get agent loop from factory based on strategy + strategy = agent_loop_strategy or self.default_agent_strategy + agent_loop = self.agent_loop_factory.create(strategy) + # Build agent context agent_context = AgentContext( session_id=session.id, @@ -683,7 +690,7 @@ async def handle_event(evt: AgentEvent) -> None: p = evt.payload or {} # UI notifications (guard on connection) if et == "agent_start" and self.connection: - await notification_utils.notify_agent_update(update_type="agent_start", connection=self.connection, max_steps=p.get("max_steps")) + await notification_utils.notify_agent_update(update_type="agent_start", connection=self.connection, max_steps=p.get("max_steps"), strategy=p.get("strategy")) elif et == "agent_turn_start" and self.connection: await notification_utils.notify_agent_update(update_type="agent_turn_start", connection=self.connection, step=p.get("step")) elif et == "agent_reason" and self.connection: @@ -713,7 +720,7 @@ async def handle_event(evt: AgentEvent) -> None: await notification_utils.notify_agent_update(update_type="agent_error", connection=self.connection, message=p.get("message")) # Run the loop - result = await self.agent_loop.run( + result = await agent_loop.run( model=model, messages=messages, context=agent_context, diff --git a/backend/main.py b/backend/main.py index 5b082c6..7d21446 100644 --- a/backend/main.py +++ b/backend/main.py @@ -239,6 +239,7 @@ async def websocket_endpoint(websocket: WebSocket): agent_mode=data.get("agent_mode", False), agent_max_steps=data.get("agent_max_steps", 10), temperature=data.get("temperature", 0.7), + agent_loop_strategy=data.get("agent_loop_strategy"), update_callback=lambda message: websocket_update_callback(websocket, message), files=data.get("files") ) diff --git a/backend/mcp/pptx_generator/main.py b/backend/mcp/pptx_generator/main.py index 3c407bb..4e1e742 100644 --- a/backend/mcp/pptx_generator/main.py +++ b/backend/mcp/pptx_generator/main.py @@ -65,6 +65,15 @@ mcp = FastMCP("pptx_generator") +def _sanitize_filename(filename: str, max_length: int = 50) -> str: + """Sanitize filename by removing bad characters and truncating.""" + # Remove bad characters (anything not alphanumeric, underscore, or dash) + cleaned_filename = re.sub(r'[^\w\-]', '', filename) + # Remove newlines and extra spaces + cleaned_filename = re.sub(r'\s+', '', cleaned_filename) + # Truncate to max length + return cleaned_filename[:max_length] if cleaned_filename else "presentation" + def _is_backend_download_path(s: str) -> bool: """Detect backend-relative download paths like /api/files/download/....""" return isinstance(s, str) and s.startswith("/api/files/download/") @@ -166,8 +175,9 @@ def _add_image_to_slide(slide_obj, image_bytes: bytes, left: Inches = Inches(1), @mcp.tool def json_to_pptx( input_data: Annotated[str, "JSON string containing slide data in this format: {\"slides\": [{\"title\": \"Slide 1\", \"content\": \"- Item 1\\n- Item 2\\n- Item 3\"}, {\"title\": \"Slide 2\", \"content\": \"- Item A\\n- Item B\"}]}"], - image_filename: Annotated[str, "Optional image filename to integrate into the presentation"] = "", - image_data_base64: Annotated[str, "Framework may supply Base64 image content as fallback"] = "" + output_filename: Annotated[Optional[str], "Base name for output files (without extension)"] = "presentation", + image_filename: Annotated[Optional[str], "Optional image filename to integrate into the presentation"] = "", + image_data_base64: Annotated[Optional[str], "Framework may supply Base64 image content as fallback"] = "" ) -> Dict[str, Any]: """ Create professional PowerPoint presentations from structured JSON data with advanced formatting and multimedia support. @@ -239,6 +249,7 @@ def json_to_pptx( Args: input_data: JSON string with slide definitions (title and content pairs with bullet points) + output_filename: Base name for output files (without extension, default: "presentation") image_filename: Optional image file to embed in presentation (supports various formats) image_data_base64: Alternative Base64-encoded image content (automatically provided by framework) @@ -252,6 +263,11 @@ def json_to_pptx( """ print("Starting json_to_pptx execution...") try: + # Handle None values and sanitize the output filename + image_filename = image_filename or "" + image_data_base64 = image_data_base64 or "" + output_filename = _sanitize_filename(output_filename or "presentation") + import json data = json.loads(input_data) @@ -330,13 +346,13 @@ def json_to_pptx( # Write outputs to a temporary directory and clean up after encoding with tempfile.TemporaryDirectory() as tmpdir: # Save presentation - pptx_output_path = os.path.join(tmpdir, "output_presentation.pptx") + pptx_output_path = os.path.join(tmpdir, f"output_{output_filename}.pptx") prs.save(pptx_output_path) if VERBOSE: logger.info(f"Saved PowerPoint presentation to {pptx_output_path}") # Create HTML file instead of PDF - html_output_path = os.path.join(tmpdir, "output_presentation.html") + html_output_path = os.path.join(tmpdir, f"output_{output_filename}.html") if VERBOSE: logger.info(f"Starting HTML creation to {html_output_path}") @@ -440,7 +456,7 @@ def json_to_pptx( # Prepare artifacts artifacts = [ { - "name": "presentation.pptx", + "name": f"{output_filename}.pptx", "b64": pptx_b64, "mime": "application/vnd.openxmlformats-officedocument.presentationml.presentation", } @@ -449,7 +465,7 @@ def json_to_pptx( # Add HTML if creation was successful if html_b64: artifacts.append({ - "name": "presentation.html", + "name": f"{output_filename}.html", "b64": html_b64, "mime": "text/html", }) @@ -469,14 +485,14 @@ def json_to_pptx( "artifacts": artifacts, "display": { "open_canvas": True, - "primary_file": "presentation.pptx", + "primary_file": f"{output_filename}.pptx", "mode": "replace", "viewer_hint": "powerpoint", }, "meta_data": { "generated_slides": len(slides), - "output_files": [f"presentation.pptx", "presentation.html"] if html_b64 else ["presentation.pptx"], - "output_file_paths": ["temp:output_presentation.pptx", "temp:output_presentation.html"] if html_b64 else ["temp:output_presentation.pptx"], + "output_files": [f"{output_filename}.pptx", f"{output_filename}.html"] if html_b64 else [f"{output_filename}.pptx"], + "output_file_paths": [f"temp:output_{output_filename}.pptx", f"temp:output_{output_filename}.html"] if html_b64 else [f"temp:output_{output_filename}.pptx"], }, } except Exception as e: @@ -488,17 +504,19 @@ def json_to_pptx( @mcp.tool def markdown_to_pptx( markdown_content: Annotated[str, "Markdown content with headers (# or ##) as slide titles and content below each header"], - image_filename: Annotated[str, "Optional image filename to integrate into the presentation"] = "", - image_data_base64: Annotated[str, "Framework may supply Base64 image content as fallback"] = "" + output_filename: Annotated[Optional[str], "Base name for output files (without extension)"] = "presentation", + image_filename: Annotated[Optional[str], "Optional image filename to integrate into the presentation"] = "", + image_data_base64: Annotated[Optional[str], "Framework may supply Base64 image content as fallback"] = "" ) -> Dict[str, Any]: """ Converts markdown content to PowerPoint presentation with support for bullet point lists and optional image integration - + Args: markdown_content: Markdown content where headers (# or ##) become slide titles and content below becomes slide content + output_filename: Base name for output files (without extension, default: "presentation") image_filename: Optional image filename to integrate into the presentation image_data_base64: Framework may supply Base64 image content as fallback - + Returns: Dictionary with 'results' and 'artifacts' keys: - 'results': Success message or error message @@ -507,6 +525,11 @@ def markdown_to_pptx( if VERBOSE: logger.info("Starting markdown_to_pptx execution...") try: + # Handle None values and sanitize the output filename + image_filename = image_filename or "" + image_data_base64 = image_data_base64 or "" + output_filename = _sanitize_filename(output_filename or "presentation") + # Parse markdown into slides slides = _parse_markdown_slides(markdown_content) if VERBOSE: @@ -588,13 +611,13 @@ def markdown_to_pptx( # Write outputs to a temporary directory and clean up after encoding with tempfile.TemporaryDirectory() as tmpdir: # Save presentation - pptx_output_path = os.path.join(tmpdir, "output_presentation.pptx") + pptx_output_path = os.path.join(tmpdir, f"output_{output_filename}.pptx") prs.save(pptx_output_path) if VERBOSE: logger.info(f"Saved PowerPoint presentation to {pptx_output_path}") # Create HTML file instead of PDF - html_output_path = os.path.join(tmpdir, "output_presentation.html") + html_output_path = os.path.join(tmpdir, f"output_{output_filename}.html") if VERBOSE: logger.info(f"Starting HTML creation to {html_output_path}") @@ -711,7 +734,7 @@ def markdown_to_pptx( # Prepare artifacts artifacts = [ { - "name": "presentation.pptx", + "name": f"{output_filename}.pptx", "b64": pptx_b64, "mime": "application/vnd.openxmlformats-officedocument.presentationml.presentation", } @@ -720,7 +743,7 @@ def markdown_to_pptx( # Add HTML if creation was successful if html_b64: artifacts.append({ - "name": "presentation.html", + "name": f"{output_filename}.html", "b64": html_b64, "mime": "text/html", }) @@ -740,14 +763,14 @@ def markdown_to_pptx( "artifacts": artifacts, "display": { "open_canvas": True, - "primary_file": "presentation.pptx", + "primary_file": f"{output_filename}.pptx", "mode": "replace", "viewer_hint": "powerpoint", }, "meta_data": { "generated_slides": len(slides), - "output_files": [f"presentation.pptx", "presentation.html"] if html_b64 else ["presentation.pptx"], - "output_file_paths": ["temp:output_presentation.pptx", "temp:output_presentation.html"] if html_b64 else ["temp:output_presentation.pptx"], + "output_files": [f"{output_filename}.pptx", f"{output_filename}.html"] if html_b64 else [f"{output_filename}.pptx"], + "output_file_paths": [f"temp:output_{output_filename}.pptx", f"temp:output_{output_filename}.html"] if html_b64 else [f"temp:output_{output_filename}.pptx"], }, } except Exception as e: diff --git a/backend/modules/config/manager.py b/backend/modules/config/manager.py index 6a74ce3..75bc164 100644 --- a/backend/modules/config/manager.py +++ b/backend/modules/config/manager.py @@ -103,7 +103,7 @@ class AppSettings(BaseSettings): ) # Accept both old and new env var names agent_max_steps: int = 10 agent_loop_strategy: str = Field( - default="react", + default="think-act", description="Agent loop strategy selector (react, think-act)", validation_alias=AliasChoices("AGENT_LOOP_STRATEGY"), ) diff --git a/backend/modules/file_storage/s3_client.py b/backend/modules/file_storage/s3_client.py index 8fd45d5..eed4fc8 100644 --- a/backend/modules/file_storage/s3_client.py +++ b/backend/modules/file_storage/s3_client.py @@ -20,6 +20,14 @@ logger = logging.getLogger(__name__) +def _sanitize_for_logging(value: str) -> str: + """Sanitize user-controlled values for safe logging to prevent log injection attacks.""" + if isinstance(value, str): + # Escape or remove control characters that could enable log injection + return value.replace('\n', '\\n').replace('\r', '\\r').replace('\t', '\\t') + return str(value) + + class S3StorageClient: """Client for interacting with S3-compatible storage (MinIO/AWS S3).""" diff --git a/backend/modules/llm/litellm_caller.py b/backend/modules/llm/litellm_caller.py index 85fc3d3..8e4965a 100644 --- a/backend/modules/llm/litellm_caller.py +++ b/backend/modules/llm/litellm_caller.py @@ -191,12 +191,8 @@ async def call_with_tools( litellm_model = self._get_litellm_model_name(model_name) model_kwargs = self._get_model_kwargs(model_name, temperature) - # Handle tool_choice parameter - some providers don't support "required" + # Handle tool_choice parameter - try "required" first, fallback to "auto" if unsupported final_tool_choice = tool_choice - if tool_choice == "required": - # Try with "required" first, fallback to "auto" if unsupported - final_tool_choice = "auto" - logger.info(f"Using tool_choice='auto' instead of 'required' for better compatibility") try: total_chars = sum(len(str(msg.get('content', ''))) for msg in messages) diff --git a/config/overrides/llmconfig.yml b/config/overrides/llmconfig.yml index a94151c..31d4f69 100644 --- a/config/overrides/llmconfig.yml +++ b/config/overrides/llmconfig.yml @@ -10,9 +10,9 @@ models: openrouter-gpt-oss: model_url: "https://openrouter.ai/api/v1/chat/completions" - model_name: "openai/gpt-oss-20b" + model_name: "openai/gpt-oss-120b" api_key: "${OPENROUTER_API_KEY}" - description: "OpenRouter aggregated GPT-4o model" + description: "OpenRouter aggregated openai gpt-oss 120B model" extra_headers: HTTP-Referer: "${OPENROUTER_SITE_URL}" X-Title: "${OPENROUTER_SITE_NAME}" diff --git a/config/overrides/messages.txt b/config/overrides/messages.txt index e69de29..be4076a 100644 --- a/config/overrides/messages.txt +++ b/config/overrides/messages.txt @@ -0,0 +1 @@ +This is a test message. diff --git a/docs/service-refactor-plan.md b/docs/service-refactor-plan.md new file mode 100644 index 0000000..2443ad9 --- /dev/null +++ b/docs/service-refactor-plan.md @@ -0,0 +1,256 @@ +# Service Refactor Plan + +## Objective + +Make the backend—and especially `backend/application/chat/service.py`—clearer, more modular, and easier to test by separating concerns into well-defined layers and services, without changing behavior. + +--- + +## What’s off today + +From `backend/application/chat/service.py` and the current repo layout: + +- ChatService is doing too much: + - Session management (in-memory) + - Request orchestration and branching (plain, tools, RAG, agent) + - MCP prompt override injection + - Tool ACL filtering + - File ingestion and artifact persistence + - Agent-loop event translation to UI + - Streaming notifications + - Error wrapping +- Inconsistent message typing (`List[Dict[str, str]]` vs `List[Dict[str, Any]]`) and ad-hoc message shapes. +- Large handler `handle_chat_message` with many flags and deep branching logic. +- Inline authorization and inline prompt override hide important policies in orchestration code. +- Transport concerns (WebSocket streaming) are coupled into application logic via direct `connection` + `notification_utils` calls. +- Legacy remnants and duplication (commented-out older implementation blocks). + +Net effect: lower testability, higher change risk, and difficulty adding new modes/transports. + +--- + +## Target architecture (ports and adapters) + +- Domain (pure): entities/models, errors, value objects + - Existing: `domain/messages`, `domain/sessions`, `domain/errors` + - Add: typed DTOs where needed (LLMMessage, ChatRequest, ChatResponse) +- Application (use-cases/orchestration): + - ChatOrchestrator: single entrypoint that wires steps but delegates to strategies/services + - Mode runners/strategies: PlainMode, ToolsMode, RagMode, AgentMode + - Preprocessors: MessageBuilder (history + files manifest), PromptOverrideService (MCP), RiskCheck (optional) + - Policy services: ToolAuthorizationService (ACL filtering), ToolSelectionPolicy (required/auto) + - ArtifactIngestor: updates session context with tool artifacts and emits file/canvas updates + - SessionManager: get/create/update session (backed by repository) + - EventPublisher: abstraction for UI updates (no direct transport dependency) +- Interfaces (ports/contracts): + - LLMCaller (reuse `LLMProtocol`) + - ToolManager (existing interface), plus a PromptOverrideProvider port if helpful + - FileStorage (file_manager port) + - SessionRepository (in-memory now; replaceable later) + - EventPublisher (UI transport-agnostic) + - Authorization (tool ACL port) +- Infrastructure (adapters): + - WebSocketEventPublisher (wraps `notification_utils` and `connection.send_json`) + - MCP ToolManager adapter and MCP PromptProvider adapter + - S3/MinIO FileStorage adapter (existing) + - InMemorySessionRepository (drop-in for current dict) + - Config-backed AuthorizationManager adapter (wraps `create_authorization_manager()`) + +Outcome: ChatOrchestrator stays thin and stable; each part evolves independently with strong contracts. + +--- + +## Key design decisions + +- Strong DTOs + - ChatRequest: model, content, selected_tools, selected_prompts, selected_data_sources, flags, temperature, user_email + - ChatResponse: final text + metadata + - LLMMessage: type-safe shape used across runners (role, content, optional tool_calls) +- Strategies for modes + - PlainModeRunner: LLM plain + - ToolsModeRunner: tool schemas + LLM + tool workflow + final synthesis + - RagModeRunner: rag-aware call + - AgentModeRunner: bridges AgentLoopProtocol, delegates EventPublisher and ArtifactIngestor +- Preprocessing pipeline + - Build base messages (history + files manifest) + - Apply MCP prompt override (first valid only, as today) + - Optional risk scoring/logging (from `core.prompt_risk`) +- Policies extracted out of orchestrator + - ToolAuthorizationService handles ACL per user, including special cases (e.g., `canvas_canvas`) + - ToolSelectionPolicy enforces “required” vs “auto” +- Eventing decoupled + - EventPublisher abstracts all UI updates; mapping to `notification_utils` lives in infra + - Agent events translation moved to an AgentEventRelay using EventPublisher +- Session management separated + - SessionManager + SessionRepository port (keep in-memory impl initially) +- Cleanup + - Remove legacy commented code + - Normalize message typing and signatures (no more `Dict[str, Any]` everywhere) + +--- + +## Proposed file structure and new modules + +- `backend/application/chat/` + - `orchestrator.py` (ChatOrchestrator – replaces most of ChatService’s `handle_chat_message`) + - `service.py` (Thin façade delegating to Orchestrator; retains public API temporarily) + - `modes/` + - `plain.py` (PlainModeRunner) + - `tools.py` (ToolsModeRunner) + - `rag.py` (RagModeRunner) + - `agent.py` (AgentModeRunner; wraps AgentLoopFactory/Protocol and event relay) + - `preprocessors/` + - `message_builder.py` (history + files manifest) + - `prompt_override_service.py` (MCP prompt override extraction/injection) + - `risk_check.py` (optional prompt risk logger using `core.prompt_risk`) + - `policies/` + - `tool_authorization.py` (ACL filtering) + - `tool_selection.py` (required vs auto) + - `artifacts/` + - `ingestor.py` (wraps `file_utils.process_tool_artifacts` and session context updates) + - `events/` + - `publisher.py` (EventPublisher interface; could live under `interfaces/`) + - `agent_event_relay.py` (maps AgentEvents -> EventPublisher calls) + - `sessions/` + - `manager.py` (SessionManager orchestrates fetch/update) + - `repository.py` (SessionRepository port + InMemory implementation) +- `backend/interfaces/` + - `transport.py` (existing `ChatConnectionProtocol`; add `EventPublisher`) + - `tools.py` (existing `ToolManagerProtocol`; add prompt retrieval port if needed) + - `prompts.py` (PromptProvider / PromptOverrideProvider port) + - `storage.py` (FileStorage port if not already abstracted) + - `authorization.py` (AuthorizationManager port) + - `sessions.py` (SessionRepository port) +- `backend/infrastructure/` + - `events/websocket_publisher.py` (wraps `notification_utils` + connection) + - `prompts/mcp_prompt_provider.py` (bridge `tool_manager.get_prompt` to PromptOverrideProvider) + - `sessions/in_memory.py` (in-memory session repo) + - `authorization/manager_adapter.py` (wrap `create_authorization_manager()`) + +Note: keep existing utilities but progressively move their usage into the appropriate application/infrastructure modules. + +--- + +## Phased refactor roadmap (no behavior change per phase) + +Phase 0: Preparations +- Remove dead/commented blocks (old `_handle_tools_mode_with_utilities` copy). +- Introduce DTOs: ChatRequest, ChatResponse, LLMMessage. +- Normalize message typing in `ChatService` and internal methods. + +Phase 1: Extract policies and preprocessing (low-risk) +- Move Tool ACL filtering into `policies/tool_authorization.py`. +- Extract MCP prompt override logic into `preprocessors/prompt_override_service.py` with an adapter using current `tool_manager.get_prompt`. +- Extract message building (history + files manifest) into `preprocessors/message_builder.py`. +- Keep `ChatService` calling these new modules. + +Phase 2: EventPublisher and AgentEventRelay +- Create `events/publisher.py` interface and `infrastructure/events/websocket_publisher.py` implementation (wraps `notification_utils` and `connection.send_json`). +- Extract agent event mapping into `events/agent_event_relay.py`. +- Replace direct `notification_utils` calls in `ChatService` with EventPublisher calls through a thin wrapper, but keep `notification_utils` usage inside the infra publisher. + +Phase 3: Mode strategies +- Extract `_handle_plain_mode`, `_handle_tools_mode_with_utilities`, `_handle_rag_mode`, `_handle_agent_mode_via_loop` into separate classes under `modes/`. +- Keep `ChatService.handle_chat_message` delegating to the proper ModeRunner based on flags. +- Ensure tool workflow + artifact ingest path is preserved, but routed through `artifacts/ingestor.py`. + +Phase 4: Orchestrator + SessionManager +- Create `orchestrator.py` consolidating preprocessing, policy checks, mode dispatch, and event publisher wiring. +- `ChatService` becomes a thin façade: takes ChatRequest, delegates to Orchestrator. +- Introduce SessionManager + SessionRepository; replace internal `self.sessions` dict progressively. + +Phase 5: Cleanup and documentation +- Update docstrings and docs/architecture notes. +- Remove transport-level calls from application layer. +- Consolidate `error_utils` usage into well-defined error boundaries in orchestrator and runners. + +--- + +## Acceptance criteria + +- Behavior unchanged: + - Same inputs produce same UI updates and final assistant messages (including MCP prompt override behavior, tool ACL filtering, canvas/file events). + - Existing tests pass without modification. +- Type hygiene: + - No stray `Any` in new code paths; DTOs and protocols are typed. +- Clear separation: + - No transport-level imports or calls in application layer. + - Policies and preprocessing are not embedded in orchestrator code. +- Backwards compatibility: + - `ChatService` public method signatures preserved for at least one release cycle (wrapping Orchestrator). +- Observability: + - Logging remains at parity; sensitive fields still sanitized. + +--- + +## File-by-file highlights (first waves) + +- `backend/application/chat/service.py` + - Keep class but reduce responsibility: delegate to Orchestrator + - Remove inline tool ACL and prompt override; call services + - Remove commented legacy block + - Normalize messages typing via LLMMessage DTO +- `backend/application/chat/preprocessors/prompt_override_service.py` + - Move MCP prompt override injection logic; keep “first valid prompt” rule +- `backend/application/chat/policies/tool_authorization.py` + - Move ACL filtering logic, including `canvas_canvas` special-case and authorized server prefix check +- `backend/application/chat/modes/tools.py` + - Hold tool schema resolution, LLM call with tools, tool workflow execution, artifact ingest, final synthesis and event publishing via EventPublisher +- `backend/application/chat/modes/agent.py` + - Wrap AgentLoopFactory; use AgentEventRelay to publish updates and ingest artifacts +- `backend/application/chat/artifacts/ingestor.py` + - Wrap `file_utils.process_tool_artifacts` and session context updates +- `backend/infrastructure/events/websocket_publisher.py` + - All calls to `notification_utils.*` live here; application layer only publishes events + +--- + +## Testing and migration + +- Unit tests + - PromptOverrideService: parses and injects system message correctly from varying MCP prompt shapes + - ToolAuthorizationService: filters tools given user and servers, including underscore server names and the canvas special-case + - Mode runners: happy path + “no tool calls” path + failure path + - EventPublisher/WebSocketPublisher: calls the correct `notification_utils` functions +- Integration tests + - Full chat flow (plain, tools, rag, agent) using LLM/ToolManager fakes or existing mocks in `mocks/` + - Verify artifacts ingestion triggers file/canvas updates as before +- E2E + - Re-use `./test/run_tests.sh all` as-is (per project docs) +- Migration plan + - Phased PRs per phase above; each PR keeps tests green + - Introduce DTOs and strategies without changing routes or API payloads + - Keep `ChatService` API stable; wire new orchestrator under the hood +- Rollback plan + - Each phase is reversible by toggling orchestrator/strategy injection back to legacy code path for that mode + +--- + +## Risks and mitigations + +- Behavior drift in event ordering or content + - Mitigation: capture golden recordings of `notification_utils` calls in tests before refactor; assert on order and payloads +- Tool ACL discrepancies + - Mitigation: explicit tests with multiple server names (including underscores) and the canvas special-case +- Async/event coupling in Agent mode + - Mitigation: encapsulate AgentEventRelay; keep exact mapping semantics; add tests for sequence of events +- Message shape mismatches + - Mitigation: introduce LLMMessage early; add adapters where legacy dicts still exist +- MCP prompt variations + - Mitigation: preserve robust parsing with fallback to `str(prompt_obj)`; unit tests with multiple prompt shapes + +--- + +## Small adjacent improvements + +- Replace ad-hoc log sanitization calls with a `LogContext` helper used consistently. +- Cache tool schemas and MCP prompts per session to reduce repeated lookups. +- Standardize metadata keys in assistant messages, e.g., `{ "mode": "tools", "tools": [...], "data_sources": [...] }`. + +--- + +## Next steps + +- Start with Phase 1 (policies + preprocessors) — lowest risk, highest clarity gain. +- Scaffold modules and wire them into `ChatService` without changing behavior. +- Add focused unit tests for new modules and keep integration/E2E tests passing. diff --git a/frontend/src/components/SettingsPanel.jsx b/frontend/src/components/SettingsPanel.jsx index 5491581..6c8f841 100644 --- a/frontend/src/components/SettingsPanel.jsx +++ b/frontend/src/components/SettingsPanel.jsx @@ -5,7 +5,8 @@ const SettingsPanel = ({ isOpen, onClose }) => { // Default settings const defaultSettings = { llmTemperature: 0.7, - maxIterations: 10 + maxIterations: 10, + agentLoopStrategy: 'think-act' } // State for settings @@ -149,6 +150,39 @@ const SettingsPanel = ({ isOpen, onClose }) => { + {/* Agent Loop Strategy Setting */} +
+ Think-Act: Concise, unified reasoning approach. + Faster iterations with fewer LLM calls. Better for most workflows and quick tasks. +
++ ReAct: Structured reasoning with Reason-Act-Observe phases. + Better for complex tasks requiring multiple tools and detailed planning. Slower but more thorough. +
++ Act: Pure action loop without explicit reasoning steps. + Fastest strategy with minimal overhead. LLM calls tools directly and signals completion via the "finished" tool. +
+