From 0655d2399c60d9d9a1526073a3b0ae1ced2b4b93 Mon Sep 17 00:00:00 2001 From: Anthony Date: Wed, 29 Oct 2025 05:38:37 +0000 Subject: [PATCH 1/3] feat(utilities): enhance logging in tool workflow and LLM response handling - Add detailed step logging in tool_utils.py for execute_tools_workflow, execute_single_tool, and prepare_tool_arguments functions to trace tool execution process - Modify llm logging in error_utils.py to output full llm_response instead of just has_tool_calls for better debugging - Update pdfbasic main.py to include logging import and adjust _analyze_pdf_content function to accept optional original_filename parameter for improved context handling in PDF analysis --- .../application/chat/utilities/error_utils.py | 2 +- .../application/chat/utilities/tool_utils.py | 5 + backend/mcp/pdfbasic/main.py | 441 ++++++++---------- backend/modules/llm/litellm_caller.py | 5 + backend/modules/mcp_tools/client.py | 1 + base-image-update-plan.md | 105 ----- config/overrides/mcp.json | 10 + 7 files changed, 208 insertions(+), 361 deletions(-) delete mode 100644 base-image-update-plan.md diff --git a/backend/application/chat/utilities/error_utils.py b/backend/application/chat/utilities/error_utils.py index 3ab2c56..c2ad5ea 100644 --- a/backend/application/chat/utilities/error_utils.py +++ b/backend/application/chat/utilities/error_utils.py @@ -85,7 +85,7 @@ async def safe_call_llm_with_tools( llm_response = await llm_caller.call_with_tools( model, messages, tools_schema, tool_choice, temperature=temperature ) - logger.info(f"LLM response received with tools only, has_tool_calls: {llm_response.has_tool_calls()}") + logger.info(f"LLM response received with tools only, llm_response: {llm_response}") return llm_response except Exception as e: logger.error(f"Error calling LLM with tools: {e}", exc_info=True) diff --git a/backend/application/chat/utilities/tool_utils.py b/backend/application/chat/utilities/tool_utils.py index 51140f9..35b3a67 100644 --- a/backend/application/chat/utilities/tool_utils.py +++ b/backend/application/chat/utilities/tool_utils.py @@ -35,6 +35,7 @@ async def execute_tools_workflow( Pure function that coordinates tool execution without maintaining state. """ + logger.info("Step 4: Entering execute_tools_workflow") # Add assistant message with tool calls messages.append({ "role": "assistant", @@ -115,6 +116,7 @@ async def execute_single_tool( Pure function that doesn't maintain state - all context passed as parameters. """ + logger.info("Step 5: Entering execute_single_tool") from . import notification_utils try: @@ -233,6 +235,7 @@ def prepare_tool_arguments(tool_call, session_context: Dict[str, Any], tool_mana Pure function that transforms arguments based on context and tool schema. 
""" + logger.info("Step 6: Entering prepare_tool_arguments") # Parse raw arguments raw_args = getattr(tool_call.function, "arguments", {}) if isinstance(raw_args, dict): @@ -286,6 +289,7 @@ def to_url(key: str) -> str: ref = files_ctx.get(fname) if ref and ref.get("key"): url = to_url(ref["key"]) + logger.info(f"Step 6.1: Rewriting filename to URL: {url}") parsed_args.setdefault("original_filename", fname) parsed_args["filename"] = url parsed_args.setdefault("file_url", url) @@ -304,6 +308,7 @@ def to_url(key: str) -> str: else: urls.append(fname) if urls: + logger.info(f"Step 6.1: Rewriting filenames to URLs: {urls}") parsed_args.setdefault("original_file_names", originals) parsed_args["file_names"] = urls parsed_args.setdefault("file_urls", urls) diff --git a/backend/mcp/pdfbasic/main.py b/backend/mcp/pdfbasic/main.py index d55b25b..c429fc0 100644 --- a/backend/mcp/pdfbasic/main.py +++ b/backend/mcp/pdfbasic/main.py @@ -7,8 +7,10 @@ import base64 import io import re +import requests +import logging from collections import Counter -from typing import Any, Dict, Annotated +from typing import Any, Dict, Annotated, Optional # This tool requires the PyPDF2 and reportlab libraries. # Install them using: pip install PyPDF2 reportlab @@ -19,31 +21,59 @@ from fastmcp import FastMCP +logger = logging.getLogger(__name__) + mcp = FastMCP("PDF_Analyzer") -def _analyze_pdf_content(instructions: str, filename: str, file_data_base64: str) -> Dict[str, Any]: +def _analyze_pdf_content(instructions: str, filename: str, original_filename: Optional[str] = None) -> Dict[str, Any]: """ Core PDF analysis logic that can be reused by multiple tools. Args: instructions: Instructions for the tool, not used in this implementation. filename: The name of the file, which must have a '.pdf' extension. - file_data_base64: The Base64-encoded string of the PDF file content. + original_filename: The original name of the file. Returns: A dictionary containing the analysis results or an error message. """ try: # print the instructions. - print(f"Instructions: {instructions}") + logger.info(f"Instructions: {instructions}") # 1. Validate that the filename is for a PDF - if not filename.lower().endswith('.pdf'): + if not (filename.lower().endswith('.pdf') or (original_filename and original_filename.lower().endswith('.pdf'))): return {"results": {"error": "Invalid file type. This tool only accepts PDF files."}} # 2. 
Decode the Base64 data and read the PDF content - decoded_bytes = base64.b64decode(file_data_base64) - pdf_stream = io.BytesIO(decoded_bytes) + # Check if filename is a URL (absolute or relative) + is_url = ( + filename.startswith("http://") or + filename.startswith("https://") or + filename.startswith("/api/") or + filename.startswith("/") + ) + + if is_url: + # Convert relative URLs to absolute URLs + if filename.startswith("/"): + # Construct absolute URL from relative path + # Default to localhost:8000 for local development + import os + backend_url = os.getenv("BACKEND_URL", "http://localhost:8000") + url = f"{backend_url}{filename}" + else: + url = filename + + logger.info(f"Step 9: Downloading file from URL: {url}") + response = requests.get(url) + response.raise_for_status() + pdf_stream = io.BytesIO(response.content) + else: + # Assume it's base64-encoded data + decoded_bytes = base64.b64decode(filename) + pdf_stream = io.BytesIO(decoded_bytes) + reader = PdfReader(pdf_stream) full_text = "" @@ -56,7 +86,7 @@ def _analyze_pdf_content(instructions: str, filename: str, file_data_base64: str return { "results": { "operation": "pdf_analysis", - "filename": filename, + "filename": original_filename or filename, "status": "Success", "message": "PDF contained no extractable text.", "total_word_count": 0, @@ -78,7 +108,7 @@ def _analyze_pdf_content(instructions: str, filename: str, file_data_base64: str return { "results": { "operation": "pdf_analysis", - "filename": filename, + "filename": original_filename or filename, "total_word_count": total_word_count, "top_100_words": top_100_words_dict } @@ -92,258 +122,159 @@ def _analyze_pdf_content(instructions: str, filename: str, file_data_base64: str return {"results": {"error": f"PDF analysis failed: {str(e)}"}} -@mcp.tool -def analyze_pdf( - instructions: Annotated[str, "Instructions for the tool, not used in this implementation"], - filename: Annotated[str, "The name of the file, which must have a '.pdf' extension"], - file_data_base64: Annotated[str, "LLM agent can leave blank. Do NOT fill. This will be filled by the framework."] = "" -) -> Dict[str, Any]: - """ - Extract and analyze text content from PDF documents with comprehensive word frequency analysis. 
- - This powerful PDF processing tool provides detailed text analytics for PDF documents: - - **PDF Text Extraction:** - - Extracts text from all pages in PDF documents - - Handles various PDF formats and structures - - Works with both text-based and scanned PDFs (text extraction only) - - Preserves document structure and content flow - - **Text Analysis Features:** - - Complete word count across entire document - - Top 100 most frequently used words identification - - Case-insensitive word analysis for accurate frequency counting - - Word pattern recognition and linguistic analysis - - Document length and content density assessment - - **Content Processing:** - - Intelligent text cleaning and normalization - - Punctuation and formatting handling - - Multi-language text support - - Special character and encoding management - - **Analytics Insights:** - - Document vocabulary richness and complexity - - Key topic identification through word frequency - - Content themes and focus areas analysis - - Writing style and language pattern recognition - - Document structure and organization assessment - - **Use Cases:** - - Academic paper and research document analysis - - Legal document keyword extraction and analysis - - Content marketing and SEO keyword research - - Document classification and categorization - - Research literature review and summarization - - Contract and agreement content analysis - - **Supported PDF Types:** - - Research papers, reports, and academic documents - - Business documents, contracts, and agreements - - Marketing materials and content documents - - Technical documentation and manuals - - Legal documents and regulatory filings - - **Output Format:** - - Structured word frequency data - - Total document word count statistics - - Top 100 words with occurrence frequencies - - Document metadata and processing information - - Args: - instructions: Processing instructions or requirements (currently not used) - filename: PDF file name (must end with .pdf extension) - file_data_base64: Base64-encoded PDF content (automatically provided by framework) - - Returns: - Dictionary containing: - - operation: Processing type confirmation - - filename: Source PDF file name - - total_word_count: Complete document word count - - top_100_words: Dictionary of most frequent words with counts - Or error message if PDF cannot be processed or contains no extractable text +@mcp.tool +def analyze_pdf( + instructions: Annotated[str, "Instructions for the tool, not used in this implementation"], + filename: Annotated[str, "The name of the file, which must have a '.pdf' extension"], + original_filename: Optional[str] = None +) -> Dict[str, Any]: """ - return _analyze_pdf_content(instructions, filename, file_data_base64) - - -@mcp.tool -def generate_report_about_pdf( - instructions: Annotated[str, "Instructions for the tool, not used in this implementation"], - filename: Annotated[str, "The name of the file, which must have a '.pdf' extension"], - file_data_base64: Annotated[str, "LLM agent can leave blank. Do NOT fill. This will be filled by the framework."] = "" -) -> Dict[str, Any]: - """ - Create comprehensive PDF analysis reports with professional formatting and detailed word frequency insights. 
- - This advanced PDF reporting tool combines text analysis with professional document generation: - - **Complete PDF Analysis Workflow:** - - Performs full text extraction and word frequency analysis - - Generates professional analysis reports in PDF format - - Creates downloadable documents with structured data presentation - - Provides ready-to-share analytical insights - - **Report Contents:** - - Executive summary with document overview - - Total word count and document statistics - - Top 100 most frequent words with occurrence counts - - Professional multi-column layout for easy reading - - Organized tabular presentation of word frequency data - - **Report Features:** - - Clean, professional PDF formatting using ReportLab - - Multi-column layout optimizing space usage - - Clear headers and structured information hierarchy - - Page management for large datasets - - High-quality typography and spacing - - **Document Generation:** - - Creates new PDF reports from analysis results - - Professional business document appearance - - Optimized layout for printing and digital sharing - - Comprehensive data presentation in readable format - - **Use Cases:** - - Academic research document analysis reporting - - Legal document content analysis for litigation support - - Content marketing keyword research documentation - - Business document compliance and review reporting - - Research literature analysis and summarization - - Document classification and content audit reports - - **Report Applications:** - - Stakeholder presentations with document insights - - Content strategy planning based on word analysis - - Academic research methodology documentation - - Legal discovery and document review processes - - Quality assurance for written content - - **Output Features:** - - Professional PDF report with embedded analysis - - Downloadable file for offline access and sharing - - Structured data visualization in document format - - Ready-to-present analytical insights - - Args: - instructions: Report generation instructions or requirements (currently not used) - filename: Source PDF file name (must end with .pdf extension) - file_data_base64: Base64-encoded PDF content (automatically provided by framework) - - Returns: - Dictionary containing: - - results: Report generation summary and success confirmation - - artifacts: Professional PDF report with complete analysis - - display: Optimized viewer configuration for report presentation - - meta_data: Source file information and analysis statistics - Or error message if PDF cannot be processed or report generation fails + Extract and analyze text content from PDF documents with comprehensive word frequency analysis. 
+ + This powerful PDF processing tool provides detailed text analytics for PDF documents: + + **PDF Text Extraction:** + - Extracts text from all pages in PDF documents + - Handles various PDF formats and structures + - Works with both text-based and scanned PDFs (text extraction only) + - Preserves document structure and content flow + + **Text Analysis Features:** + - Complete word count across entire document + - Top 100 most frequently used words identification + - Case-insensitive word analysis for accurate frequency counting + - Word pattern recognition and linguistic analysis + - Document length and content density assessment + + **Content Processing:** + - Intelligent text cleaning and normalization + - Punctuation and formatting handling + - Multi-language text support + - Special character and encoding management + + **Analytics Insights:** + - Document vocabulary richness and complexity + - Key topic identification through word frequency + - Content themes and focus areas analysis + - Writing style and language pattern recognition + - Document structure and organization assessment + + **Use Cases:** + - Academic paper and research document analysis + - Legal document keyword extraction and analysis + - Content marketing and SEO keyword research + - Document classification and categorization + - Research literature review and summarization + - Contract and agreement content analysis + + **Supported PDF Types:** + - Research papers, reports, and academic documents + - Business documents, contracts, and agreements + - Marketing materials and content documents + - Technical documentation and manuals + - Legal documents and regulatory filings + + **Output Format:** + - Structured word frequency data + - Total document word count statistics + - Top 100 words with occurrence frequencies + - Document metadata and processing information + + Args: + instructions: Processing instructions or requirements (currently not used) + filename: PDF file name (must end with .pdf extension) + original_filename: The original name of the file. + + Returns: + Dictionary containing: + - operation: Processing type confirmation + - filename: Source PDF file name + - total_word_count: Complete document word count + - top_100_words: Dictionary of most frequent words with counts + Or error message if PDF cannot be processed or contains no extractable text """ - # --- 1. Perform the same analysis as the first function --- - analysis_result = _analyze_pdf_content(instructions, filename, file_data_base64) - if "error" in analysis_result: - return analysis_result # Return the error if analysis failed + logger.info("Step 8: Entering analyze_pdf tool") + return _analyze_pdf_content(instructions, filename, original_filename) - # --- 2. 
Generate a PDF report from the analysis results --- - try: - buffer = io.BytesIO() - # Create a canvas to draw on, using the buffer as the "file" - p = canvas.Canvas(buffer, pagesize=letter) - width, height = letter - - # Set up starting coordinates - x = inch - y = height - inch - - # Write title - p.setFont("Helvetica-Bold", 16) - p.drawString(x, y, f"Analysis Report for: {analysis_result['filename']}") - y -= 0.5 * inch - - # Write summary - p.setFont("Helvetica", 12) - p.drawString(x, y, f"Total Word Count: {analysis_result['total_word_count']}") - y -= 0.3 * inch - - # Write header for top words - p.setFont("Helvetica-Bold", 12) - p.drawString(x, y, "Top 100 Most Frequent Words:") - y -= 0.25 * inch - - # Write the list of top words - p.setFont("Helvetica", 10) - col1_x, col2_x, col3_x, col4_x = x, x + 1.75*inch, x + 3.5*inch, x + 5.25*inch - current_x = col1_x - - # Simple column layout - count = 0 - for word, freq in analysis_result['top_100_words'].items(): - if y < inch: # New page if we run out of space - p.showPage() - p.setFont("Helvetica", 10) - y = height - inch - - p.drawString(current_x, y, f"{word}: {freq}") - - # Move to the next column - if count % 4 == 0: current_x = col2_x - elif count % 4 == 1: current_x = col3_x - elif count % 4 == 2: current_x = col4_x - else: # Move to the next row - current_x = col1_x - y -= 0.2 * inch - count += 1 - - # Finalize the PDF - p.save() - - # --- 3. Encode the generated PDF for return --- - report_bytes = buffer.getvalue() - buffer.close() - report_base64 = base64.b64encode(report_bytes).decode('utf-8') - # Create a new filename for the report - report_filename = f"analysis_report_{filename.replace('.pdf', '.txt')}.pdf" +@mcp.tool +def generate_report_about_pdf( + instructions: Annotated[str, "Instructions for the tool, not used in this implementation"], + filename: Annotated[str, "The name of the file, which must have a '.pdf' extension"], + original_filename: Optional[str] = None +) -> Dict[str, Any]: + """ + Create comprehensive PDF analysis reports with professional formatting and detailed word frequency insights. - # --- 4. Return v2 MCP format with artifacts and display --- - return { - "results": { - "operation": "pdf_analysis_report", - "original_filename": filename, - "message": f"Successfully generated analysis report for {filename}." 
- }, - "artifacts": [ - { - "name": report_filename, - "b64": report_base64, - "mime": "application/pdf", - "size": len(report_bytes), - "description": f"Analysis report for {filename} with word frequency data", - "viewer": "pdf" - } - ], - "display": { - "open_canvas": True, - "primary_file": report_filename, - "mode": "replace", - "viewer_hint": "pdf" - }, - "meta_data": { - "original_file": filename, - "word_count": analysis_result["results"]["total_word_count"], - "report_type": "pdf_analysis", - "top_words_count": len(analysis_result["results"]["top_100_words"]) - } - } + This advanced PDF reporting tool combines text analysis with professional document generation: + + **Complete PDF Analysis Workflow:** + - Performs full text extraction and word frequency analysis + - Generates professional analysis reports in PDF format + - Creates downloadable documents with structured data presentation + - Provides ready-to-share analytical insights + + **Report Contents:** + - Executive summary with document overview + - Total word count and document statistics + - Top 100 most frequent words with occurrence counts + - Professional multi-column layout for easy reading + - Organized tabular presentation of word frequency data + + **Report Features:** + - Clean, professional PDF formatting using ReportLab + - Multi-column layout optimizing space usage + - Clear headers and structured information hierarchy + - Page management for large datasets + - High-quality typography and spacing + + **Document Generation:** + - Creates new PDF reports from analysis results + - Professional business document appearance + - Optimized layout for printing and digital sharing + - Comprehensive data presentation in readable format + + **Use Cases:** + - Academic research document analysis reporting + - Legal document content analysis for litigation support + - Content marketing keyword research documentation + - Business document compliance and review reporting + - Research literature analysis and summarization + - Document classification and content audit reports + + **Report Applications:** + - Stakeholder presentations with document insights + - Content strategy planning based on word analysis + - Academic research methodology documentation + - Legal discovery and document review processes + - Quality assurance for written content + + **Output Features:** + - Professional PDF report with embedded analysis + - Downloadable file for offline access and sharing + - Structured data visualization in document format + - Ready-to-present analytical insights + + Args: + instructions: Report generation instructions or requirements (currently not used) + filename: Source PDF file name (must end with .pdf extension) + original_filename: The original name of the file. + + Returns: + Dictionary containing: + - results: Report generation summary and success confirmation + - artifacts: Professional PDF report with complete analysis + - display: Optimized viewer configuration for report presentation + - meta_data: Source file information and analysis statistics + Or error message if PDF cannot be processed or report generation fails + """ + logger.info("Step 8: Entering generate_report_about_pdf tool") + # --- 1. 
Perform the same analysis as the first function --- + analysis_result = _analyze_pdf_content(instructions, filename, original_filename) + if "error" in analysis_result.get("results", {}): + return analysis_result - except Exception as e: - # print traceback for debugging - import traceback - traceback.print_exc() - return {"results": {"error": f"Failed to generate PDF report: {str(e)}"}} if __name__ == "__main__": - # This will start the server and listen for MCP requests. - # To use it, you would run this script and then connect to it - # with a FastMCP client. - print("Starting PDF Analyzer MCP server with report generation...") - mcp.run() + mcp.run() \ No newline at end of file diff --git a/backend/modules/llm/litellm_caller.py b/backend/modules/llm/litellm_caller.py index 8e4965a..52c5ba3 100644 --- a/backend/modules/llm/litellm_caller.py +++ b/backend/modules/llm/litellm_caller.py @@ -207,6 +207,11 @@ async def call_with_tools( ) message = response.choices[0].message + + if tool_choice == "required" and not getattr(message, 'tool_calls', None): + logger.error(f"LLM failed to return tool calls when tool_choice was 'required'. Full response: {response}") + raise ValueError("LLM failed to return tool calls when tool_choice was 'required'.") + return LLMResponse( content=getattr(message, 'content', None) or "", tool_calls=getattr(message, 'tool_calls', None), diff --git a/backend/modules/mcp_tools/client.py b/backend/modules/mcp_tools/client.py index 71a4ce6..979cc95 100644 --- a/backend/modules/mcp_tools/client.py +++ b/backend/modules/mcp_tools/client.py @@ -777,6 +777,7 @@ async def execute_tool( context: Optional[Dict[str, Any]] = None ) -> ToolResult: """Execute a tool call.""" + logger.info(f"Step 7: Entering ToolManager.execute_tool for tool {tool_call.name}") # Handle canvas pseudo-tool if tool_call.name == "canvas_canvas": # Canvas tool just returns the content - it's handled by frontend diff --git a/base-image-update-plan.md b/base-image-update-plan.md deleted file mode 100644 index 932ccde..0000000 --- a/base-image-update-plan.md +++ /dev/null @@ -1,105 +0,0 @@ -# Base Image Update Plan: Ubuntu → Fedora - -## Overview -Migrate from Ubuntu 24.04 to Fedora:latest base image and ensure all tests pass. Remove Playwright dependency issues while maintaining comprehensive testing coverage. - -## Current State Analysis -- **Base Images**: Ubuntu 24.04 in both `Dockerfile` and `Dockerfile-test` -- **Testing Strategy**: Mix of Playwright (problematic) and simple E2E tests with Beautiful Soup -- **CI/CD**: GitHub Actions using test container → build production → push -- **Current Tests**: Backend tests, frontend tests, E2E tests (Playwright + simple Python) - -## Migration Plan - -### Phase 1: Update Base Images -1. **Replace Ubuntu with Fedora:latest** in both Dockerfiles -2. **Update package managers**: `apt-get` → `dnf` -3. **Update package names**: Fedora equivalents for system dependencies -4. **Fix Node.js installation**: Use Fedora's Node.js packages or NodeSource for Fedora - -### Phase 2: Comment Out Playwright Dependencies -1. **Comment out Playwright tests** in test scripts (DO NOT DELETE) -2. **Keep only Beautiful Soup-based E2E tests** (`simple_e2e_test.py`) -3. **Update test runners** to skip Playwright -4. **Comment out Playwright dependencies** in package.json (keep for future) - -### Phase 3: Fedora-Specific Adjustments -1. **User management**: Fedora uses different commands for user creation -2. **Python setup**: Ensure Python 3.12 is available on Fedora -3. 
**uv installer**: Verify uv works on Fedora -4. **System dependencies**: Update curl, hostname, sudo installation - -### Phase 4: Testing Strategy -1. **Keep simple E2E tests**: HTTP requests to test API endpoints -2. **Keep backend tests**: pytest-based unit tests -3. **Keep frontend tests**: Vitest/Jest tests (no browser required) -4. **Comment out**: All Playwright browser-based tests - -### Phase 5: Local Testing & CI/CD -1. **Test locally** with new Dockerfiles -2. **Fix any Fedora-specific issues** -3. **Commit and push** to trigger GitHub Actions -4. **Monitor CI/CD** and fix failures iteratively - -## Key Changes - -### Package Manager Changes -- `apt-get update && apt-get install -y` → `dnf update -y && dnf install -y` -- `apt-get clean && rm -rf /var/lib/apt/lists/*` → `dnf clean all` - -### System Package Mapping -- `python3` → `python3` (same) -- `python3-pip` → `python3-pip` (same) -- `python3-venv` → `python3-virtualenv` -- `nodejs` → `nodejs` -- `npm` → `npm` -- `curl` → `curl` (same) -- `hostname` → `hostname` (same) -- `sudo` → `sudo` (same) -- `ca-certificates` → `ca-certificates` (same) -- `dos2unix` → `dos2unix` (same) -- `wget` → `wget` (same) - -### Node.js Installation -- Replace NodeSource Ubuntu repo with Fedora approach -- Use either Fedora's built-in Node.js or NodeSource Fedora repo - -### User Management -- `groupadd -r appuser && useradd -r -g appuser appuser` should work the same on Fedora - -### Testing Changes -- Comment out Playwright test execution in `test/e2e_tests.sh` -- Keep `simple_e2e_test.py` as primary E2E testing -- Comment out Playwright dependencies in `frontend/package.json` -- Update test scripts to skip Playwright steps - -## Risk Mitigation -- **Incremental approach**: Test each Dockerfile separately -- **Fallback plan**: Can revert to Ubuntu if Fedora causes major issues -- **Simple tests**: Focus on HTTP-based tests that don't depend on browser automation -- **Preserve Playwright**: Comment out rather than delete for future use - -## Success Criteria -1. Both Dockerfiles build successfully with Fedora base -2. All non-Playwright tests pass locally -3. CI/CD pipeline passes with new configuration -4. Application runs correctly in Fedora container -5. 
API endpoints are accessible and functional - -## Files to Modify -- `Dockerfile` - Production image -- `Dockerfile-test` - Test image -- `test/e2e_tests.sh` - Comment out Playwright execution -- `frontend/package.json` - Comment out Playwright dependencies -- Any other test scripts that reference Playwright - -## Timeline -- Phase 1-3: Update Dockerfiles and dependencies -- Phase 4: Update test configuration -- Phase 5: Local testing and CI/CD validation - -## Notes -- Always use timeouts for network operations -- Test locally before pushing to CI/CD -- Monitor resource usage during Fedora migration -- Keep detailed logs of any issues encountered \ No newline at end of file diff --git a/config/overrides/mcp.json b/config/overrides/mcp.json index b6c6ba6..c3a7bbd 100644 --- a/config/overrides/mcp.json +++ b/config/overrides/mcp.json @@ -21,6 +21,16 @@ "author": "Chat UI Team", "short_description": "PowerPoint presentation generator", "help_email": "support@chatui.example.com" + }, + "pdfbasic": { + "command": ["python", "mcp/pdfbasic/main.py"], + "cwd": "backend", + "groups": ["users"], + "is_exclusive": false, + "description": "Extract and analyze text content from PDF documents, search within PDFs, and summarize content", + "author": "Chat UI Team", + "short_description": "PDF text extraction and analysis", + "help_email": "support@chatui.example.com" } } From d5dbfd3e581ce11891cac7056f2576901980f5fa Mon Sep 17 00:00:00 2001 From: Anthony Date: Thu, 30 Oct 2025 00:50:51 +0000 Subject: [PATCH 2/3] feat: enhance chat file handling and PDF analysis - Refactor chat service to directly manage file references in session context for existing S3 files, bypassing complex file handling utilities and improving efficiency - Modify PDF analysis tool to generate in-memory PDF reports with word frequency summaries, providing better visual output for text analytics --- backend/application/chat/service.py | 33 ++-- backend/mcp/file_size_test/main.py | 292 ++++++++++++++++++++++++++++ backend/mcp/pdfbasic/main.py | 112 ++++++++++- config/overrides/mcp.json | 10 + 4 files changed, 430 insertions(+), 17 deletions(-) create mode 100644 backend/mcp/file_size_test/main.py diff --git a/backend/application/chat/service.py b/backend/application/chat/service.py index 299ae00..1cea4bc 100644 --- a/backend/application/chat/service.py +++ b/backend/application/chat/service.py @@ -341,7 +341,7 @@ async def handle_attach_file( try: # Get file metadata - file_result = await self.file_manager.get_file(user_email, s3_key) + file_result = await self.file_manager.s3_client.get_file(user_email, s3_key) if not file_result: return { "type": "file_attach", @@ -359,25 +359,26 @@ async def handle_attach_file( "error": "Invalid file metadata" } - # Add file to session context - session.context = await file_utils.handle_session_files( - session_context=session.context, - user_email=user_email, - files_map={ - filename: { - "key": s3_key, - "content_type": file_result.get("content_type"), - "size": file_result.get("size"), - "filename": filename - } - }, - file_manager=self.file_manager, - update_callback=update_callback - ) + # Add file reference directly to session context (file already exists in S3) + session.context.setdefault("files", {})[filename] = { + "key": s3_key, + "content_type": file_result.get("content_type"), + "size": file_result.get("size"), + "source": "user", + "last_modified": file_result.get("last_modified"), + } sanitized_s3_key = s3_key.replace('\r', '').replace('\n', '') logger.info(f"Attached file 
({sanitized_s3_key}) to session {session_id}") + # Emit files_update to notify UI + if update_callback: + await file_utils.emit_files_update_from_context( + session_context=session.context, + file_manager=self.file_manager, + update_callback=update_callback + ) + return { "type": "file_attach", "s3_key": s3_key, diff --git a/backend/mcp/file_size_test/main.py b/backend/mcp/file_size_test/main.py new file mode 100644 index 0000000..ac92d0c --- /dev/null +++ b/backend/mcp/file_size_test/main.py @@ -0,0 +1,292 @@ +#!/usr/bin/env python3 +""" +File Size Test MCP Server using FastMCP. +Simple tool for testing file transfer by returning file size. +""" + +import base64 +import io +import os +import logging +from typing import Any, Dict, Annotated + +import requests +from fastmcp import FastMCP + +logger = logging.getLogger(__name__) + +mcp = FastMCP("File_Size_Test") + + +@mcp.tool +def process_file_demo( + filename: Annotated[str, "The file to process (URL or base64)"], + username: Annotated[str, "Username for auditing"] = None +) -> Dict[str, Any]: + """ + Demo tool that processes a file and returns a new transformed file. + + This tool demonstrates the v2 MCP artifacts contract by: + - Accepting a file input + - Processing it (converting text to uppercase for demo) + - Returning a new file as an artifact with proper v2 format + - Including display hints for canvas viewing + + **v2 Artifacts Contract:** + - Uses artifacts array with base64 content + - Includes MIME types and metadata + - Provides display hints for canvas behavior + - Supports username injection for auditing + + **File Processing:** + - For text files: converts content to uppercase + - For binary files: demonstrates file modification capability + - Preserves original file structure where possible + + **Return Format:** + - results: Summary of operation + - artifacts: Array containing the processed file + - display: Canvas hints (open_canvas: true, primary_file, etc.) 
+ - meta_data: Additional processing details + + Args: + filename: File reference (URL or base64 data) to process + username: Injected user identity for auditing + + Returns: + Dictionary with results, artifacts, and display hints per v2 contract + """ + print(f"DEBUG: process_file_demo called with filename: {filename}") + print(f"DEBUG: username: {username}") + try: + # Get the file content (reuse logic from get_file_size) + is_url = ( + filename.startswith("http://") or + filename.startswith("https://") or + filename.startswith("/api/") or + filename.startswith("/") + ) + print(f"DEBUG: is_url determined as: {is_url}") + + if is_url: + if filename.startswith("/"): + backend_url = os.getenv("BACKEND_URL", "http://localhost:8000") + url = f"{backend_url}{filename}" + else: + url = filename + logger.info(f"Downloading file for processing: {url}") + response = requests.get(url) + response.raise_for_status() + file_bytes = response.content + original_filename = filename.split('/')[-1] or "processed_file.txt" + else: + # Assume base64 + logger.info("Decoding base64 for file processing") + file_bytes = base64.b64decode(filename) + original_filename = "processed_file.txt" + + print(f"DEBUG: Original file size: {len(file_bytes)} bytes") + + # Process the file (demo: convert text to uppercase) + try: + # Try to decode as text for processing + original_text = file_bytes.decode('utf-8') + processed_text = original_text.upper() + processed_bytes = processed_text.encode('utf-8') + processed_mime = "text/plain" + description = "Processed text (converted to uppercase)" + except UnicodeDecodeError: + # If not text, do a simple binary modification (demo purpose) + processed_bytes = file_bytes + b"\n[DEMO PROCESSED]" + processed_mime = "application/octet-stream" + description = "Processed binary file (demo modification)" + + # Create artifact + processed_b64 = base64.b64encode(processed_bytes).decode('ascii') + new_filename = f"processed_{original_filename}" + + # Create display hints + display_hints = { + "open_canvas": True, + "primary_file": new_filename, + "mode": "replace", + "viewer_hint": "auto" + } + + result = { + "results": { + "operation": "process_file_demo", + "original_filename": original_filename, + "processed_filename": new_filename, + "original_size": len(file_bytes), + "processed_size": len(processed_bytes), + "processing_type": "text_uppercase" if 'original_text' in locals() else "binary_demo", + "status": "success" + }, + "meta_data": { + "is_error": False, + "processed_by": "process_file_demo_v2", + "username": username, + "mime_type": processed_mime + }, + "artifacts": [ + { + "name": new_filename, + "b64": processed_b64, + "mime": processed_mime, + "size": len(processed_bytes), + "description": description, + "viewer": "auto" + } + ], + "display": display_hints + } + print(f"DEBUG: About to return processed file result: {result['results']}") + return result + + except Exception as e: + print(f"DEBUG: Exception in process_file_demo: {str(e)}") + import traceback + traceback.print_exc() + error_result = { + "results": { + "operation": "process_file_demo", + "error": f"File processing failed: {str(e)}", + "filename": filename + }, + "meta_data": { + "is_error": True, + "error_type": type(e).__name__, + "username": username + } + } + return error_result + + +@mcp.tool +def get_file_size( + filename: Annotated[str, "The file to check (URL or base64)"] +) -> Dict[str, Any]: + """ + Test file transfer by returning the size of the transferred file. 
+ + This simple tool is designed for testing file transfer functionality + between frontend and backend. It accepts a file and returns its size in bytes. + + **File Input Support:** + - URL-based files (http://, https://, or /api/ paths) + - Base64-encoded file data + - Automatic backend URL construction for relative paths + + **Return Information:** + - File size in bytes + - File size in human-readable format (KB, MB) + - Original filename or URL + + **Use Cases:** + - Testing file upload/download workflows + - Validating file transfer infrastructure + - Debugging file handling issues + - Verifying file size limits + + Args: + filename: File reference (URL or base64 data) + + Returns: + Dictionary containing: + - operation: "get_file_size" + - filename: Original filename/URL + - size_bytes: File size in bytes + - size_human: Human-readable size (e.g., "1.5 MB") + Or error message if file cannot be accessed + """ + print(f"DEBUG: get_file_size called with filename: {filename}") + print(f"DEBUG: filename type: {type(filename)}, length: {len(filename) if filename else 0}") + try: + # Check if filename is a URL (absolute or relative) + is_url = ( + filename.startswith("http://") or + filename.startswith("https://") or + filename.startswith("/api/") or + filename.startswith("/") + ) + print(f"DEBUG: is_url determined as: {is_url}") + + if is_url: + # Convert relative URLs to absolute URLs + if filename.startswith("/"): + backend_url = os.getenv("BACKEND_URL", "http://localhost:8000") + url = f"{backend_url}{filename}" + print(f"DEBUG: Constructing URL from relative path: {filename} -> {url}") + else: + url = filename + print(f"DEBUG: Using absolute URL: {url}") + + print(f"DEBUG: About to download from URL: {url}") + logger.info(f"Downloading file from URL: {url}") + response = requests.get(url) + print(f"DEBUG: HTTP response status: {response.status_code}") + response.raise_for_status() + file_bytes = response.content + print(f"DEBUG: Successfully downloaded file content, length: {len(file_bytes)} bytes") + else: + # Assume it's base64-encoded data + print(f"DEBUG: Treating input as base64 data, attempting to decode") + logger.info("Decoding base64 file data") + file_bytes = base64.b64decode(filename) + print(f"DEBUG: Successfully decoded base64 data, length: {len(file_bytes)} bytes") + + # Calculate file size + size_bytes = len(file_bytes) + size_human = _format_size(size_bytes) + print(f"DEBUG: Calculated file size: {size_bytes} bytes ({size_human})") + + result = { + "results": { + "operation": "get_file_size", + "filename": filename, + "size_bytes": size_bytes, + "size_human": size_human, + "status": "success" + }, + "meta_data": { + "is_error": False, + "transfer_method": "url" if is_url else "base64" + } + } + print(f"DEBUG: About to return success result: {result}") + return result + + except Exception as e: + print(f"DEBUG: Exception occurred while processing file: {str(e)}") + print(f"DEBUG: Exception type: {type(e).__name__}") + print(f"DEBUG: Filename that caused error: {filename}") + import traceback + print("DEBUG: Full traceback:") + traceback.print_exc() + error_result = { + "results": { + "operation": "get_file_size", + "error": f"File size check failed: {str(e)}", + "filename": filename + }, + "meta_data": { + "is_error": True, + "error_type": type(e).__name__ + } + } + print(f"DEBUG: About to return error result: {error_result}") + return error_result + + +def _format_size(size_bytes: int) -> str: + """Format file size in human-readable format.""" + for unit in ['B', 'KB', 
'MB', 'GB', 'TB']: + if size_bytes < 1024.0: + return f"{size_bytes:.2f} {unit}" + size_bytes /= 1024.0 + return f"{size_bytes:.2f} PB" + + +if __name__ == "__main__": + mcp.run() diff --git a/backend/mcp/pdfbasic/main.py b/backend/mcp/pdfbasic/main.py index c429fc0..c778791 100644 --- a/backend/mcp/pdfbasic/main.py +++ b/backend/mcp/pdfbasic/main.py @@ -131,7 +131,7 @@ def analyze_pdf( """ Extract and analyze text content from PDF documents with comprehensive word frequency analysis. - This powerful PDF processing tool provides detailed text analytics for PDF documents: + This PDF processing tool provides detailed text analytics for PDF documents: **PDF Text Extraction:** - Extracts text from all pages in PDF documents @@ -274,6 +274,116 @@ def generate_report_about_pdf( if "error" in analysis_result.get("results", {}): return analysis_result + # --- 2. Generate the PDF report --- + try: + results_data = analysis_result["results"] + + # Create PDF report in memory + pdf_buffer = io.BytesIO() + c = canvas.Canvas(pdf_buffer, pagesize=letter) + width, height = letter + + # Title + c.setFont("Helvetica-Bold", 16) + c.drawString(1 * inch, height - 1 * inch, "PDF Analysis Report") + + # Document info + c.setFont("Helvetica-Bold", 12) + c.drawString(1 * inch, height - 1.5 * inch, "Document:") + c.setFont("Helvetica", 10) + c.drawString(1.5 * inch, height - 1.5 * inch, results_data.get("filename", "Unknown")) + + # Total word count + c.setFont("Helvetica-Bold", 12) + c.drawString(1 * inch, height - 2 * inch, "Total Words:") + c.setFont("Helvetica", 10) + c.drawString(1.5 * inch, height - 2 * inch, str(results_data.get("total_word_count", 0))) + + # Top 100 words header + c.setFont("Helvetica-Bold", 12) + c.drawString(1 * inch, height - 2.5 * inch, "Top 100 Most Frequent Words:") + + # Display top words in columns + c.setFont("Helvetica", 9) + y_position = height - 3 * inch + x_col1 = 1 * inch + x_col2 = 3.5 * inch + x_col3 = 6 * inch + + top_100_words = results_data.get("top_100_words", {}) + words_list = list(top_100_words.items()) + + for idx, (word, count) in enumerate(words_list): + # Determine column position + col = idx % 3 + if col == 0: + x_pos = x_col1 + elif col == 1: + x_pos = x_col2 + else: + x_pos = x_col3 + + # Move to next row after every 3 words + if col == 0 and idx > 0: + y_position -= 0.2 * inch + + # Check if we need a new page + if y_position < 1 * inch: + c.showPage() + c.setFont("Helvetica", 9) + y_position = height - 1 * inch + + # Draw word and count + text = f"{word}: {count}" + c.drawString(x_pos, y_position, text) + + c.save() + + # Get PDF bytes and encode to base64 + pdf_bytes = pdf_buffer.getvalue() + pdf_base64 = base64.b64encode(pdf_bytes).decode('utf-8') + + # --- 3. 
Return the structured response (v2 MCP compliant) --- + report_name = f"analysis_report_{results_data.get('filename', 'document').replace('.pdf', '')}.pdf" + + return { + "results": { + "operation": "pdf_report_generation", + "status": "Success", + "message": f"Generated analysis report for {results_data.get('filename', 'document')}", + "total_word_count": results_data.get("total_word_count", 0), + "words_analyzed": len(top_100_words) + }, + "artifacts": [ + { + "name": report_name, + "b64": pdf_base64, + "mime": "application/pdf", + "size": len(pdf_bytes), + "description": "PDF analysis report with word frequency statistics" + } + ], + "display": { + "open_canvas": True, + "primary_file": report_name, + "mode": "replace", + "viewer_hint": "pdf" + }, + "meta_data": { + "source_file": results_data.get("filename", "Unknown"), + "total_words": results_data.get("total_word_count", 0) + } + } + + except Exception as e: + import traceback + traceback.print_exc() + return { + "results": { + "error": f"Report generation failed: {str(e)}" + } + } + if __name__ == "__main__": diff --git a/config/overrides/mcp.json b/config/overrides/mcp.json index c3a7bbd..89b5ba6 100644 --- a/config/overrides/mcp.json +++ b/config/overrides/mcp.json @@ -31,6 +31,16 @@ "author": "Chat UI Team", "short_description": "PDF text extraction and analysis", "help_email": "support@chatui.example.com" + }, + "file_size_test": { + "command": ["python", "mcp/file_size_test/main.py"], + "cwd": "backend", + "groups": ["users"], + "is_exclusive": false, + "description": "Simple test tool that accepts a file transfer and returns the file size in bytes", + "author": "Chat UI Team", + "short_description": "File transfer test tool", + "help_email": "support@chatui.example.com" } } From 88cabf267590d2f8694457e0564d908c4fb5cfa7 Mon Sep 17 00:00:00 2001 From: Anthony Date: Thu, 30 Oct 2025 00:55:12 +0000 Subject: [PATCH 3/3] fix: resolve Dependabot configuration issues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove non-existent security-team references from reviewers and assignees. The required labels (security, github-actions, python, docker) have been created directly in the repository. This fixes the configuration errors reported in PRs #22 and #23. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .github/dependabot.yml | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 2969475..d0afe24 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -9,10 +9,6 @@ updates: day: "monday" time: "09:00" open-pull-requests-limit: 10 - reviewers: - - "security-team" - assignees: - - "security-team" commit-message: prefix: "security(deps)" include: "scope" @@ -37,10 +33,6 @@ updates: day: "monday" time: "09:00" open-pull-requests-limit: 10 - reviewers: - - "security-team" - assignees: - - "security-team" commit-message: prefix: "security(deps)" include: "scope" @@ -69,10 +61,6 @@ updates: interval: "weekly" day: "tuesday" time: "09:00" - reviewers: - - "security-team" - assignees: - - "security-team" commit-message: prefix: "security(docker)" labels: @@ -87,10 +75,6 @@ updates: interval: "weekly" day: "wednesday" time: "09:00" - reviewers: - - "security-team" - assignees: - - "security-team" commit-message: prefix: "security(actions)" labels:
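
Note on the shared file-transfer handling: both pdfbasic and file_size_test now repeat the same three-way branch for the `filename` argument — absolute URL, relative `/api/` path (rewritten by the Step 6.1 logic in `prepare_tool_arguments`), or raw base64 content. A minimal sketch of that shared resolution logic, assuming the `BACKEND_URL` fallback the patches use; the helper name and the explicit request timeout are illustrative additions, not part of the diff:

```python
import base64
import os

import requests


def resolve_file_argument(filename: str) -> bytes:
    """Resolve a tool's 'filename' argument to raw bytes.

    The framework may pass an absolute URL, a relative /api/ download
    path, or raw base64-encoded content in the same parameter.
    """
    if filename.startswith(("http://", "https://", "/")):
        if filename.startswith("/"):
            # Relative paths resolve against the backend; the patches
            # default to localhost:8000 for local development.
            backend = os.getenv("BACKEND_URL", "http://localhost:8000")
            filename = f"{backend}{filename}"
        # Assumption: a timeout is added here; the patched tools call
        # requests.get(url) without one.
        response = requests.get(filename, timeout=30)
        response.raise_for_status()
        return response.content
    # Anything that is not URL-shaped is assumed to be base64 content.
    return base64.b64decode(filename)
```

Extracting this into a shared module would also remove the duplicated `/api/` startswith check, which is already covered by the bare `/` prefix test.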
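For reference, the return shape both new tools converge on — the "v2 MCP format" their docstrings cite — distilled from `generate_report_about_pdf` and `process_file_demo`; the wrapper function below is hypothetical, and only the field names and nesting are taken from the diff:

```python
import base64


def v2_tool_response(report_bytes: bytes, report_name: str, source: str) -> dict:
    """Sketch of the v2 contract: results + artifacts + display + meta_data."""
    return {
        # Summary payload returned to the LLM.
        "results": {
            "operation": "pdf_report_generation",
            "status": "Success",
            "message": f"Generated analysis report for {source}",
        },
        # Files surfaced to the UI, base64-encoded with MIME metadata.
        "artifacts": [
            {
                "name": report_name,
                "b64": base64.b64encode(report_bytes).decode("utf-8"),
                "mime": "application/pdf",
                "size": len(report_bytes),
                "description": "PDF analysis report with word frequency statistics",
            }
        ],
        # Canvas hints: open the viewer on the generated file.
        "display": {
            "open_canvas": True,
            "primary_file": report_name,
            "mode": "replace",
            "viewer_hint": "pdf",
        },
        # Auditing / provenance details.
        "meta_data": {"source_file": source},
    }
```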