diff --git a/.env.example b/.env.example index 0e60bb9..8ba9f6c 100644 --- a/.env.example +++ b/.env.example @@ -78,22 +78,28 @@ FEATURE_AGENT_MODE_AVAILABLE=true AGENT_LOOP_STRATEGY=think-act # (Adjust above to stage rollouts. For a bare-bones chat set them all to false.) -APP_LOG_DIR=/workspaces/atlas-ui-3-11/logs +APP_LOG_DIR=/workspaces/atlas-ui-3/logs CAPABILITY_TOKEN_SECRET=blablah ############################################# # S3/MinIO Storage Configuration ############################################# -# MinIO endpoint (use localhost for local dev, minio for docker-compose) -S3_ENDPOINT=http://localhost:9000 -S3_BUCKET_NAME=atlas-files -S3_ACCESS_KEY=minioadmin -S3_SECRET_KEY=minioadmin -S3_REGION=us-east-1 -S3_TIMEOUT=30 - -S3_USE_SSL=false +# Choose ONE option below (comment out the other) + +# --- Option 1: Mock S3 (Default - No Docker required) --- +USE_MOCK_S3=true + +# --- Option 2: MinIO (Requires Docker) --- +# Uncomment below and set USE_MOCK_S3=false to use MinIO +# USE_MOCK_S3=false +# S3_ENDPOINT=http://localhost:9000 +# S3_BUCKET_NAME=atlas-files # Must match bucket created in docker-compose.yml +# S3_ACCESS_KEY=minioadmin +# S3_SECRET_KEY=minioadmin +# S3_REGION=us-east-1 +# S3_TIMEOUT=30 +# S3_USE_SSL=false SECURITY_CSP_VALUE="default-src 'self'; img-src 'self' data: blob:; script-src 'self'; style-src 'self' 'unsafe-inline'; connect-src 'self'; frame-src 'self' blob: data:; frame-ancestors 'self'" diff --git a/.gitignore b/.gitignore index f18bc00..6d5990d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ *.pptx *.jsonl - +.ruff_cache # Environment variables .env .claude diff --git a/CLAUDE.md b/CLAUDE.md index fa64b0c..3608881 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -61,12 +61,16 @@ mkdir -p logs ```bash bash agent_start.sh ``` -This script handles: killing old processes, clearing logs, building frontend, starting mock S3, and starting backend. +This script handles: killing old processes, clearing logs, building frontend, starting S3 storage (MinIO or Mock based on `USE_MOCK_S3` in `.env`), and starting backend. **Options:** - `bash agent_start.sh -f` - Only rebuild frontend - `bash agent_start.sh -b` - Only restart backend +**Note:** The script automatically reads `USE_MOCK_S3` from `.env`: +- If `true`: Uses in-process Mock S3 (no Docker) +- If `false`: Starts MinIO via docker-compose + ### Manual Development Workflow **Frontend Build (CRITICAL):** @@ -82,11 +86,24 @@ cd backend python main.py # NEVER use uvicorn --reload (causes problems) ``` -**Mock S3 (Optional):** -```bash -cd mocks/s3-mock -python main.py # Runs on http://127.0.0.1:8003 -``` +**S3 Storage (Mock vs MinIO):** + +The project supports two S3 storage backends: + +1. **Mock S3 (Default, Recommended for Development)** + - Set `USE_MOCK_S3=true` in `.env` + - Uses in-process FastAPI TestClient (no Docker required) + - Files stored in `minio-data/chatui/` on disk + - No external server needed - integrated directly into backend + - Faster startup, simpler development workflow + +2. **MinIO (Production-like)** + - Set `USE_MOCK_S3=false` in `.env` + - Requires Docker: `docker-compose up -d minio minio-init` + - Full S3 compatibility with all features + - Use for testing production-like scenarios + +The mock automatically activates when the backend starts if `USE_MOCK_S3=true`. 
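Because both storage backends expose the same async surface, code that goes through `FileManager` does not care which one `USE_MOCK_S3` selects. A minimal sketch of that shared interface, assuming the method signatures used by `MockS3StorageClient` later in this diff (the `StorageClient` protocol name itself is illustrative, not a class that exists in the repo):

```python
from typing import Any, Dict, List, Optional, Protocol


class StorageClient(Protocol):
    """Illustrative shape shared by S3StorageClient and MockS3StorageClient."""

    async def upload_file(
        self,
        user_email: str,
        filename: str,
        content_base64: str,
        content_type: str = "application/octet-stream",
        tags: Optional[Dict[str, str]] = None,
        source_type: str = "user",
    ) -> Dict[str, Any]: ...

    async def get_file(self, user_email: str, file_key: str) -> Optional[Dict[str, Any]]: ...

    async def list_files(
        self, user_email: str, file_type: Optional[str] = None, limit: int = 100
    ) -> List[Dict[str, Any]]: ...

    async def delete_file(self, user_email: str, file_key: str) -> bool: ...

    async def get_user_stats(self, user_email: str) -> Dict[str, Any]: ...
```

Anything written against this surface works unchanged whichever backend `AppFactory` wires in.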
### Testing @@ -246,9 +263,11 @@ Three agent loop strategies implement different reasoning patterns: - **Act** (`backend/application/chat/agent/act_loop.py`): Pure action loop without explicit reasoning steps, fastest with minimal overhead. LLM calls tools directly and signals completion via the "finished" tool ### File Storage -S3-compatible storage via `backend/modules/file_storage/s3_client.py`: -- Production: Real S3 or S3-compatible service -- Development: Mock S3 (`mocks/s3-mock/`) +S3-compatible storage via `backend/modules/file_storage/`: +- Production/MinIO: `s3_client.py` - boto3-based client for real S3/MinIO +- Development: `mock_s3_client.py` - TestClient-based in-process mock (no Docker) +- Controlled by `USE_MOCK_S3` env var (default: true) +- Both implementations share the same interface ### Security Middleware Stack ``` diff --git a/GEMINI.md b/GEMINI.md new file mode 100644 index 0000000..45380c4 --- /dev/null +++ b/GEMINI.md @@ -0,0 +1,65 @@ +# GEMINI.md + +This file provides guidance to the Gemini AI agent when working with code in this repository. + +## Project Overview + +Atlas UI 3 is a full-stack LLM chat interface with Model Context Protocol (MCP) integration, supporting multiple LLM providers (OpenAI, Anthropic Claude, Google Gemini), RAG, and agentic capabilities. + +**Tech Stack:** +- Backend: FastAPI + WebSockets, LiteLLM, FastMCP +- Frontend: React 19 + Vite 7 + Tailwind CSS +- Python Package Manager: **uv** (NOT pip!) +- Configuration: Pydantic with YAML/JSON configs + +## Building and Running + +### Quick Start (Recommended) +```bash +bash agent_start.sh +``` +This script handles: killing old processes, clearing logs, building frontend, and starting the backend. + +### Manual Development Workflow + +**Frontend Build (CRITICAL):** +```bash +cd frontend +npm install +npm run build # Use build, NOT npm run dev (WebSocket issues) +``` + +**Backend Start:** +```bash +cd backend +python main.py # NEVER use uvicorn --reload (causes problems) +``` + +### Testing + +**Run all tests:** +```bash +./test/run_tests.sh all +``` + +**Individual test suites:** +```bash +./test/run_tests.sh backend +./test/run_tests.sh frontend +./test/run_tests.sh e2e +``` + +## Development Conventions + +- **Python Package Manager**: **ALWAYS use `uv`**, never pip or conda. +- **Frontend Development**: **NEVER use `npm run dev`**, it has WebSocket connection problems. Always use `npm run build`. +- **Backend Development**: **NEVER use `uvicorn --reload`**, it causes problems. +- **File Naming**: Do not use generic names like `utils.py` or `helpers.py`. Use descriptive names that reflect the file's purpose. +- **No Emojis**: No emojis should ever be added in this repo. +- **Linting**: Run `ruff check backend/` for Python and `npm run lint` for the frontend before committing. + + +Also read. +/workspaces/atlas-ui-3/.github/copilot-instructions.md + +and CLAUDE.md \ No newline at end of file diff --git a/README.md b/README.md index d7e7045..db8312b 100644 --- a/README.md +++ b/README.md @@ -51,10 +51,7 @@ cp .env.example .env # Edit with your API keys # Build frontend cd frontend && npm install && npm run build -# there is a mock s3 that you might want to enable. Switching to minio sooon. 
-cd mocks/s3-mocks && python main.py - -# Start backend +# Start backend cd ../backend && python main.py # OR the quickest way to start is to use the agent_start.sh @@ -79,6 +76,7 @@ bash agent_start.sh - **Use `npm run build`** instead of `npm run dev` for frontend development - **File limit**: Maximum 400 lines per file for maintainability - **Container Environment**: Use Fedora latest for Docker containers (GitHub Actions uses Ubuntu runners) +- **Mock S3**: The included S3 mock (`mocks/s3-mock/`) is for development/testing only and must NEVER be used in production due to lack of authentication, encryption, and other critical security features. ## License diff --git a/agent_start.sh b/agent_start.sh index bcaf53c..15182d2 100755 --- a/agent_start.sh +++ b/agent_start.sh @@ -20,14 +20,26 @@ done # Configuration USE_NEW_FRONTEND=${USE_NEW_FRONTEND:-true} -# Check if MinIO is running -if ! docker ps | grep -q atlas-minio; then - echo "⚠️ MinIO is not running. Starting MinIO with docker-compose..." - docker-compose up -d minio minio-init - echo "✅ MinIO started successfully" - sleep 3 +# Read USE_MOCK_S3 from .env file +if [ -f .env ]; then + USE_MOCK_S3=$(grep -E "^USE_MOCK_S3=" .env | cut -d '=' -f2) else - echo "✅ MinIO is already running" + USE_MOCK_S3="true" # Default to mock if no .env +fi + +# Only start MinIO if not using mock S3 +if [ "$USE_MOCK_S3" = "true" ]; then + echo "Using Mock S3 (no Docker required)" +else + # Check if MinIO is running + if ! docker ps | grep -q atlas-minio; then + echo "MinIO is not running. Starting MinIO with docker-compose..." + docker-compose up -d minio minio-init + echo "MinIO started successfully" + sleep 3 + else + echo "MinIO is already running" + fi fi # Kill any running uvicorn processes (skip if only rebuilding frontend) diff --git a/backend/application/chat/preprocessors/message_builder.py b/backend/application/chat/preprocessors/message_builder.py index a2f306e..98021bc 100644 --- a/backend/application/chat/preprocessors/message_builder.py +++ b/backend/application/chat/preprocessors/message_builder.py @@ -41,22 +41,27 @@ async def build_messages( ) -> List[Dict[str, Any]]: """ Build messages array from session history and context. 
- + Args: session: Current chat session include_files_manifest: Whether to append files manifest - + Returns: List of messages ready for LLM call """ # Get conversation history from session messages = session.history.get_messages_for_llm() - + # Optionally add files manifest if include_files_manifest: session_context = build_session_context(session) + files_in_context = session_context.get("files", {}) + logger.debug(f"Session has {len(files_in_context)} files: {list(files_in_context.keys())}") files_manifest = file_utils.build_files_manifest(session_context) if files_manifest: + logger.debug(f"Adding files manifest to messages: {files_manifest['content'][:100]}") messages.append(files_manifest) - + else: + logger.warning("No files manifest generated despite include_files_manifest=True") + return messages diff --git a/backend/infrastructure/app_factory.py b/backend/infrastructure/app_factory.py index a56d1da..46bd386 100644 --- a/backend/infrastructure/app_factory.py +++ b/backend/infrastructure/app_factory.py @@ -7,6 +7,7 @@ from interfaces.transport import ChatConnectionProtocol from modules.config import ConfigManager from modules.file_storage import S3StorageClient, FileManager +from modules.file_storage.mock_s3_client import MockS3StorageClient from modules.llm.litellm_caller import LiteLLMCaller from modules.mcp_tools import MCPToolManager from modules.rag import RAGClient @@ -37,7 +38,12 @@ def __init__(self) -> None: ) # File storage & manager - self.file_storage = S3StorageClient() + if self.config_manager.app_settings.use_mock_s3: + logger.info("Using MockS3StorageClient (in-process, no Docker required)") + self.file_storage = MockS3StorageClient() + else: + logger.info("Using S3StorageClient (MinIO/AWS S3)") + self.file_storage = S3StorageClient() self.file_manager = FileManager(self.file_storage) logger.info("AppFactory initialized") diff --git a/backend/modules/config/manager.py b/backend/modules/config/manager.py index 9b5698a..f21810a 100644 --- a/backend/modules/config/manager.py +++ b/backend/modules/config/manager.py @@ -127,6 +127,7 @@ def agent_mode_available(self) -> bool: test_user: str = "test@test.com" # Test user for development # S3/MinIO storage settings + use_mock_s3: bool = False # Use in-process S3 mock (no Docker required) s3_endpoint: str = "http://localhost:9000" s3_bucket_name: str = "atlas-files" s3_access_key: str = "minioadmin" diff --git a/backend/modules/file_storage/mock_s3_client.py b/backend/modules/file_storage/mock_s3_client.py new file mode 100644 index 0000000..7feb355 --- /dev/null +++ b/backend/modules/file_storage/mock_s3_client.py @@ -0,0 +1,381 @@ +""" +Mock S3 Storage Client using FastAPI TestClient. + +This client provides the same interface as S3StorageClient but uses the +S3 mock server via TestClient, eliminating the need for Docker/MinIO in development. 
+""" + +import base64 +import hashlib +import logging +import time +import uuid +from typing import Dict, List, Optional, Any + +from core.utils import sanitize_for_logging + + +logger = logging.getLogger(__name__) + + +class MockS3StorageClient: + """Mock S3 client using FastAPI TestClient for in-process testing.""" + + def __init__( + self, + s3_bucket_name: str = None, + ): + """Initialize the mock S3 client.""" + from modules.config import config_manager + + self.bucket_name = s3_bucket_name or config_manager.app_settings.s3_bucket_name + self.endpoint_url = "in-process-mock" # For health check compatibility + self.region = "us-east-1" # For health check compatibility + self._client = None # Lazy initialization + + logger.info(f"MockS3StorageClient initialized with bucket: {self.bucket_name}") + + @property + def client(self): + """Lazy-load the TestClient to avoid circular imports.""" + if self._client is None: + from fastapi.testclient import TestClient + import sys + import importlib.util + from pathlib import Path + + # Get the S3 mock path + mock_path = Path(__file__).parent.parent.parent.parent / "mocks" / "s3-mock" + main_py_path = mock_path / "main.py" + + # Add mock directory to sys.path for relative imports (storage, s3_xml) + mock_path_str = str(mock_path) + if mock_path_str not in sys.path: + sys.path.insert(0, mock_path_str) + + # Import the main.py module explicitly with a unique name + spec = importlib.util.spec_from_file_location("s3_mock_app", main_py_path) + s3_mock_module = importlib.util.module_from_spec(spec) + + # Add to sys.modules so relative imports work + sys.modules['s3_mock_app'] = s3_mock_module + spec.loader.exec_module(s3_mock_module) + + self._client = TestClient(s3_mock_module.get_app()) + logger.info("TestClient for S3 mock initialized") + + return self._client + + def _generate_s3_key(self, user_email: str, filename: str, source_type: str = "user") -> str: + """Generate an S3-style key with user isolation.""" + timestamp = int(time.time()) + unique_id = str(uuid.uuid4())[:8] + safe_filename = filename.replace(" ", "_").replace("/", "_") + + if source_type == "tool": + return f"users/{user_email}/generated/{timestamp}_{unique_id}_{safe_filename}" + else: + return f"users/{user_email}/uploads/{timestamp}_{unique_id}_{safe_filename}" + + def _calculate_etag(self, content_bytes: bytes) -> str: + """Calculate ETag for file content.""" + return hashlib.md5(content_bytes, usedforsecurity=False).hexdigest() + + async def upload_file( + self, + user_email: str, + filename: str, + content_base64: str, + content_type: str = "application/octet-stream", + tags: Optional[Dict[str, str]] = None, + source_type: str = "user" + ) -> Dict[str, Any]: + """ + Upload a file to mock S3 storage. 
+ + Args: + user_email: Email of the user uploading the file + filename: Original filename + content_base64: Base64 encoded file content + content_type: MIME type of the file + tags: Additional metadata tags + source_type: Type of file ("user" or "tool") + + Returns: + Dictionary containing file metadata including the S3 key + """ + try: + # Decode base64 content + content_bytes = base64.b64decode(content_base64) + + # Generate S3 key + s3_key = self._generate_s3_key(user_email, filename, source_type) + + # Prepare tags + file_tags = tags or {} + file_tags["source"] = source_type + file_tags["user_email"] = user_email + file_tags["original_filename"] = filename + + # Convert tags to query param format + tag_param = "&".join([f"{k}={v}" for k, v in file_tags.items()]) + + # Upload via TestClient + headers = { + "Content-Type": content_type, + "x-amz-meta-user_email": user_email, + "x-amz-meta-original_filename": filename, + "x-amz-meta-source_type": source_type + } + + response = self.client.put( + f"/{self.bucket_name}/{s3_key}", + content=content_bytes, + headers=headers, + params={"tagging": tag_param} + ) + + if response.status_code != 200: + raise Exception(f"Upload failed: {response.text}") + + etag = response.headers.get("ETag", "").strip('"') + + result = { + "key": s3_key, + "filename": filename, + "size": len(content_bytes), + "content_type": content_type, + "last_modified": None, # Mock doesn't need exact timestamp + "etag": etag, + "tags": file_tags, + "user_email": user_email + } + + logger.info(f"File uploaded successfully: {sanitize_for_logging(s3_key)} for user {sanitize_for_logging(user_email)}") + return result + + except Exception as e: + logger.error(f"Error uploading file to mock S3: {str(e)}") + raise + + async def get_file(self, user_email: str, file_key: str) -> Dict[str, Any]: + """ + Get a file from mock S3 storage. + + Args: + user_email: Email of the user requesting the file + file_key: S3 key of the file to retrieve + + Returns: + Dictionary containing file data and metadata + """ + try: + # Verify user has access to this file + if not file_key.startswith(f"users/{user_email}/"): + logger.warning(f"Access denied: {sanitize_for_logging(user_email)} attempted to access {sanitize_for_logging(file_key)}") + raise Exception("Access denied to file") + + # Get object via TestClient + response = self.client.get(f"/{self.bucket_name}/{file_key}") + + if response.status_code == 404: + logger.warning(f"File not found: {sanitize_for_logging(file_key)} for user {sanitize_for_logging(user_email)}") + return None + + if response.status_code != 200: + raise Exception(f"Get failed: {response.text}") + + # Read file content + content_bytes = response.content + content_base64 = base64.b64encode(content_bytes).decode() + + # Get tags + tags_response = self.client.get(f"/{self.bucket_name}/{file_key}", params={"tagging": ""}) + tags = {} + if tags_response.status_code == 200: + # Parse XML tags (simplified - just extract from response) + import xml.etree.ElementTree as ET + try: + root = ET.fromstring(tags_response.text) + for tag_elem in root.findall(".//Tag"): + key_elem = tag_elem.find("Key") + value_elem = tag_elem.find("Value") + if key_elem is not None and value_elem is not None: + tags[key_elem.text] = value_elem.text + except ET.ParseError: + # Failed to parse tags XML; tags will be left empty. This is non-fatal as tags are optional. 
+ logger.warning(f"Failed to parse tags XML for file {sanitize_for_logging(file_key)}", exc_info=True) + + # Extract filename from metadata headers + filename = response.headers.get("x-amz-meta-original_filename", file_key.split('/')[-1]) + + result = { + "key": file_key, + "filename": filename, + "content_base64": content_base64, + "content_type": response.headers.get("Content-Type", "application/octet-stream"), + "size": len(content_bytes), + "last_modified": None, + "etag": response.headers.get("ETag", "").strip('"'), + "tags": tags + } + + logger.info(f"File retrieved successfully: {sanitize_for_logging(file_key)} for user {sanitize_for_logging(user_email)}") + return result + + except Exception as e: + logger.error(f"Error getting file from mock S3: {str(e)}") + raise + + async def list_files( + self, + user_email: str, + file_type: Optional[str] = None, + limit: int = 100 + ) -> List[Dict[str, Any]]: + """ + List files for a user. + + Args: + user_email: Email of the user + file_type: Optional filter by file type ("user" or "tool") + limit: Maximum number of files to return + + Returns: + List of file metadata dictionaries + """ + try: + # Determine prefix + prefix = f"users/{user_email}/" + if file_type == "tool": + prefix = f"users/{user_email}/generated/" + elif file_type == "user": + prefix = f"users/{user_email}/uploads/" + + # List via TestClient + response = self.client.get( + f"/{self.bucket_name}", + params={"list-type": "2", "prefix": prefix, "max-keys": str(limit)} + ) + + if response.status_code != 200: + raise Exception(f"List failed: {response.text}") + + # Parse XML response + import xml.etree.ElementTree as ET + root = ET.fromstring(response.text) + ns = {'s3': 'http://s3.amazonaws.com/doc/2006-03-01/'} + contents = root.findall(".//s3:Contents", ns) or root.findall(".//Contents") + + files = [] + for content in contents: + key_elem = content.find("s3:Key", ns) + if key_elem is None: + key_elem = content.find("Key") + size_elem = content.find("s3:Size", ns) or content.find("Size") + etag_elem = content.find("s3:ETag", ns) or content.find("ETag") + + if key_elem is None: + continue + + key = key_elem.text + size = int(size_elem.text) if size_elem is not None else 0 + etag = etag_elem.text.strip('"') if etag_elem is not None else "" + + # Get metadata via HEAD + head_response = self.client.head(f"/{self.bucket_name}/{key}") + filename = head_response.headers.get("x-amz-meta-original_filename", key.split('/')[-1]) + content_type = head_response.headers.get("Content-Type", "application/octet-stream") + + files.append({ + "key": key, + "filename": filename, + "size": size, + "content_type": content_type, + "last_modified": None, + "etag": etag, + "tags": {}, + "user_email": user_email + }) + + logger.info(f"Listed {len(files)} files for user {sanitize_for_logging(user_email)}") + return files + + except Exception as e: + logger.error(f"Error listing files from mock S3: {str(e)}") + raise + + async def delete_file(self, user_email: str, file_key: str) -> bool: + """ + Delete a file from mock S3 storage. 
+ + Args: + user_email: Email of the user deleting the file + file_key: S3 key of the file to delete + + Returns: + True if deletion was successful + """ + try: + # Verify user has access to this file + if not file_key.startswith(f"users/{user_email}/"): + logger.warning(f"Access denied for deletion: {sanitize_for_logging(user_email)} attempted to delete {sanitize_for_logging(file_key)}") + raise Exception("Access denied to delete file") + + # Delete via TestClient + response = self.client.delete(f"/{self.bucket_name}/{file_key}") + + if response.status_code == 404: + logger.warning(f"File not found for deletion: {sanitize_for_logging(file_key)} for user {sanitize_for_logging(user_email)}") + return False + + if response.status_code != 204: + raise Exception(f"Delete failed: {response.text}") + + logger.info(f"File deleted successfully: {sanitize_for_logging(file_key)} for user {sanitize_for_logging(user_email)}") + return True + + except Exception as e: + logger.error(f"Error deleting file from mock S3: {str(e)}") + raise + + async def get_user_stats(self, user_email: str) -> Dict[str, Any]: + """ + Get file statistics for a user. + + Args: + user_email: Email of the user + + Returns: + Dictionary containing file statistics + """ + try: + # List all user files + files = await self.list_files(user_email, limit=1000) + + total_size = 0 + upload_count = 0 + generated_count = 0 + + for file_data in files: + total_size += file_data['size'] + + # Determine type from key path + if "/generated/" in file_data['key']: + generated_count += 1 + else: + upload_count += 1 + + result = { + "total_files": len(files), + "total_size": total_size, + "upload_count": upload_count, + "generated_count": generated_count + } + + logger.info(f"Got file stats for user {sanitize_for_logging(user_email)}: {result}") + return result + + except Exception as e: + logger.error(f"Error getting user stats from mock S3: {str(e)}") + raise diff --git a/config/overrides/mcp.json b/config/overrides/mcp.json index 2aca381..26a0c6b 100644 --- a/config/overrides/mcp.json +++ b/config/overrides/mcp.json @@ -53,8 +53,15 @@ "description": "Specialized system prompts for AI behavior modification", "author": "Chat UI Team", "short_description": "AI behavior prompts", - "help_email": "support@chatui.example.com", + "help_email": "support@chatui.example.com", "compliance_level": "Public" + }, + "progress_demo": { + "transport": "stdio", + "command": ["python", "main.py"], + "cwd": "backend/mcp/progress_demo", + "groups": ["mcp_basic"], + "compliance_level": "Public" } } diff --git a/docs/config-migration.md b/docs/config-migration.md deleted file mode 100644 index 8a6f456..0000000 --- a/docs/config-migration.md +++ /dev/null @@ -1,66 +0,0 @@ -# Configuration & Runtime Directory Migration - -This project migrated to a cleaner separation of code, configuration, and runtime artifacts. 
- -## New Layout - -``` -config/ - defaults/ # Template / version-controlled baseline configs - overrides/ # Editable overrides (mounted volume / env APP_CONFIG_OVERRIDES) -runtime/ - logs/ # Application logs (JSONL) - feedback/ # User feedback JSON files - uploads/ # Future file uploads -``` - -Legacy directories (still supported for backward compatibility): - -``` -backend/configfiles -> config/defaults -backend/configfilesadmin -> config/overrides -backend/logs -> runtime/logs -feedback (root) -> runtime/feedback -``` - -## Environment Variables - -You can customize locations: - -- `APP_CONFIG_OVERRIDES` (default: `config/overrides`) -- `APP_CONFIG_DEFAULTS` (default: `config/defaults`) -- `RUNTIME_FEEDBACK_DIR` (default: `runtime/feedback`) - -## Code Changes - -- `ConfigManager._search_paths` now searches new directories first, then legacy paths. -- Admin routes seed `config/overrides` from `config/defaults` or legacy dirs if empty. -- Feedback routes use `runtime/feedback` (override with `RUNTIME_FEEDBACK_DIR`). -- MCP tool manager chooses `config/overrides/mcp.json` with legacy fallback. - -## Migration Steps (Already Applied) - -1. Created `config/defaults` and copied existing `backend/configfiles` contents. -2. Created `config/overrides` and copied existing `backend/configfilesadmin` contents. -3. Added new runtime directories: `runtime/logs`, `runtime/feedback`. -4. Updated `.gitignore` to exclude runtime artifacts. -5. Added backward-compatible search paths so no immediate breakage. - -## Next Clean-Up (Optional) - -- Remove legacy `backend/configfiles*` once confident no tooling relies on them. -- Update any deployment manifests / Docker volumes to mount `config/overrides` & `runtime`. -- Document environment variables in main README. - -## Rollback - -If needed, you can restore previous behavior by setting: - -``` -APP_CONFIG_OVERRIDES=backend/configfilesadmin -APP_CONFIG_DEFAULTS=backend/configfiles -RUNTIME_FEEDBACK_DIR=feedback -``` - ---- -Generated on migration date. diff --git a/mocks/s3-mock/README.md b/mocks/s3-mock/README.md new file mode 100644 index 0000000..41014e8 --- /dev/null +++ b/mocks/s3-mock/README.md @@ -0,0 +1,196 @@ +# S3 Mock Server + +A minimal FastAPI server that mimics S3's REST API for development and testing purposes. Compatible with botocore/boto3 expectations. + +## CRITICAL WARNING: DEVELOPMENT ONLY + +**THIS MOCK SERVER MUST NEVER BE USED IN PRODUCTION ENVIRONMENTS.** + +This implementation is intended ONLY for local development and testing. 
It has critical security limitations that make it completely unsuitable for production use: + +- **No authentication or authorization** - Accepts all requests regardless of credentials +- **No signature verification** - Does not validate S3v4 signatures (security bypass) +- **No encryption** - Data stored in plain text on disk +- **No access control** - No bucket policies, IAM, or ACLs +- **No audit logging** - No CloudTrail equivalent for compliance +- **No data durability guarantees** - Simple filesystem storage without redundancy +- **Minimal error handling** - May expose internal paths or system information + +For production deployments, use: +- **AWS S3** - For AWS-hosted applications +- **MinIO** - For self-hosted S3-compatible storage with proper security +- Other production-grade S3-compatible services + +## Features + +- **Path-style addressing**: `http://localhost:9001/{bucket}/{key}` +- **S3v4 signed requests**: Accepts but doesn't validate signatures +- **Persistent storage**: Data survives server restarts +- **XML responses**: Compatible with botocore parsing +- **Minimal S3 operations**: PUT, GET, HEAD, DELETE, ListObjectsV2, Tagging + +## Supported Operations + +### Object Operations +- `PUT /{bucket}/{key}` - Upload object with optional metadata and tagging +- `GET /{bucket}/{key}` - Download object +- `HEAD /{bucket}/{key}` - Get object metadata +- `DELETE /{bucket}/{key}` - Delete object + +### Listing Operations +- `GET /{bucket}?list-type=2&prefix=...` - List objects V2 + +### Tagging Operations +- `GET /{bucket}/{key}?tagging` - Get object tags +- `PUT /{bucket}/{key}?tagging` - Set object tags + +## Storage + +- **Root directory**: `minio-data/chatui/` (configurable via `MOCK_S3_ROOT`) +- **Bucket structure**: `{root}/{bucket}/` +- **Object files**: `{bucket}/{key}` (data) +- **Metadata files**: `{bucket}/{key}.meta.json` (content-type, etag, metadata, tags) +- **Tag files**: Tags stored in metadata JSON + +## Running the Server + +### Prerequisites +```bash +uv pip install fastapi uvicorn +``` + +### Start the server +```bash +cd mocks/s3-mock +python main.py +``` + +Server runs on `http://localhost:9001` by default. Configure port with `PORT` environment variable. 
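For a quick end-to-end check from Python, a minimal boto3 sketch against a running mock (assumptions: the server is on its default port 9001 and the bucket name is `atlas-files`; credentials are arbitrary because signatures are not validated):

```python
import boto3
from botocore.config import Config

# Path-style addressing matches the mock's /{bucket}/{key} routes
s3 = boto3.client(
    "s3",
    endpoint_url="http://localhost:9001",
    region_name="us-east-1",
    aws_access_key_id="any_value",      # accepted but never validated
    aws_secret_access_key="any_value",
    config=Config(s3={"addressing_style": "path"}),
)

s3.put_object(
    Bucket="atlas-files",
    Key="demo/hello.txt",
    Body=b"hello from the mock",
    ContentType="text/plain",
)
print(s3.get_object(Bucket="atlas-files", Key="demo/hello.txt")["Body"].read())
print([obj["Key"] for obj in s3.list_objects_v2(Bucket="atlas-files", Prefix="demo/").get("Contents", [])])
```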
## Backend Configuration
+
+Configure your backend to use the mock server:
+
+```bash
+# Environment variables
+S3_ENDPOINT_URL=http://localhost:9001
+S3_REGION=us-east-1
+S3_BUCKET=atlas-files
+S3_USE_SSL=false
+S3_ADDRESSING_STYLE=path
+S3_ACCESS_KEY_ID=any_value
+S3_SECRET_ACCESS_KEY=any_value
+```
+
+## Testing
+
+Use the included smoke test script to verify functionality:
+
+```bash
+cd mocks/s3-mock
+python smoke_test.py
+```
+
+The test performs:
+- PUT object with metadata and tags
+- GET object and verify content/metadata/ETag
+- HEAD object and verify headers
+- List objects with prefix filtering
+- Get/set object tagging
+- DELETE object and verify cleanup
+
+## API Details
+
+### PUT Object
+```
+PUT /{bucket}/{key}
+Content-Type: <content-type>
+x-amz-meta-*: <metadata value>
+?tagging=key1%3Dvalue1%26key2%3Dvalue2
+
+Body: <file bytes>
+
+Response: 200 OK
+ETag: "<md5-hexdigest>"
+```
+
+### GET Object
+```
+GET /{bucket}/{key}
+
+Response: 200 OK
+Content-Type: <content-type>
+ETag: "<md5-hexdigest>"
+x-amz-meta-*: <metadata value>
+Body: <file bytes>
+```
+
+### List Objects V2
+```
+GET /{bucket}?list-type=2&prefix=<prefix>&max-keys=<n>
+
+Response: 200 OK
+Content-Type: application/xml
+
+<?xml version="1.0" ?>
+<ListBucketResult xmlns="http://s3.amazonaws.com/doc/2006-03-01/">
+  <Name>bucket</Name>
+  <Prefix>prefix</Prefix>
+  <KeyCount>n</KeyCount>
+  <MaxKeys>1000</MaxKeys>
+  <IsTruncated>false</IsTruncated>
+  <Contents>
+    <Key>object-key</Key>
+    <LastModified>2025-11-02T10:00:00.000Z</LastModified>
+    <ETag>"md5hex"</ETag>
+    <Size>123</Size>
+    <StorageClass>STANDARD</StorageClass>
+  </Contents>
+</ListBucketResult>
+```
+
+### Tagging
+```
+GET /{bucket}/{key}?tagging
+
+Response: 200 OK
+Content-Type: application/xml
+
+<?xml version="1.0" ?>
+<Tagging>
+  <TagSet>
+    <Tag>
+      <Key>key1</Key>
+      <Value>value1</Value>
+    </Tag>
+  </TagSet>
+</Tagging>
+```
+
+## Error Responses
+
+Returns S3-compatible XML error responses:
+
+```xml
+<Error>
+  <Code>NoSuchKey</Code>
+  <Message>The specified key does not exist.</Message>
+  <Resource>/{bucket}/{key}</Resource>
+</Error>
+```
+
+## Limitations
+
+- No multipart upload support
+- No versioning, ACLs, or presigned URLs
+- No signature verification (accepts all auth headers)
+- Virtual-hosted-style addressing not supported
+- Single PUT operations only (no chunked uploads)
+- Basic prefix filtering for listing (no delimiter support)
+
+## Development
+
+The mock server is structured as:
+
+- `main.py` - FastAPI application and routes
+- `storage.py` - Filesystem storage operations
+- `s3_xml.py` - XML generation and parsing
+- `README.md` - This documentation
diff --git a/mocks/s3-mock/main.py b/mocks/s3-mock/main.py
new file mode 100644
index 0000000..5c97f19
--- /dev/null
+++ b/mocks/s3-mock/main.py
@@ -0,0 +1,214 @@
+import os
+import urllib.parse
+from pathlib import Path
+from typing import Dict
+
+from fastapi import FastAPI, HTTPException, Request, Response
+from fastapi.responses import StreamingResponse
+import uvicorn
+
+from storage import (
+    ensure_bucket, load_object, save_object, delete_object,
+    list_objects, get_tags, set_tags, load_meta
+)
+from s3_xml import (
+    create_list_objects_xml, create_tagging_xml, parse_tagging_xml, create_error_xml
+)
+
+app = FastAPI(title="S3 Mock Server")
+
+
+@app.middleware("http")
+async def log_requests(request: Request, call_next):
+    """Middleware to log when the S3 mock is hit."""
+    print(f"S3 Mock hit: {request.method} {request.url.path}")
+    response = await call_next(request)
+    return response
+
+
+def get_app():
+    """Get the FastAPI app instance for testing."""
+    return app
+
+# Configuration
+MOCK_S3_ROOT = Path(os.getenv("MOCK_S3_ROOT", "minio-data/chatui"))
+
+
+def get_bucket_root(bucket: str) -> Path:
+    """Get the root path for a bucket."""
+    return ensure_bucket(MOCK_S3_ROOT, bucket)
+
+
+def extract_metadata(headers: Dict[str, str]) -> Dict[str, str]:
+    """Extract x-amz-meta-* headers."""
+    metadata = {}
+    for key, value in headers.items():
+        if key.lower().startswith("x-amz-meta-"):
+            meta_key = key[11:].lower()  # Remove "x-amz-meta-" prefix
+            metadata[meta_key] = value
+
return metadata + + +def add_metadata_headers(response: Response, metadata: Dict[str, str]): + """Add metadata headers to response.""" + for key, value in metadata.items(): + response.headers[f"x-amz-meta-{key}"] = value + + +@app.put("/{bucket}/{key:path}") +async def put_object(bucket: str, key: str, request: Request): + """PUT Object endpoint.""" + # Check if this is a tagging request with XML body + if request.query_params.get("tagging") and request.headers.get("content-type") == "application/xml": + bucket_root = get_bucket_root(bucket) + + # Check if object exists + if load_meta(bucket_root, key) is None: + error_xml = create_error_xml("NoSuchKey", "The specified key does not exist.", f"/{bucket}/{key}") + raise HTTPException(status_code=404, detail=error_xml) + + try: + body = await request.body() + xml_content = body.decode("utf-8") + tags = parse_tagging_xml(xml_content) + set_tags(bucket_root, key, tags) + return Response(status_code=200) + except ValueError: + error_xml = create_error_xml("MalformedXML", "The XML you provided was not well-formed or did not validate against the published schema.", f"/{bucket}/{key}?tagging") + raise HTTPException(status_code=400, detail=error_xml) + + # Regular object PUT + try: + bucket_root = get_bucket_root(bucket) + body = await request.body() + + # Extract content type + content_type = request.headers.get("content-type", "application/octet-stream") + + # Extract metadata + metadata = extract_metadata(dict(request.headers)) + + # Extract tagging from query params + tags = {} + tagging_param = request.query_params.get("tagging") + if tagging_param: + # Parse URL-encoded tags: key1=value1&key2=value2 + tag_pairs = urllib.parse.parse_qs(tagging_param, keep_blank_values=True) + for tag_key, tag_values in tag_pairs.items(): + if tag_values: + tags[tag_key] = tag_values[0] + + # Save object + meta = save_object(bucket_root, key, body, content_type, metadata, tags) + + # Return response with ETag + response = Response(status_code=200) + response.headers["ETag"] = f'"{meta["etag"]}"' + return response + + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + +@app.get("/{bucket}/{key:path}") +async def get_object(bucket: str, key: str, request: Request): + """GET Object endpoint.""" + bucket_root = get_bucket_root(bucket) + + # Check if this is a tagging request + if request.query_params.get("tagging"): + # Check if object exists + if load_meta(bucket_root, key) is None: + error_xml = create_error_xml("NoSuchKey", "The specified key does not exist.", f"/{bucket}/{key}") + raise HTTPException(status_code=404, detail=error_xml, media_type="application/xml") + + tags = get_tags(bucket_root, key) + xml_response = create_tagging_xml(tags) + return Response(content=xml_response, media_type="application/xml") + + # Regular object GET + result = load_object(bucket_root, key) + if result is None: + error_xml = create_error_xml("NoSuchKey", "The specified key does not exist.", f"/{bucket}/{key}") + raise HTTPException(status_code=404, detail=error_xml) + + data, meta = result + + # Create streaming response + def iter_data(): + yield data + + response = StreamingResponse(iter_data(), media_type=meta.get("content_type", "application/octet-stream")) + response.headers["ETag"] = f'"{meta["etag"]}"' + response.headers["Content-Type"] = meta.get("content_type", "application/octet-stream") + + # Add metadata headers + add_metadata_headers(response, meta.get("metadata", {})) + + return response + + +@app.head("/{bucket}/{key:path}") +async 
def head_object(bucket: str, key: str):
+    """HEAD Object endpoint."""
+    bucket_root = get_bucket_root(bucket)
+    result = load_object(bucket_root, key)
+    if result is None:
+        error_xml = create_error_xml("NoSuchKey", "The specified key does not exist.", f"/{bucket}/{key}")
+        raise HTTPException(status_code=404, detail=error_xml)
+
+    data, meta = result
+
+    response = Response(status_code=200)
+    response.headers["ETag"] = f'"{meta["etag"]}"'
+    response.headers["Content-Type"] = meta.get("content_type", "application/octet-stream")
+
+    # Add metadata headers
+    add_metadata_headers(response, meta.get("metadata", {}))
+
+    return response
+
+
+@app.delete("/{bucket}/{key:path}")
+async def delete_object_endpoint(bucket: str, key: str):
+    """DELETE Object endpoint."""
+    bucket_root = get_bucket_root(bucket)
+    deleted = delete_object(bucket_root, key)
+    if not deleted:
+        error_xml = create_error_xml("NoSuchKey", "The specified key does not exist.", f"/{bucket}/{key}")
+        raise HTTPException(status_code=404, detail=error_xml)
+
+    return Response(status_code=204)
+
+
+@app.get("/{bucket}")
+async def list_objects_v2(bucket: str, request: Request):
+    """List Objects V2 endpoint."""
+    bucket_root = get_bucket_root(bucket)
+
+    # Check if bucket exists (has any objects)
+    if not any(bucket_root.rglob("*")):
+        error_xml = create_error_xml("NoSuchBucket", "The specified bucket does not exist.", f"/{bucket}")
+        raise HTTPException(status_code=404, detail=error_xml)
+
+    prefix = request.query_params.get("prefix", "")
+    max_keys = int(request.query_params.get("max-keys", "1000"))
+
+    objects = list_objects(bucket_root, prefix, max_keys)
+    xml_response = create_list_objects_xml(bucket, prefix, objects)
+
+    return Response(content=xml_response, media_type="application/xml")
+
+
+@app.exception_handler(HTTPException)
+async def http_exception_handler(request: Request, exc: HTTPException):
+    """Handle HTTP exceptions with XML error responses."""
+    if exc.detail and exc.detail.startswith("<?xml"):
+        return Response(content=exc.detail, media_type="application/xml", status_code=exc.status_code)
+    return Response(content=str(exc.detail), status_code=exc.status_code)
+
+
+if __name__ == "__main__":
+    port = int(os.getenv("PORT", "9001"))
+    uvicorn.run(app, host="0.0.0.0", port=port)
diff --git a/mocks/s3-mock/s3_xml.py b/mocks/s3-mock/s3_xml.py
new file mode 100644
index 0000000..e1188f0
--- /dev/null
+++ b/mocks/s3-mock/s3_xml.py
@@ -0,0 +1,78 @@
+import xml.etree.ElementTree as ET
+from typing import Dict, List
+from xml.dom import minidom
+
+
+def create_list_objects_xml(bucket: str, prefix: str, objects: List[Dict[str, str]], is_truncated: bool = False, continuation_token: str = "") -> str:
+    """Generate S3 ListObjectsV2 XML response."""
+    root = ET.Element("ListBucketResult", xmlns="http://s3.amazonaws.com/doc/2006-03-01/")
+
+    ET.SubElement(root, "Name").text = bucket
+    ET.SubElement(root, "Prefix").text = prefix
+    ET.SubElement(root, "KeyCount").text = str(len(objects))
+    ET.SubElement(root, "MaxKeys").text = "1000"
+    ET.SubElement(root, "IsTruncated").text = "true" if is_truncated else "false"
+    if continuation_token:
+        ET.SubElement(root, "ContinuationToken").text = continuation_token
+
+    for obj in objects:
+        contents = ET.SubElement(root, "Contents")
+        ET.SubElement(contents, "Key").text = obj["Key"]
+        ET.SubElement(contents, "LastModified").text = obj["LastModified"]
+        ET.SubElement(contents, "ETag").text = f'"{obj["ETag"]}"'
+        ET.SubElement(contents, "Size").text = str(obj["Size"])
+        ET.SubElement(contents, 
"StorageClass").text = "STANDARD" + + # Pretty print XML + rough_string = ET.tostring(root, encoding='unicode') + reparsed = minidom.parseString(rough_string) + return reparsed.toprettyxml(indent=" ") + + +def create_tagging_xml(tags: Dict[str, str]) -> str: + """Generate S3 Tagging XML response.""" + root = ET.Element("Tagging") + tag_set = ET.SubElement(root, "TagSet") + + for key, value in tags.items(): + tag = ET.SubElement(tag_set, "Tag") + ET.SubElement(tag, "Key").text = key + ET.SubElement(tag, "Value").text = value + + # Pretty print XML + rough_string = ET.tostring(root, encoding='unicode') + reparsed = minidom.parseString(rough_string) + return reparsed.toprettyxml(indent=" ") + + +def parse_tagging_xml(xml_content: str) -> Dict[str, str]: + """Parse S3 Tagging XML input.""" + try: + root = ET.fromstring(xml_content) + tags = {} + tag_set = root.find("TagSet") + if tag_set is not None: + for tag in tag_set.findall("Tag"): + key_elem = tag.find("Key") + value_elem = tag.find("Value") + if key_elem is not None and value_elem is not None: + key = key_elem.text or "" + value = value_elem.text or "" + tags[key] = value + return tags + except ET.ParseError: + raise ValueError("Malformed XML") + + +def create_error_xml(code: str, message: str, resource: str = "") -> str: + """Generate S3 error XML response.""" + root = ET.Element("Error") + ET.SubElement(root, "Code").text = code + ET.SubElement(root, "Message").text = message + if resource: + ET.SubElement(root, "Resource").text = resource + + # Pretty print XML + rough_string = ET.tostring(root, encoding='unicode') + reparsed = minidom.parseString(rough_string) + return reparsed.toprettyxml(indent=" ") diff --git a/mocks/s3-mock/smoke_test.py b/mocks/s3-mock/smoke_test.py new file mode 100644 index 0000000..0db88d1 --- /dev/null +++ b/mocks/s3-mock/smoke_test.py @@ -0,0 +1,273 @@ +#!/usr/bin/env python3 +""" +Smoke test for S3 Mock Server using FastAPI TestClient. +Tests all supported S3 operations to ensure compatibility. +""" + +from fastapi.testclient import TestClient +from main import get_app + +# Test configuration +BUCKET = "test-bucket" + +def create_test_client(): + """Create FastAPI TestClient for direct testing.""" + app = get_app() + return TestClient(app) + +def test_put_get_object(): + """Test PUT and GET object operations.""" + print("Testing PUT/GET object...") + + client = create_test_client() + key = "test-file.txt" + test_data = b"Hello, S3 Mock!" 
+ tags = "environment=test&type=smoke" + + # PUT object + headers = { + "Content-Type": "text/plain", + "x-amz-meta-author": "test", + "x-amz-meta-version": "1.0" + } + response = client.put( + f"/{BUCKET}/{key}", + content=test_data, + headers=headers, + params={"tagging": tags} + ) + assert response.status_code == 200 + etag = response.headers.get("ETag") + print(f" PUT successful, ETag: {etag}") + + # GET object + response = client.get(f"/{BUCKET}/{key}") + assert response.status_code == 200 + assert response.content == test_data, "Data mismatch" + assert response.headers["Content-Type"] == "text/plain", "Content-Type mismatch" + assert response.headers.get("ETag") == etag, "ETag mismatch" + assert response.headers.get("x-amz-meta-author") == "test", "Metadata mismatch" + assert response.headers.get("x-amz-meta-version") == "1.0", "Metadata mismatch" + + print(" GET successful, data/metadata verified") + +def test_head_object(): + """Test HEAD object operation.""" + print("Testing HEAD object...") + + client = create_test_client() + key = "test-file.txt" + + response = client.head(f"/{BUCKET}/{key}") + assert response.status_code == 200 + + assert response.headers["Content-Type"] == "text/plain", "Content-Type mismatch" + assert 'ETag' in response.headers, "ETag missing" + assert response.headers.get("x-amz-meta-author") == "test", "Metadata mismatch" + + print(" HEAD successful, headers verified") + +def test_list_objects(): + """Test ListObjectsV2 operation.""" + print("Testing ListObjectsV2...") + + client = create_test_client() + + # Create test objects for this test + test_objects = ["list-test-file.txt", "folder/file0.txt", "folder/file1.txt", "folder/file2.txt"] + + for obj_key in test_objects: + response = client.put( + f"/{BUCKET}/{obj_key}", + content=f"Content for {obj_key}".encode(), + headers={"Content-Type": "text/plain"} + ) + assert response.status_code == 200 + + # List all objects + response = client.get(f"/{BUCKET}", params={"list-type": "2"}) + assert response.status_code == 200 + + # Parse XML response + import xml.etree.ElementTree as ET + root = ET.fromstring(response.text) + + # Handle XML namespace + ns = {'s3': 'http://s3.amazonaws.com/doc/2006-03-01/'} + contents = root.findall(".//s3:Contents", ns) or root.findall(".//Contents") + + assert len(contents) >= 4, f"Not all objects listed, found {len(contents)}" + + # Check that our test files are present + keys = [] + for content in contents: + key_elem = content.find("s3:Key", ns) + if key_elem is None: + key_elem = content.find("Key") + if key_elem is not None and key_elem.text: + keys.append(key_elem.text) + + print(f" Expected keys: {test_objects}") + print(f" Found keys: {keys}") + + for obj_key in test_objects: + assert obj_key in keys, f"Test file {obj_key} not in listing" + + print(f" Listed {len(contents)} objects") + + # Test prefix filtering + response = client.get(f"/{BUCKET}", params={"list-type": "2", "prefix": "folder/"}) + assert response.status_code == 200 + + root = ET.fromstring(response.text) + contents = root.findall(".//s3:Contents", ns) or root.findall(".//Contents") + + assert len(contents) == 3, f"Prefix filtering failed, found {len(contents)}" + for content in contents: + key_elem = content.find("s3:Key", ns) + if key_elem is None: + key_elem = content.find("Key") + key = key_elem.text if key_elem is not None else "" + assert key.startswith("folder/"), f"Prefix filter not working for {key}" + + print(" Prefix filtering works correctly") + +def test_tagging(): + """Test object tagging 
operations."""
+    print("Testing object tagging...")
+
+    client = create_test_client()
+    key = "tagged-file.txt"
+
+    # Create object
+    response = client.put(
+        f"/{BUCKET}/{key}",
+        content=b"Tagged content",
+        headers={"Content-Type": "text/plain"}
+    )
+    assert response.status_code == 200
+
+    # Set tags
+    tagging_xml = """<?xml version="1.0" encoding="UTF-8"?>
+<Tagging>
+    <TagSet>
+        <Tag>
+            <Key>Environment</Key>
+            <Value>Test</Value>
+        </Tag>
+        <Tag>
+            <Key>Type</Key>
+            <Value>SmokeTest</Value>
+        </Tag>
+    </TagSet>
+</Tagging>"""
+
+    response = client.put(
+        f"/{BUCKET}/{key}",
+        content=tagging_xml,
+        headers={"Content-Type": "application/xml"}
+    )
+    assert response.status_code == 200
+
+    # Get tags
+    response = client.get(f"/{BUCKET}/{key}", params={"tagging": ""})
+    assert response.status_code == 200
+
+    # Parse XML response
+    import xml.etree.ElementTree as ET
+    root = ET.fromstring(response.text)
+    tag_elements = root.findall(".//Tag")
+
+    retrieved_tags = {}
+    for tag_elem in tag_elements:
+        key_elem = tag_elem.find("Key")
+        value_elem = tag_elem.find("Value")
+        if key_elem is not None and value_elem is not None:
+            retrieved_tags[key_elem.text] = value_elem.text
+
+    expected_tags = {"Environment": "Test", "Type": "SmokeTest"}
+
+    assert retrieved_tags == expected_tags, f"Tags mismatch: {retrieved_tags} != {expected_tags}"
+
+    print("  Tagging operations successful")
+
+def test_delete_object():
+    """Test DELETE object operation."""
+    print("Testing DELETE object...")
+
+    client = create_test_client()
+    key = "to-delete.txt"
+
+    # Create object
+    response = client.put(
+        f"/{BUCKET}/{key}",
+        content=b"Delete me",
+        headers={"Content-Type": "text/plain"}
+    )
+    assert response.status_code == 200
+
+    # Verify it exists
+    response = client.head(f"/{BUCKET}/{key}")
+    assert response.status_code == 200
+
+    # Delete object
+    response = client.delete(f"/{BUCKET}/{key}")
+    assert response.status_code == 204
+
+    # Verify it's gone
+    response = client.head(f"/{BUCKET}/{key}")
+    assert response.status_code == 404
+
+    print("  DELETE successful, object no longer exists")
+
+def cleanup():
+    """Clean up test objects."""
+    print("Cleaning up test objects...")
+
+    client = create_test_client()
+
+    # List all objects
+    response = client.get(f"/{BUCKET}", params={"list-type": "2"})
+    if response.status_code == 200:
+        import xml.etree.ElementTree as ET
+        root = ET.fromstring(response.text)
+        ns = {'s3': 'http://s3.amazonaws.com/doc/2006-03-01/'}
+        contents = root.findall(".//s3:Contents", ns) or root.findall(".//Contents")
+
+        for content in contents:
+            key_elem = content.find("s3:Key", ns)
+            if key_elem is None:
+                key_elem = content.find("Key")
+            if key_elem is not None and key_elem.text:
+                key = key_elem.text
+                client.delete(f"/{BUCKET}/{key}")
+                print(f"  Deleted {key}")
+
+def main():
+    """Run all smoke tests."""
+    print("S3 Mock Server Smoke Test")
+    print("=" * 40)
+
+    try:
+        # Test all operations
+        test_put_get_object()
+        test_head_object()
+        test_list_objects()
+        test_tagging()
+        test_delete_object()
+
+        print("\n" + "=" * 40)
+        print("All tests passed!")
+
+    except Exception as e:
+        print(f"\nERROR: Test failed: {e}")
+        raise
+    finally:
+        # Always cleanup
+        try:
+            cleanup()
+        except Exception as e:
+            print(f"Warning: Cleanup failed: {e}")
+
+if __name__ == "__main__":
+    main()
diff --git a/mocks/s3-mock/storage.py b/mocks/s3-mock/storage.py
new file mode 100644
index 0000000..4f0efad
--- /dev/null
+++ b/mocks/s3-mock/storage.py
@@ -0,0 +1,150 @@
+import os
+import json
+import hashlib
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Dict, Tuple, Optional, List
+
+HTTP_DATE_FMT = "%a, %d %b %Y %H:%M:%S GMT"
+
+ +def ensure_bucket(root: Path, bucket: str) -> Path: + bucket_path = root / bucket + bucket_path.mkdir(parents=True, exist_ok=True) + return bucket_path + + +def object_paths(bucket_root: Path, key: str) -> Tuple[Path, Path]: + # Sanitize key to prevent traversal and reject dangerous keys + key = key.lstrip("/") + if os.path.isabs(key): + raise ValueError("Absolute paths are not allowed in S3 keys") + safe_parts = [] + for part in key.split("/"): + if part in ("..", ".", ""): + raise ValueError(f"Invalid path component in S3 key: {part!r}") + # On Windows, reject drive letters (e.g., C:) + if os.name == "nt" and len(part) == 2 and part[1] == ":" and part[0].isalpha(): + raise ValueError(f"Drive letter not allowed in S3 key: {part!r}") + safe_parts.append(part) + safe_key = "/".join(safe_parts) + obj_path = (bucket_root / safe_key).resolve() + meta_path = (bucket_root / f"{safe_key}.meta.json").resolve() + # Ensure the resolved paths are within the bucket root + bucket_root_resolved = bucket_root.resolve() + if not str(obj_path).startswith(str(bucket_root_resolved)): + raise ValueError("Path traversal detected in S3 key") + if not str(meta_path).startswith(str(bucket_root_resolved)): + raise ValueError("Path traversal detected in S3 key (meta)") + obj_path.parent.mkdir(parents=True, exist_ok=True) + return obj_path, meta_path + + +def calc_etag(data: bytes) -> str: + return hashlib.md5(data, usedforsecurity=False).hexdigest() + + +def httpdate(ts: float) -> str: + return datetime.fromtimestamp(ts, tz=timezone.utc).strftime(HTTP_DATE_FMT) + + +def iso8601(ts: float) -> str: + return datetime.fromtimestamp(ts, tz=timezone.utc).isoformat().replace("+00:00", "Z") + + +def save_object(bucket_root: Path, key: str, data: bytes, content_type: str, metadata: Dict[str, str], tags: Dict[str, str]) -> Dict[str, str]: + obj_path, meta_path = object_paths(bucket_root, key) + tmp_path = obj_path.with_suffix(obj_path.suffix + ".tmp") + # Atomic write + with open(tmp_path, "wb") as f: + f.write(data) + os.replace(tmp_path, obj_path) + etag = calc_etag(data) + meta = { + "content_type": content_type or "application/octet-stream", + "metadata": metadata or {}, + "tags": tags or {}, + "etag": etag, + "last_modified": httpdate(os.path.getmtime(obj_path)), + } + with open(meta_path, "w", encoding="utf-8") as f: + json.dump(meta, f) + return meta + + +def load_meta(bucket_root: Path, key: str) -> Optional[Dict[str, str]]: + _, meta_path = object_paths(bucket_root, key) + if not meta_path.exists(): + return None + try: + with open(meta_path, "r", encoding="utf-8") as f: + return json.load(f) + except Exception: + return None + + +def load_object(bucket_root: Path, key: str) -> Optional[Tuple[bytes, Dict[str, str]]]: + obj_path, _ = object_paths(bucket_root, key) + if not obj_path.exists(): + return None + with open(obj_path, "rb") as f: + data = f.read() + meta = load_meta(bucket_root, key) or {} + # Refresh last_modified if needed + meta["last_modified"] = httpdate(os.path.getmtime(obj_path)) + return data, meta + + +def delete_object(bucket_root: Path, key: str) -> bool: + obj_path, meta_path = object_paths(bucket_root, key) + deleted = False + if obj_path.exists(): + obj_path.unlink() + deleted = True + if meta_path.exists(): + meta_path.unlink() + return deleted + + +def list_objects(bucket_root: Path, prefix: str = "", max_keys: int = 100) -> List[Dict[str, str]]: + items: List[Dict[str, str]] = [] + base = bucket_root + for root, _, files in os.walk(base): + for name in files: + if 
name.endswith(".meta.json"): + continue + rel_path = Path(root).joinpath(name).relative_to(base) + key = str(rel_path).replace(os.sep, "/") + if not key.startswith(prefix): + continue + obj_path = base / rel_path + size = obj_path.stat().st_size + meta = load_meta(bucket_root, key) or {} + etag = meta.get("etag") + if not etag: + with open(obj_path, "rb") as f: + etag = calc_etag(f.read()) + items.append({ + "Key": key, + "Size": size, + "ETag": etag, + "LastModified": iso8601(obj_path.stat().st_mtime), + }) + if len(items) >= max_keys: + return items + # Sort by last modified desc to mimic S3 behavior often seen by clients + items.sort(key=lambda x: x["LastModified"], reverse=True) + return items + + +def set_tags(bucket_root: Path, key: str, tags: Dict[str, str]) -> None: + meta = load_meta(bucket_root, key) or {"metadata": {}, "content_type": "application/octet-stream", "etag": ""} + meta["tags"] = tags + _, meta_path = object_paths(bucket_root, key) + with open(meta_path, "w", encoding="utf-8") as f: + json.dump(meta, f) + + +def get_tags(bucket_root: Path, key: str) -> Dict[str, str]: + meta = load_meta(bucket_root, key) or {} + return meta.get("tags", {})
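The storage helpers above can also be exercised directly, without the HTTP layer, which is handy when debugging the `.meta.json` sidecar files. A small sketch run from `mocks/s3-mock/` (the key and values are illustrative):

```python
from pathlib import Path

from storage import ensure_bucket, save_object, load_object, get_tags, list_objects

bucket_root = ensure_bucket(Path("minio-data/chatui"), "atlas-files")
key = "users/test@test.com/uploads/readme.txt"

# Writes the object plus a sidecar JSON holding content type, metadata, tags, and ETag
meta = save_object(
    bucket_root,
    key,
    b"hello storage layer",
    "text/plain",
    metadata={"original_filename": "readme.txt"},
    tags={"source": "user"},
)
print(meta["etag"])  # MD5 of the content, surfaced as the S3 ETag

data, meta = load_object(bucket_root, key)
print(data, meta["content_type"])
print(get_tags(bucket_root, key))
print([obj["Key"] for obj in list_objects(bucket_root, prefix="users/")])
```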