diff --git a/.github/workflows/python-tests.yml b/.github/workflows/python-tests.yml
index 82f5aa4..a525521 100644
--- a/.github/workflows/python-tests.yml
+++ b/.github/workflows/python-tests.yml
@@ -65,49 +65,3 @@ jobs:
           uv run pytest --run-api-tests
         env:
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-
-  docker:
-    needs: test
-    runs-on: ubuntu-latest
-    if: github.ref == 'refs/heads/main' && github.event_name == 'push'
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-
-      - name: Log in to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.DOCKER_USERNAME }}
-          password: ${{ secrets.DOCKER_TOKEN }}
-
-      - name: Log in to GitHub Container Registry
-        uses: docker/login-action@v3
-        with:
-          registry: ghcr.io
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Extract version from __init__.py
-        id: version
-        run: |
-          VERSION=$(grep '__version__ =' agent_memory_server/__init__.py | sed 's/__version__ = "\(.*\)"/\1/' || echo "latest")
-          echo "version=$VERSION" >> $GITHUB_OUTPUT
-          echo "Version: $VERSION"
-
-      - name: Build and push Docker image
-        uses: docker/build-push-action@v5
-        with:
-          context: .
-          file: ./Dockerfile
-          platforms: linux/amd64,linux/arm64
-          push: true
-          tags: |
-            redislabs/agent-memory-server:latest
-            redislabs/agent-memory-server:${{ steps.version.outputs.version }}
-            ghcr.io/${{ github.repository }}:latest
-            ghcr.io/${{ github.repository }}:${{ steps.version.outputs.version }}
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 0000000..295ca66
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,88 @@
+name: Release Docker Images
+
+on:
+  workflow_dispatch:
+    inputs:
+      version:
+        description: 'Version to release (leave empty to use version from __init__.py)'
+        required: false
+        type: string
+      push_latest:
+        description: 'Also tag as latest'
+        required: true
+        type: boolean
+        default: true
+
+jobs:
+  release:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Log in to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_TOKEN }}
+
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Determine version
+        id: version
+        run: |
+          if [ -n "${{ inputs.version }}" ]; then
+            VERSION="${{ inputs.version }}"
+          else
+            VERSION=$(grep '__version__ =' agent_memory_server/__init__.py | sed 's/__version__ = "\(.*\)"/\1/' || echo "latest")
+          fi
+          echo "version=$VERSION" >> $GITHUB_OUTPUT
+          echo "Version to release: $VERSION"
+
+      - name: Build tags list
+        id: tags
+        run: |
+          TAGS="redislabs/agent-memory-server:${{ steps.version.outputs.version }}"
+          TAGS="$TAGS,ghcr.io/${{ github.repository }}:${{ steps.version.outputs.version }}"
+
+          if [ "${{ inputs.push_latest }}" = "true" ]; then
+            TAGS="$TAGS,redislabs/agent-memory-server:latest"
+            TAGS="$TAGS,ghcr.io/${{ github.repository }}:latest"
+          fi
+
+          echo "tags=$TAGS" >> $GITHUB_OUTPUT
+          echo "Tags to push: $TAGS"
+
+      - name: Build and push Docker image
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          file: ./Dockerfile
+          platforms: linux/amd64,linux/arm64
+          push: true
+          tags: ${{ steps.tags.outputs.tags }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+
+      - name: Create GitHub Release
+        uses: actions/create-release@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          tag_name: v${{ steps.version.outputs.version }}
+          release_name: Release v${{ steps.version.outputs.version }}
+          body: |
+            Docker images published:
+            - `redislabs/agent-memory-server:${{ steps.version.outputs.version }}`
+            - `ghcr.io/${{ github.repository }}:${{ steps.version.outputs.version }}`
+            ${{ inputs.push_latest && format('- `redislabs/agent-memory-server:latest`{0}- `ghcr.io/{1}:latest`', '\n ', github.repository) || '' }}
+          draft: false
+          prerelease: false
diff --git a/agent-memory-client/README.md b/agent-memory-client/README.md
index c09123c..cb775a8 100644
--- a/agent-memory-client/README.md
+++ b/agent-memory-client/README.md
@@ -100,6 +100,7 @@ working_memory = WorkingMemory(
     messages=[
         MemoryMessage(role="user", content="Hello!"),
         MemoryMessage(role="assistant", content="Hi there! How can I help?")
+        # created_at timestamps are automatically set for proper chronological ordering
     ],
     namespace="chat-app"
 )
diff --git a/agent-memory-client/agent_memory_client/__init__.py b/agent-memory-client/agent_memory_client/__init__.py
index 6ab3f17..ab9b86f 100644
--- a/agent-memory-client/agent_memory_client/__init__.py
+++ b/agent-memory-client/agent_memory_client/__init__.py
@@ -5,7 +5,7 @@
 memory management capabilities for AI agents and applications.
 """

-__version__ = "0.12.1"
+__version__ = "0.12.2"

 from .client import MemoryAPIClient, MemoryClientConfig, create_memory_client
 from .exceptions import (
diff --git a/agent-memory-client/agent_memory_client/client.py b/agent-memory-client/agent_memory_client/client.py
index 9d8461a..9883f50 100644
--- a/agent-memory-client/agent_memory_client/client.py
+++ b/agent-memory-client/agent_memory_client/client.py
@@ -17,7 +17,12 @@
 from pydantic import BaseModel
 from ulid import ULID

-from .exceptions import MemoryClientError, MemoryServerError, MemoryValidationError
+from .exceptions import (
+    MemoryClientError,
+    MemoryNotFoundError,
+    MemoryServerError,
+    MemoryValidationError,
+)
 from .filters import (
     CreatedAt,
     Entities,
@@ -364,8 +369,15 @@ async def get_or_create_working_memory(
                 return (True, created_memory)

             return (False, existing_memory)
-        except httpx.HTTPStatusError as e:
-            if e.response.status_code == 404:
+        except (httpx.HTTPStatusError, MemoryNotFoundError) as e:
+            # Handle both HTTPStatusError and MemoryNotFoundError for 404s
+            is_404 = False
+            if isinstance(e, httpx.HTTPStatusError):
+                is_404 = e.response.status_code == 404
+            elif isinstance(e, MemoryNotFoundError):
+                is_404 = True
+
+            if is_404:
                 # Session doesn't exist, create it
                 empty_memory = WorkingMemory(
                     session_id=session_id,
@@ -885,14 +897,6 @@ async def search_long_term_memory(
             )
             response.raise_for_status()
             data = response.json()
-            # Some tests may stub json() as an async function; handle awaitable
-            try:
-                import inspect
-
-                if inspect.isawaitable(data):
-                    data = await data
-            except Exception:
-                pass
             return MemoryRecordResults(**data)
         except httpx.HTTPStatusError as e:
             self._handle_http_error(e.response)
@@ -1477,8 +1481,8 @@ def get_add_memory_tool_schema(cls) -> dict[str, Any]:
                         },
                         "memory_type": {
                             "type": "string",
-                            "enum": ["episodic", "semantic", "message"],
-                            "description": "Type of memory: 'episodic' (events/experiences), 'semantic' (facts/preferences), 'message' (conversation snippets)",
+                            "enum": ["episodic", "semantic"],
+                            "description": "Type of memory: 'episodic' (events/experiences), 'semantic' (facts/preferences)",
                         },
                         "topics": {
                             "type": "array",
@@ -1595,8 +1599,8 @@ def edit_long_term_memory_tool_schema(cls) -> dict[str, Any]:
                         },
                         "memory_type": {
                             "type": "string",
-                            "enum": ["episodic", "semantic", "message"],
-                            "description": "Updated memory type: 'episodic' (events/experiences), 'semantic' (facts/preferences), 'message' (conversation snippets)",
+                            "enum": ["episodic", "semantic"],
+                            "description": "Updated memory type: 'episodic' (events/experiences), 'semantic' (facts/preferences)",
                         },
                         "namespace": {
                             "type": "string",
@@ -1620,6 +1624,67 @@ def edit_long_term_memory_tool_schema(cls) -> dict[str, Any]:
             },
         }

+    @classmethod
+    def create_long_term_memory_tool_schema(cls) -> dict[str, Any]:
+        """
+        Get OpenAI-compatible tool schema for creating long-term memories directly.
+
+        Returns:
+            Tool schema dictionary compatible with OpenAI tool calling format
+        """
+        return {
+            "type": "function",
+            "function": {
+                "name": "create_long_term_memory",
+                "description": (
+                    "Create long-term memories directly for immediate storage and retrieval. "
+                    "Use this for important information that should be permanently stored without going through working memory. "
+                    "This is the 'eager' approach - memories are created immediately in long-term storage. "
+                    "Examples: User preferences, important facts, key events that need to be searchable right away. "
+                    "For episodic memories, include event_date in ISO format."
+                ),
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "memories": {
+                            "type": "array",
+                            "items": {
+                                "type": "object",
+                                "properties": {
+                                    "text": {
+                                        "type": "string",
+                                        "description": "The memory content to store",
+                                    },
+                                    "memory_type": {
+                                        "type": "string",
+                                        "enum": ["episodic", "semantic"],
+                                        "description": "Type of memory: 'episodic' (events/experiences), 'semantic' (facts/preferences)",
+                                    },
+                                    "topics": {
+                                        "type": "array",
+                                        "items": {"type": "string"},
+                                        "description": "Optional topics for categorization",
+                                    },
+                                    "entities": {
+                                        "type": "array",
+                                        "items": {"type": "string"},
+                                        "description": "Optional entities mentioned in the memory",
+                                    },
+                                    "event_date": {
+                                        "type": "string",
+                                        "description": "Optional event date for episodic memories (ISO 8601 format: '2024-01-15T14:30:00Z')",
+                                    },
+                                },
+                                "required": ["text", "memory_type"],
+                            },
+                            "description": "List of memories to create",
+                        },
+                    },
+                    "required": ["memories"],
+                },
+            },
+        }
+
     @classmethod
     def delete_long_term_memories_tool_schema(cls) -> dict[str, Any]:
         """
@@ -1674,6 +1739,7 @@ def get_all_memory_tool_schemas(cls) -> Sequence[dict[str, Any]]:
             cls.get_add_memory_tool_schema(),
             cls.get_update_memory_data_tool_schema(),
             cls.get_long_term_memory_tool_schema(),
+            cls.create_long_term_memory_tool_schema(),
             cls.edit_long_term_memory_tool_schema(),
             cls.delete_long_term_memories_tool_schema(),
             cls.get_current_datetime_tool_schema(),
@@ -1706,6 +1772,7 @@ def get_all_memory_tool_schemas_anthropic(cls) -> Sequence[dict[str, Any]]:
             cls.get_add_memory_tool_schema_anthropic(),
             cls.get_update_memory_data_tool_schema_anthropic(),
             cls.get_long_term_memory_tool_schema_anthropic(),
+            cls.create_long_term_memory_tool_schema_anthropic(),
             cls.edit_long_term_memory_tool_schema_anthropic(),
             cls.delete_long_term_memories_tool_schema_anthropic(),
             cls.get_current_datetime_tool_schema_anthropic(),
@@ -1764,6 +1831,12 @@ def get_long_term_memory_tool_schema_anthropic(cls) -> dict[str, Any]:
         openai_schema = cls.get_long_term_memory_tool_schema()
         return cls._convert_openai_to_anthropic_schema(openai_schema)

+    @classmethod
+    def create_long_term_memory_tool_schema_anthropic(cls) -> dict[str, Any]:
+        """Get create long-term memory tool schema in Anthropic format."""
+        openai_schema = cls.create_long_term_memory_tool_schema()
+        return cls._convert_openai_to_anthropic_schema(openai_schema)
+
     @classmethod
     def edit_long_term_memory_tool_schema_anthropic(cls) -> dict[str, Any]:
         """Get edit long-term memory tool schema in Anthropic format."""
@@ -2143,6 +2216,11 @@ async def resolve_function_call(
         elif function_name == "get_long_term_memory":
             result = await self._resolve_get_long_term_memory(args)

+        elif function_name == "create_long_term_memory":
+            result = await self._resolve_create_long_term_memory(
+                args, effective_namespace, user_id
+            )
+
         elif function_name == "edit_long_term_memory":
             result = await self._resolve_edit_long_term_memory(args)

@@ -2287,6 +2365,40 @@ async def _resolve_get_long_term_memory(
         result = await self.get_long_term_memory(memory_id=memory_id)
         return {"memory": result}

+    async def _resolve_create_long_term_memory(
+        self, args: dict[str, Any], namespace: str | None, user_id: str | None = None
+    ) -> dict[str, Any]:
+        """Resolve create_long_term_memory function call."""
+        memories_data = args.get("memories")
+        if not memories_data:
+            raise ValueError(
+                "memories parameter is required for create_long_term_memory"
+            )
+
+        # Convert dict memories to ClientMemoryRecord objects
+        from .models import ClientMemoryRecord, MemoryTypeEnum
+
+        memories = []
+        for memory_data in memories_data:
+            # Apply defaults
+            if namespace and "namespace" not in memory_data:
+                memory_data["namespace"] = namespace
+            if user_id and "user_id" not in memory_data:
+                memory_data["user_id"] = user_id
+
+            # Convert memory_type string to enum if needed
+            if "memory_type" in memory_data:
+                memory_data["memory_type"] = MemoryTypeEnum(memory_data["memory_type"])
+
+            memory = ClientMemoryRecord(**memory_data)
+            memories.append(memory)
+
+        result = await self.create_long_term_memory(memories)
+        return {
+            "status": result.status,
+            "message": f"Created {len(memories)} memories successfully",
+        }
+
     async def _resolve_edit_long_term_memory(
         self, args: dict[str, Any]
     ) -> dict[str, Any]:
@@ -2757,7 +2869,7 @@ async def memory_prompt(
         context_window_max: int | None = None,
         long_term_search: dict[str, Any] | None = None,
         user_id: str | None = None,
-        optimize_query: bool = True,
+        optimize_query: bool = False,
     ) -> dict[str, Any]:
         """
         Hydrate a user query with memory context and return a prompt ready to send to an LLM.
@@ -2861,7 +2973,7 @@ async def hydrate_memory_prompt(
         memory_type: dict[str, Any] | None = None,
         limit: int = 10,
         offset: int = 0,
-        optimize_query: bool = True,
+        optimize_query: bool = False,
     ) -> dict[str, Any]:
         """
         Hydrate a user query with long-term memory context using filters.
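
A minimal sketch of how the new `create_long_term_memory` tool might be wired into an LLM loop. The client setup, model name, and the keyword names passed to `resolve_function_call` (`function_arguments`, `session_id`) are assumptions for illustration, not part of this diff:

```python
import asyncio
import json

from agent_memory_client import MemoryAPIClient, MemoryClientConfig
from openai import AsyncOpenAI


async def main() -> None:
    # base_url is illustrative; point it at your running memory server
    memory = MemoryAPIClient(MemoryClientConfig(base_url="http://localhost:8000"))
    llm = AsyncOpenAI()

    response = await llm.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": "Remember that I prefer dark mode."}],
        tools=[MemoryAPIClient.create_long_term_memory_tool_schema()],
    )

    for call in response.choices[0].message.tool_calls or []:
        # Dispatches to _resolve_create_long_term_memory shown in the diff above;
        # the parameter names here are assumed from the client's internals.
        result = await memory.resolve_function_call(
            function_name=call.function.name,
            function_arguments=json.loads(call.function.arguments),
            session_id="demo-session",
        )
        print(result)


asyncio.run(main())
```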
diff --git a/agent-memory-client/agent_memory_client/models.py b/agent-memory-client/agent_memory_client/models.py
index 2c83760..41be9c7 100644
--- a/agent-memory-client/agent_memory_client/models.py
+++ b/agent-memory-client/agent_memory_client/models.py
@@ -57,6 +57,10 @@ class MemoryMessage(BaseModel):
         default_factory=lambda: str(ULID()),
         description="Unique identifier for the message (auto-generated)",
     )
+    created_at: datetime = Field(
+        default_factory=lambda: datetime.now(timezone.utc),
+        description="Timestamp when the message was created",
+    )
     persisted_at: datetime | None = Field(
         default=None,
         description="Server-assigned timestamp when message was persisted to long-term storage",
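
The new `created_at` field defaults to the construction time in UTC. A minimal sketch of what that buys (field names come from the diff above; the sort mirrors the server's recent-messages slicing):

```python
from agent_memory_client.models import MemoryMessage

# created_at is stamped automatically, so chronological order is recoverable
# without any extra bookkeeping by the caller.
first = MemoryMessage(role="user", content="Hello!")
second = MemoryMessage(role="assistant", content="Hi there!")

for msg in sorted([second, first], key=lambda m: m.created_at):
    print(msg.created_at.isoformat(), msg.role)
```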
+""" + +from agent_memory_client import MemoryAPIClient + + +class TestToolSchemaStructure: + """Tests for tool schema structure and completeness.""" + + def test_get_memory_search_tool_schema(self): + """Test memory search tool schema structure.""" + schema = MemoryAPIClient.get_memory_search_tool_schema() + + assert schema["type"] == "function" + assert schema["function"]["name"] == "search_memory" + assert "description" in schema["function"] + assert "parameters" in schema["function"] + assert schema["function"]["parameters"]["type"] == "object" + assert "query" in schema["function"]["parameters"]["properties"] + assert "query" in schema["function"]["parameters"]["required"] + + def test_get_add_memory_tool_schema(self): + """Test add_memory_to_working_memory tool schema structure.""" + schema = MemoryAPIClient.get_add_memory_tool_schema() + + assert schema["type"] == "function" + assert schema["function"]["name"] == "add_memory_to_working_memory" + assert "description" in schema["function"] + assert "parameters" in schema["function"] + + params = schema["function"]["parameters"] + assert params["type"] == "object" + assert "text" in params["properties"] + assert "memory_type" in params["properties"] + assert "text" in params["required"] + assert "memory_type" in params["required"] + + def test_create_long_term_memory_tool_schema(self): + """Test create_long_term_memory tool schema structure.""" + schema = MemoryAPIClient.create_long_term_memory_tool_schema() + + assert schema["type"] == "function" + assert schema["function"]["name"] == "create_long_term_memory" + assert "description" in schema["function"] + assert "parameters" in schema["function"] + + params = schema["function"]["parameters"] + assert params["type"] == "object" + assert "memories" in params["properties"] + assert "memories" in params["required"] + + # Check nested structure + memory_items = params["properties"]["memories"]["items"] + assert "text" in memory_items["properties"] + assert "memory_type" in memory_items["properties"] + assert "text" in memory_items["required"] + assert "memory_type" in memory_items["required"] + + def test_edit_long_term_memory_tool_schema(self): + """Test edit_long_term_memory tool schema structure.""" + schema = MemoryAPIClient.edit_long_term_memory_tool_schema() + + assert schema["type"] == "function" + assert schema["function"]["name"] == "edit_long_term_memory" + assert "description" in schema["function"] + assert "parameters" in schema["function"] + + params = schema["function"]["parameters"] + assert params["type"] == "object" + assert "memory_id" in params["properties"] + assert "memory_id" in params["required"] + assert "text" in params["properties"] + assert "memory_type" in params["properties"] + + def test_delete_long_term_memories_tool_schema(self): + """Test delete_long_term_memories tool schema structure.""" + schema = MemoryAPIClient.delete_long_term_memories_tool_schema() + + assert schema["type"] == "function" + assert schema["function"]["name"] == "delete_long_term_memories" + assert "description" in schema["function"] + assert "parameters" in schema["function"] + + params = schema["function"]["parameters"] + assert params["type"] == "object" + assert "memory_ids" in params["properties"] + assert "memory_ids" in params["required"] + + def test_get_all_memory_tool_schemas(self): + """Test getting all memory tool schemas.""" + schemas = MemoryAPIClient.get_all_memory_tool_schemas() + + # Should have multiple tools + assert len(schemas) > 0 + + # Check that all expected 
tools are present + function_names = {schema["function"]["name"] for schema in schemas} + expected_tools = { + "search_memory", + "get_or_create_working_memory", + "add_memory_to_working_memory", + "update_working_memory_data", + "get_long_term_memory", + "create_long_term_memory", + "edit_long_term_memory", + "delete_long_term_memories", + "get_current_datetime", + } + assert expected_tools.issubset(function_names) + + +class TestMemoryTypeEnumExclusion: + """Tests that 'message' memory type is NOT exposed in creation/editing tool schemas. + + Note: search_memory CAN include 'message' in its filter enum since it's for + searching/reading existing memories, not creating new ones. The restriction + only applies to tools that create or modify memories. + """ + + def test_add_memory_excludes_message_type(self): + """Test that add_memory_to_working_memory excludes 'message' type.""" + schema = MemoryAPIClient.get_add_memory_tool_schema() + + params = schema["function"]["parameters"] + memory_type_prop = params["properties"]["memory_type"] + + # Should only have episodic and semantic + assert memory_type_prop["enum"] == ["episodic", "semantic"] + assert "message" not in memory_type_prop["enum"] + + def test_create_long_term_memory_excludes_message_type(self): + """Test that create_long_term_memory excludes 'message' type.""" + schema = MemoryAPIClient.create_long_term_memory_tool_schema() + + params = schema["function"]["parameters"] + memory_items = params["properties"]["memories"]["items"] + memory_type_prop = memory_items["properties"]["memory_type"] + + # Should only have episodic and semantic + assert memory_type_prop["enum"] == ["episodic", "semantic"] + assert "message" not in memory_type_prop["enum"] + + def test_edit_long_term_memory_excludes_message_type(self): + """Test that edit_long_term_memory excludes 'message' type.""" + schema = MemoryAPIClient.edit_long_term_memory_tool_schema() + + params = schema["function"]["parameters"] + memory_type_prop = params["properties"]["memory_type"] + + # Should only have episodic and semantic + assert memory_type_prop["enum"] == ["episodic", "semantic"] + assert "message" not in memory_type_prop["enum"] + + def test_search_memory_allows_message_type_filter(self): + """Test that search_memory DOES allow 'message' type for filtering. + + This is intentional - search tools should be able to filter by message type + to find conversation history, but creation/editing tools should not be able + to create or modify message-type memories. 
+ """ + schema = MemoryAPIClient.get_memory_search_tool_schema() + + params = schema["function"]["parameters"] + memory_type_prop = params["properties"]["memory_type"] + + # Search should include all types including message + assert "episodic" in memory_type_prop["enum"] + assert "semantic" in memory_type_prop["enum"] + assert "message" in memory_type_prop["enum"] + + def test_creation_and_editing_tools_exclude_message_type(self): + """Test that creation and editing tools (not search) exclude 'message'.""" + all_schemas = MemoryAPIClient.get_all_memory_tool_schemas() + + # Tools that should NOT expose message type (creation/editing tools) + restricted_tools = { + "add_memory_to_working_memory", + "create_long_term_memory", + "edit_long_term_memory", + } + + # Tools that CAN expose message type (search/read tools) + allowed_tools = { + "search_memory", + "get_long_term_memory", + } + + for schema in all_schemas: + function_name = schema["function"]["name"] + params = schema["function"]["parameters"] + + # Check direct memory_type property + if "memory_type" in params["properties"]: + memory_type_prop = params["properties"]["memory_type"] + if "enum" in memory_type_prop: + if function_name in restricted_tools: + assert ( + "message" not in memory_type_prop["enum"] + ), f"Creation/editing tool '{function_name}' should not expose 'message' memory type" + elif function_name in allowed_tools: + # These tools are allowed to have message in enum for filtering + pass + + # Check nested properties (like in create_long_term_memory) + if "memories" in params["properties"]: + items = params["properties"]["memories"].get("items", {}) + if ( + "properties" in items + and "memory_type" in items["properties"] + and "enum" in items["properties"]["memory_type"] + and function_name in restricted_tools + ): + memory_type_prop = items["properties"]["memory_type"] + assert ( + "message" not in memory_type_prop["enum"] + ), f"Creation/editing tool '{function_name}' should not expose 'message' memory type in nested properties" + + +class TestAnthropicSchemas: + """Tests for Anthropic-formatted tool schemas.""" + + def test_get_memory_search_tool_schema_anthropic(self): + """Test memory search tool schema in Anthropic format.""" + schema = MemoryAPIClient.get_memory_search_tool_schema_anthropic() + + assert schema["name"] == "search_memory" + assert "description" in schema + assert "input_schema" in schema + assert schema["input_schema"]["type"] == "object" + assert "query" in schema["input_schema"]["properties"] + assert "query" in schema["input_schema"]["required"] + + def test_create_long_term_memory_tool_schema_anthropic(self): + """Test create_long_term_memory tool schema in Anthropic format.""" + schema = MemoryAPIClient.create_long_term_memory_tool_schema_anthropic() + + assert schema["name"] == "create_long_term_memory" + assert "description" in schema + assert "input_schema" in schema + assert schema["input_schema"]["type"] == "object" + assert "memories" in schema["input_schema"]["properties"] + + def test_edit_long_term_memory_tool_schema_anthropic(self): + """Test edit_long_term_memory tool schema in Anthropic format.""" + schema = MemoryAPIClient.edit_long_term_memory_tool_schema_anthropic() + + assert schema["name"] == "edit_long_term_memory" + assert "description" in schema + assert "input_schema" in schema + assert schema["input_schema"]["type"] == "object" + assert "memory_id" in schema["input_schema"]["properties"] + + def test_delete_long_term_memories_tool_schema_anthropic(self): + """Test 
delete_long_term_memories tool schema in Anthropic format.""" + schema = MemoryAPIClient.delete_long_term_memories_tool_schema_anthropic() + + assert schema["name"] == "delete_long_term_memories" + assert "description" in schema + assert "input_schema" in schema + assert schema["input_schema"]["type"] == "object" + assert "memory_ids" in schema["input_schema"]["properties"] + + def test_anthropic_schemas_exclude_message_type_for_creation(self): + """Test that Anthropic creation/editing schemas exclude 'message' type.""" + all_schemas = MemoryAPIClient.get_all_memory_tool_schemas_anthropic() + + # Tools that should NOT expose message type (creation/editing tools) + restricted_tools = { + "add_memory_to_working_memory", + "create_long_term_memory", + "edit_long_term_memory", + } + + # Tools that CAN expose message type (search/read tools) + allowed_tools = { + "search_memory", + "get_long_term_memory", + } + + for schema in all_schemas: + function_name = schema["name"] + params = schema["input_schema"] + + # Check direct memory_type property + if "memory_type" in params["properties"]: + memory_type_prop = params["properties"]["memory_type"] + if "enum" in memory_type_prop: + if function_name in restricted_tools: + assert ( + "message" not in memory_type_prop["enum"] + ), f"Anthropic creation/editing tool '{function_name}' should not expose 'message' memory type" + elif function_name in allowed_tools: + # These tools are allowed to have message in enum for filtering + pass + + # Check nested properties + if "memories" in params["properties"]: + items = params["properties"]["memories"].get("items", {}) + if ( + "properties" in items + and "memory_type" in items["properties"] + and "enum" in items["properties"]["memory_type"] + and function_name in restricted_tools + ): + memory_type_prop = items["properties"]["memory_type"] + assert ( + "message" not in memory_type_prop["enum"] + ), f"Anthropic creation/editing tool '{function_name}' should not expose 'message' memory type in nested properties" diff --git a/agent_memory_server/__init__.py b/agent_memory_server/__init__.py index c836dfc..6ea2f13 100644 --- a/agent_memory_server/__init__.py +++ b/agent_memory_server/__init__.py @@ -1,3 +1,3 @@ """Redis Agent Memory Server - A memory system for conversational AI.""" -__version__ = "0.12.1" +__version__ = "0.12.2" diff --git a/agent_memory_server/api.py b/agent_memory_server/api.py index adadb84..db45950 100644 --- a/agent_memory_server/api.py +++ b/agent_memory_server/api.py @@ -27,6 +27,7 @@ SearchRequest, SessionListResponse, SystemMessage, + UpdateWorkingMemory, WorkingMemory, WorkingMemoryResponse, ) @@ -138,9 +139,14 @@ def _calculate_context_usage_percentages( - until_summarization_percentage: Percentage (0-100) until summarization triggers Both values are None if no model info provided """ - if not messages or (not model_name and not context_window_max): + # Return None only when no model information is provided + if not model_name and not context_window_max: return None, None + # If no messages but model info is provided, return 0% usage + if not messages: + return 0.0, 0.0 + # Calculate current token usage current_tokens = _calculate_messages_token_count(messages) @@ -148,11 +154,18 @@ def _calculate_context_usage_percentages( max_tokens = _get_effective_token_limit(model_name, context_window_max) # Calculate percentage of total context window used + if max_tokens <= 0: + return None, None + total_percentage = (current_tokens / max_tokens) * 100.0 # Calculate percentage until 
summarization threshold token_threshold = int(max_tokens * settings.summarization_threshold) - until_summarization_percentage = (current_tokens / token_threshold) * 100.0 + if token_threshold <= 0: + # If threshold is 0 or negative, we're already at 100% until summarization + until_summarization_percentage = 100.0 + else: + until_summarization_percentage = (current_tokens / token_threshold) * 100.0 # Cap both at 100% for display purposes return min(total_percentage, 100.0), min(until_summarization_percentage, 100.0) @@ -346,6 +359,7 @@ async def get_working_memory( namespace: str | None = None, model_name: ModelNameLiteral | None = None, context_window_max: int | None = None, + recent_messages_limit: int | None = None, x_client_version: str | None = Header(None, alias="X-Client-Version"), current_user: UserInfo = Depends(get_current_user), ): @@ -361,6 +375,7 @@ async def get_working_memory( namespace: The namespace to use for the session model_name: The client's LLM model name (will determine context window size if provided) context_window_max: Direct specification of the context window max tokens (overrides model_name) + recent_messages_limit: Maximum number of recent messages to return (most recent first) Returns: Working memory containing messages, context, and structured memory records @@ -372,6 +387,7 @@ async def get_working_memory( namespace=namespace, redis_client=redis, user_id=user_id, + recent_messages_limit=recent_messages_limit, ) # Handle missing sessions based on client version @@ -439,8 +455,7 @@ async def get_working_memory( @router.put("/v1/working-memory/{session_id}", response_model=WorkingMemoryResponse) async def put_working_memory( session_id: str, - memory: WorkingMemory, - user_id: str | None = None, + memory: UpdateWorkingMemory, model_name: ModelNameLiteral | None = None, context_window_max: int | None = None, background_tasks=Depends(get_background_tasks), @@ -449,33 +464,37 @@ async def put_working_memory( """ Set working memory for a session. Replaces existing working memory. + The session_id comes from the URL path, not the request body. If the token count exceeds the context window threshold, messages will be summarized immediately and the updated memory state returned to the client. + NOTE on context_percentage_* fields: + The response includes `context_percentage_total_used` and `context_percentage_until_summarization` + fields that show token usage. These fields will be `null` unless you provide either: + - `model_name` query parameter (e.g., `?model_name=gpt-4o-mini`) + - `context_window_max` query parameter (e.g., `?context_window_max=500`) + Args: - session_id: The session ID - memory: Working memory to save - user_id: Optional user ID for the session (overrides user_id in memory object) + session_id: The session ID (from URL path) + memory: Working memory data to save (session_id not required in body) model_name: The client's LLM model name for context window determination - context_window_max: Direct specification of context window max tokens + context_window_max: Direct specification of context window max tokens (overrides model_name) background_tasks: DocketBackgroundTasks instance (injected automatically) Returns: - Updated working memory (potentially with summary if tokens were condensed) + Updated working memory (potentially with summary if tokens were condensed). + Includes context_percentage_total_used and context_percentage_until_summarization + if model information is provided. 
""" redis = await get_redis_conn() # PUT semantics: we simply replace whatever exists (or create if it doesn't exist) - # Ensure session_id matches - memory.session_id = session_id - - # Override user_id if provided as query parameter - if user_id is not None: - memory.user_id = user_id + # Convert UpdateWorkingMemory to WorkingMemory with session_id from URL path + working_memory_obj = memory.to_working_memory(session_id) # Validate that all long-term memories have id (if any) - for long_term_mem in memory.memories: + for long_term_mem in working_memory_obj.memories: if not long_term_mem.id: raise HTTPException( status_code=400, @@ -483,7 +502,7 @@ async def put_working_memory( ) # Validate that all messages have non-empty content - for msg in memory.messages: + for msg in working_memory_obj.messages: if not msg.content or not msg.content.strip(): raise HTTPException( status_code=400, @@ -491,10 +510,12 @@ async def put_working_memory( ) # Handle summarization if needed (before storing) - now token-based - updated_memory = memory - if memory.messages: + updated_memory = working_memory_obj + if working_memory_obj.messages: updated_memory = await _summarize_working_memory( - memory, model_name=model_name, context_window_max=context_window_max + working_memory_obj, + model_name=model_name, + context_window_max=context_window_max, ) await working_memory.set_working_memory( @@ -507,6 +528,9 @@ async def put_working_memory( updated_memory.memories or updated_memory.messages ): # Promote structured memories from working memory to long-term storage + # TODO: Evaluate if this is an optimal way to pass around user ID. We + # need it to construct the key to get the working memory session from + # this task, if the session was saved with a user ID to begin with. 
background_tasks.add_task( long_term_memory.promote_working_memory_to_long_term, session_id=session_id, @@ -515,6 +539,7 @@ async def put_working_memory( ) # Calculate context usage percentages based on the final state (after potential summarization) + # This represents the current state of the session total_percentage, until_summarization_percentage = ( _calculate_context_usage_percentages( messages=updated_memory.messages, @@ -606,7 +631,7 @@ async def create_long_term_memory( @router.post("/v1/long-term-memory/search", response_model=MemoryRecordResultsResponse) async def search_long_term_memory( payload: SearchRequest, - optimize_query: bool = True, + optimize_query: bool = False, current_user: UserInfo = Depends(get_current_user), ): """ @@ -614,7 +639,7 @@ async def search_long_term_memory( Args: payload: Search payload with filter objects for precise queries - optimize_query: Whether to optimize the query for vector search using a fast model (default: True) + optimize_query: Whether to optimize the query for vector search using a fast model (default: False) Returns: List of search results @@ -639,8 +664,7 @@ async def search_long_term_memory( logger.debug(f"Long-term search kwargs: {kwargs}") - # Pass text and filter objects to the search function (no redis needed for vectorstore adapter) - # Server-side recency rerank toggle (Redis-only path); defaults to False + # Server-side recency rerank toggle server_side_recency = ( payload.server_side_recency if payload.server_side_recency is not None @@ -654,18 +678,13 @@ async def search_long_term_memory( raw_results = await long_term_memory.search_long_term_memories(**kwargs) # Soft-filter fallback: if strict filters yield no results, relax filters and - # inject hints into the query text to guide semantic search. For memory_prompt - # unit tests, the underlying function is mocked; avoid triggering fallback to - # keep call counts stable when optimize_query behavior is being asserted. + # inject hints into the query text to guide semantic search. try: had_any_strict_filters = any( key in kwargs and kwargs[key] is not None for key in ("topics", "entities", "namespace", "memory_type", "event_date") ) - is_mocked = "unittest.mock" in str( - type(long_term_memory.search_long_term_memories) - ) - if raw_results.total == 0 and had_any_strict_filters and not is_mocked: + if raw_results.total == 0 and had_any_strict_filters: fallback_kwargs = dict(kwargs) for key in ("topics", "entities", "namespace", "memory_type", "event_date"): fallback_kwargs.pop(key, None) @@ -713,6 +732,8 @@ def _vals(f): logger.warning(f"Soft-filter fallback failed: {e}") # Recency-aware re-ranking of results (configurable) + # TODO: Why did we need to go this route instead of using recency boost at + # the query level? 
try: from datetime import UTC, datetime as _dt @@ -832,7 +853,7 @@ async def update_long_term_memory( @router.post("/v1/memory/prompt", response_model=MemoryPromptResponse) async def memory_prompt( params: MemoryPromptRequest, - optimize_query: bool = True, + optimize_query: bool = False, current_user: UserInfo = Depends(get_current_user), ) -> MemoryPromptResponse: """ @@ -850,7 +871,7 @@ async def memory_prompt( Args: params: MemoryPromptRequest - optimize_query: Whether to optimize the query for vector search using a fast model (default: True) + optimize_query: Whether to optimize the query for vector search using a fast model (default: False) Returns: List of messages to send to an LLM, hydrated with relevant memory context diff --git a/agent_memory_server/llms.py b/agent_memory_server/llms.py index de4901c..835c244 100644 --- a/agent_memory_server/llms.py +++ b/agent_memory_server/llms.py @@ -96,7 +96,7 @@ async def create_chat_completion( choices = [{"message": {"content": content}}] - # Handle both object and dictionary usage formats for testing + # Handle both object and dictionary usage formats from API responses input_tokens = output_tokens = 0 if hasattr(response, "usage"): if isinstance(response.usage, dict): @@ -180,7 +180,7 @@ async def create_chat_completion( ) # Convert to unified format - # Handle both object and dictionary usage formats for testing + # Handle both object and dictionary usage formats from API responses total_tokens = 0 if hasattr(response, "usage"): if isinstance(response.usage, dict): diff --git a/agent_memory_server/long_term_memory.py b/agent_memory_server/long_term_memory.py index 3097e74..1c24150 100644 --- a/agent_memory_server/long_term_memory.py +++ b/agent_memory_server/long_term_memory.py @@ -877,7 +877,7 @@ async def search_long_term_memories( recency_params: dict | None = None, limit: int = 10, offset: int = 0, - optimize_query: bool = True, + optimize_query: bool = False, ) -> MemoryRecordResults: """ Search for long-term memories using the pluggable VectorStore adapter. @@ -897,7 +897,7 @@ async def search_long_term_memories( memory_hash: Optional memory hash filter limit: Maximum number of results offset: Offset for pagination - optimize_query: Whether to optimize the query for vector search using a fast model (default: True) + optimize_query: Whether to optimize the query for vector search using a fast model (default: False) Returns: MemoryRecordResults containing matching memories @@ -933,8 +933,7 @@ async def search_long_term_memories( ) # If an optimized query with a strict distance threshold returns no results, - # retry once with the original query to preserve recall. Skip this retry when - # the adapter is a unittest mock to avoid altering test expectations. + # retry once with the original query to preserve recall. 
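
A worked example of the guarded percentage math in `_calculate_context_usage_percentages` above. The values are illustrative, and the threshold constant stands in for `settings.summarization_threshold`:

```python
summarization_threshold = 0.7  # assumed value of settings.summarization_threshold

current_tokens = 350
max_tokens = 1000  # derived from model_name or context_window_max

total_percentage = (current_tokens / max_tokens) * 100.0  # 35.0

token_threshold = int(max_tokens * summarization_threshold)  # 700
until_summarization_percentage = (current_tokens / token_threshold) * 100.0  # 50.0

# With no messages the function now returns (0.0, 0.0) instead of (None, None),
# and a non-positive max_tokens short-circuits to (None, None) so the division
# above can never raise ZeroDivisionError.
print(min(total_percentage, 100.0), min(until_summarization_percentage, 100.0))
```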
diff --git a/agent_memory_server/llms.py b/agent_memory_server/llms.py
index de4901c..835c244 100644
--- a/agent_memory_server/llms.py
+++ b/agent_memory_server/llms.py
@@ -96,7 +96,7 @@ async def create_chat_completion(

     choices = [{"message": {"content": content}}]

-    # Handle both object and dictionary usage formats for testing
+    # Handle both object and dictionary usage formats from API responses
     input_tokens = output_tokens = 0
     if hasattr(response, "usage"):
         if isinstance(response.usage, dict):
@@ -180,7 +180,7 @@ async def create_chat_completion(
     )

     # Convert to unified format
-    # Handle both object and dictionary usage formats for testing
+    # Handle both object and dictionary usage formats from API responses
     total_tokens = 0
     if hasattr(response, "usage"):
         if isinstance(response.usage, dict):
diff --git a/agent_memory_server/long_term_memory.py b/agent_memory_server/long_term_memory.py
index 3097e74..1c24150 100644
--- a/agent_memory_server/long_term_memory.py
+++ b/agent_memory_server/long_term_memory.py
@@ -877,7 +877,7 @@ async def search_long_term_memories(
     recency_params: dict | None = None,
     limit: int = 10,
     offset: int = 0,
-    optimize_query: bool = True,
+    optimize_query: bool = False,
 ) -> MemoryRecordResults:
     """
     Search for long-term memories using the pluggable VectorStore adapter.
@@ -897,7 +897,7 @@ async def search_long_term_memories(
         memory_hash: Optional memory hash filter
         limit: Maximum number of results
         offset: Offset for pagination
-        optimize_query: Whether to optimize the query for vector search using a fast model (default: True)
+        optimize_query: Whether to optimize the query for vector search using a fast model (default: False)

     Returns:
         MemoryRecordResults containing matching memories
@@ -933,8 +933,7 @@ async def search_long_term_memories(
     )

     # If an optimized query with a strict distance threshold returns no results,
-    # retry once with the original query to preserve recall. Skip this retry when
-    # the adapter is a unittest mock to avoid altering test expectations.
+    # retry once with the original query to preserve recall.
     try:
         if (
             optimized_applied
@@ -942,34 +941,24 @@ async def search_long_term_memories(
             and results.total == 0
             and search_query != text
         ):
-            # Detect unittest.mock objects without importing globally
-            is_mock = False
-            try:
-                from unittest.mock import Mock  # type: ignore
-
-                is_mock = isinstance(getattr(adapter, "search_memories", None), Mock)
-            except Exception:
-                is_mock = False
-
-            if not is_mock:
-                results = await adapter.search_memories(
-                    query=text,
-                    session_id=session_id,
-                    user_id=user_id,
-                    namespace=namespace,
-                    created_at=created_at,
-                    last_accessed=last_accessed,
-                    topics=topics,
-                    entities=entities,
-                    memory_type=memory_type,
-                    event_date=event_date,
-                    memory_hash=memory_hash,
-                    distance_threshold=distance_threshold,
-                    server_side_recency=server_side_recency,
-                    recency_params=recency_params,
-                    limit=limit,
-                    offset=offset,
-                )
+            results = await adapter.search_memories(
+                query=text,
+                session_id=session_id,
+                user_id=user_id,
+                namespace=namespace,
+                created_at=created_at,
+                last_accessed=last_accessed,
+                topics=topics,
+                entities=entities,
+                memory_type=memory_type,
+                event_date=event_date,
+                memory_hash=memory_hash,
+                distance_threshold=distance_threshold,
+                server_side_recency=server_side_recency,
+                recency_params=recency_params,
+                limit=limit,
+                offset=offset,
+            )
     except Exception:
         # Best-effort fallback; return the original results on any error
         pass
@@ -1314,24 +1303,7 @@ async def promote_working_memory_to_long_term(
         logger.debug(f"No working memory found for session {session_id}")
         return 0

-    # Find memories with no persisted_at (eligible for promotion)
-    unpersisted_memories = [
-        memory
-        for memory in current_working_memory.memories
-        if memory.persisted_at is None
-    ]
-
-    # Find unpersisted messages (similar to unpersisted memories)
-    if settings.index_all_messages_in_long_term_memory:
-        unpersisted_messages = [
-            msg for msg in current_working_memory.messages if msg.persisted_at is None
-        ]
-    else:
-        unpersisted_messages = []
-
-    logger.info(
-        f"Promoting {len(unpersisted_memories)} memories and {len(unpersisted_messages)} messages from session {session_id}"
-    )
+    logger.info("Promoting memories to long-term storage...")

     promoted_count = 0
     updated_memories = []
@@ -1344,6 +1316,7 @@ async def promote_working_memory_to_long_term(
         if message.discrete_memory_extracted == "f"
     ]

+    extracted_memories = []
     if settings.enable_discrete_memory_extraction and unextracted_messages:
         # Check if we should run thread-aware extraction (debounced)
         if await should_extract_session_thread(session_id, redis):
@@ -1362,9 +1335,16 @@ async def promote_working_memory_to_long_term(
         else:
             logger.info(f"Skipping extraction for session {session_id} - debounced")

-    extracted_memories = []
-
-    for memory in current_working_memory.memories:
+    # Combine existing memories with newly extracted memories for processing
+    all_memories_to_process = list(current_working_memory.memories)
+    if extracted_memories:
+        logger.info(
+            f"Adding {len(extracted_memories)} extracted memories for promotion"
+        )
+        all_memories_to_process.extend(extracted_memories)
+
+    for memory in all_memories_to_process:
         if memory.persisted_at is None:
             # This memory needs to be promoted
@@ -1404,13 +1384,6 @@ async def promote_working_memory_to_long_term(
             # This memory is already persisted, keep as-is
             updated_memories.append(memory)

-    # Add extracted memories to working memory for future promotion
-    if extracted_memories:
-        logger.info(
-            f"Adding {len(extracted_memories)} extracted memories to working memory"
-        )
-        updated_memories.extend(extracted_memories)
-
     count_persisted_messages = 0
     message_records_to_index = []

@@ -1436,6 +1409,8 @@ async def promote_working_memory_to_long_term(
                     namespace=namespace,
                     user_id=current_working_memory.user_id,
                     persisted_at=None,
+                    created_at=msg.created_at,
+                    memory_type=MemoryTypeEnum.MESSAGE,
                 )

                 # Apply same deduplication logic as structured memories
@@ -1449,12 +1424,7 @@ async def promote_working_memory_to_long_term(
                 current_memory.persisted_at = datetime.now(UTC)

                 # Set extraction strategy configuration from working memory
-                current_memory.extraction_strategy = (
-                    current_working_memory.long_term_memory_strategy.strategy
-                )
-                current_memory.extraction_strategy_config = (
-                    current_working_memory.long_term_memory_strategy.config
-                )
+                current_memory.extraction_strategy = "message"

                 # Collect memory record for batch indexing
                 message_records_to_index.append(current_memory)
@@ -1464,9 +1434,13 @@ async def promote_working_memory_to_long_term(
                 promoted_count += 1

                 if was_overwrite:
-                    logger.info(f"Overwrote existing message with id {msg.id}")
+                    logger.info(
+                        f"Overwrote existing long-term message memory with ID {msg.id}"
+                    )
                 else:
-                    logger.info(f"Promoted new message with id {msg.id}")
+                    logger.info(
+                        f"Promoted new long-term message memory with ID {msg.id}"
+                    )

                 updated_messages.append(msg)

@@ -1482,8 +1456,20 @@ async def promote_working_memory_to_long_term(
         count_persisted_messages = 0
         updated_messages = current_working_memory.messages

+    # Check if any messages were marked as extracted
+    messages_marked_extracted = (
+        settings.enable_discrete_memory_extraction
+        and unextracted_messages
+        and await should_extract_session_thread(session_id, redis)
+    )
+
     # Update working memory with the new persisted_at timestamps and extracted memories
-    if promoted_count > 0 or extracted_memories or count_persisted_messages > 0:
+    if (
+        promoted_count > 0
+        or extracted_memories
+        or count_persisted_messages > 0
+        or messages_marked_extracted
+    ):
         updated_working_memory = current_working_memory.model_copy()
         updated_working_memory.memories = updated_memories
         updated_working_memory.messages = updated_messages
diff --git a/agent_memory_server/mcp.py b/agent_memory_server/mcp.py
index 6e3c2e4..7645c4c 100644
--- a/agent_memory_server/mcp.py
+++ b/agent_memory_server/mcp.py
@@ -875,11 +875,21 @@ async def set_working_memory(
 @mcp_app.tool()
 async def get_working_memory(
     session_id: str,
+    recent_messages_limit: int | None = None,
 ) -> WorkingMemory:
     """
     Get working memory for a session. This works like the GET /sessions/{id}/memory API endpoint.
+
+    Args:
+        session_id: The session ID to retrieve working memory for
+        recent_messages_limit: Optional limit on number of recent messages to return (most recent first)
+
+    Returns:
+        Working memory containing messages, context, and structured memory records
     """
-    return await core_get_working_memory(session_id=session_id)
+    return await core_get_working_memory(
+        session_id=session_id, recent_messages_limit=recent_messages_limit
+    )


 @mcp_app.tool()
diff --git a/agent_memory_server/models.py b/agent_memory_server/models.py
index 01c240b..f96190f 100644
--- a/agent_memory_server/models.py
+++ b/agent_memory_server/models.py
@@ -87,6 +87,10 @@ class MemoryMessage(BaseModel):
         default_factory=lambda: str(ULID()),
         description="Unique identifier for the message (auto-generated if not provided)",
     )
+    created_at: datetime = Field(
+        default_factory=lambda: datetime.now(UTC),
+        description="Timestamp when the message was created",
+    )
     persisted_at: datetime | None = Field(
         default=None,
         description="Server-assigned timestamp when message was persisted to long-term storage",
@@ -185,7 +189,15 @@ class MemoryRecord(BaseModel):


 class ExtractedMemoryRecord(MemoryRecord):
-    """A memory record that has already been extracted (e.g., explicit memories from API/MCP)"""
+    """
+    A memory record that has already been extracted.
+
+    We use this to represent data payloads where we consider the memory
+    in its final state:
+    - Long-term memories that clients created explicitly through the API
+    - Memories an LLM added to working memory (using a tool) that should be
+      "promoted" from working memory to long-term storage.
+    """

     discrete_memory_extracted: Literal["t", "f"] = Field(
         default="t",
@@ -378,6 +390,79 @@ async def create_long_term_memories_with_strategy(memories: list[dict]) -> dict:
     return create_long_term_memories_with_strategy


+class UpdateWorkingMemory(BaseModel):
+    """Working memory update payload for PUT requests - session_id comes from URL path"""
+
+    messages: list[MemoryMessage] = Field(
+        default_factory=list,
+        description="Conversation messages (role/content pairs)",
+    )
+    memories: list[MemoryRecord | ClientMemoryRecord] = Field(
+        default_factory=list,
+        description="Structured memory records for promotion to long-term storage",
+    )
+    data: dict[str, JSONTypes] | None = Field(
+        default=None,
+        description="Arbitrary JSON data storage (key-value pairs)",
+    )
+    context: str | None = Field(
+        default=None,
+        description="Summary of past session messages if server has auto-summarized",
+    )
+    user_id: str | None = Field(
+        default=None,
+        description="Optional user ID for the working memory",
+    )
+    tokens: int = Field(
+        default=0,
+        description="Optional number of tokens in the working memory",
+    )
+    namespace: str | None = Field(
+        default=None,
+        description="Optional namespace for the working memory",
+    )
+    long_term_memory_strategy: MemoryStrategyConfig = Field(
+        default_factory=MemoryStrategyConfig,
+        description="Configuration for memory extraction strategy when promoting to long-term memory",
+    )
+
+    # TTL and timestamps
+    ttl_seconds: int | None = Field(
+        default=None,  # Persistent by default
+        description="TTL for the working memory in seconds",
+    )
+    last_accessed: datetime = Field(
+        default_factory=lambda: datetime.now(UTC),
+        description="Datetime when the working memory was last accessed",
+    )
+    created_at: datetime = Field(
+        default_factory=lambda: datetime.now(UTC),
+        description="Datetime when the working memory was created",
+    )
+    updated_at: datetime = Field(
+        default_factory=lambda: datetime.now(UTC),
+        description="Datetime when the working memory was last updated",
+    )
+
+    def to_working_memory(self, session_id: str) -> "WorkingMemory":
+        """Convert to WorkingMemory by adding the session_id from URL path"""
+        return WorkingMemory(
+            session_id=session_id,
+            messages=self.messages,
+            memories=self.memories,
+            data=self.data,
+            context=self.context,
+            user_id=self.user_id,
+            tokens=self.tokens,
+            namespace=self.namespace,
+            long_term_memory_strategy=self.long_term_memory_strategy,
+            ttl_seconds=self.ttl_seconds,
+            last_accessed=self.last_accessed,
+            created_at=self.created_at,
+            updated_at=self.updated_at,
+        )
+
+
 class WorkingMemoryResponse(WorkingMemory):
     """Response containing working memory"""
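
A hedged sketch of the new PUT contract from a caller's perspective: `session_id` lives only in the URL path, and the `context_percentage_*` fields are populated only when model info is passed as a query parameter. The base URL, session name, and payload values are illustrative:

```python
import asyncio

import httpx


async def main() -> None:
    payload = {
        "messages": [
            {"role": "user", "content": "Hello!"},
            {"role": "assistant", "content": "Hi there!"},
        ],
        "namespace": "chat-app",
        # Note: no "session_id" field; UpdateWorkingMemory takes it from the path.
    }
    async with httpx.AsyncClient(base_url="http://localhost:8000") as client:
        resp = await client.put(
            "/v1/working-memory/demo-session",
            params={"model_name": "gpt-4o-mini"},
            json=payload,
        )
        body = resp.json()
        # Null unless model_name or context_window_max was provided:
        print(body.get("context_percentage_until_summarization"))


asyncio.run(main())
```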
diff --git a/agent_memory_server/utils/redis.py b/agent_memory_server/utils/redis.py
index c660500..07aa9da 100644
--- a/agent_memory_server/utils/redis.py
+++ b/agent_memory_server/utils/redis.py
@@ -29,7 +29,7 @@ async def get_redis_conn(url: str = settings.redis_url, **kwargs) -> Redis:
     global _redis_pool

     # Always use the existing _redis_pool if it's not None, regardless of the URL parameter
-    # This ensures that the patched _redis_pool from the test fixture is used
+    # This ensures connection reuse and prevents multiple Redis connections
     if _redis_pool is None:
         _redis_pool = Redis.from_url(url, **kwargs)
     return _redis_pool
diff --git a/agent_memory_server/utils/redis_query.py b/agent_memory_server/utils/redis_query.py
index 3a4e4c3..4ddfac3 100644
--- a/agent_memory_server/utils/redis_query.py
+++ b/agent_memory_server/utils/redis_query.py
@@ -89,6 +89,6 @@ def paginate(self, offset: int, limit: int) -> RecencyAggregationQuery:
         self.limit(offset, limit)
         return self

-    # Compatibility helper for tests that inspect the built query
     def build_args(self) -> list:
+        """Build the query arguments for Redis search."""
         return super().build_args()
diff --git a/agent_memory_server/vectorstore_adapter.py b/agent_memory_server/vectorstore_adapter.py
index 815dc97..2636ce6 100644
--- a/agent_memory_server/vectorstore_adapter.py
+++ b/agent_memory_server/vectorstore_adapter.py
@@ -580,8 +580,8 @@ async def search_memories(
         event_date: EventDate | None = None,
         memory_hash: MemoryHash | None = None,
         id: Id | None = None,
-        distance_threshold: float | None = None,
         discrete_memory_extracted: DiscreteMemoryExtracted | None = None,
+        distance_threshold: float | None = None,
         server_side_recency: bool | None = None,
         recency_params: dict | None = None,
         limit: int = 10,
@@ -1186,10 +1186,9 @@ async def count_memories(
         else:
             redis_filter = reduce(lambda x, y: x & y, filters)

-        # Use the same search method as search_memories but for counting
-        # We use the same query that would match the indexed content
+        # Use empty query to match all content with the vector search interface
         search_results = await self.vectorstore.asimilarity_search(
-            query="duplicate",  # Use a query that should match test content
+            query="",  # Empty query to match all content
            filter=redis_filter,
             k=10000,  # Large number to get all results
         )
diff --git a/agent_memory_server/working_memory.py b/agent_memory_server/working_memory.py
index e210438..c75b637 100644
--- a/agent_memory_server/working_memory.py
+++ b/agent_memory_server/working_memory.py
@@ -74,18 +74,25 @@ async def get_working_memory(
     user_id: str | None = None,
     namespace: str | None = None,
     redis_client: Redis | None = None,
+    recent_messages_limit: int | None = None,
 ) -> WorkingMemory | None:
     """
     Get working memory for a session.

+    If no working memory exists but index_all_messages_in_long_term_memory is enabled,
+    attempts to reconstruct working memory from messages stored in long-term memory.
+
     Args:
         session_id: The session ID
         namespace: Optional namespace for the session
         redis_client: Optional Redis client
+        recent_messages_limit: Optional limit on number of recent messages to return

     Returns:
         WorkingMemory object or None if not found
     """
+    from agent_memory_server.config import settings
+
     if not redis_client:
         redis_client = await get_redis_conn()

@@ -101,6 +108,21 @@ async def get_working_memory(
         logger.debug(
             f"No working memory found for parameters: {session_id}, {user_id}, {namespace}"
         )
+
+        # Try to reconstruct from long-term memory if enabled
+        if settings.index_all_messages_in_long_term_memory:
+            reconstructed = await _reconstruct_working_memory_from_long_term(
+                session_id=session_id,
+                user_id=user_id,
+                namespace=namespace,
+                recent_messages_limit=recent_messages_limit,
+            )
+            if reconstructed:
+                logger.info(
+                    f"Reconstructed working memory for session {session_id} from long-term storage"
+                )
+                return reconstructed
+
         return None

     # Parse the JSON data
@@ -118,6 +140,13 @@ async def get_working_memory(
         message = MemoryMessage(**message_data)
         messages.append(message)

+    # Apply recent messages limit if specified (in-memory slice)
+    if recent_messages_limit is not None and recent_messages_limit > 0:
+        # Sort messages by created_at timestamp to ensure proper chronological order
+        messages.sort(key=lambda m: m.created_at)
+        # Get the most recent N messages
+        messages = messages[-recent_messages_limit:]
+
     # Handle memory strategy configuration
     strategy_data = working_memory_data.get("long_term_memory_strategy")
     if strategy_data:
@@ -258,3 +287,114 @@ async def delete_working_memory(
     except Exception as e:
         logger.error(f"Error deleting working memory for session {session_id}: {e}")
         raise
+
+
+async def _reconstruct_working_memory_from_long_term(
+    session_id: str,
+    user_id: str | None = None,
+    namespace: str | None = None,
+    recent_messages_limit: int | None = None,
+) -> WorkingMemory | None:
+    """
+    Reconstruct working memory from messages stored in long-term memory.
+
+    This function searches for messages in long-term memory that belong to the
+    specified session and reconstructs a WorkingMemory object from them.
+
+    Args:
+        session_id: The session ID to reconstruct
+        user_id: Optional user ID filter
+        namespace: Optional namespace filter
+        recent_messages_limit: Optional limit on number of recent messages to return
+
+    Returns:
+        Reconstructed WorkingMemory object or None if no messages found
+    """
+    from agent_memory_server.filters import MemoryType, Namespace, SessionId, UserId
+    from agent_memory_server.long_term_memory import search_long_term_memories
+
+    try:
+        # Search for message-type memories for this session
+        session_filter = SessionId(eq=session_id)
+        user_filter = UserId(eq=user_id) if user_id else None
+        namespace_filter = Namespace(eq=namespace) if namespace else None
+        memory_type_filter = MemoryType(eq="message")
+
+        # Search for messages with appropriate limit
+        # We use empty text since we're filtering by session_id and memory_type
+        search_limit = recent_messages_limit if recent_messages_limit else 1000
+        results = await search_long_term_memories(
+            text="",  # Empty query since we're filtering by metadata
+            session_id=session_filter,
+            user_id=user_filter,
+            namespace=namespace_filter,
+            memory_type=memory_type_filter,
+            limit=search_limit,
+            offset=0,
+        )
+
+        if not results.memories:
+            logger.debug(
+                f"No message memories found for session {session_id} in long-term storage"
+            )
+            return None
+
+        # Convert memory records back to messages
+        messages = []
+        for memory in results.memories:
+            # Parse the message text which should be in format "role: content"
+            text = memory.text
+            if ": " in text:
+                role, content = text.split(": ", 1)
+                message = MemoryMessage(
+                    id=memory.id,
+                    role=role.lower(),
+                    content=content,
+                    created_at=memory.created_at,  # Use the original creation time
+                    persisted_at=memory.persisted_at,  # Mark as already persisted
+                )
+                messages.append(message)
+            else:
+                logger.warning(
+                    f"Skipping malformed message memory: {memory.id} - {text}"
+                )
+
+        if not messages:
+            logger.debug(f"No valid messages found for session {session_id}")
+            return None
+
+        # Sort messages by creation time to maintain conversation order (most recent first for API response)
+        messages.sort(key=lambda m: m.created_at, reverse=True)
+
+        # If we have a limit, take only the most recent N messages
+        if recent_messages_limit and len(messages) > recent_messages_limit:
+            messages = messages[:recent_messages_limit]
+
+        # Reverse back to chronological order for working memory (oldest first)
+        messages.reverse()
+
+        # Create reconstructed working memory
+        now = datetime.now(UTC)
+        reconstructed = WorkingMemory(
+            session_id=session_id,
+            namespace=namespace,
+            user_id=user_id,
+            messages=messages,
+            memories=[],  # No structured memories in reconstruction
+            context="",  # No context in reconstruction
+            data={},  # No session data in reconstruction
+            created_at=messages[0].persisted_at or now if messages else now,
+            updated_at=now,
+            last_accessed=now,
+        )
+
+        logger.info(
+            f"Reconstructed working memory for session {session_id} with {len(messages)} messages"
+        )
+        return reconstructed
+
+    except Exception as e:
+        logger.error(
+            f"Error reconstructing working memory for session {session_id}: {e}"
+        )
+        return None
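
A hedged sketch of fetching a truncated view of working memory over REST; the endpoint shape follows the `get_working_memory` route above, with the base URL and session name being illustrative:

```python
import asyncio

import httpx


async def main() -> None:
    async with httpx.AsyncClient(base_url="http://localhost:8000") as client:
        resp = await client.get(
            "/v1/working-memory/demo-session",
            params={"namespace": "chat-app", "recent_messages_limit": 10},
        )
        memory = resp.json()
        # Only the 10 most recent messages, oldest-first. If the session key has
        # expired but index_all_messages_in_long_term_memory is enabled, this may
        # be a reconstruction from long-term storage.
        print(len(memory["messages"]))


asyncio.run(main())
```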
Editing](memory-editing.md)** - Update, correct, and enrich existing memories
+- **[Memory Editing](memory-lifecycle.md#memory-editing)** - Update, correct, and enrich existing memories
 - **[Recency Boost](recency-boost.md)** - Time-aware memory ranking and intelligent scoring
 - **[Vector Store Backends](vector-store-backends.md)** - Alternative storage backends (Pinecone, Chroma, etc.)

@@ -68,7 +68,7 @@ For contributors and advanced users:

 ### By Interface Preference

 **REST API users** → [API Documentation](api.md) → [Authentication](authentication.md)
-**MCP/Claude users** → [MCP Server](mcp.md) → [Memory Editing](memory-editing.md)
+**MCP/Claude users** → [MCP Server](mcp.md) → [Memory Editing](memory-lifecycle.md#memory-editing)
 **CLI management** → [CLI Reference](cli.md) → [Configuration](configuration.md)

 ## Feature Cross-Reference

@@ -76,7 +76,7 @@

 | Feature | REST API | MCP Server | CLI | Documentation |
 |---------|----------|------------|-----|---------------|
 | **Memory Search** | ✅ `/v1/long-term-memory/search` | ✅ `search_long_term_memory` | ❌ | [REST API](api.md), [MCP](mcp.md) |
-| **Memory Editing** | ✅ `PATCH /v1/long-term-memory/{id}` | ✅ `edit_long_term_memory` | ❌ | [Memory Editing](memory-editing.md) |
+| **Memory Editing** | ✅ `PATCH /v1/long-term-memory/{id}` | ✅ `edit_long_term_memory` | ❌ | [Memory Editing](memory-lifecycle.md#memory-editing) |
 | **Query Optimization** | ✅ `optimize_query` param | ✅ `optimize_query` param | ❌ | [Query Optimization](query-optimization.md) |
 | **Recency Boost** | ✅ Default enabled | ✅ Available | ❌ | [Recency Boost](recency-boost.md) |
 | **Authentication** | ✅ JWT/Token | ✅ Inherited | ✅ Token management | [Authentication](authentication.md) |
diff --git a/docs/development.md b/docs/development.md
index b403b35..848ae40 100644
--- a/docs/development.md
+++ b/docs/development.md
@@ -24,11 +24,27 @@ uv run pytest

 ## Releasing Agent Memory Server

-Merging a PR to the main branch will trigger building and pushing a new image
-to Docker Hub based on the commits in main (including the version number).
-Currently, that image pushes to a test project:
+Releases are triggered manually via GitHub Actions workflow dispatch.

-https://hub.docker.com/r/redislabs/agent-memory-server
+### Steps to Release
+
+1. Update the version in `agent_memory_server/__init__.py`
+2. Commit and push the version change to main
+3. Go to GitHub Actions → "Release Docker Images" workflow
+4. Click "Run workflow" to open the dispatch form
+5. Choose options:
+   - **Version**: Leave empty to use the version from `__init__.py`, or specify a custom version
+   - **Push latest tag**: Check to also tag as `latest` (recommended for stable releases)
+6. Click "Run workflow"
+
+This will:
+- Build Docker images for linux/amd64 and linux/arm64
+- Push to Docker Hub: `redislabs/agent-memory-server:<version>`
+- Push to GitHub Container Registry: `ghcr.io/redis/agent-memory-server:<version>`
+- Optionally tag as `latest` on both registries
+- Create a GitHub release with the version tag
+
+Docker Hub: https://hub.docker.com/r/redislabs/agent-memory-server

 ## Releasing Agent Memory Client

diff --git a/docs/index.md b/docs/index.md
index 0af6e6a..2a2c51e 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -90,7 +90,7 @@ print(f"Found: {results.memories[0].text}")

 !!! info "Working Memory (Session-scoped)"
     - Current conversation state and context
     - Automatic summarization when conversations get long
-    - TTL-based expiration (1 hour default)
+    - Durable by default, optional TTL expiration

 !!!
success "Long-Term Memory (Persistent)" - User preferences, facts, and important information @@ -168,7 +168,7 @@ Jump into the Developer Guide for memory patterns and integration strategies. Update and correct existing memories through REST API and MCP tools - [Learn More →](memory-editing.md) + [Learn More →](memory-lifecycle.md#memory-editing) - 🕐 **Recency Boost** @@ -184,7 +184,7 @@ Jump into the Developer Guide for memory patterns and integration strategies. Configurable memory extraction: discrete facts, summaries, preferences, or custom prompts - [Learn More →](memory-strategies.md) + [Learn More →](memory-extraction-strategies.md) diff --git a/docs/long-term-memory.md b/docs/long-term-memory.md new file mode 100644 index 0000000..9fae535 --- /dev/null +++ b/docs/long-term-memory.md @@ -0,0 +1,289 @@ +# Long-term Memory + +Long-term memory is **persistent**, **cross-session** storage designed for knowledge that should be retained and searchable across all interactions. It's the "knowledge base" of facts, preferences, and experiences the agent learns at runtime. + +## Overview + +Long-term memory provides persistent storage that survives server restarts and session expiration. It's optimized for semantic search, deduplication, and rich metadata to enable intelligent retrieval of relevant information. + +| Feature | Details | +|---------|---------| +| **Scope** | Cross-session, persistent | +| **Lifespan** | Permanent until manually deleted | +| **Storage** | Redis with vector indexing | +| **Search** | Semantic vector search | +| **Capacity** | Unlimited (with compaction) | +| **Use Case** | Knowledge base, user preferences | +| **Indexing** | Vector embeddings + metadata | +| **Deduplication** | Hash-based and semantic | + +## Characteristics + +- **Cross-Session**: Accessible from any session +- **Persistent**: Survives server restarts and session expiration +- **Vector Indexed**: Semantic search with OpenAI embeddings +- **Deduplication**: Automatic hash-based and semantic deduplication +- **Rich Metadata**: Topics, entities, timestamps, memory types +- **Compaction**: Automatic cleanup and merging of duplicates + +## Memory Types + +Long-term memory supports three types of memories: + +### 1. Semantic Memory +Facts, preferences, general knowledge + +```json +{ + "text": "User prefers dark mode interfaces", + "memory_type": "semantic", + "topics": ["preferences", "ui"], + "entities": ["dark mode"] +} +``` + +### 2. Episodic Memory +Events with temporal context + +```json +{ + "text": "User visited Paris in March 2024", + "memory_type": "episodic", + "event_date": "2024-03-15T10:00:00Z", + "topics": ["travel"], + "entities": ["Paris"] +} +``` + +### 3. Message Memory +Conversation records (auto-generated) + +```json +{ + "text": "user: What's the weather like?", + "memory_type": "message", + "session_id": "chat_123" +} +``` + +## When to Use Long-Term Memory + +### 1. User Preferences and Profile + +```python +# Store lasting user preferences +memories = [ + MemoryRecord( + text="User prefers metric units for temperature", + id="pref_metric_temp", + memory_type="semantic", + topics=["preferences", "units"], + user_id="user_123" + ) +] +``` + +### 2. Important Facts and Knowledge + +```python +# Store domain knowledge +memories = [ + MemoryRecord( + text="Customer's subscription expires on 2024-06-15", + id="sub_expiry_customer_456", + memory_type="episodic", + event_date=datetime(2024, 6, 15), + entities=["customer_456", "subscription"], + user_id="user_123" + ) +] +``` + +### 3. 
Cross-Session Context + +```python +# Store context that spans conversations +memories = [ + MemoryRecord( + text="User is working on a Python machine learning project", + id="context_ml_project", + memory_type="semantic", + topics=["programming", "machine-learning", "python"], + namespace="work_context" + ) +] +``` + +## API Endpoints + +```http +# Create long-term memories +POST /v1/long-term-memory/ + +# Search long-term memories +POST /v1/long-term-memory/search +``` + +## Search Capabilities + +Long-term memory provides powerful search features: + +### Semantic Vector Search +```json +{ + "text": "python programming help", + "limit": 10, + "distance_threshold": 0.8 +} +``` + +### Advanced Filtering +```json +{ + "text": "user preferences", + "filters": { + "user_id": {"eq": "user_123"}, + "memory_type": {"eq": "semantic"}, + "topics": {"any": ["preferences", "settings"]}, + "created_at": {"gte": "2024-01-01T00:00:00Z"} + } +} +``` + +## Deduplication and Compaction + +Long-term memory automatically manages duplicates through: + +### Hash-Based Deduplication +- Identical text content is automatically deduplicated +- Preserves the most recent version with complete metadata + +### Semantic Deduplication +- Uses vector similarity to identify semantically similar memories +- LLM-powered merging of related memories +- Configurable similarity thresholds + +### Automatic Compaction +```python +# Server automatically: +# - Identifies hash-based duplicates +# - Finds semantically similar memories +# - Merges related memories using LLM +# - Removes obsolete duplicates +``` + +## Memory Prompt Integration + +The memory system integrates with AI prompts through the `/v1/memory/prompt` endpoint: + +```python +# Get memory-enriched prompt +response = await memory_prompt({ + "query": "Help me plan dinner", + "session": { + "session_id": "current_chat", + "model_name": "gpt-4o", + "context_window_max": 4000 + }, + "long_term_search": { + "text": "food preferences dietary restrictions", + "filters": {"user_id": {"eq": "user_123"}}, + "limit": 5 + } +}) + +# Returns ready-to-use messages with: +# - Conversation context from working memory +# - Relevant memories from long-term storage +# - User's query as final message +``` + +## Creating Long-Term Memories + +There are two main ways to create long-term memories: + +### 1. Automatic Promotion from Working Memory + +The most common approach is to let the system automatically promote memories from working memory to long-term storage. This handles extraction strategies, background processing, and batch optimization. + +!!! info "Working Memory Integration" + For automatic memory promotion from conversations, see the [Working Memory documentation](working-memory.md). This covers extraction strategies, background processing, and how to configure the memory server to automatically create long-term memories from conversation content. + +### 2. Manual Creation via API + +For immediate storage of important facts, you can create long-term memories directly using the API or LLM tools. 
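+
+The subsections below cover the Python client and LLM tool paths. If you're calling the REST endpoint directly instead, the request is a plain JSON POST. A minimal `httpx` sketch (the request body is assumed to wrap records in a `memories` array, mirroring the client examples below; the base URL and token are placeholders for your deployment):
+
+```python
+import asyncio
+
+import httpx
+
+
+async def create_memories_via_rest() -> None:
+    # NOTE: payload shape assumed from the client examples in this guide;
+    # verify the exact schema against the API reference.
+    async with httpx.AsyncClient(base_url="http://localhost:8000") as http:
+        response = await http.post(
+            "/v1/long-term-memory/",
+            headers={"Authorization": "Bearer your_token"},
+            json={
+                "memories": [
+                    {
+                        "text": "User prefers dark mode interfaces",
+                        "memory_type": "semantic",
+                        "topics": ["preferences", "ui"],
+                        "user_id": "user_123",
+                    }
+                ]
+            },
+        )
+        response.raise_for_status()  # raises on 4xx/5xx
+
+
+asyncio.run(create_memories_via_rest())
+```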
+ +#### Direct API Calls + +```python +# Create memories directly via Python client +await client.create_long_term_memories([ + { + "text": "User prefers dark mode interfaces", + "memory_type": "semantic", + "topics": ["preferences", "ui"], + "entities": ["dark mode"], + "user_id": "user_123" + }, + { + "text": "User completed Python certification on January 15, 2024", + "memory_type": "episodic", + "event_date": "2024-01-15T10:00:00Z", + "topics": ["education", "certification"], + "entities": ["Python certification"], + "user_id": "user_123" + } +]) +``` + +#### LLM Tool Usage (Eager Creation) + +Your LLM can use the `create_long_term_memory` tool for immediate storage: + +```python +# LLM tool call for eager memory creation +tools = [client.create_long_term_memory_tool_schema()] + +# LLM can call: +# create_long_term_memory( +# memories=[ +# { +# "text": "User works as a software engineer at TechCorp", +# "memory_type": "semantic", +# "topics": ["career", "work"], +# "entities": ["software engineer", "TechCorp"] +# } +# ] +# ) +``` + +This approach is ideal when: +- You need memories to be immediately searchable +- You're processing batch data or imports +- You want to bypass working memory entirely +- You have structured data that doesn't need extraction + +## Configuration + +Some long-term memory behavior can be configured through environment variables: + +```bash +# Long-term memory settings +LONG_TERM_MEMORY=true # Enable long-term memory features +ENABLE_DISCRETE_MEMORY_EXTRACTION=true # Extract memories from messages +INDEX_ALL_MESSAGES_IN_LONG_TERM_MEMORY=false # Index messages in long-term memory (default: false) +GENERATION_MODEL=gpt-4o-mini # Model for summarization/extraction + +# Vector search settings +EMBEDDING_MODEL=text-embedding-3-small # OpenAI embedding model +DISTANCE_THRESHOLD=0.8 # Similarity threshold for search +``` + +For complete configuration options, see the [Configuration Guide](configuration.md). + +## Related Documentation + +- [Working Memory](working-memory.md) - Session-scoped memory storage for conversations +- [Memory Integration Patterns](memory-integration-patterns.md) - How to integrate memory with your applications +- [Memory Extraction Strategies](memory-extraction-strategies.md) - Different approaches to memory extraction and storage +- [Vector Store Backends](vector-store-backends.md) - Configuring different vector storage backends diff --git a/docs/memory-editing.md b/docs/memory-editing.md deleted file mode 100644 index 804c064..0000000 --- a/docs/memory-editing.md +++ /dev/null @@ -1,551 +0,0 @@ -# Memory Editing - -The Redis Agent Memory Server provides comprehensive memory editing capabilities, allowing you to update, correct, and refine stored memories through both REST API endpoints and MCP tools. This feature enables AI agents and applications to maintain accurate, up-to-date memory records over time. - -## Overview - -Memory editing allows you to modify existing long-term memories without losing their search indexing or metadata. 
This is essential for: - -- **Correcting mistakes**: Fix inaccurate information in stored memories -- **Updating information**: Reflect changes in user preferences or circumstances -- **Adding details**: Enrich memories with additional context or information -- **Maintaining accuracy**: Keep memory store current and reliable - -**Key Features:** -- **Partial updates**: Modify only the fields you want to change -- **Automatic re-indexing**: Updated memories are re-indexed for search -- **Vector consistency**: Embeddings are regenerated when text changes -- **Metadata preservation**: IDs, timestamps, and other metadata remain stable -- **Atomic operations**: Updates succeed or fail completely - -## Memory Editing Workflow - -### 1. Find the Memory - -First, locate the memory you want to edit using search: - -```python -# Search for memories to edit -results = await client.search_long_term_memory( - text="user food preferences", - limit=5 -) - -# Find the specific memory -memory_to_edit = results.memories[0] -memory_id = memory_to_edit.id -``` - -### 2. Prepare Updates - -Specify only the fields you want to change: - -```python -# Update only the text content -updates = { - "text": "User prefers Mediterranean cuisine and is vegetarian" -} - -# Or update multiple fields -updates = { - "text": "User was promoted to Senior Engineer on January 15, 2024", - "memory_type": "episodic", - "event_date": "2024-01-15T14:30:00Z", - "topics": ["career", "promotion", "engineering"], - "entities": ["Senior Engineer", "promotion"] -} -``` - -### 3. Apply the Update - -Use the appropriate interface to apply your changes: - -```python -# Update the memory -updated_memory = await client.edit_long_term_memory( - memory_id=memory_id, - updates=updates -) -``` - -## REST API Interface - -### Endpoint - -**PATCH /v1/long-term-memory/{memory_id}** - -Updates specific fields of an existing memory record. 
- -### Request Format - -```http -PATCH /v1/long-term-memory/01HXE2B1234567890ABCDEF -Content-Type: application/json -Authorization: Bearer your_token_here - -{ - "text": "Updated memory text", - "topics": ["new", "topics"], - "entities": ["updated", "entities"], - "memory_type": "semantic", - "event_date": "2024-01-15T14:30:00Z", - "namespace": "updated_namespace", - "user_id": "updated_user" -} -``` - -### Response Format - -```json -{ - "id": "01HXE2B1234567890ABCDEF", - "text": "Updated memory text", - "memory_type": "semantic", - "topics": ["new", "topics"], - "entities": ["updated", "entities"], - "created_at": "2024-01-10T12:00:00Z", - "persisted_at": "2024-01-10T12:00:00Z", - "updated_at": "2024-01-16T10:30:00Z", - "last_accessed": "2024-01-16T10:30:00Z", - "user_id": "user_123", - "session_id": "session_456", - "namespace": "updated_namespace", - "memory_hash": "new_hash_after_update" -} -``` - -### cURL Examples - -**Update memory text:** -```bash -curl -X PATCH "http://localhost:8000/v1/long-term-memory/01HXE2B1234567890ABCDEF" \ - -H "Content-Type: application/json" \ - -H "Authorization: Bearer your_token" \ - -d '{ - "text": "User prefers dark mode interfaces and uses vim for coding" - }' -``` - -**Update multiple fields:** -```bash -curl -X PATCH "http://localhost:8000/v1/long-term-memory/01HXE2B1234567890ABCDEF" \ - -H "Content-Type: application/json" \ - -H "Authorization: Bearer your_token" \ - -d '{ - "text": "User completed Python certification on January 15, 2024", - "memory_type": "episodic", - "event_date": "2024-01-15T14:30:00Z", - "topics": ["education", "certification", "python"], - "entities": ["Python", "certification"] - }' -``` - -## MCP Tool Interface - -### Tool: edit_long_term_memory - -The MCP server provides an `edit_long_term_memory` tool for AI agents to modify memories through natural conversation. 
- -### Tool Schema - -```python -{ - "name": "edit_long_term_memory", - "description": "Update an existing long-term memory with new or corrected information", - "parameters": { - "type": "object", - "properties": { - "memory_id": { - "type": "string", - "description": "The ID of the memory to edit (get this from search results)" - }, - "text": { - "type": "string", - "description": "Updated memory text content" - }, - "topics": { - "type": "array", - "items": {"type": "string"}, - "description": "Updated list of topics" - }, - "entities": { - "type": "array", - "items": {"type": "string"}, - "description": "Updated list of entities" - }, - "memory_type": { - "type": "string", - "enum": ["semantic", "episodic", "message"], - "description": "Type of memory" - }, - "event_date": { - "type": "string", - "description": "Event date for episodic memories (ISO 8601 format)" - }, - "namespace": { - "type": "string", - "description": "Memory namespace" - }, - "user_id": { - "type": "string", - "description": "User ID associated with the memory" - } - }, - "required": ["memory_id"] - } -} -``` - -### MCP Usage Examples - -**Simple text update:** -```python -await client.call_tool("edit_long_term_memory", { - "memory_id": "01HXE2B1234567890ABCDEF", - "text": "User prefers tea over coffee (updated preference)" -}) -``` - -**Update memory type and event date:** -```python -await client.call_tool("edit_long_term_memory", { - "memory_id": "01HXE2B1234567890ABCDEF", - "memory_type": "episodic", - "event_date": "2024-01-15T14:30:00Z" -}) -``` - -**Comprehensive update:** -```python -await client.call_tool("edit_long_term_memory", { - "memory_id": "01HXE2B1234567890ABCDEF", - "text": "User was promoted to Principal Engineer on January 15, 2024", - "memory_type": "episodic", - "event_date": "2024-01-15T14:30:00Z", - "topics": ["career", "promotion", "engineering", "principal"], - "entities": ["Principal Engineer", "promotion", "January 15, 2024"] -}) -``` - -## Python Client Interface - -### Method: edit_long_term_memory - -```python -async def edit_long_term_memory( - self, - memory_id: str, - updates: dict[str, Any] -) -> MemoryRecord: - """ - Edit an existing long-term memory record. 
- - Args: - memory_id: The ID of the memory to edit - updates: Dictionary of fields to update - - Returns: - The updated memory record - - Raises: - HTTPException: If memory not found or update fails - """ -``` - -### Client Usage Examples - -```python -from agent_memory_client import MemoryAPIClient - -client = MemoryAPIClient(base_url="http://localhost:8000") - -# Simple text correction -updated_memory = await client.edit_long_term_memory( - memory_id="01HXE2B1234567890ABCDEF", - updates={"text": "User actually prefers coffee, not tea"} -) - -# Add more context -updated_memory = await client.edit_long_term_memory( - memory_id="01HXE2B1234567890ABCDEF", - updates={ - "text": "User prefers Italian cuisine, especially pasta and pizza", - "topics": ["food", "preferences", "italian", "cuisine"], - "entities": ["Italian cuisine", "pasta", "pizza"] - } -) - -# Update namespace and user -updated_memory = await client.edit_long_term_memory( - memory_id="01HXE2B1234567890ABCDEF", - updates={ - "namespace": "work_preferences", - "user_id": "user_456" - } -) -``` - -## Editable Fields - -### Core Content Fields - -- **text**: The main memory content (triggers embedding regeneration) -- **topics**: List of topic tags for categorization -- **entities**: List of named entities mentioned in the memory -- **memory_type**: Type classification (semantic, episodic, message) - -### Temporal Fields - -- **event_date**: Specific date/time for episodic memories (ISO 8601 format) - -### Organization Fields - -- **namespace**: Memory namespace for organization -- **user_id**: User associated with the memory - -### Read-Only Fields - -These fields cannot be edited and are managed automatically: - -- **id**: Unique memory identifier -- **created_at**: Original creation timestamp -- **persisted_at**: When memory was first saved to long-term storage -- **updated_at**: Last modification timestamp (updated automatically) -- **last_accessed**: Last time memory was retrieved (managed by recency system) -- **memory_hash**: Content hash (regenerated when text changes) - -## Update Behavior - -### Automatic Updates - -When you edit a memory, the system automatically: - -1. **Updates timestamps**: Sets `updated_at` to current time -2. **Regenerates embeddings**: If text content changes, new embeddings are created -3. **Recalculates hash**: Content hash is updated for deduplication -4. **Re-indexes memory**: Search index is updated with new content -5. **Updates access time**: Sets `last_accessed` to current time - -### Partial Updates - -Only specify fields you want to change - other fields remain unchanged: - -```python -# Only update topics - text, entities, etc. stay the same -updates = {"topics": ["programming", "python", "web-development"]} - -# Only update text - topics, entities, etc. 
stay the same -updates = {"text": "Updated description of the user's preferences"} -``` - -### Vector Re-indexing - -When memory text changes, the system automatically: -- Generates new embeddings using the configured embedding model -- Updates the vector index for accurate semantic search -- Maintains search performance and accuracy - -## Error Handling - -### Common Errors - -**Memory Not Found (404):** -```json -{ - "detail": "Memory not found: 01HXE2B1234567890ABCDEF", - "status_code": 404 -} -``` - -**Invalid Memory ID (400):** -```json -{ - "detail": "Invalid memory ID format", - "status_code": 400 -} -``` - -**Validation Error (422):** -```json -{ - "detail": [ - { - "loc": ["body", "event_date"], - "msg": "invalid datetime format", - "type": "value_error" - } - ], - "status_code": 422 -} -``` - -### Error Handling in Code - -```python -try: - updated_memory = await client.edit_long_term_memory( - memory_id="01HXE2B1234567890ABCDEF", - updates={"text": "Updated text"} - ) -except HTTPException as e: - if e.status_code == 404: - print("Memory not found") - elif e.status_code == 422: - print("Invalid update data") - else: - print(f"Update failed: {e.detail}") -``` - -## Use Cases and Examples - -### Correcting User Information - -**Scenario**: User corrects their job title - -```python -# 1. Search for the memory -results = await client.search_long_term_memory( - text="user job title engineer", - limit=1 -) - -# 2. Update with correction -if results.memories: - await client.edit_long_term_memory( - memory_id=results.memories[0].id, - updates={ - "text": "User works as a Senior Software Engineer at TechCorp", - "entities": ["Senior Software Engineer", "TechCorp"] - } - ) -``` - -### Adding Context to Sparse Memories - -**Scenario**: Enrich a basic memory with additional details - -```python -# Original: "User likes pizza" -# Enhanced with context: -await client.edit_long_term_memory( - memory_id="01HXE2B1234567890ABCDEF", - updates={ - "text": "User likes pizza, especially thin crust with pepperoni and mushrooms from Mario's Pizzeria", - "topics": ["food", "preferences", "pizza", "italian"], - "entities": ["pizza", "thin crust", "pepperoni", "mushrooms", "Mario's Pizzeria"] - } -) -``` - -### Converting Memory Types - -**Scenario**: Convert a general memory to an episodic memory with event date - -```python -# Change from semantic to episodic with specific date -await client.edit_long_term_memory( - memory_id="01HXE2B1234567890ABCDEF", - updates={ - "text": "User got promoted to Team Lead on March 15, 2024", - "memory_type": "episodic", - "event_date": "2024-03-15T09:00:00Z", - "topics": ["career", "promotion", "team-lead"], - "entities": ["Team Lead", "promotion", "March 15, 2024"] - } -) -``` - -### Batch Memory Updates - -**Scenario**: Update multiple related memories - -```python -# Find all memories about a specific topic -results = await client.search_long_term_memory( - text="old project name", - limit=10 -) - -# Update each memory with the new project name -for memory in results.memories: - updated_text = memory.text.replace("old project", "new project name") - await client.edit_long_term_memory( - memory_id=memory.id, - updates={ - "text": updated_text, - "entities": [entity.replace("old project", "new project name") - for entity in memory.entities or []] - } - ) -``` - -## Best Practices - -### Memory Identification - -1. **Use search first**: Always search to find the correct memory ID -2. **Verify before editing**: Check memory content matches your expectations -3. 
**Handle duplicates**: Consider if multiple memories need the same update - -### Update Strategy - -1. **Minimal changes**: Only update fields that actually need to change -2. **Preserve context**: Don't remove important information when updating -3. **Consistent formatting**: Maintain consistent data formats across memories -4. **Validate inputs**: Check data formats before making updates - -### Error Prevention - -1. **Check memory exists**: Handle 404 errors gracefully -2. **Validate data**: Ensure update data matches expected formats -3. **Test updates**: Verify changes work as expected in development -4. **Monitor performance**: Watch for degradation with frequent updates - -### Performance Considerations - -1. **Batch operations**: Group related updates when possible -2. **Avoid unnecessary updates**: Don't update if content hasn't actually changed -3. **Monitor embedding costs**: Text updates trigger new embedding generation -4. **Consider timing**: Updates during low-traffic periods for better performance - -## Integration with Other Features - -### Memory Search - -Updated memories are immediately searchable with their new content: - -```python -# After updating memory with new content -await client.edit_long_term_memory( - memory_id="01HXE2B1234567890ABCDEF", - updates={"text": "User loves Mediterranean cuisine"} -) - -# Can immediately search for the updated content -results = await client.search_long_term_memory( - text="Mediterranean cuisine", - limit=5 -) -# Updated memory will appear in results -``` - -### Recency Boost - -Memory editing updates the `last_accessed` timestamp, which affects recency scoring: - -```python -# Editing a memory makes it "recently accessed" -# This can boost its ranking in recency-weighted searches -``` - -### Working Memory - -Memories can be updated based on new information from working memory: - -```python -# Extract new information from current conversation -# Update existing memories with corrections or additions -# Maintain consistency between working and long-term memory -``` - -This comprehensive memory editing system ensures that your AI agent's memory remains accurate, current, and useful over time, adapting to new information and corrections as they become available. diff --git a/docs/memory-strategies.md b/docs/memory-extraction-strategies.md similarity index 98% rename from docs/memory-strategies.md rename to docs/memory-extraction-strategies.md index 938bcd5..2b1d0ae 100644 --- a/docs/memory-strategies.md +++ b/docs/memory-extraction-strategies.md @@ -412,7 +412,8 @@ pytest tests/test_prompt_security.py -v ## Related Documentation -- **[Memory Types](memory-types.md)** - Understanding working vs long-term memory +- **[Working Memory](working-memory.md)** - Session-scoped, ephemeral memory storage +- **[Long-term Memory](long-term-memory.md)** - Persistent, cross-session memory storage - **[Security Guide](security-custom-prompts.md)** - Comprehensive security for custom strategies - **[Memory Lifecycle](memory-lifecycle.md)** - How memories are managed over time - **[API Reference](api.md)** - REST API for memory management diff --git a/docs/memory-lifecycle.md b/docs/memory-lifecycle.md index cb412ae..2d3926c 100644 --- a/docs/memory-lifecycle.md +++ b/docs/memory-lifecycle.md @@ -410,3 +410,482 @@ The system provides **automated memory lifecycle management** through server-con 3. 
**Search and identify** memories for manual cleanup Automatic lifecycle management (forgetting, compaction, optimization) operates server-side based on configuration and background task scheduling. This design ensures consistent resource management and optimal server performance. + +## Memory Editing + +The Redis Agent Memory Server provides comprehensive memory editing capabilities, allowing you to update, correct, and refine stored memories through both REST API endpoints and MCP tools. This feature enables AI agents and applications to maintain accurate, up-to-date memory records over time. + +### Overview + +Memory editing allows you to modify existing long-term memories without losing their search indexing or metadata. This is essential for: + +- **Correcting mistakes**: Fix inaccurate information in stored memories +- **Updating information**: Reflect changes in user preferences or circumstances +- **Adding details**: Enrich memories with additional context or information +- **Maintaining accuracy**: Keep memory store current and reliable + +**Key Features:** +- **Partial updates**: Modify only the fields you want to change +- **Automatic re-indexing**: Updated memories are re-indexed for search +- **Vector consistency**: Embeddings are regenerated when text changes +- **Metadata preservation**: IDs, timestamps, and other metadata remain stable +- **Atomic operations**: Updates succeed or fail completely + +### Memory Editing Workflow + +#### 1. Find the Memory + +First, locate the memory you want to edit using search: + +```python +# Search for memories to edit +results = await client.search_long_term_memory( + text="user food preferences", + limit=5 +) + +# Find the specific memory +memory_to_edit = results.memories[0] +memory_id = memory_to_edit.id +``` + +#### 2. Prepare Updates + +Specify only the fields you want to change: + +```python +# Update only the text content +updates = { + "text": "User prefers Mediterranean cuisine and is vegetarian" +} + +# Or update multiple fields +updates = { + "text": "User prefers Mediterranean cuisine and is vegetarian", + "topics": ["food", "preferences", "diet", "mediterranean"], + "entities": ["Mediterranean cuisine", "vegetarian"] +} +``` + +#### 3. Apply the Update + +Use the appropriate interface to apply your changes: + +```python +# Update the memory +updated_memory = await client.edit_long_term_memory( + memory_id=memory_id, + updates=updates +) +``` + +### REST API Interface + +#### Endpoint + +**PATCH /v1/long-term-memory/{memory_id}** + +Updates specific fields of an existing memory record. 
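+
+Fields omitted from the request body keep their current values. If you're not using the Python client, a minimal `httpx` sketch of the same call looks like this (the base URL, token, and example memory ID are placeholders):
+
+```python
+import asyncio
+
+import httpx
+
+
+async def patch_memory(memory_id: str, updates: dict) -> dict:
+    """PATCH only the fields present in `updates`; everything else is unchanged."""
+    async with httpx.AsyncClient(base_url="http://localhost:8000") as http:
+        response = await http.patch(
+            f"/v1/long-term-memory/{memory_id}",
+            headers={"Authorization": "Bearer your_token"},
+            json=updates,
+        )
+        response.raise_for_status()
+        return response.json()
+
+
+updated = asyncio.run(
+    patch_memory("01HXE2B1234567890ABCDEF", {"text": "Updated memory text"})
+)
+```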
+ +#### Request Format + +```http +PATCH /v1/long-term-memory/01HXE2B1234567890ABCDEF +Content-Type: application/json +Authorization: Bearer your_token_here + +{ + "text": "Updated memory text", + "topics": ["new", "topics"], + "entities": ["updated", "entities"], + "memory_type": "semantic", + "event_date": "2024-01-15T14:30:00Z", + "namespace": "updated_namespace", + "user_id": "updated_user" +} +``` + +#### Response Format + +```json +{ + "id": "01HXE2B1234567890ABCDEF", + "text": "Updated memory text", + "memory_type": "semantic", + "topics": ["new", "topics"], + "entities": ["updated", "entities"], + "created_at": "2024-01-10T12:00:00Z", + "persisted_at": "2024-01-10T12:00:00Z", + "updated_at": "2024-01-16T10:30:00Z", + "last_accessed": "2024-01-16T10:30:00Z", + "user_id": "user_123", + "session_id": "session_456", + "namespace": "updated_namespace", + "memory_hash": "new_hash_after_update" +} +``` + +#### cURL Examples + +**Update memory text:** +```bash +curl -X PATCH "http://localhost:8000/v1/long-term-memory/01HXE2B1234567890ABCDEF" \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer your_token" \ + -d '{ + "text": "User prefers dark mode interfaces and uses vim for coding" + }' +``` + +**Update multiple fields:** +```bash +curl -X PATCH "http://localhost:8000/v1/long-term-memory/01HXE2B1234567890ABCDEF" \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer your_token" \ + -d '{ + "text": "User completed Python certification on January 15, 2024", + "memory_type": "episodic", + "event_date": "2024-01-15T14:30:00Z", + "topics": ["education", "certification", "python"], + "entities": ["Python", "certification"] + }' +``` + +### MCP Tool Interface + +#### Tool: edit_long_term_memory + +The MCP server provides an `edit_long_term_memory` tool for AI agents to modify memories through natural conversation. 
+ +#### Tool Schema + +```python +{ + "name": "edit_long_term_memory", + "description": "Update an existing long-term memory with new or corrected information", + "parameters": { + "type": "object", + "properties": { + "memory_id": { + "type": "string", + "description": "The ID of the memory to edit (get this from search results)" + }, + "text": { + "type": "string", + "description": "Updated memory text content" + }, + "topics": { + "type": "array", + "items": {"type": "string"}, + "description": "Updated list of topics" + }, + "entities": { + "type": "array", + "items": {"type": "string"}, + "description": "Updated list of entities" + }, + "memory_type": { + "type": "string", + "enum": ["semantic", "episodic", "message"], + "description": "Type of memory" + }, + "event_date": { + "type": "string", + "description": "Event date for episodic memories (ISO 8601 format)" + }, + "namespace": { + "type": "string", + "description": "Memory namespace" + }, + "user_id": { + "type": "string", + "description": "User ID associated with the memory" + } + }, + "required": ["memory_id"] + } +} +``` + +#### MCP Usage Examples + +**Simple text update:** +```python +await client.call_tool("edit_long_term_memory", { + "memory_id": "01HXE2B1234567890ABCDEF", + "text": "User prefers tea over coffee (updated preference)" +}) +``` + +**Update memory type and event date:** +```python +await client.call_tool("edit_long_term_memory", { + "memory_id": "01HXE2B1234567890ABCDEF", + "memory_type": "episodic", + "event_date": "2024-01-15T14:30:00Z" +}) +``` + +**Comprehensive update:** +```python +await client.call_tool("edit_long_term_memory", { + "memory_id": "01HXE2B1234567890ABCDEF", + "text": "User was promoted to Principal Engineer on January 15, 2024", + "memory_type": "episodic", + "event_date": "2024-01-15T14:30:00Z", + "topics": ["career", "promotion", "engineering", "principal"], + "entities": ["Principal Engineer", "promotion", "January 15, 2024"] +}) +``` + +### Python Client Interface + +#### Method: edit_long_term_memory + +```python +async def edit_long_term_memory( + self, + memory_id: str, + updates: dict[str, Any] +) -> MemoryRecord: + """ + Edit an existing long-term memory record. 
+ + Args: + memory_id: The ID of the memory to edit + updates: Dictionary of fields to update + + Returns: + The updated memory record + + Raises: + HTTPException: If memory not found or update fails + """ +``` + +#### Client Usage Examples + +```python +from agent_memory_client import MemoryAPIClient + +client = MemoryAPIClient(base_url="http://localhost:8000") + +# Simple text correction +updated_memory = await client.edit_long_term_memory( + memory_id="01HXE2B1234567890ABCDEF", + updates={"text": "User actually prefers coffee, not tea"} +) + +# Add more context +updated_memory = await client.edit_long_term_memory( + memory_id="01HXE2B1234567890ABCDEF", + updates={ + "text": "User prefers Italian cuisine, especially pasta and pizza", + "topics": ["food", "preferences", "italian", "cuisine"], + "entities": ["Italian cuisine", "pasta", "pizza"] + } +) + +# Update namespace and user +updated_memory = await client.edit_long_term_memory( + memory_id="01HXE2B1234567890ABCDEF", + updates={ + "namespace": "work_preferences", + "user_id": "user_456" + } +) +``` + +### Editable Fields + +#### Core Content Fields + +- **text**: The main memory content (triggers embedding regeneration) +- **topics**: List of topic tags for categorization +- **entities**: List of named entities mentioned in the memory +- **memory_type**: Type classification (semantic, episodic, message) + +#### Temporal Fields + +- **event_date**: Specific date/time for episodic memories (ISO 8601 format) + +#### Organization Fields + +- **namespace**: Memory namespace for organization +- **user_id**: User associated with the memory + +#### Read-Only Fields + +These fields cannot be edited and are managed automatically: + +- **id**: Unique memory identifier +- **created_at**: Original creation timestamp +- **persisted_at**: When memory was first saved to long-term storage +- **updated_at**: Last modification timestamp (updated automatically) +- **last_accessed**: Last time memory was retrieved (managed by recency system) +- **memory_hash**: Content hash (regenerated when text changes) + +### Update Behavior + +#### Automatic Updates + +When you edit a memory, the system automatically: + +1. **Updates timestamps**: Sets `updated_at` to current time +2. **Regenerates embeddings**: If text content changes, new embeddings are created +3. **Recalculates hash**: Content hash is updated for deduplication +4. **Re-indexes memory**: Search index is updated with new content +5. **Updates access time**: Sets `last_accessed` to current time + +#### Partial Updates + +Only specify fields you want to change - other fields remain unchanged: + +```python +# Only update topics - text, entities, etc. stay the same +updates = {"topics": ["programming", "python", "web-development"]} + +# Only update text - topics, entities, etc. 
stay the same +updates = {"text": "Updated description of the user's preferences"} +``` + +#### Vector Re-indexing + +When memory text changes, the system automatically: +- Generates new embeddings using the configured embedding model +- Updates the vector index for accurate semantic search +- Maintains search performance and accuracy + +### Error Handling + +#### Common Errors + +**Memory Not Found (404):** +```json +{ + "detail": "Memory not found: 01HXE2B1234567890ABCDEF", + "status_code": 404 +} +``` + +**Invalid Memory ID (400):** +```json +{ + "detail": "Invalid memory ID format", + "status_code": 400 +} +``` + +**Validation Error (422):** +```json +{ + "detail": [ + { + "loc": ["body", "event_date"], + "msg": "invalid datetime format", + "type": "value_error" + } + ], + "status_code": 422 +} +``` + +#### Error Handling in Code + +```python +try: + updated_memory = await client.edit_long_term_memory( + memory_id="01HXE2B1234567890ABCDEF", + updates={"text": "Updated text"} + ) +except HTTPException as e: + if e.status_code == 404: + print("Memory not found") + elif e.status_code == 422: + print("Invalid update data") + else: + print(f"Update failed: {e.detail}") +``` + +### Use Cases and Examples + +#### Correcting User Information + +**Scenario**: User corrects their job title + +```python +# 1. Search for the memory +results = await client.search_long_term_memory( + text="user job title engineer", + limit=1 +) + +# 2. Update with correction +if results.memories: + await client.edit_long_term_memory( + memory_id=results.memories[0].id, + updates={ + "text": "User works as a Senior Software Engineer at TechCorp", + "entities": ["Senior Software Engineer", "TechCorp"] + } + ) +``` + +#### Adding Context to Sparse Memories + +**Scenario**: Enrich a basic memory with additional details + +```python +# Original: "User likes pizza" +# Enhanced with context: +await client.edit_long_term_memory( + memory_id="01HXE2B1234567890ABCDEF", + updates={ + "text": "User likes pizza, especially thin crust with pepperoni and mushrooms from Mario's Pizzeria", + "topics": ["food", "preferences", "pizza", "italian"], + "entities": ["pizza", "thin crust", "pepperoni", "mushrooms", "Mario's Pizzeria"] + } +) +``` + +#### Converting Memory Types + +**Scenario**: Convert a general memory to an episodic memory with event date + +```python +# Change from semantic to episodic with specific date +await client.edit_long_term_memory( + memory_id="01HXE2B1234567890ABCDEF", + updates={ + "text": "User got promoted to Team Lead on March 15, 2024", + "memory_type": "episodic", + "event_date": "2024-03-15T09:00:00Z", + "topics": ["career", "promotion", "team-lead"], + "entities": ["Team Lead", "promotion", "March 15, 2024"] + } +) +``` + +### Best Practices for Memory Editing + +#### Memory Identification + +1. **Use search first**: Always search to find the correct memory ID +2. **Verify before editing**: Check memory content matches your expectations +3. **Handle duplicates**: Consider if multiple memories need the same update + +#### Update Strategy + +1. **Minimal changes**: Only update fields that actually need to change +2. **Preserve context**: Don't remove important information when updating +3. **Consistent formatting**: Maintain consistent data formats across memories +4. **Validate inputs**: Check data formats before making updates + +#### Error Prevention + +1. **Check memory exists**: Handle 404 errors gracefully +2. **Validate data**: Ensure update data matches expected formats +3. 
**Test updates**: Verify changes work as expected in development +4. **Monitor performance**: Watch for degradation with frequent updates + +This comprehensive memory editing system ensures that your AI agent's memory remains accurate, current, and useful over time, adapting to new information and corrections as they become available. diff --git a/docs/memory-types.md b/docs/memory-types.md deleted file mode 100644 index 7e1754c..0000000 --- a/docs/memory-types.md +++ /dev/null @@ -1,445 +0,0 @@ -# Memory Types - -The Redis Agent Memory Server provides two distinct types of memory storage, each optimized for different use cases and access patterns: **Working Memory** and **Long-Term Memory**. - -## Overview - -| Feature | Working Memory | Long-Term Memory | -|---------|----------------|------------------| -| **Scope** | Session-scoped | Cross-session, persistent | -| **Lifespan** | TTL-based (1 hour default) | Permanent until manually deleted | -| **Storage** | Redis key-value with JSON | Redis with vector indexing | -| **Search** | Simple text matching | Semantic vector search | -| **Capacity** | Limited by window size | Unlimited (with compaction) | -| **Use Case** | Active conversation state | Knowledge base, user preferences | -| **Indexing** | None | Vector embeddings + metadata | -| **Deduplication** | None | Hash-based and semantic | - -## Working Memory - -Working memory is **session-scoped**, **ephemeral** storage designed for active conversation state and temporary data. It's the "scratch pad" where an AI agent keeps track of the current conversation context. - -### Characteristics - -- **Session Scoped**: Each session has its own isolated working memory -- **TTL-Based**: Automatically expires (default: 1 hour) -- **Window Management**: Automatically summarizes when message count exceeds limits -- **Mixed Content**: Stores both conversation messages and structured memory records -- **No Indexing**: Simple JSON storage in Redis -- **Promotion**: Structured memories can be promoted to long-term storage - -### Data Structure - -Working memory contains: - -- **Messages**: Conversation history (role/content pairs) -- **Memories**: Structured memory records awaiting promotion -- **Context**: Summary of past conversation when truncated -- **Data**: Arbitrary JSON key-value storage -- **Metadata**: User ID, timestamps, TTL settings - -### When to Use Working Memory - -1. **Active Conversation State** - ```python - import ulid - - # Store current conversation messages - working_memory = WorkingMemory( - session_id="chat_123", - messages=[ - MemoryMessage(role="user", content="What's the weather like?", id=ulid.ULID()), - MemoryMessage(role="assistant", content="I'll check that for you...", id=ulid.ULID()) - ] - ) - ``` - -2. **Temporary Structured Data** - ```python - # Store temporary facts during conversation (using data field) - working_memory = WorkingMemory( - session_id="chat_123", - data={ - "temp_trip_info": { - "destination": "Paris", - "travel_month": "next month", - "planning_stage": "initial" - }, - "conversation_context": "travel planning" - } - ) - ``` - -3. **Session-Specific Settings** - ```python - # Store ephemeral configuration - working_memory = WorkingMemory( - session_id="chat_123", - data={ - "user_preferences": {"temperature_unit": "celsius"}, - "conversation_mode": "casual", - "current_task": "trip_planning" - } - ) - ``` - -4. 
**Promoting Memories to Long-Term Storage** - ```python - # Memories in working memory are automatically promoted to long-term storage - working_memory = WorkingMemory( - session_id="chat_123", - memories=[ - MemoryRecord( - text="User is planning a trip to Paris next month", - id="trip_planning_paris", - memory_type="episodic", - topics=["travel", "planning"], - entities=["Paris"] - ) - ] - ) - # This memory will become permanent in long-term storage - ``` - -> **🔑 Key Distinction**: -> - Use `data` field for **temporary** facts that stay only in the session -> - Use `memories` field for **permanent** facts that should be promoted to long-term storage -> - Anything in the `memories` field will automatically become persistent and searchable across all future sessions - -### API Endpoints - -```http -# Get working memory for a session -GET /v1/working-memory/{session_id}?namespace=demo&model_name=gpt-4o - -# Set working memory (replaces existing) -PUT /v1/working-memory/{session_id} - -# Delete working memory -DELETE /v1/working-memory/{session_id}?namespace=demo -``` - -### Automatic Promotion - -When structured memories in working memory are stored, they are automatically promoted to long-term storage in the background: - -1. Memories with `persisted_at=null` are identified -2. Server assigns unique IDs and timestamps -3. Memories are indexed in long-term storage with vector embeddings -4. Working memory is updated with `persisted_at` timestamps - -### Three Ways to Create Long-Term Memories - -Long-term memories are typically created by LLMs (either yours or the memory server's) based on conversations. There are three pathways: - -#### 1. 🤖 **Automatic Extraction from Conversations** -The server automatically extracts memories from conversation messages using an LLM in the background: - -```python -# Server analyzes messages and creates memories automatically -working_memory = WorkingMemory( - session_id="chat_123", - messages=[ - {"role": "user", "content": "I love Italian food, especially carbonara"}, - {"role": "assistant", "content": "Great! I'll remember your preference for Italian cuisine."} - ] - # Server will extract: "User enjoys Italian food, particularly carbonara pasta" -) -``` - -#### 2. ⚡ **LLM-Identified Memories via Working Memory** (Performance Optimization) -Your LLM can pre-identify memories and add them to working memory for batch storage: - -```python -# LLM identifies important facts and adds to memories field -working_memory = WorkingMemory( - session_id="chat_123", - memories=[ - MemoryRecord( - text="User prefers morning meetings and dislikes calls after 4 PM", - memory_type="semantic", - topics=["preferences", "scheduling"], - entities=["morning meetings", "4 PM"] - ) - ] - # Automatically promoted to long-term storage when saving working memory -) -``` - -#### 3. 🎯 **Direct Long-Term Memory Creation** -Create memories directly via API or LLM tool calls: - -```python -# Direct API call or LLM using create_long_term_memory tool -await client.create_long_term_memories([ - { - "text": "User works as a software engineer at TechCorp", - "memory_type": "semantic", - "topics": ["career", "work"], - "entities": ["software engineer", "TechCorp"] - } -]) -``` - -> **💡 LLM-Driven Design**: The system is designed for LLMs to make memory decisions. Your LLM can use memory tools to search existing memories, decide what's important to remember, and choose the most efficient storage method. 
- -## Long-Term Memory - -Long-term memory is **persistent**, **cross-session** storage designed for knowledge that should be retained and searchable across all interactions. It's the "knowledge base" where important facts, preferences, and experiences are stored. - -### Characteristics - -- **Cross-Session**: Accessible from any session -- **Persistent**: Survives server restarts and session expiration -- **Vector Indexed**: Semantic search with OpenAI embeddings -- **Deduplication**: Automatic hash-based and semantic deduplication -- **Rich Metadata**: Topics, entities, timestamps, memory types -- **Compaction**: Automatic cleanup and merging of duplicates - -### Memory Types - -Long-term memory supports three types of memories: - -1. **Semantic**: Facts, preferences, general knowledge - ```json - { - "text": "User prefers dark mode interfaces", - "memory_type": "semantic", - "topics": ["preferences", "ui"], - "entities": ["dark mode"] - } - ``` - -2. **Episodic**: Events with temporal context - ```json - { - "text": "User visited Paris in March 2024", - "memory_type": "episodic", - "event_date": "2024-03-15T10:00:00Z", - "topics": ["travel"], - "entities": ["Paris"] - } - ``` - -3. **Message**: Conversation records (auto-generated) - ```json - { - "text": "user: What's the weather like?", - "memory_type": "message", - "session_id": "chat_123" - } - ``` - -### When to Use Long-Term Memory - -1. **User Preferences and Profile** - ```python - # Store lasting user preferences - memories = [ - MemoryRecord( - text="User prefers metric units for temperature", - id="pref_metric_temp", - memory_type="semantic", - topics=["preferences", "units"], - user_id="user_123" - ) - ] - ``` - -2. **Important Facts and Knowledge** - ```python - # Store domain knowledge - memories = [ - MemoryRecord( - text="Customer's subscription expires on 2024-06-15", - id="sub_expiry_customer_456", - memory_type="episodic", - event_date=datetime(2024, 6, 15), - entities=["customer_456", "subscription"], - user_id="user_123" - ) - ] - ``` - -3. **Cross-Session Context** - ```python - # Store context that spans conversations - memories = [ - MemoryRecord( - text="User is working on a Python machine learning project", - id="context_ml_project", - memory_type="semantic", - topics=["programming", "machine-learning", "python"], - namespace="work_context" - ) - ] - ``` - -### API Endpoints - -```http -# Create long-term memories -POST /v1/long-term-memory/ - -# Search long-term memories -POST /v1/long-term-memory/search -``` - -### Search Capabilities - -Long-term memory provides powerful search features: - -#### Semantic Vector Search -```json -{ - "text": "python programming help", - "limit": 10, - "distance_threshold": 0.8 -} -``` - -#### Advanced Filtering -```json -{ - "text": "user preferences", - "filters": { - "user_id": {"eq": "user_123"}, - "memory_type": {"eq": "semantic"}, - "topics": {"any": ["preferences", "settings"]}, - "created_at": {"gte": "2024-01-01T00:00:00Z"} - } -} -``` - -#### Hybrid Search -```json -{ - "text": "travel plans", - "filters": { - "namespace": {"eq": "personal"}, - "event_date": {"gte": "2024-03-01T00:00:00Z"} - }, - "include_working_memory": true, - "include_long_term_memory": true -} -``` - -## Memory Lifecycle - -### 1. 
Creation in Working Memory -```python -# Client creates structured memory -memory = MemoryRecord( - text="User likes Italian food", - id="client_generated_id", - memory_type="semantic" -) - -# Add to working memory -working_memory = WorkingMemory( - session_id="current_session", - memories=[memory] -) -``` - -### 2. Automatic Promotion -```python -# Server promotes to long-term storage (background) -# - Assigns persisted_at timestamp -# - Generates vector embeddings -# - Indexes for search -# - Updates working memory with timestamps -``` - -### 3. Deduplication and Compaction -```python -# Server automatically: -# - Identifies hash-based duplicates -# - Finds semantically similar memories -# - Merges related memories using LLM -# - Removes obsolete duplicates -``` - -### 4. Retrieval and Search -```python -# Client searches across all memory -results = await search_memories( - text="food preferences", - filters={"user_id": {"eq": "user_123"}} -) -``` - -## Memory Prompt Integration - -The memory system integrates with AI prompts through the `/v1/memory/prompt` endpoint: - -```python -# Get memory-enriched prompt -response = await memory_prompt({ - "query": "Help me plan dinner", - "session": { - "session_id": "current_chat", - "model_name": "gpt-4o", - "context_window_max": 4000 - }, - "long_term_search": { - "text": "food preferences dietary restrictions", - "filters": {"user_id": {"eq": "user_123"}}, - "limit": 5 - } -}) - -# Returns ready-to-use messages with: -# - Conversation context from working memory -# - Relevant memories from long-term storage -# - User's query as final message -``` - -## Best Practices - -### Working Memory -- Keep conversation state and temporary data -- Use for session-specific configuration -- Store structured memories that might become long-term -- Let automatic promotion handle persistence - -### Long-Term Memory -- Store user preferences and lasting facts -- Include rich metadata (topics, entities, timestamps) -- Use meaningful IDs for easier retrieval -- Leverage semantic search for discovery - -### Memory Design -- Use semantic memory for timeless facts -- Use episodic memory for time-bound events -- Include relevant topics and entities for better search -- Design memory text for LLM consumption - -### Search Strategy -- Start with semantic search for discovery -- Add filters for precision -- Use unified search for comprehensive results -- Consider both working and long-term contexts - -## Memory Extraction - -By default, the system automatically extracts structured memories from conversations as they flow from working memory to long-term storage. This extraction process can be customized using different **memory strategies**. - -!!! info "Memory Strategies" - The system supports multiple extraction strategies (discrete facts, summaries, preferences, custom prompts) that determine how conversations are processed into memories. See [Memory Strategies](memory-strategies.md) for complete documentation and examples. - -## Configuration - -Memory behavior can be configured through environment variables: - -```bash -# Working memory settings -WINDOW_SIZE=50 # Message window before summarization -LONG_TERM_MEMORY=true # Enable long-term memory features - -# Long-term memory settings -ENABLE_DISCRETE_MEMORY_EXTRACTION=true # Extract memories from messages -GENERATION_MODEL=gpt-4o-mini # Model for summarization/extraction -``` - -For complete configuration options, see the [Configuration Guide](configuration.md). 
diff --git a/docs/python-sdk.md b/docs/python-sdk.md
index 62bddd3..e110066 100644
--- a/docs/python-sdk.md
+++ b/docs/python-sdk.md
@@ -214,14 +214,15 @@ async def chat_with_memory(message: str, session_id: str):

 The SDK provides these tools for LLM integration:

-1. **`eagerly_create_long_term_memory`** - Eagerly create a long-term memory by making an API request
-2. **`lazily_create_long_term_memory`** - Lazily create a long-term memory by adding it to working memory (does not require an immediate network request; does require saving working memory afterward)
-3. **`search_long_term_memory`** - Search with semantic similarity
-4. **`edit_memory`** - Update existing memories
-5. **`delete_memory`** - Remove memories
-6. **`set_working_memory`** - Update or create a working memory session
-7. **`get_or_create_working_memory`** - Retrieve or create a working memory session
+1. **`create_long_term_memory`** - Eagerly create long-term memories by making an API request
+2. **`add_memory_to_working_memory`** - Lazily create memories by adding them to working memory (promoted to long-term storage later)
+3. **`search_memory`** - Search with semantic similarity across long-term memories
+4. **`edit_long_term_memory`** - Update existing long-term memories
+5. **`delete_long_term_memories`** - Remove long-term memories
+6. **`get_or_create_working_memory`** - Retrieve or create a working memory session
+7. **`update_working_memory_data`** - Update session-specific data in working memory
+8. **`get_current_datetime`** - Get current UTC datetime for grounding relative time expressions

 **Note:** The following tool names have been deprecated for clarity:
-- `create_long_term_memories` (deprecated) → use `eagerly_create_long_term_memory`
+- `create_long_term_memories` (deprecated) → use `create_long_term_memory`
diff --git a/docs/query-optimization.md b/docs/query-optimization.md
index aec9d3d..3bac790 100644
--- a/docs/query-optimization.md
+++ b/docs/query-optimization.md
@@ -42,7 +42,7 @@ Query optimization is controlled by several settings that can be configured via

 ```bash
 # Enable/disable query optimization (default based on interface)
-# REST API: enabled by default (optimize_query=true)
+# REST API: disabled by default (optimize_query=false)
 # MCP Server: disabled by default (optimize_query=false)

 # Models for query optimization (can be different from generation model)
@@ -77,7 +77,7 @@ GENERATION_MODEL=gpt-4o

 Query optimization can be controlled per request using the `optimize_query` query parameter:

 ```bash
-# Search with optimization (default: true)
+# Search with optimization (default: false)
 curl -X POST "http://localhost:8000/v1/long-term-memory/search" \
   -H "Content-Type: application/json" \
   -d '{
@@ -134,7 +134,8 @@ from agent_memory_client import MemoryAPIClient

 client = MemoryAPIClient(base_url="http://localhost:8000")

-# Search with optimization (REST API default)
+# Search with optimization (now must be enabled explicitly)
 results = await client.search_long_term_memory(
     text="what do I like to eat",
-    limit=5
+    limit=5,
+    optimize_query=True
 )
@@ -154,7 +154,7 @@ Different interfaces have different default behaviors for query optimization:

 | Interface | Default | Rationale |
 |-----------|---------|-----------|
-| **REST API** | `optimize_query=True` | Web applications benefit from improved search accuracy |
+| **REST API** | `optimize_query=False` | Consistent behavior across all interfaces |
 | **MCP Server** | `optimize_query=False` | AI agents may prefer direct control over queries |
 | **Client Library** | Follows API defaults | Inherits from underlying interface |

@@ -323,7 +323,7 @@
app.include_router(router) @app.post("/custom-search") async def custom_search( query: str, - optimize: bool = Query(True, alias="optimize_query") + optimize: bool = Query(False, alias="optimize_query") ): # Custom search with configurable optimization results = await search_long_term_memories( diff --git a/docs/quick-start.md b/docs/quick-start.md index d7ce01d..cfab63b 100644 --- a/docs/quick-start.md +++ b/docs/quick-start.md @@ -318,8 +318,8 @@ You've just worked with both types of memory: ### Working Memory - **Scope**: Session-specific -- **Lifetime**: 1 hour (configurable TTL) -- **Use case**: Active conversation state +- **Lifetime**: Durable by default, optional TTL +- **Use case**: Active conversation state and session data - **Auto-promotion**: Structured memories automatically move to long-term storage ### Long-Term Memory @@ -335,12 +335,12 @@ Now that you have the basics working, explore these advanced features: ### 🔍 **Advanced Search** - Try filtering by topics, entities, or time ranges - Experiment with recency boost and query optimization -- See [Memory Types Guide](memory-types.md) for detailed examples +- See [Working Memory](working-memory.md) and [Long-term Memory](long-term-memory.md) guides for detailed examples ### ✏️ **Memory Editing** - Update existing memories with corrections - Add more context to sparse memories -- See [Memory Editing Guide](memory-editing.md) +- See [Memory Lifecycle Guide](memory-lifecycle.md#memory-editing) ### 🔒 **Production Setup** - Enable authentication (OAuth2/JWT or token-based) @@ -521,7 +521,7 @@ redis-cli -h localhost -p 6379 - **API Documentation**: Visit `http://localhost:8000/docs` - **Configuration Guide**: [Configuration](configuration.md) -- **Memory Types**: [Memory Types Guide](memory-types.md) +- **Memory Types**: [Working Memory](working-memory.md) and [Long-term Memory](long-term-memory.md) - **GitHub Issues**: Report problems or ask questions ## What's Next? diff --git a/docs/security-custom-prompts.md b/docs/security-custom-prompts.md index 2930e62..8437650 100644 --- a/docs/security-custom-prompts.md +++ b/docs/security-custom-prompts.md @@ -309,7 +309,8 @@ uv run pytest tests/test_memory_strategies.py tests/test_prompt_security.py ## Related Documentation -- [Memory Types](memory-types.md) - Understanding different memory strategies +- [Working Memory](working-memory.md) - Session-scoped memory storage +- [Long-term Memory](long-term-memory.md) - Persistent memory storage - [Authentication](authentication.md) - Securing API access - [Configuration](configuration.md) - System configuration options - [Development Guide](development.md) - Development and testing practices diff --git a/docs/working-memory.md b/docs/working-memory.md new file mode 100644 index 0000000..48e3dc3 --- /dev/null +++ b/docs/working-memory.md @@ -0,0 +1,328 @@ +# Working Memory + +Working memory is **session-scoped**, **durable** storage designed for active conversation state and session data. It's the "scratch pad" where an AI agent keeps track of the current conversation context for a particular session. + +## Overview + +Working memory provides durable storage for a single conversation session. It's optimized for storing conversation messages, session-specific data, and structured memories that may later be promoted to long-term storage. By default, working memory persists to maintain conversation history, but you can set TTL expiration if your application doesn't need persistent conversation history. 
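+For orientation, here is a minimal client-side sketch of opening a session's working memory. This is a sketch, not canonical usage: it assumes the `agent-memory-client` package is installed, that a server is listening on `localhost:8000`, and that you use the client's `get_or_create_working_memory` helper, which returns a `(created, memory)` tuple.
+
+```python
+# Minimal sketch: fetch (or lazily create) working memory for a session.
+# Assumes agent-memory-client is installed and a server runs on localhost:8000.
+import asyncio
+
+from agent_memory_client import MemoryAPIClient
+
+
+async def main() -> None:
+    client = MemoryAPIClient(base_url="http://localhost:8000")
+
+    # created is True if the session was new and an empty working memory
+    # was created; otherwise memory holds the existing session state.
+    created, memory = await client.get_or_create_working_memory(
+        session_id="chat_123",
+        namespace="demo",
+        model_name="gpt-4o",
+    )
+    print("new session" if created else f"resumed with {len(memory.messages)} messages")
+
+
+asyncio.run(main())
+```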
+
+| Feature | Details |
+|---------|---------|
+| **Scope** | Session-scoped |
+| **Lifespan** | Durable by default, optional TTL |
+| **Storage** | Redis key-value with JSON |
+| **Search** | Simple text matching |
+| **Capacity** | Limited by window size |
+| **Use Case** | Active conversation state |
+| **Indexing** | None |
+| **Deduplication** | None |
+
+## Characteristics
+
+- **Session Scoped**: Each session has its own isolated working memory
+- **Durable by Default**: Persists conversation history unless TTL is explicitly set
+- **Optional TTL**: Can be configured to expire if conversation history isn't needed
+- **Window Management**: Automatically summarizes when message count exceeds limits
+- **Mixed Content**: Stores both conversation messages and structured memory records
+- **No Indexing**: Simple JSON storage in Redis
+- **Promotion**: Structured memories can be promoted to long-term storage
+
+## Data Structure
+
+Working memory contains:
+
+- **Messages**: Conversation history (role/content pairs)
+- **Memories**: Structured memory records awaiting promotion
+- **Context**: Summary of past conversation when truncated
+- **Data**: Arbitrary JSON key-value storage
+- **Metadata**: User ID, timestamps, TTL settings
+
+## When to Use Working Memory
+
+### 1. Conversation Messages
+
+The primary use of working memory is storing conversation messages to maintain context across turns:
+
+```python
+import ulid
+
+# Store conversation messages for context continuity
+working_memory = WorkingMemory(
+    session_id="chat_123",
+    messages=[
+        MemoryMessage(role="user", content="I'm planning a trip to Paris next month", id=str(ulid.ULID())),
+        MemoryMessage(role="assistant", content="That sounds exciting! What type of activities are you interested in?", id=str(ulid.ULID())),
+        MemoryMessage(role="user", content="I love museums and good food", id=str(ulid.ULID()))
+    ]
+)
+
+# On the next turn, the assistant can access this context:
+# - User is planning a Paris trip
+# - Trip is next month
+# - User likes museums and food
+# This enables coherent, context-aware responses
+```
+
+### 2. Session-Specific Data
+
+Use the `data` field for temporary session information that doesn't need to persist across conversations:
+
+```python
+# Store session-specific facts and configuration
+working_memory = WorkingMemory(
+    session_id="chat_123",
+    data={
+        "temp_trip_info": {
+            "destination": "Paris",
+            "travel_month": "next month",
+            "planning_stage": "initial"
+        },
+        "user_preferences": {"temperature_unit": "celsius"},
+        "conversation_mode": "casual"
+    }
+)
+```
+
+### 3. Structured Memories for Long-Term Storage
+
+Use the `memories` field for important facts that should be remembered across all future conversations:
+
+```python
+# Important facts that should persist beyond this session
+working_memory = WorkingMemory(
+    session_id="chat_123",
+    memories=[
+        MemoryRecord(
+            text="User is planning a trip to Paris next month",
+            id="trip_planning_paris",
+            memory_type="episodic",
+            topics=["travel", "planning"],
+            entities=["Paris"]
+        )
+    ]
+)
+# This memory will be automatically promoted to long-term storage
+```
+
+> **🔑 Key Distinction**:
+> - Use `data` field for **session-specific** facts that stay only in the session
+> - Use `memories` field for **important** facts that should be promoted to long-term storage
+> - Anything in the `memories` field will automatically become persistent and searchable across all future sessions
+
+## Producing Long-Term Memories from Working Memory
+
+Working memory can automatically extract and promote memories to long-term storage using different strategies. This is one of the most powerful features of the memory server - it can intelligently analyze conversation content and create persistent memories without manual intervention.
+
+### Memory Server Extracts in the Background
+
+By default, the memory server automatically analyzes working memory content and extracts meaningful memories in the background. This is ideal when you want the memory server to handle all LLM operations internally.
+
+```python
+# Configure automatic extraction strategy
+working_memory = WorkingMemory(
+    session_id="chat_123",
+    long_term_memory_strategy=MemoryStrategyConfig(
+        extraction_strategy="thread_aware",  # Analyzes conversation threads
+        custom_prompt="Extract key facts about user preferences and important events",
+        enable_topic_extraction=True,
+        enable_entity_extraction=True
+    ),
+    messages=[
+        MemoryMessage(role="user", content="I'm a software engineer at TechCorp"),
+        MemoryMessage(role="assistant", content="That's great! What technologies do you work with?"),
+        MemoryMessage(role="user", content="Mainly Python and React for web applications")
+    ]
+)
+
+# The server will automatically extract memories like:
+# - "User is a software engineer at TechCorp"
+# - "User works with Python and React for web applications"
+```
+
+### Your LLM Extracts (Client-Side)
+
+If you prefer to manage all LLM activity in your application, you can have your LLM extract memories client-side and add them to working memory. This gives you full control over the extraction process and LLM usage.
+ +```python +# Your LLM can use tools to lazily add memories to working memory +# These will be promoted to long-term storage when the session is processed + +# Using the add_memory_to_working_memory tool (lazy approach) +tools = [client.get_add_memory_tool_schema()] + +# Your LLM can call this tool to add memories: +# add_memory_to_working_memory( +# session_id="chat_123", +# memory={ +# "text": "User prefers Python for backend development", +# "memory_type": "semantic", +# "topics": ["programming", "preferences"] +# } +# ) +``` + +The Python SDK includes tools that allow your LLM to create memories either lazily (added to working memory for later promotion) or eagerly (created directly in long-term storage): + +- **Lazy approach**: `add_memory_to_working_memory` - adds memories to working memory for batch promotion +- **Eager approach**: `create_long_term_memory` - creates memories directly in long-term storage + +See the [Long-Term Memory documentation](long-term-memory.md) for details on eager creation. + +## API Endpoints + +```http +# Get working memory for a session +GET /v1/working-memory/{session_id}?namespace=demo&model_name=gpt-4o + +# Set working memory (replaces existing, with optional TTL) +PUT /v1/working-memory/{session_id}?ttl_seconds=3600 + +# Delete working memory +DELETE /v1/working-memory/{session_id}?namespace=demo +``` + +## Automatic Promotion + +When structured memories in working memory are stored, they are automatically promoted to long-term storage in the background: + +1. Memories with `persisted_at=null` are identified +2. Server assigns unique IDs and timestamps +3. Memories are indexed in long-term storage with vector embeddings +4. Working memory is updated with `persisted_at` timestamps + + + +## Memory Lifecycle + +### 1. Creation in Working Memory +```python +# Client creates structured memory +memory = MemoryRecord( + text="User likes Italian food", + id="client_generated_id", + memory_type="semantic" +) + +# Add to working memory +working_memory = WorkingMemory( + session_id="current_session", + memories=[memory] +) +``` + +### 2. Automatic Promotion +```python +# Server promotes to long-term storage (background) +# - Assigns persisted_at timestamp +# - Generates vector embeddings +# - Indexes for search +# - Updates working memory with timestamps +``` + +## Best Practices + +### Working Memory Usage +- Keep conversation state and session-specific data +- Use for session-specific configuration and context +- Store structured memories that should become long-term +- Set TTL only if conversation history doesn't need to persist +- Let automatic promotion handle long-term memory persistence + +### Memory Design +- Use `data` field for session-specific facts that stay only in the session +- Use `memories` field for important facts that should be promoted to long-term storage +- Design memory text for LLM consumption +- Include relevant topics and entities for better search + +## TTL and Persistence + +Working memory is **durable by default** to preserve conversation history. 
However, you can configure TTL (time-to-live) expiration if your application doesn't need persistent conversation history: + +```python +# Durable working memory (default behavior) +working_memory = WorkingMemory( + session_id="chat_123", + messages=[...], + # No TTL - memory persists until explicitly deleted +) + +# Working memory with TTL expiration +working_memory = WorkingMemory( + session_id="chat_123", + messages=[...], + ttl_seconds=3600 # Expires after 1 hour +) +``` + +```http +# Set working memory with TTL via REST API +PUT /v1/working-memory/chat_123?ttl_seconds=3600 +``` + +**When to use TTL:** +- Temporary chat sessions that don't need history +- Privacy-sensitive applications requiring automatic cleanup +- Resource-constrained environments + +**When to keep durable (default):** +- Applications that need conversation history +- Multi-turn conversations that reference past context +- Customer support or assistant applications + +## Transparent Reconstruction from Long-Term Memory + +When `index_all_messages_in_long_term_memory` is enabled, working memory can be transparently reconstructed from long-term storage. This allows you to use TTL expiration while still maintaining conversation continuity. + +**How it works:** +1. Set `index_all_messages_in_long_term_memory=true` in configuration +2. Messages are automatically indexed in long-term memory as they flow through working memory +3. When working memory expires (TTL), the messages remain in long-term storage +4. If you request a session that doesn't exist in working memory, the system automatically searches long-term memory for messages from that session and reconstructs the working memory + +**Example workflow:** +```python +# 1. Store working memory with TTL (expires after 1 hour) +working_memory = WorkingMemory( + session_id="chat_123", + messages=[ + MemoryMessage(role="user", content="Hello"), + MemoryMessage(role="assistant", content="Hi there!"), + ], + ttl_seconds=3600 # 1 hour expiration +) + +# 2. Messages are automatically indexed in long-term memory +# 3. After 1 hour, working memory expires and is deleted +# 4. Later, when you request the session: + +# GET /v1/working-memory/chat_123 +# System automatically reconstructs from long-term memory +# Returns working memory with original messages +``` + +This feature is perfect for applications that want to: +- Reduce Redis memory usage with TTL expiration +- Maintain conversation continuity across sessions +- Automatically handle session restoration without manual intervention + +## Configuration + +Working memory behavior can be configured through environment variables: + +```bash +# Working memory settings +WINDOW_SIZE=50 # Message window before summarization +LONG_TERM_MEMORY=true # Enable long-term memory features + +# Long-term memory settings +ENABLE_DISCRETE_MEMORY_EXTRACTION=true # Extract memories from messages +GENERATION_MODEL=gpt-4o-mini # Model for summarization/extraction +``` + +For complete configuration options, see the [Configuration Guide](configuration.md). 
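+If you need to toggle one of these flags from code (for example, in a test), the server exposes them on a `settings` object. The sketch below mirrors the save-and-restore pattern used in the project's test suite; treat it as a testing convenience rather than a public API.
+
+```python
+# Sketch: temporarily override a server setting, then restore it.
+# Pattern mirrors the project's test suite (agent_memory_server.config.settings).
+from agent_memory_server.config import settings
+
+original = settings.enable_discrete_memory_extraction
+settings.enable_discrete_memory_extraction = False  # disable background extraction
+try:
+    ...  # exercise code that saves or promotes working memory here
+finally:
+    # Always restore so later code sees the configured default.
+    settings.enable_discrete_memory_extraction = original
+```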
+ +## Related Documentation + +- [Long-term Memory](long-term-memory.md) - Persistent, cross-session memory storage +- [Memory Integration Patterns](memory-integration-patterns.md) - How to integrate memory with your applications +- [Memory Extraction Strategies](memory-extraction-strategies.md) - Different approaches to memory extraction and storage diff --git a/examples/memory_editing_agent.py b/examples/memory_editing_agent.py index 644f3fb..77f5fc4 100644 --- a/examples/memory_editing_agent.py +++ b/examples/memory_editing_agent.py @@ -131,6 +131,13 @@ async def _add_message_to_working_memory( ) -> None: """Add a message to working memory.""" client = await self.get_client() + # Ensure working memory exists before appending messages + await client.get_or_create_working_memory( + session_id=session_id, + namespace=self._get_namespace(user_id), + model_name="gpt-4o-mini", + user_id=user_id, + ) await client.append_messages_to_working_memory( session_id=session_id, messages=[{"role": role, "content": content}], diff --git a/examples/memory_prompt_agent.py b/examples/memory_prompt_agent.py index b653b7e..9820922 100644 --- a/examples/memory_prompt_agent.py +++ b/examples/memory_prompt_agent.py @@ -111,6 +111,13 @@ async def _add_message_to_working_memory( ) -> None: """Add a message to working memory.""" client = await self.get_client() + # Ensure working memory exists before appending messages + await client.get_or_create_working_memory( + session_id=session_id, + namespace=self._get_namespace(user_id), + model_name="gpt-4o-mini", + user_id=user_id, + ) await client.append_messages_to_working_memory( session_id=session_id, messages=[{"role": role, "content": content}], diff --git a/examples/recent_messages_limit_demo.py b/examples/recent_messages_limit_demo.py new file mode 100644 index 0000000..299e24e --- /dev/null +++ b/examples/recent_messages_limit_demo.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python3 +""" +Demonstration of the recent_messages_limit feature. + +This script shows how to use the new recent_messages_limit parameter +to efficiently retrieve only the most recent N messages from working memory. +""" + +import asyncio + +from agent_memory_server.models import MemoryMessage, WorkingMemory +from agent_memory_server.utils.redis import get_redis_conn +from agent_memory_server.working_memory import get_working_memory, set_working_memory + + +async def demo_recent_messages_limit(): + """Demonstrate the recent_messages_limit functionality""" + print("🚀 Recent Messages Limit Demo") + print("=" * 50) + + # Get Redis connection + redis_client = await get_redis_conn() + + # Create a session with many messages + session_id = "demo-session" + user_id = "demo-user" + namespace = "demo" + + print("📝 Creating working memory with 10 messages...") + + # Create 10 messages with automatic created_at timestamps + messages = [] + for i in range(10): + messages.append( + MemoryMessage( + id=f"msg-{i}", + role="user" if i % 2 == 0 else "assistant", + content=f"This is message number {i}. 
It contains some conversation content.", + # created_at is automatically set to current time + ) + ) + + # Create working memory + working_memory = WorkingMemory( + session_id=session_id, + user_id=user_id, + namespace=namespace, + messages=messages, + context="This is a demo conversation", + data={"demo": True, "total_messages": 10}, + ) + + # Store the working memory + await set_working_memory(working_memory, redis_client=redis_client) + print(f"✅ Stored working memory with {len(messages)} messages") + + print("\n" + "=" * 50) + print("🔍 Testing different message limits:") + print("=" * 50) + + # Test 1: Get all messages (no limit) + print("\n1️⃣ Getting ALL messages (no limit):") + result = await get_working_memory( + session_id=session_id, + user_id=user_id, + namespace=namespace, + redis_client=redis_client, + ) + print(f" 📊 Retrieved {len(result.messages)} messages") + print(f" 📝 First message: {result.messages[0].content}") + print(f" 📝 Last message: {result.messages[-1].content}") + + # Test 2: Get last 3 messages + print("\n2️⃣ Getting last 3 messages:") + result = await get_working_memory( + session_id=session_id, + user_id=user_id, + namespace=namespace, + redis_client=redis_client, + recent_messages_limit=3, + ) + print(f" 📊 Retrieved {len(result.messages)} messages") + for i, msg in enumerate(result.messages): + print(f" 📝 Message {i}: {msg.content}") + + # Test 3: Get last 5 messages + print("\n3️⃣ Getting last 5 messages:") + result = await get_working_memory( + session_id=session_id, + user_id=user_id, + namespace=namespace, + redis_client=redis_client, + recent_messages_limit=5, + ) + print(f" 📊 Retrieved {len(result.messages)} messages") + print(f" 📝 First of limited: {result.messages[0].content}") + print(f" 📝 Last of limited: {result.messages[-1].content}") + + # Test 4: Get more messages than available + print("\n4️⃣ Getting 20 messages (more than available):") + result = await get_working_memory( + session_id=session_id, + user_id=user_id, + namespace=namespace, + redis_client=redis_client, + recent_messages_limit=20, + ) + print(f" 📊 Retrieved {len(result.messages)} messages (all available)") + + # Test 5: Verify other data is preserved + print("\n5️⃣ Verifying other data is preserved:") + result = await get_working_memory( + session_id=session_id, + user_id=user_id, + namespace=namespace, + redis_client=redis_client, + recent_messages_limit=2, + ) + print(f" 📊 Retrieved {len(result.messages)} messages") + print(f" 🗂️ Context preserved: {result.context}") + print(f" 🗂️ Data preserved: {result.data}") + print(f" 🗂️ Session ID: {result.session_id}") + + print("\n" + "=" * 50) + print("🎯 Key Benefits:") + print("=" * 50) + print("✨ Efficient: Limits messages returned to client applications") + print("✨ Chronological: Uses created_at timestamps for proper message ordering") + print("✨ Simple: Uses in-memory slicing for working memory data") + print("✨ Flexible: Works with both working memory and long-term reconstruction") + print("✨ Safe: Preserves all other working memory data") + print("✨ Compatible: Available in both REST API and MCP server") + + print("\n" + "=" * 50) + print("📚 Usage Examples:") + print("=" * 50) + print("🌐 REST API:") + print(" GET /v1/working-memory/{session_id}?recent_messages_limit=5") + print("\n🔧 MCP Tool:") + print(" get_working_memory(session_id='...', recent_messages_limit=5)") + print("\n🐍 Python:") + print(" await get_working_memory(..., recent_messages_limit=5)") + + print("\n✅ Demo completed successfully!") + + +if __name__ == 
"__main__": + asyncio.run(demo_recent_messages_limit()) diff --git a/examples/travel_agent.py b/examples/travel_agent.py index fa233fc..74a76ff 100644 --- a/examples/travel_agent.py +++ b/examples/travel_agent.py @@ -296,6 +296,12 @@ async def _add_message_to_working_memory( # Get the memory client and save updated working memory client = await self.get_client() + # Ensure working memory exists before appending messages + await client.get_or_create_working_memory( + session_id=session_id, + namespace=self._get_namespace(user_id), + model_name="gpt-4o-mini", + ) await client.append_messages_to_working_memory( session_id=session_id, messages=new_message, diff --git a/mkdocs.yml b/mkdocs.yml index 6873e20..6320896 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -78,9 +78,11 @@ nav: - Developer Guide: - Memory Integration Patterns: memory-integration-patterns.md - - Memory Strategies: memory-strategies.md - - Memory Editing: memory-editing.md + - Working Memory: working-memory.md + - Long-term Memory: long-term-memory.md + - Memory Extraction Strategies: memory-extraction-strategies.md - Memory Lifecycle: memory-lifecycle.md + - Vector Store Backends: vector-store-backends.md - Authentication: authentication.md - Security: security-custom-prompts.md @@ -91,10 +93,6 @@ nav: - Examples: - Agent Examples: agent-examples.md - - Core Concepts: - - Memory Types: memory-types.md - - Vector Store Backends: vector-store-backends.md - - Advanced Topics: - Query Optimization: query-optimization.md - Recency Boost: recency-boost.md diff --git a/tests/test_api.py b/tests/test_api.py index 436c8a0..4231b49 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -267,6 +267,246 @@ async def test_put_memory(self, client): assert msg["role"] == payload["messages"][i]["role"] assert msg["content"] == payload["messages"][i]["content"] + @pytest.mark.requires_api_keys + @pytest.mark.asyncio + async def test_put_memory_with_context_window_max(self, client): + """Test PUT memory with context_window_max parameter returns context percentages""" + payload = { + "messages": [ + {"role": "user", "content": "Hello, how are you today?"}, + { + "role": "assistant", + "content": "I'm doing well, thank you for asking!", + }, + { + "role": "user", + "content": "That's great to hear. 
Can you help me with something?", + }, + ], + "memories": [], + "context": "", + "namespace": "test-namespace", + "session_id": "test-session", + } + + # Test with context_window_max as query parameter + response = await client.put( + "/v1/working-memory/test-session?context_window_max=500", json=payload + ) + + assert response.status_code == 200 + + data = response.json() + # Should return context percentages when context_window_max is provided + assert "context_percentage_total_used" in data + assert "context_percentage_until_summarization" in data + assert data["context_percentage_total_used"] is not None + assert data["context_percentage_until_summarization"] is not None + assert isinstance(data["context_percentage_total_used"], int | float) + assert isinstance(data["context_percentage_until_summarization"], int | float) + assert 0 <= data["context_percentage_total_used"] <= 100 + assert 0 <= data["context_percentage_until_summarization"] <= 100 + + @pytest.mark.requires_api_keys + @pytest.mark.asyncio + async def test_put_memory_without_model_info(self, client): + """Test PUT memory without model info returns null context percentages""" + payload = { + "messages": [ + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hi there"}, + ], + "memories": [], + "context": "", + "namespace": "test-namespace", + "session_id": "test-session", + } + + # Test without context_window_max or model_name + response = await client.put("/v1/working-memory/test-session", json=payload) + + assert response.status_code == 200 + + data = response.json() + # Should return null context percentages when no model info is provided + assert "context_percentage_total_used" in data + assert "context_percentage_until_summarization" in data + assert data["context_percentage_total_used"] is None + assert data["context_percentage_until_summarization"] is None + + @pytest.mark.requires_api_keys + @pytest.mark.asyncio + async def test_put_memory_context_percentages_with_summarization_regression( + self, client + ): + """ + Regression test for bug where PUT with context_window_max returned null percentages + when summarization occurred. + + Bug: _calculate_context_usage_percentages returned None for empty/few messages even + when model info was provided. + + Fix: Function now returns 0.0 for empty messages when model info is provided, + and small percentages for few messages, representing the current session state. + """ + # Create many messages that will definitely trigger summarization with context_window_max=500 + messages = [] + for i in range(25): + messages.append( + { + "role": "user" if i % 2 == 0 else "assistant", + "content": f"Message {i}: This is substantial content that uses many tokens and will trigger summarization when context window is limited to 500 tokens. 
" + * 3, + } + ) + + payload = { + "messages": messages, + "memories": [], + "context": "", + "namespace": "test-namespace", + "session_id": "regression-test-session", + } + + # Test with context_window_max=500 (should trigger summarization) + response = await client.put( + "/v1/working-memory/regression-test-session?context_window_max=500", + json=payload, + ) + + assert response.status_code == 200 + + data = response.json() + + # Verify summarization occurred (message count should be reduced) + original_message_count = len(payload["messages"]) + final_message_count = len(data["messages"]) + assert ( + final_message_count < original_message_count + ), f"Expected summarization to reduce messages from {original_message_count} to less, but got {final_message_count}" + + # Verify context summary was created + assert ( + data["context"] is not None + ), "Context should not be None after summarization" + assert ( + data["context"].strip() != "" + ), "Context should not be empty after summarization" + + # REGRESSION TEST: Context percentages should NOT be null even after summarization + # They should reflect the current state (post-summarization) with small percentages + assert "context_percentage_total_used" in data + assert "context_percentage_until_summarization" in data + assert ( + data["context_percentage_total_used"] is not None + ), "BUG REGRESSION: context_percentage_total_used should not be null when context_window_max is provided" + assert ( + data["context_percentage_until_summarization"] is not None + ), "BUG REGRESSION: context_percentage_until_summarization should not be null when context_window_max is provided" + + # Verify the percentages are valid numbers + total_used = data["context_percentage_total_used"] + until_summarization = data["context_percentage_until_summarization"] + + assert isinstance( + total_used, int | float + ), f"context_percentage_total_used should be a number, got {type(total_used)}" + assert isinstance( + until_summarization, int | float + ), f"context_percentage_until_summarization should be a number, got {type(until_summarization)}" + assert ( + 0 <= total_used <= 100 + ), f"context_percentage_total_used should be 0-100, got {total_used}" + assert ( + 0 <= until_summarization <= 100 + ), f"context_percentage_until_summarization should be 0-100, got {until_summarization}" + + # After summarization, percentages should be reasonable (not necessarily high) + # They represent the current state of the session post-summarization + assert ( + total_used >= 0 + ), f"Expected non-negative total usage percentage, got {total_used}" + assert ( + until_summarization >= 0 + ), f"Expected non-negative until_summarization percentage, got {until_summarization}" + + @pytest.mark.requires_api_keys + @pytest.mark.asyncio + async def test_working_memory_reconstruction_from_long_term( + self, client, async_redis_client + ): + """Test working memory reconstruction from long-term memory when index_all_messages_in_long_term_memory is enabled""" + from datetime import UTC, datetime + + from agent_memory_server.config import settings + from agent_memory_server.long_term_memory import index_long_term_memories + from agent_memory_server.models import MemoryRecord + + # Enable message indexing + original_setting = settings.index_all_messages_in_long_term_memory + settings.index_all_messages_in_long_term_memory = True + + try: + session_id = "reconstruction-api-test" + user_id = "test-user" + namespace = "test" + + # Create message memories in long-term storage (simulating expired 
working memory) + message_memories = [ + MemoryRecord( + id="api-msg-1", + text="user: Hello from API test", + memory_type="message", + session_id=session_id, + user_id=user_id, + namespace=namespace, + persisted_at=datetime.now(UTC), + ), + MemoryRecord( + id="api-msg-2", + text="assistant: Hello! How can I help you?", + memory_type="message", + session_id=session_id, + user_id=user_id, + namespace=namespace, + persisted_at=datetime.now(UTC), + ), + ] + + # Index messages in long-term memory + await index_long_term_memories( + message_memories, + redis_client=async_redis_client, + deduplicate=False, + ) + + # Try to get working memory - should reconstruct from long-term + response = await client.get( + f"/v1/working-memory/{session_id}?namespace={namespace}&user_id={user_id}" + ) + + assert response.status_code == 200 + result = response.json() + + # Should have reconstructed the working memory + assert result["session_id"] == session_id + assert result["user_id"] == user_id + assert result["namespace"] == namespace + assert len(result["messages"]) == 2 + + # Check message content + message_contents = [msg["content"] for msg in result["messages"]] + assert "Hello from API test" in message_contents + assert "Hello! How can I help you?" in message_contents + + # Should have empty memories, context, and data (reconstruction only includes messages) + assert result["memories"] == [] + assert result["context"] == "" + assert result["data"] == {} + + finally: + settings.index_all_messages_in_long_term_memory = original_setting + @pytest.mark.requires_api_keys @pytest.mark.asyncio async def test_put_memory_stores_messages_in_long_term_memory( @@ -457,6 +697,70 @@ async def test_get_nonexistent_session_with_new_client_returns_404(self, client) data = response.json() assert "not found" in data["detail"].lower() + @pytest.mark.asyncio + async def test_get_working_memory_with_recent_messages_limit( + self, client, async_redis_client + ): + """Test GET working memory with recent_messages_limit parameter""" + session_id = "test-api-limit" + user_id = "test-user" + namespace = "test" + + # Create working memory with many messages + messages = [] + for i in range(8): + messages.append( + { + "id": f"msg-{i}", + "role": "user" if i % 2 == 0 else "assistant", + "content": f"API Message {i}", + } + ) + + data = { + "session_id": session_id, + "messages": messages, + "context": "Test context", + "namespace": namespace, + "user_id": user_id, + "memories": [], + "data": {"test": "value"}, + } + + # Store working memory + response = await client.put(f"/v1/working-memory/{session_id}", json=data) + assert response.status_code == 200 + + # Test: Get with recent_messages_limit=3 + response = await client.get( + f"/v1/working-memory/{session_id}?namespace={namespace}&user_id={user_id}&recent_messages_limit=3" + ) + + assert response.status_code == 200 + result = response.json() + + # Should have limited messages + assert len(result["messages"]) == 3 + + # Should get the last 3 messages (messages 5, 6, 7) + assert result["messages"][0]["content"] == "API Message 5" + assert result["messages"][1]["content"] == "API Message 6" + assert result["messages"][2]["content"] == "API Message 7" + + # Other data should be preserved + assert result["context"] == "Test context" + assert result["data"] == {"test": "value"} + assert result["session_id"] == session_id + + # Test: Get without limit (should get all messages) + response = await client.get( + f"/v1/working-memory/{session_id}?namespace={namespace}&user_id={user_id}" + 
) + + assert response.status_code == 200 + result = response.json() + assert len(result["messages"]) == 8 # All messages + @pytest.mark.requires_api_keys class TestSearchEndpoint: @@ -491,27 +795,27 @@ async def test_search(self, mock_search, client): @patch("agent_memory_server.api.long_term_memory.search_long_term_memories") @pytest.mark.asyncio - async def test_search_with_optimize_query_true(self, mock_search, client): - """Test search endpoint with optimize_query=True (default).""" + async def test_search_with_optimize_query_default(self, mock_search, client): + """Test search endpoint with optimize_query default (False).""" mock_search.return_value = MemoryRecordResultsResponse( total=1, memories=[ - MemoryRecordResult(id="1", text="Optimized result", dist=0.1), + MemoryRecordResult(id="1", text="Non-optimized result", dist=0.1), ], next_offset=None, ) payload = {"text": "tell me about my preferences"} - # Call endpoint without optimize_query parameter (should default to True) + # Call endpoint without optimize_query parameter (should default to False) response = await client.post("/v1/long-term-memory/search", json=payload) assert response.status_code == 200 - # Verify search was called with optimize_query=True (default) + # Verify search was called with optimize_query=False (default) mock_search.assert_called_once() call_kwargs = mock_search.call_args.kwargs - assert call_kwargs.get("optimize_query") is True + assert call_kwargs.get("optimize_query") is False @patch("agent_memory_server.api.long_term_memory.search_long_term_memories") @pytest.mark.asyncio @@ -847,10 +1151,10 @@ async def test_memory_prompt_with_model_name( @patch("agent_memory_server.api.long_term_memory.search_long_term_memories") @patch("agent_memory_server.api.working_memory.get_working_memory") @pytest.mark.asyncio - async def test_memory_prompt_with_optimize_query_default_true( + async def test_memory_prompt_with_optimize_query_default_false( self, mock_get_working_memory, mock_search, client ): - """Test memory prompt endpoint with default optimize_query=True.""" + """Test memory prompt endpoint with default optimize_query=False.""" # Mock working memory mock_get_working_memory.return_value = WorkingMemoryResponse( session_id="test-session", @@ -877,15 +1181,15 @@ async def test_memory_prompt_with_optimize_query_default_true( "long_term_search": {"text": "preferences"}, } - # Call endpoint without optimize_query parameter (should default to True) + # Call endpoint without optimize_query parameter (should default to False) response = await client.post("/v1/memory/prompt", json=payload) assert response.status_code == 200 - # Verify search was called with optimize_query=True (default) + # Verify search was called with optimize_query=False (default) mock_search.assert_called_once() # The search is called indirectly through the API's search_long_term_memory function - # which should have optimize_query=True by default + # which should have optimize_query=False by default @patch("agent_memory_server.api.long_term_memory.search_long_term_memories") @patch("agent_memory_server.api.working_memory.get_working_memory") diff --git a/tests/test_client_api.py b/tests/test_client_api.py index d69bec1..e6621f0 100644 --- a/tests/test_client_api.py +++ b/tests/test_client_api.py @@ -635,10 +635,10 @@ async def test_search_memory_tool_with_optimize_query_true_explicit( @pytest.mark.asyncio -async def test_memory_prompt_with_optimize_query_default_true( +async def test_memory_prompt_with_optimize_query_default_false( 
memory_test_client: MemoryAPIClient, ): - """Test that client memory_prompt uses optimize_query=True by default.""" + """Test that client memory_prompt uses optimize_query=False by default.""" with patch( "agent_memory_server.long_term_memory.search_long_term_memories" ) as mock_search: @@ -646,15 +646,17 @@ async def test_memory_prompt_with_optimize_query_default_true( total=0, memories=[], next_offset=None ) - # Call memory_prompt without optimize_query parameter (should default to True) + # Call memory_prompt without optimize_query parameter (should default to False) result = await memory_test_client.memory_prompt( query="what are my preferences?", long_term_search={"text": "preferences"} ) - # Verify search was called with optimize_query=True (default) - mock_search.assert_called_once() - call_kwargs = mock_search.call_args.kwargs - assert call_kwargs.get("optimize_query") is True + # Verify search was called with optimize_query=False (default) + # May be called multiple times due to soft-filter fallback + assert mock_search.call_count >= 1 + # Check that all calls use optimize_query=False + for call in mock_search.call_args_list: + assert call.kwargs.get("optimize_query") is False assert result is not None @@ -678,7 +680,9 @@ async def test_memory_prompt_with_optimize_query_false_explicit( ) # Verify search was called with optimize_query=False - mock_search.assert_called_once() - call_kwargs = mock_search.call_args.kwargs - assert call_kwargs.get("optimize_query") is False + # May be called multiple times due to soft-filter fallback + assert mock_search.call_count >= 1 + # Check that all calls use optimize_query=False + for call in mock_search.call_args_list: + assert call.kwargs.get("optimize_query") is False assert result is not None diff --git a/tests/test_client_tool_calls.py b/tests/test_client_tool_calls.py index 19d7096..b24e8df 100644 --- a/tests/test_client_tool_calls.py +++ b/tests/test_client_tool_calls.py @@ -498,6 +498,109 @@ def test_convert_openai_to_anthropic_schema(self): ) assert anthropic_schema["input_schema"]["required"] == ["param1"] + def test_create_long_term_memory_tool_schema(self): + """Test create_long_term_memory tool schema in OpenAI format.""" + schema = MemoryAPIClient.create_long_term_memory_tool_schema() + + assert schema["type"] == "function" + assert schema["function"]["name"] == "create_long_term_memory" + assert "description" in schema["function"] + assert "parameters" in schema["function"] + + params = schema["function"]["parameters"] + assert params["type"] == "object" + assert "memories" in params["properties"] + assert "memories" in params["required"] + + # Check memory_type enum does NOT include "message" + memory_items = params["properties"]["memories"]["items"] + memory_type_prop = memory_items["properties"]["memory_type"] + assert memory_type_prop["enum"] == ["episodic", "semantic"] + assert "message" not in memory_type_prop["enum"] + + def test_edit_long_term_memory_tool_schema(self): + """Test edit_long_term_memory tool schema in OpenAI format.""" + schema = MemoryAPIClient.edit_long_term_memory_tool_schema() + + assert schema["type"] == "function" + assert schema["function"]["name"] == "edit_long_term_memory" + assert "description" in schema["function"] + assert "parameters" in schema["function"] + + params = schema["function"]["parameters"] + assert params["type"] == "object" + assert "memory_id" in params["properties"] + assert "memory_id" in params["required"] + + # Check memory_type enum does NOT include "message" + memory_type_prop 
= params["properties"]["memory_type"] + assert memory_type_prop["enum"] == ["episodic", "semantic"] + assert "message" not in memory_type_prop["enum"] + + def test_delete_long_term_memories_tool_schema(self): + """Test delete_long_term_memories tool schema in OpenAI format.""" + schema = MemoryAPIClient.delete_long_term_memories_tool_schema() + + assert schema["type"] == "function" + assert schema["function"]["name"] == "delete_long_term_memories" + assert "description" in schema["function"] + assert "parameters" in schema["function"] + + params = schema["function"]["parameters"] + assert params["type"] == "object" + assert "memory_ids" in params["properties"] + assert "memory_ids" in params["required"] + + def test_add_memory_tool_schema_excludes_message_type(self): + """Test that add_memory_to_working_memory schema excludes 'message' type.""" + schema = MemoryAPIClient.get_add_memory_tool_schema() + + params = schema["function"]["parameters"] + memory_type_prop = params["properties"]["memory_type"] + + # Verify only episodic and semantic are allowed + assert memory_type_prop["enum"] == ["episodic", "semantic"] + assert "message" not in memory_type_prop["enum"] + + def test_all_tool_schemas_exclude_message_type(self): + """Test that creation/editing tool schemas exclude 'message' type. + + Note: search_memory CAN include 'message' in its filter enum since it's for + searching/reading existing memories, not creating new ones. + """ + # Get all schemas + all_schemas = MemoryAPIClient.get_all_memory_tool_schemas() + + # Tools that should NOT expose message type (creation/editing tools) + restricted_tools = { + "add_memory_to_working_memory", + "create_long_term_memory", + "edit_long_term_memory", + } + + # Check each schema that has memory_type parameter + for schema in all_schemas: + function_name = schema["function"]["name"] + params = schema["function"]["parameters"] + + # Check if this schema has memory_type in properties + if "memory_type" in params["properties"]: + memory_type_prop = params["properties"]["memory_type"] + if function_name in restricted_tools: + assert ( + "message" not in memory_type_prop.get("enum", []) + ), f"Creation/editing tool {function_name} should not expose 'message' memory type" + + # Check nested properties (like in create_long_term_memory) + if "memories" in params["properties"]: + items = params["properties"]["memories"].get("items", {}) + if "properties" in items and "memory_type" in items["properties"]: + memory_type_prop = items["properties"]["memory_type"] + if function_name in restricted_tools: + assert ( + "message" not in memory_type_prop.get("enum", []) + ), f"Creation/editing tool {function_name} should not expose 'message' memory type in nested properties" + class TestToolCallErrorHandling: """Tests for tool call error handling and edge cases.""" diff --git a/tests/test_context_percentage_calculation.py b/tests/test_context_percentage_calculation.py new file mode 100644 index 0000000..4eca4fd --- /dev/null +++ b/tests/test_context_percentage_calculation.py @@ -0,0 +1,240 @@ +""" +Unit tests for context percentage calculation functions. +Includes regression tests for division by zero and edge cases. 
+""" + +from agent_memory_server.api import _calculate_context_usage_percentages +from agent_memory_server.models import MemoryMessage + + +class TestContextPercentageCalculation: + """Test context percentage calculation in various scenarios""" + + def test_context_percentages_with_context_window_max(self): + """Test that context percentages are calculated when context_window_max is provided""" + messages = [ + MemoryMessage(role="user", content="Hello, how are you today?"), + MemoryMessage( + role="assistant", content="I'm doing well, thank you for asking!" + ), + MemoryMessage( + role="user", + content="That's great to hear. Can you help me with something?", + ), + ] + + total_percentage, until_summarization_percentage = ( + _calculate_context_usage_percentages( + messages=messages, model_name=None, context_window_max=500 + ) + ) + + assert ( + total_percentage is not None + ), "total_percentage should not be None when context_window_max is provided" + assert ( + until_summarization_percentage is not None + ), "until_summarization_percentage should not be None when context_window_max is provided" + assert isinstance(total_percentage, float), "total_percentage should be a float" + assert isinstance( + until_summarization_percentage, float + ), "until_summarization_percentage should be a float" + assert ( + 0 <= total_percentage <= 100 + ), "total_percentage should be between 0 and 100" + assert ( + 0 <= until_summarization_percentage <= 100 + ), "until_summarization_percentage should be between 0 and 100" + + def test_context_percentages_with_model_name(self): + """Test that context percentages are calculated when model_name is provided""" + messages = [ + MemoryMessage(role="user", content="Hello"), + MemoryMessage(role="assistant", content="Hi there"), + ] + + total_percentage, until_summarization_percentage = ( + _calculate_context_usage_percentages( + messages=messages, model_name="gpt-4o-mini", context_window_max=None + ) + ) + + assert ( + total_percentage is not None + ), "total_percentage should not be None when model_name is provided" + assert ( + until_summarization_percentage is not None + ), "until_summarization_percentage should not be None when model_name is provided" + assert isinstance(total_percentage, float), "total_percentage should be a float" + assert isinstance( + until_summarization_percentage, float + ), "until_summarization_percentage should be a float" + + def test_context_percentages_without_model_info(self): + """Test that context percentages return None when no model info is provided""" + messages = [ + MemoryMessage(role="user", content="Hello"), + MemoryMessage(role="assistant", content="Hi there"), + ] + + total_percentage, until_summarization_percentage = ( + _calculate_context_usage_percentages( + messages=messages, model_name=None, context_window_max=None + ) + ) + + assert ( + total_percentage is None + ), "total_percentage should be None when no model info is provided" + assert ( + until_summarization_percentage is None + ), "until_summarization_percentage should be None when no model info is provided" + + def test_context_percentages_with_empty_messages(self): + """Test context percentages with empty messages list but model info provided""" + messages = [] + + total_percentage, until_summarization_percentage = ( + _calculate_context_usage_percentages( + messages=messages, model_name=None, context_window_max=500 + ) + ) + + # CORRECTED: Should return 0.0 when model info is provided, even with empty messages + assert ( + total_percentage == 0.0 + ), 
"total_percentage should be 0.0 for empty messages when model info provided" + assert ( + until_summarization_percentage == 0.0 + ), "until_summarization_percentage should be 0.0 for empty messages when model info provided" + + def test_context_percentages_precedence(self): + """Test that context_window_max takes precedence over model_name""" + messages = [ + MemoryMessage(role="user", content="Hello world"), + ] + + # Test with both provided - context_window_max should take precedence + total_percentage_both, until_summarization_percentage_both = ( + _calculate_context_usage_percentages( + messages=messages, + model_name="gpt-4o-mini", # This has a large context window + context_window_max=100, # This is much smaller + ) + ) + + # Test with only context_window_max + total_percentage_max_only, until_summarization_percentage_max_only = ( + _calculate_context_usage_percentages( + messages=messages, model_name=None, context_window_max=100 + ) + ) + + # Results should be the same, proving context_window_max takes precedence + assert ( + total_percentage_both == total_percentage_max_only + ), "context_window_max should take precedence over model_name" + assert ( + until_summarization_percentage_both + == until_summarization_percentage_max_only + ), "context_window_max should take precedence over model_name" + + def test_context_percentages_high_token_usage(self): + """Test context percentages when token usage is high""" + # Create many messages to exceed typical limits + messages = [] + for i in range(50): + messages.append( + MemoryMessage( + role="user" if i % 2 == 0 else "assistant", + content=f"This is message number {i} with substantial content that will use many tokens. " + * 10, + ) + ) + + # Test with small context window to force high percentages + total_percentage, until_summarization_percentage = ( + _calculate_context_usage_percentages( + messages=messages, model_name=None, context_window_max=500 + ) + ) + + assert total_percentage is not None + assert until_summarization_percentage is not None + # Should be capped at 100% + assert total_percentage <= 100.0, "total_percentage should be capped at 100%" + assert ( + until_summarization_percentage <= 100.0 + ), "until_summarization_percentage should be capped at 100%" + + def test_context_percentages_zero_context_window_regression(self): + """ + Regression test for division by zero when context_window_max is 0 or very small. + + Bug: When max_tokens <= 0 or token_threshold <= 0, division by zero occurred. + Fix: Added checks to return None for invalid context windows. 
+ """ + messages = [MemoryMessage(role="user", content="Hello")] + + # Test with zero context window + total_percentage, until_summarization_percentage = ( + _calculate_context_usage_percentages( + messages=messages, model_name=None, context_window_max=0 + ) + ) + + # Should return None for invalid context window + assert total_percentage is None, "Should return None for zero context window" + assert ( + until_summarization_percentage is None + ), "Should return None for zero context window" + + # Test with negative context window + total_percentage, until_summarization_percentage = ( + _calculate_context_usage_percentages( + messages=messages, model_name=None, context_window_max=-1 + ) + ) + + # Should return None for invalid context window + assert ( + total_percentage is None + ), "Should return None for negative context window" + assert ( + until_summarization_percentage is None + ), "Should return None for negative context window" + + def test_context_percentages_very_small_context_window_regression(self): + """ + Regression test for division by zero when token_threshold becomes 0. + + Bug: When context_window_max is very small (e.g., 1) and summarization_threshold is 0.7, + token_threshold = int(1 * 0.7) = 0, causing division by zero. + Fix: Added check for token_threshold <= 0. + """ + messages = [MemoryMessage(role="user", content="Hello world")] + + # Test with very small context window that would cause token_threshold = 0 + total_percentage, until_summarization_percentage = ( + _calculate_context_usage_percentages( + messages=messages, + model_name=None, + context_window_max=1, # With summarization_threshold=0.7, token_threshold = int(1 * 0.7) = 0 + ) + ) + + # Should handle this gracefully without division by zero + assert ( + total_percentage is not None + ), "Should handle small context window without error" + assert ( + until_summarization_percentage is not None + ), "Should handle small context window without error" + assert isinstance(total_percentage, float), "Should return valid float" + assert isinstance( + until_summarization_percentage, float + ), "Should return valid float" + # until_summarization_percentage should be 100% when threshold is 0 + assert ( + until_summarization_percentage == 100.0 + ), "Should return 100% when token threshold is 0" diff --git a/tests/test_extraction_logic_fix.py b/tests/test_extraction_logic_fix.py new file mode 100644 index 0000000..48323f2 --- /dev/null +++ b/tests/test_extraction_logic_fix.py @@ -0,0 +1,282 @@ +""" +Tests for the extraction logic fixes in long_term_memory.py +""" + +from unittest.mock import patch + +import pytest + +from agent_memory_server.long_term_memory import promote_working_memory_to_long_term +from agent_memory_server.models import MemoryMessage, MemoryRecord, WorkingMemory +from agent_memory_server.working_memory import get_working_memory, set_working_memory + + +class TestExtractionLogicFixes: + """Test the fixes for extraction logic issues""" + + @pytest.mark.asyncio + async def test_extracted_memories_variable_always_defined(self, async_redis_client): + """Test that extracted_memories variable is always defined, even when extraction is disabled""" + from agent_memory_server.config import settings + + # Disable extraction + original_setting = settings.enable_discrete_memory_extraction + settings.enable_discrete_memory_extraction = False + + try: + session_id = "test-extraction-disabled" + user_id = "test-user" + namespace = "test" + + # Create working memory with unextracted messages + messages = [ + 
MemoryMessage( + id="msg-1", + role="user", + content="Test message", + discrete_memory_extracted="f", # Unextracted + ), + ] + + working_memory = WorkingMemory( + session_id=session_id, + user_id=user_id, + namespace=namespace, + messages=messages, + memories=[], # No existing memories + ) + + await set_working_memory(working_memory, redis_client=async_redis_client) + + # This should not raise a NameError for undefined extracted_memories + promoted_count = await promote_working_memory_to_long_term( + session_id=session_id, + user_id=user_id, + namespace=namespace, + redis_client=async_redis_client, + ) + + # Should complete successfully + assert promoted_count == 0 # No memories to promote + + finally: + settings.enable_discrete_memory_extraction = original_setting + + @pytest.mark.asyncio + async def test_extracted_memories_are_promoted(self, async_redis_client): + """Test that extracted memories are actually promoted to long-term storage""" + from agent_memory_server.config import settings + + # Enable extraction + original_setting = settings.enable_discrete_memory_extraction + settings.enable_discrete_memory_extraction = True + + try: + session_id = "test-extraction-promotion" + user_id = "test-user" + namespace = "test" + + # Create working memory with unextracted messages + messages = [ + MemoryMessage( + id="msg-1", + role="user", + content="Test message for extraction", + discrete_memory_extracted="f", # Unextracted + ), + ] + + working_memory = WorkingMemory( + session_id=session_id, + user_id=user_id, + namespace=namespace, + messages=messages, + memories=[], # No existing memories + ) + + await set_working_memory(working_memory, redis_client=async_redis_client) + + # Mock the extraction functions to return a test memory + mock_extracted_memory = MemoryRecord( + id="extracted-1", + text="Extracted memory from conversation", + session_id=session_id, + user_id=user_id, + namespace=namespace, + memory_type="episodic", # Use valid enum value + ) + + with ( + patch( + "agent_memory_server.long_term_memory.should_extract_session_thread", + return_value=True, + ), + patch( + "agent_memory_server.long_term_memory.extract_memories_from_session_thread", + return_value=[mock_extracted_memory], + ), + ): + promoted_count = await promote_working_memory_to_long_term( + session_id=session_id, + user_id=user_id, + namespace=namespace, + redis_client=async_redis_client, + ) + + # Should have promoted the extracted memory + assert promoted_count == 1 + + # Verify the working memory was updated with extraction status + updated_wm = await get_working_memory( + session_id=session_id, + user_id=user_id, + namespace=namespace, + redis_client=async_redis_client, + ) + + assert updated_wm is not None + # Message should be marked as extracted + assert updated_wm.messages[0].discrete_memory_extracted == "t" + # Extracted memory should be in working memory (now with persisted_at set) + assert len(updated_wm.memories) == 1 + assert updated_wm.memories[0].id == "extracted-1" + assert updated_wm.memories[0].persisted_at is not None + + finally: + settings.enable_discrete_memory_extraction = original_setting + + @pytest.mark.asyncio + async def test_working_memory_updated_when_messages_marked_extracted( + self, async_redis_client + ): + """Test that working memory is updated even when no memories are extracted but messages are marked""" + from agent_memory_server.config import settings + + # Enable extraction + original_setting = settings.enable_discrete_memory_extraction + 
settings.enable_discrete_memory_extraction = True + + try: + session_id = "test-extraction-marking" + user_id = "test-user" + namespace = "test" + + # Create working memory with unextracted messages + messages = [ + MemoryMessage( + id="msg-1", + role="user", + content="Test message for marking", + discrete_memory_extracted="f", # Unextracted + ), + ] + + working_memory = WorkingMemory( + session_id=session_id, + user_id=user_id, + namespace=namespace, + messages=messages, + memories=[], # No existing memories + ) + + await set_working_memory(working_memory, redis_client=async_redis_client) + + # Mock extraction to return no memories but still trigger marking + with ( + patch( + "agent_memory_server.long_term_memory.should_extract_session_thread", + return_value=True, + ), + patch( + "agent_memory_server.long_term_memory.extract_memories_from_session_thread", + return_value=[], + ), + ): # No extracted memories + promoted_count = await promote_working_memory_to_long_term( + session_id=session_id, + user_id=user_id, + namespace=namespace, + redis_client=async_redis_client, + ) + + # Should have promoted 0 memories + assert promoted_count == 0 + + # But working memory should still be updated with extraction status + updated_wm = await get_working_memory( + session_id=session_id, + user_id=user_id, + namespace=namespace, + redis_client=async_redis_client, + ) + + assert updated_wm is not None + # Message should be marked as extracted even though no memories were extracted + assert updated_wm.messages[0].discrete_memory_extracted == "t" + + finally: + settings.enable_discrete_memory_extraction = original_setting + + @pytest.mark.asyncio + async def test_no_extraction_when_debounced(self, async_redis_client): + """Test that extraction is skipped when debounced and extracted_memories is still defined""" + from agent_memory_server.config import settings + + # Enable extraction + original_setting = settings.enable_discrete_memory_extraction + settings.enable_discrete_memory_extraction = True + + try: + session_id = "test-extraction-debounced" + user_id = "test-user" + namespace = "test" + + # Create working memory with unextracted messages + messages = [ + MemoryMessage( + id="msg-1", + role="user", + content="Test message for debouncing", + discrete_memory_extracted="f", # Unextracted + ), + ] + + working_memory = WorkingMemory( + session_id=session_id, + user_id=user_id, + namespace=namespace, + messages=messages, + memories=[], + ) + + await set_working_memory(working_memory, redis_client=async_redis_client) + + # Mock extraction to be debounced (should_extract returns False) + with patch( + "agent_memory_server.long_term_memory.should_extract_session_thread", + return_value=False, + ): + promoted_count = await promote_working_memory_to_long_term( + session_id=session_id, + user_id=user_id, + namespace=namespace, + redis_client=async_redis_client, + ) + + # Should complete without error (extracted_memories should be defined as empty list) + assert promoted_count == 0 + + # Working memory should not be updated since nothing changed + updated_wm = await get_working_memory( + session_id=session_id, + user_id=user_id, + namespace=namespace, + redis_client=async_redis_client, + ) + + assert updated_wm is not None + # Message should still be marked as unextracted + assert updated_wm.messages[0].discrete_memory_extracted == "f" + + finally: + settings.enable_discrete_memory_extraction = original_setting diff --git a/tests/test_llm_judge_evaluation.py b/tests/test_llm_judge_evaluation.py index 
dde557f..11f2528 100644 --- a/tests/test_llm_judge_evaluation.py +++ b/tests/test_llm_judge_evaluation.py @@ -369,6 +369,7 @@ async def test_judge_spatial_grounding_evaluation(self): assert evaluation["spatial_grounding_score"] >= 0.7 assert evaluation["overall_score"] >= 0.6 + @pytest.mark.skip(reason="Flaky test - LLM judge evaluation can be inconsistent") async def test_judge_comprehensive_grounding_evaluation(self): """Test LLM judge on complex example with multiple grounding types""" diff --git a/tests/test_long_term_memory.py b/tests/test_long_term_memory.py index 908c80d..0a723e3 100644 --- a/tests/test_long_term_memory.py +++ b/tests/test_long_term_memory.py @@ -1031,8 +1031,19 @@ async def test_search_passes_all_parameters_correctly( """Test that all search parameters are passed correctly to the adapter.""" # Mock the vectorstore adapter mock_adapter = AsyncMock() + # Return some results to avoid fallback behavior when distance_threshold is set mock_adapter.search_memories.return_value = MemoryRecordResults( - total=0, memories=[] + total=1, + memories=[ + MemoryRecordResult( + id="test-id", + text="test memory", + session_id="test-session", + user_id="test-user", + namespace="test-namespace", + dist=0.1, # Required field for MemoryRecordResult + ) + ], ) mock_get_adapter.return_value = mock_adapter diff --git a/tests/test_recent_messages_limit.py b/tests/test_recent_messages_limit.py new file mode 100644 index 0000000..1a9e04b --- /dev/null +++ b/tests/test_recent_messages_limit.py @@ -0,0 +1,514 @@ +""" +Tests for recent messages limit functionality. +""" + +from datetime import UTC, datetime + +import pytest + +from agent_memory_server.models import MemoryMessage, WorkingMemory +from agent_memory_server.working_memory import get_working_memory, set_working_memory + + +class TestRecentMessagesLimit: + """Test recent messages limit functionality""" + + @pytest.mark.asyncio + async def test_recent_messages_limit_with_working_memory(self, async_redis_client): + """Test recent messages limit with existing working memory using JSONPath""" + session_id = "test-limit-session" + user_id = "test-user" + namespace = "test" + + # Create working memory with many messages + messages = [] + for i in range(10): + messages.append( + MemoryMessage( + id=f"msg-{i}", + role="user" if i % 2 == 0 else "assistant", + content=f"Message {i}: This is message number {i}", + ) + ) + + working_memory = WorkingMemory( + session_id=session_id, + user_id=user_id, + namespace=namespace, + messages=messages, + ) + + # Store the working memory + await set_working_memory(working_memory, redis_client=async_redis_client) + + # Test: Get with recent_messages_limit=3 + result = await get_working_memory( + session_id=session_id, + user_id=user_id, + namespace=namespace, + redis_client=async_redis_client, + recent_messages_limit=3, + ) + + assert result is not None + assert len(result.messages) == 3 + + # Should get the last 3 messages (messages 7, 8, 9) + assert result.messages[0].content == "Message 7: This is message number 7" + assert result.messages[1].content == "Message 8: This is message number 8" + assert result.messages[2].content == "Message 9: This is message number 9" + + # Test: Get with recent_messages_limit=5 + result = await get_working_memory( + session_id=session_id, + user_id=user_id, + namespace=namespace, + redis_client=async_redis_client, + recent_messages_limit=5, + ) + + assert result is not None + assert len(result.messages) == 5 + + # Should get the last 5 messages (messages 5, 6, 7, 8, 9) + 
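+        # Per the docstring, the limit is applied server-side via JSONPath;
+        # presumably something like a $.messages[-5:] slice over the stored
+        # JSON (an assumption about the implementation, not verified here).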
assert result.messages[0].content == "Message 5: This is message number 5" + assert result.messages[4].content == "Message 9: This is message number 9" + + # Test: Get without limit (should get all messages) + result = await get_working_memory( + session_id=session_id, + user_id=user_id, + namespace=namespace, + redis_client=async_redis_client, + ) + + assert result is not None + assert len(result.messages) == 10 + assert result.messages[0].content == "Message 0: This is message number 0" + assert result.messages[9].content == "Message 9: This is message number 9" + + @pytest.mark.asyncio + async def test_recent_messages_limit_larger_than_available( + self, async_redis_client + ): + """Test recent messages limit when limit is larger than available messages""" + session_id = "test-limit-large" + user_id = "test-user" + namespace = "test" + + # Create working memory with only 3 messages + messages = [] + for i in range(3): + messages.append( + MemoryMessage( + id=f"msg-{i}", + role="user" if i % 2 == 0 else "assistant", + content=f"Message {i}", + ) + ) + + working_memory = WorkingMemory( + session_id=session_id, + user_id=user_id, + namespace=namespace, + messages=messages, + ) + + await set_working_memory(working_memory, redis_client=async_redis_client) + + # Test: Get with recent_messages_limit=10 (larger than available) + result = await get_working_memory( + session_id=session_id, + user_id=user_id, + namespace=namespace, + redis_client=async_redis_client, + recent_messages_limit=10, + ) + + assert result is not None + assert len(result.messages) == 3 # Should return all available messages + assert result.messages[0].content == "Message 0" + assert result.messages[2].content == "Message 2" + + @pytest.mark.asyncio + async def test_recent_messages_limit_zero_and_negative(self, async_redis_client): + """Test recent messages limit with zero and negative values""" + session_id = "test-limit-edge" + user_id = "test-user" + namespace = "test" + + # Create working memory with messages + messages = [] + for i in range(5): + messages.append( + MemoryMessage( + id=f"msg-{i}", + role="user" if i % 2 == 0 else "assistant", + content=f"Message {i}", + ) + ) + + working_memory = WorkingMemory( + session_id=session_id, + user_id=user_id, + namespace=namespace, + messages=messages, + ) + + await set_working_memory(working_memory, redis_client=async_redis_client) + + # Test: Get with recent_messages_limit=0 (should return all messages) + result = await get_working_memory( + session_id=session_id, + user_id=user_id, + namespace=namespace, + redis_client=async_redis_client, + recent_messages_limit=0, + ) + + assert result is not None + assert len(result.messages) == 5 # Should return all messages when limit is 0 + + # Test: Get with recent_messages_limit=-1 (should return all messages) + result = await get_working_memory( + session_id=session_id, + user_id=user_id, + namespace=namespace, + redis_client=async_redis_client, + recent_messages_limit=-1, + ) + + assert result is not None + assert ( + len(result.messages) == 5 + ) # Should return all messages when limit is negative + + @pytest.mark.asyncio + async def test_recent_messages_limit_with_reconstruction(self, async_redis_client): + """Test recent messages limit with reconstruction from long-term memory""" + from agent_memory_server.config import settings + from agent_memory_server.long_term_memory import index_long_term_memories + from agent_memory_server.models import MemoryRecord + + # Enable message indexing + original_setting = 
settings.index_all_messages_in_long_term_memory + settings.index_all_messages_in_long_term_memory = True + + try: + session_id = "test-limit-reconstruction" + user_id = "test-user" + namespace = "test" + + # Create message memories in long-term storage + now = datetime.now(UTC) + message_memories = [] + for msg_idx in range(8): + message_memories.append( + MemoryRecord( + id=f"lt-msg-{msg_idx}", + text=f"{'user' if msg_idx % 2 == 0 else 'assistant'}: Long-term message {msg_idx}", + memory_type="message", + session_id=session_id, + user_id=user_id, + namespace=namespace, + persisted_at=now, + ) + ) + + # Index messages in long-term memory + await index_long_term_memories( + message_memories, + redis_client=async_redis_client, + deduplicate=False, + ) + + # Test: Reconstruct with recent_messages_limit=3 + result = await get_working_memory( + session_id=session_id, + user_id=user_id, + namespace=namespace, + redis_client=async_redis_client, + recent_messages_limit=3, + ) + + assert result is not None + assert len(result.messages) <= 3 # Should limit to 3 messages + + # Messages should be in chronological order (oldest first) + for _i, msg in enumerate(result.messages): + assert "Long-term message" in msg.content + + finally: + settings.index_all_messages_in_long_term_memory = original_setting + + @pytest.mark.asyncio + async def test_recent_messages_limit_preserves_other_data(self, async_redis_client): + """Test that recent messages limit doesn't affect other working memory data""" + session_id = "test-limit-preserve" + user_id = "test-user" + namespace = "test" + + # Create working memory with messages and other data + messages = [] + for i in range(5): + messages.append( + MemoryMessage( + id=f"msg-{i}", + role="user" if i % 2 == 0 else "assistant", + content=f"Message {i}", + ) + ) + + working_memory = WorkingMemory( + session_id=session_id, + user_id=user_id, + namespace=namespace, + messages=messages, + context="This is the context", + data={"key": "value", "setting": "test"}, + memories=[], + ) + + await set_working_memory(working_memory, redis_client=async_redis_client) + + # Test: Get with recent_messages_limit=2 + result = await get_working_memory( + session_id=session_id, + user_id=user_id, + namespace=namespace, + redis_client=async_redis_client, + recent_messages_limit=2, + ) + + assert result is not None + assert len(result.messages) == 2 # Limited messages + + # Other data should be preserved + assert result.context == "This is the context" + assert result.data == {"key": "value", "setting": "test"} + assert result.memories == [] + assert result.session_id == session_id + assert result.user_id == user_id + assert result.namespace == namespace + + @pytest.mark.asyncio + async def test_working_memory_takes_precedence_over_long_term( + self, async_redis_client + ): + """Test that working memory is used instead of long-term memory when both exist""" + from datetime import UTC, datetime + + from agent_memory_server.config import settings + from agent_memory_server.long_term_memory import index_long_term_memories + from agent_memory_server.models import MemoryRecord + + # Enable message indexing + original_setting = settings.index_all_messages_in_long_term_memory + settings.index_all_messages_in_long_term_memory = True + + try: + session_id = "test-precedence" + user_id = "test-user" + namespace = "test" + + # First, create long-term memories + now = datetime.now(UTC) + lt_memories = [] + for i in range(3): + lt_memories.append( + MemoryRecord( + id=f"lt-msg-{i}", + text=f"{'user' if 
i % 2 == 0 else 'assistant'}: Long-term message {i}", + memory_type="message", + session_id=session_id, + user_id=user_id, + namespace=namespace, + persisted_at=now, + ) + ) + + await index_long_term_memories( + lt_memories, + redis_client=async_redis_client, + deduplicate=False, + ) + + # Now create working memory with different messages + wm_messages = [] + for i in range(2): + wm_messages.append( + MemoryMessage( + id=f"wm-msg-{i}", + role="user" if i % 2 == 0 else "assistant", + content=f"Working memory message {i}", + ) + ) + + working_memory = WorkingMemory( + session_id=session_id, + user_id=user_id, + namespace=namespace, + messages=wm_messages, + ) + + await set_working_memory(working_memory, redis_client=async_redis_client) + + # Test: Get working memory - should return working memory, not long-term + result = await get_working_memory( + session_id=session_id, + user_id=user_id, + namespace=namespace, + redis_client=async_redis_client, + recent_messages_limit=1, + ) + + assert result is not None + assert len(result.messages) == 1 + # Should be from working memory, not long-term memory + assert "Working memory message" in result.messages[0].content + assert "Long-term message" not in result.messages[0].content + + finally: + settings.index_all_messages_in_long_term_memory = original_setting + + @pytest.mark.asyncio + async def test_recent_messages_limit_respects_created_at_order( + self, async_redis_client + ): + """Test that recent messages limit uses created_at for proper chronological ordering""" + from datetime import UTC, datetime, timedelta + + session_id = "test-created-at-order" + user_id = "test-user" + namespace = "test" + + # Create messages with specific created_at timestamps (out of order) + base_time = datetime.now(UTC) + messages = [ + MemoryMessage( + id="msg-1", + role="user", + content="First message (oldest)", + created_at=base_time - timedelta(minutes=10), + ), + MemoryMessage( + id="msg-3", + role="user", + content="Third message (newest)", + created_at=base_time, + ), + MemoryMessage( + id="msg-2", + role="assistant", + content="Second message (middle)", + created_at=base_time - timedelta(minutes=5), + ), + ] + + working_memory = WorkingMemory( + session_id=session_id, + user_id=user_id, + namespace=namespace, + messages=messages, # Stored in non-chronological order + ) + + await set_working_memory(working_memory, redis_client=async_redis_client) + + # Test: Get with recent_messages_limit=2 (should get the 2 most recent by created_at) + result = await get_working_memory( + session_id=session_id, + user_id=user_id, + namespace=namespace, + redis_client=async_redis_client, + recent_messages_limit=2, + ) + + assert result is not None + assert len(result.messages) == 2 + + # Should get messages in chronological order (oldest first) + # The 2 most recent should be msg-2 and msg-3 + assert result.messages[0].content == "Second message (middle)" + assert result.messages[1].content == "Third message (newest)" + + # Verify the timestamps are in correct order + assert result.messages[0].created_at < result.messages[1].created_at + + @pytest.mark.asyncio + async def test_message_persistence_sets_correct_memory_type( + self, async_redis_client + ): + """Test that messages persisted to long-term storage have memory_type='message'""" + from agent_memory_server.config import settings + from agent_memory_server.filters import MemoryType, SessionId + from agent_memory_server.long_term_memory import ( + promote_working_memory_to_long_term, + search_long_term_memories, + ) + + 
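+        # This test pins down two promotion behaviors asserted below: persisted
+        # messages carry memory_type="message", and their text is serialized as
+        # "role: content" (hence the ': ' substring check at the end).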
# Enable message indexing + original_setting = settings.index_all_messages_in_long_term_memory + settings.index_all_messages_in_long_term_memory = True + + try: + session_id = "test-message-type" + user_id = "test-user" + namespace = "test" + + # Create working memory with messages + messages = [ + MemoryMessage( + id="msg-1", + role="user", + content="Test message for memory type verification", + ), + MemoryMessage( + id="msg-2", + role="assistant", + content="Response message for memory type verification", + ), + ] + + working_memory = WorkingMemory( + session_id=session_id, + user_id=user_id, + namespace=namespace, + messages=messages, + ) + + await set_working_memory(working_memory, redis_client=async_redis_client) + + # Promote messages to long-term storage + promoted_count = await promote_working_memory_to_long_term( + session_id=session_id, + user_id=user_id, + namespace=namespace, + redis_client=async_redis_client, + ) + + assert ( + promoted_count >= 2 + ) # At least both messages should be promoted (may include extracted memories) + + # Search for the persisted messages + results = await search_long_term_memories( + text="", # Empty query to get all + session_id=SessionId(eq=session_id), + memory_type=MemoryType(eq="message"), + limit=10, + offset=0, + ) + + assert len(results.memories) == 2 # Should have exactly 2 message memories + + # Verify both messages have the correct memory type + for memory in results.memories: + assert memory.memory_type == "message" + assert memory.session_id == session_id + assert memory.user_id == user_id + assert memory.namespace == namespace + # Verify the text format is "role: content" + assert ": " in memory.text + + finally: + settings.index_all_messages_in_long_term_memory = original_setting diff --git a/tests/test_thread_aware_grounding.py b/tests/test_thread_aware_grounding.py index a1e790b..b4bd00a 100644 --- a/tests/test_thread_aware_grounding.py +++ b/tests/test_thread_aware_grounding.py @@ -208,6 +208,9 @@ async def test_empty_conversation_handling(self): # Should return empty list without errors assert extracted_memories == [] + @pytest.mark.skip( + reason="Flaky test - LLM extraction behavior is non-deterministic" + ) @pytest.mark.requires_api_keys async def test_multi_entity_conversation(self): """Test contextual grounding with multiple entities in conversation.""" diff --git a/tests/test_working_memory_reconstruction.py b/tests/test_working_memory_reconstruction.py new file mode 100644 index 0000000..3991044 --- /dev/null +++ b/tests/test_working_memory_reconstruction.py @@ -0,0 +1,272 @@ +""" +Tests for working memory reconstruction from long-term memory. 
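+
+These tests toggle settings.index_all_messages_in_long_term_memory explicitly:
+reconstruction is only attempted when that setting is enabled and no working
+memory already exists for the session.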
+""" + +from datetime import UTC, datetime + +import pytest + +from agent_memory_server.config import settings +from agent_memory_server.long_term_memory import index_long_term_memories +from agent_memory_server.models import MemoryMessage, MemoryRecord, WorkingMemory +from agent_memory_server.working_memory import get_working_memory, set_working_memory + + +class TestWorkingMemoryReconstruction: + """Test working memory reconstruction from long-term storage""" + + @pytest.mark.asyncio + async def test_reconstruction_disabled_by_default(self, async_redis_client): + """Test that reconstruction doesn't happen when index_all_messages_in_long_term_memory is False""" + # Ensure the setting is disabled + original_setting = settings.index_all_messages_in_long_term_memory + settings.index_all_messages_in_long_term_memory = False + + try: + # Try to get non-existent working memory + result = await get_working_memory( + session_id="nonexistent-session", + user_id="test-user", + namespace="test", + redis_client=async_redis_client, + ) + + # Should return None since reconstruction is disabled + assert result is None + + finally: + settings.index_all_messages_in_long_term_memory = original_setting + + @pytest.mark.asyncio + async def test_reconstruction_with_no_messages(self, async_redis_client): + """Test reconstruction when no messages exist in long-term memory""" + # Enable the setting + original_setting = settings.index_all_messages_in_long_term_memory + settings.index_all_messages_in_long_term_memory = True + + try: + # Try to get non-existent working memory with no messages in long-term + result = await get_working_memory( + session_id="empty-session", + user_id="test-user", + namespace="test", + redis_client=async_redis_client, + ) + + # Should return None since no messages found + assert result is None + + finally: + settings.index_all_messages_in_long_term_memory = original_setting + + @pytest.mark.asyncio + async def test_reconstruction_with_messages(self, async_redis_client): + """Test successful reconstruction from messages in long-term memory""" + # Enable the setting + original_setting = settings.index_all_messages_in_long_term_memory + settings.index_all_messages_in_long_term_memory = True + + try: + session_id = "test-reconstruction-session" + user_id = "test-user" + namespace = "test" + + # Create message-type memory records (simulating what would be stored) + now = datetime.now(UTC) + message_memories = [ + MemoryRecord( + id="msg-1", + text="user: Hello, how are you?", + memory_type="message", + session_id=session_id, + user_id=user_id, + namespace=namespace, + persisted_at=now, + ), + MemoryRecord( + id="msg-2", + text="assistant: I'm doing well, thank you for asking!", + memory_type="message", + session_id=session_id, + user_id=user_id, + namespace=namespace, + persisted_at=now, + ), + MemoryRecord( + id="msg-3", + text="user: Can you help me with something?", + memory_type="message", + session_id=session_id, + user_id=user_id, + namespace=namespace, + persisted_at=now, + ), + ] + + # Index these messages in long-term memory + await index_long_term_memories( + message_memories, + redis_client=async_redis_client, + deduplicate=False, + ) + + # Now try to get working memory - should reconstruct from long-term + result = await get_working_memory( + session_id=session_id, + user_id=user_id, + namespace=namespace, + redis_client=async_redis_client, + ) + + # Should successfully reconstruct + assert result is not None + assert result.session_id == session_id + assert result.user_id == 
user_id + assert result.namespace == namespace + assert len(result.messages) == 3 + + # Check that all expected messages are present (order might vary) + message_contents = [msg.content for msg in result.messages] + message_ids = [msg.id for msg in result.messages] + + assert "Hello, how are you?" in message_contents + assert "I'm doing well, thank you for asking!" in message_contents + assert "Can you help me with something?" in message_contents + + assert "msg-1" in message_ids + assert "msg-2" in message_ids + assert "msg-3" in message_ids + + # All messages should have persisted_at set + for msg in result.messages: + assert msg.persisted_at is not None + + # Should have empty memories, context, and data + assert result.memories == [] + assert result.context == "" + assert result.data == {} + + finally: + settings.index_all_messages_in_long_term_memory = original_setting + + @pytest.mark.asyncio + async def test_reconstruction_ignores_existing_working_memory( + self, async_redis_client + ): + """Test that reconstruction doesn't happen if working memory already exists""" + # Enable the setting + original_setting = settings.index_all_messages_in_long_term_memory + settings.index_all_messages_in_long_term_memory = True + + try: + session_id = "existing-session" + user_id = "test-user" + namespace = "test" + + # Create existing working memory + existing_memory = WorkingMemory( + session_id=session_id, + user_id=user_id, + namespace=namespace, + messages=[ + MemoryMessage( + id="existing-msg", + role="user", + content="This is existing content", + ) + ], + ) + + # Store the existing working memory + await set_working_memory(existing_memory, redis_client=async_redis_client) + + # Create different messages in long-term memory + message_memories = [ + MemoryRecord( + id="lt-msg-1", + text="user: This is from long-term", + memory_type="message", + session_id=session_id, + user_id=user_id, + namespace=namespace, + persisted_at=datetime.now(UTC), + ), + ] + + await index_long_term_memories( + message_memories, + redis_client=async_redis_client, + deduplicate=False, + ) + + # Get working memory - should return existing, not reconstruct + result = await get_working_memory( + session_id=session_id, + user_id=user_id, + namespace=namespace, + redis_client=async_redis_client, + ) + + # Should return existing working memory, not reconstructed + assert result is not None + assert len(result.messages) == 1 + assert result.messages[0].content == "This is existing content" + assert result.messages[0].id == "existing-msg" + + finally: + settings.index_all_messages_in_long_term_memory = original_setting + + @pytest.mark.asyncio + async def test_reconstruction_with_malformed_messages(self, async_redis_client): + """Test reconstruction handles malformed message memories gracefully""" + # Enable the setting + original_setting = settings.index_all_messages_in_long_term_memory + settings.index_all_messages_in_long_term_memory = True + + try: + session_id = "malformed-session" + user_id = "test-user" + namespace = "test" + + # Create mix of valid and malformed message memories + message_memories = [ + MemoryRecord( + id="valid-msg", + text="user: This is valid", + memory_type="message", + session_id=session_id, + user_id=user_id, + namespace=namespace, + persisted_at=datetime.now(UTC), + ), + MemoryRecord( + id="malformed-msg", + text="This has no role separator", # Missing ": " + memory_type="message", + session_id=session_id, + user_id=user_id, + namespace=namespace, + persisted_at=datetime.now(UTC), + ), + ] + + 
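+            # Reconstruction presumably splits each stored text on the first
+            # ": " to recover (role, content); a record with no separator
+            # cannot be parsed into a message and is expected to be dropped
+            # (see the assertions below).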
await index_long_term_memories( + message_memories, + redis_client=async_redis_client, + deduplicate=False, + ) + + # Should reconstruct with only valid messages + result = await get_working_memory( + session_id=session_id, + user_id=user_id, + namespace=namespace, + redis_client=async_redis_client, + ) + + assert result is not None + assert len(result.messages) == 1 # Only the valid message + assert result.messages[0].content == "This is valid" + + finally: + settings.index_all_messages_in_long_term_memory = original_setting
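Several of the new tests (message persistence, reconstruction, and the malformed-message case above) hinge on one serialization convention: a message is persisted to long-term memory as "role: content" and recovered by splitting on the first ": ". Below is a minimal sketch of that round trip under that assumption; serialize_message and parse_message_text are hypothetical names for illustration, not functions exported by agent_memory_server.

# Sketch of the assumed "role: content" round trip. serialize_message and
# parse_message_text are hypothetical illustrations, not the server's API.


def serialize_message(role: str, content: str) -> str:
    # How a working-memory message would appear as long-term MemoryRecord.text.
    return f"{role}: {content}"


def parse_message_text(text: str) -> tuple[str, str] | None:
    # Split on the first ": " only, so content that itself contains ": "
    # survives the round trip intact.
    role, sep, content = text.partition(": ")
    if not sep or not role:
        # The "malformed" case: no separator, so no message can be recovered.
        return None
    return role, content


if __name__ == "__main__":
    assert parse_message_text(serialize_message("user", "Hello, how are you?")) == (
        "user",
        "Hello, how are you?",
    )
    # Content containing ": " is preserved because only the first split counts.
    assert parse_message_text("assistant: Note: details follow") == (
        "assistant",
        "Note: details follow",
    )
    # Matches test_reconstruction_with_malformed_messages above.
    assert parse_message_text("This has no role separator") is None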