From e0fa596a25b1a8276ccfb78790e472d5a6a010c3 Mon Sep 17 00:00:00 2001
From: Mengxin Zhu <843303+zxkane@users.noreply.github.com>
Date: Tue, 23 Sep 2025 14:43:13 +0800
Subject: [PATCH] =?UTF-8?q?=F0=9F=90=9B=20fix:=20implement=20proper=20time?=
 =?UTF-8?q?out=20handling=20for=20MCP=20client=20tool=20calls?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add timeout support to call_tool_sync and call_tool_async methods
- Only apply timeout when read_timeout_seconds is explicitly provided to maintain backward compatibility
- Handle TimeoutError exceptions with proper error responses
- Add comprehensive tests for timeout scenarios and backward compatibility
---
 src/strands/tools/mcp/mcp_client.py        |  35 ++++-
 tests/strands/tools/mcp/test_mcp_client.py | 170 +++++++++++++++++++++
 2 files changed, 203 insertions(+), 2 deletions(-)

diff --git a/src/strands/tools/mcp/mcp_client.py b/src/strands/tools/mcp/mcp_client.py
index 96e80385f..e58244ad4 100644
--- a/src/strands/tools/mcp/mcp_client.py
+++ b/src/strands/tools/mcp/mcp_client.py
@@ -290,8 +290,23 @@ async def _call_tool_async() -> MCPCallToolResult:
             )
 
         try:
-            call_tool_result: MCPCallToolResult = self._invoke_on_background_thread(_call_tool_async()).result()
+            future = self._invoke_on_background_thread(_call_tool_async())
+
+            # Only apply timeout if explicitly provided by user to maintain backward compatibility
+            if read_timeout_seconds is not None:
+                timeout_seconds = read_timeout_seconds.total_seconds()
+                call_tool_result = future.result(timeout=timeout_seconds)
+            else:
+                # No timeout - preserve original behavior for existing customers
+                call_tool_result = future.result()
+
             return self._handle_tool_result(tool_use_id, call_tool_result)
+        except futures.TimeoutError:
+            timeout_seconds = read_timeout_seconds.total_seconds() if read_timeout_seconds else 0
+            logger.exception("tool execution timed out after %s seconds", timeout_seconds)
+            return self._handle_tool_execution_error(
+                tool_use_id, Exception(f"Tool call timed out after {timeout_seconds} seconds")
+            )
         except Exception as e:
             logger.exception("tool execution failed")
             return self._handle_tool_execution_error(tool_use_id, e)
@@ -328,8 +343,24 @@ async def _call_tool_async() -> MCPCallToolResult:
 
         try:
             future = self._invoke_on_background_thread(_call_tool_async())
-            call_tool_result: MCPCallToolResult = await asyncio.wrap_future(future)
+
+            # Only apply timeout if explicitly provided by user to maintain backward compatibility
+            if read_timeout_seconds is not None:
+                timeout_seconds = read_timeout_seconds.total_seconds()
+                call_tool_result = await asyncio.wait_for(
+                    asyncio.wrap_future(future), timeout=timeout_seconds
+                )
+            else:
+                # No timeout - preserve original behavior for existing customers
+                call_tool_result = await asyncio.wrap_future(future)
+
             return self._handle_tool_result(tool_use_id, call_tool_result)
+        except asyncio.TimeoutError:
+            timeout_seconds = read_timeout_seconds.total_seconds() if read_timeout_seconds else 0
+            logger.exception("async tool execution timed out after %s seconds", timeout_seconds)
+            return self._handle_tool_execution_error(
+                tool_use_id, Exception(f"Tool call timed out after {timeout_seconds} seconds")
+            )
         except Exception as e:
             logger.exception("tool execution failed")
             return self._handle_tool_execution_error(tool_use_id, e)
diff --git a/tests/strands/tools/mcp/test_mcp_client.py b/tests/strands/tools/mcp/test_mcp_client.py
index 67d8fe558..dd854a1aa 100644
--- a/tests/strands/tools/mcp/test_mcp_client.py
+++ b/tests/strands/tools/mcp/test_mcp_client.py
@@ -541,3 +541,173 @@ def slow_transport():
     assert client._background_thread_session is None
     assert client._background_thread_event_loop is None
     assert not client._init_future.done()  # New future created
+
+
+def test_call_tool_sync_server_jsonrpc_error_response(mock_transport, mock_session):
+    """Test that call_tool_sync handles JSON-RPC error responses from server without hanging.
+
+    This reproduces the bug where the MCP client hangs when the remote streamable HTTP server
+    responds with a JSON-RPC error instead of a successful tool result.
+    """
+    import time
+    from datetime import timedelta
+
+    # Simulate a JSON-RPC error by making the call_tool method hang indefinitely
+    # This mimics what happens when the MCP SDK receives a JSON-RPC error response
+    def hanging_call_tool(*args, **kwargs):
+        # Simulate the hanging behavior by sleeping indefinitely
+        time.sleep(10)  # This will cause a timeout in our test
+
+    mock_session.call_tool = hanging_call_tool
+
+    with MCPClient(mock_transport["transport_callable"]) as client:
+        # Test with a very short timeout to verify the fix
+        start_time = time.time()
+
+        # This should not hang indefinitely, should timeout after 2 seconds
+        result = client.call_tool_sync(
+            tool_use_id="test-hanging-123",
+            name="test_tool",
+            arguments={"param": "value"},
+            read_timeout_seconds=timedelta(seconds=2)  # Short timeout for test
+        )
+
+        elapsed_time = time.time() - start_time
+
+        # The call should complete quickly (within 3 seconds) due to timeout handling
+        # and return a timeout error result rather than hanging for 10+ seconds
+        assert elapsed_time < 4, f"Tool call took {elapsed_time} seconds, suggesting it hung"
+        assert result["status"] == "error"
+        assert result["toolUseId"] == "test-hanging-123"
+        assert len(result["content"]) == 1
+        assert ("timed out" in result["content"][0]["text"] or
+                "Tool execution failed" in result["content"][0]["text"])
+
+
+@pytest.mark.asyncio
+async def test_call_tool_async_server_jsonrpc_error_response(mock_transport, mock_session):
+    """Test that call_tool_async handles JSON-RPC error responses from server without hanging.
+
+    This reproduces the bug where the MCP client hangs when the remote streamable HTTP server
+    responds with a JSON-RPC error instead of a successful tool result.
+    """
+    import asyncio
+    import time
+    from datetime import timedelta
+
+    # Simulate a JSON-RPC error by making the call_tool method hang indefinitely
+    async def hanging_call_tool(*args, **kwargs):
+        # Simulate the hanging behavior
+        await asyncio.sleep(10)  # This will cause a timeout in our test
+
+    mock_session.call_tool = hanging_call_tool
+
+    with MCPClient(mock_transport["transport_callable"]) as client:
+        start_time = time.time()
+
+        # This should not hang indefinitely, should timeout after 2 seconds
+        result = await client.call_tool_async(
+            tool_use_id="test-async-hanging-123",
+            name="test_tool",
+            arguments={"param": "value"},
+            read_timeout_seconds=timedelta(seconds=2)  # Short timeout for test
+        )
+
+        elapsed_time = time.time() - start_time
+
+        # The call should complete quickly (within 3 seconds) due to timeout handling
+        # and return a timeout error result rather than hanging for 10+ seconds
+        assert elapsed_time < 4, f"Async tool call took {elapsed_time} seconds, suggesting it hung"
+        assert result["status"] == "error"
+        assert result["toolUseId"] == "test-async-hanging-123"
+        assert len(result["content"]) == 1
+        assert ("timed out" in result["content"][0]["text"] or
+                "Tool execution failed" in result["content"][0]["text"])
+
+
+def test_call_tool_sync_no_timeout_preserves_original_behavior(mock_transport, mock_session):
+    """Test that call_tool_sync does not apply timeout when read_timeout_seconds is None.
+
+    This verifies backward compatibility - when no timeout is specified, the method should
+    behave as it did originally (no timeout applied) to avoid breaking existing customers.
+    """
+    import asyncio
+    import time
+
+    # Mock a slow but successful tool call that takes longer than the old default (60s)
+    # but should succeed since no timeout is applied
+    async def slow_but_successful_call_tool(name, arguments, read_timeout_seconds=None):
+        # Simulate a slow operation that would exceed old default timeout
+        await asyncio.sleep(0.1)  # Short delay for test performance
+        return MCPCallToolResult(
+            isError=False,
+            content=[MCPTextContent(type="text", text="Success after delay")]
+        )
+
+    mock_session.call_tool = slow_but_successful_call_tool
+
+    with MCPClient(mock_transport["transport_callable"]) as client:
+        start_time = time.time()
+
+        # Call WITHOUT timeout parameter - should not apply any timeout
+        result = client.call_tool_sync(
+            tool_use_id="test-no-timeout-123",
+            name="test_tool",
+            arguments={"param": "value"}
+            # No read_timeout_seconds parameter
+        )
+
+        elapsed_time = time.time() - start_time
+
+        # Should succeed without timeout error
+        assert result["status"] == "success"
+        assert result["toolUseId"] == "test-no-timeout-123"
+        assert len(result["content"]) == 1
+        assert result["content"][0]["text"] == "Success after delay"
+
+        # Should have taken at least the delay time, proving no premature timeout
+        assert elapsed_time >= 0.1
+
+
+@pytest.mark.asyncio
+async def test_call_tool_async_no_timeout_preserves_original_behavior(mock_transport, mock_session):
+    """Test that call_tool_async does not apply timeout when read_timeout_seconds is None.
+
+    This verifies backward compatibility - when no timeout is specified, the method should
+    behave as it did originally (no timeout applied) to avoid breaking existing customers.
+    """
+    import asyncio
+    import time
+
+    # Mock a slow but successful async tool call
+    async def slow_but_successful_call_tool(name, arguments, read_timeout_seconds=None):
+        # Simulate a slow async operation
+        await asyncio.sleep(0.1)  # Short delay for test performance
+        return MCPCallToolResult(
+            isError=False,
+            content=[MCPTextContent(type="text", text="Async success after delay")]
+        )
+
+    mock_session.call_tool = slow_but_successful_call_tool
+
+    with MCPClient(mock_transport["transport_callable"]) as client:
+        start_time = time.time()
+
+        # Call WITHOUT timeout parameter - should not apply any timeout
+        result = await client.call_tool_async(
+            tool_use_id="test-async-no-timeout-123",
+            name="test_tool",
+            arguments={"param": "value"}
+            # No read_timeout_seconds parameter
+        )
+
+        elapsed_time = time.time() - start_time
+
+        # Should succeed without timeout error
+        assert result["status"] == "success"
+        assert result["toolUseId"] == "test-async-no-timeout-123"
+        assert len(result["content"]) == 1
+        assert result["content"][0]["text"] == "Async success after delay"
+
+        # Should have taken at least the delay time, proving no premature timeout
+        assert elapsed_time >= 0.1