From d4a9bc0ad5ece78abe51732a290e41283d8c4ae1 Mon Sep 17 00:00:00 2001 From: nigel brown Date: Tue, 18 Nov 2025 11:33:57 +0000 Subject: [PATCH 01/15] fix: automatically handle container exits and reconnect clients MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes #2270 When containers exit or are removed, clients (Cursor, Cline) lose connection and the tool stops working. This PR implements automatic handling of container exits with client config updates to force reconnection. ## Changes ### Transport Layer - Updated handleContainerExit() in http.go and stdio.go to log clear warnings - Made TransparentProxy.Stop() idempotent to prevent double-close panics - Added logging to distinguish between container exit vs removal ### Runner Layer - Check if workload exists in runtime (not just status files) to determine if container was removed or just exited - If container removed: clean up client config and exit gracefully - If container exited: signal restart needed to workload manager ### Workload Manager - Automatic restart with exponential backoff (5s, 10s, 20s, 40s, 60s) - Up to 10 retry attempts (~10 minutes) - Remove from client config before restart, re-add after successful restart - This forces clients to see the config change and reconnect with fresh sessions ## Behavior **Container exits (crash)**: - Detect exit → Remove from ~/.cursor/mcp.json → Wait with backoff → Restart container → Re-add to mcp.json → Client reconnects ✅ **Container removed (docker rm / thv delete)**: - Detect removal → Remove from ~/.cursor/mcp.json → Exit gracefully → Tool vanishes from client ✅ ## Testing Tested with Cursor on macOS: - Container kill: Automatic restart and reconnection works - Container remove: Clean removal from client config --- docs/fixes/container-exit-handling.md | 104 ++++++++++++++++++ pkg/runner/runner.go | 87 ++++++++++++++- pkg/transport/http.go | 38 ++++++- .../proxy/transparent/transparent_proxy.go | 12 ++ pkg/transport/stdio.go | 33 +++++- pkg/workloads/manager.go | 73 ++++++++++-- 6 files changed, 332 insertions(+), 15 deletions(-) create mode 100644 docs/fixes/container-exit-handling.md diff --git a/docs/fixes/container-exit-handling.md b/docs/fixes/container-exit-handling.md new file mode 100644 index 000000000..eeb6cee02 --- /dev/null +++ b/docs/fixes/container-exit-handling.md @@ -0,0 +1,104 @@ +# Container Exit Handling with Automatic Restart + +## Problem + +When a container exits (crashes or is stopped), the toolhive proxy would silently lose connection. The proxy would continue running but requests would fail without clear feedback to the user about what happened or how to recover. + +## Solution + +**Simple and Automatic**: Detect container exit → Check if workload exists → Either restart or clean up + +When the container monitor detects that a container has stopped: + +1. **Log a clear warning** - `WARN: Container fetch exited` +2. **Stop the proxy cleanly** - Releases port and resources +3. **Check if workload exists** - Uses `thv ls` logic to see if workload was removed or just exited +4. **If removed** (not in `thv ls`): + - Remove from client config (updates `~/.cursor/mcp.json` etc.) + - Exit gracefully, no restart +5. 
**If still exists** (in `thv ls`): + - Remove from client config temporarily + - Automatic restart with backoff (5s, 10s, 20s, 40s, 60s) + - Re-add to client config after successful restart + - Up to 10 attempts (~10 minutes) + +## Benefits + +✅ **No silent failures** - Clear logging about what happened +✅ **Automatic recovery** - Restarts automatically without user intervention +✅ **Exponential backoff** - Smart retry delays (5s → 60s max) +✅ **Clean shutdown** - Proxy stops properly between retries +✅ **Eventually consistent** - Keeps trying for transient issues +✅ **Gives up gracefully** - After 10 attempts, stops cleanly + +## What Changed + +### Transport Layer (`pkg/transport/http.go`, `pkg/transport/stdio.go`) +- Updated `handleContainerExit()` to log at WARN level with clear messages +- Cleanly stops proxy when container exits + +### Runner Layer (`pkg/runner/runner.go`) +- When transport stops, checks if workload still exists using `DoesWorkloadExist` +- **If workload doesn't exist in `thv ls`**: Removes from client config and exits gracefully +- **If workload exists**: Returns special error "container exited, restart needed" +- This simple check determines whether to restart or clean up + +### Workload Manager (`pkg/workloads/manager.go`) +- Added retry loop with exponential backoff in `RunWorkload()` +- Retries up to 10 times with delays: 5s, 10s, 20s, 40s, 60s (capped) +- Sets workload status to "starting" during retry attempts +- **Removes from client config before each restart** - Forces clients to reconnect +- **Re-adds to client config after restart** - Runner does this automatically after starting +- Only retries on "container exited, restart needed" errors, not other failures + +## Retry Timeline + +``` +Container exits +├─ Attempt 1: Immediate restart +├─ Attempt 2: After 5s delay +├─ Attempt 3: After 10s delay +├─ Attempt 4: After 20s delay +├─ Attempt 5: After 40s delay +├─ Attempt 6: After 60s delay +├─ Attempt 7: After 60s delay +├─ Attempt 8: After 60s delay +├─ Attempt 9: After 60s delay +├─ Attempt 10: After 60s delay +└─ Give up (after ~10 minutes total) +``` + +## User Experience + +**Before:** +``` +Container exits → Proxy keeps running → Requests fail silently → Confusion +``` + +**After (container exits but still in `thv ls`):** +``` +Container exits → WARN logged → Proxy stops → Check `thv ls` → Still exists → +Remove from client config → Automatic restart (10 attempts) → +Re-add to client config → ✅ Recovered +Cursor sees config change → Reconnects with fresh session → Tool works! 
+``` + +**After (container removed with `docker rm` or `thv delete`):** +``` +Container exits → WARN logged → Proxy stops → Check `thv ls` → Doesn't exist → +Remove from client config → Exit gracefully → ✅ Clean shutdown +Cursor sees config removed → No longer shows the tool +``` + +**If container keeps crashing:** +``` +Container exits → Retries with backoff → After 10 attempts → Gives up → Status: Error +``` + +The system now: +- **Handles transient container failures automatically** with exponential backoff +- **Distinguishes between exit and removal** using `thv ls` as source of truth +- **Updates client config** to force clean reconnection (remove then re-add) +- **Gives up gracefully** on persistent issues or when workload is intentionally removed +- Clients (like Cursor) see the config changes and reconnect with new sessions + diff --git a/pkg/runner/runner.go b/pkg/runner/runner.go index 54fc3fcd4..2aa63d471 100644 --- a/pkg/runner/runner.go +++ b/pkg/runner/runner.go @@ -4,6 +4,7 @@ package runner import ( "bytes" "context" + "errors" "fmt" "net/http" "os" @@ -18,6 +19,7 @@ import ( "github.com/stacklok/toolhive/pkg/auth/remote" "github.com/stacklok/toolhive/pkg/client" "github.com/stacklok/toolhive/pkg/config" + ct "github.com/stacklok/toolhive/pkg/container" rt "github.com/stacklok/toolhive/pkg/container/runtime" "github.com/stacklok/toolhive/pkg/labels" "github.com/stacklok/toolhive/pkg/logger" @@ -383,7 +385,7 @@ func (r *Runner) Run(ctx context.Context) error { } if !running { // Transport is no longer running (container exited or was stopped) - logger.Info("Transport is no longer running, exiting...") + logger.Warn("Transport is no longer running, attempting automatic restart...") close(doneCh) return } @@ -404,8 +406,8 @@ func (r *Runner) Run(ctx context.Context) error { case sig := <-sigCh: stopMCPServer(fmt.Sprintf("Received signal %s", sig)) case <-doneCh: - // The transport has already been stopped (likely by the container monitor) - // Clean up the PID file and state + // The transport has already been stopped (likely by the container exit) + // Clean up the PID file and state // TODO: Stop writing to PID file once we migrate over to statuses. if err := process.RemovePIDFile(r.Config.BaseName); err != nil { logger.Warnf("Warning: Failed to remove PID file: %v", err) @@ -414,12 +416,89 @@ func (r *Runner) Run(ctx context.Context) error { logger.Warnf("Warning: Failed to reset workload %s PID: %v", r.Config.BaseName, err) } - logger.Infof("MCP server %s stopped", r.Config.ContainerName) + // Check if workload still exists (using workload manager to check `thv ls`) + // If it doesn't exist, it was removed - clean up client config + // If it exists, it exited unexpectedly - signal restart needed + workloadManager, err := r.getWorkloadManager(ctx) + if err == nil { + exists, checkErr := workloadManager.DoesWorkloadExist(ctx, r.Config.BaseName) + if checkErr != nil { + logger.Warnf("Warning: Failed to check if workload exists: %v", checkErr) + // Assume restart needed if we can't check + } else if !exists { + // Workload doesn't exist in `thv ls` - it was removed + logger.Infof("Workload %s no longer exists. 
Removing from client configurations.", r.Config.BaseName) + clientManager, clientErr := client.NewManager(ctx) + if clientErr == nil { + if removeErr := clientManager.RemoveServerFromClients(ctx, r.Config.ContainerName, r.Config.Group); removeErr != nil { + logger.Warnf("Warning: Failed to remove from client config: %v", removeErr) + } else { + logger.Infof("Successfully removed %s from client configurations", r.Config.ContainerName) + } + } + logger.Infof("MCP server %s stopped and cleaned up", r.Config.ContainerName) + return nil // Exit gracefully, no restart + } + } + + // Workload still exists - signal restart needed + logger.Infof("MCP server %s stopped, restart needed", r.Config.ContainerName) + return fmt.Errorf("container exited, restart needed") } return nil } +// getWorkloadManager creates a workload manager instance for checking workload existence +func (r *Runner) getWorkloadManager(ctx context.Context) (interface { + DoesWorkloadExist(ctx context.Context, workloadName string) (bool, error) +}, error) { + // Create a minimal implementation that checks via status manager + impl := &workloadExistenceChecker{statusManager: r.statusManager} + return impl, nil +} + +// workloadExistenceChecker is a minimal implementation to check if a workload exists +type workloadExistenceChecker struct { + statusManager statuses.StatusManager +} + +func (w *workloadExistenceChecker) DoesWorkloadExist(ctx context.Context, workloadName string) (bool, error) { + // Check if workload exists by trying to get it from status manager + workload, err := w.statusManager.GetWorkload(ctx, workloadName) + if err != nil { + if errors.Is(err, rt.ErrWorkloadNotFound) { + return false, nil + } + return false, fmt.Errorf("failed to check if workload exists: %w", err) + } + + // If remote workload, check if it should exist + if workload.Remote { + // For remote workloads, trust the status manager + return workload.Status != rt.WorkloadStatusError, nil + } + + // For container workloads, verify the container actually exists in the runtime + // Create a runtime instance to check if container exists + backend, err := ct.NewFactory().Create(ctx) + if err != nil { + logger.Warnf("Failed to create runtime to check container existence: %v", err) + // Fall back to status manager only + return workload.Status != rt.WorkloadStatusError, nil + } + + // Check if container is actually running in the runtime + running, err := backend.IsWorkloadRunning(ctx, workloadName) + if err != nil { + // Container doesn't exist or error checking + logger.Debugf("Container %s not found in runtime: %v", workloadName, err) + return false, nil + } + + return running, nil +} + // handleRemoteAuthentication handles authentication for remote MCP servers func (r *Runner) handleRemoteAuthentication(ctx context.Context) (oauth2.TokenSource, error) { if r.Config.RemoteAuthConfig == nil { diff --git a/pkg/transport/http.go b/pkg/transport/http.go index 90e0431ee..36956dc7f 100644 --- a/pkg/transport/http.go +++ b/pkg/transport/http.go @@ -5,6 +5,7 @@ import ( "fmt" "net/http" "net/url" + "strings" "sync" "golang.org/x/oauth2" @@ -58,6 +59,10 @@ type HTTPTransport struct { // Container monitor monitor rt.Monitor errorCh <-chan error + + // Container exit error (for determining if restart is needed) + containerExitErr error + exitErrMutex sync.Mutex } // NewHTTPTransport creates a new HTTP transport. 
@@ -273,14 +278,42 @@ func (t *HTTPTransport) handleContainerExit(ctx context.Context) { case <-ctx.Done(): return case err := <-t.errorCh: - logger.Infof("Container %s exited: %v", t.containerName, err) - // Stop the transport when the container exits + // Store the exit error so runner can check if restart is needed + t.exitErrMutex.Lock() + t.containerExitErr = err + t.exitErrMutex.Unlock() + + logger.Warnf("Container %s exited: %v", t.containerName, err) + + // Check if container was removed (not just exited) + isRemoved := err != nil && strings.Contains(err.Error(), "may have been removed") + if isRemoved { + logger.Infof("Container %s was removed. Stopping proxy and cleaning up.", t.containerName) + } else { + logger.Infof("Container %s exited. Will attempt automatic restart.", t.containerName) + } + + // Stop the transport when the container exits/removed if stopErr := t.Stop(ctx); stopErr != nil { logger.Errorf("Error stopping transport after container exit: %v", stopErr) } } } +// ShouldRestart returns true if the container exited and should be restarted. +// Returns false if the container was removed (intentionally deleted). +func (t *HTTPTransport) ShouldRestart() bool { + t.exitErrMutex.Lock() + defer t.exitErrMutex.Unlock() + + if t.containerExitErr == nil { + return false // No exit error, normal shutdown + } + + // Don't restart if container was removed + return !strings.Contains(t.containerExitErr.Error(), "may have been removed") +} + // IsRunning checks if the transport is currently running. func (t *HTTPTransport) IsRunning(_ context.Context) (bool, error) { t.mutex.Lock() @@ -294,3 +327,4 @@ func (t *HTTPTransport) IsRunning(_ context.Context) (bool, error) { return true, nil } } + diff --git a/pkg/transport/proxy/transparent/transparent_proxy.go b/pkg/transport/proxy/transparent/transparent_proxy.go index da3faa30d..67692a0e9 100644 --- a/pkg/transport/proxy/transparent/transparent_proxy.go +++ b/pkg/transport/proxy/transparent/transparent_proxy.go @@ -51,6 +51,9 @@ type TransparentProxy struct { // Mutex for protecting shared state mutex sync.Mutex + // Track if Stop() has been called + stopped bool + // Shutdown channel shutdownCh chan struct{} @@ -435,6 +438,15 @@ func (p *TransparentProxy) Stop(ctx context.Context) error { p.mutex.Lock() defer p.mutex.Unlock() + // Check if already stopped + if p.stopped { + logger.Debugf("Proxy for %s is already stopped, skipping", p.targetURI) + return nil + } + + // Mark as stopped before closing channel + p.stopped = true + // Signal shutdown close(p.shutdownCh) diff --git a/pkg/transport/stdio.go b/pkg/transport/stdio.go index d224bc387..c395fe5cf 100644 --- a/pkg/transport/stdio.go +++ b/pkg/transport/stdio.go @@ -76,6 +76,10 @@ type StdioTransport struct { // Container monitor monitor rt.Monitor + // Container exit error (for determining if restart is needed) + containerExitErr error + exitErrMutex sync.Mutex + // Retry configuration (for testing) retryConfig *retryConfig } @@ -627,7 +631,20 @@ func (t *StdioTransport) handleContainerExit(ctx context.Context) { return } - logger.Infof("Container %s exited: %v", t.containerName, err) + // Store the exit error so runner can check if restart is needed + t.exitErrMutex.Lock() + t.containerExitErr = err + t.exitErrMutex.Unlock() + + logger.Warnf("Container %s exited: %v", t.containerName, err) + + // Check if container was removed (not just exited) + isRemoved := err != nil && strings.Contains(err.Error(), "may have been removed") + if isRemoved { + logger.Infof("Container %s was 
removed. Stopping proxy and cleaning up.", t.containerName) + } else { + logger.Infof("Container %s exited. Will attempt automatic restart.", t.containerName) + } // Check if the transport is already stopped before trying to stop it select { @@ -647,3 +664,17 @@ func (t *StdioTransport) handleContainerExit(ctx context.Context) { } } } + +// ShouldRestart returns true if the container exited and should be restarted. +// Returns false if the container was removed (intentionally deleted). +func (t *StdioTransport) ShouldRestart() bool { + t.exitErrMutex.Lock() + defer t.exitErrMutex.Unlock() + + if t.containerExitErr == nil { + return false // No exit error, normal shutdown + } + + // Don't restart if container was removed + return !strings.Contains(t.containerExitErr.Error(), "may have been removed") +} diff --git a/pkg/workloads/manager.go b/pkg/workloads/manager.go index 6389a037f..caf0cdde2 100644 --- a/pkg/workloads/manager.go +++ b/pkg/workloads/manager.go @@ -390,7 +390,7 @@ func (d *DefaultManager) stopContainerWorkload(ctx context.Context, name string) return d.stopSingleContainerWorkload(ctx, &container) } -// RunWorkload runs a workload in the foreground. +// RunWorkload runs a workload in the foreground with automatic restart on container exit. func (d *DefaultManager) RunWorkload(ctx context.Context, runConfig *runner.RunConfig) error { // Ensure that the workload has a status entry before starting the process. if err := d.statuses.SetWorkloadStatus(ctx, runConfig.BaseName, rt.WorkloadStatusStarting, ""); err != nil { @@ -398,15 +398,72 @@ func (d *DefaultManager) RunWorkload(ctx context.Context, runConfig *runner.RunC return fmt.Errorf("failed to create workload status: %v", err) } - mcpRunner := runner.NewRunner(runConfig, d.statuses) - err := mcpRunner.Run(ctx) - if err != nil { - // If the run failed, we should set the status to error. - if statusErr := d.statuses.SetWorkloadStatus(ctx, runConfig.BaseName, rt.WorkloadStatusError, err.Error()); statusErr != nil { - logger.Warnf("Failed to set workload %s status to error: %v", runConfig.BaseName, statusErr) + // Retry loop with exponential backoff for container restarts + maxRetries := 10 // Allow many retries for transient issues + retryDelay := 5 * time.Second + + for attempt := 1; attempt <= maxRetries; attempt++ { + if attempt > 1 { + logger.Infof("Restart attempt %d/%d for %s after %v delay", attempt, maxRetries, runConfig.BaseName, retryDelay) + time.Sleep(retryDelay) + + // Exponential backoff: 5s, 10s, 20s, 40s, 60s (capped) + retryDelay *= 2 + if retryDelay > 60*time.Second { + retryDelay = 60 * time.Second + } } + + mcpRunner := runner.NewRunner(runConfig, d.statuses) + err := mcpRunner.Run(ctx) + + if err != nil { + // Check if this is a "container exited, restart needed" error + if err.Error() == "container exited, restart needed" { + logger.Warnf("Container %s exited unexpectedly (attempt %d/%d). 
Restarting...", + runConfig.BaseName, attempt, maxRetries) + + // Remove from client config so clients notice the restart + clientManager, clientErr := client.NewManager(ctx) + if clientErr == nil { + logger.Infof("Removing %s from client configurations before restart", runConfig.BaseName) + if removeErr := clientManager.RemoveServerFromClients(ctx, runConfig.BaseName, runConfig.Group); removeErr != nil { + logger.Warnf("Warning: Failed to remove from client config: %v", removeErr) + } + } + + // Set status to starting (since we're restarting) + if statusErr := d.statuses.SetWorkloadStatus(ctx, runConfig.BaseName, rt.WorkloadStatusStarting, "Container exited, restarting"); statusErr != nil { + logger.Warnf("Failed to set workload %s status to starting: %v", runConfig.BaseName, statusErr) + } + + // If we haven't exhausted retries, continue the loop + if attempt < maxRetries { + continue + } + + // Exhausted all retries + logger.Errorf("Failed to restart %s after %d attempts. Giving up.", runConfig.BaseName, maxRetries) + if statusErr := d.statuses.SetWorkloadStatus(ctx, runConfig.BaseName, rt.WorkloadStatusError, "Failed to restart after container exit"); statusErr != nil { + logger.Warnf("Failed to set workload %s status to error: %v", runConfig.BaseName, statusErr) + } + return fmt.Errorf("container restart failed after %d attempts", maxRetries) + } + + // Some other error - don't retry + logger.Errorf("Workload %s failed with error: %v", runConfig.BaseName, err) + if statusErr := d.statuses.SetWorkloadStatus(ctx, runConfig.BaseName, rt.WorkloadStatusError, err.Error()); statusErr != nil { + logger.Warnf("Failed to set workload %s status to error: %v", runConfig.BaseName, statusErr) + } + return err + } + + // Success - workload completed normally + return nil } - return err + + // Should not reach here, but just in case + return fmt.Errorf("unexpected end of retry loop for %s", runConfig.BaseName) } // validateSecretParameters validates the secret parameters for a workload. From ab3d5fa48b5f10dcc7efa4f8e8bb344ed170624a Mon Sep 17 00:00:00 2001 From: nigel brown Date: Tue, 18 Nov 2025 11:41:02 +0000 Subject: [PATCH 02/15] Superfluous - info is in the PR Signed-off-by: nigel brown --- docs/fixes/container-exit-handling.md | 104 -------------------------- 1 file changed, 104 deletions(-) delete mode 100644 docs/fixes/container-exit-handling.md diff --git a/docs/fixes/container-exit-handling.md b/docs/fixes/container-exit-handling.md deleted file mode 100644 index eeb6cee02..000000000 --- a/docs/fixes/container-exit-handling.md +++ /dev/null @@ -1,104 +0,0 @@ -# Container Exit Handling with Automatic Restart - -## Problem - -When a container exits (crashes or is stopped), the toolhive proxy would silently lose connection. The proxy would continue running but requests would fail without clear feedback to the user about what happened or how to recover. - -## Solution - -**Simple and Automatic**: Detect container exit → Check if workload exists → Either restart or clean up - -When the container monitor detects that a container has stopped: - -1. **Log a clear warning** - `WARN: Container fetch exited` -2. **Stop the proxy cleanly** - Releases port and resources -3. **Check if workload exists** - Uses `thv ls` logic to see if workload was removed or just exited -4. **If removed** (not in `thv ls`): - - Remove from client config (updates `~/.cursor/mcp.json` etc.) - - Exit gracefully, no restart -5. 
**If still exists** (in `thv ls`): - - Remove from client config temporarily - - Automatic restart with backoff (5s, 10s, 20s, 40s, 60s) - - Re-add to client config after successful restart - - Up to 10 attempts (~10 minutes) - -## Benefits - -✅ **No silent failures** - Clear logging about what happened -✅ **Automatic recovery** - Restarts automatically without user intervention -✅ **Exponential backoff** - Smart retry delays (5s → 60s max) -✅ **Clean shutdown** - Proxy stops properly between retries -✅ **Eventually consistent** - Keeps trying for transient issues -✅ **Gives up gracefully** - After 10 attempts, stops cleanly - -## What Changed - -### Transport Layer (`pkg/transport/http.go`, `pkg/transport/stdio.go`) -- Updated `handleContainerExit()` to log at WARN level with clear messages -- Cleanly stops proxy when container exits - -### Runner Layer (`pkg/runner/runner.go`) -- When transport stops, checks if workload still exists using `DoesWorkloadExist` -- **If workload doesn't exist in `thv ls`**: Removes from client config and exits gracefully -- **If workload exists**: Returns special error "container exited, restart needed" -- This simple check determines whether to restart or clean up - -### Workload Manager (`pkg/workloads/manager.go`) -- Added retry loop with exponential backoff in `RunWorkload()` -- Retries up to 10 times with delays: 5s, 10s, 20s, 40s, 60s (capped) -- Sets workload status to "starting" during retry attempts -- **Removes from client config before each restart** - Forces clients to reconnect -- **Re-adds to client config after restart** - Runner does this automatically after starting -- Only retries on "container exited, restart needed" errors, not other failures - -## Retry Timeline - -``` -Container exits -├─ Attempt 1: Immediate restart -├─ Attempt 2: After 5s delay -├─ Attempt 3: After 10s delay -├─ Attempt 4: After 20s delay -├─ Attempt 5: After 40s delay -├─ Attempt 6: After 60s delay -├─ Attempt 7: After 60s delay -├─ Attempt 8: After 60s delay -├─ Attempt 9: After 60s delay -├─ Attempt 10: After 60s delay -└─ Give up (after ~10 minutes total) -``` - -## User Experience - -**Before:** -``` -Container exits → Proxy keeps running → Requests fail silently → Confusion -``` - -**After (container exits but still in `thv ls`):** -``` -Container exits → WARN logged → Proxy stops → Check `thv ls` → Still exists → -Remove from client config → Automatic restart (10 attempts) → -Re-add to client config → ✅ Recovered -Cursor sees config change → Reconnects with fresh session → Tool works! 
-``` - -**After (container removed with `docker rm` or `thv delete`):** -``` -Container exits → WARN logged → Proxy stops → Check `thv ls` → Doesn't exist → -Remove from client config → Exit gracefully → ✅ Clean shutdown -Cursor sees config removed → No longer shows the tool -``` - -**If container keeps crashing:** -``` -Container exits → Retries with backoff → After 10 attempts → Gives up → Status: Error -``` - -The system now: -- **Handles transient container failures automatically** with exponential backoff -- **Distinguishes between exit and removal** using `thv ls` as source of truth -- **Updates client config** to force clean reconnection (remove then re-add) -- **Gives up gracefully** on persistent issues or when workload is intentionally removed -- Clients (like Cursor) see the config changes and reconnect with new sessions - From 5c57fd177c92f176c674b488609b280e600c3be2 Mon Sep 17 00:00:00 2001 From: nigel brown Date: Tue, 18 Nov 2025 11:46:55 +0000 Subject: [PATCH 03/15] fix: apply gofmt formatting --- pkg/runner/runner.go | 10 +++++----- pkg/transport/http.go | 5 ++--- pkg/transport/stdio.go | 2 +- pkg/workloads/manager.go | 24 ++++++++++++------------ 4 files changed, 20 insertions(+), 21 deletions(-) diff --git a/pkg/runner/runner.go b/pkg/runner/runner.go index 2aa63d471..8fed87a70 100644 --- a/pkg/runner/runner.go +++ b/pkg/runner/runner.go @@ -407,7 +407,7 @@ func (r *Runner) Run(ctx context.Context) error { stopMCPServer(fmt.Sprintf("Received signal %s", sig)) case <-doneCh: // The transport has already been stopped (likely by the container exit) - // Clean up the PID file and state + // Clean up the PID file and state // TODO: Stop writing to PID file once we migrate over to statuses. if err := process.RemovePIDFile(r.Config.BaseName); err != nil { logger.Warnf("Warning: Failed to remove PID file: %v", err) @@ -472,13 +472,13 @@ func (w *workloadExistenceChecker) DoesWorkloadExist(ctx context.Context, worklo } return false, fmt.Errorf("failed to check if workload exists: %w", err) } - + // If remote workload, check if it should exist if workload.Remote { // For remote workloads, trust the status manager return workload.Status != rt.WorkloadStatusError, nil } - + // For container workloads, verify the container actually exists in the runtime // Create a runtime instance to check if container exists backend, err := ct.NewFactory().Create(ctx) @@ -487,7 +487,7 @@ func (w *workloadExistenceChecker) DoesWorkloadExist(ctx context.Context, worklo // Fall back to status manager only return workload.Status != rt.WorkloadStatusError, nil } - + // Check if container is actually running in the runtime running, err := backend.IsWorkloadRunning(ctx, workloadName) if err != nil { @@ -495,7 +495,7 @@ func (w *workloadExistenceChecker) DoesWorkloadExist(ctx context.Context, worklo logger.Debugf("Container %s not found in runtime: %v", workloadName, err) return false, nil } - + return running, nil } diff --git a/pkg/transport/http.go b/pkg/transport/http.go index 36956dc7f..f7d0f859d 100644 --- a/pkg/transport/http.go +++ b/pkg/transport/http.go @@ -284,7 +284,7 @@ func (t *HTTPTransport) handleContainerExit(ctx context.Context) { t.exitErrMutex.Unlock() logger.Warnf("Container %s exited: %v", t.containerName, err) - + // Check if container was removed (not just exited) isRemoved := err != nil && strings.Contains(err.Error(), "may have been removed") if isRemoved { @@ -292,7 +292,7 @@ func (t *HTTPTransport) handleContainerExit(ctx context.Context) { } else { logger.Infof("Container %s 
exited. Will attempt automatic restart.", t.containerName) } - + // Stop the transport when the container exits/removed if stopErr := t.Stop(ctx); stopErr != nil { logger.Errorf("Error stopping transport after container exit: %v", stopErr) @@ -327,4 +327,3 @@ func (t *HTTPTransport) IsRunning(_ context.Context) (bool, error) { return true, nil } } - diff --git a/pkg/transport/stdio.go b/pkg/transport/stdio.go index c395fe5cf..815f68d3d 100644 --- a/pkg/transport/stdio.go +++ b/pkg/transport/stdio.go @@ -637,7 +637,7 @@ func (t *StdioTransport) handleContainerExit(ctx context.Context) { t.exitErrMutex.Unlock() logger.Warnf("Container %s exited: %v", t.containerName, err) - + // Check if container was removed (not just exited) isRemoved := err != nil && strings.Contains(err.Error(), "may have been removed") if isRemoved { diff --git a/pkg/workloads/manager.go b/pkg/workloads/manager.go index caf0cdde2..20139fa96 100644 --- a/pkg/workloads/manager.go +++ b/pkg/workloads/manager.go @@ -401,28 +401,28 @@ func (d *DefaultManager) RunWorkload(ctx context.Context, runConfig *runner.RunC // Retry loop with exponential backoff for container restarts maxRetries := 10 // Allow many retries for transient issues retryDelay := 5 * time.Second - + for attempt := 1; attempt <= maxRetries; attempt++ { if attempt > 1 { logger.Infof("Restart attempt %d/%d for %s after %v delay", attempt, maxRetries, runConfig.BaseName, retryDelay) time.Sleep(retryDelay) - + // Exponential backoff: 5s, 10s, 20s, 40s, 60s (capped) retryDelay *= 2 if retryDelay > 60*time.Second { retryDelay = 60 * time.Second } } - + mcpRunner := runner.NewRunner(runConfig, d.statuses) err := mcpRunner.Run(ctx) - + if err != nil { // Check if this is a "container exited, restart needed" error if err.Error() == "container exited, restart needed" { - logger.Warnf("Container %s exited unexpectedly (attempt %d/%d). Restarting...", + logger.Warnf("Container %s exited unexpectedly (attempt %d/%d). Restarting...", runConfig.BaseName, attempt, maxRetries) - + // Remove from client config so clients notice the restart clientManager, clientErr := client.NewManager(ctx) if clientErr == nil { @@ -431,17 +431,17 @@ func (d *DefaultManager) RunWorkload(ctx context.Context, runConfig *runner.RunC logger.Warnf("Warning: Failed to remove from client config: %v", removeErr) } } - + // Set status to starting (since we're restarting) if statusErr := d.statuses.SetWorkloadStatus(ctx, runConfig.BaseName, rt.WorkloadStatusStarting, "Container exited, restarting"); statusErr != nil { logger.Warnf("Failed to set workload %s status to starting: %v", runConfig.BaseName, statusErr) } - + // If we haven't exhausted retries, continue the loop if attempt < maxRetries { continue } - + // Exhausted all retries logger.Errorf("Failed to restart %s after %d attempts. 
Giving up.", runConfig.BaseName, maxRetries) if statusErr := d.statuses.SetWorkloadStatus(ctx, runConfig.BaseName, rt.WorkloadStatusError, "Failed to restart after container exit"); statusErr != nil { @@ -449,7 +449,7 @@ func (d *DefaultManager) RunWorkload(ctx context.Context, runConfig *runner.RunC } return fmt.Errorf("container restart failed after %d attempts", maxRetries) } - + // Some other error - don't retry logger.Errorf("Workload %s failed with error: %v", runConfig.BaseName, err) if statusErr := d.statuses.SetWorkloadStatus(ctx, runConfig.BaseName, rt.WorkloadStatusError, err.Error()); statusErr != nil { @@ -457,11 +457,11 @@ func (d *DefaultManager) RunWorkload(ctx context.Context, runConfig *runner.RunC } return err } - + // Success - workload completed normally return nil } - + // Should not reach here, but just in case return fmt.Errorf("unexpected end of retry loop for %s", runConfig.BaseName) } From f3352cf4853cda86b57f4d78f8012f2e209a3f4e Mon Sep 17 00:00:00 2001 From: nigel brown Date: Tue, 18 Nov 2025 11:54:12 +0000 Subject: [PATCH 04/15] fix: address linter errors (line length, unused params) --- pkg/runner/runner.go | 52 +++++++++++++++++++++++----------------- pkg/workloads/manager.go | 16 +++++++++++-- 2 files changed, 44 insertions(+), 24 deletions(-) diff --git a/pkg/runner/runner.go b/pkg/runner/runner.go index 8fed87a70..1f040a5ce 100644 --- a/pkg/runner/runner.go +++ b/pkg/runner/runner.go @@ -419,26 +419,35 @@ func (r *Runner) Run(ctx context.Context) error { // Check if workload still exists (using workload manager to check `thv ls`) // If it doesn't exist, it was removed - clean up client config // If it exists, it exited unexpectedly - signal restart needed - workloadManager, err := r.getWorkloadManager(ctx) - if err == nil { - exists, checkErr := workloadManager.DoesWorkloadExist(ctx, r.Config.BaseName) - if checkErr != nil { - logger.Warnf("Warning: Failed to check if workload exists: %v", checkErr) - // Assume restart needed if we can't check - } else if !exists { - // Workload doesn't exist in `thv ls` - it was removed - logger.Infof("Workload %s no longer exists. Removing from client configurations.", r.Config.BaseName) - clientManager, clientErr := client.NewManager(ctx) - if clientErr == nil { - if removeErr := clientManager.RemoveServerFromClients(ctx, r.Config.ContainerName, r.Config.Group); removeErr != nil { - logger.Warnf("Warning: Failed to remove from client config: %v", removeErr) - } else { - logger.Infof("Successfully removed %s from client configurations", r.Config.ContainerName) - } + workloadManager := r.getWorkloadManager() + exists, checkErr := workloadManager.DoesWorkloadExist(ctx, r.Config.BaseName) + if checkErr != nil { + logger.Warnf("Warning: Failed to check if workload exists: %v", checkErr) + // Assume restart needed if we can't check + } else if !exists { + // Workload doesn't exist in `thv ls` - it was removed + logger.Infof( + "Workload %s no longer exists. 
Removing from client configurations.", + r.Config.BaseName, + ) + clientManager, clientErr := client.NewManager(ctx) + if clientErr == nil { + removeErr := clientManager.RemoveServerFromClients( + ctx, + r.Config.ContainerName, + r.Config.Group, + ) + if removeErr != nil { + logger.Warnf("Warning: Failed to remove from client config: %v", removeErr) + } else { + logger.Infof( + "Successfully removed %s from client configurations", + r.Config.ContainerName, + ) } - logger.Infof("MCP server %s stopped and cleaned up", r.Config.ContainerName) - return nil // Exit gracefully, no restart } + logger.Infof("MCP server %s stopped and cleaned up", r.Config.ContainerName) + return nil // Exit gracefully, no restart } // Workload still exists - signal restart needed @@ -450,12 +459,11 @@ func (r *Runner) Run(ctx context.Context) error { } // getWorkloadManager creates a workload manager instance for checking workload existence -func (r *Runner) getWorkloadManager(ctx context.Context) (interface { +func (r *Runner) getWorkloadManager() interface { DoesWorkloadExist(ctx context.Context, workloadName string) (bool, error) -}, error) { +} { // Create a minimal implementation that checks via status manager - impl := &workloadExistenceChecker{statusManager: r.statusManager} - return impl, nil + return &workloadExistenceChecker{statusManager: r.statusManager} } // workloadExistenceChecker is a minimal implementation to check if a workload exists diff --git a/pkg/workloads/manager.go b/pkg/workloads/manager.go index 20139fa96..675a2ca17 100644 --- a/pkg/workloads/manager.go +++ b/pkg/workloads/manager.go @@ -433,7 +433,13 @@ func (d *DefaultManager) RunWorkload(ctx context.Context, runConfig *runner.RunC } // Set status to starting (since we're restarting) - if statusErr := d.statuses.SetWorkloadStatus(ctx, runConfig.BaseName, rt.WorkloadStatusStarting, "Container exited, restarting"); statusErr != nil { + statusErr := d.statuses.SetWorkloadStatus( + ctx, + runConfig.BaseName, + rt.WorkloadStatusStarting, + "Container exited, restarting", + ) + if statusErr != nil { logger.Warnf("Failed to set workload %s status to starting: %v", runConfig.BaseName, statusErr) } @@ -444,7 +450,13 @@ func (d *DefaultManager) RunWorkload(ctx context.Context, runConfig *runner.RunC // Exhausted all retries logger.Errorf("Failed to restart %s after %d attempts. Giving up.", runConfig.BaseName, maxRetries) - if statusErr := d.statuses.SetWorkloadStatus(ctx, runConfig.BaseName, rt.WorkloadStatusError, "Failed to restart after container exit"); statusErr != nil { + statusErr = d.statuses.SetWorkloadStatus( + ctx, + runConfig.BaseName, + rt.WorkloadStatusError, + "Failed to restart after container exit", + ) + if statusErr != nil { logger.Warnf("Failed to set workload %s status to error: %v", runConfig.BaseName, statusErr) } return fmt.Errorf("container restart failed after %d attempts", maxRetries) From 01bdd4d6c3b7ee6bc7f78ff725eada0f65a381e9 Mon Sep 17 00:00:00 2001 From: nigel brown Date: Tue, 18 Nov 2025 13:38:16 +0000 Subject: [PATCH 05/15] test: add tests for container exit handling and automatic restart Adds unit tests for: - HTTPTransport.ShouldRestart() - restart logic for container exits vs removals - StdioTransport.ShouldRestart() - restart logic for container exits vs removals - TransparentProxy.Stop() - idempotent stop functionality - DefaultManager.RunWorkload() - container exit handling These tests cover the new automatic container exit detection and restart functionality, improving code coverage for the PR. 
--- pkg/transport/http_test.go | 50 +++++++++++++++++++ .../proxy/transparent/transparent_test.go | 44 ++++++++++++++++ pkg/transport/stdio_test.go | 41 +++++++++++++++ pkg/workloads/manager_test.go | 41 +++++++++++++++ 4 files changed, 176 insertions(+) create mode 100644 pkg/transport/http_test.go diff --git a/pkg/transport/http_test.go b/pkg/transport/http_test.go new file mode 100644 index 000000000..056ebdcc0 --- /dev/null +++ b/pkg/transport/http_test.go @@ -0,0 +1,50 @@ +package transport + +import ( + "fmt" + "testing" + + "github.com/stretchr/testify/assert" +) + +// TestHTTPTransport_ShouldRestart tests the ShouldRestart logic +func TestHTTPTransport_ShouldRestart(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + exitError error + expectedResult bool + }{ + { + name: "container exited - should restart", + exitError: fmt.Errorf("container exited unexpectedly"), + expectedResult: true, + }, + { + name: "container removed - should not restart", + exitError: fmt.Errorf("Container test (test) not found, it may have been removed"), + expectedResult: false, + }, + { + name: "no error - should not restart", + exitError: nil, + expectedResult: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + transport := &HTTPTransport{ + containerName: "test-container", + containerExitErr: tt.exitError, + } + + result := transport.ShouldRestart() + assert.Equal(t, tt.expectedResult, result) + }) + } +} + diff --git a/pkg/transport/proxy/transparent/transparent_test.go b/pkg/transport/proxy/transparent/transparent_test.go index f9eba25fe..fd34c303b 100644 --- a/pkg/transport/proxy/transparent/transparent_test.go +++ b/pkg/transport/proxy/transparent/transparent_test.go @@ -351,3 +351,47 @@ func TestWellKnownPathWithoutAuthHandler(t *testing.T) { assert.Equal(t, http.StatusNotFound, recorder.Code, "Without auth handler, well-known path should return 404") } + +// TestTransparentProxy_IdempotentStop tests that Stop() can be called multiple times safely +func TestTransparentProxy_IdempotentStop(t *testing.T) { + t.Parallel() + + // Create a proxy + proxy := NewTransparentProxy("127.0.0.1", 0, "http://localhost:8080", nil, nil, false, false, "sse") + + ctx := context.Background() + + // Start the proxy (this creates the shutdown channel) + err := proxy.Start(ctx) + if err != nil { + t.Fatalf("Failed to start proxy: %v", err) + } + + // First stop should succeed + err = proxy.Stop(ctx) + assert.NoError(t, err, "First Stop() should succeed") + + // Second stop should also succeed (idempotent) + err = proxy.Stop(ctx) + assert.NoError(t, err, "Second Stop() should succeed (idempotent)") + + // Third stop should also succeed + err = proxy.Stop(ctx) + assert.NoError(t, err, "Third Stop() should succeed (idempotent)") +} + +// TestTransparentProxy_StopWithoutStart tests that Stop() works even if never started +func TestTransparentProxy_StopWithoutStart(t *testing.T) { + t.Parallel() + + // Create a proxy but don't start it + proxy := NewTransparentProxy("127.0.0.1", 0, "http://localhost:8080", nil, nil, false, false, "sse") + + ctx := context.Background() + + // Stop should handle being called without Start + err := proxy.Stop(ctx) + // This may return an error or succeed depending on implementation + // The key is it shouldn't panic + _ = err +} diff --git a/pkg/transport/stdio_test.go b/pkg/transport/stdio_test.go index 5a378d641..f6314ce35 100644 --- a/pkg/transport/stdio_test.go +++ b/pkg/transport/stdio_test.go @@ -1013,3 +1013,44 @@ 
func TestStdinRaceCondition(t *testing.T) { // Clean up cancel() } + +// TestStdioTransport_ShouldRestart tests the ShouldRestart logic +func TestStdioTransport_ShouldRestart(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + exitError error + expectedResult bool + }{ + { + name: "container exited - should restart", + exitError: fmt.Errorf("container exited unexpectedly"), + expectedResult: true, + }, + { + name: "container removed - should not restart", + exitError: fmt.Errorf("Container test (test) not found, it may have been removed"), + expectedResult: false, + }, + { + name: "no error - should not restart", + exitError: nil, + expectedResult: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + transport := &StdioTransport{ + containerName: "test-container", + containerExitErr: tt.exitError, + } + + result := transport.ShouldRestart() + assert.Equal(t, tt.expectedResult, result) + }) + } +} diff --git a/pkg/workloads/manager_test.go b/pkg/workloads/manager_test.go index a49603de5..53319fca2 100644 --- a/pkg/workloads/manager_test.go +++ b/pkg/workloads/manager_test.go @@ -1658,3 +1658,44 @@ func TestDefaultManager_updateSingleWorkload(t *testing.T) { }) } } + +// TestDefaultManager_RunWorkload_ContainerExitHandling tests container exit handling +func TestDefaultManager_RunWorkload_ContainerExitHandling(t *testing.T) { + t.Parallel() + + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + mockRuntime := runtimeMocks.NewMockRuntime(ctrl) + mockStatusMgr := statusMocks.NewMockStatusManager(ctrl) + mockConfigProvider := configMocks.NewMockProvider(ctrl) + + mockConfigProvider.EXPECT().GetConfig().Return(&config.Config{}).AnyTimes() + + // Expect status to be set to starting + mockStatusMgr.EXPECT(). + SetWorkloadStatus(gomock.Any(), "test-workload", runtime.WorkloadStatusStarting, ""). + Return(nil) + + // Expect status to be set to error on failure + mockStatusMgr.EXPECT(). + SetWorkloadStatus(gomock.Any(), "test-workload", runtime.WorkloadStatusError, gomock.Any()). 
+ Return(nil).AnyTimes() + + manager := &DefaultManager{ + runtime: mockRuntime, + statuses: mockStatusMgr, + configProvider: mockConfigProvider, + } + + runConfig := &runner.RunConfig{ + ContainerName: "test-container", + BaseName: "test-workload", + Group: "default", + } + + // RunWorkload will fail because the runner can't actually run + // This tests that the status is properly set + err := manager.RunWorkload(context.Background(), runConfig) + assert.Error(t, err) +} From 6576b904552248c256420cae3f51d505ffe3d3a6 Mon Sep 17 00:00:00 2001 From: nigel brown Date: Tue, 18 Nov 2025 14:02:17 +0000 Subject: [PATCH 06/15] fix: apply gci import formatting to test files --- pkg/transport/http_test.go | 1 - .../proxy/transparent/transparent_test.go | 14 +++++++------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/pkg/transport/http_test.go b/pkg/transport/http_test.go index 056ebdcc0..1adc49452 100644 --- a/pkg/transport/http_test.go +++ b/pkg/transport/http_test.go @@ -47,4 +47,3 @@ func TestHTTPTransport_ShouldRestart(t *testing.T) { }) } } - diff --git a/pkg/transport/proxy/transparent/transparent_test.go b/pkg/transport/proxy/transparent/transparent_test.go index fd34c303b..4ec462c64 100644 --- a/pkg/transport/proxy/transparent/transparent_test.go +++ b/pkg/transport/proxy/transparent/transparent_test.go @@ -358,23 +358,23 @@ func TestTransparentProxy_IdempotentStop(t *testing.T) { // Create a proxy proxy := NewTransparentProxy("127.0.0.1", 0, "http://localhost:8080", nil, nil, false, false, "sse") - + ctx := context.Background() - + // Start the proxy (this creates the shutdown channel) err := proxy.Start(ctx) if err != nil { t.Fatalf("Failed to start proxy: %v", err) } - + // First stop should succeed err = proxy.Stop(ctx) assert.NoError(t, err, "First Stop() should succeed") - + // Second stop should also succeed (idempotent) err = proxy.Stop(ctx) assert.NoError(t, err, "Second Stop() should succeed (idempotent)") - + // Third stop should also succeed err = proxy.Stop(ctx) assert.NoError(t, err, "Third Stop() should succeed (idempotent)") @@ -386,9 +386,9 @@ func TestTransparentProxy_StopWithoutStart(t *testing.T) { // Create a proxy but don't start it proxy := NewTransparentProxy("127.0.0.1", 0, "http://localhost:8080", nil, nil, false, false, "sse") - + ctx := context.Background() - + // Stop should handle being called without Start err := proxy.Stop(ctx) // This may return an error or succeed depending on implementation From 7612120964f59e9e9963db7c4ec806c537a8e9c8 Mon Sep 17 00:00:00 2001 From: nigel brown Date: Wed, 19 Nov 2025 15:16:49 +0000 Subject: [PATCH 07/15] ci: clear Go module cache before downloading dependencies Fixes test failures caused by corrupted module cache in CI environment. The 'go clean -modcache' step ensures a clean state before downloading dependencies, preventing 'dial tcp: lookup proxy.golang.org' errors during test compilation. 
Related to #2634 --- .github/workflows/test.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5b6b7fab5..80fc5d8c9 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -20,6 +20,10 @@ jobs: cache: true # This caches go modules based on go.sum cache-dependency-path: go.sum + # Clear Go module cache to avoid corruption issues + - name: Clear Go module cache + run: go clean -modcache + # Download all dependencies upfront (will be cached) - name: Download Go dependencies run: | From 8c1facbcc61d198cbc5ddad1cf469121735fe335 Mon Sep 17 00:00:00 2001 From: nigel brown Date: Wed, 19 Nov 2025 15:32:19 +0000 Subject: [PATCH 08/15] ci: explicitly download go-winio dependency before mod download Adds explicit download step for github.com/Microsoft/go-winio@v0.6.2 before general module download to prevent network/DNS failures during test compilation. Related to #2634 --- .github/workflows/test.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 80fc5d8c9..654af6b9c 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -24,6 +24,10 @@ jobs: - name: Clear Go module cache run: go clean -modcache + # Explicitly download problematic dependency first + - name: Pre-download go-winio dependency + run: go get github.com/Microsoft/go-winio@v0.6.2 + # Download all dependencies upfront (will be cached) - name: Download Go dependencies run: | From c21f2532ac4361839a6b0c4010aed7b494e4531b Mon Sep 17 00:00:00 2001 From: nigel brown Date: Wed, 19 Nov 2025 15:33:28 +0000 Subject: [PATCH 09/15] ci: disable setup-go cache and handle module caching manually Disables the built-in actions/setup-go cache and implements manual module caching with explicit control. This prevents cache corruption issues that were causing 'go test' to attempt network downloads during compilation. 
Changes: - Set cache: false in actions/setup-go - Add explicit actions/cache for ~/go/pkg/mod - Remove go clean -modcache and go get steps (no longer needed) - Cache key based on go.sum ensures proper invalidation Related to #2634 --- .github/workflows/test.yml | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 654af6b9c..f84338005 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -17,25 +17,24 @@ jobs: uses: actions/setup-go@44694675825211faa026b3c33043df3e48a5fa00 # v6 with: go-version-file: 'go.mod' - cache: true # This caches go modules based on go.sum - cache-dependency-path: go.sum + cache: false # Disable built-in cache, we'll handle it manually - # Clear Go module cache to avoid corruption issues - - name: Clear Go module cache - run: go clean -modcache - - # Explicitly download problematic dependency first - - name: Pre-download go-winio dependency - run: go get github.com/Microsoft/go-winio@v0.6.2 + # Cache Go modules manually for better control + - name: Cache Go modules + uses: actions/cache@v4 + with: + path: ~/go/pkg/mod + key: ${{ runner.os }}-go-mod-${{ hashFiles('**/go.sum') }} + restore-keys: | + ${{ runner.os }}-go-mod- - # Download all dependencies upfront (will be cached) + # Download all dependencies upfront - name: Download Go dependencies run: | go mod download go mod verify # Cache Go build cache for faster compilation - # Note: ~/go/pkg/mod is already cached by actions/setup-go with cache: true - name: Cache Go build cache uses: actions/cache@v4 with: From d65c0f0e7c8fa8602fde3d1170209fed29719d4a Mon Sep 17 00:00:00 2001 From: nigel brown Date: Wed, 19 Nov 2025 15:45:45 +0000 Subject: [PATCH 10/15] fix: race condition in TransparentProxy server access Fixes data race where Stop() could set p.server = nil while a goroutine in Start() was still accessing it. Now captures server in local variable before starting the goroutine. Fixes race detector failure: - Read at transparent_proxy.go:380 (Start goroutine) - Write at transparent_proxy.go:461 (Stop method) Related to #2634 --- pkg/transport/proxy/transparent/transparent_proxy.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pkg/transport/proxy/transparent/transparent_proxy.go b/pkg/transport/proxy/transparent/transparent_proxy.go index 67692a0e9..21268844c 100644 --- a/pkg/transport/proxy/transparent/transparent_proxy.go +++ b/pkg/transport/proxy/transparent/transparent_proxy.go @@ -376,8 +376,10 @@ func (p *TransparentProxy) Start(ctx context.Context) error { ReadHeaderTimeout: 10 * time.Second, // Prevent Slowloris attacks } + // Capture server in local variable to avoid race with Stop() + server := p.server go func() { - err := p.server.Serve(ln) + err := server.Serve(ln) if err != nil && err != http.ErrServerClosed { var opErr *net.OpError if errors.As(err, &opErr) && opErr.Op == "accept" { From a2745392d735c81477e8d73fd0203b177482421a Mon Sep 17 00:00:00 2001 From: nigel brown Date: Wed, 19 Nov 2025 15:47:21 +0000 Subject: [PATCH 11/15] revert: remove unnecessary workflow changes The test failures were caused by a race condition in the code, not by module cache issues. Reverting the workflow changes since they were not needed. The actual fix was in transparent_proxy.go to prevent race condition when accessing p.server between Start() and Stop(). 
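For reference, a minimal standalone sketch of the pattern this fix relies on: capture the server into a local variable while holding the lock, so the serving goroutine never races with Stop() clearing the field. The types and method shapes below are simplified stand-ins, not the actual TransparentProxy code.

```go
package proxysketch

import (
	"context"
	"net"
	"net/http"
	"sync"
)

// proxy is a simplified stand-in for a transparent proxy whose serving
// goroutine and Stop method may run concurrently.
type proxy struct {
	mu     sync.Mutex
	server *http.Server
}

// Start launches the HTTP server in a goroutine. The server is captured in a
// local variable under the lock, so the goroutine never reads p.server while
// Stop is writing to it.
func (p *proxy) Start(ln net.Listener, handler http.Handler) {
	p.mu.Lock()
	p.server = &http.Server{Handler: handler}
	server := p.server // local copy avoids the data race with Stop
	p.mu.Unlock()

	go func() {
		if err := server.Serve(ln); err != nil && err != http.ErrServerClosed {
			// surface the error through whatever logging the caller uses
			_ = err
		}
	}()
}

// Stop shuts the server down at most once; later calls are no-ops.
func (p *proxy) Stop(ctx context.Context) error {
	p.mu.Lock()
	defer p.mu.Unlock()
	if p.server == nil {
		return nil // already stopped (idempotent)
	}
	err := p.server.Shutdown(ctx)
	p.server = nil // safe: the serving goroutine holds its own reference
	return err
}
```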
Related to #2634 --- .github/workflows/test.yml | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f84338005..5b6b7fab5 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -17,24 +17,17 @@ jobs: uses: actions/setup-go@44694675825211faa026b3c33043df3e48a5fa00 # v6 with: go-version-file: 'go.mod' - cache: false # Disable built-in cache, we'll handle it manually + cache: true # This caches go modules based on go.sum + cache-dependency-path: go.sum - # Cache Go modules manually for better control - - name: Cache Go modules - uses: actions/cache@v4 - with: - path: ~/go/pkg/mod - key: ${{ runner.os }}-go-mod-${{ hashFiles('**/go.sum') }} - restore-keys: | - ${{ runner.os }}-go-mod- - - # Download all dependencies upfront + # Download all dependencies upfront (will be cached) - name: Download Go dependencies run: | go mod download go mod verify # Cache Go build cache for faster compilation + # Note: ~/go/pkg/mod is already cached by actions/setup-go with cache: true - name: Cache Go build cache uses: actions/cache@v4 with: From df350a4c707f5689b385a8e46e21f5b6f2372d13 Mon Sep 17 00:00:00 2001 From: nigel brown Date: Wed, 19 Nov 2025 17:03:28 +0000 Subject: [PATCH 12/15] fix: address PR review comments - use typed errors and simplify runner Addresses review comments in PR #2634: 1. Replace string matching with typed error checking - Add ErrContainerRemoved error type to docker package - Update container monitor to return ErrContainerRemoved when container not found - Replace strings.Contains() with errors.Is() in http.go and stdio.go - Update tests to use typed error 2. Simplify workload existence checking - Move DoesWorkloadExist method directly onto Runner struct - Remove workloadExistenceChecker type and getWorkloadManager helper All tests pass successfully. 
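As a rough illustration of the typed-error approach described above (a sentinel wrapped with %w and matched via errors.Is, replacing strings.Contains on the message text), here is a self-contained sketch; the helper names are invented for the example and are not the ToolHive APIs.

```go
package errsketch

import (
	"errors"
	"fmt"
)

// ErrContainerRemoved plays the role of the sentinel added in
// pkg/container/docker/errors.go.
var ErrContainerRemoved = errors.New("container removed")

// notFoundToRemoved shows how a monitor can wrap the sentinel with context
// while keeping it matchable via errors.Is (the %w verb preserves the chain).
func notFoundToRemoved(containerName string) error {
	return fmt.Errorf("container %s not found, it has been removed: %w",
		containerName, ErrContainerRemoved)
}

// shouldRestart mirrors the transports' decision: restart on an unexpected
// exit, but not when the container was deliberately removed. Matching the
// wrapped sentinel replaces the earlier strings.Contains check on the message.
func shouldRestart(exitErr error) bool {
	if exitErr == nil {
		return false // normal shutdown, nothing to restart
	}
	return !errors.Is(exitErr, ErrContainerRemoved)
}
```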
--- pkg/container/docker/errors.go | 3 +++ pkg/container/docker/monitor.go | 10 +++++----- pkg/runner/runner.go | 25 +++++++------------------ pkg/transport/http.go | 16 ++++++++-------- pkg/transport/http_test.go | 4 +++- pkg/transport/stdio.go | 10 +++++----- pkg/transport/stdio_test.go | 3 ++- 7 files changed, 33 insertions(+), 38 deletions(-) diff --git a/pkg/container/docker/errors.go b/pkg/container/docker/errors.go index 03303abdb..455c62b6a 100644 --- a/pkg/container/docker/errors.go +++ b/pkg/container/docker/errors.go @@ -18,6 +18,9 @@ var ( // ErrContainerExited is returned when a container has exited unexpectedly ErrContainerExited = fmt.Errorf("container exited unexpectedly") + + // ErrContainerRemoved is returned when a container has been removed + ErrContainerRemoved = fmt.Errorf("container removed") ) // ContainerError represents an error related to container operations diff --git a/pkg/container/docker/monitor.go b/pkg/container/docker/monitor.go index 71f3c6af7..579840bea 100644 --- a/pkg/container/docker/monitor.go +++ b/pkg/container/docker/monitor.go @@ -97,14 +97,14 @@ func (m *ContainerMonitor) monitor(ctx context.Context) { running, err := m.runtime.IsWorkloadRunning(checkCtx, m.containerName) cancel() // Always cancel the context to avoid leaks if err != nil { - // If the container is not found, it may have been removed + // If the container is not found, it has been removed if IsContainerNotFound(err) { - exitErr := NewContainerError( - ErrContainerExited, + removeErr := NewContainerError( + ErrContainerRemoved, m.containerName, - fmt.Sprintf("Container %s (%s) not found, it may have been removed", m.containerName, m.containerName), + fmt.Sprintf("Container %s not found, it has been removed", m.containerName), ) - m.errorCh <- exitErr + m.errorCh <- removeErr return } diff --git a/pkg/runner/runner.go b/pkg/runner/runner.go index 1f040a5ce..c87ffe59a 100644 --- a/pkg/runner/runner.go +++ b/pkg/runner/runner.go @@ -416,11 +416,10 @@ func (r *Runner) Run(ctx context.Context) error { logger.Warnf("Warning: Failed to reset workload %s PID: %v", r.Config.BaseName, err) } - // Check if workload still exists (using workload manager to check `thv ls`) + // Check if workload still exists (using status manager and runtime) // If it doesn't exist, it was removed - clean up client config // If it exists, it exited unexpectedly - signal restart needed - workloadManager := r.getWorkloadManager() - exists, checkErr := workloadManager.DoesWorkloadExist(ctx, r.Config.BaseName) + exists, checkErr := r.DoesWorkloadExist(ctx, r.Config.BaseName) if checkErr != nil { logger.Warnf("Warning: Failed to check if workload exists: %v", checkErr) // Assume restart needed if we can't check @@ -458,22 +457,12 @@ func (r *Runner) Run(ctx context.Context) error { return nil } -// getWorkloadManager creates a workload manager instance for checking workload existence -func (r *Runner) getWorkloadManager() interface { - DoesWorkloadExist(ctx context.Context, workloadName string) (bool, error) -} { - // Create a minimal implementation that checks via status manager - return &workloadExistenceChecker{statusManager: r.statusManager} -} - -// workloadExistenceChecker is a minimal implementation to check if a workload exists -type workloadExistenceChecker struct { - statusManager statuses.StatusManager -} - -func (w *workloadExistenceChecker) DoesWorkloadExist(ctx context.Context, workloadName string) (bool, error) { +// DoesWorkloadExist checks if a workload exists in the status manager and runtime. 
+// For remote workloads, it trusts the status manager.
+// For container workloads, it verifies the container exists in the runtime.
+func (r *Runner) DoesWorkloadExist(ctx context.Context, workloadName string) (bool, error) {
 	// Check if workload exists by trying to get it from status manager
-	workload, err := w.statusManager.GetWorkload(ctx, workloadName)
+	workload, err := r.statusManager.GetWorkload(ctx, workloadName)
 	if err != nil {
 		if errors.Is(err, rt.ErrWorkloadNotFound) {
 			return false, nil
diff --git a/pkg/transport/http.go b/pkg/transport/http.go
index f7d0f859d..10ba86e90 100644
--- a/pkg/transport/http.go
+++ b/pkg/transport/http.go
@@ -2,18 +2,19 @@ package transport
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"net/http"
 	"net/url"
-	"strings"
 	"sync"
 
 	"golang.org/x/oauth2"
 
 	"github.com/stacklok/toolhive/pkg/container"
+	"github.com/stacklok/toolhive/pkg/container/docker"
 	rt "github.com/stacklok/toolhive/pkg/container/runtime"
 	"github.com/stacklok/toolhive/pkg/logger"
-	"github.com/stacklok/toolhive/pkg/transport/errors"
+	transporterrors "github.com/stacklok/toolhive/pkg/transport/errors"
 	"github.com/stacklok/toolhive/pkg/transport/middleware"
 	"github.com/stacklok/toolhive/pkg/transport/proxy/transparent"
 	"github.com/stacklok/toolhive/pkg/transport/types"
@@ -170,7 +171,7 @@ func (t *HTTPTransport) Start(ctx context.Context) error {
 			t.proxyPort, targetURI)
 	} else {
 		if t.containerName == "" {
-			return errors.ErrContainerNameNotSet
+			return transporterrors.ErrContainerNameNotSet
 		}
 
 		// For local containers, use the configured target URI
@@ -285,9 +286,8 @@ func (t *HTTPTransport) handleContainerExit(ctx context.Context) {
 
 	logger.Warnf("Container %s exited: %v", t.containerName, err)
 
-	// Check if container was removed (not just exited)
-	isRemoved := err != nil && strings.Contains(err.Error(), "may have been removed")
-	if isRemoved {
+	// Check if container was removed (not just exited) using typed error
+	if errors.Is(err, docker.ErrContainerRemoved) {
 		logger.Infof("Container %s was removed. Stopping proxy and cleaning up.", t.containerName)
 	} else {
 		logger.Infof("Container %s exited. Will attempt automatic restart.", t.containerName)
@@ -310,8 +310,8 @@ func (t *HTTPTransport) ShouldRestart() bool {
 		return false // No exit error, normal shutdown
 	}
 
-	// Don't restart if container was removed
-	return !strings.Contains(t.containerExitErr.Error(), "may have been removed")
+	// Don't restart if container was removed (use typed error check)
+	return !errors.Is(t.containerExitErr, docker.ErrContainerRemoved)
 }
 
 // IsRunning checks if the transport is currently running.
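The test updates that follow exercise this restart decision with the typed constructor instead of hand-built error strings. A self-contained, table-driven sketch of the same idea, using a stand-in shouldRestart helper and sentinel error rather than the real transport types:

```go
package restartsketch

import (
	"errors"
	"fmt"
	"testing"
)

var errContainerRemoved = errors.New("container removed")

// shouldRestart mirrors the decision under test: restart on any exit error
// except a removal, which means the workload is gone for good.
func shouldRestart(exitErr error) bool {
	if exitErr == nil {
		return false
	}
	return !errors.Is(exitErr, errContainerRemoved)
}

func TestShouldRestart(t *testing.T) {
	cases := []struct {
		name string
		err  error
		want bool
	}{
		{"no error - normal shutdown", nil, false},
		{"container exited - should restart", errors.New("container exited unexpectedly"), true},
		{"container removed - should not restart", fmt.Errorf("workload fetch: %w", errContainerRemoved), false},
	}
	for _, tc := range cases {
		if got := shouldRestart(tc.err); got != tc.want {
			t.Errorf("%s: shouldRestart() = %v, want %v", tc.name, got, tc.want)
		}
	}
}
```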
diff --git a/pkg/transport/http_test.go b/pkg/transport/http_test.go
index 1adc49452..b8d4aba1d 100644
--- a/pkg/transport/http_test.go
+++ b/pkg/transport/http_test.go
@@ -5,6 +5,8 @@ import (
 	"testing"
 
 	"github.com/stretchr/testify/assert"
+
+	"github.com/stacklok/toolhive/pkg/container/docker"
 )
 
 // TestHTTPTransport_ShouldRestart tests the ShouldRestart logic
@@ -23,7 +25,7 @@ func TestHTTPTransport_ShouldRestart(t *testing.T) {
 		},
 		{
 			name:           "container removed - should not restart",
-			exitError:      fmt.Errorf("Container test (test) not found, it may have been removed"),
+			exitError:      docker.NewContainerError(docker.ErrContainerRemoved, "test", "Container removed"),
 			expectedResult: false,
 		},
 		{
diff --git a/pkg/transport/stdio.go b/pkg/transport/stdio.go
index 815f68d3d..d5a247900 100644
--- a/pkg/transport/stdio.go
+++ b/pkg/transport/stdio.go
@@ -20,6 +20,7 @@ import (
 	"golang.org/x/exp/jsonrpc2"
 
 	"github.com/stacklok/toolhive/pkg/container"
+	"github.com/stacklok/toolhive/pkg/container/docker"
 	rt "github.com/stacklok/toolhive/pkg/container/runtime"
 	"github.com/stacklok/toolhive/pkg/logger"
 	transporterrors "github.com/stacklok/toolhive/pkg/transport/errors"
@@ -638,9 +639,8 @@ func (t *StdioTransport) handleContainerExit(ctx context.Context) {
 
 	logger.Warnf("Container %s exited: %v", t.containerName, err)
 
-	// Check if container was removed (not just exited)
-	isRemoved := err != nil && strings.Contains(err.Error(), "may have been removed")
-	if isRemoved {
+	// Check if container was removed (not just exited) using typed error
+	if errors.Is(err, docker.ErrContainerRemoved) {
 		logger.Infof("Container %s was removed. Stopping proxy and cleaning up.", t.containerName)
 	} else {
 		logger.Infof("Container %s exited. Will attempt automatic restart.", t.containerName)
@@ -675,6 +675,6 @@ func (t *StdioTransport) ShouldRestart() bool {
 		return false // No exit error, normal shutdown
 	}
 
-	// Don't restart if container was removed
-	return !strings.Contains(t.containerExitErr.Error(), "may have been removed")
+	// Don't restart if container was removed (use typed error check)
+	return !errors.Is(t.containerExitErr, docker.ErrContainerRemoved)
 }
diff --git a/pkg/transport/stdio_test.go b/pkg/transport/stdio_test.go
index f6314ce35..12a66dc08 100644
--- a/pkg/transport/stdio_test.go
+++ b/pkg/transport/stdio_test.go
@@ -16,6 +16,7 @@ import (
 	"go.uber.org/mock/gomock"
 	"golang.org/x/exp/jsonrpc2"
 
+	"github.com/stacklok/toolhive/pkg/container/docker"
 	"github.com/stacklok/toolhive/pkg/container/runtime/mocks"
 	"github.com/stacklok/toolhive/pkg/logger"
 )
@@ -1030,7 +1031,7 @@ func TestStdioTransport_ShouldRestart(t *testing.T) {
 		},
 		{
 			name:           "container removed - should not restart",
-			exitError:      fmt.Errorf("Container test (test) not found, it may have been removed"),
+			exitError:      docker.NewContainerError(docker.ErrContainerRemoved, "test", "Container removed"),
 			expectedResult: false,
 		},
 		{

From 0a46a1321ba7d12806d1818bd12f49a8fe034287 Mon Sep 17 00:00:00 2001
From: nigel brown
Date: Wed, 19 Nov 2025 17:47:24 +0000
Subject: [PATCH 13/15] fix: check container existence instead of running state for restart logic

The DoesWorkloadExist method was using IsWorkloadRunning, which returns
false for stopped containers. This caused the system to treat stopped
containers as removed, cleaning up the client config instead of restarting.

It now uses GetWorkloadInfo to check whether the container exists
(regardless of running state), so stopped containers are correctly
identified as existing and trigger an automatic restart.
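A compact sketch of the exists-versus-running distinction this patch fixes, using a hypothetical narrow interface and an in-memory fake rather than the real toolhive runtime types:

```go
package main

import (
	"context"
	"errors"
	"fmt"
)

// ContainerInfo stands in for the runtime's container info struct.
type ContainerInfo struct {
	Status string
}

// infoGetter is a hypothetical slice of the runtime interface: just the lookup
// this check needs.
type infoGetter interface {
	GetWorkloadInfo(ctx context.Context, name string) (ContainerInfo, error)
}

// doesContainerExist reports whether the container is known to the runtime at
// all, running or stopped. An IsWorkloadRunning-style check would report false
// for a stopped container and make it look removed, which is the bug fixed here.
func doesContainerExist(ctx context.Context, rt infoGetter, name string) bool {
	if _, err := rt.GetWorkloadInfo(ctx, name); err != nil {
		// A lookup failure is treated as "not found": clean up rather than restart.
		return false
	}
	return true
}

// fakeRuntime is a tiny in-memory stand-in used to exercise the check.
type fakeRuntime struct {
	containers map[string]ContainerInfo
}

func (f *fakeRuntime) GetWorkloadInfo(_ context.Context, name string) (ContainerInfo, error) {
	info, ok := f.containers[name]
	if !ok {
		return ContainerInfo{}, errors.New("no such container")
	}
	return info, nil
}

func main() {
	rt := &fakeRuntime{containers: map[string]ContainerInfo{"fetch": {Status: "exited"}}}

	fmt.Println(doesContainerExist(context.Background(), rt, "fetch"))   // true: stopped but present, so restart it
	fmt.Println(doesContainerExist(context.Background(), rt, "missing")) // false: removed, so clean up client config
}
```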
---
 pkg/runner/runner.go | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/pkg/runner/runner.go b/pkg/runner/runner.go
index c87ffe59a..532fa3c18 100644
--- a/pkg/runner/runner.go
+++ b/pkg/runner/runner.go
@@ -485,15 +485,17 @@ func (r *Runner) DoesWorkloadExist(ctx context.Context, workloadName string) (bo
 		return workload.Status != rt.WorkloadStatusError, nil
 	}
 
-	// Check if container is actually running in the runtime
-	running, err := backend.IsWorkloadRunning(ctx, workloadName)
+	// Check if container exists in the runtime (not just running)
+	// GetWorkloadInfo will return an error if the container doesn't exist
+	_, err = backend.GetWorkloadInfo(ctx, workloadName)
 	if err != nil {
-		// Container doesn't exist or error checking
+		// Container doesn't exist
 		logger.Debugf("Container %s not found in runtime: %v", workloadName, err)
 		return false, nil
 	}
 
-	return running, nil
+	// Container exists (may be running or stopped)
+	return true, nil
 }
 
 // handleRemoteAuthentication handles authentication for remote MCP servers

From ac6a8e62860cab4b071be0dd85306774615836ae Mon Sep 17 00:00:00 2001
From: nigel brown
Date: Wed, 19 Nov 2025 18:08:10 +0000
Subject: [PATCH 14/15] fix: detect and restart when container is restarted via docker restart

Adds detection for container restarts (e.g., `docker restart <container>`).
When a container is restarted, Docker assigns a new StartedAt timestamp
while keeping the same container name. The monitor now tracks the initial
StartedAt time and compares it on each check to detect restarts.

Changes:
- Add StartedAt field to ContainerInfo to track when the container last started
- Populate StartedAt from the Docker State.StartedAt field
- Monitor checks for StartedAt changes to detect restarts
- Trigger automatic restart logic when a restart is detected
- Update tests to mock GetWorkloadInfo calls

This fixes the issue where restarting a container broke the proxy connection
without triggering automatic reconnection.
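A small, self-contained sketch of the restart-detection idea: record the start time once, then compare it on later polls. The helper name and sample timestamps below are illustrative only:

```go
package main

import (
	"fmt"
	"time"
)

// detectRestart reports whether a later poll sees a different start time than
// the one recorded when monitoring began. Docker resets State.StartedAt on
// `docker restart`, so a changed timestamp under the same container name means
// the container was bounced and the restart logic should run.
func detectRestart(initial, current time.Time) bool {
	return !current.IsZero() && !current.Equal(initial)
}

func main() {
	initial := time.Date(2025, 11, 19, 18, 0, 0, 0, time.UTC)

	// Same timestamp: still the same container run.
	fmt.Println(detectRestart(initial, initial)) // false

	// Docker reports StartedAt as an RFC 3339 string; parse failures fall back
	// to the zero value, mirroring the client.go change in this patch.
	started, err := time.Parse(time.RFC3339Nano, "2025-11-19T18:01:30.123456789Z")
	if err != nil {
		started = time.Time{}
	}
	fmt.Println(detectRestart(initial, started)) // true: the container was restarted

	// Zero value: info was unavailable, so do not report a restart.
	fmt.Println(detectRestart(initial, time.Time{})) // false
}
```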
---
 pkg/container/docker/client.go       | 21 ++++++++++-----
 pkg/container/docker/monitor.go      | 38 +++++++++++++++++++++++-----
 pkg/container/docker/monitor_test.go |  8 ++++++
 pkg/container/runtime/types.go       |  2 ++
 4 files changed, 55 insertions(+), 14 deletions(-)

diff --git a/pkg/container/docker/client.go b/pkg/container/docker/client.go
index c3a51c1a3..7276a6a49 100644
--- a/pkg/container/docker/client.go
+++ b/pkg/container/docker/client.go
@@ -556,14 +556,21 @@ func (c *Client) GetWorkloadInfo(ctx context.Context, workloadName string) (runt
 		created = time.Time{} // Use zero time if parsing fails
 	}
 
+	// Convert start time
+	startedAt, err := time.Parse(time.RFC3339Nano, info.State.StartedAt)
+	if err != nil {
+		startedAt = time.Time{} // Use zero time if parsing fails
+	}
+
 	return runtime.ContainerInfo{
-		Name:    strings.TrimPrefix(info.Name, "/"),
-		Image:   info.Config.Image,
-		Status:  info.State.Status,
-		State:   dockerToDomainStatus(info.State.Status),
-		Created: created,
-		Labels:  info.Config.Labels,
-		Ports:   ports,
+		Name:      strings.TrimPrefix(info.Name, "/"),
+		Image:     info.Config.Image,
+		Status:    info.State.Status,
+		State:     dockerToDomainStatus(info.State.Status),
+		Created:   created,
+		StartedAt: startedAt,
+		Labels:    info.Config.Labels,
+		Ports:     ports,
 	}, nil
 }
 
diff --git a/pkg/container/docker/monitor.go b/pkg/container/docker/monitor.go
index 579840bea..43ca3d9f3 100644
--- a/pkg/container/docker/monitor.go
+++ b/pkg/container/docker/monitor.go
@@ -12,13 +12,14 @@ import (
 
 // ContainerMonitor watches a container's state and reports when it exits
 type ContainerMonitor struct {
-	runtime       runtime.Runtime
-	containerName string
-	stopCh        chan struct{}
-	errorCh       chan error
-	wg            sync.WaitGroup
-	running       bool
-	mutex         sync.Mutex
+	runtime          runtime.Runtime
+	containerName    string
+	stopCh           chan struct{}
+	errorCh          chan error
+	wg               sync.WaitGroup
+	running          bool
+	mutex            sync.Mutex
+	initialStartTime time.Time // Track container start time to detect restarts
 }
 
 // NewMonitor creates a new container monitor
@@ -49,6 +50,13 @@ func (m *ContainerMonitor) StartMonitoring(ctx context.Context) (<-chan error, e
 		return nil, NewContainerError(ErrContainerNotRunning, m.containerName, "container is not running")
 	}
 
+	// Get initial container info to track start time
+	info, err := m.runtime.GetWorkloadInfo(ctx, m.containerName)
+	if err != nil {
+		return nil, NewContainerError(err, m.containerName, fmt.Sprintf("failed to get container info: %v", err))
+	}
+	m.initialStartTime = info.StartedAt
+
 	m.running = true
 	m.wg.Add(1)
 
@@ -129,6 +137,22 @@ func (m *ContainerMonitor) monitor(ctx context.Context) {
 				m.errorCh <- exitErr
 				return
 			}
+
+			// Container is running - check if it was restarted (different start time)
+			infoCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+			info, err := m.runtime.GetWorkloadInfo(infoCtx, m.containerName)
+			cancel()
+			if err == nil && !info.StartedAt.IsZero() && !info.StartedAt.Equal(m.initialStartTime) {
+				// Container was restarted (has a different start time)
+				restartErr := NewContainerError(
+					ErrContainerExited,
+					m.containerName,
+					fmt.Sprintf("Container %s was restarted (start time changed from %s to %s)",
+						m.containerName, m.initialStartTime.Format(time.RFC3339), info.StartedAt.Format(time.RFC3339)),
+				)
+				m.errorCh <- restartErr
+				return
+			}
 		}
 	}
 }
diff --git a/pkg/container/docker/monitor_test.go b/pkg/container/docker/monitor_test.go
index 359944e5e..4043df8fb 100644
--- a/pkg/container/docker/monitor_test.go
+++ b/pkg/container/docker/monitor_test.go
@@ -39,6 +39,10 @@ func TestContainerMonitor_StartMonitoring_WhenRunningStarts(t *testing.T) {
 
 	// StartMonitoring should verify running exactly once on first call.
 	mockRT.EXPECT().IsWorkloadRunning(ctx, "workload-1").Return(true, nil).Times(1)
+	// StartMonitoring now gets the container start time
+	mockRT.EXPECT().GetWorkloadInfo(ctx, "workload-1").Return(rt.ContainerInfo{
+		StartedAt: time.Now(),
+	}, nil).Times(1)
 
 	m := NewMonitor(mockRT, "workload-1")
 	ch, err := m.StartMonitoring(ctx)
@@ -145,6 +149,10 @@ func TestContainerMonitor_StartStop_TerminatesQuickly(t *testing.T) {
 	defer cancel()
 
 	mockRT.EXPECT().IsWorkloadRunning(ctx, "workload-5").Return(true, nil).Times(1)
+	// StartMonitoring now gets the container start time
+	mockRT.EXPECT().GetWorkloadInfo(ctx, "workload-5").Return(rt.ContainerInfo{
+		StartedAt: time.Now(),
+	}, nil).Times(1)
 
 	m := NewMonitor(mockRT, "workload-5")
 	ch, err := m.StartMonitoring(ctx)
diff --git a/pkg/container/runtime/types.go b/pkg/container/runtime/types.go
index 2bbdc75e0..b234b0ae5 100644
--- a/pkg/container/runtime/types.go
+++ b/pkg/container/runtime/types.go
@@ -55,6 +55,8 @@ type ContainerInfo struct {
 	State WorkloadStatus
 	// Created is the container creation timestamp
 	Created time.Time
+	// StartedAt is when the container was last started (changes on restart)
+	StartedAt time.Time
 	// Labels is the container labels
 	Labels map[string]string
 	// Ports is the container port mappings

From d72be3483bb0e28ff8effb3685ebaeec53c08645 Mon Sep 17 00:00:00 2001
From: nigel brown
Date: Thu, 20 Nov 2025 11:51:11 +0000
Subject: [PATCH 15/15] made the DoesWorkloadExist function private

Signed-off-by: nigel brown
---
 pkg/runner/runner.go | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pkg/runner/runner.go b/pkg/runner/runner.go
index 532fa3c18..41f331308 100644
--- a/pkg/runner/runner.go
+++ b/pkg/runner/runner.go
@@ -419,7 +419,7 @@ func (r *Runner) Run(ctx context.Context) error {
 	// Check if workload still exists (using status manager and runtime)
 	// If it doesn't exist, it was removed - clean up client config
 	// If it exists, it exited unexpectedly - signal restart needed
-	exists, checkErr := r.DoesWorkloadExist(ctx, r.Config.BaseName)
+	exists, checkErr := r.doesWorkloadExist(ctx, r.Config.BaseName)
 	if checkErr != nil {
 		logger.Warnf("Warning: Failed to check if workload exists: %v", checkErr)
 		// Assume restart needed if we can't check
@@ -457,10 +457,10 @@ func (r *Runner) Run(ctx context.Context) error {
 	return nil
 }
 
-// DoesWorkloadExist checks if a workload exists in the status manager and runtime.
+// doesWorkloadExist checks if a workload exists in the status manager and runtime.
 // For remote workloads, it trusts the status manager.
 // For container workloads, it verifies the container exists in the runtime.
-func (r *Runner) DoesWorkloadExist(ctx context.Context, workloadName string) (bool, error) {
+func (r *Runner) doesWorkloadExist(ctx context.Context, workloadName string) (bool, error) {
 	// Check if workload exists by trying to get it from status manager
 	workload, err := r.statusManager.GetWorkload(ctx, workloadName)
 	if err != nil {
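Because StartMonitoring now fetches the container's start time up front, any test double for the runtime has to answer GetWorkloadInfo as well as IsWorkloadRunning (the real tests use generated gomock mocks). A hand-rolled sketch of that startup contract, with stand-in types that cover only the calls the monitor needs:

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// ContainerInfo mirrors only the field the monitor reads at startup.
type ContainerInfo struct {
	StartedAt time.Time
}

// monitorRuntime is a hypothetical slice of the runtime interface covering the
// two calls the monitor startup path makes.
type monitorRuntime interface {
	IsWorkloadRunning(ctx context.Context, name string) (bool, error)
	GetWorkloadInfo(ctx context.Context, name string) (ContainerInfo, error)
}

// fakeRuntime is the kind of double a test now needs: it must answer
// GetWorkloadInfo, not just IsWorkloadRunning.
type fakeRuntime struct {
	running   bool
	startedAt time.Time
}

func (f *fakeRuntime) IsWorkloadRunning(context.Context, string) (bool, error) {
	return f.running, nil
}

func (f *fakeRuntime) GetWorkloadInfo(context.Context, string) (ContainerInfo, error) {
	if !f.running {
		return ContainerInfo{}, errors.New("no such container")
	}
	return ContainerInfo{StartedAt: f.startedAt}, nil
}

// startMonitoring sketches the updated startup sequence: confirm the container
// is running, then record its start time so later polls can detect restarts.
func startMonitoring(ctx context.Context, rt monitorRuntime, name string) (time.Time, error) {
	running, err := rt.IsWorkloadRunning(ctx, name)
	if err != nil || !running {
		return time.Time{}, errors.New("container is not running")
	}
	info, err := rt.GetWorkloadInfo(ctx, name)
	if err != nil {
		return time.Time{}, err
	}
	return info.StartedAt, nil
}

func main() {
	rt := &fakeRuntime{running: true, startedAt: time.Now()}
	started, err := startMonitoring(context.Background(), rt, "workload-1")
	fmt.Println(started.IsZero(), err) // false <nil>: monitoring begins with a recorded start time
}
```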