Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
d4a9bc0
fix: automatically handle container exits and reconnect clients
therealnb Nov 18, 2025
ab3d5fa
Superfluous - info is in the PR
therealnb Nov 18, 2025
5c57fd1
fix: apply gofmt formatting
therealnb Nov 18, 2025
f3352cf
fix: address linter errors (line length, unused params)
therealnb Nov 18, 2025
eb04391
Merge branch 'main' into fix/container-exit-reconnection
therealnb Nov 18, 2025
01bdd4d
test: add tests for container exit handling and automatic restart
therealnb Nov 18, 2025
6576b90
fix: apply gci import formatting to test files
therealnb Nov 18, 2025
3e2327a
Merge branch 'main' into fix/container-exit-reconnection
therealnb Nov 18, 2025
13fb72a
Merge branch 'main' into fix/container-exit-reconnection
therealnb Nov 18, 2025
569eb56
Merge branch 'main' into fix/container-exit-reconnection
therealnb Nov 19, 2025
63f9c3f
Merge branch 'main' into fix/container-exit-reconnection
therealnb Nov 19, 2025
5e35010
Merge branch 'main' into fix/container-exit-reconnection
therealnb Nov 19, 2025
6b094bf
Merge branch 'main' into fix/container-exit-reconnection
therealnb Nov 19, 2025
7612120
ci: clear Go module cache before downloading dependencies
therealnb Nov 19, 2025
8c1facb
ci: explicitly download go-winio dependency before mod download
therealnb Nov 19, 2025
c21f253
ci: disable setup-go cache and handle module caching manually
therealnb Nov 19, 2025
d65c0f0
fix: race condition in TransparentProxy server access
therealnb Nov 19, 2025
a274539
revert: remove unnecessary workflow changes
therealnb Nov 19, 2025
df350a4
fix: address PR review comments - use typed errors and simplify runner
therealnb Nov 19, 2025
0a46a13
fix: check container existence instead of running state for restart l…
therealnb Nov 19, 2025
ac6a8e6
fix: detect and restart when container is restarted via docker restart
therealnb Nov 19, 2025
baf9204
Merge branch 'main' into fix/container-exit-reconnection
therealnb Nov 19, 2025
95d3c9b
Merge branch 'main' into fix/container-exit-reconnection
therealnb Nov 20, 2025
d72be34
made the DoesWorkloadExist function private
therealnb Nov 20, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 14 additions & 7 deletions pkg/container/docker/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -556,14 +556,21 @@ func (c *Client) GetWorkloadInfo(ctx context.Context, workloadName string) (runt
created = time.Time{} // Use zero time if parsing fails
}

// Convert start time
startedAt, err := time.Parse(time.RFC3339Nano, info.State.StartedAt)
if err != nil {
startedAt = time.Time{} // Use zero time if parsing fails
}

return runtime.ContainerInfo{
Name: strings.TrimPrefix(info.Name, "/"),
Image: info.Config.Image,
Status: info.State.Status,
State: dockerToDomainStatus(info.State.Status),
Created: created,
Labels: info.Config.Labels,
Ports: ports,
Name: strings.TrimPrefix(info.Name, "/"),
Image: info.Config.Image,
Status: info.State.Status,
State: dockerToDomainStatus(info.State.Status),
Created: created,
StartedAt: startedAt,
Labels: info.Config.Labels,
Ports: ports,
}, nil
}

Expand Down
3 changes: 3 additions & 0 deletions pkg/container/docker/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ var (

// ErrContainerExited is returned when a container has exited unexpectedly
ErrContainerExited = fmt.Errorf("container exited unexpectedly")

// ErrContainerRemoved is returned when a container has been removed
ErrContainerRemoved = fmt.Errorf("container removed")
)

// ContainerError represents an error related to container operations
Expand Down
48 changes: 36 additions & 12 deletions pkg/container/docker/monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,14 @@ import (

// ContainerMonitor watches a container's state and reports when it exits
type ContainerMonitor struct {
runtime runtime.Runtime
containerName string
stopCh chan struct{}
errorCh chan error
wg sync.WaitGroup
running bool
mutex sync.Mutex
runtime runtime.Runtime
containerName string
stopCh chan struct{}
errorCh chan error
wg sync.WaitGroup
running bool
mutex sync.Mutex
initialStartTime time.Time // Track container start time to detect restarts
}

// NewMonitor creates a new container monitor
Expand Down Expand Up @@ -49,6 +50,13 @@ func (m *ContainerMonitor) StartMonitoring(ctx context.Context) (<-chan error, e
return nil, NewContainerError(ErrContainerNotRunning, m.containerName, "container is not running")
}

// Get initial container info to track start time
info, err := m.runtime.GetWorkloadInfo(ctx, m.containerName)
if err != nil {
return nil, NewContainerError(err, m.containerName, fmt.Sprintf("failed to get container info: %v", err))
}
m.initialStartTime = info.StartedAt

m.running = true
m.wg.Add(1)

Expand Down Expand Up @@ -97,14 +105,14 @@ func (m *ContainerMonitor) monitor(ctx context.Context) {
running, err := m.runtime.IsWorkloadRunning(checkCtx, m.containerName)
cancel() // Always cancel the context to avoid leaks
if err != nil {
// If the container is not found, it may have been removed
// If the container is not found, it has been removed
if IsContainerNotFound(err) {
exitErr := NewContainerError(
ErrContainerExited,
removeErr := NewContainerError(
ErrContainerRemoved,
m.containerName,
fmt.Sprintf("Container %s (%s) not found, it may have been removed", m.containerName, m.containerName),
fmt.Sprintf("Container %s not found, it has been removed", m.containerName),
)
m.errorCh <- exitErr
m.errorCh <- removeErr
return
}

Expand All @@ -129,6 +137,22 @@ func (m *ContainerMonitor) monitor(ctx context.Context) {
m.errorCh <- exitErr
return
}

// Container is running - check if it was restarted (different start time)
infoCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
info, err := m.runtime.GetWorkloadInfo(infoCtx, m.containerName)
cancel()
if err == nil && !info.StartedAt.IsZero() && !info.StartedAt.Equal(m.initialStartTime) {
// Container was restarted (has a different start time)
restartErr := NewContainerError(
ErrContainerExited,
m.containerName,
fmt.Sprintf("Container %s was restarted (start time changed from %s to %s)",
m.containerName, m.initialStartTime.Format(time.RFC3339), info.StartedAt.Format(time.RFC3339)),
)
m.errorCh <- restartErr
return
}
}
}
}
Expand Down
8 changes: 8 additions & 0 deletions pkg/container/docker/monitor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@ func TestContainerMonitor_StartMonitoring_WhenRunningStarts(t *testing.T) {

// StartMonitoring should verify running exactly once on first call.
mockRT.EXPECT().IsWorkloadRunning(ctx, "workload-1").Return(true, nil).Times(1)
// StartMonitoring now gets the container start time
mockRT.EXPECT().GetWorkloadInfo(ctx, "workload-1").Return(rt.ContainerInfo{
StartedAt: time.Now(),
}, nil).Times(1)

m := NewMonitor(mockRT, "workload-1")
ch, err := m.StartMonitoring(ctx)
Expand Down Expand Up @@ -145,6 +149,10 @@ func TestContainerMonitor_StartStop_TerminatesQuickly(t *testing.T) {
defer cancel()

mockRT.EXPECT().IsWorkloadRunning(ctx, "workload-5").Return(true, nil).Times(1)
// StartMonitoring now gets the container start time
mockRT.EXPECT().GetWorkloadInfo(ctx, "workload-5").Return(rt.ContainerInfo{
StartedAt: time.Now(),
}, nil).Times(1)

m := NewMonitor(mockRT, "workload-5")
ch, err := m.StartMonitoring(ctx)
Expand Down
2 changes: 2 additions & 0 deletions pkg/container/runtime/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ type ContainerInfo struct {
State WorkloadStatus
// Created is the container creation timestamp
Created time.Time
// StartedAt is when the container was last started (changes on restart)
StartedAt time.Time
// Labels is the container labels
Labels map[string]string
// Ports is the container port mappings
Expand Down
84 changes: 81 additions & 3 deletions pkg/runner/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ package runner
import (
"bytes"
"context"
"errors"
"fmt"
"net/http"
"os"
Expand All @@ -18,6 +19,7 @@ import (
"github.com/stacklok/toolhive/pkg/auth/remote"
"github.com/stacklok/toolhive/pkg/client"
"github.com/stacklok/toolhive/pkg/config"
ct "github.com/stacklok/toolhive/pkg/container"
rt "github.com/stacklok/toolhive/pkg/container/runtime"
"github.com/stacklok/toolhive/pkg/labels"
"github.com/stacklok/toolhive/pkg/logger"
Expand Down Expand Up @@ -383,7 +385,7 @@ func (r *Runner) Run(ctx context.Context) error {
}
if !running {
// Transport is no longer running (container exited or was stopped)
logger.Info("Transport is no longer running, exiting...")
logger.Warn("Transport is no longer running, attempting automatic restart...")
close(doneCh)
return
}
Expand All @@ -404,7 +406,7 @@ func (r *Runner) Run(ctx context.Context) error {
case sig := <-sigCh:
stopMCPServer(fmt.Sprintf("Received signal %s", sig))
case <-doneCh:
// The transport has already been stopped (likely by the container monitor)
// The transport has already been stopped (likely by the container exit)
// Clean up the PID file and state
// TODO: Stop writing to PID file once we migrate over to statuses.
if err := process.RemovePIDFile(r.Config.BaseName); err != nil {
Expand All @@ -414,12 +416,88 @@ func (r *Runner) Run(ctx context.Context) error {
logger.Warnf("Warning: Failed to reset workload %s PID: %v", r.Config.BaseName, err)
}

logger.Infof("MCP server %s stopped", r.Config.ContainerName)
// Check if workload still exists (using status manager and runtime)
// If it doesn't exist, it was removed - clean up client config
// If it exists, it exited unexpectedly - signal restart needed
exists, checkErr := r.doesWorkloadExist(ctx, r.Config.BaseName)
if checkErr != nil {
logger.Warnf("Warning: Failed to check if workload exists: %v", checkErr)
// Assume restart needed if we can't check
} else if !exists {
// Workload doesn't exist in `thv ls` - it was removed
logger.Infof(
"Workload %s no longer exists. Removing from client configurations.",
r.Config.BaseName,
)
clientManager, clientErr := client.NewManager(ctx)
if clientErr == nil {
removeErr := clientManager.RemoveServerFromClients(
ctx,
r.Config.ContainerName,
r.Config.Group,
)
if removeErr != nil {
logger.Warnf("Warning: Failed to remove from client config: %v", removeErr)
} else {
logger.Infof(
"Successfully removed %s from client configurations",
r.Config.ContainerName,
)
}
}
logger.Infof("MCP server %s stopped and cleaned up", r.Config.ContainerName)
return nil // Exit gracefully, no restart
}

// Workload still exists - signal restart needed
logger.Infof("MCP server %s stopped, restart needed", r.Config.ContainerName)
return fmt.Errorf("container exited, restart needed")
}

return nil
}

// doesWorkloadExist checks if a workload exists in the status manager and runtime.
// For remote workloads, it trusts the status manager.
// For container workloads, it verifies the container exists in the runtime.
func (r *Runner) doesWorkloadExist(ctx context.Context, workloadName string) (bool, error) {
// Check if workload exists by trying to get it from status manager
workload, err := r.statusManager.GetWorkload(ctx, workloadName)
if err != nil {
if errors.Is(err, rt.ErrWorkloadNotFound) {
return false, nil
}
return false, fmt.Errorf("failed to check if workload exists: %w", err)
}

// If remote workload, check if it should exist
if workload.Remote {
// For remote workloads, trust the status manager
return workload.Status != rt.WorkloadStatusError, nil
}

// For container workloads, verify the container actually exists in the runtime
// Create a runtime instance to check if container exists
backend, err := ct.NewFactory().Create(ctx)
if err != nil {
logger.Warnf("Failed to create runtime to check container existence: %v", err)
// Fall back to status manager only
return workload.Status != rt.WorkloadStatusError, nil
}

// Check if container exists in the runtime (not just running)
// GetWorkloadInfo will return an error if the container doesn't exist
_, err = backend.GetWorkloadInfo(ctx, workloadName)
if err != nil {
// Container doesn't exist
logger.Debugf("Container %s not found in runtime: %v", workloadName, err)
return false, nil
}

// Container exists (may be running or stopped)
return true, nil
}

// handleRemoteAuthentication handles authentication for remote MCP servers
func (r *Runner) handleRemoteAuthentication(ctx context.Context) (oauth2.TokenSource, error) {
if r.Config.RemoteAuthConfig == nil {
Expand Down
41 changes: 37 additions & 4 deletions pkg/transport/http.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package transport

import (
"context"
"errors"
"fmt"
"net/http"
"net/url"
Expand All @@ -10,9 +11,10 @@ import (
"golang.org/x/oauth2"

"github.com/stacklok/toolhive/pkg/container"
"github.com/stacklok/toolhive/pkg/container/docker"
rt "github.com/stacklok/toolhive/pkg/container/runtime"
"github.com/stacklok/toolhive/pkg/logger"
"github.com/stacklok/toolhive/pkg/transport/errors"
transporterrors "github.com/stacklok/toolhive/pkg/transport/errors"
"github.com/stacklok/toolhive/pkg/transport/middleware"
"github.com/stacklok/toolhive/pkg/transport/proxy/transparent"
"github.com/stacklok/toolhive/pkg/transport/types"
Expand Down Expand Up @@ -58,6 +60,10 @@ type HTTPTransport struct {
// Container monitor
monitor rt.Monitor
errorCh <-chan error

// Container exit error (for determining if restart is needed)
containerExitErr error
exitErrMutex sync.Mutex
}

// NewHTTPTransport creates a new HTTP transport.
Expand Down Expand Up @@ -165,7 +171,7 @@ func (t *HTTPTransport) Start(ctx context.Context) error {
t.proxyPort, targetURI)
} else {
if t.containerName == "" {
return errors.ErrContainerNameNotSet
return transporterrors.ErrContainerNameNotSet
}

// For local containers, use the configured target URI
Expand Down Expand Up @@ -273,14 +279,41 @@ func (t *HTTPTransport) handleContainerExit(ctx context.Context) {
case <-ctx.Done():
return
case err := <-t.errorCh:
logger.Infof("Container %s exited: %v", t.containerName, err)
// Stop the transport when the container exits
// Store the exit error so runner can check if restart is needed
t.exitErrMutex.Lock()
t.containerExitErr = err
t.exitErrMutex.Unlock()

logger.Warnf("Container %s exited: %v", t.containerName, err)

// Check if container was removed (not just exited) using typed error
if errors.Is(err, docker.ErrContainerRemoved) {
logger.Infof("Container %s was removed. Stopping proxy and cleaning up.", t.containerName)
} else {
logger.Infof("Container %s exited. Will attempt automatic restart.", t.containerName)
}

// Stop the transport when the container exits/removed
if stopErr := t.Stop(ctx); stopErr != nil {
logger.Errorf("Error stopping transport after container exit: %v", stopErr)
}
}
}

// ShouldRestart returns true if the container exited and should be restarted.
// Returns false if the container was removed (intentionally deleted).
func (t *HTTPTransport) ShouldRestart() bool {
t.exitErrMutex.Lock()
defer t.exitErrMutex.Unlock()

if t.containerExitErr == nil {
return false // No exit error, normal shutdown
}

// Don't restart if container was removed (use typed error check)
return !errors.Is(t.containerExitErr, docker.ErrContainerRemoved)
}

// IsRunning checks if the transport is currently running.
func (t *HTTPTransport) IsRunning(_ context.Context) (bool, error) {
t.mutex.Lock()
Expand Down
Loading
Loading