Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 64 additions & 31 deletions relay/lib/ztlp_relay/gateway_forwarder.ex
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,8 @@ defmodule ZtlpRelay.GatewayForwarder do
Prefers dynamic gateways registered for this service; falls back to
static gateways if no dynamic match. Returns :error if none available.
"""
@spec pick_gateway_for_service(String.t()) :: {:ok, {:inet.ip_address(), non_neg_integer()}} | :error
@spec pick_gateway_for_service(String.t()) ::
{:ok, {:inet.ip_address(), non_neg_integer()}} | :error
def pick_gateway_for_service(service_name) do
GenServer.call(__MODULE__, {:pick_gateway_for_service, service_name})
end
Expand Down Expand Up @@ -281,9 +282,15 @@ defmodule ZtlpRelay.GatewayForwarder do

# Setup NAT 5-tuple table for transparent 0.0.0.0 bypass via the Elixir relay.
if :ets.info(:ztlp_forwarded_quic_tuples, :name) == :undefined do
:ets.new(:ztlp_forwarded_quic_tuples, [:named_table, :public, :set, read_concurrency: true, write_concurrency: true])
:ets.new(:ztlp_forwarded_quic_tuples, [
:named_table,
:public,
:set,
read_concurrency: true,
write_concurrency: true
])
end

# Indexing tuples explicitly bound exclusively to registered gateways natively.
:ets.insert(:ztlp_forwarded_quic_tuples, {:gateway_addr, address})

Expand Down Expand Up @@ -385,6 +392,22 @@ defmodule ZtlpRelay.GatewayForwarder do
#
# See `proto/src/tunnel.rs::encode_service_name` and
# `gateway/lib/ztlp_gateway/packet.ex::service_hash/1`.
#
# v0.29.4 STRICT-ROUTING: when the caller passes an explicit non-zero
# service intent (real hash OR real name) and NO registered gateway
# matches, we return `:error` directly instead of round-robining over
# all other registered tenants. The pre-v0.29.4 fallback was a silent
# cross-tenant route footgun — a CLI asking for `gw-test-org-2` could
# land on `gw-hermese2e-1779353410` and render the wrong tenant's
# Bootstrap UI when the gateway-index happened to point that way.
#
# The all-zero 16-byte hash (no-service-preference sentinel) keeps the
# legacy round-robin fallback. `forward_hello_to_gateway` in
# `udp_listener.ex` already short-circuits nil/<<0::128>> through
# `pick_gateway/0`, but we honor it here too for direct internal
# callers that pass us the sentinel by mistake.
all_zero_hash? = service_name == <<0::128>>

matches_service =
case service_name do
<<hash::binary-size(16)>> when bit_size(hash) == 128 ->
Expand All @@ -406,31 +429,44 @@ defmodule ZtlpRelay.GatewayForwarder do

case service_gateways do
[] ->
# No dynamic gateway for this specific service — fall back to ANY available gateway.
# First try static gateways, then try ALL dynamic gateways regardless of service name.
# This prevents phone sessions from falling into half-open mode when the service name
# doesn't exactly match a registered gateway service.
all_fallback =
case state.gateways do
# No live gateway registered for the requested service.
#
# If the caller passed the all-zero "no preference" sentinel, keep
# the legacy round-robin fallback (back-compat for pre-Option-C
# clients and any direct internal caller using the sentinel).
#
# Otherwise — explicit service hash or name with no match — we
# return `:error` and let the call site surface the failure.
# The QUIC CLIENT_ROUTE path in `udp_listener.ex` already logs
# `"rejected: no gateway registered for service=..."` on `:error`;
# the HELLO `forward_hello_to_gateway` path drops into a half-open
# session that the client times out on.
if all_zero_hash? do
all_fallback =
case state.gateways do
[] ->
state.dynamic_gateways
|> Enum.filter(fn gw -> gw.expires_at > now end)
|> Enum.map(fn gw -> gw.address end)
|> Enum.uniq()

static ->
static
end

case all_fallback do
[] ->
# No static gateways — try all dynamic gateways regardless of service
state.dynamic_gateways
|> Enum.filter(fn gw -> gw.expires_at > now end)
|> Enum.map(fn gw -> gw.address end)
|> Enum.uniq()

static ->
static
end
{:reply, :error, state}

case all_fallback do
[] ->
{:reply, :error, state}

_ ->
index = rem(state.gateway_index, length(all_fallback))
gateway = Enum.at(all_fallback, index)
{:reply, {:ok, gateway}, %{state | gateway_index: index + 1}}
_ ->
index = rem(state.gateway_index, length(all_fallback))
gateway = Enum.at(all_fallback, index)
{:reply, {:ok, gateway}, %{state | gateway_index: index + 1}}
end
else
# Strict path: explicit service requested, no match → :error.
# Do NOT silently route to a different tenant.
{:reply, :error, state}
end

_ ->
Expand All @@ -447,8 +483,7 @@ defmodule ZtlpRelay.GatewayForwarder do
def handle_call(:enabled?, _from, state) do
now = System.monotonic_time(:second)

has_dynamic =
Enum.any?(state.dynamic_gateways, fn gw -> gw.expires_at > now end)
has_dynamic = Enum.any?(state.dynamic_gateways, fn gw -> gw.expires_at > now end)

{:reply, state.gateways != [] or has_dynamic, state}
end
Expand Down Expand Up @@ -509,9 +544,7 @@ defmodule ZtlpRelay.GatewayForwarder do
Enum.split_with(state.dynamic_gateways, fn gw -> gw.expires_at > now_s end)

if expired != [] do
Logger.info(
"[GatewayForwarder] Expired #{length(expired)} dynamic gateway registration(s)"
)
Logger.info("[GatewayForwarder] Expired #{length(expired)} dynamic gateway registration(s)")
end

Process.send_after(self(), :cleanup, 60_000)
Expand Down
70 changes: 62 additions & 8 deletions relay/test/ztlp_relay/gateway_forwarder_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -128,11 +128,20 @@ defmodule ZtlpRelay.GatewayForwarderTest do
assert {:ok, ^gateway_addr} = GatewayForwarder.pick_gateway_for_service(service_name)
end

test "pick_gateway_for_service/1 with a 16-byte hash that does not match any registered service falls back instead of selecting the wrong gateway by accident" do
# Negative case: when no registered gateway matches the wire hash,
# the function falls back to the existing round-robin/static-gateway
# path. We assert the function returns a gateway (any gateway, since
# there are some registered) but does NOT crash on the hash input.
test "pick_gateway_for_service/1 with a 16-byte hash that does not match any registered service returns :error and does NOT silently round-robin to a different tenant" do
# SECURITY-CRITICAL: when the caller passes an explicit service hash
# (i.e. they want gw-tenant-X, not just any gateway), the relay MUST
# NOT silently fall back to whichever other tenant happens to be next
# in the round-robin index. That was the v0.29.0..v0.29.2 footgun:
# a client asking for gw-test-org-2 could be silently routed to
# gw-hermese2e-1779353410 and see the WRONG TENANT's Bootstrap UI.
#
# Strict-routing contract:
# * explicit non-zero service hash with NO match → :error
# * empty / all-zero service hash → fall back to
# `pick_gateway/0` round-robin (unchanged behavior)
#
# This test pins the strict path for non-matching hashes.

node_id_a = :crypto.strong_rand_bytes(16)
node_id_b = :crypto.strong_rand_bytes(16)
Expand All @@ -149,9 +158,54 @@ defmodule ZtlpRelay.GatewayForwarderTest do
|> :binary.part(0, 16)

result = GatewayForwarder.pick_gateway_for_service(hash)
# Must not crash. Falls back to round-robin over all dynamic gateways
# (matching pre-existing behavior for unrecognized service strings).
assert match?({:ok, _}, result) or result == :error

# MUST be :error. Returning {:ok, gw_a} or {:ok, gw_b} would mean a
# silent cross-tenant route — exactly the bug Task #2 of the v0.29.3
# handoff was filed to prevent.
assert result == :error,
"pick_gateway_for_service/1 with an unknown service hash must return :error, " <>
"not silently round-robin to a different tenant. Got: #{inspect(result)}"
end

test "pick_gateway_for_service/1 with an unknown service NAME (string form) also returns :error" do
# Same strict contract for the legacy string-name caller path.
# Internal Elixir callers that still pass a name string instead of a
# hash get the same hard-error semantics — no surprise round-robin.

node_id = :crypto.strong_rand_bytes(16)
gw = {{10, 0, 77, 1}, 23097}

GatewayForwarder.register_dynamic_gateway(gw, node_id, "gw-real-tenant", 60)
Process.sleep(20)

result = GatewayForwarder.pick_gateway_for_service("gw-does-not-exist")

assert result == :error,
"pick_gateway_for_service/1 with an unknown service name must return :error. " <>
"Got: #{inspect(result)}"
end

test "pick_gateway_for_service/1 with an all-zero (no-preference) hash falls back to round-robin" do
# Backwards-compat sanity: the all-zero 16-byte hash is the
# \"no service preference\" sentinel (see `forward_hello_to_gateway`
# in `udp_listener.ex` — it routes nil and <<0::128>> through
# `pick_gateway/0`, never through `pick_gateway_for_service/1`).
#
# If a caller does pass us the all-zero hash directly, treat it as
# the same \"any gateway\" intent rather than the strict-error path,
# so the legacy semantics still hold for pre-Option-C clients.

node_id = :crypto.strong_rand_bytes(16)
gw = {{10, 0, 66, 1}, 23097}

GatewayForwarder.register_dynamic_gateway(gw, node_id, "gw-some-tenant", 60)
Process.sleep(20)

result = GatewayForwarder.pick_gateway_for_service(<<0::128>>)

assert match?({:ok, _}, result),
"all-zero hash should fall back to round-robin, not strict-error. " <>
"Got: #{inspect(result)}"
end

test "update_client/2 rewrites the peer index so old client address is no longer routable" do
Expand Down
Loading