From b4b3b082e37698aa8daeb043d070ee40844d9598 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Thu, 2 Oct 2025 15:04:32 +1000 Subject: [PATCH 01/10] clang-tidy: Suppress redundant expression warning clang-tidy 20.1.8 doesn't like (VHOST_USER_MAX_VQS / 2), because it expands to (2 / 2). But in the context of the #define, this makes logical sense, so suppress the warning. I'm not sure why it isn't firing on the debug() line just below. Possibly it only complains once per expression per function, so we only have to suppress it once? Signed-off-by: David Gibson Signed-off-by: Stefano Brivio --- vhost_user.c | 1 + 1 file changed, 1 insertion(+) diff --git a/vhost_user.c b/vhost_user.c index fa343a86..223332d5 100644 --- a/vhost_user.c +++ b/vhost_user.c @@ -939,6 +939,7 @@ static bool vu_get_queue_num_exec(struct vu_dev *vdev, { (void)vdev; + /* NOLINTNEXTLINE(misc-redundant-expression) */ vmsg_set_reply_u64(vmsg, VHOST_USER_MAX_VQS / 2); debug("VHOST_USER_MAX_VQS %u", VHOST_USER_MAX_VQS / 2); From 065d199f9df54b3cbe77a52da5b80d12f2bc7d85 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Thu, 2 Oct 2025 15:04:33 +1000 Subject: [PATCH 02/10] cppcheck: Suppress the suppression of a suppression Apparently cppcheck 2.18.3 no longer trips the funcArgNamesDifferent warning on our (re-)definition of close_range(). So instead we get an unmatchedSuppression warning. Signed-off-by: David Gibson Signed-off-by: Stefano Brivio --- linux_dep.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/linux_dep.h b/linux_dep.h index 1d9e1669..89e590c8 100644 --- a/linux_dep.h +++ b/linux_dep.h @@ -142,7 +142,7 @@ struct tcp_info_linux { #endif __attribute__ ((weak)) -/* cppcheck-suppress funcArgNamesDifferent */ +/* cppcheck-suppress [funcArgNamesDifferent,unmatchedSuppression] */ int close_range(unsigned int first, unsigned int last, int flags) { return syscall(SYS_close_range, first, last, flags); } From ee9b2361d04027669bff8df4dc10722bda1c1333 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Thu, 2 Oct 2025 15:04:34 +1000 Subject: [PATCH 03/10] cppcheck: Suppress a buggy cppcheck warning Another cppcheck package in Fedora, another bogus false positive. This one seems to not realise that a variable has been initialised by getsockopt() under a complicated set of circumstances. Signed-off-by: David Gibson Signed-off-by: Stefano Brivio --- tcp.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tcp.c b/tcp.c index 48b1ef29..7da41797 100644 --- a/tcp.c +++ b/tcp.c @@ -1030,6 +1030,11 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, return 0; } + /* This trips a cppcheck bug in some versions, including + * cppcheck 2.18.3. + * https://sourceforge.net/p/cppcheck/discussion/general/thread/fecde59085/ + */ + /* cppcheck-suppress [uninitvar,unmatchedSuppression] */ conn->seq_ack_to_tap = tinfo->tcpi_bytes_acked + conn->seq_init_from_tap; From 2274c3a520dfa41e440db93f15153ff426694331 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Thu, 2 Oct 2025 15:04:35 +1000 Subject: [PATCH 04/10] cppcheck: Suppress variable scope warnings in dhcpv6() At least some cppcheck versions (2.18.3 for me) complain that the _storage variables in dhcpv6() could be reduced in scope. That's not actually the case, because although we don't reference the variables, we may touch their memory via pointers after the blocks in question. There's no reasonable way for cppcheck to determine that, though, so suppress its warnings. Signed-off-by: David Gibson Signed-off-by: Stefano Brivio --- dhcpv6.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/dhcpv6.c b/dhcpv6.c index c1a27aba..e4df0db5 100644 --- a/dhcpv6.c +++ b/dhcpv6.c @@ -550,10 +550,18 @@ int dhcpv6(struct ctx *c, struct iov_tail *data, { const struct opt_server_id *server_id = NULL; const struct opt_hdr *client_id = NULL; + /* The _storage variables can't be local to the blocks they're used in, + * because IOV_*_HEADER() may return pointers to them which are + * dereferenced afterwards. Since we don't have Rust-like lifetime + * tracking, cppcheck can't reasonably determine that, so we must + * suppress its warnings. */ + /* cppcheck-suppress [variableScope,unmatchedSuppression] */ struct opt_server_id server_id_storage; struct iov_tail opt, client_id_base; const struct opt_ia_na *ia = NULL; + /* cppcheck-suppress [variableScope,unmatchedSuppression] */ struct opt_hdr client_id_storage; + /* cppcheck-suppress [variableScope,unmatchedSuppression] */ struct opt_ia_na ia_storage; const struct in6_addr *src; struct msg_hdr mh_storage; From 81fd66a31e44a07ee98f22946104f951245123e5 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Thu, 2 Oct 2025 15:04:36 +1000 Subject: [PATCH 05/10] test: Don't delete exetool on make clean exetool is listed in $(LOCAL_ASSETS), which seems like it makes sense, until you realise that $(LOCAL_ASSETS) are deleted on make clean, and we can't rebuild exetool after deleting it (since it's just part of the exeter tree we download). Move it to $(DOWNLOAD_ASSETS) instead. Signed-off-by: David Gibson Signed-off-by: Stefano Brivio --- test/Makefile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/test/Makefile b/test/Makefile index 49388276..3800a0a9 100644 --- a/test/Makefile +++ b/test/Makefile @@ -52,15 +52,14 @@ UBUNTU_NEW_IMGS = xenial-server-cloudimg-powerpc-disk1.img \ jammy-server-cloudimg-s390x.img UBUNTU_IMGS = $(UBUNTU_OLD_IMGS) $(UBUNTU_NEW_IMGS) -DOWNLOAD_ASSETS = exeter mbuto podman \ +DOWNLOAD_ASSETS = $(EXETOOL) mbuto podman \ $(DEBIAN_IMGS) $(FEDORA_IMGS) $(OPENSUSE_IMGS) $(UBUNTU_IMGS) TESTDATA_ASSETS = small.bin big.bin medium.bin \ rampstream LOCAL_ASSETS = mbuto.img mbuto.mem.img podman/bin/podman QEMU_EFI.fd \ $(DEBIAN_IMGS:%=prepared-%) $(FEDORA_IMGS:%=prepared-%) \ $(UBUNTU_NEW_IMGS:%=prepared-%) \ - nstool guest-key guest-key.pub $(EXETOOL) \ - $(TESTDATA_ASSETS) + nstool guest-key guest-key.pub $(TESTDATA_ASSETS) ASSETS = $(DOWNLOAD_ASSETS) $(LOCAL_ASSETS) From 2a16cdf3e605a1c72647ccd520106a1e59ec8d50 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Thu, 2 Oct 2025 15:04:37 +1000 Subject: [PATCH 06/10] test: Add linting of Python test scripts We currently have one test moved to the new exeter based framwork written in Python. We plan to add many more, so add linting (flake8) and type checking (mypy) of those scripts. This can be invoked manually with "make flake8" or "make mypy" in test/, and is also added to the static checkers test set. Signed-off-by: David Gibson Signed-off-by: Stefano Brivio --- test/Makefile | 15 ++++++++++++++- test/build/build.py | 5 +++-- test/build/static_checkers.sh | 6 +++++- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/test/Makefile b/test/Makefile index 3800a0a9..5b5f0fc1 100644 --- a/test/Makefile +++ b/test/Makefile @@ -8,6 +8,8 @@ BATS = bats -j $(shell nproc) EXETOOL = exeter/exetool/exetool WGET = wget -c +FLAKE8 = flake8 +MYPY = mypy --strict DEBIAN_IMGS = debian-8.11.0-openstack-amd64.qcow2 \ debian-10-nocloud-amd64.qcow2 \ @@ -64,11 +66,15 @@ LOCAL_ASSETS = mbuto.img mbuto.mem.img podman/bin/podman QEMU_EFI.fd \ ASSETS = $(DOWNLOAD_ASSETS) $(LOCAL_ASSETS) EXETER_PYPATH = exeter/py3 +EXETER_PYTHON = build/build.py EXETER_BATS = smoke/smoke.sh.bats \ - build/build.py.bats build/static_checkers.sh.bats + $(EXETER_PYTHON:%=%.bats) build/static_checkers.sh.bats BATS_FILES = $(EXETER_BATS) \ podman/test/system/505-networking-pasta.bats +# Python test code (for linters) +PYPKGS = $(EXETER_PYTHON) + CFLAGS = -Wall -Werror -Wextra -pedantic -std=c99 assets: $(ASSETS) @@ -127,6 +133,12 @@ medium.bin: big.bin: dd if=/dev/urandom bs=1M count=10 of=$@ +flake8: pull-exeter + PYTHONPATH=$(EXETER_PYPATH) $(FLAKE8) $(PYPKGS) + +mypy: pull-exeter + PYTHONPATH=$(EXETER_PYPATH) $(MYPY) $(PYPKGS) + $(EXETER_BATS): %.bats: % $(EXETOOL) PYTHONPATH=$(EXETER_PYPATH) $(EXETOOL) bats -- $< > $@ @@ -141,6 +153,7 @@ debug: assets clean: rm -f perf.js *~ + rm -rf .mypy_cache rm -f $(LOCAL_ASSETS) rm -f $(EXETER_BATS) rm -rf test_logs diff --git a/test/build/build.py b/test/build/build.py index e49287c9..e3de8305 100755 --- a/test/build/build.py +++ b/test/build/build.py @@ -18,11 +18,12 @@ from pathlib import Path import subprocess import tempfile -from typing import Iterable, Iterator +from typing import Iterator import exeter -def sh(cmd): + +def sh(cmd: str) -> None: """Run given command in a shell""" subprocess.run(cmd, shell=True) diff --git a/test/build/static_checkers.sh b/test/build/static_checkers.sh index 42806e79..228b99ae 100755 --- a/test/build/static_checkers.sh +++ b/test/build/static_checkers.sh @@ -21,6 +21,10 @@ exeter_set_description cppcheck "passt sources pass cppcheck" exeter_register clang_tidy make -C .. clang-tidy exeter_set_description clang_tidy "passt sources pass clang-tidy" -exeter_main "$@" +exeter_register flake8 make flake8 +exeter_set_description flake8 "passt tests in Python pass flake8" +exeter_register mypy make mypy +exeter_set_description mypy "passt tests in Python pass mypy --strict" +exeter_main "$@" From b3217aa5aec10704774c7142ef07daec5e698415 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Tue, 30 Sep 2025 22:26:02 +0200 Subject: [PATCH 07/10] tcp: Fix ACK sequence on FIN to tap If we reach end-of-file on a socket (or get EPOLLRDHUP / EPOLLHUP) and send a FIN segment to the guest / container acknowledging a sequence number that's behind what we received so far, we won't have any further trigger to send an updated ACK segment, as we are now switching the epoll socket monitoring to edge-triggered mode. To avoid this situation, in tcp_update_seqack_wnd(), we set the next acknowledgement sequence to the current observed sequence, regardless of what was acknowledged socket-side. However, we don't necessarily call tcp_update_seqack_wnd() before sending the FIN segment, which might potentially lead to a situation, not observed in practice, where we unnecessarily cause a retransmission at some point after our FIN segment. Avoid that by setting the ACK sequence to whatever we received from the container / guest, before sending a FIN segment and switching to EPOLLET. Signed-off-by: Stefano Brivio Reviewed-by: David Gibson --- tcp_buf.c | 14 +++++++++++++- tcp_vu.c | 7 ++++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/tcp_buf.c b/tcp_buf.c index a493b5a0..cc106bc6 100644 --- a/tcp_buf.c +++ b/tcp_buf.c @@ -368,7 +368,19 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) conn_flag(c, conn, STALLED); } else if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) { - int ret = tcp_buf_send_flag(c, conn, FIN | ACK); + int ret; + + /* On TAP_FIN_SENT, we won't get further data events + * from the socket, and this might be the last ACK + * segment we send to the tap, so update its sequence to + * include everything we received until now. + * + * See also the special handling on CONN_IS_CLOSING() in + * tcp_update_seqack_wnd(). + */ + conn->seq_ack_to_tap = conn->seq_from_tap; + + ret = tcp_buf_send_flag(c, conn, FIN | ACK); if (ret) { tcp_rst(c, conn); return ret; diff --git a/tcp_vu.c b/tcp_vu.c index ebd3a1ed..3ec3538d 100644 --- a/tcp_vu.c +++ b/tcp_vu.c @@ -410,7 +410,12 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) conn_flag(c, conn, STALLED); } else if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) { - int ret = tcp_vu_send_flag(c, conn, FIN | ACK); + int ret; + + /* See tcp_buf_data_from_sock() */ + conn->seq_ack_to_tap = conn->seq_from_tap; + + ret = tcp_vu_send_flag(c, conn, FIN | ACK); if (ret) { tcp_rst(c, conn); return ret; From 8efa80b51f9f52082954aa26719b36a9ec367567 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Thu, 2 Oct 2025 01:13:27 +0200 Subject: [PATCH 08/10] tcp: Completely ignore data segment in CLOSE-WAIT state, log a message MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to RFC 9293 we should ignore data (note: not data segments) in CLOSE-WAIT state (indicated by TAP_FIN_RCVD), see 3.10.7.4 "Other states": [...] Seventh, process the segment text: [...] CLOSE-WAIT STATE This should not occur since a FIN has been received from the remote side. Ignore the segment text. and we almost do that, except that we would look at the data length to decide whether it's a request for fast re-transmission, so fix that, and while at it, log a message, so that cases such as the following one are more apparent in debug logs: 28692 0.009758 88.198.0.164 → 93.235.151.95 54 TCP 55414 → 47080 [FIN, ACK] Seq=121441 Ack=141 Win=65536 Len=0 we should ignore this FIN flag, because we didn't accept data up to this sequence (see next segment), but we don't do it, so, here: 28693 0.000036 93.235.151.95 → 88.198.0.164 54 TCP 47080 → 55414 [ACK] Seq=141 Ack=90722 Win=32128 Len=0 28694 0.034597 93.235.151.95 → 88.198.0.164 54 TCP 47080 → 55414 [FIN, ACK] Seq=141 Ack=90722 Win=121216 Len=0 28695 0.000019 88.198.0.164 → 93.235.151.95 54 TCP 55414 → 47080 [ACK] Seq=121442 Ack=142 Win=65536 Len=0 28696 0.162968 88.198.0.164 → 93.235.151.95 30773 TCP [TCP Retransmission] 55414 → 47080 [FIN, PSH, ACK] Seq=90722 Ack=142 Win=65536 Len=30719 [TCP segment of a reassembled PDU] we are erroneously in CLOSE-WAIT (TAP_FIN_RCVD) state, and this segment would look pretty strange there. This specific case is fixed by the next patch, so it should never happen again. Link: https://archives.passt.top/passt-dev/20250910115726.432bbb8d@elisabeth/ Link: https://bugs.passt.top/show_bug.cgi?id=126 Suggested-by: David Gibson Signed-off-by: Stefano Brivio Reviewed-by: David Gibson --- tcp.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tcp.c b/tcp.c index 7da41797..a648174b 100644 --- a/tcp.c +++ b/tcp.c @@ -2135,9 +2135,15 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, /* Established connections not accepting data from tap */ if (conn->events & TAP_FIN_RCVD) { + size_t dlen; bool retr; - retr = th->ack && !tcp_packet_data_len(th, l4len) && !th->fin && + if ((dlen = tcp_packet_data_len(th, l4len))) { + flow_dbg(conn, "data segment in CLOSE-WAIT (%zu B)", + dlen); + } + + retr = th->ack && !th->fin && ntohl(th->ack_seq) == conn->seq_ack_from_tap && ntohs(th->window) == conn->wnd_from_tap; From b145441913eef6f8885b6b84531e944ff593790c Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Thu, 2 Oct 2025 00:41:54 +0200 Subject: [PATCH 09/10] tcp: Don't consider FIN flags with mismatching sequence MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a guest or container sends us a FIN segment but its sequence number doesn't match the highest sequence of data we *accepted* (not necessarily the highest sequence we received), that is, conn->seq_from_tap, plus any data we're accepting in the current batch, we should discard the flag (not necessarily the segment), because there's still data we need to receive (again) before the end of the stream. If we consider those FIN flags as such, we'll end up in the situation described below. Here, 192.168.10.102 is a HTTP server in a Podman container, and 192.168.10.44 is a client fetching approximately 121 KB of data from it: 82 2.026811 192.168.10.102 → 192.168.10.44 54 TCP 55414 → 44992 [FIN, ACK] Seq=121441 Ack=143 Win=65536 Len=0 the server is done sending 83 2.026898 192.168.10.44 → 192.168.10.102 54 TCP 44992 → 55414 [ACK] Seq=143 Ack=114394 Win=216192 Len=0 pasta (client) acknowledges a previous sequence, because of a short sendmsg() 84 2.027324 192.168.10.44 → 192.168.10.102 54 TCP 44992 → 55414 [FIN, ACK] Seq=143 Ack=114394 Win=216192 Len=0 pasta (client) sends FIN, ACK as the client has no more data to send (a single GET request), while still acknowledging a previous sequence, because the retransmission didn't happen yet 85 2.027349 192.168.10.102 → 192.168.10.44 54 TCP 55414 → 44992 [ACK] Seq=121442 Ack=144 Win=65536 Len=0 the server acknowledges the FIN, ACK 86 2.224125 192.168.10.102 → 192.168.10.44 4150 TCP [TCP Retransmission] 55414 → 44992 [ACK] Seq=114394 Ack=144 Win=65536 Len=4096 [TCP segment of a reassembled PDU] and finally a retransmission comes, but as we wrongly switched to the CLOSE-WAIT state, 87 2.224202 192.168.10.44 → 192.168.10.102 54 TCP 44992 → 55414 [RST] Seq=144 Win=0 Len=0 we consider frame #86 as an acknowledgement for the FIN segment we sent, and close the connection, while we still had to re-receive (and finally send) the missing data segment, instead. Link: https://github.com/containers/podman/issues/27179 Signed-off-by: Stefano Brivio --- tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tcp.c b/tcp.c index a648174b..0cca3de1 100644 --- a/tcp.c +++ b/tcp.c @@ -1774,7 +1774,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, } } - if (th->fin) + if (th->fin && seq == seq_from_tap) fin = 1; if (!len) From a94783951d0d759912dcae0c6289c61fe6a86fc4 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Thu, 2 Oct 2025 00:52:53 +0200 Subject: [PATCH 10/10] tcp: On partial send (incomplete sendmsg()), request a retransmission right away MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ...possibly with an updated window value. As we're discarding the remaining data, we'll need to receive it again. If we don't request a retransmission immediately, we'll see an apparent gap in the sequence, and request a retransmission on the next data batch or segment, but we're just wasting time like that. In packets: 28686 0.000007 88.198.0.164 → 93.235.151.95 16118 TCP 55414 → 47080 [ACK] Seq=80501 Ack=141 Win=65536 Len=16064 [TCP segment of a reassembled PDU] 28687 0.000012 88.198.0.164 → 93.235.151.95 16118 TCP [TCP Window Full] 55414 → 47080 [ACK] Seq=96565 Ack=141 Win=65536 Len=16064 [TCP segment of a reassembled PDU] on this second data segment, we have a short sendmsg(), and 28688 0.000026 93.235.151.95 → 88.198.0.164 54 TCP 47080 → 55414 [ACK] Seq=141 Ack=90721 Win=32128 Len=0 consequently acknowledge it, without requesting a retransmission, 28689 0.000006 88.198.0.164 → 93.235.151.95 8866 HTTP HTTP/1.1 200 ok (text/css) so the server proceeds sending a bit more, but 28690 0.000016 93.235.151.95 → 88.198.0.164 54 TCP [TCP Dup ACK 28688#1] 47080 → 55414 [ACK] Seq=141 Ack=90721 Win=32128 Len=0 28691 0.000000 93.235.151.95 → 88.198.0.164 54 TCP [TCP Dup ACK 28688#2] 47080 → 55414 [ACK] Seq=141 Ack=90721 Win=32128 Len=0 we'll throw that data (from frame #28689) away, and finally request a retransmission as we spotted the gap now. Request a retransmission as soon as we know we'll need it, instead. Signed-off-by: Stefano Brivio Reviewed-by: David Gibson --- tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tcp.c b/tcp.c index 0cca3de1..91aa3301 100644 --- a/tcp.c +++ b/tcp.c @@ -1881,7 +1881,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, } out: - if (keep != -1) { + if (keep != -1 || partial_send) { /* We use an 8-bit approximation here: the associated risk is * that we skip a duplicate ACK on 8-bit sequence number * collision. Fast retransmit is a SHOULD in RFC 5681, 3.2.