Skip to content

Commit

Permalink
Reduce black hole detection false positives
Browse files Browse the repository at this point in the history
Exactly MTU-sized datagrams aren't necessarily transmitted often, if
ever. As a result, black hole detection could previously reset the MTU
regularly at any nonzero packet loss rate.
  • Loading branch information
Ralith committed May 11, 2024
1 parent 61f8ac5 commit 4764764
Showing 1 changed file with 113 additions and 75 deletions.
188 changes: 113 additions & 75 deletions quinn-proto/src/connection/mtud.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use crate::{packet::SpaceId, MtuDiscoveryConfig, MAX_UDP_PAYLOAD};
use std::time::Instant;
use std::{cmp, time::Instant};
use tracing::trace;

/// Implements Datagram Packetization Layer Path Maximum Transmission Unit Discovery
Expand Down Expand Up @@ -104,14 +104,12 @@ impl MtuDiscovery {
self.current_mtu = new_mtu;
trace!(current_mtu = self.current_mtu, "new MTU detected");

self.black_hole_detector.on_probe_acked();
self.black_hole_detector
.on_probe_acked(packet_number, packet_bytes);
true
} else {
self.black_hole_detector.on_non_probe_acked(
self.current_mtu,
packet_number,
packet_bytes,
);
self.black_hole_detector
.on_non_probe_acked(packet_number, packet_bytes);
false
}
}
Expand Down Expand Up @@ -352,134 +350,174 @@ impl SearchState {

#[derive(Clone)]
struct BlackHoleDetector {
/// Counts suspicious packet loss bursts since a packet with size equal to the current MTU was
/// acknowledged (or since a black hole was detected)
/// Packet loss bursts currently considered suspicious
///
/// A packet loss burst is a group of contiguous packets that are deemed lost at the same time
/// (see usages of [`MtuDiscovery::on_non_probe_lost`] for details on how loss detection is
/// implemented)
///
/// A packet loss burst is considered suspicious when it contains only suspicious packets and no
/// MTU-sized packet has been acknowledged since the group's packets were sent
suspicious_loss_bursts: u8,
/// Indicates whether the current loss burst has any non-suspicious packets
///
/// Non-suspicious packets are non-probe packets of size <= `min_mtu`
loss_burst_has_non_suspicious_packets: bool,
/// The largest non-probe packet that was lost (used to keep track of loss bursts)
largest_non_probe_lost: Option<u64>,
/// The largest acked packet of size `current_mtu`
largest_acked_mtu_sized_packet: Option<u64>,
/// A lost packet is considered suspicious when it is larger than both the minimum MTU and the
/// largest acknowledged packet transmitted later than it. A packet loss burst is considered
/// suspicious when it contains only suspicious packets.
suspicious_loss_bursts: Vec<LossBurst>,
/// Loss burst currently being aggregated, if any
current_loss_burst: Option<CurrentLossBurst>,
/// Packet number of the biggest packet larger than `min_mtu` which we've received
/// acknowledgment of more recently than a suspicious loss burst, if any
largest_post_loss_packet: u64,
/// The maximum of `min_mtu` and the size of `largest_post_loss_packet`, or exactly `min_mtu` if
/// no larger packets have been received since the most recent loss burst.
safe_mtu: u16,
/// The UDP payload size guaranteed to be supported by the network
min_mtu: u16,
}

impl BlackHoleDetector {
fn new(min_mtu: u16) -> Self {
Self {
suspicious_loss_bursts: 0,
largest_non_probe_lost: None,
loss_burst_has_non_suspicious_packets: false,
largest_acked_mtu_sized_packet: None,
suspicious_loss_bursts: Vec::with_capacity(BLACK_HOLE_THRESHOLD + 1),
current_loss_burst: None,
largest_post_loss_packet: 0,
safe_mtu: min_mtu,
min_mtu,
}
}

fn on_probe_acked(&mut self) {
// We know for sure the path supports the current MTU
self.suspicious_loss_bursts = 0;
fn on_probe_acked(&mut self, packet_number: u64, packet_bytes: u16) {
debug_assert!(
packet_number >= self.largest_post_loss_packet,
"ACKs are delivered in order"
);
// MTU probes are always larger than the previous MTU, so no previous loss bursts are
// suspicious.
self.suspicious_loss_bursts.clear();
self.safe_mtu = packet_bytes;
self.largest_post_loss_packet = packet_number;
}

fn on_non_probe_acked(&mut self, current_mtu: u16, packet_number: u64, packet_bytes: u16) {
// Reset the black hole counter if a packet the size of the current MTU or larger
// has been acknowledged
if packet_bytes >= current_mtu
&& self
.largest_acked_mtu_sized_packet
.map_or(true, |pn| packet_number > pn)
{
self.suspicious_loss_bursts = 0;
self.largest_acked_mtu_sized_packet = Some(packet_number);
fn on_non_probe_acked(&mut self, packet_number: u64, packet_bytes: u16) {
debug_assert!(
packet_number >= self.largest_post_loss_packet,
"ACKs are delivered in order"
);
if packet_bytes <= self.safe_mtu {
// No impact.
return;
}
self.safe_mtu = packet_bytes;
self.largest_post_loss_packet = packet_number;
// Loss bursts packets smaller than this are retroactively deemed non-suspicious.
self.suspicious_loss_bursts
.retain(|burst| burst.smallest_packet_size > packet_bytes);
}

fn on_non_probe_lost(&mut self, packet_number: u64, packet_bytes: u16) {
// A loss burst is a group of consecutive packets that are declared lost, so a distance
// greater than 1 indicates a new burst
let new_loss_burst = self
.largest_non_probe_lost
.map_or(true, |prev| packet_number - prev != 1);
let end_last_burst = self.current_loss_burst.as_ref().map_or(false, |current| {
packet_number - current.latest_non_probe != 1
});

if new_loss_burst {
if end_last_burst {
self.finish_loss_burst();
}

if packet_bytes <= self.min_mtu {
self.loss_burst_has_non_suspicious_packets = true;
match self.current_loss_burst {
None => {
self.current_loss_burst = Some(CurrentLossBurst {
smallest_packet_size: packet_bytes,
latest_non_probe: packet_number,
});
}
Some(ref mut loss_burst) => {
loss_burst.latest_non_probe = packet_number;
loss_burst.smallest_packet_size =
cmp::min(loss_burst.smallest_packet_size, packet_bytes);
}
}

self.largest_non_probe_lost = Some(packet_number);
}

fn black_hole_detected(&mut self) -> bool {
self.finish_loss_burst();

if self.suspicious_loss_bursts <= BLACK_HOLE_THRESHOLD {
if self.suspicious_loss_bursts.len() <= BLACK_HOLE_THRESHOLD {
return false;
}

self.suspicious_loss_bursts = 0;
self.largest_acked_mtu_sized_packet = None;
self.suspicious_loss_bursts.clear();

true
}

/// Marks the end of the current loss burst, checking whether it was suspicious
fn finish_loss_burst(&mut self) {
if self.last_burst_was_suspicious() {
self.suspicious_loss_bursts = self.suspicious_loss_bursts.saturating_add(1);
}

self.loss_burst_has_non_suspicious_packets = false;
self.largest_non_probe_lost = None;
}

/// Returns true if the burst was suspicious and should count towards black hole detection
fn last_burst_was_suspicious(&self) -> bool {
// Ignore burst if it contains any non-suspicious packets, because in that case packet loss
// was likely caused by congestion (instead of a sudden decrease in the path's MTU)
if self.loss_burst_has_non_suspicious_packets {
return false;
}

// Ignore burst if we have received an ACK for a more recent MTU-sized packet, because that
// proves the network still supports the current MTU
let largest_acked = self.largest_acked_mtu_sized_packet.unwrap_or(0);
if self
.largest_non_probe_lost
.map_or(true, |largest_lost| largest_lost < largest_acked)
let Some(burst) = self.current_loss_burst.take() else {
return;
};
// If a loss burst contains a packet smaller than the minimum MTU or a more recently
// transmitted packet, it is not suspicious.
if burst.smallest_packet_size < self.min_mtu
|| (burst.latest_non_probe < self.largest_post_loss_packet
&& burst.smallest_packet_size < self.safe_mtu)
{
return false;
return;
}
// The loss burst is now deemed suspicious.

// A suspicious loss burst more recent than `largest_post_loss_packet` invalidates
// it. Ideally we'd reduce `safe_mtu` to the largest packet size received later the burst,
// but that would require significantly more state and complexity, and erring on the side of
// false positives is safe.
if burst.latest_non_probe > self.largest_post_loss_packet {
self.safe_mtu = self.min_mtu;
}

let burst = LossBurst {
smallest_packet_size: burst.smallest_packet_size,
};

if self.suspicious_loss_bursts.len() > BLACK_HOLE_THRESHOLD {
// To limit memory use, only track the most suspicious loss bursts.
let smallest = self
.suspicious_loss_bursts
.iter_mut()
.min_by_key(|prev| prev.smallest_packet_size)
.filter(|prev| prev.smallest_packet_size < burst.smallest_packet_size);
if let Some(smallest) = smallest {
*smallest = burst;
}
return;
}

true
self.suspicious_loss_bursts.push(burst);
}

#[cfg(test)]
fn suspicious_loss_burst_count(&self) -> usize {
self.suspicious_loss_bursts as usize
self.suspicious_loss_bursts.len()
}

#[cfg(test)]
fn largest_non_probe_lost(&self) -> Option<u64> {
self.largest_non_probe_lost
self.current_loss_burst.as_ref().map(|x| x.latest_non_probe)
}
}

#[derive(Clone)]
struct LossBurst {
smallest_packet_size: u16,
}

#[derive(Clone)]
struct CurrentLossBurst {
smallest_packet_size: u16,
latest_non_probe: u64,
}

// Corresponds to the RFC's `MAX_PROBES` constant (see
// https://www.rfc-editor.org/rfc/rfc8899#section-5.1.2)
const MAX_PROBE_RETRANSMITS: usize = 3;
const BLACK_HOLE_THRESHOLD: u8 = 3;
const BLACK_HOLE_THRESHOLD: usize = 3;

#[cfg(test)]
mod tests {
Expand Down

0 comments on commit 4764764

Please sign in to comment.