Remove grandparents from snapshot cache (#2917)
## Issue Addressed

NA

## Proposed Changes

In #2832 we made some changes to the `SnapshotCache` to help deal with the one-block re-orgs seen on mainnet (and testnets).

I believe the change in #2832 is good and we should keep it, but I think that in its present form it is causing the `SnapshotCache` to hold onto states that it doesn't need anymore. For example, a skip slot will result in one more `BeaconSnapshot` being stored in the cache.

This PR adds a new type of pruning that happens after a block is inserted into the cache: we remove any snapshot from the cache that is the *grandparent* of the block being imported. Since we know the grandparent has two valid blocks built atop it, it is not at risk from a one-block re-org.
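
To illustrate the idea, here is a minimal, self-contained sketch of grandparent pruning on insert. The `Snapshot`, `SimpleSnapshotCache` and `parents` lookup map are hypothetical stand-ins, not Lighthouse types; the real `SnapshotCache::insert` finds the grandparent by walking the new state's block-root history (`rev_iter_block_roots`) and likewise always retains the head snapshot, as shown in the diff below.

```rust
use std::collections::HashMap;

// Stand-in for a 32-byte block root.
type Hash256 = [u8; 32];

/// A simplified snapshot: just a block root and its parent root.
struct Snapshot {
    block_root: Hash256,
    parent_root: Hash256,
}

/// A simplified cache holding one snapshot per recent block.
struct SimpleSnapshotCache {
    head_block_root: Hash256,
    snapshots: Vec<Snapshot>,
}

impl SimpleSnapshotCache {
    /// Insert a snapshot, first evicting the grandparent of the new block.
    ///
    /// The grandparent already has two valid descendants (the parent and the
    /// new block), so it cannot be orphaned by a one-block re-org and its
    /// state no longer needs to be cached.
    fn insert(&mut self, snapshot: Snapshot, parents: &HashMap<Hash256, Hash256>) {
        // Walk one step up the chain: parent -> grandparent.
        if let Some(grandparent_root) = parents.get(&snapshot.parent_root).copied() {
            let head = self.head_block_root;
            // Drop the grandparent, but never evict the current head snapshot.
            self.snapshots
                .retain(|s| s.block_root == head || s.block_root != grandparent_root);
        }
        self.snapshots.push(snapshot);
    }
}

fn main() {
    // Chain: A (grandparent) <- B (parent) <- C (new block).
    let (a, b, c) = ([1u8; 32], [2u8; 32], [3u8; 32]);
    let parents: HashMap<Hash256, Hash256> = HashMap::from([(b, a), (c, b)]);

    let mut cache = SimpleSnapshotCache {
        head_block_root: b,
        snapshots: vec![
            Snapshot { block_root: a, parent_root: [0u8; 32] },
            Snapshot { block_root: b, parent_root: a },
        ],
    };

    // Inserting C evicts A (its grandparent) but keeps the head snapshot B.
    cache.insert(Snapshot { block_root: c, parent_root: b }, &parents);
    assert_eq!(cache.snapshots.len(), 2);
}
```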

## Additional Info

NA
paulhauner committed Jan 14, 2022
1 parent ceeab02 commit c11253a
Showing 3 changed files with 65 additions and 4 deletions.
7 changes: 7 additions & 0 deletions beacon_node/beacon_chain/src/beacon_chain.rs
@@ -2796,6 +2796,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
beacon_block_root: block_root,
},
None,
&self.spec,
)
})
.unwrap_or_else(|e| {
@@ -3740,6 +3741,12 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
.try_write_for(BLOCK_PROCESSING_CACHE_LOCK_TIMEOUT)
.map(|mut snapshot_cache| {
snapshot_cache.prune(new_finalized_checkpoint.epoch);
debug!(
self.log,
"Snapshot cache pruned";
"new_len" => snapshot_cache.len(),
"remaining_roots" => ?snapshot_cache.beacon_block_roots(),
);
})
.unwrap_or_else(|| {
error!(
18 changes: 18 additions & 0 deletions beacon_node/beacon_chain/src/metrics.rs
@@ -4,8 +4,12 @@ use crate::{BeaconChain, BeaconChainError, BeaconChainTypes};
use lazy_static::lazy_static;
pub use lighthouse_metrics::*;
use slot_clock::SlotClock;
use std::time::Duration;
use types::{BeaconState, Epoch, EthSpec, Hash256, Slot};

/// The maximum time to wait for the snapshot cache lock during a metrics scrape.
const SNAPSHOT_CACHE_TIMEOUT: Duration = Duration::from_millis(100);

lazy_static! {
/*
* Block Processing
@@ -18,6 +22,10 @@ lazy_static! {
"beacon_block_processing_successes_total",
"Count of blocks processed without error"
);
pub static ref BLOCK_PROCESSING_SNAPSHOT_CACHE_SIZE: Result<IntGauge> = try_create_int_gauge(
"beacon_block_processing_snapshot_cache_size",
"Count snapshots in the snapshot cache"
);
pub static ref BLOCK_PROCESSING_SNAPSHOT_CACHE_MISSES: Result<IntCounter> = try_create_int_counter(
"beacon_block_processing_snapshot_cache_misses",
"Count of snapshot cache misses"
@@ -913,6 +921,16 @@ pub fn scrape_for_metrics<T: BeaconChainTypes>(beacon_chain: &BeaconChain<T>) {

let attestation_stats = beacon_chain.op_pool.attestation_stats();

if let Some(snapshot_cache) = beacon_chain
.snapshot_cache
.try_write_for(SNAPSHOT_CACHE_TIMEOUT)
{
set_gauge(
&BLOCK_PROCESSING_SNAPSHOT_CACHE_SIZE,
snapshot_cache.len() as i64,
)
}

set_gauge_by_usize(
&OP_POOL_NUM_ATTESTATIONS,
attestation_stats.num_attestations,
44 changes: 40 additions & 4 deletions beacon_node/beacon_chain/src/snapshot_cache.rs
@@ -1,4 +1,5 @@
use crate::BeaconSnapshot;
use itertools::process_results;
use std::cmp;
use std::time::Duration;
use types::{
@@ -164,16 +165,51 @@ impl<T: EthSpec> SnapshotCache<T> {
}
}

/// The block roots of all snapshots contained in `self`.
pub fn beacon_block_roots(&self) -> Vec<Hash256> {
self.snapshots.iter().map(|s| s.beacon_block_root).collect()
}

/// The number of snapshots contained in `self`.
pub fn len(&self) -> usize {
self.snapshots.len()
}

/// Insert a snapshot, potentially removing an existing snapshot if `self` is at capacity (see
/// struct-level documentation for more info).
pub fn insert(&mut self, snapshot: BeaconSnapshot<T>, pre_state: Option<BeaconState<T>>) {
pub fn insert(
&mut self,
snapshot: BeaconSnapshot<T>,
pre_state: Option<BeaconState<T>>,
spec: &ChainSpec,
) {
let parent_root = snapshot.beacon_block.message().parent_root();
let item = CacheItem {
beacon_block: snapshot.beacon_block,
beacon_block_root: snapshot.beacon_block_root,
beacon_state: snapshot.beacon_state,
pre_state,
};

// Remove the grandparent of the block that was just inserted.
//
// Assuming it's unlikely to see re-orgs deeper than one block, this method helps keep the
// cache small by removing any states that already have more than one descendant.
//
// Remove the grandparent first to free up room in the cache.
let grandparent_result =
process_results(item.beacon_state.rev_iter_block_roots(spec), |iter| {
iter.map(|(_slot, root)| root)
.find(|root| *root != item.beacon_block_root && *root != parent_root)
});
if let Ok(Some(grandparent_root)) = grandparent_result {
let head_block_root = self.head_block_root;
self.snapshots.retain(|snapshot| {
let root = snapshot.beacon_block_root;
root == head_block_root || root != grandparent_root
});
}

if self.snapshots.len() < self.max_len {
self.snapshots.push(item);
} else {
@@ -384,7 +420,7 @@ mod test {
*snapshot.beacon_state.slot_mut() =
Slot::from(i * MainnetEthSpec::slots_per_epoch() + 1);

cache.insert(snapshot, None);
cache.insert(snapshot, None, &spec);

assert_eq!(
cache.snapshots.len(),
@@ -402,7 +438,7 @@
// 2 2
// 3 3
assert_eq!(cache.snapshots.len(), CACHE_SIZE);
cache.insert(get_snapshot(42), None);
cache.insert(get_snapshot(42), None, &spec);
assert_eq!(cache.snapshots.len(), CACHE_SIZE);

assert!(
@@ -462,7 +498,7 @@

// Over-fill the cache so it needs to eject some old values on insert.
for i in 0..CACHE_SIZE as u64 {
cache.insert(get_snapshot(u64::max_value() - i), None);
cache.insert(get_snapshot(u64::max_value() - i), None, &spec);
}

// Ensure that the new head value was not removed from the cache.