diff --git a/readyset-dataflow/src/domain/domain_metrics.rs b/readyset-dataflow/src/domain/domain_metrics.rs index fd7ebef5ef..93435ae1eb 100644 --- a/readyset-dataflow/src/domain/domain_metrics.rs +++ b/readyset-dataflow/src/domain/domain_metrics.rs @@ -15,11 +15,15 @@ use crate::{Packet, PacketDiscriminants}; /// Whenever possible the handles are generated at init time, others /// that require dynamic labels are created on demand and stored in /// a BTreeMap or a NodeMap. -pub(super) struct DomainMetrics; +pub(super) struct DomainMetrics { + /// Whether to record metrics that include metric labels with high cardinality. This flag + /// should be used very sparingly, as the cost of emitting these metrics could be quite high! + verbose: bool, +} impl DomainMetrics { - pub(super) fn new() -> Self { - DomainMetrics + pub(super) fn new(verbose: bool) -> Self { + DomainMetrics { verbose } } pub(super) fn inc_eviction_requests(&self) { @@ -44,45 +48,51 @@ impl DomainMetrics { } pub(super) fn rec_replay_time(&mut self, cache_name: &Relation, time: Duration) { - counter!( - recorded::DOMAIN_TOTAL_REPLAY_TIME, - time.as_micros() as u64, - "cache_name" => cache_name_to_string(cache_name) - ); - - histogram!( - recorded::DOMAIN_REPLAY_TIME, - time.as_micros() as f64, - "cache_name" => cache_name_to_string(cache_name) - ); + if self.verbose { + counter!( + recorded::DOMAIN_TOTAL_REPLAY_TIME, + time.as_micros() as u64, + "cache_name" => cache_name_to_string(cache_name) + ); + + histogram!( + recorded::DOMAIN_REPLAY_TIME, + time.as_micros() as f64, + "cache_name" => cache_name_to_string(cache_name) + ); + } } pub(super) fn rec_seed_replay_time(&mut self, cache_name: &Relation, time: Duration) { - counter!( - recorded::DOMAIN_TOTAL_SEED_REPLAY_TIME, - time.as_micros() as u64, - "cache_name" => cache_name_to_string(cache_name) - ); - - histogram!( - recorded::DOMAIN_SEED_REPLAY_TIME, - time.as_micros() as f64, - "cache_name" => cache_name_to_string(cache_name) - ); + if self.verbose { + counter!( + recorded::DOMAIN_TOTAL_SEED_REPLAY_TIME, + time.as_micros() as u64, + "cache_name" => cache_name_to_string(cache_name) + ); + + histogram!( + recorded::DOMAIN_SEED_REPLAY_TIME, + time.as_micros() as f64, + "cache_name" => cache_name_to_string(cache_name) + ); + } } pub(super) fn rec_finish_replay_time(&mut self, cache_name: &Relation, time: Duration) { - counter!( - recorded::DOMAIN_TOTAL_FINISH_REPLAY_TIME, - time.as_micros() as u64, - "cache_name" => cache_name_to_string(cache_name) - ); - - histogram!( - recorded::DOMAIN_FINISH_REPLAY_TIME, - time.as_micros() as f64, - "cache_name" => cache_name_to_string(cache_name) - ); + if self.verbose { + counter!( + recorded::DOMAIN_TOTAL_FINISH_REPLAY_TIME, + time.as_micros() as u64, + "cache_name" => cache_name_to_string(cache_name) + ); + + histogram!( + recorded::DOMAIN_FINISH_REPLAY_TIME, + time.as_micros() as f64, + "cache_name" => cache_name_to_string(cache_name) + ); + } } pub(super) fn rec_forward_time_input(&mut self, time: Duration) { @@ -96,17 +106,19 @@ impl DomainMetrics { } pub(super) fn rec_reader_replay_time(&mut self, cache_name: &Relation, time: Duration) { - counter!( - recorded::DOMAIN_READER_TOTAL_REPLAY_REQUEST_TIME, - time.as_micros() as u64, - "cache_name" => cache_name_to_string(cache_name) - ); - - histogram!( - recorded::DOMAIN_READER_REPLAY_REQUEST_TIME, - time.as_micros() as f64, - "cache_name" => cache_name_to_string(cache_name) - ); + if self.verbose { + counter!( + recorded::DOMAIN_READER_TOTAL_REPLAY_REQUEST_TIME, + time.as_micros() as u64, + "cache_name" => cache_name_to_string(cache_name) + ); + + histogram!( + recorded::DOMAIN_READER_REPLAY_REQUEST_TIME, + time.as_micros() as f64, + "cache_name" => cache_name_to_string(cache_name) + ); + } } pub(super) fn inc_replay_misses(&mut self, cache_name: &Relation, n: usize) { @@ -133,20 +145,24 @@ impl DomainMetrics { } pub(super) fn set_base_table_size(&self, name: &Relation, size: u64) { - gauge!( - recorded::ESTIMATED_BASE_TABLE_SIZE_BYTES, - size as f64, - "table_name" => cache_name_to_string(name), - ); + if self.verbose { + gauge!( + recorded::ESTIMATED_BASE_TABLE_SIZE_BYTES, + size as f64, + "table_name" => cache_name_to_string(name), + ); + } } pub(super) fn inc_base_table_lookups(&mut self, cache_name: &Relation, table_name: &Relation) { - counter!( - recorded::BASE_TABLE_LOOKUP_REQUESTS, - 1, - "cache_name" => cache_name_to_string(cache_name), - "table_name" => cache_name_to_string(table_name) - ); + if self.verbose { + counter!( + recorded::BASE_TABLE_LOOKUP_REQUESTS, + 1, + "cache_name" => cache_name_to_string(cache_name), + "table_name" => cache_name_to_string(table_name) + ); + } } } diff --git a/readyset-dataflow/src/domain/mod.rs b/readyset-dataflow/src/domain/mod.rs index 26461e4692..1b50777223 100644 --- a/readyset-dataflow/src/domain/mod.rs +++ b/readyset-dataflow/src/domain/mod.rs @@ -77,6 +77,10 @@ pub struct Config { #[serde(default)] pub eviction_kind: crate::EvictionKind, + + /// Whether to emit verbose metrics for the domain. + #[serde(default)] + pub verbose_metrics: bool, } const BATCH_SIZE: usize = 256; @@ -459,7 +463,7 @@ impl DomainBuilder { aggressively_update_state_sizes: self.config.aggressively_update_state_sizes, - metrics: domain_metrics::DomainMetrics::new(), + metrics: domain_metrics::DomainMetrics::new(self.config.verbose_metrics), eviction_kind: self.config.eviction_kind, remapped_keys: Default::default(), diff --git a/readyset-server/src/builder.rs b/readyset-server/src/builder.rs index fc024ff457..517126f598 100644 --- a/readyset-server/src/builder.rs +++ b/readyset-server/src/builder.rs @@ -95,6 +95,7 @@ impl Builder { )); builder.set_replication_strategy(opts.domain_replication_options.into()); + builder.set_verbose_domain_metrics(opts.verbose_domain_metrics); if let Some(volume_id) = opts.volume_id { builder.set_volume_id(volume_id); @@ -317,6 +318,12 @@ impl Builder { self.config.domain_config.view_request_timeout = value; } + /// Sets the value of [`Config::domain_config::verbose_metrics`]. See documentation of + /// that field for more information. + pub fn set_verbose_domain_metrics(&mut self, value: bool) { + self.config.domain_config.verbose_metrics = value; + } + /// Sets the value of [`Config::domain_config::table_request_timeout`]. See documentation of /// that field for more information. pub fn set_table_request_timeout(&mut self, value: std::time::Duration) { diff --git a/readyset-server/src/integration_serial.rs b/readyset-server/src/integration_serial.rs index e54047bb4f..07e2502e9b 100644 --- a/readyset-server/src/integration_serial.rs +++ b/readyset-server/src/integration_serial.rs @@ -58,6 +58,7 @@ async fn it_works_basic_impl() { builder.set_persistence(get_persistence_params("it_works_basic")); builder.set_allow_topk(true); builder.enable_packet_filters(); + builder.set_verbose_domain_metrics(true); builder.start_local() } .await diff --git a/readyset-server/src/lib.rs b/readyset-server/src/lib.rs index f7fc50d3f7..49184c177b 100644 --- a/readyset-server/src/lib.rs +++ b/readyset-server/src/lib.rs @@ -518,6 +518,7 @@ impl Default for Config { // now. table_request_timeout: Duration::from_millis(1800000), eviction_kind: dataflow::EvictionKind::Random, + verbose_metrics: false, }, persistence: Default::default(), min_workers: 1, @@ -668,6 +669,17 @@ pub struct WorkerOptions { hide = true )] pub background_recovery_interval_seconds: u64, + + /// Whether to emit verbose metrics for the domains on this worker. This should be used very + /// sparingly, as the metrics emitted will have high label cardinality and can be quite + /// expensive! + #[arg( + long, + env = "VERBOSE_DOMAIN_METRICS", + default_value = "false", + hide = true + )] + pub verbose_domain_metrics: bool, } impl WorkerOptions {