Skip to content

Commit

Permalink
server: Add domain metrics feature flag
Browse files Browse the repository at this point in the history
This commit puts potentially-expensive domain metrics behind a
`--verbose-domain-metrics` flag to reduce the cost of the metrics we
emit in production deployments.

Release-Note-Core: Added a feature flag to reduce the number of
  expensive domain metrics Readyset emits in production deployments
Change-Id: Ia4b6110d8b5af400d7449defc2d95fd6925028ea
Reviewed-on: https://gerrit.readyset.name/c/readyset/+/6898
Tested-by: Buildkite CI
Reviewed-by: Luke Osborne <luke@readyset.io>
  • Loading branch information
ethan-readyset committed Feb 12, 2024
1 parent 744f998 commit 47ae35e
Show file tree
Hide file tree
Showing 5 changed files with 99 additions and 59 deletions.
132 changes: 74 additions & 58 deletions readyset-dataflow/src/domain/domain_metrics.rs
Expand Up @@ -15,11 +15,15 @@ use crate::{Packet, PacketDiscriminants};
/// Whenever possible the handles are generated at init time, others
/// that require dynamic labels are created on demand and stored in
/// a BTreeMap or a NodeMap.
pub(super) struct DomainMetrics;
pub(super) struct DomainMetrics {
/// Whether to record metrics that include metric labels with high cardinality. This flag
/// should be used very sparingly, as the cost of emitting these metrics could be quite high!
verbose: bool,
}

impl DomainMetrics {
pub(super) fn new() -> Self {
DomainMetrics
pub(super) fn new(verbose: bool) -> Self {
DomainMetrics { verbose }
}

pub(super) fn inc_eviction_requests(&self) {
Expand All @@ -44,45 +48,51 @@ impl DomainMetrics {
}

pub(super) fn rec_replay_time(&mut self, cache_name: &Relation, time: Duration) {
counter!(
recorded::DOMAIN_TOTAL_REPLAY_TIME,
time.as_micros() as u64,
"cache_name" => cache_name_to_string(cache_name)
);

histogram!(
recorded::DOMAIN_REPLAY_TIME,
time.as_micros() as f64,
"cache_name" => cache_name_to_string(cache_name)
);
if self.verbose {
counter!(
recorded::DOMAIN_TOTAL_REPLAY_TIME,
time.as_micros() as u64,
"cache_name" => cache_name_to_string(cache_name)
);

histogram!(
recorded::DOMAIN_REPLAY_TIME,
time.as_micros() as f64,
"cache_name" => cache_name_to_string(cache_name)
);
}
}

pub(super) fn rec_seed_replay_time(&mut self, cache_name: &Relation, time: Duration) {
counter!(
recorded::DOMAIN_TOTAL_SEED_REPLAY_TIME,
time.as_micros() as u64,
"cache_name" => cache_name_to_string(cache_name)
);

histogram!(
recorded::DOMAIN_SEED_REPLAY_TIME,
time.as_micros() as f64,
"cache_name" => cache_name_to_string(cache_name)
);
if self.verbose {
counter!(
recorded::DOMAIN_TOTAL_SEED_REPLAY_TIME,
time.as_micros() as u64,
"cache_name" => cache_name_to_string(cache_name)
);

histogram!(
recorded::DOMAIN_SEED_REPLAY_TIME,
time.as_micros() as f64,
"cache_name" => cache_name_to_string(cache_name)
);
}
}

pub(super) fn rec_finish_replay_time(&mut self, cache_name: &Relation, time: Duration) {
counter!(
recorded::DOMAIN_TOTAL_FINISH_REPLAY_TIME,
time.as_micros() as u64,
"cache_name" => cache_name_to_string(cache_name)
);

histogram!(
recorded::DOMAIN_FINISH_REPLAY_TIME,
time.as_micros() as f64,
"cache_name" => cache_name_to_string(cache_name)
);
if self.verbose {
counter!(
recorded::DOMAIN_TOTAL_FINISH_REPLAY_TIME,
time.as_micros() as u64,
"cache_name" => cache_name_to_string(cache_name)
);

histogram!(
recorded::DOMAIN_FINISH_REPLAY_TIME,
time.as_micros() as f64,
"cache_name" => cache_name_to_string(cache_name)
);
}
}

pub(super) fn rec_forward_time_input(&mut self, time: Duration) {
Expand All @@ -96,17 +106,19 @@ impl DomainMetrics {
}

pub(super) fn rec_reader_replay_time(&mut self, cache_name: &Relation, time: Duration) {
counter!(
recorded::DOMAIN_READER_TOTAL_REPLAY_REQUEST_TIME,
time.as_micros() as u64,
"cache_name" => cache_name_to_string(cache_name)
);

histogram!(
recorded::DOMAIN_READER_REPLAY_REQUEST_TIME,
time.as_micros() as f64,
"cache_name" => cache_name_to_string(cache_name)
);
if self.verbose {
counter!(
recorded::DOMAIN_READER_TOTAL_REPLAY_REQUEST_TIME,
time.as_micros() as u64,
"cache_name" => cache_name_to_string(cache_name)
);

histogram!(
recorded::DOMAIN_READER_REPLAY_REQUEST_TIME,
time.as_micros() as f64,
"cache_name" => cache_name_to_string(cache_name)
);
}
}

pub(super) fn inc_replay_misses(&mut self, cache_name: &Relation, n: usize) {
Expand All @@ -133,20 +145,24 @@ impl DomainMetrics {
}

pub(super) fn set_base_table_size(&self, name: &Relation, size: u64) {
gauge!(
recorded::ESTIMATED_BASE_TABLE_SIZE_BYTES,
size as f64,
"table_name" => cache_name_to_string(name),
);
if self.verbose {
gauge!(
recorded::ESTIMATED_BASE_TABLE_SIZE_BYTES,
size as f64,
"table_name" => cache_name_to_string(name),
);
}
}

pub(super) fn inc_base_table_lookups(&mut self, cache_name: &Relation, table_name: &Relation) {
counter!(
recorded::BASE_TABLE_LOOKUP_REQUESTS,
1,
"cache_name" => cache_name_to_string(cache_name),
"table_name" => cache_name_to_string(table_name)
);
if self.verbose {
counter!(
recorded::BASE_TABLE_LOOKUP_REQUESTS,
1,
"cache_name" => cache_name_to_string(cache_name),
"table_name" => cache_name_to_string(table_name)
);
}
}
}

Expand Down
6 changes: 5 additions & 1 deletion readyset-dataflow/src/domain/mod.rs
Expand Up @@ -77,6 +77,10 @@ pub struct Config {

#[serde(default)]
pub eviction_kind: crate::EvictionKind,

/// Whether to emit verbose metrics for the domain.
#[serde(default)]
pub verbose_metrics: bool,
}

const BATCH_SIZE: usize = 256;
Expand Down Expand Up @@ -459,7 +463,7 @@ impl DomainBuilder {

aggressively_update_state_sizes: self.config.aggressively_update_state_sizes,

metrics: domain_metrics::DomainMetrics::new(),
metrics: domain_metrics::DomainMetrics::new(self.config.verbose_metrics),

eviction_kind: self.config.eviction_kind,
remapped_keys: Default::default(),
Expand Down
7 changes: 7 additions & 0 deletions readyset-server/src/builder.rs
Expand Up @@ -95,6 +95,7 @@ impl Builder {
));

builder.set_replication_strategy(opts.domain_replication_options.into());
builder.set_verbose_domain_metrics(opts.verbose_domain_metrics);

if let Some(volume_id) = opts.volume_id {
builder.set_volume_id(volume_id);
Expand Down Expand Up @@ -317,6 +318,12 @@ impl Builder {
self.config.domain_config.view_request_timeout = value;
}

/// Sets the value of [`Config::domain_config::verbose_metrics`]. See documentation of
/// that field for more information.
pub fn set_verbose_domain_metrics(&mut self, value: bool) {
self.config.domain_config.verbose_metrics = value;
}

/// Sets the value of [`Config::domain_config::table_request_timeout`]. See documentation of
/// that field for more information.
pub fn set_table_request_timeout(&mut self, value: std::time::Duration) {
Expand Down
1 change: 1 addition & 0 deletions readyset-server/src/integration_serial.rs
Expand Up @@ -58,6 +58,7 @@ async fn it_works_basic_impl() {
builder.set_persistence(get_persistence_params("it_works_basic"));
builder.set_allow_topk(true);
builder.enable_packet_filters();
builder.set_verbose_domain_metrics(true);
builder.start_local()
}
.await
Expand Down
12 changes: 12 additions & 0 deletions readyset-server/src/lib.rs
Expand Up @@ -518,6 +518,7 @@ impl Default for Config {
// now.
table_request_timeout: Duration::from_millis(1800000),
eviction_kind: dataflow::EvictionKind::Random,
verbose_metrics: false,
},
persistence: Default::default(),
min_workers: 1,
Expand Down Expand Up @@ -668,6 +669,17 @@ pub struct WorkerOptions {
hide = true
)]
pub background_recovery_interval_seconds: u64,

/// Whether to emit verbose metrics for the domains on this worker. This should be used very
/// sparingly, as the metrics emitted will have high label cardinality and can be quite
/// expensive!
#[arg(
long,
env = "VERBOSE_DOMAIN_METRICS",
default_value = "false",
hide = true
)]
pub verbose_domain_metrics: bool,
}

impl WorkerOptions {
Expand Down

0 comments on commit 47ae35e

Please sign in to comment.