diff --git a/quickwit/quickwit-config/src/index_config/mod.rs b/quickwit/quickwit-config/src/index_config/mod.rs index c258c8052d8..b4c6b69d7b1 100644 --- a/quickwit/quickwit-config/src/index_config/mod.rs +++ b/quickwit/quickwit-config/src/index_config/mod.rs @@ -14,6 +14,7 @@ pub(crate) mod serialize; +use std::collections::HashSet; use std::hash::{Hash, Hasher}; use std::num::NonZeroUsize; use std::str::FromStr; @@ -565,11 +566,67 @@ pub(super) fn validate_index_config( Ok(()) } +/// Returns the updated doc mapping and a boolean indicating whether a mutation occurred. +/// +/// The logic goes as follows: +/// 1. If the new doc mapping is the same as the current doc mapping, ignoring their UIDs, returns +/// the current doc mapping and `false`, indicating that no mutation occurred. +/// 2. If the new doc mapping is different from the current doc mapping, verifies the following +/// constraints before returning the new doc mapping and `true`, indicating that a mutation +/// occurred: +/// - The doc mapping UID should differ from the current one +/// - The timestamp field should remain the same +/// - The tokenizers should be a superset of the current tokenizers +/// - A doc mapper can be built from the new doc mapping +pub fn prepare_doc_mapping_update( + mut new_doc_mapping: DocMapping, + current_doc_mapping: &DocMapping, + search_settings: &SearchSettings, +) -> anyhow::Result<(DocMapping, bool)> { + // Save the new doc mapping UID in a temporary variable and override it with the current doc + // mapping UID to compare the two doc mappings, ignoring their UIDs. + let new_doc_mapping_uid = new_doc_mapping.doc_mapping_uid; + new_doc_mapping.doc_mapping_uid = current_doc_mapping.doc_mapping_uid; + + if new_doc_mapping == *current_doc_mapping { + return Ok((new_doc_mapping, false)); + } + // Restore the new doc mapping UID. + new_doc_mapping.doc_mapping_uid = new_doc_mapping_uid; + + ensure!( + new_doc_mapping.doc_mapping_uid != current_doc_mapping.doc_mapping_uid, + "new doc mapping UID should differ from the current one, current UID `{}`, new UID `{}`", + current_doc_mapping.doc_mapping_uid, + new_doc_mapping.doc_mapping_uid, + ); + let new_timestamp_field = new_doc_mapping.timestamp_field.as_deref(); + let current_timestamp_field = current_doc_mapping.timestamp_field.as_deref(); + ensure!( + new_timestamp_field == current_timestamp_field, + "updating timestamp field is not allowed, current timestamp field `{}`, new timestamp \ + field `{}`", + current_timestamp_field.unwrap_or("none"), + new_timestamp_field.unwrap_or("none"), + ); + // TODO: Unsure this constraint is required, should we relax it? + let new_tokenizers: HashSet<_> = new_doc_mapping.tokenizers.iter().collect(); + let current_tokenizers: HashSet<_> = current_doc_mapping.tokenizers.iter().collect(); + ensure!( + new_tokenizers.is_superset(¤t_tokenizers), + "updating tokenizers is allowed only if adding new tokenizers, current tokenizers \ + `{current_tokenizers:?}`, new tokenizers `{new_tokenizers:?}`", + ); + build_doc_mapper(&new_doc_mapping, search_settings).context("invalid doc mapping")?; + Ok((new_doc_mapping, true)) +} + #[cfg(test)] mod tests { use cron::TimeUnitSpec; - use quickwit_doc_mapper::ModeType; + use quickwit_doc_mapper::{Mode, ModeType, TokenizerEntry}; + use quickwit_proto::types::DocMappingUid; use super::*; use crate::ConfigFormat; @@ -981,4 +1038,96 @@ mod tests { let error = serde_yaml::from_str::(settings_yaml).unwrap_err(); assert!(error.to_string().contains("expected a nonzero")); } + + #[test] + fn test_prepare_doc_mapping_update() { + let current_index_config = IndexConfig::for_test("test-index", "s3://test-index"); + let mut current_doc_mapping = current_index_config.doc_mapping; + let search_settings = current_index_config.search_settings; + + let tokenizer_json = r#" + { + "name": "breton-tokenizer", + "type": "regex", + "pattern": "crêpes*" + } + "#; + let tokenizer: TokenizerEntry = serde_json::from_str(tokenizer_json).unwrap(); + + current_doc_mapping.tokenizers.push(tokenizer.clone()); + + // The new doc mapping should have a different doc mapping UID. + let mut new_doc_mapping = current_doc_mapping.clone(); + new_doc_mapping.store_source = false; // This is set to `true` for the current doc mapping. + let error = + prepare_doc_mapping_update(new_doc_mapping, ¤t_doc_mapping, &search_settings) + .unwrap_err() + .to_string(); + assert!(error.contains("doc mapping UID should differ")); + + // The new doc mapping should not change the timestamp field. + let mut new_doc_mapping = current_doc_mapping.clone(); + new_doc_mapping.doc_mapping_uid = DocMappingUid::random(); + new_doc_mapping.timestamp_field = Some("ts".to_string()); // This is set to `timestamp` for the current doc mapping. + let error = + prepare_doc_mapping_update(new_doc_mapping, ¤t_doc_mapping, &search_settings) + .unwrap_err() + .to_string(); + assert!(error.contains("timestamp field")); + + // The new doc mapping should not remove the timestamp field. + let mut new_doc_mapping = current_doc_mapping.clone(); + new_doc_mapping.doc_mapping_uid = DocMappingUid::random(); + new_doc_mapping.timestamp_field = None; + let error = + prepare_doc_mapping_update(new_doc_mapping, ¤t_doc_mapping, &search_settings) + .unwrap_err() + .to_string(); + assert!(error.contains("timestamp field")); + + // The new doc mapping should not remove tokenizers. + let mut new_doc_mapping = current_doc_mapping.clone(); + new_doc_mapping.doc_mapping_uid = DocMappingUid::random(); + new_doc_mapping.tokenizers.clear(); + let error = + prepare_doc_mapping_update(new_doc_mapping, ¤t_doc_mapping, &search_settings) + .unwrap_err() + .to_string(); + assert!(error.contains("tokenizers")); + + // The new doc mapping should be "buildable" into a doc mapper. + let mut new_doc_mapping = current_doc_mapping.clone(); + new_doc_mapping.doc_mapping_uid = DocMappingUid::random(); + new_doc_mapping.tokenizers.push(tokenizer); + let error = + prepare_doc_mapping_update(new_doc_mapping, ¤t_doc_mapping, &search_settings) + .unwrap_err() + .source() + .unwrap() + .to_string(); + assert!(error.contains("duplicated custom tokenizer")); + + let mut new_doc_mapping = current_doc_mapping.clone(); + new_doc_mapping.doc_mapping_uid = DocMappingUid::random(); + let (updated_doc_mapping, mutation_occurred) = + prepare_doc_mapping_update(new_doc_mapping, ¤t_doc_mapping, &search_settings) + .unwrap(); + assert!(!mutation_occurred); + assert_eq!( + updated_doc_mapping.doc_mapping_uid, + current_doc_mapping.doc_mapping_uid + ); + assert_eq!(updated_doc_mapping, current_doc_mapping); + + let mut new_doc_mapping = current_doc_mapping.clone(); + let new_doc_mapping_uid = DocMappingUid::random(); + new_doc_mapping.doc_mapping_uid = new_doc_mapping_uid; + new_doc_mapping.mode = Mode::Strict; + let (updated_doc_mapping, mutation_occurred) = + prepare_doc_mapping_update(new_doc_mapping, ¤t_doc_mapping, &search_settings) + .unwrap(); + assert!(mutation_occurred); + assert_eq!(updated_doc_mapping.doc_mapping_uid, new_doc_mapping_uid); + assert_eq!(updated_doc_mapping.mode, Mode::Strict); + } } diff --git a/quickwit/quickwit-config/src/index_config/serialize.rs b/quickwit/quickwit-config/src/index_config/serialize.rs index 5e04e8dbf3c..24fcc6d1ac2 100644 --- a/quickwit/quickwit-config/src/index_config/serialize.rs +++ b/quickwit/quickwit-config/src/index_config/serialize.rs @@ -12,11 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::collections::HashSet; - use anyhow::{Context, ensure}; use quickwit_common::uri::Uri; -use quickwit_doc_mapper::DocMapperBuilder; use quickwit_proto::types::{DocMappingUid, IndexId}; use serde::{Deserialize, Serialize}; use tracing::info; @@ -24,7 +21,7 @@ use tracing::info; use super::{IngestSettings, validate_index_config}; use crate::{ ConfigFormat, DocMapping, IndexConfig, IndexingSettings, RetentionPolicy, SearchSettings, - validate_identifier, + prepare_doc_mapping_update, validate_identifier, }; /// Alias for the latest serialization format. @@ -92,63 +89,12 @@ pub fn load_index_config_update( current_index_config.index_uri, new_index_config.index_uri ); - - // verify the new mapping is coherent - let doc_mapper_builder = DocMapperBuilder { - doc_mapping: new_index_config.doc_mapping.clone(), - default_search_fields: new_index_config - .search_settings - .default_search_fields - .clone(), - legacy_type_tag: None, - }; - doc_mapper_builder - .try_build() - .context("invalid mapping update")?; - - { - let new_mapping_uid = new_index_config.doc_mapping.doc_mapping_uid; - // we verify whether they are equal ignoring the mapping uid as it is generated at random: - // we don't want to record a mapping change when nothing really happened. - new_index_config.doc_mapping.doc_mapping_uid = - current_index_config.doc_mapping.doc_mapping_uid; - if new_index_config.doc_mapping != current_index_config.doc_mapping { - new_index_config.doc_mapping.doc_mapping_uid = new_mapping_uid; - ensure!( - current_index_config.doc_mapping.doc_mapping_uid - != new_index_config.doc_mapping.doc_mapping_uid, - "`doc_mapping_doc_mapping_uid` must change when the doc mapping is updated", - ); - ensure!( - current_index_config.doc_mapping.timestamp_field - == new_index_config.doc_mapping.timestamp_field, - "`doc_mapping.timestamp_field` cannot be updated, current value {}, new expected \ - value {}", - current_index_config - .doc_mapping - .timestamp_field - .as_deref() - .unwrap_or(""), - new_index_config - .doc_mapping - .timestamp_field - .as_deref() - .unwrap_or(""), - ); - // TODO: i'm not sure this is necessary, we can relax this requirement once we know - // for sure - let current_tokenizers: HashSet<_> = - current_index_config.doc_mapping.tokenizers.iter().collect(); - let new_tokenizers: HashSet<_> = - new_index_config.doc_mapping.tokenizers.iter().collect(); - ensure!( - new_tokenizers.is_superset(¤t_tokenizers), - "`.doc_mapping.tokenizers` must be a superset of previously available tokenizers" - ); - } else { - // the docmapping is unchanged, keep the old uid - } - } + let (updated_doc_mapping, _mutation_occurred) = prepare_doc_mapping_update( + new_index_config.doc_mapping, + ¤t_index_config.doc_mapping, + &new_index_config.search_settings, + )?; + new_index_config.doc_mapping = updated_doc_mapping; Ok(new_index_config) } diff --git a/quickwit/quickwit-config/src/lib.rs b/quickwit/quickwit-config/src/lib.rs index f7589995f5d..21ef6455eed 100644 --- a/quickwit/quickwit-config/src/lib.rs +++ b/quickwit/quickwit-config/src/lib.rs @@ -47,6 +47,7 @@ use index_config::serialize::{IndexConfigV0_8, VersionedIndexConfig}; pub use index_config::{ IndexConfig, IndexingResources, IndexingSettings, IngestSettings, RetentionPolicy, SearchSettings, build_doc_mapper, load_index_config_from_user_config, load_index_config_update, + prepare_doc_mapping_update, }; pub use quickwit_doc_mapper::DocMapping; use serde::Serialize; diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapping.rs b/quickwit/quickwit-doc-mapper/src/doc_mapping.rs index 984932c9a5b..2fae5d45452 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapping.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapping.rs @@ -166,20 +166,6 @@ impl DocMapping { pub fn default_max_num_partitions() -> NonZeroU32 { NonZeroU32::new(200).unwrap() } - - /// Returns whether the `other` doc mapping is equal to `self` leaving their respective doc - /// mapping UIDs out of the comparison. - pub fn eq_ignore_doc_mapping_uid(&self, other: &Self) -> bool { - let doc_mapping_uid = DocMappingUid::default(); - - let mut left = self.clone(); - left.doc_mapping_uid = doc_mapping_uid; - - let mut right = other.clone(); - right.doc_mapping_uid = doc_mapping_uid; - - left == right - } } #[cfg(test)] diff --git a/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs b/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs index 4bc0d19c793..15aeeb21a5c 100644 --- a/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs +++ b/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs @@ -221,7 +221,7 @@ impl FileBackedIndex { ingest_settings: IngestSettings, search_settings: SearchSettings, retention_policy_opt: Option, - ) -> bool { + ) -> MetastoreResult { self.metadata.update_index_config( doc_mapping, indexing_settings, diff --git a/quickwit/quickwit-metastore/src/metastore/file_backed/mod.rs b/quickwit/quickwit-metastore/src/metastore/file_backed/mod.rs index 374e1430d81..6c2ecbc196f 100644 --- a/quickwit/quickwit-metastore/src/metastore/file_backed/mod.rs +++ b/quickwit/quickwit-metastore/src/metastore/file_backed/mod.rs @@ -575,7 +575,7 @@ impl MetastoreService for FileBackedMetastore { ingest_settings, search_settings, retention_policy_opt, - ); + )?; let index_metadata = index.metadata().clone(); if mutation_occurred { diff --git a/quickwit/quickwit-metastore/src/metastore/index_metadata/mod.rs b/quickwit/quickwit-metastore/src/metastore/index_metadata/mod.rs index 4e40b0dfcda..e7a5677099e 100644 --- a/quickwit/quickwit-metastore/src/metastore/index_metadata/mod.rs +++ b/quickwit/quickwit-metastore/src/metastore/index_metadata/mod.rs @@ -20,7 +20,7 @@ use std::collections::hash_map::Entry; use quickwit_common::uri::Uri; use quickwit_config::{ DocMapping, IndexConfig, IndexingSettings, IngestSettings, RetentionPolicy, SearchSettings, - SourceConfig, + SourceConfig, prepare_doc_mapping_update, }; use quickwit_proto::metastore::{EntityKind, MetastoreError, MetastoreResult}; use quickwit_proto::types::{IndexUid, SourceId}; @@ -106,13 +106,16 @@ impl IndexMetadata { ingest_settings: IngestSettings, search_settings: SearchSettings, retention_policy_opt: Option, - ) -> bool { - let mut mutation_occurred = false; - - if doc_mapping != self.index_config.doc_mapping { - self.index_config.doc_mapping = doc_mapping; - mutation_occurred = true; - } + ) -> MetastoreResult { + let (updated_doc_mapping, mut mutation_occurred) = prepare_doc_mapping_update( + doc_mapping, + &self.index_config.doc_mapping, + &search_settings, + ) + .map_err(|error| MetastoreError::InvalidArgument { + message: error.to_string(), + })?; + self.index_config.doc_mapping = updated_doc_mapping; if indexing_settings != self.index_config.indexing_settings { self.index_config.indexing_settings = indexing_settings; mutation_occurred = true; @@ -129,7 +132,7 @@ impl IndexMetadata { self.index_config.retention_policy_opt = retention_policy_opt; mutation_occurred = true; } - mutation_occurred + Ok(mutation_occurred) } /// Adds a source to the index. Returns an error if the source already exists. @@ -234,3 +237,96 @@ impl quickwit_config::TestableForRegression for IndexMetadata { assert_eq!(self.sources, other.sources); } } + +#[cfg(test)] +mod tests { + use quickwit_doc_mapper::Mode; + use quickwit_proto::types::DocMappingUid; + + use super::*; + + #[test] + fn test_update_index_config() { + let current_index_config = IndexConfig::for_test("test-index", "s3://test-index"); + let mut current_index_metadata = IndexMetadata::new(current_index_config.clone()); + + let mutation_occurred = current_index_metadata + .update_index_config( + current_index_config.doc_mapping.clone(), + current_index_config.indexing_settings.clone(), + current_index_config.ingest_settings.clone(), + current_index_config.search_settings.clone(), + current_index_config.retention_policy_opt.clone(), + ) + .unwrap(); + assert!(!mutation_occurred); + + let new_search_settings = SearchSettings { + default_search_fields: vec!["message".to_string(), "status".to_string()], + }; + let mutation_occurred = current_index_metadata + .update_index_config( + current_index_config.doc_mapping.clone(), + current_index_config.indexing_settings.clone(), + current_index_config.ingest_settings.clone(), + new_search_settings, + current_index_config.retention_policy_opt.clone(), + ) + .unwrap(); + assert!(mutation_occurred); + assert_eq!( + current_index_metadata + .index_config() + .search_settings + .default_search_fields, + ["message", "status"] + ); + } + + #[test] + fn test_update_doc_mapping() { + let current_index_config = IndexConfig::for_test("test-index", "s3://test-index"); + let mut current_index_metadata = IndexMetadata::new(current_index_config.clone()); + + let mut new_doc_mapping = current_index_config.doc_mapping.clone(); + new_doc_mapping.doc_mapping_uid = DocMappingUid::random(); + new_doc_mapping.timestamp_field = Some("ts".to_string()); // This is set to `timestamp` for the current doc mapping. + + current_index_metadata + .update_index_config( + new_doc_mapping, + current_index_config.indexing_settings.clone(), + current_index_config.ingest_settings.clone(), + current_index_config.search_settings.clone(), + current_index_config.retention_policy_opt.clone(), + ) + .unwrap_err(); + + let mut new_doc_mapping = current_index_config.doc_mapping.clone(); + let new_doc_mapping_uid = DocMappingUid::random(); + new_doc_mapping.doc_mapping_uid = new_doc_mapping_uid; + new_doc_mapping.mode = Mode::Strict; + + let mutation_occurred = current_index_metadata + .update_index_config( + new_doc_mapping, + current_index_config.indexing_settings, + current_index_config.ingest_settings, + current_index_config.search_settings, + current_index_config.retention_policy_opt, + ) + .unwrap(); + assert!(mutation_occurred); + assert_eq!( + current_index_metadata + .index_config() + .doc_mapping + .doc_mapping_uid, + new_doc_mapping_uid + ); + assert_eq!( + current_index_metadata.index_config().doc_mapping.mode, + Mode::Strict + ); + } +} diff --git a/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs b/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs index e4a00285df0..e63a9400688 100644 --- a/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs +++ b/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs @@ -431,7 +431,7 @@ impl MetastoreService for PostgresqlMetastore { ingest_settings, search_settings, retention_policy_opt, - ); + )?; Ok(MutationOccurred::from(mutation_occurred)) }) .await