Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
151 changes: 150 additions & 1 deletion quickwit/quickwit-config/src/index_config/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

pub(crate) mod serialize;

use std::collections::HashSet;
use std::hash::{Hash, Hasher};
use std::num::NonZeroUsize;
use std::str::FromStr;
Expand Down Expand Up @@ -565,11 +566,67 @@ pub(super) fn validate_index_config(
Ok(())
}

/// Computes the doc mapping resulting from an update request.
///
/// Returns the doc mapping to persist along with a flag telling whether a
/// mutation actually occurred:
/// 1. When the new doc mapping equals the current one (UIDs excluded from the
///    comparison), the current doc mapping is returned with `false`: the
///    update is a no-op and the current UID is kept.
/// 2. Otherwise, the new doc mapping is returned with `true`, after enforcing
///    the following constraints:
///    - its doc mapping UID differs from the current one,
///    - the timestamp field is left unchanged,
///    - its tokenizer set contains every current tokenizer,
///    - a doc mapper can actually be built from it.
pub fn prepare_doc_mapping_update(
    mut new_doc_mapping: DocMapping,
    current_doc_mapping: &DocMapping,
    search_settings: &SearchSettings,
) -> anyhow::Result<(DocMapping, bool)> {
    // Temporarily align the UIDs so the equality test below ignores them. The
    // proposed UID is stashed aside and restored only if the mappings differ.
    let proposed_uid = std::mem::replace(
        &mut new_doc_mapping.doc_mapping_uid,
        current_doc_mapping.doc_mapping_uid,
    );
    if new_doc_mapping == *current_doc_mapping {
        // No effective change: return the mapping carrying the current UID.
        return Ok((new_doc_mapping, false));
    }
    new_doc_mapping.doc_mapping_uid = proposed_uid;

    ensure!(
        new_doc_mapping.doc_mapping_uid != current_doc_mapping.doc_mapping_uid,
        "new doc mapping UID should differ from the current one, current UID `{}`, new UID `{}`",
        current_doc_mapping.doc_mapping_uid,
        new_doc_mapping.doc_mapping_uid,
    );
    let current_timestamp_field = current_doc_mapping.timestamp_field.as_deref();
    let new_timestamp_field = new_doc_mapping.timestamp_field.as_deref();
    ensure!(
        new_timestamp_field == current_timestamp_field,
        "updating timestamp field is not allowed, current timestamp field `{}`, new timestamp \
         field `{}`",
        current_timestamp_field.unwrap_or("none"),
        new_timestamp_field.unwrap_or("none"),
    );
    // TODO: Unsure this constraint is required, should we relax it?
    let current_tokenizers: HashSet<_> = current_doc_mapping.tokenizers.iter().collect();
    let new_tokenizers: HashSet<_> = new_doc_mapping.tokenizers.iter().collect();
    ensure!(
        new_tokenizers.is_superset(&current_tokenizers),
        "updating tokenizers is allowed only if adding new tokenizers, current tokenizers \
         `{current_tokenizers:?}`, new tokenizers `{new_tokenizers:?}`",
    );
    // Final sanity check: the new mapping must yield a working doc mapper.
    build_doc_mapper(&new_doc_mapping, search_settings).context("invalid doc mapping")?;
    Ok((new_doc_mapping, true))
}

#[cfg(test)]
mod tests {

use cron::TimeUnitSpec;
use quickwit_doc_mapper::ModeType;
use quickwit_doc_mapper::{Mode, ModeType, TokenizerEntry};
use quickwit_proto::types::DocMappingUid;

use super::*;
use crate::ConfigFormat;
Expand Down Expand Up @@ -981,4 +1038,96 @@ mod tests {
let error = serde_yaml::from_str::<IngestSettings>(settings_yaml).unwrap_err();
assert!(error.to_string().contains("expected a nonzero"));
}

#[test]
fn test_prepare_doc_mapping_update() {
let current_index_config = IndexConfig::for_test("test-index", "s3://test-index");
let mut current_doc_mapping = current_index_config.doc_mapping;
let search_settings = current_index_config.search_settings;

let tokenizer_json = r#"
{
"name": "breton-tokenizer",
"type": "regex",
"pattern": "crêpes*"
}
"#;
let tokenizer: TokenizerEntry = serde_json::from_str(tokenizer_json).unwrap();

current_doc_mapping.tokenizers.push(tokenizer.clone());

// The new doc mapping should have a different doc mapping UID.
let mut new_doc_mapping = current_doc_mapping.clone();
new_doc_mapping.store_source = false; // This is set to `true` for the current doc mapping.
let error =
prepare_doc_mapping_update(new_doc_mapping, &current_doc_mapping, &search_settings)
.unwrap_err()
.to_string();
assert!(error.contains("doc mapping UID should differ"));

// The new doc mapping should not change the timestamp field.
let mut new_doc_mapping = current_doc_mapping.clone();
new_doc_mapping.doc_mapping_uid = DocMappingUid::random();
new_doc_mapping.timestamp_field = Some("ts".to_string()); // This is set to `timestamp` for the current doc mapping.
let error =
prepare_doc_mapping_update(new_doc_mapping, &current_doc_mapping, &search_settings)
.unwrap_err()
.to_string();
assert!(error.contains("timestamp field"));

// The new doc mapping should not remove the timestamp field.
let mut new_doc_mapping = current_doc_mapping.clone();
new_doc_mapping.doc_mapping_uid = DocMappingUid::random();
new_doc_mapping.timestamp_field = None;
let error =
prepare_doc_mapping_update(new_doc_mapping, &current_doc_mapping, &search_settings)
.unwrap_err()
.to_string();
assert!(error.contains("timestamp field"));

// The new doc mapping should not remove tokenizers.
let mut new_doc_mapping = current_doc_mapping.clone();
new_doc_mapping.doc_mapping_uid = DocMappingUid::random();
new_doc_mapping.tokenizers.clear();
let error =
prepare_doc_mapping_update(new_doc_mapping, &current_doc_mapping, &search_settings)
.unwrap_err()
.to_string();
assert!(error.contains("tokenizers"));

// The new doc mapping should be "buildable" into a doc mapper.
let mut new_doc_mapping = current_doc_mapping.clone();
new_doc_mapping.doc_mapping_uid = DocMappingUid::random();
new_doc_mapping.tokenizers.push(tokenizer);
let error =
prepare_doc_mapping_update(new_doc_mapping, &current_doc_mapping, &search_settings)
.unwrap_err()
.source()
.unwrap()
.to_string();
assert!(error.contains("duplicated custom tokenizer"));

let mut new_doc_mapping = current_doc_mapping.clone();
new_doc_mapping.doc_mapping_uid = DocMappingUid::random();
let (updated_doc_mapping, mutation_occurred) =
prepare_doc_mapping_update(new_doc_mapping, &current_doc_mapping, &search_settings)
.unwrap();
assert!(!mutation_occurred);
assert_eq!(
updated_doc_mapping.doc_mapping_uid,
current_doc_mapping.doc_mapping_uid
);
assert_eq!(updated_doc_mapping, current_doc_mapping);

let mut new_doc_mapping = current_doc_mapping.clone();
let new_doc_mapping_uid = DocMappingUid::random();
new_doc_mapping.doc_mapping_uid = new_doc_mapping_uid;
new_doc_mapping.mode = Mode::Strict;
let (updated_doc_mapping, mutation_occurred) =
prepare_doc_mapping_update(new_doc_mapping, &current_doc_mapping, &search_settings)
.unwrap();
assert!(mutation_occurred);
assert_eq!(updated_doc_mapping.doc_mapping_uid, new_doc_mapping_uid);
assert_eq!(updated_doc_mapping.mode, Mode::Strict);
}
}
68 changes: 7 additions & 61 deletions quickwit/quickwit-config/src/index_config/serialize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,16 @@
// See the License for the specific language governing permissions and
// limitations under the License.

use std::collections::HashSet;

use anyhow::{Context, ensure};
use quickwit_common::uri::Uri;
use quickwit_doc_mapper::DocMapperBuilder;
use quickwit_proto::types::{DocMappingUid, IndexId};
use serde::{Deserialize, Serialize};
use tracing::info;

use super::{IngestSettings, validate_index_config};
use crate::{
ConfigFormat, DocMapping, IndexConfig, IndexingSettings, RetentionPolicy, SearchSettings,
validate_identifier,
prepare_doc_mapping_update, validate_identifier,
};

/// Alias for the latest serialization format.
Expand Down Expand Up @@ -92,63 +89,12 @@ pub fn load_index_config_update(
current_index_config.index_uri,
new_index_config.index_uri
);

// verify the new mapping is coherent
let doc_mapper_builder = DocMapperBuilder {
doc_mapping: new_index_config.doc_mapping.clone(),
default_search_fields: new_index_config
.search_settings
.default_search_fields
.clone(),
legacy_type_tag: None,
};
doc_mapper_builder
.try_build()
.context("invalid mapping update")?;

{
let new_mapping_uid = new_index_config.doc_mapping.doc_mapping_uid;
// we verify whether they are equal ignoring the mapping uid as it is generated at random:
// we don't want to record a mapping change when nothing really happened.
new_index_config.doc_mapping.doc_mapping_uid =
current_index_config.doc_mapping.doc_mapping_uid;
if new_index_config.doc_mapping != current_index_config.doc_mapping {
new_index_config.doc_mapping.doc_mapping_uid = new_mapping_uid;
ensure!(
current_index_config.doc_mapping.doc_mapping_uid
!= new_index_config.doc_mapping.doc_mapping_uid,
"`doc_mapping_doc_mapping_uid` must change when the doc mapping is updated",
);
ensure!(
current_index_config.doc_mapping.timestamp_field
== new_index_config.doc_mapping.timestamp_field,
"`doc_mapping.timestamp_field` cannot be updated, current value {}, new expected \
value {}",
current_index_config
.doc_mapping
.timestamp_field
.as_deref()
.unwrap_or("<none>"),
new_index_config
.doc_mapping
.timestamp_field
.as_deref()
.unwrap_or("<none>"),
);
// TODO: i'm not sure this is necessary, we can relax this requirement once we know
// for sure
let current_tokenizers: HashSet<_> =
current_index_config.doc_mapping.tokenizers.iter().collect();
let new_tokenizers: HashSet<_> =
new_index_config.doc_mapping.tokenizers.iter().collect();
ensure!(
new_tokenizers.is_superset(&current_tokenizers),
"`.doc_mapping.tokenizers` must be a superset of previously available tokenizers"
);
} else {
// the docmapping is unchanged, keep the old uid
}
}
let (updated_doc_mapping, _mutation_occurred) = prepare_doc_mapping_update(
new_index_config.doc_mapping,
&current_index_config.doc_mapping,
&new_index_config.search_settings,
)?;
new_index_config.doc_mapping = updated_doc_mapping;

Ok(new_index_config)
}
Expand Down
1 change: 1 addition & 0 deletions quickwit/quickwit-config/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ use index_config::serialize::{IndexConfigV0_8, VersionedIndexConfig};
pub use index_config::{
IndexConfig, IndexingResources, IndexingSettings, IngestSettings, RetentionPolicy,
SearchSettings, build_doc_mapper, load_index_config_from_user_config, load_index_config_update,
prepare_doc_mapping_update,
};
pub use quickwit_doc_mapper::DocMapping;
use serde::Serialize;
Expand Down
14 changes: 0 additions & 14 deletions quickwit/quickwit-doc-mapper/src/doc_mapping.rs
Original file line number Diff line number Diff line change
Expand Up @@ -166,20 +166,6 @@ impl DocMapping {
pub fn default_max_num_partitions() -> NonZeroU32 {
NonZeroU32::new(200).unwrap()
}

/// Returns whether the `other` doc mapping is equal to `self` leaving their respective doc
/// mapping UIDs out of the comparison.
pub fn eq_ignore_doc_mapping_uid(&self, other: &Self) -> bool {
let doc_mapping_uid = DocMappingUid::default();

let mut left = self.clone();
left.doc_mapping_uid = doc_mapping_uid;

let mut right = other.clone();
right.doc_mapping_uid = doc_mapping_uid;

left == right
}
}

#[cfg(test)]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ impl FileBackedIndex {
ingest_settings: IngestSettings,
search_settings: SearchSettings,
retention_policy_opt: Option<RetentionPolicy>,
) -> bool {
) -> MetastoreResult<bool> {
self.metadata.update_index_config(
doc_mapping,
indexing_settings,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -575,7 +575,7 @@ impl MetastoreService for FileBackedMetastore {
ingest_settings,
search_settings,
retention_policy_opt,
);
)?;
let index_metadata = index.metadata().clone();

if mutation_occurred {
Expand Down
Loading
Loading