Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 95 additions & 14 deletions quickwit/quickwit-doc-mapper/src/query_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,86 @@ impl<'a> QueryAstVisitor<'a> for RangeQueryFields {
}
}

struct ExistsQueryFastFields {
fields: HashSet<FastFieldWarmupInfo>,
/// Query AST visitor for term, term-set and full-text queries on fields which are fast
/// but not indexed.
///
/// Each such field is recorded as a `FastFieldWarmupInfo` in the borrowed `fields` set.
struct TermSearchOnColumnar<'f> {
/// Set borrowed from the caller; matching fields are inserted into it.
fields: &'f mut HashSet<FastFieldWarmupInfo>,
/// Schema used to resolve the field names referenced by the visited queries.
schema: Schema,
}
impl<'a, 'f> QueryAstVisitor<'a> for TermSearchOnColumnar<'f> {
type Err = Infallible;

fn visit_term_set(&mut self, term_set_query: &'a TermSetQuery) -> Result<(), Infallible> {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a test covering `TermSetQuery` on a fast field?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added a test

for field in term_set_query.terms_per_field.keys() {
if let Some((_field, field_entry, path)) =
find_field_or_hit_dynamic(field, &self.schema)
{
if field_entry.is_fast() && !field_entry.is_indexed() {
self.fields.insert(FastFieldWarmupInfo {
name: if path.is_empty() {
field_entry.name().to_string()
} else {
format!("{}.{}", field_entry.name(), path)
},
with_subfields: false,
});
}
}
}
Ok(())
}

/// Records the field targeted by a term query when that field is fast but not indexed.
///
/// Fields that cannot be resolved against the schema are ignored. This never fails:
/// the error type is `Infallible`.
fn visit_term(
    &mut self,
    term_query: &'a quickwit_query::query_ast::TermQuery,
) -> Result<(), Infallible> {
    let Some((_field, field_entry, path)) =
        find_field_or_hit_dynamic(&term_query.field, &self.schema)
    else {
        return Ok(());
    };
    // Only fast fields without an inverted index are of interest here.
    if !field_entry.is_fast() || field_entry.is_indexed() {
        return Ok(());
    }
    // A non-empty path is appended to the field entry name with a dot.
    let name = if path.is_empty() {
        field_entry.name().to_string()
    } else {
        format!("{}.{}", field_entry.name(), path)
    };
    self.fields.insert(FastFieldWarmupInfo {
        name,
        with_subfields: false,
    });
    Ok(())
}
/// Full text queries must be visited as well, since they can be converted to term
/// queries on fast fields. A field is recorded only when it is fast, not indexed, AND
/// the query's tokenizer is either unset or `raw`.
fn visit_full_text(&mut self, full_text_query: &'a FullTextQuery) -> Result<(), Infallible> {
    let Some((_field, field_entry, path)) =
        find_field_or_hit_dynamic(&full_text_query.field, &self.schema)
    else {
        return Ok(());
    };
    // Only fast fields without an inverted index are of interest here.
    if !field_entry.is_fast() || field_entry.is_indexed() {
        return Ok(());
    }
    // Any tokenizer other than `raw` (or none at all) is skipped.
    let tokenizer_is_raw_or_unset = matches!(
        full_text_query.params.tokenizer.as_deref(),
        None | Some("raw")
    );
    if !tokenizer_is_raw_or_unset {
        return Ok(());
    }
    // A non-empty path is appended to the field entry name with a dot.
    let name = if path.is_empty() {
        field_entry.name().to_string()
    } else {
        format!("{}.{}", field_entry.name(), path)
    };
    self.fields.insert(FastFieldWarmupInfo {
        name,
        with_subfields: false,
    });
    Ok(())
}
}

impl<'a> QueryAstVisitor<'a> for ExistsQueryFastFields {
/// Query AST visitor for field-presence ("exists") queries.
///
/// The `fields` set is borrowed rather than owned, so this visitor can share a single
/// set with other visitors.
// NOTE(review): the `visit_exists` body is not visible in this view; presumably it
// inserts the fast fields needed by exists queries into `fields` — confirm.
struct ExistsQueryFastFields<'f> {
/// Set borrowed from the caller in which warmup info is accumulated.
fields: &'f mut HashSet<FastFieldWarmupInfo>,
/// Schema handed to the visitor alongside the query.
schema: Schema,
}

impl<'a, 'f> QueryAstVisitor<'a> for ExistsQueryFastFields<'f> {
type Err = Infallible;

fn visit_exists(&mut self, exists_query: &'a FieldPresenceQuery) -> Result<(), Infallible> {
Expand Down Expand Up @@ -88,18 +162,11 @@ pub(crate) fn build_query(
search_fields: &[String],
with_validation: bool,
) -> Result<(Box<dyn Query>, WarmupInfo), QueryParserError> {
let mut range_query_fields = RangeQueryFields::default();
// This cannot fail. The error type is Infallible.
let _: Result<(), Infallible> = range_query_fields.visit(query_ast);
let mut fast_fields: HashSet<FastFieldWarmupInfo> = HashSet::new();

let mut exists_query_fields = ExistsQueryFastFields {
fields: HashSet::new(),
schema: schema.clone(),
};
let mut range_query_fields = RangeQueryFields::default();
// This cannot fail. The error type is Infallible.
let _: Result<(), Infallible> = exists_query_fields.visit(query_ast);

let mut fast_fields = HashSet::new();
let Ok(_) = range_query_fields.visit(query_ast);
let range_query_fast_fields =
range_query_fields
.range_query_field_names
Expand All @@ -109,7 +176,18 @@ pub(crate) fn build_query(
with_subfields: false,
});
fast_fields.extend(range_query_fast_fields);
fast_fields.extend(exists_query_fields.fields);

let Ok(_) = TermSearchOnColumnar {
fields: &mut fast_fields,
schema: schema.clone(),
}
.visit(query_ast);

let Ok(_) = ExistsQueryFastFields {
fields: &mut fast_fields,
schema: schema.clone(),
}
.visit(query_ast);

let query = query_ast.build_tantivy_query(
&schema,
Expand All @@ -125,6 +203,9 @@ pub(crate) fn build_query(
let mut terms_grouped_by_field: HashMap<Field, HashMap<_, bool>> = Default::default();
query.query_terms(&mut |term, need_position| {
let field = term.field();
if !schema.get_field_entry(field).is_indexed() {
return;
}
*terms_grouped_by_field
.entry(field)
.or_default()
Expand Down
22 changes: 14 additions & 8 deletions quickwit/quickwit-query/src/query_ast/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,15 @@ use tantivy::json_utils::convert_to_fast_value_and_append_to_json_term;
use tantivy::query::TermQuery as TantivyTermQuery;
use tantivy::schema::{
Field, FieldEntry, FieldType, IndexRecordOption, JsonObjectOptions, Schema as TantivySchema,
Type,
TextFieldIndexing, Type,
};

use crate::InvalidQuery;
use crate::MatchAllOrNone::MatchNone as TantivyEmptyQuery;
use crate::json_literal::InterpretUserInput;
use crate::query_ast::full_text_query::FullTextParams;
use crate::query_ast::tantivy_query_ast::{TantivyBoolQuery, TantivyQueryAst};
use crate::tokenizers::TokenizerManager;
use crate::tokenizers::{RAW_TOKENIZER_NAME, TokenizerManager};

pub(crate) const DYNAMIC_FIELD_NAME: &str = "_dynamic";

Expand Down Expand Up @@ -147,12 +147,18 @@ fn compute_query_with_field(
Ok(make_term_query(term))
}
FieldType::Str(text_options) => {
let text_field_indexing = text_options.get_indexing_options().ok_or_else(|| {
InvalidQuery::SchemaError(format!(
"field {} is not full-text searchable",
field_entry.name()
))
})?;
let columnar_opt = TextFieldIndexing::default()
.set_fieldnorms(false)
.set_tokenizer(RAW_TOKENIZER_NAME);
let text_field_indexing = text_options
.get_indexing_options()
.or_else(|| text_options.is_fast().then_some(&columnar_opt))
.ok_or_else(|| {
InvalidQuery::SchemaError(format!(
"field {} is not full-text searchable",
field_entry.name()
))
})?;
let terms = full_text_params.tokenize_text_into_terms(
field,
value,
Expand Down
2 changes: 1 addition & 1 deletion quickwit/quickwit-query/src/tokenizers/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ use self::chinese_compatible::ChineseTokenizer;
pub use self::code_tokenizer::CodeTokenizer;
#[cfg(feature = "multilang")]
pub use self::multilang::MultiLangTokenizer;
pub use self::tokenizer_manager::TokenizerManager;
pub use self::tokenizer_manager::{RAW_TOKENIZER_NAME, TokenizerManager};

pub const DEFAULT_REMOVE_TOKEN_LENGTH: usize = 255;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ use tantivy::tokenizer::{

use crate::DEFAULT_REMOVE_TOKEN_LENGTH;

const RAW_TOKENIZER_NAME: &str = "raw";
pub const RAW_TOKENIZER_NAME: &str = "raw";
const LOWERCASE_TOKENIZER_NAME: &str = "lowercase";
const RAW_LOWERCASE_TOKENIZER_NAME: &str = "raw_lowercase";

Expand Down
15 changes: 12 additions & 3 deletions quickwit/rest-api-tests/scenarii/es_compatibility/0020-stats.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -64,12 +64,12 @@ expected:
_all:
primaries:
docs:
count: 100
count: 102
total:
segments:
count: 1
count: 2
docs:
count: 100
count: 102
indices:
gharchive:
primaries:
Expand All @@ -80,6 +80,15 @@ expected:
count: 1
docs:
count: 100
fast_only:
primaries:
docs:
count: 2
total:
segments:
count: 1
docs:
count: 2
empty_index:
primaries:
docs:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,13 @@ endpoint: "_cat/indices?format=json"
expected:
- index: empty_index
docs.count: '0'
- dataset.size: 222.8kb
- index: fast_only
docs.count: '2'
- index: gharchive
dataset.size: 222.8kb
docs.count: '100'
docs.deleted: '0'
health: green
index: gharchive
pri: '1'
pri.store.size:
$expect: 270 < float(val[:-2]) < 280
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
# Search for a term in a field that is not indexed but is a fast field
engines:
- quickwit
endpoint: "fast_only/_search"
params:
size: 0
json:
query:
term:
fast_text: "abc-123"
expected:
hits:
total:
value: 1
relation: "eq"
--- # term query with no matches
engines:
- quickwit
endpoint: "fast_only/_search"
params:
size: 0
json:
query:
term:
fast_text: "zzz"
expected:
hits:
total:
value: 0
relation: "eq"

--- # term set query with partial match
engines:
- quickwit
endpoint: "fast_only/_search"
params:
size: 0
json:
query:
terms:
fast_text:
- "abc-123"
- "zzz"
expected:
hits:
total:
value: 1
relation: "eq"

--- # term set query with multiple matches
engines:
- quickwit
endpoint: "fast_only/_search"
params:
size: 0
json:
query:
terms:
fast_text:
- "abc-123"
- "def-456"
expected:
hits:
total:
value: 2
relation: "eq"

--- # term query on nested JSON field
engines:
- quickwit
endpoint: "fast_only/_search"
params:
size: 0
json:
query:
term:
obj.nested_text: "abc-123"
expected:
hits:
total:
value: 1
relation: "eq"

--- # term query on nested JSON field with no matches
engines:
- quickwit
endpoint: "fast_only/_search"
params:
size: 0
json:
query:
term:
obj.nested_text: "zzz"
expected:
hits:
total:
value: 0
relation: "eq"

--- # term set query on nested JSON field with multiple matches
engines:
- quickwit
endpoint: "fast_only/_search"
params:
size: 0
json:
query:
terms:
obj.nested_text:
- "abc-123"
- "ghi-789"
expected:
hits:
total:
value: 2
relation: "eq"

--- # term set query on nested JSON field with no matches
engines:
- quickwit
endpoint: "fast_only/_search"
params:
size: 0
json:
query:
terms:
obj.nested_text:
- "zzz"
expected:
hits:
total:
value: 0
relation: "eq"
Loading