diff --git a/src/postings/docset.rs b/src/postings/docset.rs index d07c9228b6..24ae1ffe66 100644 --- a/src/postings/docset.rs +++ b/src/postings/docset.rs @@ -17,23 +17,23 @@ pub enum SkipResult { } -/// Represents an iterable set of sorted doc ids. +/// Represents an iterable set of sorted doc ids. pub trait DocSet { /// Goes to the next element. /// `.advance(...)` needs to be called a first time to point to the correct /// element. - fn advance(&mut self,) -> bool; - + fn advance(&mut self) -> bool; + /// After skipping, position the iterator in such a way that `.doc()` /// will return a value greater than or equal to target. - /// + /// /// SkipResult expresses whether the `target value` was reached, overstepped, /// or if the `DocSet` was entirely consumed without finding any value /// greater or equal to the `target`. /// /// WARNING: Calling skip always advances the docset. /// More specifically, if the docset is already positionned on the target - /// skipping will advance to the next position and return SkipResult::Overstep. + /// skipping will advance to the next position and return SkipResult::Overstep. /// fn skip_next(&mut self, target: DocId) -> SkipResult { self.advance(); @@ -43,32 +43,30 @@ pub trait DocSet { if !self.advance() { return SkipResult::End; } - }, - Ordering::Equal => { return SkipResult::Reached }, - Ordering::Greater => { return SkipResult::OverStep }, + } + Ordering::Equal => return SkipResult::Reached, + Ordering::Greater => return SkipResult::OverStep, } } } - + /// Returns the current document - fn doc(&self,) -> DocId; - + fn doc(&self) -> DocId; + /// Advances the cursor to the next document - /// None is returned if the iterator has `DocSet` - /// has already been entirely consumed. - fn next(&mut self,) -> Option { + /// None is returned if the iterator has `DocSet` + /// has already been entirely consumed. 
+ fn next(&mut self) -> Option { if self.advance() { Some(self.doc()) - } - else { + } else { None } - } + } } impl DocSet for Box { - - fn advance(&mut self,) -> bool { + fn advance(&mut self) -> bool { let unboxed: &mut TDocSet = self.borrow_mut(); unboxed.advance() } @@ -78,28 +76,25 @@ impl DocSet for Box { unboxed.skip_next(target) } - fn doc(&self,) -> DocId { + fn doc(&self) -> DocId { let unboxed: &TDocSet = self.borrow(); unboxed.doc() } } impl<'a, TDocSet: DocSet> DocSet for &'a mut TDocSet { - - fn advance(&mut self,) -> bool { + fn advance(&mut self) -> bool { let unref: &mut TDocSet = *self; unref.advance() } - + fn skip_next(&mut self, target: DocId) -> SkipResult { let unref: &mut TDocSet = *self; unref.skip_next(target) } - fn doc(&self,) -> DocId { + fn doc(&self) -> DocId { let unref: &TDocSet = *self; unref.doc() } } - - diff --git a/src/postings/freq_handler.rs b/src/postings/freq_handler.rs index cb4d6d92e6..70808798ae 100644 --- a/src/postings/freq_handler.rs +++ b/src/postings/freq_handler.rs @@ -17,7 +17,7 @@ pub struct FreqHandler { fn read_positions(data: &[u8]) -> Vec { - let mut composite_reader = CompositeDecoder::new(); + let mut composite_reader = CompositeDecoder::new(); let mut readable: &[u8] = data; let uncompressed_len = VInt::deserialize(&mut readable).unwrap().0 as usize; composite_reader.uncompress_unsorted(readable, uncompressed_len); @@ -27,17 +27,16 @@ fn read_positions(data: &[u8]) -> Vec { impl FreqHandler { - /// Returns a `FreqHandler` that just decodes `DocId`s. pub fn new_without_freq() -> FreqHandler { FreqHandler { freq_decoder: SIMDBlockDecoder::with_val(1u32), - positions: Vec::new(), + positions: Vec::new(), option: SegmentPostingsOption::NoFreq, positions_offsets: [0; NUM_DOCS_PER_BLOCK + 1], } } - + /// Returns a `FreqHandler` that decodes `DocId`s and term frequencies. 
pub fn new_with_freq() -> FreqHandler { FreqHandler { @@ -54,15 +53,15 @@ impl FreqHandler { let positions = read_positions(position_data); FreqHandler { freq_decoder: SIMDBlockDecoder::new(), - positions: positions, + positions: positions, option: SegmentPostingsOption::FreqAndPositions, positions_offsets: [0; NUM_DOCS_PER_BLOCK + 1], } } - - fn fill_positions_offset(&mut self,) { + + fn fill_positions_offset(&mut self) { let mut cur_position: usize = self.positions_offsets[NUM_DOCS_PER_BLOCK]; - let mut i: usize = 0; + let mut i: usize = 0; self.positions_offsets[i] = cur_position; let mut last_cur_position = cur_position; for &doc_freq in self.freq_decoder.output_array() { @@ -78,16 +77,16 @@ impl FreqHandler { last_cur_position = cur_position; } } - - + + /// Accessor to term frequency /// /// idx is the offset of the current doc in the block. /// It takes value between 0 and 128. - pub fn freq(&self, idx: usize)-> u32 { + pub fn freq(&self, idx: usize) -> u32 { self.freq_decoder.output(idx) } - + /// Accessor to the positions /// /// idx is the offset of the current doc in the block. 
@@ -97,16 +96,12 @@ impl FreqHandler { let stop = self.positions_offsets[idx + 1]; &self.positions[start..stop] } - + /// Decompresses a complete frequency block pub fn read_freq_block<'a>(&mut self, data: &'a [u8]) -> &'a [u8] { match self.option { - SegmentPostingsOption::NoFreq => { - data - } - SegmentPostingsOption::Freq => { - self.freq_decoder.uncompress_block_unsorted(data) - } + SegmentPostingsOption::NoFreq => data, + SegmentPostingsOption::Freq => self.freq_decoder.uncompress_block_unsorted(data), SegmentPostingsOption::FreqAndPositions => { let remaining: &'a [u8] = self.freq_decoder.uncompress_block_unsorted(data); self.fill_positions_offset(); @@ -114,7 +109,7 @@ impl FreqHandler { } } } - + /// Decompresses an incomplete frequency block pub fn read_freq_vint(&mut self, data: &[u8], num_els: usize) { match self.option { @@ -128,5 +123,4 @@ impl FreqHandler { } } } - } \ No newline at end of file diff --git a/src/postings/intersection.rs b/src/postings/intersection.rs index 6946a652e1..75699065c6 100644 --- a/src/postings/intersection.rs +++ b/src/postings/intersection.rs @@ -7,7 +7,7 @@ use DocId; /// Creates a `DocSet` that iterator through the intersection of two `DocSet`s. pub struct IntersectionDocSet { docsets: Vec, - finished: bool, + finished: bool, doc: DocId, } @@ -18,11 +18,14 @@ impl From> for IntersectionDocSet { docsets: docsets, finished: false, doc: DocId::max_value(), - } + } } } impl IntersectionDocSet { + /// Returns an array to the underlying `DocSet`s of the intersection. + /// These `DocSet` are in the same position as the `IntersectionDocSet`, + /// so that user can access their `docfreq` and `positions`. pub fn docsets(&self) -> &[TDocSet] { &self.docsets[..] 
} @@ -30,8 +33,7 @@ impl IntersectionDocSet { impl DocSet for IntersectionDocSet { - - fn advance(&mut self,) -> bool { + fn advance(&mut self) -> bool { if self.finished { return false; } @@ -71,8 +73,8 @@ impl DocSet for IntersectionDocSet { } } } - - fn doc(&self,) -> DocId { + + fn doc(&self) -> DocId { self.doc } } diff --git a/src/postings/offset_postings.rs b/src/postings/offset_postings.rs index fe7ea453d9..1410ef922b 100644 --- a/src/postings/offset_postings.rs +++ b/src/postings/offset_postings.rs @@ -15,7 +15,6 @@ pub struct OffsetPostings<'a> { } impl<'a> OffsetPostings<'a> { - /// Constructor pub fn new(underlying: SegmentPostings<'a>, offset: DocId) -> OffsetPostings { OffsetPostings { @@ -26,38 +25,35 @@ impl<'a> OffsetPostings<'a> { } impl<'a> DocSet for OffsetPostings<'a> { - fn advance(&mut self,) -> bool { + fn advance(&mut self) -> bool { self.underlying.advance() } - - fn doc(&self,) -> DocId { + + fn doc(&self) -> DocId { self.underlying.doc() + self.offset } - + fn skip_next(&mut self, target: DocId) -> SkipResult { if target >= self.offset { SkipResult::OverStep - } - else { - self.underlying.skip_next(target - self.offset) + } else { + self.underlying.skip_next(target - self.offset) } } } impl<'a> HasLen for OffsetPostings<'a> { - fn len(&self,) -> usize { + fn len(&self) -> usize { self.underlying.len() } } impl<'a> Postings for OffsetPostings<'a> { - - fn term_freq(&self,) -> u32 { + fn term_freq(&self) -> u32 { self.underlying.term_freq() } - + fn positions(&self) -> &[u32] { self.underlying.positions() } - } \ No newline at end of file diff --git a/src/postings/postings.rs b/src/postings/postings.rs index dbd83a9973..52f16198ac 100644 --- a/src/postings/postings.rs +++ b/src/postings/postings.rs @@ -7,45 +7,38 @@ use postings::docset::DocSet; /// containing the term. Optionally, for each document, /// it may also give access to the term frequency /// as well as the list of term positions. 
-/// +/// /// Its main implementation is `SegmentPostings`, /// but other implementations mocking `SegmentPostings` exist, /// for merging segments or for testing. pub trait Postings: DocSet { /// Returns the term frequency - fn term_freq(&self,) -> u32; + fn term_freq(&self) -> u32; /// Returns the list of positions of the term, expressed as a list of /// token ordinals. fn positions(&self) -> &[u32]; } impl Postings for Box { - - fn term_freq(&self,) -> u32 { + fn term_freq(&self) -> u32 { let unboxed: &TPostings = self.borrow(); unboxed.term_freq() } - + fn positions(&self) -> &[u32] { let unboxed: &TPostings = self.borrow(); unboxed.positions() } - } impl<'a, TPostings: Postings> Postings for &'a mut TPostings { - - fn term_freq(&self,) -> u32 { + fn term_freq(&self) -> u32 { let unref: &TPostings = *self; unref.term_freq() } - + fn positions(&self) -> &[u32] { let unref: &TPostings = *self; unref.positions() } - } - - - diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index 3b6ddd4404..c3d1f997f6 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -9,12 +9,11 @@ use schema::Field; use analyzer::StreamingIterator; use datastruct::stacker::{HashMap, Heap}; -/// The `PostingsWriter` is in charge of receiving documenting +/// The `PostingsWriter` is in charge of receiving documenting /// and building a `Segment` in anonymous memory. /// /// `PostingsWriter` writes in a `Heap`. pub trait PostingsWriter { - /// Record that a document contains a term at a given position. /// /// * doc - the document id @@ -22,17 +21,22 @@ pub trait PostingsWriter { /// * term - the term /// * heap - heap used to store the postings informations as well as the terms /// in the hashmap. - fn suscribe(&mut self, doc: DocId, pos: u32, term: &Term, heap: &Heap); - + fn suscribe(&mut self, doc: DocId, pos: u32, term: &Term, heap: &Heap); + /// Serializes the postings on disk. 
/// The actual serialization format is handled by the `PostingsSerializer`. fn serialize(&self, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()>; - + /// Closes all of the currently open `Recorder`'s. fn close(&mut self, heap: &Heap); - + /// Tokenize a text and suscribe all of its token. - fn index_text<'a>(&mut self, doc_id: DocId, field: Field, field_values: &[&'a FieldValue], heap: &Heap) -> u32 { + fn index_text<'a>(&mut self, + doc_id: DocId, + field: Field, + field_values: &[&'a FieldValue], + heap: &Heap) + -> u32 { let mut pos = 0u32; let mut num_tokens: u32 = 0u32; let mut term = Term::allocate(field, 100); @@ -65,7 +69,7 @@ fn hashmap_size_in_bits(heap_capacity: u32) -> usize { let num_buckets_usable = heap_capacity / 100; let hash_table_size = num_buckets_usable * 2; let mut pow = 512; - for num_bits in 10 .. 32 { + for num_bits in 10..32 { pow <<= 1; if pow > hash_table_size { return num_bits; @@ -75,31 +79,26 @@ fn hashmap_size_in_bits(heap_capacity: u32) -> usize { } impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> { - /// constructor pub fn new(heap: &'a Heap) -> SpecializedPostingsWriter<'a, Rec> { let capacity = heap.capacity(); let hashmap_size = hashmap_size_in_bits(capacity); - SpecializedPostingsWriter { - term_index: HashMap::new(hashmap_size, heap), - } + SpecializedPostingsWriter { term_index: HashMap::new(hashmap_size, heap) } } - + /// Builds a `SpecializedPostingsWriter` storing its data in a heap. 
pub fn new_boxed(heap: &'a Heap) -> Box { Box::new(SpecializedPostingsWriter::::new(heap)) - } - + } } impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'a, Rec> { - fn close(&mut self, heap: &Heap) { for recorder in self.term_index.values_mut() { recorder.close_doc(heap); } } - + #[inline] fn suscribe(&mut self, doc: DocId, position: u32, term: &Term, heap: &Heap) { let mut recorder = self.term_index.get_or_create(term); @@ -112,9 +111,9 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<' } recorder.record_position(position, heap); } - + fn serialize(&self, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()> { - let mut term_offsets: Vec<(&[u8], (u32, &Rec))> = self.term_index + let mut term_offsets: Vec<(&[u8], (u32, &Rec))> = self.term_index .iter() .collect(); term_offsets.sort_by_key(|&(k, _v)| k); @@ -128,8 +127,6 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<' } Ok(()) } - - } diff --git a/src/postings/recorder.rs b/src/postings/recorder.rs index 095102a3d4..94173720b7 100644 --- a/src/postings/recorder.rs +++ b/src/postings/recorder.rs @@ -4,32 +4,36 @@ use postings::PostingsSerializer; use datastruct::stacker::{ExpUnrolledLinkedList, Heap, HeapAllocable}; const EMPTY_ARRAY: [u32; 0] = [0u32; 0]; -const POSITION_END: u32 = 4294967295; +const POSITION_END: u32 = 4294967295; /// Recorder is in charge of recording relevant information about /// the presence of a term in a document. 
/// -/// Depending on the `TextIndexingOptions` associated to the +/// Depending on the `TextIndexingOptions` associated to the /// field, the recorder may records /// * the document frequency -/// * the document id +/// * the document id /// * the term frequency /// * the term positions pub trait Recorder: HeapAllocable { /// Returns the current document - fn current_doc(&self,) -> u32; + fn current_doc(&self) -> u32; /// Starts recording information about a new document - /// This method shall only be called if the term is within the document. + /// This method shall only be called if the term is within the document. fn new_doc(&mut self, doc: DocId, heap: &Heap); - /// Record the position of a term. For each document, + /// Record the position of a term. For each document, /// this method will be called `term_freq` times. fn record_position(&mut self, position: u32, heap: &Heap); - /// Close the document. It will help record the term frequency. + /// Close the document. It will help record the term frequency. fn close_doc(&mut self, heap: &Heap); /// Returns the number of document that have been seen so far - fn doc_freq(&self,) -> u32; + fn doc_freq(&self) -> u32; /// Pushes the postings information to the serializer. 
- fn serialize(&self, self_addr: u32, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()>; + fn serialize(&self, + self_addr: u32, + serializer: &mut PostingsSerializer, + heap: &Heap) + -> io::Result<()>; } /// Only records the doc ids @@ -51,11 +55,10 @@ impl HeapAllocable for NothingRecorder { } impl Recorder for NothingRecorder { - - fn current_doc(&self,) -> DocId { + fn current_doc(&self) -> DocId { self.current_doc } - + fn new_doc(&mut self, doc: DocId, heap: &Heap) { self.current_doc = doc; self.stack.push(doc, heap); @@ -66,17 +69,20 @@ impl Recorder for NothingRecorder { fn close_doc(&mut self, _heap: &Heap) {} - fn doc_freq(&self,) -> u32 { + fn doc_freq(&self) -> u32 { self.doc_freq } - - fn serialize(&self, self_addr: u32, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()> { + + fn serialize(&self, + self_addr: u32, + serializer: &mut PostingsSerializer, + heap: &Heap) + -> io::Result<()> { for doc in self.stack.iter(self_addr, heap) { try!(serializer.write_doc(doc, 0u32, &EMPTY_ARRAY)); } Ok(()) } - } /// Recorder encoding document ids, and term frequencies @@ -94,16 +100,13 @@ impl HeapAllocable for TermFrequencyRecorder { stack: ExpUnrolledLinkedList::with_addr(addr), current_doc: u32::max_value(), current_tf: 0u32, - doc_freq: 0u32 - } + doc_freq: 0u32, + } } } impl Recorder for TermFrequencyRecorder { - - - - fn current_doc(&self,) -> DocId { + fn current_doc(&self) -> DocId { self.current_doc } @@ -112,22 +115,26 @@ impl Recorder for TermFrequencyRecorder { self.current_doc = doc; self.stack.push(doc, heap); } - + fn record_position(&mut self, _position: u32, _heap: &Heap) { self.current_tf += 1; } - + fn close_doc(&mut self, heap: &Heap) { debug_assert!(self.current_tf > 0); self.stack.push(self.current_tf, heap); self.current_tf = 0; } - - fn doc_freq(&self,) -> u32 { + + fn doc_freq(&self) -> u32 { self.doc_freq } - - fn serialize(&self, self_addr:u32, serializer: &mut PostingsSerializer, heap: &Heap) -> 
io::Result<()> { + + fn serialize(&self, + self_addr: u32, + serializer: &mut PostingsSerializer, + heap: &Heap) + -> io::Result<()> { let mut doc_iter = self.stack.iter(self_addr, heap); loop { if let Some(doc) = doc_iter.next() { @@ -140,7 +147,6 @@ impl Recorder for TermFrequencyRecorder { } Ok(()) } - } /// Recorder encoding term frequencies as well as positions. @@ -162,12 +168,10 @@ impl HeapAllocable for TFAndPositionRecorder { } impl Recorder for TFAndPositionRecorder { - - - fn current_doc(&self,) -> DocId { + fn current_doc(&self) -> DocId { self.current_doc } - + fn new_doc(&mut self, doc: DocId, heap: &Heap) { self.doc_freq += 1; self.current_doc = doc; @@ -177,16 +181,20 @@ impl Recorder for TFAndPositionRecorder { fn record_position(&mut self, position: u32, heap: &Heap) { self.stack.push(position, heap); } - + fn close_doc(&mut self, heap: &Heap) { self.stack.push(POSITION_END, heap); } - - fn doc_freq(&self,) -> u32 { + + fn doc_freq(&self) -> u32 { self.doc_freq } - - fn serialize(&self, self_addr: u32, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()> { + + fn serialize(&self, + self_addr: u32, + serializer: &mut PostingsSerializer, + heap: &Heap) + -> io::Result<()> { let mut doc_positions = Vec::with_capacity(100); let mut positions_iter = self.stack.iter(self_addr, heap); while let Some(doc) = positions_iter.next() { @@ -197,8 +205,7 @@ impl Recorder for TFAndPositionRecorder { Some(position) => { if position == POSITION_END { break; - } - else { + } else { doc_positions.push(position - prev_position); prev_position = position; } @@ -212,7 +219,4 @@ impl Recorder for TFAndPositionRecorder { } Ok(()) } - } - - diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index cac9b86c88..0bb8af8e3f 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -6,9 +6,9 @@ use std::num::Wrapping; const EMPTY_DATA: [u8; 0] = [0u8; 0]; -/// `SegmentPostings` represents the inverted 
list or postings associated to +/// `SegmentPostings` represents the inverted list or postings associated to /// a term in a `Segment`. -/// +/// /// As we iterate through the `SegmentPostings`, the frequencies are optionally decoded. /// Positions on the other hand, are optionally entirely decoded upfront. pub struct SegmentPostings<'a> { @@ -16,21 +16,21 @@ pub struct SegmentPostings<'a> { doc_offset: u32, block_decoder: SIMDBlockDecoder, freq_handler: FreqHandler, - remaining_data: &'a[u8], + remaining_data: &'a [u8], cur: Wrapping, } impl<'a> SegmentPostings<'a> { - - fn load_next_block(&mut self,) { + fn load_next_block(&mut self) { let num_remaining_docs = self.len - self.cur.0; if num_remaining_docs >= NUM_DOCS_PER_BLOCK { - self.remaining_data = self.block_decoder.uncompress_block_sorted(self.remaining_data, self.doc_offset); + self.remaining_data = self.block_decoder + .uncompress_block_sorted(self.remaining_data, self.doc_offset); self.remaining_data = self.freq_handler.read_freq_block(self.remaining_data); self.doc_offset = self.block_decoder.output(NUM_DOCS_PER_BLOCK - 1); - } - else { - self.remaining_data = self.block_decoder.uncompress_vint_sorted(self.remaining_data, self.doc_offset, num_remaining_docs); + } else { + self.remaining_data = self.block_decoder + .uncompress_vint_sorted(self.remaining_data, self.doc_offset, num_remaining_docs); self.freq_handler.read_freq_vint(self.remaining_data, num_remaining_docs); } } @@ -39,7 +39,7 @@ impl<'a> SegmentPostings<'a> { /// /// * `len` - number of document in the posting lists. /// * `data` - data array. The complete data is not necessarily used. 
- /// * `freq_handler` - the freq handler is in charge of decoding + /// * `freq_handler` - the freq handler is in charge of decoding /// frequencies and/or positions pub fn from_data(len: u32, data: &'a [u8], freq_handler: FreqHandler) -> SegmentPostings<'a> { SegmentPostings { @@ -51,7 +51,7 @@ impl<'a> SegmentPostings<'a> { cur: Wrapping(usize::max_value()), } } - + /// Returns an empty segment postings object pub fn empty() -> SegmentPostings<'static> { SegmentPostings { @@ -65,11 +65,10 @@ impl<'a> SegmentPostings<'a> { } /// Index within a block is used as an address when - /// interacting with the `FreqHandler` - fn index_within_block(&self,) -> usize { + /// interacting with the `FreqHandler` + fn index_within_block(&self) -> usize { self.cur.0 % NUM_DOCS_PER_BLOCK } - } @@ -77,7 +76,7 @@ impl<'a> DocSet for SegmentPostings<'a> { // goes to the next element. // next needs to be called a first time to point to the correct element. #[inline] - fn advance(&mut self,) -> bool { + fn advance(&mut self) -> bool { self.cur += Wrapping(1); if self.cur.0 >= self.len { return false; @@ -87,27 +86,25 @@ impl<'a> DocSet for SegmentPostings<'a> { } true } - + #[inline] - fn doc(&self,) -> DocId { + fn doc(&self) -> DocId { self.block_decoder.output(self.index_within_block()) } - } impl<'a> HasLen for SegmentPostings<'a> { - fn len(&self,) -> usize { + fn len(&self) -> usize { self.len } } impl<'a> Postings for SegmentPostings<'a> { - fn term_freq(&self,) -> u32 { + fn term_freq(&self) -> u32 { self.freq_handler.freq(self.index_within_block()) } - + fn positions(&self) -> &[u32] { self.freq_handler.positions(self.index_within_block()) } } - diff --git a/src/postings/segment_postings_option.rs b/src/postings/segment_postings_option.rs index 082ea0660b..cf2f8b9369 100644 --- a/src/postings/segment_postings_option.rs +++ b/src/postings/segment_postings_option.rs @@ -2,7 +2,7 @@ /// Object describing the amount of information required when reading a postings. 
/// -/// Since decoding information is not free, this makes it possible to +/// Since decoding information is not free, this makes it possible to /// avoid this extra cost when the information is not required. /// For instance, positions are useful when running phrase queries /// but useless in other queries. @@ -14,4 +14,4 @@ pub enum SegmentPostingsOption { Freq, /// DocIds, term frequencies and positions will be decoded. FreqAndPositions, -} \ No newline at end of file +} diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index 673a9059d3..3316d1f5e0 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -19,14 +19,14 @@ use common::BinarySerializable; /// `PostingsSerializer` is in charge of serializing -/// postings on disk, in the +/// postings on disk, in the /// * `.idx` (inverted index) /// * `.pos` (positions file) /// * `.term` (term dictionary) -/// -/// `PostingsWriter` are in charge of pushing the data to the +/// +/// `PostingsWriter` are in charge of pushing the data to the /// serializer. -/// +/// /// The serializer expects to receive the following calls /// in this order : /// @@ -45,10 +45,10 @@ use common::BinarySerializable; /// Terms have to be pushed in a lexicographically-sorted order. /// Within a term, document have to be pushed in increasing order. /// -/// A description of the serialization format is -/// [available here](https://fulmicoton.gitbooks.io/tantivy-doc/content/inverted-index.html). +/// A description of the serialization format is +/// [available here](https://fulmicoton.gitbooks.io/tantivy-doc/content/inverted-index.html). 
pub struct PostingsSerializer { - terms_fst_builder: FstMapBuilder, // TODO find an alternative to work around the "move" + terms_fst_builder: FstMapBuilder, /* TODO find an alternative to work around the "move" */ postings_write: WritePtr, positions_write: WritePtr, written_bytes_postings: usize, @@ -65,14 +65,12 @@ pub struct PostingsSerializer { } impl PostingsSerializer { - - /// Open a new `PostingsSerializer` for the given segment - pub fn new( - terms_write: WritePtr, - postings_write: WritePtr, - positions_write: WritePtr, - schema: Schema - ) -> Result { + /// Open a new `PostingsSerializer` for the given segment + pub fn new(terms_write: WritePtr, + postings_write: WritePtr, + positions_write: WritePtr, + schema: Schema) + -> Result { let terms_fst_builder = try!(FstMapBuilder::new(terms_write)); Ok(PostingsSerializer { terms_fst_builder: terms_fst_builder, @@ -91,41 +89,36 @@ impl PostingsSerializer { term_open: false, }) } - - - /// Open a new `PostingsSerializer` for the given segment + + + /// Open a new `PostingsSerializer` for the given segment pub fn open(segment: &mut Segment) -> Result { let terms_write = try!(segment.open_write(SegmentComponent::TERMS)); let postings_write = try!(segment.open_write(SegmentComponent::POSTINGS)); let positions_write = try!(segment.open_write(SegmentComponent::POSITIONS)); - PostingsSerializer::new( - terms_write, - postings_write, - positions_write, - segment.schema() - ) + PostingsSerializer::new(terms_write, + postings_write, + positions_write, + segment.schema()) } - + fn load_indexing_options(&mut self, field: Field) { let field_entry: &FieldEntry = self.schema.get_field_entry(field); self.text_indexing_options = match *field_entry.field_type() { - FieldType::Str(ref text_options) => { - text_options.get_indexing_options() - } + FieldType::Str(ref text_options) => text_options.get_indexing_options(), FieldType::U32(ref u32_options) => { if u32_options.is_indexed() { TextIndexingOptions::Unindexed - } - else { 
- TextIndexingOptions::Untokenized + } else { + TextIndexingOptions::Untokenized } } }; } - + /// Starts the postings for a new term. /// * term - the term. It needs to come after the previous term according - /// to the lexicographical order. + /// to the lexicographical order. /// * doc_freq - return the number of document containing the term. pub fn new_term(&mut self, term: &Term, doc_freq: DocId) -> io::Result<()> { if self.term_open { @@ -145,31 +138,34 @@ impl PostingsSerializer { self.terms_fst_builder .insert(term.as_slice(), &term_info) } - + /// Finish the serialization for this term postings. /// /// If the current block is incomplete, it need to be encoded - /// using `VInt` encoding. - pub fn close_term(&mut self,) -> io::Result<()> { + /// using `VInt` encoding. + pub fn close_term(&mut self) -> io::Result<()> { if self.term_open { if !self.doc_ids.is_empty() { // we have doc ids waiting to be written - // this happens when the number of doc ids is + // this happens when the number of doc ids is // not a perfect multiple of our block size. // // In that case, the remaining part is encoded // using variable int encoding. { - let block_encoded = self.block_encoder.compress_vint_sorted(&self.doc_ids, self.last_doc_id_encoded); + let block_encoded = self.block_encoder + .compress_vint_sorted(&self.doc_ids, self.last_doc_id_encoded); self.written_bytes_postings += block_encoded.len(); try!(self.postings_write.write_all(block_encoded)); self.doc_ids.clear(); } - // ... Idem for term frequencies + // ... 
Idem for term frequencies if self.text_indexing_options.is_termfreq_enabled() { - let block_encoded = self.block_encoder.compress_vint_unsorted(&self.term_freqs[..]); + let block_encoded = self.block_encoder + .compress_vint_unsorted(&self.term_freqs[..]); for num in block_encoded { - self.written_bytes_postings += try!(num.serialize(&mut self.postings_write)); + self.written_bytes_postings += + try!(num.serialize(&mut self.postings_write)); } self.term_freqs.clear(); } @@ -177,8 +173,10 @@ impl PostingsSerializer { // On the other hand, positions are entirely buffered until the // end of the term, at which point they are compressed and written. if self.text_indexing_options.is_position_enabled() { - self.written_bytes_positions += try!(VInt(self.position_deltas.len() as u64).serialize(&mut self.positions_write)); - let positions_encoded: &[u8] = self.positions_encoder.compress_unsorted(&self.position_deltas[..]); + self.written_bytes_positions += try!(VInt(self.position_deltas.len() as u64) + .serialize(&mut self.positions_write)); + let positions_encoded: &[u8] = self.positions_encoder + .compress_unsorted(&self.position_deltas[..]); try!(self.positions_write.write_all(positions_encoded)); self.written_bytes_positions += positions_encoded.len(); self.position_deltas.clear(); @@ -187,8 +185,8 @@ impl PostingsSerializer { } Ok(()) } - - + + /// Serialize the information that a document contains the current term, /// its term frequency, and the position deltas. /// @@ -198,7 +196,11 @@ impl PostingsSerializer { /// /// Term frequencies and positions may be ignored by the serializer depending /// on the configuration of the field in the `Schema`. 
- pub fn write_doc(&mut self, doc_id: DocId, term_freq: u32, position_deltas: &[u32]) -> io::Result<()> { + pub fn write_doc(&mut self, + doc_id: DocId, + term_freq: u32, + position_deltas: &[u32]) + -> io::Result<()> { self.doc_ids.push(doc_id); if self.text_indexing_options.is_termfreq_enabled() { self.term_freqs.push(term_freq as u32); @@ -209,14 +211,16 @@ impl PostingsSerializer { if self.doc_ids.len() == NUM_DOCS_PER_BLOCK { { // encode the doc ids - let block_encoded: &[u8] = self.block_encoder.compress_block_sorted(&self.doc_ids, self.last_doc_id_encoded); + let block_encoded: &[u8] = self.block_encoder + .compress_block_sorted(&self.doc_ids, self.last_doc_id_encoded); self.last_doc_id_encoded = self.doc_ids[self.doc_ids.len() - 1]; try!(self.postings_write.write_all(block_encoded)); self.written_bytes_postings += block_encoded.len(); } if self.text_indexing_options.is_termfreq_enabled() { // encode the term_freqs - let block_encoded: &[u8] = self.block_encoder.compress_block_unsorted(&self.term_freqs); + let block_encoded: &[u8] = self.block_encoder + .compress_block_unsorted(&self.term_freqs); try!(self.postings_write.write_all(block_encoded)); self.written_bytes_postings += block_encoded.len(); self.term_freqs.clear(); @@ -225,9 +229,9 @@ impl PostingsSerializer { } Ok(()) } - + /// Closes the serializer. - pub fn close(mut self,) -> io::Result<()> { + pub fn close(mut self) -> io::Result<()> { try!(self.close_term()); try!(self.terms_fst_builder.finish()); try!(self.postings_write.flush()); diff --git a/src/postings/vec_postings.rs b/src/postings/vec_postings.rs index 8704915e0e..399307cff3 100644 --- a/src/postings/vec_postings.rs +++ b/src/postings/vec_postings.rs @@ -4,7 +4,7 @@ use DocId; use postings::{Postings, DocSet, HasLen}; use std::num::Wrapping; -const EMPTY_ARRAY: [u32; 0] = []; +const EMPTY_ARRAY: [u32; 0] = []; /// Simulate a `Postings` objects from a `VecPostings`. /// `VecPostings` only exist for testing purposes. 
@@ -26,43 +26,43 @@ impl From> for VecPostings { } impl DocSet for VecPostings { - fn advance(&mut self,) -> bool { + fn advance(&mut self) -> bool { self.cursor += Wrapping(1); self.doc_ids.len() > self.cursor.0 } - - fn doc(&self,) -> DocId { + + fn doc(&self) -> DocId { self.doc_ids[self.cursor.0] } } impl HasLen for VecPostings { - fn len(&self,) -> usize { + fn len(&self) -> usize { self.doc_ids.len() } } impl Postings for VecPostings { - fn term_freq(&self,) -> u32 { + fn term_freq(&self) -> u32 { 1u32 } - + fn positions(&self) -> &[u32] { &EMPTY_ARRAY - } + } } #[cfg(test)] pub mod tests { - + use super::*; - use DocId; - use postings::{Postings, SkipResult, DocSet}; - - + use DocId; + use postings::{Postings, SkipResult, DocSet}; + + #[test] pub fn test_vec_postings() { - let doc_ids: Vec = (0u32..1024u32).map(|e| e*3).collect(); + let doc_ids: Vec = (0u32..1024u32).map(|e| e * 3).collect(); let mut postings = VecPostings::from(doc_ids); assert!(postings.advance()); assert_eq!(postings.doc(), 0u32); @@ -77,4 +77,3 @@ pub mod tests { } } - diff --git a/src/query/boolean_query/boolean_clause.rs b/src/query/boolean_query/boolean_clause.rs index 34f49f0b76..e2e2a55b69 100644 --- a/src/query/boolean_query/boolean_clause.rs +++ b/src/query/boolean_query/boolean_clause.rs @@ -12,7 +12,7 @@ impl BooleanClause { pub fn new(query: Box, occur: Occur) -> BooleanClause { BooleanClause { query: query, - occur: occur + occur: occur, } - } + } } \ No newline at end of file diff --git a/src/query/boolean_query/boolean_query.rs b/src/query/boolean_query/boolean_query.rs index abecdb148d..195bbf20c1 100644 --- a/src/query/boolean_query/boolean_query.rs +++ b/src/query/boolean_query/boolean_query.rs @@ -12,11 +12,11 @@ use query::OccurFilter; /// /// The documents matched by the boolean query are /// those which -/// * match all of the sub queries associated with the +/// * match all of the sub queries associated with the /// `Must` occurence -/// * match none of the sub 
queries associated with the +/// * match none of the sub queries associated with the /// `MustNot` occurence. -/// * match at least one of the subqueries that is not +/// * match at least one of the subqueries that is not /// a `MustNot` occurence. #[derive(Debug)] pub struct BooleanQuery { @@ -25,14 +25,11 @@ pub struct BooleanQuery { impl From> for BooleanQuery { fn from(clauses: Vec) -> BooleanQuery { - BooleanQuery { - clauses: clauses, - } - } + BooleanQuery { clauses: clauses } + } } impl Query for BooleanQuery { - fn as_any(&self) -> &Any { self } @@ -41,8 +38,7 @@ impl Query for BooleanQuery { let sub_weights = try!(self.clauses .iter() .map(|clause| clause.query.weight(searcher)) - .collect() - ); + .collect()); let occurs: Vec = self.clauses .iter() .map(|clause| clause.occur) @@ -50,5 +46,4 @@ impl Query for BooleanQuery { let filter = OccurFilter::new(&occurs); Ok(box BooleanWeight::new(sub_weights, filter)) } - } \ No newline at end of file diff --git a/src/query/boolean_query/boolean_scorer.rs b/src/query/boolean_query/boolean_scorer.rs index e6c17af69b..c24f67760f 100644 --- a/src/query/boolean_query/boolean_scorer.rs +++ b/src/query/boolean_query/boolean_scorer.rs @@ -33,7 +33,7 @@ impl Ord for HeapItem { } pub struct BooleanScorer { - postings: Vec, + scorers: Vec, queue: BinaryHeap, doc: DocId, score_combiner: ScoreCombiner, @@ -43,20 +43,20 @@ pub struct BooleanScorer { impl BooleanScorer { pub fn scorers(&self) -> &[TScorer] { - &self.postings + &self.scorers } - pub fn new(postings: Vec, + pub fn new(scorers: Vec, occur_filter: OccurFilter) -> BooleanScorer { - let score_combiner = ScoreCombiner::default_for_num_scorers(postings.len()); - let mut non_empty_postings: Vec = Vec::new(); - for mut posting in postings { + let score_combiner = ScoreCombiner::default_for_num_scorers(scorers.len()); + let mut non_empty_scorers: Vec = Vec::new(); + for mut posting in scorers { let non_empty = posting.advance(); if non_empty { - 
non_empty_postings.push(posting); + non_empty_scorers.push(posting); } } - let heap_items: Vec = non_empty_postings + let heap_items: Vec = non_empty_scorers .iter() .map(|posting| posting.doc()) .enumerate() @@ -68,7 +68,7 @@ impl BooleanScorer { }) .collect(); BooleanScorer { - postings: non_empty_postings, + scorers: non_empty_scorers, queue: BinaryHeap::from(heap_items), doc: 0u32, score_combiner: score_combiner, @@ -77,7 +77,7 @@ impl BooleanScorer { } } - /// Advances the head of our heap (the segment postings with the lowest doc) + /// Advances the head of our heap (the segment posting with the lowest doc) /// It will also update the new current `DocId` as well as the term frequency /// associated with the segment postings. /// @@ -89,9 +89,9 @@ impl BooleanScorer { fn advance_head(&mut self,) { { let mut mutable_head = self.queue.peek_mut().unwrap(); - let cur_postings = &mut self.postings[mutable_head.ord as usize]; - if cur_postings.advance() { - mutable_head.doc = cur_postings.doc(); + let cur_scorers = &mut self.scorers[mutable_head.ord as usize]; + if cur_scorers.advance() { + mutable_head.doc = cur_scorers.doc(); return; } } @@ -108,7 +108,7 @@ impl DocSet for BooleanScorer { Some(heap_item) => { let ord = heap_item.ord as usize; self.doc = heap_item.doc; - let score = self.postings[ord].score(); + let score = self.scorers[ord].score(); self.score_combiner.update(score); ord_bitset |= 1 << ord; } @@ -120,7 +120,7 @@ impl DocSet for BooleanScorer { while let Some(&HeapItem {doc, ord}) = self.queue.peek() { if doc == self.doc { let ord = ord as usize; - let score = self.postings[ord].score(); + let score = self.scorers[ord].score(); self.score_combiner.update(score); ord_bitset |= 1 << ord; } diff --git a/src/query/boolean_query/boolean_weight.rs b/src/query/boolean_query/boolean_weight.rs index 930b473483..830f85edf2 100644 --- a/src/query/boolean_query/boolean_weight.rs +++ b/src/query/boolean_query/boolean_weight.rs @@ -11,8 +11,7 @@ pub struct 
BooleanWeight { } impl BooleanWeight { - pub fn new(weights: Vec>, - occur_filter: OccurFilter) -> BooleanWeight { + pub fn new(weights: Vec>, occur_filter: OccurFilter) -> BooleanWeight { BooleanWeight { weights: weights, occur_filter: occur_filter, @@ -22,15 +21,12 @@ impl BooleanWeight { impl Weight for BooleanWeight { - fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result> { - let sub_scorers: Vec> = try!( - self.weights - .iter() - .map(|weight| weight.scorer(reader)) - .collect() - ); - let boolean_scorer = BooleanScorer::new(sub_scorers, self.occur_filter); + let sub_scorers: Vec> = try!(self.weights + .iter() + .map(|weight| weight.scorer(reader)) + .collect()); + let boolean_scorer = BooleanScorer::new(sub_scorers, self.occur_filter); Ok(box boolean_scorer) } } diff --git a/src/query/multi_term_query/multi_term_query.rs b/src/query/multi_term_query/multi_term_query.rs index c5e4108359..7b207d64fb 100644 --- a/src/query/multi_term_query/multi_term_query.rs +++ b/src/query/multi_term_query/multi_term_query.rs @@ -12,20 +12,21 @@ use postings::SegmentPostingsOption; /// Query involving one or more terms. - #[derive(Eq, Clone, PartialEq, Debug)] -pub struct MultiTermQuery { - // TODO need a better Debug - occur_terms: Vec<(Occur, Term)> +pub struct MultiTermQuery { + // TODO need a better Debug + occur_terms: Vec<(Occur, Term)>, } impl MultiTermQuery { - /// Accessor for the number of terms - pub fn num_terms(&self,) -> usize { + pub fn num_terms(&self) -> usize { self.occur_terms.len() } + /// Same as `weight()`, except that rather than a boxed trait, + /// `specialized_weight` returns a specific type of the weight, allowing for + /// compile-time optimization. 
pub fn specialized_weight(&self, searcher: &Searcher) -> MultiTermWeight { let term_queries: Vec = self.occur_terms .iter() @@ -33,7 +34,7 @@ impl MultiTermQuery { .collect(); let occurs: Vec = self.occur_terms .iter() - .map(|&(occur, _) | occur.clone()) + .map(|&(occur, _)| occur.clone()) .collect(); let occur_filter = OccurFilter::new(&occurs); let weights = term_queries.iter() @@ -43,21 +44,17 @@ impl MultiTermQuery { term_weight }) .collect(); - MultiTermWeight { - weights: weights, - occur_filter: occur_filter, - } + MultiTermWeight::new(weights, occur_filter) } } impl Query for MultiTermQuery { - fn as_any(&self) -> &Any { self } - + fn weight(&self, searcher: &Searcher) -> Result> { Ok(box self.specialized_weight(searcher)) } @@ -66,16 +63,13 @@ impl Query for MultiTermQuery { impl From> for MultiTermQuery { fn from(occur_terms: Vec<(Occur, Term)>) -> MultiTermQuery { - MultiTermQuery { - occur_terms: occur_terms - } + MultiTermQuery { occur_terms: occur_terms } } } impl From> for MultiTermQuery { fn from(terms: Vec) -> MultiTermQuery { - let should_terms: Vec<(Occur, Term)> = terms - .into_iter() + let should_terms: Vec<(Occur, Term)> = terms.into_iter() .map(|term| (Occur::Should, term)) .collect(); MultiTermQuery::from(should_terms) diff --git a/src/query/multi_term_query/multi_term_weight.rs b/src/query/multi_term_query/multi_term_weight.rs index 6e12cd7a8f..17e58d8770 100644 --- a/src/query/multi_term_query/multi_term_weight.rs +++ b/src/query/multi_term_query/multi_term_weight.rs @@ -7,14 +7,28 @@ use postings::SegmentPostings; use query::term_query::{TermWeight, TermScorer}; use query::boolean_query::BooleanScorer; +/// Weight object associated to a [`MultiTermQuery`](./struct.MultiTermQuery.html). pub struct MultiTermWeight { - pub weights: Vec, - pub occur_filter: OccurFilter, + weights: Vec, + occur_filter: OccurFilter, } impl MultiTermWeight { + /// MultiTermWeight constructor. + /// The `OccurFilter` is tied to the order of the weights. 
+ pub fn new(weights: Vec, occur_filter: OccurFilter) -> MultiTermWeight { + MultiTermWeight { + weights: weights, + occur_filter: occur_filter, + } + } - pub fn specialized_scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result>>> { + /// Same as `scorer()`, except that rather than a boxed trait, + /// `specialized_scorer` returns a specific type of the scorer, allowing for + /// compile-time optimization. + pub fn specialized_scorer<'a>(&'a self, + reader: &'a SegmentReader) + -> Result>>> { let mut term_scorers: Vec> = Vec::new(); for term_weight in &self.weights { let term_scorer = try!(term_weight.specialized_scorer(reader)); @@ -22,12 +36,10 @@ impl MultiTermWeight { } Ok(BooleanScorer::new(term_scorers, self.occur_filter)) } - } impl Weight for MultiTermWeight { - fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result> { - Ok(box try!(self.specialized_scorer(reader))) + Ok(box try!(self.specialized_scorer(reader))) } -} \ No newline at end of file +} diff --git a/src/query/phrase_query/phrase_query.rs b/src/query/phrase_query/phrase_query.rs index 816fd4dbcf..d44bdceb05 100644 --- a/src/query/phrase_query/phrase_query.rs +++ b/src/query/phrase_query/phrase_query.rs @@ -7,6 +7,19 @@ use query::Weight; use Result; +/// `PhraseQuery` matches a specific sequence of words. +/// For instance the phrase query for `"part time"` will match +/// the sentence +/// +/// **Alan just got a part time job.** +/// +/// On the other hand it will not match the sentence. +/// +/// **This is my favorite part of the job.** +/// +/// Using a `PhraseQuery` on a field requires positions +/// to be indexed for this field. +/// #[derive(Debug)] pub struct PhraseQuery { phrase_terms: Vec, @@ -24,7 +37,7 @@ impl Query for PhraseQuery { /// Create the weight associated to a query. /// /// See [Weight](./trait.Weight.html). 
- fn weight(&self, searcher: &Searcher) -> Result> { + fn weight(&self, _searcher: &Searcher) -> Result> { Ok(box PhraseWeight::from(self.phrase_terms.clone())) } diff --git a/src/query/scorer.rs b/src/query/scorer.rs index 5f21a2ab35..ae8c33d664 100644 --- a/src/query/scorer.rs +++ b/src/query/scorer.rs @@ -37,7 +37,9 @@ impl<'a> Scorer for Box { } } - +/// EmptyScorer is a dummy Scorer in which no document matches. +/// +/// It is useful for tests and handling edge cases. pub struct EmptyScorer; impl DocSet for EmptyScorer {