diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 86ff1808156..f120b202b2a 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -64,6 +64,9 @@ Improvements * GITHUB#13202: Early terminate graph and exact searches of AbstractKnnVectorQuery to follow timeout set from IndexSearcher#setTimeout(QueryTimeout). (Kaival Parikh) +* GITHUB#12966: Move most of the responsibility from TaxonomyFacets implementations to TaxonomyFacets itself. + This reduces code duplication and enables future development. (Stefan Vodita) + Optimizations --------------------- @@ -126,6 +129,8 @@ Bug Fixes * GITHUB#13206: Subtract deleted file size from the cache size of NRTCachingDirectory. (Jean-François Boeuf) +* GITHUB#12966: Aggregation facets no longer assume that aggregation values are positive. (Stefan Vodita) + Build --------------------- diff --git a/lucene/facet/src/java/org/apache/lucene/facet/StringValueFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/StringValueFacetCounts.java index 335f93d56d3..e6c97779905 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/StringValueFacetCounts.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/StringValueFacetCounts.java @@ -180,7 +180,7 @@ public FacetResult getTopChildren(int topN, String dim, String... path) throws I topN = Math.min(topN, cardinality); TopOrdAndIntQueue q = null; - TopOrdAndIntQueue.OrdAndValue reuse = null; + TopOrdAndIntQueue.OrdAndInt reuse = null; int bottomCount = 0; int bottomOrd = Integer.MAX_VALUE; int childCount = 0; // total number of labels with non-zero count @@ -191,18 +191,18 @@ public FacetResult getTopChildren(int topN, String dim, String... path) throws I int ord = cursor.key; int count = cursor.value; if (count > bottomCount || (count == bottomCount && ord < bottomOrd)) { - if (reuse == null) { - reuse = new TopOrdAndIntQueue.OrdAndValue(); - } - reuse.ord = ord; - reuse.value = count; if (q == null) { // Lazy init for sparse case: q = new TopOrdAndIntQueue(topN); } - reuse = q.insertWithOverflow(reuse); + if (reuse == null) { + reuse = (TopOrdAndIntQueue.OrdAndInt) q.newOrdAndValue(); + } + reuse.ord = ord; + reuse.value = count; + reuse = (TopOrdAndIntQueue.OrdAndInt) q.insertWithOverflow(reuse); if (q.size() == topN) { - bottomCount = q.top().value; + bottomCount = ((TopOrdAndIntQueue.OrdAndInt) q.top()).value; bottomOrd = q.top().ord; } } @@ -213,18 +213,18 @@ public FacetResult getTopChildren(int topN, String dim, String... path) throws I if (count != 0) { childCount++; if (count > bottomCount || (count == bottomCount && i < bottomOrd)) { - if (reuse == null) { - reuse = new TopOrdAndIntQueue.OrdAndValue(); - } - reuse.ord = i; - reuse.value = count; if (q == null) { // Lazy init for sparse case: q = new TopOrdAndIntQueue(topN); } - reuse = q.insertWithOverflow(reuse); + if (reuse == null) { + reuse = (TopOrdAndIntQueue.OrdAndInt) q.newOrdAndValue(); + } + reuse.ord = i; + reuse.value = count; + reuse = (TopOrdAndIntQueue.OrdAndInt) q.insertWithOverflow(reuse); if (q.size() == topN) { - bottomCount = q.top().value; + bottomCount = ((TopOrdAndIntQueue.OrdAndInt) q.top()).value; bottomOrd = q.top().ord; } } @@ -235,7 +235,7 @@ public FacetResult getTopChildren(int topN, String dim, String... path) throws I int resultCount = q == null ? 
0 : q.size(); LabelAndValue[] labelValues = new LabelAndValue[resultCount]; for (int i = labelValues.length - 1; i >= 0; i--) { - TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop(); + TopOrdAndIntQueue.OrdAndInt ordAndValue = (TopOrdAndIntQueue.OrdAndInt) q.pop(); final BytesRef term = docValues.lookupOrd(ordAndValue.ord); labelValues[i] = new LabelAndValue(term.utf8ToString(), ordAndValue.value); } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/TopOrdAndFloatQueue.java b/lucene/facet/src/java/org/apache/lucene/facet/TopOrdAndFloatQueue.java index 1166db37852..f5c43fd6dca 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/TopOrdAndFloatQueue.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/TopOrdAndFloatQueue.java @@ -16,37 +16,42 @@ */ package org.apache.lucene.facet; -import org.apache.lucene.util.PriorityQueue; +/** Keeps highest results, first by largest float value, then tie-break by smallest ord. */ +public class TopOrdAndFloatQueue extends TopOrdAndNumberQueue { -/** Keeps highest results, first by largest float value, then tie break by smallest ord. */ -public class TopOrdAndFloatQueue extends PriorityQueue { - - /** Holds a single entry. */ - public static final class OrdAndValue { - - /** Ordinal of the entry. */ - public int ord; + /** Sole constructor. */ + public TopOrdAndFloatQueue(int topN) { + super(topN); + } - /** Value associated with the ordinal. */ + /** Holds an ordinal and a float value. */ + public static final class OrdAndFloat extends OrdAndValue { + /** The value corresponding to the ordinal is a float. */ public float value; /** Default constructor. */ - public OrdAndValue() {} - } + public OrdAndFloat() {} + + @Override + public boolean lessThan(OrdAndValue other) { + OrdAndFloat otherOrdAndFloat = (OrdAndFloat) other; + if (value < otherOrdAndFloat.value) { + return true; + } + if (value > otherOrdAndFloat.value) { + return false; + } + return ord > otherOrdAndFloat.ord; + } - /** Sole constructor. */ - public TopOrdAndFloatQueue(int topN) { - super(topN); + @Override + public Number getValue() { + return value; + } } @Override - protected boolean lessThan(OrdAndValue a, OrdAndValue b) { - if (a.value < b.value) { - return true; - } else if (a.value > b.value) { - return false; - } else { - return a.ord > b.ord; - } + public OrdAndValue newOrdAndValue() { + return new OrdAndFloat(); } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/TopOrdAndIntQueue.java b/lucene/facet/src/java/org/apache/lucene/facet/TopOrdAndIntQueue.java index 2652dfb73c4..a34fe793a2e 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/TopOrdAndIntQueue.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/TopOrdAndIntQueue.java @@ -16,37 +16,42 @@ */ package org.apache.lucene.facet; -import org.apache.lucene.util.PriorityQueue; +/** Keeps highest results, first by largest int value, then tie-break by smallest ord. */ +public class TopOrdAndIntQueue extends TopOrdAndNumberQueue { -/** Keeps highest results, first by largest int value, then tie break by smallest ord. */ -public class TopOrdAndIntQueue extends PriorityQueue { - - /** Holds a single entry. */ - public static final class OrdAndValue { - - /** Ordinal of the entry. */ - public int ord; + /** Sole constructor. */ + public TopOrdAndIntQueue(int topN) { + super(topN); + } - /** Value associated with the ordinal. */ + /** Holds an ordinal and an int value. */ + public static final class OrdAndInt extends OrdAndValue { + /** The value corresponding to the ordinal is an int. 
*/ public int value; /** Default constructor. */ - public OrdAndValue() {} - } + public OrdAndInt() {} + + @Override + public boolean lessThan(OrdAndValue other) { + OrdAndInt otherOrdAndInt = (OrdAndInt) other; + if (value < otherOrdAndInt.value) { + return true; + } + if (value > otherOrdAndInt.value) { + return false; + } + return ord > otherOrdAndInt.ord; + } - /** Sole constructor. */ - public TopOrdAndIntQueue(int topN) { - super(topN); + @Override + public Number getValue() { + return value; + } } @Override - protected boolean lessThan(OrdAndValue a, OrdAndValue b) { - if (a.value < b.value) { - return true; - } else if (a.value > b.value) { - return false; - } else { - return a.ord > b.ord; - } + public OrdAndValue newOrdAndValue() { + return new OrdAndInt(); } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/TopOrdAndNumberQueue.java b/lucene/facet/src/java/org/apache/lucene/facet/TopOrdAndNumberQueue.java new file mode 100644 index 00000000000..07b91f9b5fb --- /dev/null +++ b/lucene/facet/src/java/org/apache/lucene/facet/TopOrdAndNumberQueue.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.facet; + +import org.apache.lucene.util.PriorityQueue; + +/** Keeps highest results, first by largest value, then tie-break by smallest ord. */ +public abstract class TopOrdAndNumberQueue extends PriorityQueue { + + /** Holds a single entry. */ + public abstract static class OrdAndValue { + + /** Ordinal of the entry. */ + public int ord; + + /** Default constructor. */ + public OrdAndValue() {} + + /** Compare with another {@link OrdAndValue}. */ + public abstract boolean lessThan(OrdAndValue other); + + /** Get the value stored in this {@link OrdAndValue}. */ + public abstract Number getValue(); + } + + /** Sole constructor. */ + public TopOrdAndNumberQueue(int topN) { + super(topN); + } + + @Override + public boolean lessThan(TopOrdAndNumberQueue.OrdAndValue a, TopOrdAndNumberQueue.OrdAndValue b) { + return a.lessThan(b); + } + + /** + * Create a new {@link org.apache.lucene.facet.TopOrdAndNumberQueue.OrdAndValue} of the + * appropriate type. 
+ */ + public abstract OrdAndValue newOrdAndValue(); +} diff --git a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/AbstractSortedSetDocValueFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/AbstractSortedSetDocValueFacetCounts.java index ac42cf5aa8a..0bde5a240e8 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/AbstractSortedSetDocValueFacetCounts.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/AbstractSortedSetDocValueFacetCounts.java @@ -328,7 +328,7 @@ private TopChildrenForPath computeTopChildren( int pathCount = 0; int childCount = 0; - TopOrdAndIntQueue.OrdAndValue reuse = null; + TopOrdAndIntQueue.OrdAndInt reuse = null; while (childOrds.hasNext()) { int ord = childOrds.next(); int count = getCount(ord); @@ -336,20 +336,20 @@ private TopChildrenForPath computeTopChildren( pathCount += count; childCount++; if (count > bottomCount || (count == bottomCount && ord < bottomOrd)) { - if (reuse == null) { - reuse = new TopOrdAndIntQueue.OrdAndValue(); - } - reuse.ord = ord; - reuse.value = count; if (q == null) { // Lazy init, so we don't create this for the // sparse case unnecessarily q = new TopOrdAndIntQueue(topN); } - reuse = q.insertWithOverflow(reuse); + if (reuse == null) { + reuse = (TopOrdAndIntQueue.OrdAndInt) q.newOrdAndValue(); + } + reuse.ord = ord; + reuse.value = count; + reuse = (TopOrdAndIntQueue.OrdAndInt) q.insertWithOverflow(reuse); if (q.size() == topN) { - bottomCount = q.top().value; - bottomOrd = q.top().value; + bottomCount = ((TopOrdAndIntQueue.OrdAndInt) q.top()).value; + bottomOrd = q.top().ord; } } } @@ -397,7 +397,7 @@ private FacetResult createFacetResult( LabelAndValue[] labelValues = new LabelAndValue[q.size()]; for (int i = labelValues.length - 1; i >= 0; i--) { - TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop(); + TopOrdAndIntQueue.OrdAndInt ordAndValue = (TopOrdAndIntQueue.OrdAndInt) q.pop(); assert ordAndValue != null; final BytesRef term = dv.lookupOrd(ordAndValue.ord); String[] parts = FacetsConfig.stringToPath(term.utf8ToString()); diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FastTaxonomyFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FastTaxonomyFacetCounts.java index b6098f752bd..c06588cb0e9 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FastTaxonomyFacetCounts.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FastTaxonomyFacetCounts.java @@ -38,7 +38,7 @@ * * @lucene.experimental */ -public class FastTaxonomyFacetCounts extends IntTaxonomyFacets { +public class FastTaxonomyFacetCounts extends TaxonomyFacets { /** Create {@code FastTaxonomyFacetCounts}, which also counts all facet labels. 
*/ public FastTaxonomyFacetCounts(TaxonomyReader taxoReader, FacetsConfig config, FacetsCollector fc) @@ -54,7 +54,7 @@ public FastTaxonomyFacetCounts(TaxonomyReader taxoReader, FacetsConfig config, F public FastTaxonomyFacetCounts( String indexFieldName, TaxonomyReader taxoReader, FacetsConfig config, FacetsCollector fc) throws IOException { - super(indexFieldName, taxoReader, config, AssociationAggregationFunction.SUM, fc); + super(indexFieldName, taxoReader, config, fc); count(fc.getMatchingDocs()); } @@ -66,7 +66,7 @@ public FastTaxonomyFacetCounts( public FastTaxonomyFacetCounts( String indexFieldName, IndexReader reader, TaxonomyReader taxoReader, FacetsConfig config) throws IOException { - super(indexFieldName, taxoReader, config, AssociationAggregationFunction.SUM, null); + super(indexFieldName, taxoReader, config, null); countAll(reader); } @@ -89,26 +89,26 @@ private void count(List matchingDocs) throws IOException { ConjunctionUtils.intersectIterators(Arrays.asList(hits.bits.iterator(), valuesIt)); if (singleValued != null) { - if (values != null) { + if (counts != null) { while (it.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { - values[(int) singleValued.longValue()]++; + counts[(int) singleValued.longValue()]++; } } else { while (it.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { - sparseValues.addTo((int) singleValued.longValue(), 1); + sparseCounts.addTo((int) singleValued.longValue(), 1); } } } else { - if (values != null) { + if (counts != null) { while (it.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { for (int i = 0; i < multiValued.docValueCount(); i++) { - values[(int) multiValued.nextValue()]++; + counts[(int) multiValued.nextValue()]++; } } } else { while (it.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { for (int i = 0; i < multiValued.docValueCount(); i++) { - sparseValues.addTo((int) multiValued.nextValue(), 1); + sparseCounts.addTo((int) multiValued.nextValue(), 1); } } } @@ -126,7 +126,7 @@ private void countAll(IndexReader reader) throws IOException { continue; } initializeValueCounters(); - assert values != null; + assert counts != null; Bits liveDocs = context.reader().getLiveDocs(); NumericDocValues singleValued = DocValues.unwrapSingleton(multiValued); @@ -136,7 +136,7 @@ private void countAll(IndexReader reader) throws IOException { for (int doc = singleValued.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = singleValued.nextDoc()) { - values[(int) singleValued.longValue()]++; + counts[(int) singleValued.longValue()]++; } } else { for (int doc = singleValued.nextDoc(); @@ -145,7 +145,7 @@ private void countAll(IndexReader reader) throws IOException { if (liveDocs.get(doc) == false) { continue; } - values[(int) singleValued.longValue()]++; + counts[(int) singleValued.longValue()]++; } } } else { @@ -154,7 +154,7 @@ private void countAll(IndexReader reader) throws IOException { doc != DocIdSetIterator.NO_MORE_DOCS; doc = multiValued.nextDoc()) { for (int i = 0; i < multiValued.docValueCount(); i++) { - values[(int) multiValued.nextValue()]++; + counts[(int) multiValued.nextValue()]++; } } } else { @@ -165,7 +165,7 @@ private void countAll(IndexReader reader) throws IOException { continue; } for (int i = 0; i < multiValued.docValueCount(); i++) { - values[(int) multiValued.nextValue()]++; + counts[(int) multiValued.nextValue()]++; } } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FloatTaxonomyFacets.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FloatTaxonomyFacets.java index c456e77f17d..b0ae828e18f 100644 --- 
a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FloatTaxonomyFacets.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FloatTaxonomyFacets.java @@ -16,21 +16,12 @@ */ package org.apache.lucene.facet.taxonomy; -import com.carrotsearch.hppc.FloatArrayList; -import com.carrotsearch.hppc.IntArrayList; +import com.carrotsearch.hppc.IntFloatHashMap; import java.io.IOException; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import org.apache.lucene.facet.FacetResult; import org.apache.lucene.facet.FacetsCollector; import org.apache.lucene.facet.FacetsConfig; -import org.apache.lucene.facet.FacetsConfig.DimConfig; -import org.apache.lucene.facet.LabelAndValue; import org.apache.lucene.facet.TopOrdAndFloatQueue; -import org.apache.lucene.util.PriorityQueue; +import org.apache.lucene.facet.TopOrdAndNumberQueue; /** * Base class for all taxonomy-based facets that aggregate to a per-ords float[]. @@ -44,14 +35,15 @@ @Deprecated public abstract class FloatTaxonomyFacets extends TaxonomyFacets { - // TODO: also use native hash map for sparse collection, like IntTaxonomyFacets - /** Aggregation function used for combining values. */ protected final AssociationAggregationFunction aggregationFunction; /** Per-ordinal value. */ protected float[] values; + /** Sparse ordinal values. */ + IntFloatHashMap sparseValues; + /** * Constructor that defaults the aggregation function to {@link * AssociationAggregationFunction#SUM}. @@ -73,363 +65,107 @@ protected FloatTaxonomyFacets( throws IOException { super(indexFieldName, taxoReader, config, fc); this.aggregationFunction = aggregationFunction; + valueComparator = (o1, o2) -> Float.compare(o1.floatValue(), o2.floatValue()); } @Override - boolean hasValues() { - return values != null; - } + protected void initializeValueCounters() { + if (initialized) { + return; + } + super.initializeValueCounters(); - void initializeValueCounters() { - if (values == null) { + assert sparseValues == null && values == null; + if (sparseCounts != null) { + sparseValues = new IntFloatHashMap(); + } else { values = new float[taxoReader.getSize()]; } } - /** Rolls up any single-valued hierarchical dimensions. */ - protected void rollup() throws IOException { - if (values == null) { - return; - } - - // Rollup any necessary dims: - ParallelTaxonomyArrays.IntArray children = getChildren(); - for (Map.Entry ent : config.getDimConfigs().entrySet()) { - String dim = ent.getKey(); - DimConfig ft = ent.getValue(); - if (ft.hierarchical && ft.multiValued == false) { - int dimRootOrd = taxoReader.getOrdinal(new FacetLabel(dim)); - assert dimRootOrd > 0; - float newValue = - aggregationFunction.aggregate(values[dimRootOrd], rollup(children.get(dimRootOrd))); - values[dimRootOrd] = newValue; - } + /** Set the value associated with this ordinal to {@code newValue}. 
*/ + void setValue(int ordinal, float newValue) { + if (sparseValues != null) { + sparseValues.put(ordinal, newValue); + } else { + values[ordinal] = newValue; } } - private float rollup(int ord) throws IOException { - ParallelTaxonomyArrays.IntArray children = getChildren(); - ParallelTaxonomyArrays.IntArray siblings = getSiblings(); - float aggregationValue = 0f; - while (ord != TaxonomyReader.INVALID_ORDINAL) { - float childValue = aggregationFunction.aggregate(values[ord], rollup(children.get(ord))); - values[ord] = childValue; - aggregationValue = aggregationFunction.aggregate(aggregationValue, childValue); - ord = siblings.get(ord); + /** Get the value associated with this ordinal. */ + float getValue(int ordinal) { + if (sparseValues != null) { + return sparseValues.get(ordinal); + } else { + return values[ordinal]; } - return aggregationValue; } @Override - public Number getSpecificValue(String dim, String... path) throws IOException { - DimConfig dimConfig = verifyDim(dim); - if (path.length == 0) { - if (dimConfig.hierarchical && dimConfig.multiValued == false) { - // ok: rolled up at search time - } else if (dimConfig.requireDimCount && dimConfig.multiValued) { - // ok: we indexed all ords at index time - } else { - throw new IllegalArgumentException( - "cannot return dimension-level value alone; use getTopChildren instead"); - } - } - int ord = taxoReader.getOrdinal(new FacetLabel(dim, path)); - if (ord < 0) { - return -1; - } - return values == null ? 0 : values[ord]; + protected Number getAggregationValue(int ordinal) { + return getValue(ordinal); } @Override - public FacetResult getAllChildren(String dim, String... path) throws IOException { - DimConfig dimConfig = verifyDim(dim); - FacetLabel cp = new FacetLabel(dim, path); - int dimOrd = taxoReader.getOrdinal(cp); - if (dimOrd == -1) { - return null; - } - - if (values == null) { - return null; - } - - ParallelTaxonomyArrays.IntArray children = getChildren(); - ParallelTaxonomyArrays.IntArray siblings = getSiblings(); - - int ord = children.get(dimOrd); - float aggregatedValue = 0; - - IntArrayList ordinals = new IntArrayList(); - FloatArrayList ordValues = new FloatArrayList(); - - while (ord != TaxonomyReader.INVALID_ORDINAL) { - if (values[ord] > 0) { - aggregatedValue = aggregationFunction.aggregate(aggregatedValue, values[ord]); - ordinals.add(ord); - ordValues.add(values[ord]); - } - ord = siblings.get(ord); - } - - if (aggregatedValue == 0) { - return null; - } - - if (dimConfig.multiValued) { - if (dimConfig.requireDimCount) { - aggregatedValue = values[dimOrd]; - } else { - // Our sum'd count is not correct, in general: - aggregatedValue = -1; - } - } else { - // Our sum'd dim count is accurate, so we keep it - } - - // TODO: It would be nice if TaxonomyReader let us pass in a buffer + size so we didn't have to - // do an array copy here: - FacetLabel[] bulkPath = taxoReader.getBulkPath(ordinals.toArray()); - - LabelAndValue[] labelValues = new LabelAndValue[ordValues.size()]; - for (int i = 0; i < labelValues.length; i++) { - labelValues[i] = new LabelAndValue(bulkPath[i].components[cp.length], ordValues.get(i)); - } - return new FacetResult(dim, path, aggregatedValue, labelValues, ordinals.size()); + protected Number aggregate(Number existingVal, Number newVal) { + return aggregationFunction.aggregate(existingVal.floatValue(), newVal.floatValue()); } @Override - public FacetResult getTopChildren(int topN, String dim, String... 
path) throws IOException { - validateTopN(topN); - DimConfig dimConfig = verifyDim(dim); - FacetLabel cp = new FacetLabel(dim, path); - int dimOrd = taxoReader.getOrdinal(cp); - if (dimOrd == -1) { - return null; - } - - if (values == null) { - return null; - } - - TopChildrenForPath topChildrenForPath = getTopChildrenForPath(dimConfig, dimOrd, topN); - return createFacetResult(topChildrenForPath, dim, path); + protected void updateValueFromRollup(int ordinal, int childOrdinal) throws IOException { + super.updateValueFromRollup(ordinal, childOrdinal); + float currentValue = getValue(ordinal); + float newValue = aggregationFunction.aggregate(currentValue, rollup(childOrdinal)); + setValue(ordinal, newValue); } - /** - * Determine the top-n children for a specified dimension + path. Results are in an intermediate - * form. - */ - private TopChildrenForPath getTopChildrenForPath(DimConfig dimConfig, int pathOrd, int topN) - throws IOException { + @Override + protected TopOrdAndNumberQueue makeTopOrdAndNumberQueue(int topN) { + return new TopOrdAndFloatQueue(Math.min(taxoReader.getSize(), topN)); + } - TopOrdAndFloatQueue q = new TopOrdAndFloatQueue(Math.min(taxoReader.getSize(), topN)); - float bottomValue = 0; - int bottomOrd = Integer.MAX_VALUE; + @Override + protected Number missingAggregationValue() { + return -1f; + } + private float rollup(int ord) throws IOException { ParallelTaxonomyArrays.IntArray children = getChildren(); ParallelTaxonomyArrays.IntArray siblings = getSiblings(); - - int ord = children.get(pathOrd); - float aggregatedValue = 0; - int childCount = 0; - - TopOrdAndFloatQueue.OrdAndValue reuse = null; + float aggregatedValue = 0f; while (ord != TaxonomyReader.INVALID_ORDINAL) { - float value = values[ord]; - if (value > 0) { - aggregatedValue = aggregationFunction.aggregate(aggregatedValue, value); - childCount++; - if (value > bottomValue || (value == bottomValue && ord < bottomOrd)) { - if (reuse == null) { - reuse = new TopOrdAndFloatQueue.OrdAndValue(); - } - reuse.ord = ord; - reuse.value = value; - reuse = q.insertWithOverflow(reuse); - if (q.size() == topN) { - bottomValue = q.top().value; - bottomOrd = q.top().ord; - } - } - } - + updateValueFromRollup(ord, children.get(ord)); + aggregatedValue = aggregationFunction.aggregate(aggregatedValue, getValue(ord)); ord = siblings.get(ord); } - - if (dimConfig.multiValued) { - if (dimConfig.requireDimCount) { - aggregatedValue = values[pathOrd]; - } else { - // Our sum'd count is not correct, in general: - aggregatedValue = -1; - } - } - return new TopChildrenForPath(aggregatedValue, childCount, q); - } - - /** - * Create a FacetResult for the provided dim + path and intermediate results. Does the extra work - * of resolving ordinals -> labels, etc. Will return null if there are no children. - */ - FacetResult createFacetResult(TopChildrenForPath topChildrenForPath, String dim, String... 
path) - throws IOException { - // If the intermediate result is null or there are no children, we return null: - if (topChildrenForPath == null || topChildrenForPath.childCount == 0) { - return null; - } - - TopOrdAndFloatQueue q = topChildrenForPath.childQueue; - assert q != null; - - LabelAndValue[] labelValues = new LabelAndValue[q.size()]; - int[] ordinals = new int[labelValues.length]; - float[] values = new float[labelValues.length]; - - for (int i = labelValues.length - 1; i >= 0; i--) { - TopOrdAndFloatQueue.OrdAndValue ordAndValue = q.pop(); - assert ordAndValue != null; - ordinals[i] = ordAndValue.ord; - values[i] = ordAndValue.value; - } - - FacetLabel[] bulkPath = taxoReader.getBulkPath(ordinals); - // The path component we're interested in is the one immediately after the provided path. We - // add 1 here to also account for the dim: - int childComponentIdx = path.length + 1; - for (int i = 0; i < labelValues.length; i++) { - labelValues[i] = new LabelAndValue(bulkPath[i].components[childComponentIdx], values[i]); - } - - return new FacetResult( - dim, path, topChildrenForPath.pathValue, labelValues, topChildrenForPath.childCount); + return aggregatedValue; } @Override - public List getTopDims(int topNDims, int topNChildren) throws IOException { - validateTopN(topNDims); - validateTopN(topNChildren); - - if (values == null) { - return Collections.emptyList(); - } - - // get existing children and siblings ordinal array from TaxonomyFacets - ParallelTaxonomyArrays.IntArray children = getChildren(); - ParallelTaxonomyArrays.IntArray siblings = getSiblings(); - - // Create priority queue to store top dimensions and sort by their aggregated values/hits and - // string values. - PriorityQueue pq = - new PriorityQueue<>(topNDims) { - @Override - protected boolean lessThan(DimValue a, DimValue b) { - if (a.value > b.value) { - return false; - } else if (a.value < b.value) { - return true; - } else { - return a.dim.compareTo(b.dim) > 0; - } - } - }; + protected void setIncomingValue(TopOrdAndNumberQueue.OrdAndValue incomingOrdAndValue, int ord) { + ((TopOrdAndFloatQueue.OrdAndFloat) incomingOrdAndValue).value = getValue(ord); + } - // Keep track of intermediate results, if we compute them, so we can reuse them later: - Map intermediateResults = null; + protected class FloatAggregatedValue extends AggregatedValue { + private float value; - // iterate over children and siblings ordinals for all dims - int ord = children.get(TaxonomyReader.ROOT_ORDINAL); - while (ord != TaxonomyReader.INVALID_ORDINAL) { - String dim = taxoReader.getPath(ord).components[0]; - FacetsConfig.DimConfig dimConfig = config.getDimConfig(dim); - if (dimConfig.indexFieldName.equals(indexFieldName)) { - FacetLabel cp = new FacetLabel(dim); - int dimOrd = taxoReader.getOrdinal(cp); - if (dimOrd != -1) { - float dimValue; - if (dimConfig.multiValued) { - if (dimConfig.requireDimCount) { - // If the dim is configured as multi-valued and requires dim counts, we can access - // an accurate count for the dim computed at indexing time: - dimValue = values[dimOrd]; - } else { - // If the dim is configured as multi-valued but not requiring dim counts, we cannot - // compute an accurate dim count, and use -1 as a place-holder: - dimValue = -1; - } - } else { - // Single-valued dims require aggregating descendant paths to get accurate dim counts - // since we don't directly access ancestry paths: - // TODO: We could consider indexing dim counts directly if getTopDims is a common - // use-case. 
- TopChildrenForPath topChildrenForPath = - getTopChildrenForPath(dimConfig, dimOrd, topNChildren); - if (intermediateResults == null) { - intermediateResults = new HashMap<>(); - } - intermediateResults.put(dim, topChildrenForPath); - dimValue = topChildrenForPath.pathValue; - } - if (dimValue != 0) { - if (pq.size() < topNDims) { - pq.add(new DimValue(dim, dimOrd, dimValue)); - } else { - if (dimValue > pq.top().value - || (dimValue == pq.top().value && dim.compareTo(pq.top().dim) < 0)) { - DimValue bottomDim = pq.top(); - bottomDim.dim = dim; - bottomDim.value = dimValue; - pq.updateTop(); - } - } - } - } - } - ord = siblings.get(ord); + public FloatAggregatedValue(float value) { + this.value = value; } - FacetResult[] results = new FacetResult[pq.size()]; - - while (pq.size() > 0) { - DimValue dimValue = pq.pop(); - assert dimValue != null; - String dim = dimValue.dim; - TopChildrenForPath topChildrenForPath = null; - if (intermediateResults != null) { - topChildrenForPath = intermediateResults.get(dim); - } - if (topChildrenForPath == null) { - FacetsConfig.DimConfig dimConfig = config.getDimConfig(dim); - topChildrenForPath = getTopChildrenForPath(dimConfig, dimValue.dimOrd, topNChildren); - } - FacetResult facetResult = createFacetResult(topChildrenForPath, dim); - assert facetResult != null; - results[pq.size()] = facetResult; + @Override + public void aggregate(int ord) { + value = aggregationFunction.aggregate(value, getValue(ord)); } - return Arrays.asList(results); - } - private static class DimValue { - String dim; - int dimOrd; - float value; - - DimValue(String dim, int dimOrd, float value) { - this.dim = dim; - this.dimOrd = dimOrd; - this.value = value; + @Override + public Number get() { + return value; } } - /** Intermediate result to store top children for a given path before resolving labels, etc. 
*/ - private static class TopChildrenForPath { - private final float pathValue; - private final int childCount; - private final TopOrdAndFloatQueue childQueue; - - TopChildrenForPath(float pathValue, int childCount, TopOrdAndFloatQueue childQueue) { - this.pathValue = pathValue; - this.childCount = childCount; - this.childQueue = childQueue; - } + @Override + protected AggregatedValue newAggregatedValue() { + return new FloatAggregatedValue(0f); } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/IntTaxonomyFacets.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/IntTaxonomyFacets.java index d46ad78acef..1181fec477b 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/IntTaxonomyFacets.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/IntTaxonomyFacets.java @@ -16,23 +16,13 @@ */ package org.apache.lucene.facet.taxonomy; -import com.carrotsearch.hppc.IntArrayList; import com.carrotsearch.hppc.IntIntHashMap; -import com.carrotsearch.hppc.cursors.IntIntCursor; import java.io.IOException; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import org.apache.lucene.facet.FacetResult; +import java.util.Comparator; import org.apache.lucene.facet.FacetsCollector; -import org.apache.lucene.facet.FacetsCollector.MatchingDocs; import org.apache.lucene.facet.FacetsConfig; -import org.apache.lucene.facet.FacetsConfig.DimConfig; -import org.apache.lucene.facet.LabelAndValue; import org.apache.lucene.facet.TopOrdAndIntQueue; -import org.apache.lucene.util.PriorityQueue; +import org.apache.lucene.facet.TopOrdAndNumberQueue; /** * Base class for all taxonomy-based facets that aggregate to a per-ords int[]. @@ -75,16 +65,7 @@ public abstract class IntTaxonomyFacets extends TaxonomyFacets { protected IntTaxonomyFacets( String indexFieldName, TaxonomyReader taxoReader, FacetsConfig config, FacetsCollector fc) throws IOException { - super(indexFieldName, taxoReader, config); - this.aggregationFunction = AssociationAggregationFunction.SUM; - - if (useHashTable(fc, taxoReader)) { - sparseValues = new IntIntHashMap(); - values = null; - } else { - sparseValues = null; - values = new int[taxoReader.getSize()]; - } + this(indexFieldName, taxoReader, config, AssociationAggregationFunction.SUM, fc); } /** Constructor that uses the provided aggregation function. */ @@ -97,49 +78,24 @@ protected IntTaxonomyFacets( throws IOException { super(indexFieldName, taxoReader, config, fc); this.aggregationFunction = aggregationFunction; + valueComparator = Comparator.comparingInt(o -> (int) o); } @Override - boolean hasValues() { - return initialized; - } - - void initializeValueCounters() { + protected void initializeValueCounters() { if (initialized) { return; } - initialized = true; + super.initializeValueCounters(); + assert sparseValues == null && values == null; - if (useHashTable(fc, taxoReader)) { + if (sparseCounts != null) { sparseValues = new IntIntHashMap(); } else { values = new int[taxoReader.getSize()]; } } - /** Return true if a sparse hash table should be used for counting, instead of a dense int[]. 
*/ - protected boolean useHashTable(FacetsCollector fc, TaxonomyReader taxoReader) { - if (taxoReader.getSize() < 1024) { - // small number of unique values: use an array - return false; - } - - if (fc == null) { - // counting all docs: use an array - return false; - } - - int maxDoc = 0; - int sumTotalHits = 0; - for (MatchingDocs docs : fc.getMatchingDocs()) { - sumTotalHits += docs.totalHits; - maxDoc += docs.context.reader().maxDoc(); - } - - // if our result set is < 10% of the index, we collect sparsely (use hash map): - return sumTotalHits < maxDoc / 10; - } - /** Increment the count for this ordinal by 1. */ protected void increment(int ordinal) { increment(ordinal, 1); @@ -154,7 +110,7 @@ protected void increment(int ordinal, int amount) { } } - /** Set the count for this ordinal to {@code newValue}. */ + /** Set the value associated with this ordinal to {@code newValue}. */ void setValue(int ordinal, int newValue) { if (sparseValues != null) { sparseValues.put(ordinal, newValue); @@ -163,8 +119,8 @@ void setValue(int ordinal, int newValue) { } } - /** Get the count for this ordinal. */ - protected int getValue(int ordinal) { + /** Get the value associated with this ordinal. */ + int getValue(int ordinal) { if (sparseValues != null) { return sparseValues.get(ordinal); } else { @@ -172,33 +128,22 @@ protected int getValue(int ordinal) { } } - /** Rolls up any single-valued hierarchical dimensions. */ - protected void rollup() throws IOException { - if (initialized == false) { - return; - } + @Override + protected Number getAggregationValue(int ordinal) { + return getValue(ordinal); + } - // Rollup any necessary dims: - ParallelTaxonomyArrays.IntArray children = null; - for (Map.Entry ent : config.getDimConfigs().entrySet()) { - String dim = ent.getKey(); - DimConfig ft = ent.getValue(); - if (ft.hierarchical && ft.multiValued == false) { - int dimRootOrd = taxoReader.getOrdinal(new FacetLabel(dim)); - // It can be -1 if this field was declared in the - // config but never indexed: - if (dimRootOrd > 0) { - if (children == null) { - // lazy init - children = getChildren(); - } - int currentValue = getValue(dimRootOrd); - int newValue = - aggregationFunction.aggregate(currentValue, rollup(children.get(dimRootOrd))); - setValue(dimRootOrd, newValue); - } - } - } + @Override + protected Number aggregate(Number existingVal, Number newVal) { + return aggregationFunction.aggregate((int) existingVal, (int) newVal); + } + + @Override + protected void updateValueFromRollup(int ordinal, int childOrdinal) throws IOException { + super.updateValueFromRollup(ordinal, childOrdinal); + int currentValue = getValue(ordinal); + int newValue = aggregationFunction.aggregate(currentValue, rollup(childOrdinal)); + setValue(ordinal, newValue); } private int rollup(int ord) throws IOException { @@ -206,9 +151,7 @@ private int rollup(int ord) throws IOException { ParallelTaxonomyArrays.IntArray siblings = getSiblings(); int aggregatedValue = 0; while (ord != TaxonomyReader.INVALID_ORDINAL) { - int currentValue = getValue(ord); - int newValue = aggregationFunction.aggregate(currentValue, rollup(children.get(ord))); - setValue(ord, newValue); + updateValueFromRollup(ord, children.get(ord)); aggregatedValue = aggregationFunction.aggregate(aggregatedValue, getValue(ord)); ord = siblings.get(ord); } @@ -216,351 +159,30 @@ private int rollup(int ord) throws IOException { } @Override - public Number getSpecificValue(String dim, String... 
path) throws IOException { - DimConfig dimConfig = verifyDim(dim); - if (path.length == 0) { - if (dimConfig.hierarchical && dimConfig.multiValued == false) { - // ok: rolled up at search time - } else if (dimConfig.requireDimCount && dimConfig.multiValued) { - // ok: we indexed all ords at index time - } else { - throw new IllegalArgumentException( - "cannot return dimension-level value alone; use getTopChildren instead"); - } - } - int ord = taxoReader.getOrdinal(new FacetLabel(dim, path)); - if (ord < 0) { - return -1; - } - return initialized ? getValue(ord) : 0; + protected void setIncomingValue(TopOrdAndNumberQueue.OrdAndValue incomingOrdAndValue, int ord) { + ((TopOrdAndIntQueue.OrdAndInt) incomingOrdAndValue).value = getValue(ord); } - @Override - public FacetResult getAllChildren(String dim, String... path) throws IOException { - DimConfig dimConfig = verifyDim(dim); - FacetLabel cp = new FacetLabel(dim, path); - int dimOrd = taxoReader.getOrdinal(cp); - if (dimOrd == -1) { - return null; - } - - if (initialized == false) { - return null; - } - - int aggregatedValue = 0; - - IntArrayList ordinals = new IntArrayList(); - IntArrayList ordValues = new IntArrayList(); - - if (sparseValues != null) { - for (IntIntCursor c : sparseValues) { - int value = c.value; - int ord = c.key; - if (parents.get(ord) == dimOrd && value > 0) { - aggregatedValue = aggregationFunction.aggregate(aggregatedValue, value); - ordinals.add(ord); - ordValues.add(value); - } - } - } else { - ParallelTaxonomyArrays.IntArray children = getChildren(); - ParallelTaxonomyArrays.IntArray siblings = getSiblings(); - int ord = children.get(dimOrd); - while (ord != TaxonomyReader.INVALID_ORDINAL) { - int value = values[ord]; - if (value > 0) { - aggregatedValue = aggregationFunction.aggregate(aggregatedValue, value); - ordinals.add(ord); - ordValues.add(value); - } - ord = siblings.get(ord); - } - } - - if (aggregatedValue == 0) { - return null; - } - - if (dimConfig.multiValued) { - if (dimConfig.requireDimCount) { - aggregatedValue = getValue(dimOrd); - } else { - // Our sum'd value is not correct, in general: - aggregatedValue = -1; - } - } else { - // Our sum'd dim value is accurate, so we keep it - } - - // TODO: It would be nice if TaxonomyReader let us pass in a buffer + size so we didn't have to - // do an array copy here: - FacetLabel[] bulkPath = taxoReader.getBulkPath(ordinals.toArray()); - - LabelAndValue[] labelValues = new LabelAndValue[ordValues.size()]; - for (int i = 0; i < ordValues.size(); i++) { - labelValues[i] = new LabelAndValue(bulkPath[i].components[cp.length], ordValues.get(i)); - } - return new FacetResult(dim, path, aggregatedValue, labelValues, ordinals.size()); - } - - @Override - public FacetResult getTopChildren(int topN, String dim, String... path) throws IOException { - validateTopN(topN); - DimConfig dimConfig = verifyDim(dim); - FacetLabel cp = new FacetLabel(dim, path); - int dimOrd = taxoReader.getOrdinal(cp); - if (dimOrd == -1) { - return null; - } + protected class IntAggregatedValue extends AggregatedValue { + private int value; - if (initialized == false) { - return null; + public IntAggregatedValue(int value) { + this.value = value; } - TopChildrenForPath topChildrenForPath = getTopChildrenForPath(dimConfig, dimOrd, topN); - return createFacetResult(topChildrenForPath, dim, path); - } - - /** - * Determine the top-n children for a specified dimension + path. Results are in an intermediate - * form. 
- */ - private TopChildrenForPath getTopChildrenForPath(DimConfig dimConfig, int pathOrd, int topN) - throws IOException { - TopOrdAndIntQueue q = new TopOrdAndIntQueue(Math.min(taxoReader.getSize(), topN)); - int bottomValue = 0; - int bottomOrd = Integer.MAX_VALUE; - - int aggregatedValue = 0; - int childCount = 0; - TopOrdAndIntQueue.OrdAndValue reuse = null; - - // TODO: would be faster if we had a "get the following children" API? then we - // can make a single pass over the hashmap - if (sparseValues != null) { - for (IntIntCursor c : sparseValues) { - int value = c.value; - int ord = c.key; - if (parents.get(ord) == pathOrd && value > 0) { - aggregatedValue = aggregationFunction.aggregate(aggregatedValue, value); - childCount++; - if (value > bottomValue || (value == bottomValue && ord < bottomOrd)) { - if (reuse == null) { - reuse = new TopOrdAndIntQueue.OrdAndValue(); - } - reuse.ord = ord; - reuse.value = value; - reuse = q.insertWithOverflow(reuse); - if (q.size() == topN) { - bottomValue = q.top().value; - bottomOrd = q.top().ord; - } - } - } - } - } else { - ParallelTaxonomyArrays.IntArray children = getChildren(); - ParallelTaxonomyArrays.IntArray siblings = getSiblings(); - int ord = children.get(pathOrd); - while (ord != TaxonomyReader.INVALID_ORDINAL) { - int value = values[ord]; - if (value > 0) { - aggregatedValue = aggregationFunction.aggregate(aggregatedValue, value); - childCount++; - if (value > bottomValue || (value == bottomValue && ord < bottomOrd)) { - if (reuse == null) { - reuse = new TopOrdAndIntQueue.OrdAndValue(); - } - reuse.ord = ord; - reuse.value = value; - reuse = q.insertWithOverflow(reuse); - if (q.size() == topN) { - bottomValue = q.top().value; - bottomOrd = q.top().ord; - } - } - } - ord = siblings.get(ord); - } + @Override + public void aggregate(int ord) { + value = aggregationFunction.aggregate(value, getValue(ord)); } - if (dimConfig.multiValued) { - if (dimConfig.requireDimCount) { - aggregatedValue = getValue(pathOrd); - } else { - // Our sum'd value is not correct, in general: - aggregatedValue = -1; - } + @Override + public Number get() { + return value; } - - return new TopChildrenForPath(aggregatedValue, childCount, q); } @Override - public List getTopDims(int topNDims, int topNChildren) throws IOException { - if (topNDims <= 0 || topNChildren <= 0) { - throw new IllegalArgumentException("topN must be > 0"); - } - - if (initialized == false) { - return Collections.emptyList(); - } - - // get children and siblings ordinal array from TaxonomyFacets - ParallelTaxonomyArrays.IntArray children = getChildren(); - ParallelTaxonomyArrays.IntArray siblings = getSiblings(); - - // Create priority queue to store top dimensions and sort by their aggregated values/hits and - // string values. 
- PriorityQueue pq = - new PriorityQueue<>(topNDims) { - @Override - protected boolean lessThan(DimValue a, DimValue b) { - if (a.value > b.value) { - return false; - } else if (a.value < b.value) { - return true; - } else { - return a.dim.compareTo(b.dim) > 0; - } - } - }; - - // Keep track of intermediate results, if we compute them, so we can reuse them later: - Map intermediateResults = null; - - // iterate over children and siblings ordinals for all dims - int ord = children.get(TaxonomyReader.ROOT_ORDINAL); - while (ord != TaxonomyReader.INVALID_ORDINAL) { - String dim = taxoReader.getPath(ord).components[0]; - FacetsConfig.DimConfig dimConfig = config.getDimConfig(dim); - if (dimConfig.indexFieldName.equals(indexFieldName)) { - FacetLabel cp = new FacetLabel(dim); - int dimOrd = taxoReader.getOrdinal(cp); - if (dimOrd != -1) { - int dimValue; - if (dimConfig.multiValued) { - if (dimConfig.requireDimCount) { - // If the dim is configured as multi-valued and requires dim counts, we can access - // an accurate count for the dim computed at indexing time: - dimValue = getValue(dimOrd); - } else { - // If the dim is configured as multi-valued but not requiring dim counts, we cannot - // compute an accurate dim count, and use -1 as a place-holder: - dimValue = -1; - } - } else { - // Single-valued dims require aggregating descendant paths to get accurate dim counts - // since we don't directly access ancestry paths: - // TODO: We could consider indexing dim counts directly if getTopDims is a common - // use-case. - TopChildrenForPath topChildrenForPath = - getTopChildrenForPath(dimConfig, dimOrd, topNChildren); - if (intermediateResults == null) { - intermediateResults = new HashMap<>(); - } - intermediateResults.put(dim, topChildrenForPath); - dimValue = topChildrenForPath.pathValue; - } - if (dimValue != 0) { - if (pq.size() < topNDims) { - pq.add(new DimValue(dim, dimOrd, dimValue)); - } else { - if (dimValue > pq.top().value - || (dimValue == pq.top().value && dim.compareTo(pq.top().dim) < 0)) { - DimValue bottomDim = pq.top(); - bottomDim.dim = dim; - bottomDim.value = dimValue; - pq.updateTop(); - } - } - } - } - } - ord = siblings.get(ord); - } - - FacetResult[] results = new FacetResult[pq.size()]; - - while (pq.size() > 0) { - DimValue dimValue = pq.pop(); - assert dimValue != null; - String dim = dimValue.dim; - TopChildrenForPath topChildrenForPath = null; - if (intermediateResults != null) { - topChildrenForPath = intermediateResults.get(dim); - } - if (topChildrenForPath == null) { - FacetsConfig.DimConfig dimConfig = config.getDimConfig(dim); - topChildrenForPath = getTopChildrenForPath(dimConfig, dimValue.dimOrd, topNChildren); - } - FacetResult facetResult = createFacetResult(topChildrenForPath, dim); - assert facetResult != null; - results[pq.size()] = facetResult; - } - return Arrays.asList(results); - } - - /** - * Create a FacetResult for the provided dim + path and intermediate results. Does the extra work - * of resolving ordinals -> labels, etc. Will return null if there are no children. - */ - FacetResult createFacetResult(TopChildrenForPath topChildrenForPath, String dim, String... 
path) - throws IOException { - // If the intermediate result is null or there are no children, we return null: - if (topChildrenForPath == null || topChildrenForPath.childCount == 0) { - return null; - } - - TopOrdAndIntQueue q = topChildrenForPath.childQueue; - assert q != null; - - LabelAndValue[] labelValues = new LabelAndValue[q.size()]; - int[] ordinals = new int[labelValues.length]; - int[] values = new int[labelValues.length]; - - for (int i = labelValues.length - 1; i >= 0; i--) { - TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop(); - assert ordAndValue != null; - ordinals[i] = ordAndValue.ord; - values[i] = ordAndValue.value; - } - - FacetLabel[] bulkPath = taxoReader.getBulkPath(ordinals); - // The path component we're interested in is the one immediately after the provided path. We - // add 1 here to also account for the dim: - int childComponentIdx = path.length + 1; - for (int i = 0; i < labelValues.length; i++) { - labelValues[i] = new LabelAndValue(bulkPath[i].components[childComponentIdx], values[i]); - } - - return new FacetResult( - dim, path, topChildrenForPath.pathValue, labelValues, topChildrenForPath.childCount); - } - - private static class DimValue { - String dim; - int dimOrd; - int value; - - DimValue(String dim, int dimOrd, int value) { - this.dim = dim; - this.dimOrd = dimOrd; - this.value = value; - } - } - - /** Intermediate result to store top children for a given path before resolving labels, etc. */ - private static class TopChildrenForPath { - private final int pathValue; - private final int childCount; - private final TopOrdAndIntQueue childQueue; - - TopChildrenForPath(int pathValue, int childCount, TopOrdAndIntQueue childQueue) { - this.pathValue = pathValue; - this.childCount = childCount; - this.childQueue = childQueue; - } + protected AggregatedValue newAggregatedValue() { + return new IntAggregatedValue(0); } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacetFloatAssociations.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacetFloatAssociations.java index 2e3db92008c..45c18fb3867 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacetFloatAssociations.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacetFloatAssociations.java @@ -213,8 +213,10 @@ private void aggregateValues( int ordinalCount = ordinalValues.docValueCount(); for (int i = 0; i < ordinalCount; i++) { int ord = (int) ordinalValues.nextValue(); - float newValue = aggregationFunction.aggregate(values[ord], value); - values[ord] = newValue; + float currentValue = getValue(ord); + float newValue = aggregationFunction.aggregate(currentValue, value); + setValue(ord, newValue); + setCount(ord, getCount(ord) + 1); } } } @@ -250,8 +252,10 @@ private void aggregateValues( offset += 4; float value = (float) BitUtil.VH_BE_FLOAT.get(bytes, offset); offset += 4; - float newValue = aggregationFunction.aggregate(values[ord], value); - values[ord] = newValue; + float currentValue = getValue(ord); + float newValue = aggregationFunction.aggregate(currentValue, value); + setValue(ord, newValue); + setCount(ord, getCount(ord) + 1); } } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacetIntAssociations.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacetIntAssociations.java index f437efa0d8a..86cc3d1f714 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacetIntAssociations.java +++ 
b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacetIntAssociations.java @@ -85,6 +85,7 @@ private void aggregateValues( int currentValue = getValue(ord); int newValue = aggregationFunction.aggregate(currentValue, value); setValue(ord, newValue); + setCount(ord, getCount(ord) + 1); } } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacets.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacets.java index 5299264887c..11325c54fe1 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacets.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacets.java @@ -17,17 +17,27 @@ package org.apache.lucene.facet.taxonomy; +import com.carrotsearch.hppc.IntArrayList; +import com.carrotsearch.hppc.IntIntHashMap; +import com.carrotsearch.hppc.cursors.IntIntCursor; import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.Comparator; +import java.util.HashMap; import java.util.List; import java.util.Locale; +import java.util.Map; import org.apache.lucene.facet.FacetResult; import org.apache.lucene.facet.Facets; import org.apache.lucene.facet.FacetsCollector; import org.apache.lucene.facet.FacetsConfig; import org.apache.lucene.facet.FacetsConfig.DimConfig; +import org.apache.lucene.facet.LabelAndValue; +import org.apache.lucene.facet.TopOrdAndIntQueue; +import org.apache.lucene.facet.TopOrdAndNumberQueue; +import org.apache.lucene.util.PriorityQueue; /** * Base class for all taxonomy-based facets impls. @@ -40,6 +50,30 @@ */ @Deprecated public abstract class TaxonomyFacets extends Facets { + /** Intermediate result to store top children for a given path before resolving labels, etc. */ + static class TopChildrenForPath { + Number pathValue; + int childCount; + TopOrdAndNumberQueue childQueue; + + public TopChildrenForPath(Number pathValue, int childCount, TopOrdAndNumberQueue childQueue) { + this.pathValue = pathValue; + this.childCount = childCount; + this.childQueue = childQueue; + } + } + + private static class DimValue { + String dim; + int dimOrd; + Number value; + + DimValue(String dim, int dimOrd, Number value) { + this.dim = dim; + this.dimOrd = dimOrd; + this.value = value; + } + } private static final Comparator BY_VALUE_THEN_DIM = new Comparator() { @@ -88,6 +122,17 @@ protected TaxonomyFacets(String indexFieldName, TaxonomyReader taxoReader, Facet this(indexFieldName, taxoReader, config, null); } + /** Dense ordinal counts. */ + int[] counts; + + /** Sparse ordinal counts. */ + IntIntHashMap sparseCounts; + + /** Have value counters been initialized. */ + boolean initialized; + + protected Comparator valueComparator; + /** * Constructor with a {@link FacetsCollector}, allowing lazy initialization of internal data * structures. @@ -100,6 +145,78 @@ protected TaxonomyFacets(String indexFieldName, TaxonomyReader taxoReader, Facet this.config = config; this.fc = fc; parents = taxoReader.getParallelTaxonomyArrays().parents(); + valueComparator = Comparator.comparingInt((x) -> (int) x); + } + + /** Return true if a sparse hash table should be used for counting, instead of a dense int[]. 
*/ + private boolean useHashTable(FacetsCollector fc, TaxonomyReader taxoReader) { + if (taxoReader.getSize() < 1024) { + // small number of unique values: use an array + return false; + } + + if (fc == null) { + // counting all docs: use an array + return false; + } + + int maxDoc = 0; + int sumTotalHits = 0; + for (FacetsCollector.MatchingDocs docs : fc.getMatchingDocs()) { + sumTotalHits += docs.totalHits; + maxDoc += docs.context.reader().maxDoc(); + } + + // if our result set is < 10% of the index, we collect sparsely (use hash map): + return sumTotalHits < maxDoc / 10; + } + + protected void initializeValueCounters() { + if (initialized) { + return; + } + initialized = true; + assert sparseCounts == null && counts == null; + if (useHashTable(fc, taxoReader)) { + sparseCounts = new IntIntHashMap(); + } else { + counts = new int[taxoReader.getSize()]; + } + } + + /** Set the count for this ordinal to {@code newValue}. */ + protected void setCount(int ordinal, int newValue) { + if (sparseCounts != null) { + sparseCounts.put(ordinal, newValue); + } else { + counts[ordinal] = newValue; + } + } + + /** Get the count for this ordinal. */ + protected int getCount(int ordinal) { + if (sparseCounts != null) { + return sparseCounts.get(ordinal); + } else { + return counts[ordinal]; + } + } + + /** Get the aggregation value for this ordinal. */ + protected Number getAggregationValue(int ordinal) { + // By default, this is just the count. + return getCount(ordinal); + } + + /** Apply an aggregation to the two values and return the result. */ + protected Number aggregate(Number existingVal, Number newVal) { + // By default, we are computing counts, so the values are interpreted as integers and summed. + return (int) existingVal + (int) newVal; + } + + /** Were any values actually aggregated during counting? */ + boolean hasValues() { + return initialized; } /** @@ -166,6 +283,320 @@ protected DimConfig verifyDim(String dim) { return dimConfig; } + /** + * Roll-up the aggregation values from {@code childOrdinal} to {@code ordinal}. Overrides should + * probably call this to update the counts. Overriding allows us to work with primitive types for + * the aggregation values, keeping aggregation efficient. + */ + protected void updateValueFromRollup(int ordinal, int childOrdinal) throws IOException { + setCount(ordinal, getCount(ordinal) + rollup(childOrdinal)); + } + + /** + * Return a {@link TopOrdAndNumberQueue} of the appropriate type, i.e. a {@link TopOrdAndIntQueue} + * or a {@link org.apache.lucene.facet.TopOrdAndFloatQueue}. + */ + protected TopOrdAndNumberQueue makeTopOrdAndNumberQueue(int topN) { + return new TopOrdAndIntQueue(Math.min(taxoReader.getSize(), topN)); + } + + // TODO: We don't need this if we're okay with having an integer -1 in the results even for float + // aggregations. + /** Return the value for a missing aggregation, i.e. {@code -1} or {@code -1f}. */ + protected Number missingAggregationValue() { + return -1; + } + + /** Rolls up any single-valued hierarchical dimensions. 
*/ + void rollup() throws IOException { + if (initialized == false) { + return; + } + + // Rollup any necessary dims: + ParallelTaxonomyArrays.IntArray children = null; + for (Map.Entry ent : config.getDimConfigs().entrySet()) { + String dim = ent.getKey(); + FacetsConfig.DimConfig ft = ent.getValue(); + if (ft.hierarchical && ft.multiValued == false) { + int dimRootOrd = taxoReader.getOrdinal(new FacetLabel(dim)); + // It can be -1 if this field was declared in the + // config but never indexed: + if (dimRootOrd > 0) { + if (children == null) { + // lazy init + children = getChildren(); + } + updateValueFromRollup(dimRootOrd, children.get(dimRootOrd)); + } + } + } + } + + private int rollup(int ord) throws IOException { + ParallelTaxonomyArrays.IntArray children = getChildren(); + ParallelTaxonomyArrays.IntArray siblings = getSiblings(); + int aggregatedValue = 0; + while (ord != TaxonomyReader.INVALID_ORDINAL) { + int currentValue = getCount(ord); + int newValue = currentValue + rollup(children.get(ord)); + setCount(ord, newValue); + aggregatedValue += getCount(ord); + ord = siblings.get(ord); + } + return aggregatedValue; + } + + /** + * Create a FacetResult for the provided dim + path and intermediate results. Does the extra work + * of resolving ordinals -> labels, etc. Will return null if there are no children. + */ + private FacetResult createFacetResult( + TopChildrenForPath topChildrenForPath, String dim, String... path) throws IOException { + // If the intermediate result is null or there are no children, we return null: + if (topChildrenForPath == null || topChildrenForPath.childCount == 0) { + return null; + } + + TopOrdAndNumberQueue q = topChildrenForPath.childQueue; + assert q != null; + + LabelAndValue[] labelValues = new LabelAndValue[q.size()]; + int[] ordinals = new int[labelValues.length]; + Number[] values = new Number[labelValues.length]; + + for (int i = labelValues.length - 1; i >= 0; i--) { + TopOrdAndNumberQueue.OrdAndValue ordAndValue = q.pop(); + assert ordAndValue != null; + ordinals[i] = ordAndValue.ord; + values[i] = ordAndValue.getValue(); + } + + FacetLabel[] bulkPath = taxoReader.getBulkPath(ordinals); + // The path component we're interested in is the one immediately after the provided path. We + // add 1 here to also account for the dim: + int childComponentIdx = path.length + 1; + for (int i = 0; i < labelValues.length; i++) { + labelValues[i] = new LabelAndValue(bulkPath[i].components[childComponentIdx], values[i]); + } + + return new FacetResult( + dim, path, topChildrenForPath.pathValue, labelValues, topChildrenForPath.childCount); + } + + @Override + public FacetResult getAllChildren(String dim, String... 
path) throws IOException {
+    DimConfig dimConfig = verifyDim(dim);
+    FacetLabel cp = new FacetLabel(dim, path);
+    int dimOrd = taxoReader.getOrdinal(cp);
+    if (dimOrd == -1) {
+      return null;
+    }
+
+    if (initialized == false) {
+      return null;
+    }
+
+    Number aggregatedValue = 0;
+    int aggregatedCount = 0;
+
+    IntArrayList ordinals = new IntArrayList();
+    List<Number> ordValues = new ArrayList<>();
+
+    if (sparseCounts != null) {
+      for (IntIntCursor ordAndCount : sparseCounts) {
+        int ord = ordAndCount.key;
+        int count = ordAndCount.value;
+        Number value = getAggregationValue(ord);
+        if (parents.get(ord) == dimOrd && count > 0) {
+          aggregatedCount += count;
+          aggregatedValue = aggregate(aggregatedValue, value);
+          ordinals.add(ord);
+          ordValues.add(value);
+        }
+      }
+    } else {
+      ParallelTaxonomyArrays.IntArray children = getChildren();
+      ParallelTaxonomyArrays.IntArray siblings = getSiblings();
+      int ord = children.get(dimOrd);
+      while (ord != TaxonomyReader.INVALID_ORDINAL) {
+        int count = counts[ord];
+        Number value = getAggregationValue(ord);
+        if (count > 0) {
+          aggregatedCount += count;
+          aggregatedValue = aggregate(aggregatedValue, value);
+          ordinals.add(ord);
+          ordValues.add(value);
+        }
+        ord = siblings.get(ord);
+      }
+    }
+
+    if (aggregatedCount == 0) {
+      return null;
+    }
+
+    if (dimConfig.multiValued) {
+      if (dimConfig.requireDimCount) {
+        aggregatedValue = getAggregationValue(dimOrd);
+      } else {
+        // Our aggregated value is not correct, in general:
+        aggregatedValue = missingAggregationValue();
+      }
+    } else {
+      // Our aggregated dim value is accurate, so we keep it
+    }
+
+    // TODO: It would be nice if TaxonomyReader let us pass in a buffer + size so we didn't have to
+    // do an array copy here:
+    FacetLabel[] bulkPath = taxoReader.getBulkPath(ordinals.toArray());
+
+    LabelAndValue[] labelValues = new LabelAndValue[ordValues.size()];
+    for (int i = 0; i < ordValues.size(); i++) {
+      labelValues[i] = new LabelAndValue(bulkPath[i].components[cp.length], ordValues.get(i));
+    }
+    return new FacetResult(dim, path, aggregatedValue, labelValues, ordinals.size());
+  }
+
+  protected void setIncomingValue(TopOrdAndNumberQueue.OrdAndValue incomingOrdAndValue, int ord) {
+    ((TopOrdAndIntQueue.OrdAndInt) incomingOrdAndValue).value = getCount(ord);
+  }
+
+  protected TopOrdAndNumberQueue.OrdAndValue insertIntoQueue(
+      TopOrdAndNumberQueue q, TopOrdAndNumberQueue.OrdAndValue incomingOrdAndValue, int ord) {
+    if (incomingOrdAndValue == null) {
+      incomingOrdAndValue = q.newOrdAndValue();
+    }
+    incomingOrdAndValue.ord = ord;
+    setIncomingValue(incomingOrdAndValue, ord);
+
+    incomingOrdAndValue = q.insertWithOverflow(incomingOrdAndValue);
+    return incomingOrdAndValue;
+  }
+
+  protected abstract static class AggregatedValue {
+    /** Aggregate the value corresponding to the given ordinal into this value. */
+    public abstract void aggregate(int ord);
+
+    /** Retrieve the encapsulated value. */
+    public abstract Number get();
+  }
+
+  private class AggregatedCount extends AggregatedValue {
+    private int count;
+
+    private AggregatedCount(int count) {
+      this.count = count;
+    }
+
+    @Override
+    public void aggregate(int ord) {
+      count += getCount(ord);
+    }
+
+    @Override
+    public Number get() {
+      return count;
+    }
+  }
+
+  protected AggregatedValue newAggregatedValue() {
+    return new AggregatedCount(0);
+  }
+
+  /**
+   * Determine the top-n children for a specified dimension + path. Results are in an intermediate
+   * form.
+ */ + protected TopChildrenForPath getTopChildrenForPath(DimConfig dimConfig, int pathOrd, int topN) + throws IOException { + TopOrdAndNumberQueue q = makeTopOrdAndNumberQueue(topN); + + AggregatedValue aggregatedValue = newAggregatedValue(); + int childCount = 0; + + TopOrdAndNumberQueue.OrdAndValue incomingOrdAndValue = null; + + // TODO: would be faster if we had a "get the following children" API? then we + // can make a single pass over the hashmap + if (sparseCounts != null) { + for (IntIntCursor c : sparseCounts) { + int ord = c.key; + int count = c.value; + if (parents.get(ord) == pathOrd && count > 0) { + aggregatedValue.aggregate(ord); + childCount++; + + incomingOrdAndValue = insertIntoQueue(q, incomingOrdAndValue, ord); + } + } + } else { + ParallelTaxonomyArrays.IntArray children = getChildren(); + ParallelTaxonomyArrays.IntArray siblings = getSiblings(); + int ord = children.get(pathOrd); + while (ord != TaxonomyReader.INVALID_ORDINAL) { + int count = counts[ord]; + if (count > 0) { + aggregatedValue.aggregate(ord); + childCount++; + + incomingOrdAndValue = insertIntoQueue(q, incomingOrdAndValue, ord); + } + ord = siblings.get(ord); + } + } + + Number aggregatedValueNumber = aggregatedValue.get(); + if (dimConfig.multiValued) { + if (dimConfig.requireDimCount) { + aggregatedValueNumber = getAggregationValue(pathOrd); + } else { + // Our aggregated value is not correct, in general: + aggregatedValueNumber = missingAggregationValue(); + } + } + + return new TopChildrenForPath(aggregatedValueNumber, childCount, q); + } + + @Override + public FacetResult getTopChildren(int topN, String dim, String... path) throws IOException { + validateTopN(topN); + DimConfig dimConfig = verifyDim(dim); + FacetLabel cp = new FacetLabel(dim, path); + int dimOrd = taxoReader.getOrdinal(cp); + if (dimOrd == -1) { + return null; + } + + if (initialized == false) { + return null; + } + + TopChildrenForPath topChildrenForPath = getTopChildrenForPath(dimConfig, dimOrd, topN); + return createFacetResult(topChildrenForPath, dim, path); + } + + @Override + public Number getSpecificValue(String dim, String... path) throws IOException { + DimConfig dimConfig = verifyDim(dim); + if (path.length == 0) { + if (dimConfig.hierarchical && dimConfig.multiValued == false) { + // ok: rolled up at search time + } else if (dimConfig.requireDimCount && dimConfig.multiValued) { + // ok: we indexed all ords at index time + } else { + throw new IllegalArgumentException( + "cannot return dimension-level value alone; use getTopChildren instead"); + } + } + int ord = taxoReader.getOrdinal(new FacetLabel(dim, path)); + if (ord < 0) { + return -1; + } + return initialized ? getAggregationValue(ord) : 0; + } + @Override public List getAllDims(int topN) throws IOException { validateTopN(topN); @@ -195,6 +626,110 @@ public List getAllDims(int topN) throws IOException { return results; } - /** Were any values actually aggregated during counting? 
*/
-  abstract boolean hasValues();
+  @Override
+  public List<FacetResult> getTopDims(int topNDims, int topNChildren) throws IOException {
+    if (topNDims <= 0 || topNChildren <= 0) {
+      throw new IllegalArgumentException("topN must be > 0");
+    }
+
+    if (initialized == false) {
+      return Collections.emptyList();
+    }
+
+    // get children and siblings ordinal array from TaxonomyFacets
+    ParallelTaxonomyArrays.IntArray children = getChildren();
+    ParallelTaxonomyArrays.IntArray siblings = getSiblings();
+
+    // Create priority queue to store top dimensions and sort by their aggregated values/hits and
+    // string values.
+    PriorityQueue<DimValue> pq =
+        new PriorityQueue<>(topNDims) {
+          @Override
+          protected boolean lessThan(DimValue a, DimValue b) {
+            int comparison = valueComparator.compare(a.value, b.value);
+            if (comparison < 0) {
+              return true;
+            }
+            if (comparison > 0) {
+              return false;
+            }
+            return a.dim.compareTo(b.dim) > 0;
+          }
+        };
+
+    // Keep track of intermediate results, if we compute them, so we can reuse them later:
+    Map<String, TopChildrenForPath> intermediateResults = null;
+
+    // iterate over children and siblings ordinals for all dims
+    int ord = children.get(TaxonomyReader.ROOT_ORDINAL);
+    while (ord != TaxonomyReader.INVALID_ORDINAL) {
+      String dim = taxoReader.getPath(ord).components[0];
+      FacetsConfig.DimConfig dimConfig = config.getDimConfig(dim);
+      if (dimConfig.indexFieldName.equals(indexFieldName)) {
+        FacetLabel cp = new FacetLabel(dim);
+        int dimOrd = taxoReader.getOrdinal(cp);
+        if (dimOrd != -1) {
+          Number dimValue;
+          if (dimConfig.multiValued) {
+            if (dimConfig.requireDimCount) {
+              // If the dim is configured as multi-valued and requires dim counts, we can access
+              // an accurate count for the dim computed at indexing time:
+              dimValue = getAggregationValue(dimOrd);
+            } else {
+              // If the dim is configured as multi-valued but not requiring dim counts, we cannot
+              // compute an accurate dim count, and use -1 as a place-holder:
+              dimValue = -1;
+            }
+          } else {
+            // Single-valued dims require aggregating descendant paths to get accurate dim counts
+            // since we don't directly access ancestry paths:
+            // TODO: We could consider indexing dim counts directly if getTopDims is a common
+            // use-case.
+ TopChildrenForPath topChildrenForPath = + getTopChildrenForPath(dimConfig, dimOrd, topNChildren); + if (intermediateResults == null) { + intermediateResults = new HashMap<>(); + } + intermediateResults.put(dim, topChildrenForPath); + dimValue = topChildrenForPath.pathValue; + } + if (valueComparator.compare(dimValue, 0) != 0) { + if (pq.size() < topNDims) { + pq.add(new DimValue(dim, dimOrd, dimValue)); + } else { + if (valueComparator.compare(dimValue, pq.top().value) > 0 + || (valueComparator.compare(dimValue, pq.top().value) == 0 + && dim.compareTo(pq.top().dim) < 0)) { + DimValue bottomDim = pq.top(); + bottomDim.dim = dim; + bottomDim.value = dimValue; + pq.updateTop(); + } + } + } + } + } + ord = siblings.get(ord); + } + + FacetResult[] results = new FacetResult[pq.size()]; + + while (pq.size() > 0) { + DimValue dimValue = pq.pop(); + assert dimValue != null; + String dim = dimValue.dim; + TopChildrenForPath topChildrenForPath = null; + if (intermediateResults != null) { + topChildrenForPath = intermediateResults.get(dim); + } + if (topChildrenForPath == null) { + FacetsConfig.DimConfig dimConfig = config.getDimConfig(dim); + topChildrenForPath = getTopChildrenForPath(dimConfig, dimValue.dimOrd, topNChildren); + } + FacetResult facetResult = createFacetResult(topChildrenForPath, dim); + assert facetResult != null; + results[pq.size()] = facetResult; + } + return Arrays.asList(results); + } } diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetAssociations.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetAssociations.java index c69dc2943b2..3db906eb05c 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetAssociations.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetAssociations.java @@ -24,7 +24,10 @@ import java.util.List; import java.util.Map; import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.TextField; import org.apache.lucene.facet.DrillDownQuery; +import org.apache.lucene.facet.FacetField; import org.apache.lucene.facet.FacetResult; import org.apache.lucene.facet.FacetTestCase; import org.apache.lucene.facet.Facets; @@ -38,10 +41,13 @@ import org.apache.lucene.index.DocValues; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.Term; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.DoubleValuesSource; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.index.RandomIndexWriter; import org.apache.lucene.util.BitUtil; @@ -101,6 +107,7 @@ public static void beforeClass() throws Exception { doc.add(new FloatAssociationFacetField(0.2f, "float", "b")); } } + doc.add(new TextField("match", "yes", Field.Store.NO)); writer.addDocument(config.build(taxoWriter, doc)); } @@ -142,6 +149,17 @@ public static void beforeClass() throws Exception { } } + doc.add(new TextField("match", "yes", Field.Store.NO)); + writer.addDocument(config.build(taxoWriter, doc)); + } + + // Add more random labels and documents to randomly make the test run on sparse/dense + // aggregation values. 
+ count = random().nextInt(10_000); + for (int i = 0; i < count; i++) { + Document doc = new Document(); + doc.add(new FacetField("random_dim_" + i, "path")); + doc.add(new TextField("match", "no", Field.Store.NO)); writer.addDocument(config.build(taxoWriter, doc)); } @@ -194,7 +212,8 @@ public static void afterClass() throws Exception { public void testIntSumAssociation() throws Exception { IndexSearcher searcher = newSearcher(reader); - FacetsCollector fc = searcher.search(new MatchAllDocsQuery(), new FacetsCollectorManager()); + FacetsCollector fc = + searcher.search(new TermQuery(new Term("match", "yes")), new FacetsCollectorManager()); Facets facets = getIntSumFacets("$facets.int", taxoReader, config, fc); assertEquals( @@ -225,7 +244,7 @@ public void testIntAssociationRandom() throws Exception { FacetsCollector fc = new FacetsCollector(); IndexSearcher searcher = newSearcher(reader); - searcher.search(new MatchAllDocsQuery(), fc); + searcher.search(new TermQuery(new Term("match", "yes")), fc); Map expected; Facets facets; @@ -273,7 +292,8 @@ public void testIntAssociationRandom() throws Exception { public void testFloatSumAssociation() throws Exception { IndexSearcher searcher = newSearcher(reader); - FacetsCollector fc = searcher.search(new MatchAllDocsQuery(), new FacetsCollectorManager()); + FacetsCollector fc = + searcher.search(new TermQuery(new Term("match", "yes")), new FacetsCollectorManager()); Facets facets = getFloatSumFacets("$facets.float", taxoReader, config, fc, null); assertEquals( @@ -285,7 +305,7 @@ public void testFloatSumAssociation() throws Exception { "float", new String[0], 2, - -1.0f, + -1f, new LabelAndValue[] { new LabelAndValue("a", 50.0f), new LabelAndValue("b", 9.999995f), }); @@ -304,7 +324,7 @@ public void testFloatSumAssociation() throws Exception { // test getAllDims and getTopDims List topDims = facets.getTopDims(10, 10); List allDims = facets.getAllDims(10); - assertEquals(topDims, allDims); + assertFloatFacetResultsEqual(topDims, allDims); } public void testFloatAssociationRandom() throws Exception { @@ -312,7 +332,7 @@ public void testFloatAssociationRandom() throws Exception { FacetsCollector fc = new FacetsCollector(); IndexSearcher searcher = newSearcher(reader); - searcher.search(new MatchAllDocsQuery(), fc); + searcher.search(new TermQuery(new Term("match", "yes")), fc); Map expected; Facets facets; @@ -336,7 +356,7 @@ public void testFloatAssociationRandom() throws Exception { // test getAllDims and getTopDims List topDims = facets.getTopDims(10, 10); List allDims = facets.getAllDims(10); - assertEquals(topDims, allDims); + assertFloatFacetResultsEqual(topDims, allDims); // MAX: facets = @@ -357,7 +377,7 @@ public void testFloatAssociationRandom() throws Exception { // test getAllDims and getTopDims topDims = facets.getTopDims(10, 10); allDims = facets.getAllDims(10); - assertEquals(topDims, allDims); + assertFloatFacetResultsEqual(topDims, allDims); } /** @@ -366,7 +386,8 @@ public void testFloatAssociationRandom() throws Exception { */ public void testIntAndFloatAssocation() throws Exception { IndexSearcher searcher = newSearcher(reader); - FacetsCollector fc = searcher.search(new MatchAllDocsQuery(), new FacetsCollectorManager()); + FacetsCollector fc = + searcher.search(new TermQuery(new Term("match", "yes")), new FacetsCollectorManager()); Facets facets = getFloatSumFacets("$facets.float", taxoReader, config, fc, null); assertEquals( @@ -389,7 +410,8 @@ public void testIntAndFloatAssocation() throws Exception { public void 
testWrongIndexFieldName() throws Exception { IndexSearcher searcher = newSearcher(reader); - FacetsCollector fc = searcher.search(new MatchAllDocsQuery(), new FacetsCollectorManager()); + FacetsCollector fc = + searcher.search(new TermQuery(new Term("match", "yes")), new FacetsCollectorManager()); Facets facets = getFloatSumFacets("wrong_field", taxoReader, config, fc, null); expectThrows( IllegalArgumentException.class, @@ -538,6 +560,63 @@ private Facets getFloatSumFacets( } } + public void testNonPositiveAggregations() throws IOException { + Directory dir = newDirectory(); + Directory taxoDir = newDirectory(); + + TaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir); + + FacetsConfig config = new FacetsConfig(); + config.setIndexFieldName("a", "$float_facets"); + config.setIndexFieldName("b", "$int_facets"); + + RandomIndexWriter writer = new RandomIndexWriter(random(), dir); + Document d; + + d = new Document(); + // Positive association + d.add(new FloatAssociationFacetField(1f, "a", "1")); + d.add(new IntAssociationFacetField(1, "b", "1")); + writer.addDocument(config.build(taxoWriter, d)); + + d = new Document(); + // Zero association + d.add(new FloatAssociationFacetField(0f, "a", "2")); + d.add(new IntAssociationFacetField(0, "b", "2")); + writer.addDocument(config.build(taxoWriter, d)); + + d = new Document(); + // Negative association + d.add(new FloatAssociationFacetField(-1f, "a", "3")); + d.add(new IntAssociationFacetField(-1, "b", "3")); + writer.addDocument(config.build(taxoWriter, d)); + + IndexReader reader = writer.getReader(); + IOUtils.close(taxoWriter, writer); + + IndexSearcher searcher = newSearcher(reader); + Query q = new MatchAllDocsQuery(); + FacetsCollector fc = searcher.search(q, new FacetsCollectorManager()); + + TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir); + FloatTaxonomyFacets floatFacets = + new TaxonomyFacetFloatAssociations( + "$float_facets", taxoReader, config, fc, AssociationAggregationFunction.SUM); + IntTaxonomyFacets intFacets = + new TaxonomyFacetIntAssociations( + "$int_facets", taxoReader, config, fc, AssociationAggregationFunction.SUM); + + // "2" and "3" are included in the result despite having non-positive values associated to them. 
+ assertEquals( + "dim=a path=[] value=0.0 childCount=3\n 1 (1.0)\n 2 (0.0)\n 3 (-1.0)\n", + floatFacets.getTopChildren(10, "a").toString()); + assertEquals( + "dim=b path=[] value=0 childCount=3\n 1 (1)\n 2 (0)\n 3 (-1)\n", + intFacets.getTopChildren(10, "b").toString()); + + IOUtils.close(taxoReader, reader, taxoDir, dir); + } + private void validateInts( String dim, Map expected, @@ -613,6 +692,19 @@ private void validateFloats( } } + private void assertFloatFacetResultsEqual(List expected, List actual) { + assertEquals(expected.size(), actual.size()); + for (int i = 0; i < expected.size(); i++) { + FacetResult expectedResult = expected.get(i); + FacetResult actualResult = actual.get(i); + + assertEquals(expectedResult.dim, actualResult.dim); + assertArrayEquals(expectedResult.path, actualResult.path); + assertEquals((float) expectedResult.value, (float) actualResult.value, 2e-1); + assertEquals(expectedResult.childCount, actualResult.childCount); + } + } + // since we have no insight into the ordinals assigned to the values, we sort labels by value and // count in // ascending order in order to compare with expected results diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetValueSource.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetValueSource.java index 998d2b7b241..a467ff844d7 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetValueSource.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetValueSource.java @@ -552,53 +552,44 @@ public void testRollupValues() throws Exception { } // LUCENE-10495 - public void testSiblingsLoaded() throws Exception { - Directory indexDir = newDirectory(); - Directory taxoDir = newDirectory(); - - DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir); - IndexWriter iw = new IndexWriter(indexDir, newIndexWriterConfig(new MockAnalyzer(random()))); - FacetsConfig config = new FacetsConfig(); + public void testChildrenAndSiblingsLoaded() throws Exception { + boolean[] shouldLoad = new boolean[] {false, true}; + for (boolean load : shouldLoad) { + Directory indexDir = newDirectory(); + Directory taxoDir = newDirectory(); - config.setHierarchical("a", true); - config.setMultiValued("a", true); - config.setRequireDimCount("a", true); + DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir); + IndexWriter iw = new IndexWriter(indexDir, newIndexWriterConfig(new MockAnalyzer(random()))); + FacetsConfig config = new FacetsConfig(); - Document doc = new Document(); - doc.add(new FacetField("a", Integer.toString(2), "1")); - iw.addDocument(config.build(taxoWriter, doc)); + config.setHierarchical("a", true); + config.setMultiValued("a", load == false); + config.setRequireDimCount("a", true); - DirectoryReader r = DirectoryReader.open(iw); - DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter); + Document doc = new Document(); + doc.add(new FacetField("a", "1", "2")); + iw.addDocument(config.build(taxoWriter, doc)); - FacetsCollector sfc = - newSearcher(r).search(new MatchAllDocsQuery(), new FacetsCollectorManager()); + DirectoryReader r = DirectoryReader.open(iw); + DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter); - // Test MAX: - Facets facets = - new TaxonomyFacetFloatAssociations( - taxoReader, - config, - sfc, - AssociationAggregationFunction.MAX, - DoubleValuesSource.fromLongField("price")); + FacetsCollector sfc = + newSearcher(r).search(new 
MatchAllDocsQuery(), new FacetsCollectorManager()); - assertTrue(((TaxonomyFacets) facets).childrenLoaded()); - assertFalse(((TaxonomyFacets) facets).siblingsLoaded()); + TaxonomyFacets facets = + new TaxonomyFacetFloatAssociations( + taxoReader, + config, + sfc, + AssociationAggregationFunction.MAX, + DoubleValuesSource.fromLongField("price")); - // Test SUM: - facets = - new TaxonomyFacetFloatAssociations( - taxoReader, - config, - sfc, - AssociationAggregationFunction.SUM, - DoubleValuesSource.fromLongField("price")); - assertTrue(((TaxonomyFacets) facets).childrenLoaded()); - assertFalse(((TaxonomyFacets) facets).siblingsLoaded()); + assertEquals(load, facets.childrenLoaded()); + assertEquals(load, facets.siblingsLoaded()); - iw.close(); - IOUtils.close(taxoWriter, taxoReader, taxoDir, r, indexDir); + iw.close(); + IOUtils.close(taxoWriter, taxoReader, taxoDir, r, indexDir); + } } public void testCountAndSumScore() throws Exception {