From 4130c720422e29145d14eb82f5709e60018858aa Mon Sep 17 00:00:00 2001 From: Stefan Vodita <41467371+stefanvodita@users.noreply.github.com> Date: Fri, 5 Apr 2024 12:28:47 +0100 Subject: [PATCH] Reduce duplication in taxonomy facets; always do counts (#12966) This is a large change, refactoring most of the taxonomy facets code and changing internal behaviour, without changing the API. There are specific API changes this sets us up to do later, e.g. retrieving counts from aggregation facets. 1. Move most of the responsibility from TaxonomyFacets implementations to TaxonomyFacets itself. This reduces code duplication and enables future development. Addresses genericity issue mentioned in #12553. 2. As a consequence, introduce sparse values to FloatTaxonomyFacets, which previously used dense values always. This issue is part of #12576. 3. Compute counts for all taxonomy facets always, which enables us to add an API to retrieve counts for association facets in the future. Addresses #11282. 4. As a consequence of having counts, we can check whether we encountered a label while faceting (count > 0), while previously we relied on the aggregation value to be positive. Closes #12585. 5. Introduce the idea of doing multiple aggregations in one go, with association facets doing the aggregation they were already doing, plus a count. We can extend to an arbitrary number of aggregations, as suggested in #12546. 6. Don't change the API. The only change in behaviour users should notice is the fix for non-positive aggregation values, which were previously discarded. 7. Add tests which were missing for sparse/dense values and non-positive aggregations. --- lucene/CHANGES.txt | 5 + .../lucene/facet/StringValueFacetCounts.java | 32 +- .../lucene/facet/TopOrdAndFloatQueue.java | 51 +- .../lucene/facet/TopOrdAndIntQueue.java | 51 +- .../lucene/facet/TopOrdAndNumberQueue.java | 55 ++ .../AbstractSortedSetDocValueFacetCounts.java | 20 +- .../taxonomy/FastTaxonomyFacetCounts.java | 28 +- .../facet/taxonomy/FloatTaxonomyFacets.java | 394 +++---------- .../facet/taxonomy/IntTaxonomyFacets.java | 460 ++------------- .../TaxonomyFacetFloatAssociations.java | 12 +- .../TaxonomyFacetIntAssociations.java | 1 + .../lucene/facet/taxonomy/TaxonomyFacets.java | 539 +++++++++++++++++- .../TestTaxonomyFacetAssociations.java | 112 +++- .../TestTaxonomyFacetValueSource.java | 69 +-- 14 files changed, 940 insertions(+), 889 deletions(-) create mode 100644 lucene/facet/src/java/org/apache/lucene/facet/TopOrdAndNumberQueue.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 86ff1808156..f120b202b2a 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -64,6 +64,9 @@ Improvements * GITHUB#13202: Early terminate graph and exact searches of AbstractKnnVectorQuery to follow timeout set from IndexSearcher#setTimeout(QueryTimeout). (Kaival Parikh) +* GITHUB#12966: Move most of the responsibility from TaxonomyFacets implementations to TaxonomyFacets itself. + This reduces code duplication and enables future development. (Stefan Vodita) + Optimizations --------------------- @@ -126,6 +129,8 @@ Bug Fixes * GITHUB#13206: Subtract deleted file size from the cache size of NRTCachingDirectory. (Jean-François Boeuf) +* GITHUB#12966: Aggregation facets no longer assume that aggregation values are positive. 
(Stefan Vodita) + Build --------------------- diff --git a/lucene/facet/src/java/org/apache/lucene/facet/StringValueFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/StringValueFacetCounts.java index 335f93d56d3..e6c97779905 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/StringValueFacetCounts.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/StringValueFacetCounts.java @@ -180,7 +180,7 @@ public FacetResult getTopChildren(int topN, String dim, String... path) throws I topN = Math.min(topN, cardinality); TopOrdAndIntQueue q = null; - TopOrdAndIntQueue.OrdAndValue reuse = null; + TopOrdAndIntQueue.OrdAndInt reuse = null; int bottomCount = 0; int bottomOrd = Integer.MAX_VALUE; int childCount = 0; // total number of labels with non-zero count @@ -191,18 +191,18 @@ public FacetResult getTopChildren(int topN, String dim, String... path) throws I int ord = cursor.key; int count = cursor.value; if (count > bottomCount || (count == bottomCount && ord < bottomOrd)) { - if (reuse == null) { - reuse = new TopOrdAndIntQueue.OrdAndValue(); - } - reuse.ord = ord; - reuse.value = count; if (q == null) { // Lazy init for sparse case: q = new TopOrdAndIntQueue(topN); } - reuse = q.insertWithOverflow(reuse); + if (reuse == null) { + reuse = (TopOrdAndIntQueue.OrdAndInt) q.newOrdAndValue(); + } + reuse.ord = ord; + reuse.value = count; + reuse = (TopOrdAndIntQueue.OrdAndInt) q.insertWithOverflow(reuse); if (q.size() == topN) { - bottomCount = q.top().value; + bottomCount = ((TopOrdAndIntQueue.OrdAndInt) q.top()).value; bottomOrd = q.top().ord; } } @@ -213,18 +213,18 @@ public FacetResult getTopChildren(int topN, String dim, String... path) throws I if (count != 0) { childCount++; if (count > bottomCount || (count == bottomCount && i < bottomOrd)) { - if (reuse == null) { - reuse = new TopOrdAndIntQueue.OrdAndValue(); - } - reuse.ord = i; - reuse.value = count; if (q == null) { // Lazy init for sparse case: q = new TopOrdAndIntQueue(topN); } - reuse = q.insertWithOverflow(reuse); + if (reuse == null) { + reuse = (TopOrdAndIntQueue.OrdAndInt) q.newOrdAndValue(); + } + reuse.ord = i; + reuse.value = count; + reuse = (TopOrdAndIntQueue.OrdAndInt) q.insertWithOverflow(reuse); if (q.size() == topN) { - bottomCount = q.top().value; + bottomCount = ((TopOrdAndIntQueue.OrdAndInt) q.top()).value; bottomOrd = q.top().ord; } } @@ -235,7 +235,7 @@ public FacetResult getTopChildren(int topN, String dim, String... path) throws I int resultCount = q == null ? 0 : q.size(); LabelAndValue[] labelValues = new LabelAndValue[resultCount]; for (int i = labelValues.length - 1; i >= 0; i--) { - TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop(); + TopOrdAndIntQueue.OrdAndInt ordAndValue = (TopOrdAndIntQueue.OrdAndInt) q.pop(); final BytesRef term = docValues.lookupOrd(ordAndValue.ord); labelValues[i] = new LabelAndValue(term.utf8ToString(), ordAndValue.value); } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/TopOrdAndFloatQueue.java b/lucene/facet/src/java/org/apache/lucene/facet/TopOrdAndFloatQueue.java index 1166db37852..f5c43fd6dca 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/TopOrdAndFloatQueue.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/TopOrdAndFloatQueue.java @@ -16,37 +16,42 @@ */ package org.apache.lucene.facet; -import org.apache.lucene.util.PriorityQueue; +/** Keeps highest results, first by largest float value, then tie-break by smallest ord. 
*/ +public class TopOrdAndFloatQueue extends TopOrdAndNumberQueue { -/** Keeps highest results, first by largest float value, then tie break by smallest ord. */ -public class TopOrdAndFloatQueue extends PriorityQueue { - - /** Holds a single entry. */ - public static final class OrdAndValue { - - /** Ordinal of the entry. */ - public int ord; + /** Sole constructor. */ + public TopOrdAndFloatQueue(int topN) { + super(topN); + } - /** Value associated with the ordinal. */ + /** Holds an ordinal and a float value. */ + public static final class OrdAndFloat extends OrdAndValue { + /** The value corresponding to the ordinal is a float. */ public float value; /** Default constructor. */ - public OrdAndValue() {} - } + public OrdAndFloat() {} + + @Override + public boolean lessThan(OrdAndValue other) { + OrdAndFloat otherOrdAndFloat = (OrdAndFloat) other; + if (value < otherOrdAndFloat.value) { + return true; + } + if (value > otherOrdAndFloat.value) { + return false; + } + return ord > otherOrdAndFloat.ord; + } - /** Sole constructor. */ - public TopOrdAndFloatQueue(int topN) { - super(topN); + @Override + public Number getValue() { + return value; + } } @Override - protected boolean lessThan(OrdAndValue a, OrdAndValue b) { - if (a.value < b.value) { - return true; - } else if (a.value > b.value) { - return false; - } else { - return a.ord > b.ord; - } + public OrdAndValue newOrdAndValue() { + return new OrdAndFloat(); } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/TopOrdAndIntQueue.java b/lucene/facet/src/java/org/apache/lucene/facet/TopOrdAndIntQueue.java index 2652dfb73c4..a34fe793a2e 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/TopOrdAndIntQueue.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/TopOrdAndIntQueue.java @@ -16,37 +16,42 @@ */ package org.apache.lucene.facet; -import org.apache.lucene.util.PriorityQueue; +/** Keeps highest results, first by largest int value, then tie-break by smallest ord. */ +public class TopOrdAndIntQueue extends TopOrdAndNumberQueue { -/** Keeps highest results, first by largest int value, then tie break by smallest ord. */ -public class TopOrdAndIntQueue extends PriorityQueue { - - /** Holds a single entry. */ - public static final class OrdAndValue { - - /** Ordinal of the entry. */ - public int ord; + /** Sole constructor. */ + public TopOrdAndIntQueue(int topN) { + super(topN); + } - /** Value associated with the ordinal. */ + /** Holds an ordinal and an int value. */ + public static final class OrdAndInt extends OrdAndValue { + /** The value corresponding to the ordinal is an int. */ public int value; /** Default constructor. */ - public OrdAndValue() {} - } + public OrdAndInt() {} + + @Override + public boolean lessThan(OrdAndValue other) { + OrdAndInt otherOrdAndInt = (OrdAndInt) other; + if (value < otherOrdAndInt.value) { + return true; + } + if (value > otherOrdAndInt.value) { + return false; + } + return ord > otherOrdAndInt.ord; + } - /** Sole constructor. 
*/ - public TopOrdAndIntQueue(int topN) { - super(topN); + @Override + public Number getValue() { + return value; + } } @Override - protected boolean lessThan(OrdAndValue a, OrdAndValue b) { - if (a.value < b.value) { - return true; - } else if (a.value > b.value) { - return false; - } else { - return a.ord > b.ord; - } + public OrdAndValue newOrdAndValue() { + return new OrdAndInt(); } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/TopOrdAndNumberQueue.java b/lucene/facet/src/java/org/apache/lucene/facet/TopOrdAndNumberQueue.java new file mode 100644 index 00000000000..07b91f9b5fb --- /dev/null +++ b/lucene/facet/src/java/org/apache/lucene/facet/TopOrdAndNumberQueue.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.facet; + +import org.apache.lucene.util.PriorityQueue; + +/** Keeps highest results, first by largest value, then tie-break by smallest ord. */ +public abstract class TopOrdAndNumberQueue extends PriorityQueue { + + /** Holds a single entry. */ + public abstract static class OrdAndValue { + + /** Ordinal of the entry. */ + public int ord; + + /** Default constructor. */ + public OrdAndValue() {} + + /** Compare with another {@link OrdAndValue}. */ + public abstract boolean lessThan(OrdAndValue other); + + /** Get the value stored in this {@link OrdAndValue}. */ + public abstract Number getValue(); + } + + /** Sole constructor. */ + public TopOrdAndNumberQueue(int topN) { + super(topN); + } + + @Override + public boolean lessThan(TopOrdAndNumberQueue.OrdAndValue a, TopOrdAndNumberQueue.OrdAndValue b) { + return a.lessThan(b); + } + + /** + * Create a new {@link org.apache.lucene.facet.TopOrdAndNumberQueue.OrdAndValue} of the + * appropriate type. 
+ */ + public abstract OrdAndValue newOrdAndValue(); +} diff --git a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/AbstractSortedSetDocValueFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/AbstractSortedSetDocValueFacetCounts.java index ac42cf5aa8a..0bde5a240e8 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/AbstractSortedSetDocValueFacetCounts.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/AbstractSortedSetDocValueFacetCounts.java @@ -328,7 +328,7 @@ private TopChildrenForPath computeTopChildren( int pathCount = 0; int childCount = 0; - TopOrdAndIntQueue.OrdAndValue reuse = null; + TopOrdAndIntQueue.OrdAndInt reuse = null; while (childOrds.hasNext()) { int ord = childOrds.next(); int count = getCount(ord); @@ -336,20 +336,20 @@ private TopChildrenForPath computeTopChildren( pathCount += count; childCount++; if (count > bottomCount || (count == bottomCount && ord < bottomOrd)) { - if (reuse == null) { - reuse = new TopOrdAndIntQueue.OrdAndValue(); - } - reuse.ord = ord; - reuse.value = count; if (q == null) { // Lazy init, so we don't create this for the // sparse case unnecessarily q = new TopOrdAndIntQueue(topN); } - reuse = q.insertWithOverflow(reuse); + if (reuse == null) { + reuse = (TopOrdAndIntQueue.OrdAndInt) q.newOrdAndValue(); + } + reuse.ord = ord; + reuse.value = count; + reuse = (TopOrdAndIntQueue.OrdAndInt) q.insertWithOverflow(reuse); if (q.size() == topN) { - bottomCount = q.top().value; - bottomOrd = q.top().value; + bottomCount = ((TopOrdAndIntQueue.OrdAndInt) q.top()).value; + bottomOrd = q.top().ord; } } } @@ -397,7 +397,7 @@ private FacetResult createFacetResult( LabelAndValue[] labelValues = new LabelAndValue[q.size()]; for (int i = labelValues.length - 1; i >= 0; i--) { - TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop(); + TopOrdAndIntQueue.OrdAndInt ordAndValue = (TopOrdAndIntQueue.OrdAndInt) q.pop(); assert ordAndValue != null; final BytesRef term = dv.lookupOrd(ordAndValue.ord); String[] parts = FacetsConfig.stringToPath(term.utf8ToString()); diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FastTaxonomyFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FastTaxonomyFacetCounts.java index b6098f752bd..c06588cb0e9 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FastTaxonomyFacetCounts.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FastTaxonomyFacetCounts.java @@ -38,7 +38,7 @@ * * @lucene.experimental */ -public class FastTaxonomyFacetCounts extends IntTaxonomyFacets { +public class FastTaxonomyFacetCounts extends TaxonomyFacets { /** Create {@code FastTaxonomyFacetCounts}, which also counts all facet labels. 
*/ public FastTaxonomyFacetCounts(TaxonomyReader taxoReader, FacetsConfig config, FacetsCollector fc) @@ -54,7 +54,7 @@ public FastTaxonomyFacetCounts(TaxonomyReader taxoReader, FacetsConfig config, F public FastTaxonomyFacetCounts( String indexFieldName, TaxonomyReader taxoReader, FacetsConfig config, FacetsCollector fc) throws IOException { - super(indexFieldName, taxoReader, config, AssociationAggregationFunction.SUM, fc); + super(indexFieldName, taxoReader, config, fc); count(fc.getMatchingDocs()); } @@ -66,7 +66,7 @@ public FastTaxonomyFacetCounts( public FastTaxonomyFacetCounts( String indexFieldName, IndexReader reader, TaxonomyReader taxoReader, FacetsConfig config) throws IOException { - super(indexFieldName, taxoReader, config, AssociationAggregationFunction.SUM, null); + super(indexFieldName, taxoReader, config, null); countAll(reader); } @@ -89,26 +89,26 @@ private void count(List matchingDocs) throws IOException { ConjunctionUtils.intersectIterators(Arrays.asList(hits.bits.iterator(), valuesIt)); if (singleValued != null) { - if (values != null) { + if (counts != null) { while (it.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { - values[(int) singleValued.longValue()]++; + counts[(int) singleValued.longValue()]++; } } else { while (it.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { - sparseValues.addTo((int) singleValued.longValue(), 1); + sparseCounts.addTo((int) singleValued.longValue(), 1); } } } else { - if (values != null) { + if (counts != null) { while (it.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { for (int i = 0; i < multiValued.docValueCount(); i++) { - values[(int) multiValued.nextValue()]++; + counts[(int) multiValued.nextValue()]++; } } } else { while (it.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { for (int i = 0; i < multiValued.docValueCount(); i++) { - sparseValues.addTo((int) multiValued.nextValue(), 1); + sparseCounts.addTo((int) multiValued.nextValue(), 1); } } } @@ -126,7 +126,7 @@ private void countAll(IndexReader reader) throws IOException { continue; } initializeValueCounters(); - assert values != null; + assert counts != null; Bits liveDocs = context.reader().getLiveDocs(); NumericDocValues singleValued = DocValues.unwrapSingleton(multiValued); @@ -136,7 +136,7 @@ private void countAll(IndexReader reader) throws IOException { for (int doc = singleValued.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = singleValued.nextDoc()) { - values[(int) singleValued.longValue()]++; + counts[(int) singleValued.longValue()]++; } } else { for (int doc = singleValued.nextDoc(); @@ -145,7 +145,7 @@ private void countAll(IndexReader reader) throws IOException { if (liveDocs.get(doc) == false) { continue; } - values[(int) singleValued.longValue()]++; + counts[(int) singleValued.longValue()]++; } } } else { @@ -154,7 +154,7 @@ private void countAll(IndexReader reader) throws IOException { doc != DocIdSetIterator.NO_MORE_DOCS; doc = multiValued.nextDoc()) { for (int i = 0; i < multiValued.docValueCount(); i++) { - values[(int) multiValued.nextValue()]++; + counts[(int) multiValued.nextValue()]++; } } } else { @@ -165,7 +165,7 @@ private void countAll(IndexReader reader) throws IOException { continue; } for (int i = 0; i < multiValued.docValueCount(); i++) { - values[(int) multiValued.nextValue()]++; + counts[(int) multiValued.nextValue()]++; } } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FloatTaxonomyFacets.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FloatTaxonomyFacets.java index c456e77f17d..b0ae828e18f 100644 --- 
a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FloatTaxonomyFacets.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FloatTaxonomyFacets.java @@ -16,21 +16,12 @@ */ package org.apache.lucene.facet.taxonomy; -import com.carrotsearch.hppc.FloatArrayList; -import com.carrotsearch.hppc.IntArrayList; +import com.carrotsearch.hppc.IntFloatHashMap; import java.io.IOException; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import org.apache.lucene.facet.FacetResult; import org.apache.lucene.facet.FacetsCollector; import org.apache.lucene.facet.FacetsConfig; -import org.apache.lucene.facet.FacetsConfig.DimConfig; -import org.apache.lucene.facet.LabelAndValue; import org.apache.lucene.facet.TopOrdAndFloatQueue; -import org.apache.lucene.util.PriorityQueue; +import org.apache.lucene.facet.TopOrdAndNumberQueue; /** * Base class for all taxonomy-based facets that aggregate to a per-ords float[]. @@ -44,14 +35,15 @@ @Deprecated public abstract class FloatTaxonomyFacets extends TaxonomyFacets { - // TODO: also use native hash map for sparse collection, like IntTaxonomyFacets - /** Aggregation function used for combining values. */ protected final AssociationAggregationFunction aggregationFunction; /** Per-ordinal value. */ protected float[] values; + /** Sparse ordinal values. */ + IntFloatHashMap sparseValues; + /** * Constructor that defaults the aggregation function to {@link * AssociationAggregationFunction#SUM}. @@ -73,363 +65,107 @@ protected FloatTaxonomyFacets( throws IOException { super(indexFieldName, taxoReader, config, fc); this.aggregationFunction = aggregationFunction; + valueComparator = (o1, o2) -> Float.compare(o1.floatValue(), o2.floatValue()); } @Override - boolean hasValues() { - return values != null; - } + protected void initializeValueCounters() { + if (initialized) { + return; + } + super.initializeValueCounters(); - void initializeValueCounters() { - if (values == null) { + assert sparseValues == null && values == null; + if (sparseCounts != null) { + sparseValues = new IntFloatHashMap(); + } else { values = new float[taxoReader.getSize()]; } } - /** Rolls up any single-valued hierarchical dimensions. */ - protected void rollup() throws IOException { - if (values == null) { - return; - } - - // Rollup any necessary dims: - ParallelTaxonomyArrays.IntArray children = getChildren(); - for (Map.Entry ent : config.getDimConfigs().entrySet()) { - String dim = ent.getKey(); - DimConfig ft = ent.getValue(); - if (ft.hierarchical && ft.multiValued == false) { - int dimRootOrd = taxoReader.getOrdinal(new FacetLabel(dim)); - assert dimRootOrd > 0; - float newValue = - aggregationFunction.aggregate(values[dimRootOrd], rollup(children.get(dimRootOrd))); - values[dimRootOrd] = newValue; - } + /** Set the value associated with this ordinal to {@code newValue}. 
*/ + void setValue(int ordinal, float newValue) { + if (sparseValues != null) { + sparseValues.put(ordinal, newValue); + } else { + values[ordinal] = newValue; } } - private float rollup(int ord) throws IOException { - ParallelTaxonomyArrays.IntArray children = getChildren(); - ParallelTaxonomyArrays.IntArray siblings = getSiblings(); - float aggregationValue = 0f; - while (ord != TaxonomyReader.INVALID_ORDINAL) { - float childValue = aggregationFunction.aggregate(values[ord], rollup(children.get(ord))); - values[ord] = childValue; - aggregationValue = aggregationFunction.aggregate(aggregationValue, childValue); - ord = siblings.get(ord); + /** Get the value associated with this ordinal. */ + float getValue(int ordinal) { + if (sparseValues != null) { + return sparseValues.get(ordinal); + } else { + return values[ordinal]; } - return aggregationValue; } @Override - public Number getSpecificValue(String dim, String... path) throws IOException { - DimConfig dimConfig = verifyDim(dim); - if (path.length == 0) { - if (dimConfig.hierarchical && dimConfig.multiValued == false) { - // ok: rolled up at search time - } else if (dimConfig.requireDimCount && dimConfig.multiValued) { - // ok: we indexed all ords at index time - } else { - throw new IllegalArgumentException( - "cannot return dimension-level value alone; use getTopChildren instead"); - } - } - int ord = taxoReader.getOrdinal(new FacetLabel(dim, path)); - if (ord < 0) { - return -1; - } - return values == null ? 0 : values[ord]; + protected Number getAggregationValue(int ordinal) { + return getValue(ordinal); } @Override - public FacetResult getAllChildren(String dim, String... path) throws IOException { - DimConfig dimConfig = verifyDim(dim); - FacetLabel cp = new FacetLabel(dim, path); - int dimOrd = taxoReader.getOrdinal(cp); - if (dimOrd == -1) { - return null; - } - - if (values == null) { - return null; - } - - ParallelTaxonomyArrays.IntArray children = getChildren(); - ParallelTaxonomyArrays.IntArray siblings = getSiblings(); - - int ord = children.get(dimOrd); - float aggregatedValue = 0; - - IntArrayList ordinals = new IntArrayList(); - FloatArrayList ordValues = new FloatArrayList(); - - while (ord != TaxonomyReader.INVALID_ORDINAL) { - if (values[ord] > 0) { - aggregatedValue = aggregationFunction.aggregate(aggregatedValue, values[ord]); - ordinals.add(ord); - ordValues.add(values[ord]); - } - ord = siblings.get(ord); - } - - if (aggregatedValue == 0) { - return null; - } - - if (dimConfig.multiValued) { - if (dimConfig.requireDimCount) { - aggregatedValue = values[dimOrd]; - } else { - // Our sum'd count is not correct, in general: - aggregatedValue = -1; - } - } else { - // Our sum'd dim count is accurate, so we keep it - } - - // TODO: It would be nice if TaxonomyReader let us pass in a buffer + size so we didn't have to - // do an array copy here: - FacetLabel[] bulkPath = taxoReader.getBulkPath(ordinals.toArray()); - - LabelAndValue[] labelValues = new LabelAndValue[ordValues.size()]; - for (int i = 0; i < labelValues.length; i++) { - labelValues[i] = new LabelAndValue(bulkPath[i].components[cp.length], ordValues.get(i)); - } - return new FacetResult(dim, path, aggregatedValue, labelValues, ordinals.size()); + protected Number aggregate(Number existingVal, Number newVal) { + return aggregationFunction.aggregate(existingVal.floatValue(), newVal.floatValue()); } @Override - public FacetResult getTopChildren(int topN, String dim, String... 
path) throws IOException { - validateTopN(topN); - DimConfig dimConfig = verifyDim(dim); - FacetLabel cp = new FacetLabel(dim, path); - int dimOrd = taxoReader.getOrdinal(cp); - if (dimOrd == -1) { - return null; - } - - if (values == null) { - return null; - } - - TopChildrenForPath topChildrenForPath = getTopChildrenForPath(dimConfig, dimOrd, topN); - return createFacetResult(topChildrenForPath, dim, path); + protected void updateValueFromRollup(int ordinal, int childOrdinal) throws IOException { + super.updateValueFromRollup(ordinal, childOrdinal); + float currentValue = getValue(ordinal); + float newValue = aggregationFunction.aggregate(currentValue, rollup(childOrdinal)); + setValue(ordinal, newValue); } - /** - * Determine the top-n children for a specified dimension + path. Results are in an intermediate - * form. - */ - private TopChildrenForPath getTopChildrenForPath(DimConfig dimConfig, int pathOrd, int topN) - throws IOException { + @Override + protected TopOrdAndNumberQueue makeTopOrdAndNumberQueue(int topN) { + return new TopOrdAndFloatQueue(Math.min(taxoReader.getSize(), topN)); + } - TopOrdAndFloatQueue q = new TopOrdAndFloatQueue(Math.min(taxoReader.getSize(), topN)); - float bottomValue = 0; - int bottomOrd = Integer.MAX_VALUE; + @Override + protected Number missingAggregationValue() { + return -1f; + } + private float rollup(int ord) throws IOException { ParallelTaxonomyArrays.IntArray children = getChildren(); ParallelTaxonomyArrays.IntArray siblings = getSiblings(); - - int ord = children.get(pathOrd); - float aggregatedValue = 0; - int childCount = 0; - - TopOrdAndFloatQueue.OrdAndValue reuse = null; + float aggregatedValue = 0f; while (ord != TaxonomyReader.INVALID_ORDINAL) { - float value = values[ord]; - if (value > 0) { - aggregatedValue = aggregationFunction.aggregate(aggregatedValue, value); - childCount++; - if (value > bottomValue || (value == bottomValue && ord < bottomOrd)) { - if (reuse == null) { - reuse = new TopOrdAndFloatQueue.OrdAndValue(); - } - reuse.ord = ord; - reuse.value = value; - reuse = q.insertWithOverflow(reuse); - if (q.size() == topN) { - bottomValue = q.top().value; - bottomOrd = q.top().ord; - } - } - } - + updateValueFromRollup(ord, children.get(ord)); + aggregatedValue = aggregationFunction.aggregate(aggregatedValue, getValue(ord)); ord = siblings.get(ord); } - - if (dimConfig.multiValued) { - if (dimConfig.requireDimCount) { - aggregatedValue = values[pathOrd]; - } else { - // Our sum'd count is not correct, in general: - aggregatedValue = -1; - } - } - return new TopChildrenForPath(aggregatedValue, childCount, q); - } - - /** - * Create a FacetResult for the provided dim + path and intermediate results. Does the extra work - * of resolving ordinals -> labels, etc. Will return null if there are no children. - */ - FacetResult createFacetResult(TopChildrenForPath topChildrenForPath, String dim, String... 
path) - throws IOException { - // If the intermediate result is null or there are no children, we return null: - if (topChildrenForPath == null || topChildrenForPath.childCount == 0) { - return null; - } - - TopOrdAndFloatQueue q = topChildrenForPath.childQueue; - assert q != null; - - LabelAndValue[] labelValues = new LabelAndValue[q.size()]; - int[] ordinals = new int[labelValues.length]; - float[] values = new float[labelValues.length]; - - for (int i = labelValues.length - 1; i >= 0; i--) { - TopOrdAndFloatQueue.OrdAndValue ordAndValue = q.pop(); - assert ordAndValue != null; - ordinals[i] = ordAndValue.ord; - values[i] = ordAndValue.value; - } - - FacetLabel[] bulkPath = taxoReader.getBulkPath(ordinals); - // The path component we're interested in is the one immediately after the provided path. We - // add 1 here to also account for the dim: - int childComponentIdx = path.length + 1; - for (int i = 0; i < labelValues.length; i++) { - labelValues[i] = new LabelAndValue(bulkPath[i].components[childComponentIdx], values[i]); - } - - return new FacetResult( - dim, path, topChildrenForPath.pathValue, labelValues, topChildrenForPath.childCount); + return aggregatedValue; } @Override - public List getTopDims(int topNDims, int topNChildren) throws IOException { - validateTopN(topNDims); - validateTopN(topNChildren); - - if (values == null) { - return Collections.emptyList(); - } - - // get existing children and siblings ordinal array from TaxonomyFacets - ParallelTaxonomyArrays.IntArray children = getChildren(); - ParallelTaxonomyArrays.IntArray siblings = getSiblings(); - - // Create priority queue to store top dimensions and sort by their aggregated values/hits and - // string values. - PriorityQueue pq = - new PriorityQueue<>(topNDims) { - @Override - protected boolean lessThan(DimValue a, DimValue b) { - if (a.value > b.value) { - return false; - } else if (a.value < b.value) { - return true; - } else { - return a.dim.compareTo(b.dim) > 0; - } - } - }; + protected void setIncomingValue(TopOrdAndNumberQueue.OrdAndValue incomingOrdAndValue, int ord) { + ((TopOrdAndFloatQueue.OrdAndFloat) incomingOrdAndValue).value = getValue(ord); + } - // Keep track of intermediate results, if we compute them, so we can reuse them later: - Map intermediateResults = null; + protected class FloatAggregatedValue extends AggregatedValue { + private float value; - // iterate over children and siblings ordinals for all dims - int ord = children.get(TaxonomyReader.ROOT_ORDINAL); - while (ord != TaxonomyReader.INVALID_ORDINAL) { - String dim = taxoReader.getPath(ord).components[0]; - FacetsConfig.DimConfig dimConfig = config.getDimConfig(dim); - if (dimConfig.indexFieldName.equals(indexFieldName)) { - FacetLabel cp = new FacetLabel(dim); - int dimOrd = taxoReader.getOrdinal(cp); - if (dimOrd != -1) { - float dimValue; - if (dimConfig.multiValued) { - if (dimConfig.requireDimCount) { - // If the dim is configured as multi-valued and requires dim counts, we can access - // an accurate count for the dim computed at indexing time: - dimValue = values[dimOrd]; - } else { - // If the dim is configured as multi-valued but not requiring dim counts, we cannot - // compute an accurate dim count, and use -1 as a place-holder: - dimValue = -1; - } - } else { - // Single-valued dims require aggregating descendant paths to get accurate dim counts - // since we don't directly access ancestry paths: - // TODO: We could consider indexing dim counts directly if getTopDims is a common - // use-case. 
- TopChildrenForPath topChildrenForPath = - getTopChildrenForPath(dimConfig, dimOrd, topNChildren); - if (intermediateResults == null) { - intermediateResults = new HashMap<>(); - } - intermediateResults.put(dim, topChildrenForPath); - dimValue = topChildrenForPath.pathValue; - } - if (dimValue != 0) { - if (pq.size() < topNDims) { - pq.add(new DimValue(dim, dimOrd, dimValue)); - } else { - if (dimValue > pq.top().value - || (dimValue == pq.top().value && dim.compareTo(pq.top().dim) < 0)) { - DimValue bottomDim = pq.top(); - bottomDim.dim = dim; - bottomDim.value = dimValue; - pq.updateTop(); - } - } - } - } - } - ord = siblings.get(ord); + public FloatAggregatedValue(float value) { + this.value = value; } - FacetResult[] results = new FacetResult[pq.size()]; - - while (pq.size() > 0) { - DimValue dimValue = pq.pop(); - assert dimValue != null; - String dim = dimValue.dim; - TopChildrenForPath topChildrenForPath = null; - if (intermediateResults != null) { - topChildrenForPath = intermediateResults.get(dim); - } - if (topChildrenForPath == null) { - FacetsConfig.DimConfig dimConfig = config.getDimConfig(dim); - topChildrenForPath = getTopChildrenForPath(dimConfig, dimValue.dimOrd, topNChildren); - } - FacetResult facetResult = createFacetResult(topChildrenForPath, dim); - assert facetResult != null; - results[pq.size()] = facetResult; + @Override + public void aggregate(int ord) { + value = aggregationFunction.aggregate(value, getValue(ord)); } - return Arrays.asList(results); - } - private static class DimValue { - String dim; - int dimOrd; - float value; - - DimValue(String dim, int dimOrd, float value) { - this.dim = dim; - this.dimOrd = dimOrd; - this.value = value; + @Override + public Number get() { + return value; } } - /** Intermediate result to store top children for a given path before resolving labels, etc. 
*/ - private static class TopChildrenForPath { - private final float pathValue; - private final int childCount; - private final TopOrdAndFloatQueue childQueue; - - TopChildrenForPath(float pathValue, int childCount, TopOrdAndFloatQueue childQueue) { - this.pathValue = pathValue; - this.childCount = childCount; - this.childQueue = childQueue; - } + @Override + protected AggregatedValue newAggregatedValue() { + return new FloatAggregatedValue(0f); } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/IntTaxonomyFacets.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/IntTaxonomyFacets.java index d46ad78acef..1181fec477b 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/IntTaxonomyFacets.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/IntTaxonomyFacets.java @@ -16,23 +16,13 @@ */ package org.apache.lucene.facet.taxonomy; -import com.carrotsearch.hppc.IntArrayList; import com.carrotsearch.hppc.IntIntHashMap; -import com.carrotsearch.hppc.cursors.IntIntCursor; import java.io.IOException; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import org.apache.lucene.facet.FacetResult; +import java.util.Comparator; import org.apache.lucene.facet.FacetsCollector; -import org.apache.lucene.facet.FacetsCollector.MatchingDocs; import org.apache.lucene.facet.FacetsConfig; -import org.apache.lucene.facet.FacetsConfig.DimConfig; -import org.apache.lucene.facet.LabelAndValue; import org.apache.lucene.facet.TopOrdAndIntQueue; -import org.apache.lucene.util.PriorityQueue; +import org.apache.lucene.facet.TopOrdAndNumberQueue; /** * Base class for all taxonomy-based facets that aggregate to a per-ords int[]. @@ -75,16 +65,7 @@ public abstract class IntTaxonomyFacets extends TaxonomyFacets { protected IntTaxonomyFacets( String indexFieldName, TaxonomyReader taxoReader, FacetsConfig config, FacetsCollector fc) throws IOException { - super(indexFieldName, taxoReader, config); - this.aggregationFunction = AssociationAggregationFunction.SUM; - - if (useHashTable(fc, taxoReader)) { - sparseValues = new IntIntHashMap(); - values = null; - } else { - sparseValues = null; - values = new int[taxoReader.getSize()]; - } + this(indexFieldName, taxoReader, config, AssociationAggregationFunction.SUM, fc); } /** Constructor that uses the provided aggregation function. */ @@ -97,49 +78,24 @@ protected IntTaxonomyFacets( throws IOException { super(indexFieldName, taxoReader, config, fc); this.aggregationFunction = aggregationFunction; + valueComparator = Comparator.comparingInt(o -> (int) o); } @Override - boolean hasValues() { - return initialized; - } - - void initializeValueCounters() { + protected void initializeValueCounters() { if (initialized) { return; } - initialized = true; + super.initializeValueCounters(); + assert sparseValues == null && values == null; - if (useHashTable(fc, taxoReader)) { + if (sparseCounts != null) { sparseValues = new IntIntHashMap(); } else { values = new int[taxoReader.getSize()]; } } - /** Return true if a sparse hash table should be used for counting, instead of a dense int[]. 
*/ - protected boolean useHashTable(FacetsCollector fc, TaxonomyReader taxoReader) { - if (taxoReader.getSize() < 1024) { - // small number of unique values: use an array - return false; - } - - if (fc == null) { - // counting all docs: use an array - return false; - } - - int maxDoc = 0; - int sumTotalHits = 0; - for (MatchingDocs docs : fc.getMatchingDocs()) { - sumTotalHits += docs.totalHits; - maxDoc += docs.context.reader().maxDoc(); - } - - // if our result set is < 10% of the index, we collect sparsely (use hash map): - return sumTotalHits < maxDoc / 10; - } - /** Increment the count for this ordinal by 1. */ protected void increment(int ordinal) { increment(ordinal, 1); @@ -154,7 +110,7 @@ protected void increment(int ordinal, int amount) { } } - /** Set the count for this ordinal to {@code newValue}. */ + /** Set the value associated with this ordinal to {@code newValue}. */ void setValue(int ordinal, int newValue) { if (sparseValues != null) { sparseValues.put(ordinal, newValue); @@ -163,8 +119,8 @@ void setValue(int ordinal, int newValue) { } } - /** Get the count for this ordinal. */ - protected int getValue(int ordinal) { + /** Get the value associated with this ordinal. */ + int getValue(int ordinal) { if (sparseValues != null) { return sparseValues.get(ordinal); } else { @@ -172,33 +128,22 @@ protected int getValue(int ordinal) { } } - /** Rolls up any single-valued hierarchical dimensions. */ - protected void rollup() throws IOException { - if (initialized == false) { - return; - } + @Override + protected Number getAggregationValue(int ordinal) { + return getValue(ordinal); + } - // Rollup any necessary dims: - ParallelTaxonomyArrays.IntArray children = null; - for (Map.Entry ent : config.getDimConfigs().entrySet()) { - String dim = ent.getKey(); - DimConfig ft = ent.getValue(); - if (ft.hierarchical && ft.multiValued == false) { - int dimRootOrd = taxoReader.getOrdinal(new FacetLabel(dim)); - // It can be -1 if this field was declared in the - // config but never indexed: - if (dimRootOrd > 0) { - if (children == null) { - // lazy init - children = getChildren(); - } - int currentValue = getValue(dimRootOrd); - int newValue = - aggregationFunction.aggregate(currentValue, rollup(children.get(dimRootOrd))); - setValue(dimRootOrd, newValue); - } - } - } + @Override + protected Number aggregate(Number existingVal, Number newVal) { + return aggregationFunction.aggregate((int) existingVal, (int) newVal); + } + + @Override + protected void updateValueFromRollup(int ordinal, int childOrdinal) throws IOException { + super.updateValueFromRollup(ordinal, childOrdinal); + int currentValue = getValue(ordinal); + int newValue = aggregationFunction.aggregate(currentValue, rollup(childOrdinal)); + setValue(ordinal, newValue); } private int rollup(int ord) throws IOException { @@ -206,9 +151,7 @@ private int rollup(int ord) throws IOException { ParallelTaxonomyArrays.IntArray siblings = getSiblings(); int aggregatedValue = 0; while (ord != TaxonomyReader.INVALID_ORDINAL) { - int currentValue = getValue(ord); - int newValue = aggregationFunction.aggregate(currentValue, rollup(children.get(ord))); - setValue(ord, newValue); + updateValueFromRollup(ord, children.get(ord)); aggregatedValue = aggregationFunction.aggregate(aggregatedValue, getValue(ord)); ord = siblings.get(ord); } @@ -216,351 +159,30 @@ private int rollup(int ord) throws IOException { } @Override - public Number getSpecificValue(String dim, String... 
path) throws IOException { - DimConfig dimConfig = verifyDim(dim); - if (path.length == 0) { - if (dimConfig.hierarchical && dimConfig.multiValued == false) { - // ok: rolled up at search time - } else if (dimConfig.requireDimCount && dimConfig.multiValued) { - // ok: we indexed all ords at index time - } else { - throw new IllegalArgumentException( - "cannot return dimension-level value alone; use getTopChildren instead"); - } - } - int ord = taxoReader.getOrdinal(new FacetLabel(dim, path)); - if (ord < 0) { - return -1; - } - return initialized ? getValue(ord) : 0; + protected void setIncomingValue(TopOrdAndNumberQueue.OrdAndValue incomingOrdAndValue, int ord) { + ((TopOrdAndIntQueue.OrdAndInt) incomingOrdAndValue).value = getValue(ord); } - @Override - public FacetResult getAllChildren(String dim, String... path) throws IOException { - DimConfig dimConfig = verifyDim(dim); - FacetLabel cp = new FacetLabel(dim, path); - int dimOrd = taxoReader.getOrdinal(cp); - if (dimOrd == -1) { - return null; - } - - if (initialized == false) { - return null; - } - - int aggregatedValue = 0; - - IntArrayList ordinals = new IntArrayList(); - IntArrayList ordValues = new IntArrayList(); - - if (sparseValues != null) { - for (IntIntCursor c : sparseValues) { - int value = c.value; - int ord = c.key; - if (parents.get(ord) == dimOrd && value > 0) { - aggregatedValue = aggregationFunction.aggregate(aggregatedValue, value); - ordinals.add(ord); - ordValues.add(value); - } - } - } else { - ParallelTaxonomyArrays.IntArray children = getChildren(); - ParallelTaxonomyArrays.IntArray siblings = getSiblings(); - int ord = children.get(dimOrd); - while (ord != TaxonomyReader.INVALID_ORDINAL) { - int value = values[ord]; - if (value > 0) { - aggregatedValue = aggregationFunction.aggregate(aggregatedValue, value); - ordinals.add(ord); - ordValues.add(value); - } - ord = siblings.get(ord); - } - } - - if (aggregatedValue == 0) { - return null; - } - - if (dimConfig.multiValued) { - if (dimConfig.requireDimCount) { - aggregatedValue = getValue(dimOrd); - } else { - // Our sum'd value is not correct, in general: - aggregatedValue = -1; - } - } else { - // Our sum'd dim value is accurate, so we keep it - } - - // TODO: It would be nice if TaxonomyReader let us pass in a buffer + size so we didn't have to - // do an array copy here: - FacetLabel[] bulkPath = taxoReader.getBulkPath(ordinals.toArray()); - - LabelAndValue[] labelValues = new LabelAndValue[ordValues.size()]; - for (int i = 0; i < ordValues.size(); i++) { - labelValues[i] = new LabelAndValue(bulkPath[i].components[cp.length], ordValues.get(i)); - } - return new FacetResult(dim, path, aggregatedValue, labelValues, ordinals.size()); - } - - @Override - public FacetResult getTopChildren(int topN, String dim, String... path) throws IOException { - validateTopN(topN); - DimConfig dimConfig = verifyDim(dim); - FacetLabel cp = new FacetLabel(dim, path); - int dimOrd = taxoReader.getOrdinal(cp); - if (dimOrd == -1) { - return null; - } + protected class IntAggregatedValue extends AggregatedValue { + private int value; - if (initialized == false) { - return null; + public IntAggregatedValue(int value) { + this.value = value; } - TopChildrenForPath topChildrenForPath = getTopChildrenForPath(dimConfig, dimOrd, topN); - return createFacetResult(topChildrenForPath, dim, path); - } - - /** - * Determine the top-n children for a specified dimension + path. Results are in an intermediate - * form. 
- */ - private TopChildrenForPath getTopChildrenForPath(DimConfig dimConfig, int pathOrd, int topN) - throws IOException { - TopOrdAndIntQueue q = new TopOrdAndIntQueue(Math.min(taxoReader.getSize(), topN)); - int bottomValue = 0; - int bottomOrd = Integer.MAX_VALUE; - - int aggregatedValue = 0; - int childCount = 0; - TopOrdAndIntQueue.OrdAndValue reuse = null; - - // TODO: would be faster if we had a "get the following children" API? then we - // can make a single pass over the hashmap - if (sparseValues != null) { - for (IntIntCursor c : sparseValues) { - int value = c.value; - int ord = c.key; - if (parents.get(ord) == pathOrd && value > 0) { - aggregatedValue = aggregationFunction.aggregate(aggregatedValue, value); - childCount++; - if (value > bottomValue || (value == bottomValue && ord < bottomOrd)) { - if (reuse == null) { - reuse = new TopOrdAndIntQueue.OrdAndValue(); - } - reuse.ord = ord; - reuse.value = value; - reuse = q.insertWithOverflow(reuse); - if (q.size() == topN) { - bottomValue = q.top().value; - bottomOrd = q.top().ord; - } - } - } - } - } else { - ParallelTaxonomyArrays.IntArray children = getChildren(); - ParallelTaxonomyArrays.IntArray siblings = getSiblings(); - int ord = children.get(pathOrd); - while (ord != TaxonomyReader.INVALID_ORDINAL) { - int value = values[ord]; - if (value > 0) { - aggregatedValue = aggregationFunction.aggregate(aggregatedValue, value); - childCount++; - if (value > bottomValue || (value == bottomValue && ord < bottomOrd)) { - if (reuse == null) { - reuse = new TopOrdAndIntQueue.OrdAndValue(); - } - reuse.ord = ord; - reuse.value = value; - reuse = q.insertWithOverflow(reuse); - if (q.size() == topN) { - bottomValue = q.top().value; - bottomOrd = q.top().ord; - } - } - } - ord = siblings.get(ord); - } + @Override + public void aggregate(int ord) { + value = aggregationFunction.aggregate(value, getValue(ord)); } - if (dimConfig.multiValued) { - if (dimConfig.requireDimCount) { - aggregatedValue = getValue(pathOrd); - } else { - // Our sum'd value is not correct, in general: - aggregatedValue = -1; - } + @Override + public Number get() { + return value; } - - return new TopChildrenForPath(aggregatedValue, childCount, q); } @Override - public List getTopDims(int topNDims, int topNChildren) throws IOException { - if (topNDims <= 0 || topNChildren <= 0) { - throw new IllegalArgumentException("topN must be > 0"); - } - - if (initialized == false) { - return Collections.emptyList(); - } - - // get children and siblings ordinal array from TaxonomyFacets - ParallelTaxonomyArrays.IntArray children = getChildren(); - ParallelTaxonomyArrays.IntArray siblings = getSiblings(); - - // Create priority queue to store top dimensions and sort by their aggregated values/hits and - // string values. 
- PriorityQueue pq = - new PriorityQueue<>(topNDims) { - @Override - protected boolean lessThan(DimValue a, DimValue b) { - if (a.value > b.value) { - return false; - } else if (a.value < b.value) { - return true; - } else { - return a.dim.compareTo(b.dim) > 0; - } - } - }; - - // Keep track of intermediate results, if we compute them, so we can reuse them later: - Map intermediateResults = null; - - // iterate over children and siblings ordinals for all dims - int ord = children.get(TaxonomyReader.ROOT_ORDINAL); - while (ord != TaxonomyReader.INVALID_ORDINAL) { - String dim = taxoReader.getPath(ord).components[0]; - FacetsConfig.DimConfig dimConfig = config.getDimConfig(dim); - if (dimConfig.indexFieldName.equals(indexFieldName)) { - FacetLabel cp = new FacetLabel(dim); - int dimOrd = taxoReader.getOrdinal(cp); - if (dimOrd != -1) { - int dimValue; - if (dimConfig.multiValued) { - if (dimConfig.requireDimCount) { - // If the dim is configured as multi-valued and requires dim counts, we can access - // an accurate count for the dim computed at indexing time: - dimValue = getValue(dimOrd); - } else { - // If the dim is configured as multi-valued but not requiring dim counts, we cannot - // compute an accurate dim count, and use -1 as a place-holder: - dimValue = -1; - } - } else { - // Single-valued dims require aggregating descendant paths to get accurate dim counts - // since we don't directly access ancestry paths: - // TODO: We could consider indexing dim counts directly if getTopDims is a common - // use-case. - TopChildrenForPath topChildrenForPath = - getTopChildrenForPath(dimConfig, dimOrd, topNChildren); - if (intermediateResults == null) { - intermediateResults = new HashMap<>(); - } - intermediateResults.put(dim, topChildrenForPath); - dimValue = topChildrenForPath.pathValue; - } - if (dimValue != 0) { - if (pq.size() < topNDims) { - pq.add(new DimValue(dim, dimOrd, dimValue)); - } else { - if (dimValue > pq.top().value - || (dimValue == pq.top().value && dim.compareTo(pq.top().dim) < 0)) { - DimValue bottomDim = pq.top(); - bottomDim.dim = dim; - bottomDim.value = dimValue; - pq.updateTop(); - } - } - } - } - } - ord = siblings.get(ord); - } - - FacetResult[] results = new FacetResult[pq.size()]; - - while (pq.size() > 0) { - DimValue dimValue = pq.pop(); - assert dimValue != null; - String dim = dimValue.dim; - TopChildrenForPath topChildrenForPath = null; - if (intermediateResults != null) { - topChildrenForPath = intermediateResults.get(dim); - } - if (topChildrenForPath == null) { - FacetsConfig.DimConfig dimConfig = config.getDimConfig(dim); - topChildrenForPath = getTopChildrenForPath(dimConfig, dimValue.dimOrd, topNChildren); - } - FacetResult facetResult = createFacetResult(topChildrenForPath, dim); - assert facetResult != null; - results[pq.size()] = facetResult; - } - return Arrays.asList(results); - } - - /** - * Create a FacetResult for the provided dim + path and intermediate results. Does the extra work - * of resolving ordinals -> labels, etc. Will return null if there are no children. - */ - FacetResult createFacetResult(TopChildrenForPath topChildrenForPath, String dim, String... 
path) - throws IOException { - // If the intermediate result is null or there are no children, we return null: - if (topChildrenForPath == null || topChildrenForPath.childCount == 0) { - return null; - } - - TopOrdAndIntQueue q = topChildrenForPath.childQueue; - assert q != null; - - LabelAndValue[] labelValues = new LabelAndValue[q.size()]; - int[] ordinals = new int[labelValues.length]; - int[] values = new int[labelValues.length]; - - for (int i = labelValues.length - 1; i >= 0; i--) { - TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop(); - assert ordAndValue != null; - ordinals[i] = ordAndValue.ord; - values[i] = ordAndValue.value; - } - - FacetLabel[] bulkPath = taxoReader.getBulkPath(ordinals); - // The path component we're interested in is the one immediately after the provided path. We - // add 1 here to also account for the dim: - int childComponentIdx = path.length + 1; - for (int i = 0; i < labelValues.length; i++) { - labelValues[i] = new LabelAndValue(bulkPath[i].components[childComponentIdx], values[i]); - } - - return new FacetResult( - dim, path, topChildrenForPath.pathValue, labelValues, topChildrenForPath.childCount); - } - - private static class DimValue { - String dim; - int dimOrd; - int value; - - DimValue(String dim, int dimOrd, int value) { - this.dim = dim; - this.dimOrd = dimOrd; - this.value = value; - } - } - - /** Intermediate result to store top children for a given path before resolving labels, etc. */ - private static class TopChildrenForPath { - private final int pathValue; - private final int childCount; - private final TopOrdAndIntQueue childQueue; - - TopChildrenForPath(int pathValue, int childCount, TopOrdAndIntQueue childQueue) { - this.pathValue = pathValue; - this.childCount = childCount; - this.childQueue = childQueue; - } + protected AggregatedValue newAggregatedValue() { + return new IntAggregatedValue(0); } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacetFloatAssociations.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacetFloatAssociations.java index 2e3db92008c..45c18fb3867 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacetFloatAssociations.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacetFloatAssociations.java @@ -213,8 +213,10 @@ private void aggregateValues( int ordinalCount = ordinalValues.docValueCount(); for (int i = 0; i < ordinalCount; i++) { int ord = (int) ordinalValues.nextValue(); - float newValue = aggregationFunction.aggregate(values[ord], value); - values[ord] = newValue; + float currentValue = getValue(ord); + float newValue = aggregationFunction.aggregate(currentValue, value); + setValue(ord, newValue); + setCount(ord, getCount(ord) + 1); } } } @@ -250,8 +252,10 @@ private void aggregateValues( offset += 4; float value = (float) BitUtil.VH_BE_FLOAT.get(bytes, offset); offset += 4; - float newValue = aggregationFunction.aggregate(values[ord], value); - values[ord] = newValue; + float currentValue = getValue(ord); + float newValue = aggregationFunction.aggregate(currentValue, value); + setValue(ord, newValue); + setCount(ord, getCount(ord) + 1); } } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacetIntAssociations.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacetIntAssociations.java index f437efa0d8a..86cc3d1f714 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacetIntAssociations.java +++ 
b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacetIntAssociations.java @@ -85,6 +85,7 @@ private void aggregateValues( int currentValue = getValue(ord); int newValue = aggregationFunction.aggregate(currentValue, value); setValue(ord, newValue); + setCount(ord, getCount(ord) + 1); } } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacets.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacets.java index 5299264887c..11325c54fe1 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacets.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacets.java @@ -17,17 +17,27 @@ package org.apache.lucene.facet.taxonomy; +import com.carrotsearch.hppc.IntArrayList; +import com.carrotsearch.hppc.IntIntHashMap; +import com.carrotsearch.hppc.cursors.IntIntCursor; import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.Comparator; +import java.util.HashMap; import java.util.List; import java.util.Locale; +import java.util.Map; import org.apache.lucene.facet.FacetResult; import org.apache.lucene.facet.Facets; import org.apache.lucene.facet.FacetsCollector; import org.apache.lucene.facet.FacetsConfig; import org.apache.lucene.facet.FacetsConfig.DimConfig; +import org.apache.lucene.facet.LabelAndValue; +import org.apache.lucene.facet.TopOrdAndIntQueue; +import org.apache.lucene.facet.TopOrdAndNumberQueue; +import org.apache.lucene.util.PriorityQueue; /** * Base class for all taxonomy-based facets impls. @@ -40,6 +50,30 @@ */ @Deprecated public abstract class TaxonomyFacets extends Facets { + /** Intermediate result to store top children for a given path before resolving labels, etc. */ + static class TopChildrenForPath { + Number pathValue; + int childCount; + TopOrdAndNumberQueue childQueue; + + public TopChildrenForPath(Number pathValue, int childCount, TopOrdAndNumberQueue childQueue) { + this.pathValue = pathValue; + this.childCount = childCount; + this.childQueue = childQueue; + } + } + + private static class DimValue { + String dim; + int dimOrd; + Number value; + + DimValue(String dim, int dimOrd, Number value) { + this.dim = dim; + this.dimOrd = dimOrd; + this.value = value; + } + } private static final Comparator BY_VALUE_THEN_DIM = new Comparator() { @@ -88,6 +122,17 @@ protected TaxonomyFacets(String indexFieldName, TaxonomyReader taxoReader, Facet this(indexFieldName, taxoReader, config, null); } + /** Dense ordinal counts. */ + int[] counts; + + /** Sparse ordinal counts. */ + IntIntHashMap sparseCounts; + + /** Have value counters been initialized. */ + boolean initialized; + + protected Comparator valueComparator; + /** * Constructor with a {@link FacetsCollector}, allowing lazy initialization of internal data * structures. @@ -100,6 +145,78 @@ protected TaxonomyFacets(String indexFieldName, TaxonomyReader taxoReader, Facet this.config = config; this.fc = fc; parents = taxoReader.getParallelTaxonomyArrays().parents(); + valueComparator = Comparator.comparingInt((x) -> (int) x); + } + + /** Return true if a sparse hash table should be used for counting, instead of a dense int[]. 
*/ + private boolean useHashTable(FacetsCollector fc, TaxonomyReader taxoReader) { + if (taxoReader.getSize() < 1024) { + // small number of unique values: use an array + return false; + } + + if (fc == null) { + // counting all docs: use an array + return false; + } + + int maxDoc = 0; + int sumTotalHits = 0; + for (FacetsCollector.MatchingDocs docs : fc.getMatchingDocs()) { + sumTotalHits += docs.totalHits; + maxDoc += docs.context.reader().maxDoc(); + } + + // if our result set is < 10% of the index, we collect sparsely (use hash map): + return sumTotalHits < maxDoc / 10; + } + + protected void initializeValueCounters() { + if (initialized) { + return; + } + initialized = true; + assert sparseCounts == null && counts == null; + if (useHashTable(fc, taxoReader)) { + sparseCounts = new IntIntHashMap(); + } else { + counts = new int[taxoReader.getSize()]; + } + } + + /** Set the count for this ordinal to {@code newValue}. */ + protected void setCount(int ordinal, int newValue) { + if (sparseCounts != null) { + sparseCounts.put(ordinal, newValue); + } else { + counts[ordinal] = newValue; + } + } + + /** Get the count for this ordinal. */ + protected int getCount(int ordinal) { + if (sparseCounts != null) { + return sparseCounts.get(ordinal); + } else { + return counts[ordinal]; + } + } + + /** Get the aggregation value for this ordinal. */ + protected Number getAggregationValue(int ordinal) { + // By default, this is just the count. + return getCount(ordinal); + } + + /** Apply an aggregation to the two values and return the result. */ + protected Number aggregate(Number existingVal, Number newVal) { + // By default, we are computing counts, so the values are interpreted as integers and summed. + return (int) existingVal + (int) newVal; + } + + /** Were any values actually aggregated during counting? */ + boolean hasValues() { + return initialized; } /** @@ -166,6 +283,320 @@ protected DimConfig verifyDim(String dim) { return dimConfig; } + /** + * Roll-up the aggregation values from {@code childOrdinal} to {@code ordinal}. Overrides should + * probably call this to update the counts. Overriding allows us to work with primitive types for + * the aggregation values, keeping aggregation efficient. + */ + protected void updateValueFromRollup(int ordinal, int childOrdinal) throws IOException { + setCount(ordinal, getCount(ordinal) + rollup(childOrdinal)); + } + + /** + * Return a {@link TopOrdAndNumberQueue} of the appropriate type, i.e. a {@link TopOrdAndIntQueue} + * or a {@link org.apache.lucene.facet.TopOrdAndFloatQueue}. + */ + protected TopOrdAndNumberQueue makeTopOrdAndNumberQueue(int topN) { + return new TopOrdAndIntQueue(Math.min(taxoReader.getSize(), topN)); + } + + // TODO: We don't need this if we're okay with having an integer -1 in the results even for float + // aggregations. + /** Return the value for a missing aggregation, i.e. {@code -1} or {@code -1f}. */ + protected Number missingAggregationValue() { + return -1; + } + + /** Rolls up any single-valued hierarchical dimensions. 
+  /** Rolls up any single-valued hierarchical dimensions. */
+  void rollup() throws IOException {
+    if (initialized == false) {
+      return;
+    }
+
+    // Rollup any necessary dims:
+    ParallelTaxonomyArrays.IntArray children = null;
+    for (Map.Entry<String, FacetsConfig.DimConfig> ent : config.getDimConfigs().entrySet()) {
+      String dim = ent.getKey();
+      FacetsConfig.DimConfig ft = ent.getValue();
+      if (ft.hierarchical && ft.multiValued == false) {
+        int dimRootOrd = taxoReader.getOrdinal(new FacetLabel(dim));
+        // It can be -1 if this field was declared in the
+        // config but never indexed:
+        if (dimRootOrd > 0) {
+          if (children == null) {
+            // lazy init
+            children = getChildren();
+          }
+          updateValueFromRollup(dimRootOrd, children.get(dimRootOrd));
+        }
+      }
+    }
+  }
+
+  private int rollup(int ord) throws IOException {
+    ParallelTaxonomyArrays.IntArray children = getChildren();
+    ParallelTaxonomyArrays.IntArray siblings = getSiblings();
+    int aggregatedValue = 0;
+    while (ord != TaxonomyReader.INVALID_ORDINAL) {
+      int currentValue = getCount(ord);
+      int newValue = currentValue + rollup(children.get(ord));
+      setCount(ord, newValue);
+      aggregatedValue += getCount(ord);
+      ord = siblings.get(ord);
+    }
+    return aggregatedValue;
+  }
+
+  /**
+   * Create a FacetResult for the provided dim + path and intermediate results. Does the extra work
+   * of resolving ordinals -> labels, etc. Will return null if there are no children.
+   */
+  private FacetResult createFacetResult(
+      TopChildrenForPath topChildrenForPath, String dim, String... path) throws IOException {
+    // If the intermediate result is null or there are no children, we return null:
+    if (topChildrenForPath == null || topChildrenForPath.childCount == 0) {
+      return null;
+    }
+
+    TopOrdAndNumberQueue q = topChildrenForPath.childQueue;
+    assert q != null;
+
+    LabelAndValue[] labelValues = new LabelAndValue[q.size()];
+    int[] ordinals = new int[labelValues.length];
+    Number[] values = new Number[labelValues.length];
+
+    for (int i = labelValues.length - 1; i >= 0; i--) {
+      TopOrdAndNumberQueue.OrdAndValue ordAndValue = q.pop();
+      assert ordAndValue != null;
+      ordinals[i] = ordAndValue.ord;
+      values[i] = ordAndValue.getValue();
+    }
+
+    FacetLabel[] bulkPath = taxoReader.getBulkPath(ordinals);
+    // The path component we're interested in is the one immediately after the provided path. We
+    // add 1 here to also account for the dim:
+    int childComponentIdx = path.length + 1;
+    for (int i = 0; i < labelValues.length; i++) {
+      labelValues[i] = new LabelAndValue(bulkPath[i].components[childComponentIdx], values[i]);
+    }
+
+    return new FacetResult(
+        dim, path, topChildrenForPath.pathValue, labelValues, topChildrenForPath.childCount);
+  }
+
+  @Override
+  public FacetResult getAllChildren(String dim, String... path) throws IOException {
+    DimConfig dimConfig = verifyDim(dim);
+    FacetLabel cp = new FacetLabel(dim, path);
+    int dimOrd = taxoReader.getOrdinal(cp);
+    if (dimOrd == -1) {
+      return null;
+    }
+
+    if (initialized == false) {
+      return null;
+    }
+
+    Number aggregatedValue = 0;
+    int aggregatedCount = 0;
+
+    IntArrayList ordinals = new IntArrayList();
+    List<Number> ordValues = new ArrayList<>();
+
+    if (sparseCounts != null) {
+      for (IntIntCursor ordAndCount : sparseCounts) {
+        int ord = ordAndCount.key;
+        int count = ordAndCount.value;
+        Number value = getAggregationValue(ord);
+        if (parents.get(ord) == dimOrd && count > 0) {
+          aggregatedCount += count;
+          aggregatedValue = aggregate(aggregatedValue, value);
+          ordinals.add(ord);
+          ordValues.add(value);
+        }
+      }
+    } else {
+      ParallelTaxonomyArrays.IntArray children = getChildren();
+      ParallelTaxonomyArrays.IntArray siblings = getSiblings();
+      int ord = children.get(dimOrd);
+      while (ord != TaxonomyReader.INVALID_ORDINAL) {
+        int count = counts[ord];
+        Number value = getAggregationValue(ord);
+        if (count > 0) {
+          aggregatedCount += count;
+          aggregatedValue = aggregate(aggregatedValue, value);
+          ordinals.add(ord);
+          ordValues.add(value);
+        }
+        ord = siblings.get(ord);
+      }
+    }
+
+    if (aggregatedCount == 0) {
+      return null;
+    }
+
+    if (dimConfig.multiValued) {
+      if (dimConfig.requireDimCount) {
+        aggregatedValue = getAggregationValue(dimOrd);
+      } else {
+        // Our aggregated value is not correct, in general:
+        aggregatedValue = missingAggregationValue();
+      }
+    } else {
+      // Our aggregated dim value is accurate, so we keep it
+    }
+
+    // TODO: It would be nice if TaxonomyReader let us pass in a buffer + size so we didn't have to
+    // do an array copy here:
+    FacetLabel[] bulkPath = taxoReader.getBulkPath(ordinals.toArray());
+
+    LabelAndValue[] labelValues = new LabelAndValue[ordValues.size()];
+    for (int i = 0; i < ordValues.size(); i++) {
+      labelValues[i] = new LabelAndValue(bulkPath[i].components[cp.length], ordValues.get(i));
+    }
+    return new FacetResult(dim, path, aggregatedValue, labelValues, ordinals.size());
+  }
+
+  protected void setIncomingValue(TopOrdAndNumberQueue.OrdAndValue incomingOrdAndValue, int ord) {
+    ((TopOrdAndIntQueue.OrdAndInt) incomingOrdAndValue).value = getCount(ord);
+  }
+
+  protected TopOrdAndNumberQueue.OrdAndValue insertIntoQueue(
+      TopOrdAndNumberQueue q, TopOrdAndNumberQueue.OrdAndValue incomingOrdAndValue, int ord) {
+    if (incomingOrdAndValue == null) {
+      incomingOrdAndValue = q.newOrdAndValue();
+    }
+    incomingOrdAndValue.ord = ord;
+    setIncomingValue(incomingOrdAndValue, ord);
+
+    incomingOrdAndValue = q.insertWithOverflow(incomingOrdAndValue);
+    return incomingOrdAndValue;
+  }
+
+  protected abstract static class AggregatedValue {
+    /** Aggregate the value corresponding to the given ordinal into this value. */
+    public abstract void aggregate(int ord);
+
+    /** Retrieve the encapsulated value. */
+    public abstract Number get();
+  }
+
+  private class AggregatedCount extends AggregatedValue {
+    private int count;
+
+    private AggregatedCount(int count) {
+      this.count = count;
+    }
+
+    @Override
+    public void aggregate(int ord) {
+      count += getCount(ord);
+    }
+
+    @Override
+    public Number get() {
+      return count;
+    }
+  }
+
+  protected AggregatedValue newAggregatedValue() {
+    return new AggregatedCount(0);
+  }
+
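For orientation, this is roughly how the counting path above (getAllChildren plus the shared count accessors) is driven from user code; the searcher, taxoReader, config and the "Author" dimension are assumed to exist and are illustrative only.

    // Hypothetical usage sketch, not part of the patch:
    FacetsCollector fc = searcher.search(new MatchAllDocsQuery(), new FacetsCollectorManager());
    Facets facets = new FastTaxonomyFacetCounts(taxoReader, config, fc);
    // Every child of the "Author" dim with a non-zero count, via getAllChildren() above:
    FacetResult allAuthors = facets.getAllChildren("Author");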
+  /**
+   * Determine the top-n children for a specified dimension + path. Results are in an intermediate
+   * form.
+   */
+  protected TopChildrenForPath getTopChildrenForPath(DimConfig dimConfig, int pathOrd, int topN)
+      throws IOException {
+    TopOrdAndNumberQueue q = makeTopOrdAndNumberQueue(topN);
+
+    AggregatedValue aggregatedValue = newAggregatedValue();
+    int childCount = 0;
+
+    TopOrdAndNumberQueue.OrdAndValue incomingOrdAndValue = null;
+
+    // TODO: would be faster if we had a "get the following children" API? then we
+    // can make a single pass over the hashmap
+    if (sparseCounts != null) {
+      for (IntIntCursor c : sparseCounts) {
+        int ord = c.key;
+        int count = c.value;
+        if (parents.get(ord) == pathOrd && count > 0) {
+          aggregatedValue.aggregate(ord);
+          childCount++;
+
+          incomingOrdAndValue = insertIntoQueue(q, incomingOrdAndValue, ord);
+        }
+      }
+    } else {
+      ParallelTaxonomyArrays.IntArray children = getChildren();
+      ParallelTaxonomyArrays.IntArray siblings = getSiblings();
+      int ord = children.get(pathOrd);
+      while (ord != TaxonomyReader.INVALID_ORDINAL) {
+        int count = counts[ord];
+        if (count > 0) {
+          aggregatedValue.aggregate(ord);
+          childCount++;
+
+          incomingOrdAndValue = insertIntoQueue(q, incomingOrdAndValue, ord);
+        }
+        ord = siblings.get(ord);
+      }
+    }
+
+    Number aggregatedValueNumber = aggregatedValue.get();
+    if (dimConfig.multiValued) {
+      if (dimConfig.requireDimCount) {
+        aggregatedValueNumber = getAggregationValue(pathOrd);
+      } else {
+        // Our aggregated value is not correct, in general:
+        aggregatedValueNumber = missingAggregationValue();
+      }
+    }
+
+    return new TopChildrenForPath(aggregatedValueNumber, childCount, q);
+  }
+
+  @Override
+  public FacetResult getTopChildren(int topN, String dim, String... path) throws IOException {
+    validateTopN(topN);
+    DimConfig dimConfig = verifyDim(dim);
+    FacetLabel cp = new FacetLabel(dim, path);
+    int dimOrd = taxoReader.getOrdinal(cp);
+    if (dimOrd == -1) {
+      return null;
+    }
+
+    if (initialized == false) {
+      return null;
+    }
+
+    TopChildrenForPath topChildrenForPath = getTopChildrenForPath(dimConfig, dimOrd, topN);
+    return createFacetResult(topChildrenForPath, dim, path);
+  }
+
+  @Override
+  public Number getSpecificValue(String dim, String... path) throws IOException {
+    DimConfig dimConfig = verifyDim(dim);
+    if (path.length == 0) {
+      if (dimConfig.hierarchical && dimConfig.multiValued == false) {
+        // ok: rolled up at search time
+      } else if (dimConfig.requireDimCount && dimConfig.multiValued) {
+        // ok: we indexed all ords at index time
+      } else {
+        throw new IllegalArgumentException(
+            "cannot return dimension-level value alone; use getTopChildren instead");
+      }
+    }
+    int ord = taxoReader.getOrdinal(new FacetLabel(dim, path));
+    if (ord < 0) {
+      return -1;
+    }
+    return initialized ? getAggregationValue(ord) : 0;
+  }
+
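As a quick illustration of the two accessors above, mirroring the association-facet setup used in the tests later in this patch (the "$int_facets" field and "b" dimension come from those tests; fc, taxoReader and config are assumed):

    // Hypothetical usage sketch, not part of the patch:
    Facets intFacets =
        new TaxonomyFacetIntAssociations(
            "$int_facets", taxoReader, config, fc, AssociationAggregationFunction.SUM);
    // Top 10 children of "b", computed through getTopChildrenForPath()/createFacetResult():
    FacetResult top = intFacets.getTopChildren(10, "b");
    // Aggregated value for one specific path:
    Number value = intFacets.getSpecificValue("b", "1");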
 
   @Override
   public List<FacetResult> getAllDims(int topN) throws IOException {
     validateTopN(topN);
@@ -195,6 +626,110 @@ public List<FacetResult> getAllDims(int topN) throws IOException {
     return results;
   }
 
-  /** Were any values actually aggregated during counting? */
-  abstract boolean hasValues();
+  @Override
+  public List<FacetResult> getTopDims(int topNDims, int topNChildren) throws IOException {
+    if (topNDims <= 0 || topNChildren <= 0) {
+      throw new IllegalArgumentException("topN must be > 0");
+    }
+
+    if (initialized == false) {
+      return Collections.emptyList();
+    }
+
+    // get children and siblings ordinal array from TaxonomyFacets
+    ParallelTaxonomyArrays.IntArray children = getChildren();
+    ParallelTaxonomyArrays.IntArray siblings = getSiblings();
+
+    // Create priority queue to store top dimensions and sort by their aggregated values/hits and
+    // string values.
+    PriorityQueue<DimValue> pq =
+        new PriorityQueue<>(topNDims) {
+          @Override
+          protected boolean lessThan(DimValue a, DimValue b) {
+            int comparison = valueComparator.compare(a.value, b.value);
+            if (comparison < 0) {
+              return true;
+            }
+            if (comparison > 0) {
+              return false;
+            }
+            return a.dim.compareTo(b.dim) > 0;
+          }
+        };
+
+    // Keep track of intermediate results, if we compute them, so we can reuse them later:
+    Map<String, TopChildrenForPath> intermediateResults = null;
+
+    // iterate over children and siblings ordinals for all dims
+    int ord = children.get(TaxonomyReader.ROOT_ORDINAL);
+    while (ord != TaxonomyReader.INVALID_ORDINAL) {
+      String dim = taxoReader.getPath(ord).components[0];
+      FacetsConfig.DimConfig dimConfig = config.getDimConfig(dim);
+      if (dimConfig.indexFieldName.equals(indexFieldName)) {
+        FacetLabel cp = new FacetLabel(dim);
+        int dimOrd = taxoReader.getOrdinal(cp);
+        if (dimOrd != -1) {
+          Number dimValue;
+          if (dimConfig.multiValued) {
+            if (dimConfig.requireDimCount) {
+              // If the dim is configured as multi-valued and requires dim counts, we can access
+              // an accurate count for the dim computed at indexing time:
+              dimValue = getAggregationValue(dimOrd);
+            } else {
+              // If the dim is configured as multi-valued but not requiring dim counts, we cannot
+              // compute an accurate dim count, and use -1 as a place-holder:
+              dimValue = -1;
+            }
+          } else {
+            // Single-valued dims require aggregating descendant paths to get accurate dim counts
+            // since we don't directly access ancestry paths:
+            // TODO: We could consider indexing dim counts directly if getTopDims is a common
+            // use-case.
+            TopChildrenForPath topChildrenForPath =
+                getTopChildrenForPath(dimConfig, dimOrd, topNChildren);
+            if (intermediateResults == null) {
+              intermediateResults = new HashMap<>();
+            }
+            intermediateResults.put(dim, topChildrenForPath);
+            dimValue = topChildrenForPath.pathValue;
+          }
+          if (valueComparator.compare(dimValue, 0) != 0) {
+            if (pq.size() < topNDims) {
+              pq.add(new DimValue(dim, dimOrd, dimValue));
+            } else {
+              if (valueComparator.compare(dimValue, pq.top().value) > 0
+                  || (valueComparator.compare(dimValue, pq.top().value) == 0
+                      && dim.compareTo(pq.top().dim) < 0)) {
+                DimValue bottomDim = pq.top();
+                bottomDim.dim = dim;
+                bottomDim.value = dimValue;
+                pq.updateTop();
+              }
+            }
+          }
+        }
+      }
+      ord = siblings.get(ord);
+    }
+
+    FacetResult[] results = new FacetResult[pq.size()];
+
+    while (pq.size() > 0) {
+      DimValue dimValue = pq.pop();
+      assert dimValue != null;
+      String dim = dimValue.dim;
+      TopChildrenForPath topChildrenForPath = null;
+      if (intermediateResults != null) {
+        topChildrenForPath = intermediateResults.get(dim);
+      }
+      if (topChildrenForPath == null) {
+        FacetsConfig.DimConfig dimConfig = config.getDimConfig(dim);
+        topChildrenForPath = getTopChildrenForPath(dimConfig, dimValue.dimOrd, topNChildren);
+      }
+      FacetResult facetResult = createFacetResult(topChildrenForPath, dim);
+      assert facetResult != null;
+      results[pq.size()] = facetResult;
+    }
+    return Arrays.asList(results);
+  }
 }
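A short usage sketch for the getTopDims() path that closes the class above; dimension values come from the same shared counters, and getAllDims() remains the exhaustive alternative (the readers, config and collector are assumed, as before).

    // Hypothetical usage sketch, not part of the patch:
    Facets facets = new FastTaxonomyFacetCounts(taxoReader, config, fc);
    // Top 5 dimensions, each with its top 10 children; intermediate results are reused internally:
    List<FacetResult> topDims = facets.getTopDims(5, 10);
    // Exhaustive alternative: all dimensions, top 10 children each:
    List<FacetResult> allDims = facets.getAllDims(10);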
diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetAssociations.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetAssociations.java
index c69dc2943b2..3db906eb05c 100644
--- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetAssociations.java
+++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetAssociations.java
@@ -24,7 +24,10 @@ import java.util.List;
 import java.util.Map;
 import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.TextField;
 import org.apache.lucene.facet.DrillDownQuery;
+import org.apache.lucene.facet.FacetField;
 import org.apache.lucene.facet.FacetResult;
 import org.apache.lucene.facet.FacetTestCase;
 import org.apache.lucene.facet.Facets;
@@ -38,10 +41,13 @@ import org.apache.lucene.index.DocValues;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.Term;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.DoubleValuesSource;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MatchAllDocsQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.tests.index.RandomIndexWriter;
 import org.apache.lucene.util.BitUtil;
@@ -101,6 +107,7 @@ public static void beforeClass() throws Exception {
           doc.add(new FloatAssociationFacetField(0.2f, "float", "b"));
         }
       }
+      doc.add(new TextField("match", "yes", Field.Store.NO));
       writer.addDocument(config.build(taxoWriter, doc));
     }
 
@@ -142,6 +149,17 @@ public static void beforeClass() throws Exception {
         }
       }
 
+      doc.add(new TextField("match", "yes", Field.Store.NO));
+      writer.addDocument(config.build(taxoWriter, doc));
+    }
+
+    // Add more random labels and documents to randomly make the test run on sparse/dense
+    // aggregation values.
+    count = random().nextInt(10_000);
+    for (int i = 0; i < count; i++) {
+      Document doc = new Document();
+      doc.add(new FacetField("random_dim_" + i, "path"));
+      doc.add(new TextField("match", "no", Field.Store.NO));
       writer.addDocument(config.build(taxoWriter, doc));
     }
 
@@ -194,7 +212,8 @@ public static void afterClass() throws Exception {
   public void testIntSumAssociation() throws Exception {
     IndexSearcher searcher = newSearcher(reader);
-    FacetsCollector fc = searcher.search(new MatchAllDocsQuery(), new FacetsCollectorManager());
+    FacetsCollector fc =
+        searcher.search(new TermQuery(new Term("match", "yes")), new FacetsCollectorManager());
 
     Facets facets = getIntSumFacets("$facets.int", taxoReader, config, fc);
 
     assertEquals(
@@ -225,7 +244,7 @@ public void testIntAssociationRandom() throws Exception {
     FacetsCollector fc = new FacetsCollector();
     IndexSearcher searcher = newSearcher(reader);
-    searcher.search(new MatchAllDocsQuery(), fc);
+    searcher.search(new TermQuery(new Term("match", "yes")), fc);
 
     Map<String, Integer> expected;
     Facets facets;
@@ -273,7 +292,8 @@ public void testIntAssociationRandom() throws Exception {
   public void testFloatSumAssociation() throws Exception {
     IndexSearcher searcher = newSearcher(reader);
-    FacetsCollector fc = searcher.search(new MatchAllDocsQuery(), new FacetsCollectorManager());
+    FacetsCollector fc =
+        searcher.search(new TermQuery(new Term("match", "yes")), new FacetsCollectorManager());
 
     Facets facets = getFloatSumFacets("$facets.float", taxoReader, config, fc, null);
     assertEquals(
@@ -285,7 +305,7 @@ public void testFloatSumAssociation() throws Exception {
         "float",
         new String[0],
         2,
-        -1.0f,
+        -1f,
         new LabelAndValue[] {
           new LabelAndValue("a", 50.0f), new LabelAndValue("b", 9.999995f),
         });
@@ -304,7 +324,7 @@ public void testFloatSumAssociation() throws Exception {
     // test getAllDims and getTopDims
     List<FacetResult> topDims = facets.getTopDims(10, 10);
     List<FacetResult> allDims = facets.getAllDims(10);
-    assertEquals(topDims, allDims);
+    assertFloatFacetResultsEqual(topDims, allDims);
   }
 
   public void testFloatAssociationRandom() throws Exception {
@@ -312,7 +332,7 @@ public void testFloatAssociationRandom() throws Exception {
     FacetsCollector fc = new FacetsCollector();
     IndexSearcher searcher = newSearcher(reader);
-    searcher.search(new MatchAllDocsQuery(), fc);
+    searcher.search(new TermQuery(new Term("match", "yes")), fc);
 
     Map<String, Float> expected;
     Facets facets;
@@ -336,7 +356,7 @@ public void testFloatAssociationRandom() throws Exception {
     // test getAllDims and getTopDims
     List<FacetResult> topDims = facets.getTopDims(10, 10);
     List<FacetResult> allDims = facets.getAllDims(10);
-    assertEquals(topDims, allDims);
+    assertFloatFacetResultsEqual(topDims, allDims);
 
     // MAX:
     facets =
@@ -357,7 +377,7 @@ public void testFloatAssociationRandom() throws Exception {
     // test getAllDims and getTopDims
     topDims = facets.getTopDims(10, 10);
     allDims = facets.getAllDims(10);
-    assertEquals(topDims, allDims);
+    assertFloatFacetResultsEqual(topDims, allDims);
   }
 
   /**
@@ -366,7 +386,8 @@ public void testFloatAssociationRandom() throws Exception {
    */
   public void testIntAndFloatAssocation() throws Exception {
     IndexSearcher searcher = newSearcher(reader);
-    FacetsCollector fc = searcher.search(new MatchAllDocsQuery(), new FacetsCollectorManager());
+    FacetsCollector fc =
+        searcher.search(new TermQuery(new Term("match", "yes")), new FacetsCollectorManager());
 
     Facets facets = getFloatSumFacets("$facets.float", taxoReader, config, fc, null);
     assertEquals(
@@ -389,7 +410,8 @@ public void testIntAndFloatAssocation() throws Exception {
 
   public void testWrongIndexFieldName() throws Exception {
     IndexSearcher searcher = newSearcher(reader);
-    FacetsCollector fc = searcher.search(new MatchAllDocsQuery(), new FacetsCollectorManager());
+    FacetsCollector fc =
+        searcher.search(new TermQuery(new Term("match", "yes")), new FacetsCollectorManager());
 
     Facets facets = getFloatSumFacets("wrong_field", taxoReader, config, fc, null);
     expectThrows(
         IllegalArgumentException.class,
@@ -538,6 +560,63 @@ private Facets getFloatSumFacets(
     }
   }
 
+  public void testNonPositiveAggregations() throws IOException {
+    Directory dir = newDirectory();
+    Directory taxoDir = newDirectory();
+
+    TaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
+
+    FacetsConfig config = new FacetsConfig();
+    config.setIndexFieldName("a", "$float_facets");
+    config.setIndexFieldName("b", "$int_facets");
+
+    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+    Document d;
+
+    d = new Document();
+    // Positive association
+    d.add(new FloatAssociationFacetField(1f, "a", "1"));
+    d.add(new IntAssociationFacetField(1, "b", "1"));
+    writer.addDocument(config.build(taxoWriter, d));
+
+    d = new Document();
+    // Zero association
+    d.add(new FloatAssociationFacetField(0f, "a", "2"));
+    d.add(new IntAssociationFacetField(0, "b", "2"));
+    writer.addDocument(config.build(taxoWriter, d));
+
+    d = new Document();
+    // Negative association
+    d.add(new FloatAssociationFacetField(-1f, "a", "3"));
+    d.add(new IntAssociationFacetField(-1, "b", "3"));
+    writer.addDocument(config.build(taxoWriter, d));
+
+    IndexReader reader = writer.getReader();
+    IOUtils.close(taxoWriter, writer);
+
+    IndexSearcher searcher = newSearcher(reader);
+    Query q = new MatchAllDocsQuery();
+    FacetsCollector fc = searcher.search(q, new FacetsCollectorManager());
+
+    TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir);
+    FloatTaxonomyFacets floatFacets =
+        new TaxonomyFacetFloatAssociations(
+            "$float_facets", taxoReader, config, fc, AssociationAggregationFunction.SUM);
+    IntTaxonomyFacets intFacets =
+        new TaxonomyFacetIntAssociations(
+            "$int_facets", taxoReader, config, fc, AssociationAggregationFunction.SUM);
+
+    // "2" and "3" are included in the result despite having non-positive values associated to them.
+    assertEquals(
+        "dim=a path=[] value=0.0 childCount=3\n  1 (1.0)\n  2 (0.0)\n  3 (-1.0)\n",
+        floatFacets.getTopChildren(10, "a").toString());
+    assertEquals(
+        "dim=b path=[] value=0 childCount=3\n  1 (1)\n  2 (0)\n  3 (-1)\n",
+        intFacets.getTopChildren(10, "b").toString());
+
+    IOUtils.close(taxoReader, reader, taxoDir, dir);
+  }
+
   private void validateInts(
       String dim,
       Map<String, Integer> expected,
@@ -613,6 +692,19 @@ private void validateFloats(
     }
   }
 
+  private void assertFloatFacetResultsEqual(List<FacetResult> expected, List<FacetResult> actual) {
+    assertEquals(expected.size(), actual.size());
+    for (int i = 0; i < expected.size(); i++) {
+      FacetResult expectedResult = expected.get(i);
+      FacetResult actualResult = actual.get(i);
+
+      assertEquals(expectedResult.dim, actualResult.dim);
+      assertArrayEquals(expectedResult.path, actualResult.path);
+      assertEquals((float) expectedResult.value, (float) actualResult.value, 2e-1);
+      assertEquals(expectedResult.childCount, actualResult.childCount);
+    }
+  }
+
   // since we have no insight into the ordinals assigned to the values, we sort labels by value and
   // count in
   // ascending order in order to compare with expected results
diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetValueSource.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetValueSource.java
index 998d2b7b241..a467ff844d7 100644
--- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetValueSource.java
+++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetValueSource.java
@@ -552,53 +552,44 @@ public void testRollupValues() throws Exception {
   }
 
   // LUCENE-10495
-  public void testSiblingsLoaded() throws Exception {
-    Directory indexDir = newDirectory();
-    Directory taxoDir = newDirectory();
-
-    DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
-    IndexWriter iw = new IndexWriter(indexDir, newIndexWriterConfig(new MockAnalyzer(random())));
-    FacetsConfig config = new FacetsConfig();
+  public void testChildrenAndSiblingsLoaded() throws Exception {
+    boolean[] shouldLoad = new boolean[] {false, true};
+    for (boolean load : shouldLoad) {
+      Directory indexDir = newDirectory();
+      Directory taxoDir = newDirectory();
 
-    config.setHierarchical("a", true);
-    config.setMultiValued("a", true);
-    config.setRequireDimCount("a", true);
+      DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
+      IndexWriter iw = new IndexWriter(indexDir, newIndexWriterConfig(new MockAnalyzer(random())));
+      FacetsConfig config = new FacetsConfig();
 
-    Document doc = new Document();
-    doc.add(new FacetField("a", Integer.toString(2), "1"));
-    iw.addDocument(config.build(taxoWriter, doc));
+      config.setHierarchical("a", true);
+      config.setMultiValued("a", load == false);
+      config.setRequireDimCount("a", true);
 
-    DirectoryReader r = DirectoryReader.open(iw);
-    DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter);
+      Document doc = new Document();
+      doc.add(new FacetField("a", "1", "2"));
+      iw.addDocument(config.build(taxoWriter, doc));
 
-    FacetsCollector sfc =
-        newSearcher(r).search(new MatchAllDocsQuery(), new FacetsCollectorManager());
+      DirectoryReader r = DirectoryReader.open(iw);
+      DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter);
 
-    // Test MAX:
-    Facets facets =
-        new TaxonomyFacetFloatAssociations(
-            taxoReader,
-            config,
-            sfc,
-            AssociationAggregationFunction.MAX,
-            DoubleValuesSource.fromLongField("price"));
+      FacetsCollector sfc =
+          newSearcher(r).search(new MatchAllDocsQuery(), new FacetsCollectorManager());
 
-    assertTrue(((TaxonomyFacets) facets).childrenLoaded());
-    assertFalse(((TaxonomyFacets) facets).siblingsLoaded());
+      TaxonomyFacets facets =
+          new TaxonomyFacetFloatAssociations(
+              taxoReader,
+              config,
+              sfc,
+              AssociationAggregationFunction.MAX,
+              DoubleValuesSource.fromLongField("price"));
 
-    // Test SUM:
-    facets =
-        new TaxonomyFacetFloatAssociations(
-            taxoReader,
-            config,
-            sfc,
-            AssociationAggregationFunction.SUM,
-            DoubleValuesSource.fromLongField("price"));
-    assertTrue(((TaxonomyFacets) facets).childrenLoaded());
-    assertFalse(((TaxonomyFacets) facets).siblingsLoaded());
+      assertEquals(load, facets.childrenLoaded());
+      assertEquals(load, facets.siblingsLoaded());
 
-    iw.close();
-    IOUtils.close(taxoWriter, taxoReader, taxoDir, r, indexDir);
+      iw.close();
+      IOUtils.close(taxoWriter, taxoReader, taxoDir, r, indexDir);
+    }
   }
 
   public void testCountAndSumScore() throws Exception {