Skip to content

Commit

Permalink
Reduce duplication in taxonomy facets; always do counts (apache#12966)
Browse files Browse the repository at this point in the history
This is a large change, refactoring most of the taxonomy facets code and changing internal behaviour, without changing the API. There are specific API changes this sets us up to do later, e.g. retrieving counts from aggregation facets.

1. Move most of the responsibility from TaxonomyFacets implementations to TaxonomyFacets itself. This reduces code duplication and enables future development. Addresses genericity issue mentioned in apache#12553.
2. As a consequence, introduce sparse values to FloatTaxonomyFacets, which previously used dense values always. This issue is part of apache#12576.
3. Compute counts for all taxonomy facets always, which enables us to add an API to retrieve counts for association facets in the future. Addresses apache#11282.
4. As a consequence of having counts, we can check whether we encountered a label while faceting (count > 0), while previously we relied on the aggregation value to be positive. Closes apache#12585.
5. Introduce the idea of doing multiple aggregations in one go, with association facets doing the aggregation they were already doing, plus a count. We can extend to an arbitrary number of aggregations, as suggested in apache#12546.
6. Don't change the API. The only change in behaviour users should notice is the fix for non-positive aggregation values, which were previously discarded.
7. Add tests which were missing for sparse/dense values and non-positive aggregations.
  • Loading branch information
stefanvodita committed May 10, 2024
1 parent b888652 commit 4130c72
Show file tree
Hide file tree
Showing 14 changed files with 940 additions and 889 deletions.
5 changes: 5 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,9 @@ Improvements
* GITHUB#13202: Early terminate graph and exact searches of AbstractKnnVectorQuery to follow timeout set from
IndexSearcher#setTimeout(QueryTimeout). (Kaival Parikh)

* GITHUB#12966: Move most of the responsibility from TaxonomyFacets implementations to TaxonomyFacets itself.
This reduces code duplication and enables future development. (Stefan Vodita)

Optimizations
---------------------

Expand Down Expand Up @@ -126,6 +129,8 @@ Bug Fixes

* GITHUB#13206: Subtract deleted file size from the cache size of NRTCachingDirectory. (Jean-François Boeuf)

* GITHUB#12966: Aggregation facets no longer assume that aggregation values are positive. (Stefan Vodita)

Build
---------------------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ public FacetResult getTopChildren(int topN, String dim, String... path) throws I

topN = Math.min(topN, cardinality);
TopOrdAndIntQueue q = null;
TopOrdAndIntQueue.OrdAndValue reuse = null;
TopOrdAndIntQueue.OrdAndInt reuse = null;
int bottomCount = 0;
int bottomOrd = Integer.MAX_VALUE;
int childCount = 0; // total number of labels with non-zero count
Expand All @@ -191,18 +191,18 @@ public FacetResult getTopChildren(int topN, String dim, String... path) throws I
int ord = cursor.key;
int count = cursor.value;
if (count > bottomCount || (count == bottomCount && ord < bottomOrd)) {
if (reuse == null) {
reuse = new TopOrdAndIntQueue.OrdAndValue();
}
reuse.ord = ord;
reuse.value = count;
if (q == null) {
// Lazy init for sparse case:
q = new TopOrdAndIntQueue(topN);
}
reuse = q.insertWithOverflow(reuse);
if (reuse == null) {
reuse = (TopOrdAndIntQueue.OrdAndInt) q.newOrdAndValue();
}
reuse.ord = ord;
reuse.value = count;
reuse = (TopOrdAndIntQueue.OrdAndInt) q.insertWithOverflow(reuse);
if (q.size() == topN) {
bottomCount = q.top().value;
bottomCount = ((TopOrdAndIntQueue.OrdAndInt) q.top()).value;
bottomOrd = q.top().ord;
}
}
Expand All @@ -213,18 +213,18 @@ public FacetResult getTopChildren(int topN, String dim, String... path) throws I
if (count != 0) {
childCount++;
if (count > bottomCount || (count == bottomCount && i < bottomOrd)) {
if (reuse == null) {
reuse = new TopOrdAndIntQueue.OrdAndValue();
}
reuse.ord = i;
reuse.value = count;
if (q == null) {
// Lazy init for sparse case:
q = new TopOrdAndIntQueue(topN);
}
reuse = q.insertWithOverflow(reuse);
if (reuse == null) {
reuse = (TopOrdAndIntQueue.OrdAndInt) q.newOrdAndValue();
}
reuse.ord = i;
reuse.value = count;
reuse = (TopOrdAndIntQueue.OrdAndInt) q.insertWithOverflow(reuse);
if (q.size() == topN) {
bottomCount = q.top().value;
bottomCount = ((TopOrdAndIntQueue.OrdAndInt) q.top()).value;
bottomOrd = q.top().ord;
}
}
Expand All @@ -235,7 +235,7 @@ public FacetResult getTopChildren(int topN, String dim, String... path) throws I
int resultCount = q == null ? 0 : q.size();
LabelAndValue[] labelValues = new LabelAndValue[resultCount];
for (int i = labelValues.length - 1; i >= 0; i--) {
TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop();
TopOrdAndIntQueue.OrdAndInt ordAndValue = (TopOrdAndIntQueue.OrdAndInt) q.pop();
final BytesRef term = docValues.lookupOrd(ordAndValue.ord);
labelValues[i] = new LabelAndValue(term.utf8ToString(), ordAndValue.value);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,37 +16,42 @@
*/
package org.apache.lucene.facet;

import org.apache.lucene.util.PriorityQueue;
/** Keeps highest results, first by largest float value, then tie-break by smallest ord. */
public class TopOrdAndFloatQueue extends TopOrdAndNumberQueue {

/** Keeps highest results, first by largest float value, then tie break by smallest ord. */
public class TopOrdAndFloatQueue extends PriorityQueue<TopOrdAndFloatQueue.OrdAndValue> {

/** Holds a single entry. */
public static final class OrdAndValue {

/** Ordinal of the entry. */
public int ord;
/** Sole constructor. */
public TopOrdAndFloatQueue(int topN) {
super(topN);
}

/** Value associated with the ordinal. */
/** Holds an ordinal and a float value. */
public static final class OrdAndFloat extends OrdAndValue {
/** The value corresponding to the ordinal is a float. */
public float value;

/** Default constructor. */
public OrdAndValue() {}
}
public OrdAndFloat() {}

@Override
public boolean lessThan(OrdAndValue other) {
OrdAndFloat otherOrdAndFloat = (OrdAndFloat) other;
if (value < otherOrdAndFloat.value) {
return true;
}
if (value > otherOrdAndFloat.value) {
return false;
}
return ord > otherOrdAndFloat.ord;
}

/** Sole constructor. */
public TopOrdAndFloatQueue(int topN) {
super(topN);
@Override
public Number getValue() {
return value;
}
}

@Override
protected boolean lessThan(OrdAndValue a, OrdAndValue b) {
if (a.value < b.value) {
return true;
} else if (a.value > b.value) {
return false;
} else {
return a.ord > b.ord;
}
public OrdAndValue newOrdAndValue() {
return new OrdAndFloat();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,37 +16,42 @@
*/
package org.apache.lucene.facet;

import org.apache.lucene.util.PriorityQueue;
/** Keeps highest results, first by largest int value, then tie-break by smallest ord. */
public class TopOrdAndIntQueue extends TopOrdAndNumberQueue {

/** Keeps highest results, first by largest int value, then tie break by smallest ord. */
public class TopOrdAndIntQueue extends PriorityQueue<TopOrdAndIntQueue.OrdAndValue> {

/** Holds a single entry. */
public static final class OrdAndValue {

/** Ordinal of the entry. */
public int ord;
/** Sole constructor. */
public TopOrdAndIntQueue(int topN) {
super(topN);
}

/** Value associated with the ordinal. */
/** Holds an ordinal and an int value. */
public static final class OrdAndInt extends OrdAndValue {
/** The value corresponding to the ordinal is an int. */
public int value;

/** Default constructor. */
public OrdAndValue() {}
}
public OrdAndInt() {}

@Override
public boolean lessThan(OrdAndValue other) {
OrdAndInt otherOrdAndInt = (OrdAndInt) other;
if (value < otherOrdAndInt.value) {
return true;
}
if (value > otherOrdAndInt.value) {
return false;
}
return ord > otherOrdAndInt.ord;
}

/** Sole constructor. */
public TopOrdAndIntQueue(int topN) {
super(topN);
@Override
public Number getValue() {
return value;
}
}

@Override
protected boolean lessThan(OrdAndValue a, OrdAndValue b) {
if (a.value < b.value) {
return true;
} else if (a.value > b.value) {
return false;
} else {
return a.ord > b.ord;
}
public OrdAndValue newOrdAndValue() {
return new OrdAndInt();
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.facet;

import org.apache.lucene.util.PriorityQueue;

/** Keeps highest results, first by largest value, then tie-break by smallest ord. */
public abstract class TopOrdAndNumberQueue extends PriorityQueue<TopOrdAndNumberQueue.OrdAndValue> {

/** Holds a single entry. */
public abstract static class OrdAndValue {

/** Ordinal of the entry. */
public int ord;

/** Default constructor. */
public OrdAndValue() {}

/** Compare with another {@link OrdAndValue}. */
public abstract boolean lessThan(OrdAndValue other);

/** Get the value stored in this {@link OrdAndValue}. */
public abstract Number getValue();
}

/** Sole constructor. */
public TopOrdAndNumberQueue(int topN) {
super(topN);
}

@Override
public boolean lessThan(TopOrdAndNumberQueue.OrdAndValue a, TopOrdAndNumberQueue.OrdAndValue b) {
return a.lessThan(b);
}

/**
* Create a new {@link org.apache.lucene.facet.TopOrdAndNumberQueue.OrdAndValue} of the
* appropriate type.
*/
public abstract OrdAndValue newOrdAndValue();
}
Original file line number Diff line number Diff line change
Expand Up @@ -328,28 +328,28 @@ private TopChildrenForPath computeTopChildren(
int pathCount = 0;
int childCount = 0;

TopOrdAndIntQueue.OrdAndValue reuse = null;
TopOrdAndIntQueue.OrdAndInt reuse = null;
while (childOrds.hasNext()) {
int ord = childOrds.next();
int count = getCount(ord);
if (count > 0) {
pathCount += count;
childCount++;
if (count > bottomCount || (count == bottomCount && ord < bottomOrd)) {
if (reuse == null) {
reuse = new TopOrdAndIntQueue.OrdAndValue();
}
reuse.ord = ord;
reuse.value = count;
if (q == null) {
// Lazy init, so we don't create this for the
// sparse case unnecessarily
q = new TopOrdAndIntQueue(topN);
}
reuse = q.insertWithOverflow(reuse);
if (reuse == null) {
reuse = (TopOrdAndIntQueue.OrdAndInt) q.newOrdAndValue();
}
reuse.ord = ord;
reuse.value = count;
reuse = (TopOrdAndIntQueue.OrdAndInt) q.insertWithOverflow(reuse);
if (q.size() == topN) {
bottomCount = q.top().value;
bottomOrd = q.top().value;
bottomCount = ((TopOrdAndIntQueue.OrdAndInt) q.top()).value;
bottomOrd = q.top().ord;
}
}
}
Expand Down Expand Up @@ -397,7 +397,7 @@ private FacetResult createFacetResult(

LabelAndValue[] labelValues = new LabelAndValue[q.size()];
for (int i = labelValues.length - 1; i >= 0; i--) {
TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop();
TopOrdAndIntQueue.OrdAndInt ordAndValue = (TopOrdAndIntQueue.OrdAndInt) q.pop();
assert ordAndValue != null;
final BytesRef term = dv.lookupOrd(ordAndValue.ord);
String[] parts = FacetsConfig.stringToPath(term.utf8ToString());
Expand Down

0 comments on commit 4130c72

Please sign in to comment.