Skip to content
Permalink
Browse files
[optimization][expressions] Simplify "CASE WHEN..." expressions
during preparation wherever possible

In many situations we are able to optimize a case when expression
and replace it with a simpler expression node during the preparation
stage. Specifically, if the WHEN conditions are known to be
static values (such as those coming from certain expression context
variables) then we can often replace the whole condition node
with the THEN node of the first static true condition.

E.g.

    CASE
        WHEN @variable=1 THEN "first_field"
        WHEN @variable=2 THEN "second_field"
        ELSE "third_field"
    END

If @variable is static and '1', then the whole expression node will ALWAYS
be identical to "first_field". Similarly, if @variable='2', then the
whole expression will ALWAYS be "second_field".

If we're able to apply this optimization, then we use the simplified
effective node which represents the whole node during evaluation
time and save a bunch of unnecessary work.

TODO: If we use the effective node during expression compilation
for providers we would be able to handoff more expressions involving
QGIS-side variables and other components to the backend, resulting
in increased use of backend provider indices, etc....
  • Loading branch information
nyalldawson committed Jun 9, 2021
1 parent 1bcb09a commit 0a4b9a62cf52c54b82b2b487320faa3210e20190
@@ -278,10 +278,28 @@ Returns the node's static cached value. Only valid if :py:func:`~QgsExpressionNo
.. seealso:: :py:func:`hasCachedStaticValue`

.. versionadded:: 3.18
%End

const QgsExpressionNode *effectiveNode() const;
%Docstring
Returns a reference to the simplest node which represents this node,
after any compilation optimizations have been applied.

Eg. a node like "CASE WHEN true THEN "some_field" WHEN other condition THEN ... END" can effectively
be replaced entirely by a :py:class:`QgsExpressionNodeColumnRef` referencing the "some_field" field, as the
CASE WHEN ... will ALWAYS evaluate to "some_field".

Returns a reference to the current object if no optimizations were applied.

.. versionadded:: 3.20
%End

protected:

QgsExpressionNode();

QgsExpressionNode( const QgsExpressionNode &other );



};
@@ -23,6 +23,10 @@ QVariant QgsExpressionNode::eval( QgsExpression *parent, const QgsExpressionCont
{
return mCachedStaticValue;
}
else if ( mCompiledSimplifiedNode )
{
return mCompiledSimplifiedNode->eval( parent, context );
}
else
{
QVariant res = evalNode( parent, context );
@@ -33,6 +37,7 @@ QVariant QgsExpressionNode::eval( QgsExpression *parent, const QgsExpressionCont
bool QgsExpressionNode::prepare( QgsExpression *parent, const QgsExpressionContext *context )
{
mHasCachedValue = false;
mCompiledSimplifiedNode.reset();
if ( isStatic( parent, context ) )
{
// some calls to isStatic already evaluate the node to a cached value, so if that's
@@ -51,10 +56,36 @@ bool QgsExpressionNode::prepare( QgsExpression *parent, const QgsExpressionConte
}
}

/**
 * Copy constructor.
 *
 * Copies the parser location, the cached static value state and, when \a other
 * holds one, a deep copy (made through clone()) of its compiled simplified node.
 */
QgsExpressionNode::QgsExpressionNode( const QgsExpressionNode &other )
  : parserFirstLine( other.parserFirstLine )
  , parserFirstColumn( other.parserFirstColumn )
  , parserLastLine( other.parserLastLine )
  , parserLastColumn( other.parserLastColumn )
  , mHasCachedValue( other.mHasCachedValue )
  , mCachedStaticValue( other.mCachedStaticValue )
{
  // The simplified node is held in a std::unique_ptr, so it cannot be shared
  // between nodes -- take our own copy of it whenever the source has one.
  if ( other.mCompiledSimplifiedNode )
    mCompiledSimplifiedNode.reset( other.mCompiledSimplifiedNode->clone() );
}

/**
 * Copy assignment operator.
 *
 * Copies the parser location and the cached static value state from \a other, and
 * replaces this node's compiled simplified node with a deep copy (made through
 * clone()) of the one held by \a other, or clears it if \a other has none.
 *
 * \returns a reference to this node
 */
QgsExpressionNode &QgsExpressionNode::operator=( const QgsExpressionNode &other )
{
  // Self-assignment would otherwise deep clone our own simplified node only to
  // immediately destroy the original -- nothing needs to change, so bail out early.
  if ( &other == this )
    return *this;

  parserFirstLine = other.parserFirstLine;
  parserFirstColumn = other.parserFirstColumn;
  parserLastLine = other.parserLastLine;
  parserLastColumn = other.parserLastColumn;
  mHasCachedValue = other.mHasCachedValue;
  mCachedStaticValue = other.mCachedStaticValue;
  mCompiledSimplifiedNode.reset( other.mCompiledSimplifiedNode ? other.mCompiledSimplifiedNode->clone() : nullptr );
  return *this;
}

void QgsExpressionNode::cloneTo( QgsExpressionNode *target ) const
{
target->mHasCachedValue = mHasCachedValue;
target->mCachedStaticValue = mCachedStaticValue;
if ( mCompiledSimplifiedNode )
target->mCompiledSimplifiedNode.reset( mCompiledSimplifiedNode->clone() );
target->parserLastColumn = parserLastColumn;
target->parserLastLine = parserLastLine;
target->parserFirstColumn = parserFirstColumn;
@@ -329,8 +329,27 @@ class CORE_EXPORT QgsExpressionNode SIP_ABSTRACT
*/
QVariant cachedStaticValue() const { return mCachedStaticValue; }

/**
 * Returns the simplest node which can stand in for this node, after any
 * compilation optimizations have been applied.
 *
 * For example, a node like "CASE WHEN true THEN "some_field" WHEN other condition THEN ... END" can
 * effectively be replaced entirely by a QgsExpressionNodeColumnRef referencing the "some_field" field,
 * as the CASE WHEN ... will ALWAYS evaluate to "some_field".
 *
 * Returns a pointer to the current object if no optimizations were applied.
 *
 * \since QGIS 3.20
 */
const QgsExpressionNode *effectiveNode() const
{
  if ( mCompiledSimplifiedNode )
    return mCompiledSimplifiedNode.get();

  return this;
}

protected:

QgsExpressionNode() = default;

QgsExpressionNode( const QgsExpressionNode &other );
QgsExpressionNode &operator=( const QgsExpressionNode &other );

/**
* Copies the members of this node to the node provided in \a target.
* Needs to be called by all subclasses as part of their clone() implementation.
@@ -357,6 +376,18 @@ class CORE_EXPORT QgsExpressionNode SIP_ABSTRACT
* \since QGIS 3.20
*/
mutable QVariant mCachedStaticValue;

/**
* Contains a compiled node which represents a simplified version of this node
* as a result of compilation optimizations.
*
* Eg. a node like "CASE WHEN true THEN "some_field" WHEN other condition THEN ... END" can effectively
* be replaced entirely by a QgsExpressionNodeColumnRef referencing the "some_field" field, as the
* CASE WHEN ... will ALWAYS evaluate to "some_field".
*
* \since QGIS 3.20
*/
mutable std::unique_ptr< QgsExpressionNode > mCompiledSimplifiedNode;
#endif

private:
@@ -1572,17 +1572,60 @@ QVariant QgsExpressionNodeCondition::evalNode( QgsExpression *parent, const QgsE

/**
 * Prepares all WHEN/THEN (and ELSE) sub-expressions of this CASE node, and tries to
 * collapse the whole node into something simpler.
 *
 * While every WHEN clause seen so far has a cached static value, the branch which will be
 * picked at evaluation time is already known:
 *
 * - the first WHEN clause whose static value is True selects its THEN expression
 * - if all WHEN clauses are static and none is True, the ELSE expression (if any) is selected
 *
 * When the selected expression itself has a cached static value, that value becomes the cached
 * static value of this node. Otherwise a clone of the selected expression's effective node is
 * stored as this node's compiled simplified node.
 *
 * Conditions following a selected WHEN clause are not prepared, as the function returns
 * as soon as the branch is known.
 *
 * \param parent expression owning this node, forwarded to the sub-expressions
 * \param context expression context used for preparation, forwarded to the sub-expressions
 * \returns FALSE if preparing any visited sub-expression failed, TRUE otherwise
 */
bool QgsExpressionNodeCondition::prepareNode( QgsExpression *parent, const QgsExpressionContext *context )
{
  bool foundAnyNonStaticConditions = false;
  for ( WhenThen *cond : std::as_const( mConditions ) )
  {
    // short-circuit: there is no point preparing the THEN clause if the WHEN clause failed
    const bool res = cond->mWhenExp->prepare( parent, context )
                     && cond->mThenExp->prepare( parent, context );
    if ( !res )
      return false;

    foundAnyNonStaticConditions |= !cond->mWhenExp->hasCachedStaticValue();
    if ( !foundAnyNonStaticConditions && QgsExpressionUtils::getTVLValue( cond->mWhenExp->cachedStaticValue(), parent ) == QgsExpressionUtils::True )
    {
      // ok, we know that we'll ALWAYS be picking the same condition, as the "WHEN" clause for this condition (and all previous conditions) is a static
      // value, and the static value for this WHEN clause is True.
      if ( cond->mThenExp->hasCachedStaticValue() )
      {
        // the "THEN" clause ALSO has a static value, so we can replace the whole node with a static value
        mCachedStaticValue = cond->mThenExp->cachedStaticValue();
        mHasCachedValue = true;
        return true;
      }
      else
      {
        // we know at least that we'll ALWAYS be picking the same condition, so even though the THEN node is non-static we can effectively replace
        // this whole QgsExpressionNodeCondition node with just the THEN node for this condition.
        mCompiledSimplifiedNode.reset( cond->mThenExp->effectiveNode()->clone() );
        return true;
      }
    }
  }

  if ( mElseExp )
  {
    const bool res = mElseExp->prepare( parent, context );
    if ( !res )
      return false;

    if ( !foundAnyNonStaticConditions )
    {
      // all condition nodes are static conditions and not TRUE, so we know we'll ALWAYS be picking the ELSE node
      if ( mElseExp->hasCachedStaticValue() )
      {
        mCachedStaticValue = mElseExp->cachedStaticValue();
        mHasCachedValue = true;
        return true;
      }
      else
      {
        // so even though the ELSE node is non-static we can effectively replace
        // this whole QgsExpressionNodeCondition node with just the ELSE node for this condition.
        mCompiledSimplifiedNode.reset( mElseExp->effectiveNode()->clone() );
        return true;
      }
    }
  }

  return true;
}
@@ -4336,6 +4336,112 @@ class TestQgsExpression: public QObject
QVERIFY( !exp.rootNode()->hasCachedStaticValue() );
}

// Checks which "CASE WHEN ..." expressions get simplified during prepare():
// either not at all (effective node stays ntCondition), down to a cached static
// value, or down to an effective node taken from the selected THEN/ELSE branch.
// Each case also evaluates the expression against the test feature to confirm
// the result is unchanged by the simplification.
void testPrecomputedNodesReplacedWithEffectiveNodes()
{
QgsFields fields;
fields.append( QgsField( QStringLiteral( "first_field" ), QVariant::Int ) );
fields.append( QgsField( QStringLiteral( "second_field" ), QVariant::Int ) );
fields.append( QgsField( QStringLiteral( "third_field" ), QVariant::Int ) );

// test feature: first_field = 11, second_field = 20, third_field = 300
QgsFeature f( fields );
f.setAttributes( QgsAttributes() << 11 << 20 << 300 );

QgsExpressionContext context;
context.setFields( fields );
context.setFeature( f );

// nothing we can do to optimize this expression
QgsExpression exp( QStringLiteral( "CASE WHEN \"first_field\" = 5 then \"second_field\" when \"first_field\" = 6 then \"second_field\" * 2 else \"second_field\" * 3 end" ) );
QVERIFY( exp.prepare( &context ) );
QVERIFY( !exp.rootNode()->hasCachedStaticValue() );
QCOMPARE( exp.rootNode()->effectiveNode()->nodeType(), QgsExpressionNode::NodeType::ntCondition );
QCOMPARE( exp.evaluate( &context ).toInt(), 60 );

// same shape, but here the second (field dependent) condition matches the test feature -- still not optimizable
exp = QgsExpression( QStringLiteral( "CASE WHEN \"first_field\" = 5 then \"second_field\" when \"first_field\" = 11 then \"second_field\" * 2 else 77 end" ) );
QVERIFY( exp.prepare( &context ) );
QVERIFY( !exp.rootNode()->hasCachedStaticValue() );
QCOMPARE( exp.rootNode()->effectiveNode()->nodeType(), QgsExpressionNode::NodeType::ntCondition );
QCOMPARE( exp.evaluate( &context ).toInt(), 40 );

// slightly more complex expression
exp = QgsExpression( QStringLiteral( "CASE WHEN (upper(\"first_field\") = 'AA') then \"second_field\" when \"first_field\" = 11 then \"second_field\" * 2 else \"second_field\" * 3 end" ) );
QVERIFY( exp.prepare( &context ) );
QVERIFY( !exp.rootNode()->hasCachedStaticValue() );
QCOMPARE( exp.rootNode()->effectiveNode()->nodeType(), QgsExpressionNode::NodeType::ntCondition );
QCOMPARE( exp.evaluate( &context ).toInt(), 40 );

// first condition is non-static, second is static... still nothing we can do to optimize this one
exp = QgsExpression( QStringLiteral( "CASE WHEN (upper(\"first_field\") = 'AA') then \"second_field\" when 3 * 2 = 6 then \"second_field\" * 2 else \"second_field\" * 3 end" ) );
QVERIFY( exp.prepare( &context ) );
QVERIFY( !exp.rootNode()->hasCachedStaticValue() );
QCOMPARE( exp.rootNode()->effectiveNode()->nodeType(), QgsExpressionNode::NodeType::ntCondition );
QCOMPARE( exp.evaluate( &context ).toInt(), 40 );

// first condition is static but false, second condition is non static ... can't optimize
exp = QgsExpression( QStringLiteral( "CASE WHEN 3 * 2 = 7 then \"second_field\" when \"second_field\" = 'B' then \"second_field\" * 2 else \"second_field\" * 3 end" ) );
QVERIFY( exp.prepare( &context ) );
QVERIFY( !exp.rootNode()->hasCachedStaticValue() );
QCOMPARE( exp.rootNode()->effectiveNode()->nodeType(), QgsExpressionNode::NodeType::ntCondition );
QCOMPARE( exp.evaluate( &context ).toInt(), 60 );

// first condition is static but NULL, second condition is non static ... can't optimize
exp = QgsExpression( QStringLiteral( "CASE WHEN NULL then \"second_field\" when \"second_field\" = 'B' then \"second_field\" * 2 else \"second_field\" * 3 end" ) );
QVERIFY( exp.prepare( &context ) );
QVERIFY( !exp.rootNode()->hasCachedStaticValue() );
QCOMPARE( exp.rootNode()->effectiveNode()->nodeType(), QgsExpressionNode::NodeType::ntCondition );
QCOMPARE( exp.evaluate( &context ).toInt(), 60 );

// first condition is static AND true, and THEN expression for this node is static -- yay, we CAN optimize this down to a static value for the whole node
exp = QgsExpression( QStringLiteral( "CASE WHEN 3 * 2 = 6 then 7 + 4 when \"second_field\" = 'B' then \"second_field\" * 2 else \"second_field\" * 3 end" ) );
QVERIFY( exp.prepare( &context ) );
QVERIFY( exp.rootNode()->hasCachedStaticValue() );
QCOMPARE( exp.rootNode()->cachedStaticValue().toInt(), 11 );
// a cached static value does not set a simplified node, so the effective node is still the condition node itself
QCOMPARE( exp.rootNode()->effectiveNode()->nodeType(), QgsExpressionNode::NodeType::ntCondition );
QCOMPARE( exp.evaluate( &context ).toInt(), 11 );

// first condition is static AND true, but THEN expression is non-static -- yay, we CAN still optimize this, because we will ALWAYS be returning the evaluated
// value for the THEN clause of the first condition, so we can effectively replace the entire node with the THEN expression of the first condition
exp = QgsExpression( QStringLiteral( "CASE WHEN 3 * 2 = 6 then \"first_field\" when \"second_field\" = 'B' then \"second_field\" * 2 else \"second_field\" * 3 end" ) );
QVERIFY( exp.prepare( &context ) );
QVERIFY( !exp.rootNode()->hasCachedStaticValue() );
QCOMPARE( exp.rootNode()->effectiveNode()->nodeType(), QgsExpressionNode::NodeType::ntColumnRef );
QCOMPARE( qgis::down_cast< const QgsExpressionNodeColumnRef * >( exp.rootNode()->effectiveNode() )->name(), QStringLiteral( "first_field" ) );
QCOMPARE( exp.evaluate( &context ).toInt(), 11 );

// first condition is static AND false, second is static AND true, so we can effectively replace the entire node with the THEN expression of the second condition
exp = QgsExpression( QStringLiteral( "CASE WHEN 3 * 2 = 7 then \"first_field\" when 'B'='B' then \"second_field\" else \"third_field\" * 3 end" ) );
QVERIFY( exp.prepare( &context ) );
QVERIFY( !exp.rootNode()->hasCachedStaticValue() );
QCOMPARE( exp.rootNode()->effectiveNode()->nodeType(), QgsExpressionNode::NodeType::ntColumnRef );
QCOMPARE( qgis::down_cast< const QgsExpressionNodeColumnRef * >( exp.rootNode()->effectiveNode() )->name(), QStringLiteral( "second_field" ) );
QCOMPARE( exp.evaluate( &context ).toInt(), 20 );

// first two conditions are static AND false, so we can effectively replace the entire node with the ELSE expression
exp = QgsExpression( QStringLiteral( "CASE WHEN 3 * 2 = 7 then \"first_field\" when 'B'='C' then \"second_field\" else \"third_field\" end" ) );
QVERIFY( exp.prepare( &context ) );
QVERIFY( !exp.rootNode()->hasCachedStaticValue() );
QCOMPARE( exp.rootNode()->effectiveNode()->nodeType(), QgsExpressionNode::NodeType::ntColumnRef );
QCOMPARE( qgis::down_cast< const QgsExpressionNodeColumnRef * >( exp.rootNode()->effectiveNode() )->name(), QStringLiteral( "third_field" ) );
QCOMPARE( exp.evaluate( &context ).toInt(), 300 );

// slightly more complex -- second condition is static and TRUE, but uses a more complicated THEN node
exp = QgsExpression( QStringLiteral( "CASE WHEN 3 * 2 = 7 then \"first_field\" when 'B'='B' then upper(\"second_field\") || \"first_field\" else \"third_field\" * 3 end" ) );
QVERIFY( exp.prepare( &context ) );
QVERIFY( !exp.rootNode()->hasCachedStaticValue() );
QCOMPARE( exp.rootNode()->effectiveNode()->nodeType(), QgsExpressionNode::NodeType::ntBinaryOperator );
QCOMPARE( qgis::down_cast< const QgsExpressionNodeBinaryOperator * >( exp.rootNode()->effectiveNode() )->opLeft()->nodeType(), QgsExpressionNode::NodeType::ntFunction );
QCOMPARE( qgis::down_cast< const QgsExpressionNodeBinaryOperator * >( exp.rootNode()->effectiveNode() )->opRight()->nodeType(), QgsExpressionNode::NodeType::ntColumnRef );
// second_field (20) concatenated with first_field (11)
QCOMPARE( exp.evaluate( &context ).toInt(), 2011 );

// EVEN more complex -- second condition is static and TRUE, and uses a nested CASE as the THEN node
// the whole root node can be replaced by a column ref to "second_field"
exp = QgsExpression( QStringLiteral( "CASE WHEN 3 * 2 = 7 then \"first_field\" when 'B'='B' then ( CASE WHEN 3*3=11 then \"first_field\" ELSE \"second_field\" END) else \"third_field\" end" ) );
QVERIFY( exp.prepare( &context ) );
QVERIFY( !exp.rootNode()->hasCachedStaticValue() );
QCOMPARE( exp.rootNode()->effectiveNode()->nodeType(), QgsExpressionNode::NodeType::ntColumnRef );
QCOMPARE( qgis::down_cast< const QgsExpressionNodeColumnRef * >( exp.rootNode()->effectiveNode() )->name(), QStringLiteral( "second_field" ) );
QCOMPARE( exp.evaluate( &context ).toInt(), 20 );
}
};

QGSTEST_MAIN( TestQgsExpression )

0 comments on commit 0a4b9a6

Please sign in to comment.