Skip to content

Commit

Permalink
[OGR provider] Optimize feature requests with OrderBy clauses on a Ge…
Browse files Browse the repository at this point in the history
…oPackage dataset

(fixes qgis#53198)

OrderBy clauses are translated into a
"SELECT ... FROM ... [WHERE ...] ORDER BY ... [LIMIT x]" SQL statement
evaluated by GDAL

On a 3.2 million feature GeoPackage, this decreases the RAM consumption
from 6 GB to 0.5 GB, and is 50% faster. If a limit clause is added with
a small number, then the speed-up is massive.
  • Loading branch information
rouault committed May 29, 2023
1 parent 24ae7a9 commit f5cdc9d
Show file tree
Hide file tree
Showing 3 changed files with 195 additions and 2 deletions.
138 changes: 137 additions & 1 deletion src/core/providers/ogr/qgsogrfeatureiterator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@
#include "qgssymbol.h"
#include "qgsgeometryengine.h"
#include "qgsdbquerylog.h"
#include "qgssettings.h"

#include <sqlite3.h>

#include <QTextCodec>
#include <QFile>
Expand Down Expand Up @@ -142,6 +145,132 @@ QgsOgrFeatureIterator::QgsOgrFeatureIterator( QgsOgrFeatureSource *source, bool
!( mRequest.flags() & QgsFeatureRequest::NoGeometry ) ||
( mSource->mOgrGeometryTypeFilter != wkbUnknown );

bool filterExpressionAlreadyTakenIntoAccount = false;

#if SQLITE_VERSION_NUMBER >= 3030000L
// For GeoPackage, try to compile order by expressions as a SQL statement
// Only SQLite >= 3.30.0 supports NULLS FIRST / NULLS LAST
// A potential further optimization would be to translate mFilterRect
// as a JOIN with the GPKG RTree when it exists, instead of having just OGR
// evaluating it as a post processing.
const auto constOrderBy = request.orderBy();
if ( !constOrderBy.isEmpty() &&
source->mDriverName == QLatin1String( "GPKG" ) &&
( request.filterType() == QgsFeatureRequest::FilterNone ||
request.filterType() == QgsFeatureRequest::FilterExpression ) &&
( mSource->mSubsetString.isEmpty() ||
!mSource->mSubsetString.startsWith( QLatin1String( "SELECT " ), Qt::CaseInsensitive ) ) )
{
QByteArray sql = QByteArray( "SELECT " );
for ( int i = 0; i < source->mFields.size(); ++i )
{
if ( i > 0 )
sql += QByteArray( ", " );
sql += QgsOgrProviderUtils::quotedIdentifier( source->mFields[i].name().toUtf8(), source->mDriverName );
}
if ( strcmp( OGR_L_GetGeometryColumn( mOgrLayer ), "" ) != 0 )
{
sql += QByteArray( ", " );
sql += QgsOgrProviderUtils::quotedIdentifier( OGR_L_GetGeometryColumn( mOgrLayer ), source->mDriverName );
}
sql += QByteArray( " FROM " );
sql += QgsOgrProviderUtils::quotedIdentifier( OGR_L_GetName( mOgrLayer ), source->mDriverName );

if ( request.filterType() == QgsFeatureRequest::FilterExpression )
{
QgsSQLiteExpressionCompiler compiler(
source->mFields,
request.flags() & QgsFeatureRequest::IgnoreStaticNodesDuringExpressionCompilation );
QgsSqlExpressionCompiler::Result result = compiler.compile( request.filterExpression() );
if ( result == QgsSqlExpressionCompiler::Complete || result == QgsSqlExpressionCompiler::Partial )
{
QString whereClause = compiler.result();
sql += QByteArray( " WHERE " );
if ( mSource->mSubsetString.isEmpty() )
{
sql += whereClause.toUtf8();
}
else
{
sql += QByteArray( "(" );
sql += mSource->mSubsetString.toUtf8();
sql += QByteArray( ") AND (" );
sql += whereClause.toUtf8();
sql += QByteArray( ")" );
}
mExpressionCompiled = ( result == QgsSqlExpressionCompiler::Complete );
mCompileStatus = ( mExpressionCompiled ? Compiled : PartiallyCompiled );
}
else
{
sql.clear();
}
}
else if ( !mSource->mSubsetString.isEmpty() )
{
sql += QByteArray( " WHERE " );
sql += mSource->mSubsetString.toUtf8();
}

if ( !sql.isEmpty() )
{
sql += QByteArray( " ORDER BY " );
bool firstOrderBy = true;
for ( const QgsFeatureRequest::OrderByClause &clause : constOrderBy )
{
QgsExpression expression = clause.expression();
QgsSQLiteExpressionCompiler compiler(
source->mFields, request.flags() & QgsFeatureRequest::IgnoreStaticNodesDuringExpressionCompilation );
QgsSqlExpressionCompiler::Result result = compiler.compile( &expression );
if ( result == QgsSqlExpressionCompiler::Complete &&
expression.rootNode()->nodeType() == QgsExpressionNode::ntColumnRef )
{
if ( !firstOrderBy )
sql += QByteArray( ", " );
sql += compiler.result().toUtf8();
sql += QByteArray( " COLLATE NOCASE" );
sql += clause.ascending() ? QByteArray( " ASC" ) : QByteArray( " DESC" );
if ( clause.nullsFirst() )
sql += QByteArray( " NULLS FIRST" );
else
sql += QByteArray( " NULLS LAST" );
}
else
{
sql.clear();
break;
}
firstOrderBy = false;
}
}

if ( !sql.isEmpty() )
{
mOrderByCompiled = true;

if ( mOrderByCompiled && request.limit() >= 0 )
{
sql += QByteArray( " LIMIT " );
sql += QString::number( request.limit() ).toUtf8();
}
QgsDebugMsgLevel( QStringLiteral( "Using optimized orderBy as: %1" ).arg( QString::fromUtf8( sql ) ), 4 );
filterExpressionAlreadyTakenIntoAccount = true;
if ( mOgrLayerOri && mOgrLayer != mOgrLayerOri )
{
GDALDatasetReleaseResultSet( mConn->ds, mOgrLayer );
mOgrLayer = mOgrLayerOri;
}
mOgrLayerOri = mOgrLayer;
mOgrLayer = QgsOgrProviderUtils::setSubsetString( mOgrLayer, mConn->ds, mSource->mEncoding, sql );
if ( !mOgrLayer )
{
close();
return;
}
}
}
#endif

QgsAttributeList attrs = ( mRequest.flags() & QgsFeatureRequest::SubsetOfAttributes ) ? mRequest.subsetOfAttributes() : mSource->mFields.allAttributesList();

// ensure that all attributes required for expression filter are being fetched
Expand Down Expand Up @@ -218,7 +347,7 @@ QgsOgrFeatureIterator::QgsOgrFeatureIterator( QgsOgrFeatureSource *source, bool
break;
}

if ( request.filterType() == QgsFeatureRequest::FilterExpression )
if ( request.filterType() == QgsFeatureRequest::FilterExpression && !filterExpressionAlreadyTakenIntoAccount )
{
QgsSqlExpressionCompiler *compiler = nullptr;
if ( source->mDriverName == QLatin1String( "SQLite" ) || source->mDriverName == QLatin1String( "GPKG" ) )
Expand Down Expand Up @@ -314,6 +443,13 @@ QgsOgrFeatureIterator::~QgsOgrFeatureIterator()
close();
}

bool QgsOgrFeatureIterator::prepareOrderBy( const QList<QgsFeatureRequest::OrderByClause> &orderBys )
{
Q_UNUSED( orderBys )
// Preparation has already been done in the constructor, so we just communicate the result
return mOrderByCompiled;
}

bool QgsOgrFeatureIterator::nextFeatureFilterExpression( QgsFeature &f )
{
if ( !mExpressionCompiled )
Expand Down
3 changes: 3 additions & 0 deletions src/core/providers/ogr/qgsogrfeatureiterator.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,8 @@ class QgsOgrFeatureIterator final: public QgsAbstractFeatureIteratorFromSource<Q
bool fetchFeature( QgsFeature &feature ) override;
bool nextFeatureFilterExpression( QgsFeature &f ) override;

bool prepareOrderBy( const QList<QgsFeatureRequest::OrderByClause> &orderBys ) override;

private:

bool readFeature( const gdal::ogr_feature_unique_ptr &fet, QgsFeature &feature ) const;
Expand All @@ -101,6 +103,7 @@ class QgsOgrFeatureIterator final: public QgsAbstractFeatureIteratorFromSource<Q
bool mFetchGeometry = false;

bool mExpressionCompiled = false;
bool mOrderByCompiled = false;
// use std::set to get sorted ids (needed for efficient QgsFeatureRequest::FilterFids requests on OSM datasource)
std::set<QgsFeatureId> mFilterFids;
std::set<QgsFeatureId>::iterator mFilterFidsIt;
Expand Down
56 changes: 55 additions & 1 deletion tests/src/python/test_provider_ogr_gpkg.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@
from qgis.utils import spatialite_connect

from providertestbase import ProviderTestCase
from utilities import unitTestDataPath
from utilities import compareWkt, unitTestDataPath

TEST_DATA_DIR = unitTestDataPath()

Expand Down Expand Up @@ -229,6 +229,60 @@ def partiallyCompiledFilters(self):
'name LIKE \'Ap\\_le\''
}

def testOrderByCompiled(self):
self.runOrderByTests()

# Below checks test particular aspects of the ORDER BY optimization for
# Geopackage

# Test orderBy + filter expression
request = QgsFeatureRequest().setFilterExpression('"cnt">=200').addOrderBy('num_char')
values = [f['pk'] for f in self.source.getFeatures(request)]
self.assertEqual(values, [2, 3, 4])
values = [f.geometry().asWkt() for f in self.source.getFeatures(request)]
# Check that we get geometries
assert compareWkt(values[0], "Point (-68.2 70.8)"), values[0]

# Test orderBy + subset string
request = QgsFeatureRequest().addOrderBy('num_char')
self.source.setSubsetString("cnt >= 200")
values = [f['pk'] for f in self.source.getFeatures(request)]
self.source.setSubsetString(None)
self.assertEqual(values, [2, 3, 4])

# Test orderBy + subset string + filter expression
request = QgsFeatureRequest().setFilterExpression('"cnt"<=300').addOrderBy('num_char')
self.source.setSubsetString("cnt >= 200")
values = [f['pk'] for f in self.source.getFeatures(request)]
self.source.setSubsetString(None)
self.assertEqual(values, [2, 3])

# Test orderBy + extent
# Note that currently the spatial filter will not use the spatial index
# (probably too tricky to implement on the OGR side since it would need
# to analyze the SQL SELECT, but QGIS could probably add the JOIN with
# the RTree)
extent = QgsRectangle(-70, 67, -60, 80)
request = QgsFeatureRequest().setFilterRect(extent).addOrderBy('num_char')
values = [f['pk'] for f in self.source.getFeatures(request)]
self.assertEqual(values, [2, 4])

# Test orderBy + subset string which is a SELECT
# (excluded by the optimization)
# For some weird reason, we need to re-open a new connection to the
# dataset (this weird behavior predates the optimization)
request = QgsFeatureRequest().addOrderBy('num_char')
tmpdir = tempfile.mkdtemp()
self.dirs_to_cleanup.append(tmpdir)
srcpath = os.path.join(TEST_DATA_DIR, 'provider')
shutil.copy(os.path.join(srcpath, 'geopackage.gpkg'), tmpdir)
datasource = os.path.join(tmpdir, 'geopackage.gpkg')
vl = QgsVectorLayer(datasource, 'test', 'ogr')
vl.setSubsetString("SELECT * FROM \"geopackage\" WHERE cnt >= 200")
values = [f['pk'] for f in vl.getFeatures(request)]
self.assertEqual(values, [2, 3, 4])
del vl


class ErrorReceiver():

Expand Down

0 comments on commit f5cdc9d

Please sign in to comment.