From f5cdc9d1cd703618f40c1dfecab1bf095b8af09b Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Mon, 29 May 2023 21:58:15 +0200 Subject: [PATCH] [OGR provider] Optimize feature requests with OrderBy clauses on a GeoPackage dataset (fixes #53198) OrderBy clauses are translated into a "SELECT ... FROM ... [WHERE ...] ORDER BY ... [LIMIT x]" SQL statement evaluated by GDAL On a 3.2 million feature GeoPackage, this decreases the RAM consumption from 6 GB to 0.5 GB, and is 50% faster. If a limit clause is added with a small number, then the speed-up is massive. --- .../providers/ogr/qgsogrfeatureiterator.cpp | 138 +++++++++++++++++- .../providers/ogr/qgsogrfeatureiterator.h | 3 + tests/src/python/test_provider_ogr_gpkg.py | 56 ++++++- 3 files changed, 195 insertions(+), 2 deletions(-) diff --git a/src/core/providers/ogr/qgsogrfeatureiterator.cpp b/src/core/providers/ogr/qgsogrfeatureiterator.cpp index 1637f5fafa79..6ab90660f616 100644 --- a/src/core/providers/ogr/qgsogrfeatureiterator.cpp +++ b/src/core/providers/ogr/qgsogrfeatureiterator.cpp @@ -27,6 +27,9 @@ #include "qgssymbol.h" #include "qgsgeometryengine.h" #include "qgsdbquerylog.h" +#include "qgssettings.h" + +#include #include #include @@ -142,6 +145,132 @@ QgsOgrFeatureIterator::QgsOgrFeatureIterator( QgsOgrFeatureSource *source, bool !( mRequest.flags() & QgsFeatureRequest::NoGeometry ) || ( mSource->mOgrGeometryTypeFilter != wkbUnknown ); + bool filterExpressionAlreadyTakenIntoAccount = false; + +#if SQLITE_VERSION_NUMBER >= 3030000L + // For GeoPackage, try to compile order by expressions as a SQL statement + // Only SQLite >= 3.30.0 supports NULLS FIRST / NULLS LAST + // A potential further optimization would be to translate mFilterRect + // as a JOIN with the GPKG RTree when it exists, instead of having just OGR + // evaluating it as a post processing. + const auto constOrderBy = request.orderBy(); + if ( !constOrderBy.isEmpty() && + source->mDriverName == QLatin1String( "GPKG" ) && + ( request.filterType() == QgsFeatureRequest::FilterNone || + request.filterType() == QgsFeatureRequest::FilterExpression ) && + ( mSource->mSubsetString.isEmpty() || + !mSource->mSubsetString.startsWith( QLatin1String( "SELECT " ), Qt::CaseInsensitive ) ) ) + { + QByteArray sql = QByteArray( "SELECT " ); + for ( int i = 0; i < source->mFields.size(); ++i ) + { + if ( i > 0 ) + sql += QByteArray( ", " ); + sql += QgsOgrProviderUtils::quotedIdentifier( source->mFields[i].name().toUtf8(), source->mDriverName ); + } + if ( strcmp( OGR_L_GetGeometryColumn( mOgrLayer ), "" ) != 0 ) + { + sql += QByteArray( ", " ); + sql += QgsOgrProviderUtils::quotedIdentifier( OGR_L_GetGeometryColumn( mOgrLayer ), source->mDriverName ); + } + sql += QByteArray( " FROM " ); + sql += QgsOgrProviderUtils::quotedIdentifier( OGR_L_GetName( mOgrLayer ), source->mDriverName ); + + if ( request.filterType() == QgsFeatureRequest::FilterExpression ) + { + QgsSQLiteExpressionCompiler compiler( + source->mFields, + request.flags() & QgsFeatureRequest::IgnoreStaticNodesDuringExpressionCompilation ); + QgsSqlExpressionCompiler::Result result = compiler.compile( request.filterExpression() ); + if ( result == QgsSqlExpressionCompiler::Complete || result == QgsSqlExpressionCompiler::Partial ) + { + QString whereClause = compiler.result(); + sql += QByteArray( " WHERE " ); + if ( mSource->mSubsetString.isEmpty() ) + { + sql += whereClause.toUtf8(); + } + else + { + sql += QByteArray( "(" ); + sql += mSource->mSubsetString.toUtf8(); + sql += QByteArray( ") AND (" ); + sql += whereClause.toUtf8(); + sql += QByteArray( ")" ); + } + mExpressionCompiled = ( result == QgsSqlExpressionCompiler::Complete ); + mCompileStatus = ( mExpressionCompiled ? Compiled : PartiallyCompiled ); + } + else + { + sql.clear(); + } + } + else if ( !mSource->mSubsetString.isEmpty() ) + { + sql += QByteArray( " WHERE " ); + sql += mSource->mSubsetString.toUtf8(); + } + + if ( !sql.isEmpty() ) + { + sql += QByteArray( " ORDER BY " ); + bool firstOrderBy = true; + for ( const QgsFeatureRequest::OrderByClause &clause : constOrderBy ) + { + QgsExpression expression = clause.expression(); + QgsSQLiteExpressionCompiler compiler( + source->mFields, request.flags() & QgsFeatureRequest::IgnoreStaticNodesDuringExpressionCompilation ); + QgsSqlExpressionCompiler::Result result = compiler.compile( &expression ); + if ( result == QgsSqlExpressionCompiler::Complete && + expression.rootNode()->nodeType() == QgsExpressionNode::ntColumnRef ) + { + if ( !firstOrderBy ) + sql += QByteArray( ", " ); + sql += compiler.result().toUtf8(); + sql += QByteArray( " COLLATE NOCASE" ); + sql += clause.ascending() ? QByteArray( " ASC" ) : QByteArray( " DESC" ); + if ( clause.nullsFirst() ) + sql += QByteArray( " NULLS FIRST" ); + else + sql += QByteArray( " NULLS LAST" ); + } + else + { + sql.clear(); + break; + } + firstOrderBy = false; + } + } + + if ( !sql.isEmpty() ) + { + mOrderByCompiled = true; + + if ( mOrderByCompiled && request.limit() >= 0 ) + { + sql += QByteArray( " LIMIT " ); + sql += QString::number( request.limit() ).toUtf8(); + } + QgsDebugMsgLevel( QStringLiteral( "Using optimized orderBy as: %1" ).arg( QString::fromUtf8( sql ) ), 4 ); + filterExpressionAlreadyTakenIntoAccount = true; + if ( mOgrLayerOri && mOgrLayer != mOgrLayerOri ) + { + GDALDatasetReleaseResultSet( mConn->ds, mOgrLayer ); + mOgrLayer = mOgrLayerOri; + } + mOgrLayerOri = mOgrLayer; + mOgrLayer = QgsOgrProviderUtils::setSubsetString( mOgrLayer, mConn->ds, mSource->mEncoding, sql ); + if ( !mOgrLayer ) + { + close(); + return; + } + } + } +#endif + QgsAttributeList attrs = ( mRequest.flags() & QgsFeatureRequest::SubsetOfAttributes ) ? mRequest.subsetOfAttributes() : mSource->mFields.allAttributesList(); // ensure that all attributes required for expression filter are being fetched @@ -218,7 +347,7 @@ QgsOgrFeatureIterator::QgsOgrFeatureIterator( QgsOgrFeatureSource *source, bool break; } - if ( request.filterType() == QgsFeatureRequest::FilterExpression ) + if ( request.filterType() == QgsFeatureRequest::FilterExpression && !filterExpressionAlreadyTakenIntoAccount ) { QgsSqlExpressionCompiler *compiler = nullptr; if ( source->mDriverName == QLatin1String( "SQLite" ) || source->mDriverName == QLatin1String( "GPKG" ) ) @@ -314,6 +443,13 @@ QgsOgrFeatureIterator::~QgsOgrFeatureIterator() close(); } +bool QgsOgrFeatureIterator::prepareOrderBy( const QList &orderBys ) +{ + Q_UNUSED( orderBys ) + // Preparation has already been done in the constructor, so we just communicate the result + return mOrderByCompiled; +} + bool QgsOgrFeatureIterator::nextFeatureFilterExpression( QgsFeature &f ) { if ( !mExpressionCompiled ) diff --git a/src/core/providers/ogr/qgsogrfeatureiterator.h b/src/core/providers/ogr/qgsogrfeatureiterator.h index 4de6848d1dc9..50355886ab3b 100644 --- a/src/core/providers/ogr/qgsogrfeatureiterator.h +++ b/src/core/providers/ogr/qgsogrfeatureiterator.h @@ -86,6 +86,8 @@ class QgsOgrFeatureIterator final: public QgsAbstractFeatureIteratorFromSource &orderBys ) override; + private: bool readFeature( const gdal::ogr_feature_unique_ptr &fet, QgsFeature &feature ) const; @@ -101,6 +103,7 @@ class QgsOgrFeatureIterator final: public QgsAbstractFeatureIteratorFromSource mFilterFids; std::set::iterator mFilterFidsIt; diff --git a/tests/src/python/test_provider_ogr_gpkg.py b/tests/src/python/test_provider_ogr_gpkg.py index afe242640b20..6085c886a5bf 100644 --- a/tests/src/python/test_provider_ogr_gpkg.py +++ b/tests/src/python/test_provider_ogr_gpkg.py @@ -61,7 +61,7 @@ from qgis.utils import spatialite_connect from providertestbase import ProviderTestCase -from utilities import unitTestDataPath +from utilities import compareWkt, unitTestDataPath TEST_DATA_DIR = unitTestDataPath() @@ -229,6 +229,60 @@ def partiallyCompiledFilters(self): 'name LIKE \'Ap\\_le\'' } + def testOrderByCompiled(self): + self.runOrderByTests() + + # Below checks test particular aspects of the ORDER BY optimization for + # Geopackage + + # Test orderBy + filter expression + request = QgsFeatureRequest().setFilterExpression('"cnt">=200').addOrderBy('num_char') + values = [f['pk'] for f in self.source.getFeatures(request)] + self.assertEqual(values, [2, 3, 4]) + values = [f.geometry().asWkt() for f in self.source.getFeatures(request)] + # Check that we get geometries + assert compareWkt(values[0], "Point (-68.2 70.8)"), values[0] + + # Test orderBy + subset string + request = QgsFeatureRequest().addOrderBy('num_char') + self.source.setSubsetString("cnt >= 200") + values = [f['pk'] for f in self.source.getFeatures(request)] + self.source.setSubsetString(None) + self.assertEqual(values, [2, 3, 4]) + + # Test orderBy + subset string + filter expression + request = QgsFeatureRequest().setFilterExpression('"cnt"<=300').addOrderBy('num_char') + self.source.setSubsetString("cnt >= 200") + values = [f['pk'] for f in self.source.getFeatures(request)] + self.source.setSubsetString(None) + self.assertEqual(values, [2, 3]) + + # Test orderBy + extent + # Note that currently the spatial filter will not use the spatial index + # (probably too tricky to implement on the OGR side since it would need + # to analyze the SQL SELECT, but QGIS could probably add the JOIN with + # the RTree) + extent = QgsRectangle(-70, 67, -60, 80) + request = QgsFeatureRequest().setFilterRect(extent).addOrderBy('num_char') + values = [f['pk'] for f in self.source.getFeatures(request)] + self.assertEqual(values, [2, 4]) + + # Test orderBy + subset string which is a SELECT + # (excluded by the optimization) + # For some weird reason, we need to re-open a new connection to the + # dataset (this weird behavior predates the optimization) + request = QgsFeatureRequest().addOrderBy('num_char') + tmpdir = tempfile.mkdtemp() + self.dirs_to_cleanup.append(tmpdir) + srcpath = os.path.join(TEST_DATA_DIR, 'provider') + shutil.copy(os.path.join(srcpath, 'geopackage.gpkg'), tmpdir) + datasource = os.path.join(tmpdir, 'geopackage.gpkg') + vl = QgsVectorLayer(datasource, 'test', 'ogr') + vl.setSubsetString("SELECT * FROM \"geopackage\" WHERE cnt >= 200") + values = [f['pk'] for f in vl.getFeatures(request)] + self.assertEqual(values, [2, 3, 4]) + del vl + class ErrorReceiver():