Skip to content

Commit

Permalink
[OGR provider] Performance improvements on huge GeoPackage database (f…
Browse files Browse the repository at this point in the history
…ixes #18402)

- Introduce an approximate feature count for GeoPackage in case there are
  at least 100,000 rows in a table.
- Add a super fast implementation of GetExtent() for GeoPackage when there
  is an RTree
- Do not require feature count when enumerating layers from the browser
  • Loading branch information
rouault committed Jun 4, 2018
1 parent e5e966b commit 4f53135
Show file tree
Hide file tree
Showing 4 changed files with 237 additions and 7 deletions.
3 changes: 2 additions & 1 deletion src/providers/ogr/qgsogrdataitems.cpp
Expand Up @@ -172,7 +172,8 @@ QList<QgsOgrDbLayerInfo *> QgsOgrLayerItem::subLayers( const QString &path, cons
{
// Collect mixed-geom layers
QMultiMap<int, QStringList> subLayersMap;
const QStringList subLayersList( layer.dataProvider()->subLayers( ) );
QgsOgrProvider *ogrProvider = qobject_cast<QgsOgrProvider *>( layer.dataProvider() );
const QStringList subLayersList( ogrProvider->subLayersWithoutFeatureCount( ) );
QMap< QString, int > mapLayerNameToCount;
bool uniqueNames = true;
int prevIdx = -1;
Expand Down
199 changes: 194 additions & 5 deletions src/providers/ogr/qgsogrprovider.cpp
Expand Up @@ -705,7 +705,7 @@ static OGRwkbGeometryType ogrWkbGeometryTypeFromName( const QString &typeName )
return wkbUnknown;
}

void QgsOgrProvider::addSubLayerDetailsToSubLayerList( int i, QgsOgrLayer *layer ) const
void QgsOgrProvider::addSubLayerDetailsToSubLayerList( int i, QgsOgrLayer *layer, bool withFeatureCount ) const
{
QgsOgrFeatureDefn &fdef = layer->GetLayerDefn();
// Get first column name,
Expand All @@ -731,7 +731,7 @@ void QgsOgrProvider::addSubLayerDetailsToSubLayerList( int i, QgsOgrLayer *layer

if ( wkbFlatten( layerGeomType ) != wkbUnknown )
{
int layerFeatureCount = layer->GetFeatureCount();
int layerFeatureCount = withFeatureCount ? layer->GetApproxFeatureCount() : -1;

QString geom = ogrWkbGeometryTypeName( layerGeomType );

Expand Down Expand Up @@ -822,6 +822,16 @@ void QgsOgrProvider::addSubLayerDetailsToSubLayerList( int i, QgsOgrLayer *layer
}

QStringList QgsOgrProvider::subLayers() const
{
return _subLayers( true );
}

QStringList QgsOgrProvider::subLayersWithoutFeatureCount() const
{
return _subLayers( false );
}

QStringList QgsOgrProvider::_subLayers( bool withFeatureCount ) const
{
if ( !mValid )
{
Expand All @@ -833,7 +843,7 @@ QStringList QgsOgrProvider::subLayers() const

if ( mOgrLayer && ( mIsSubLayer || layerCount() == 1 ) )
{
addSubLayerDetailsToSubLayerList( mLayerIndex, mOgrLayer );
addSubLayerDetailsToSubLayerList( mLayerIndex, mOgrLayer, withFeatureCount );
}
else
{
Expand All @@ -856,7 +866,7 @@ QStringList QgsOgrProvider::subLayers() const
if ( !layer )
continue;

addSubLayerDetailsToSubLayerList( i, layer.get() );
addSubLayerDetailsToSubLayerList( i, layer.get(), withFeatureCount );
if ( firstLayer == nullptr )
{
firstLayer = std::move( layer );
Expand Down Expand Up @@ -3838,7 +3848,7 @@ void QgsOgrProvider::recalculateFeatureCount()
// so we remove it if there's any and then put it back
if ( mOgrGeometryTypeFilter == wkbUnknown )
{
mFeaturesCounted = mOgrLayer->GetFeatureCount( true );
mFeaturesCounted = mOgrLayer->GetApproxFeatureCount();
if ( mFeaturesCounted == -1 )
{
mFeaturesCounted = QgsVectorDataProvider::UnknownCount;
Expand Down Expand Up @@ -5131,9 +5141,188 @@ GIntBig QgsOgrLayer::GetFeatureCount( bool force )
return OGR_L_GetFeatureCount( hLayer, force );
}

GIntBig QgsOgrLayer::GetApproxFeatureCount()
{
QMutexLocker locker( &ds->mutex );

// OGR_L_GetFeatureCount() can be super slow on huge geopackage files
// so implement some approximation strategy that has reasonable runtime.
QString driverName = GDALGetDriverShortName( GDALGetDatasetDriver( ds->hDS ) );
if ( driverName == QLatin1String( "GPKG" ) )
{
CPLPushErrorHandler( CPLQuietErrorHandler );
OGRLayerH hSqlLayer = GDALDatasetExecuteSQL(
ds->hDS, "SELECT 1 FROM gpkg_ogr_contents LIMIT 0", nullptr, nullptr );
CPLPopErrorHandler();
if ( hSqlLayer )
{
GDALDatasetReleaseResultSet( ds->hDS, hSqlLayer );
return OGR_L_GetFeatureCount( hLayer, TRUE );
}

// Enumerate features up to a limit of 100000.
const GIntBig nLimit = CPLAtoGIntBig(
CPLGetConfigOption( "QGIS_GPKG_FC_THRESHOLD", "100000" ) );
QByteArray layerName = OGR_L_GetName( hLayer );
QByteArray sql( "SELECT COUNT(*) FROM (SELECT 1 FROM " );
sql += QgsOgrProviderUtils::quotedIdentifier( layerName, driverName );
sql += " LIMIT ";
sql += CPLSPrintf( CPL_FRMT_GIB, nLimit );
sql += ")";
hSqlLayer = GDALDatasetExecuteSQL( ds->hDS, sql, nullptr, nullptr );
GIntBig res = -1;
if ( hSqlLayer )
{
gdal::ogr_feature_unique_ptr fet( OGR_L_GetNextFeature( hSqlLayer ) );
if ( fet )
{
res = OGR_F_GetFieldAsInteger64( fet.get(), 0 );
}
GDALDatasetReleaseResultSet( ds->hDS, hSqlLayer );
}
if ( res >= 0 && res < nLimit )
{
// Less than 100000 features ? This is the final count
return res;
}
if ( res == nLimit )
{
// If we reach the threshold, then use the min and max values of the rowid
// hoping there are not a lot of holes.
// Do it in 2 separate SQL queries otherwise SQLite apparently does a
// full table scan...
sql = "SELECT MAX(ROWID) FROM ";
sql += QgsOgrProviderUtils::quotedIdentifier( layerName, driverName );
hSqlLayer = GDALDatasetExecuteSQL( ds->hDS, sql, nullptr, nullptr );
GIntBig maxrowid = -1;
if ( hSqlLayer )
{
gdal::ogr_feature_unique_ptr fet( OGR_L_GetNextFeature( hSqlLayer ) );
if ( fet )
{
maxrowid = OGR_F_GetFieldAsInteger64( fet.get(), 0 );
}
GDALDatasetReleaseResultSet( ds->hDS, hSqlLayer );
}

sql = "SELECT MIN(ROWID) FROM ";
sql += QgsOgrProviderUtils::quotedIdentifier( layerName, driverName );
hSqlLayer = GDALDatasetExecuteSQL( ds->hDS, sql, nullptr, nullptr );
GIntBig minrowid = 0;
if ( hSqlLayer )
{
gdal::ogr_feature_unique_ptr fet( OGR_L_GetNextFeature( hSqlLayer ) );
if ( fet )
{
minrowid = OGR_F_GetFieldAsInteger64( fet.get(), 0 );
}
GDALDatasetReleaseResultSet( ds->hDS, hSqlLayer );
}

if ( maxrowid >= minrowid )
{
return maxrowid - minrowid + 1;
}
}
}

return OGR_L_GetFeatureCount( hLayer, TRUE );
}

static bool findMinOrMax( GDALDatasetH hDS, const QByteArray &rtreeName,
const char *varName, bool isMin, double &val )
{
// We proceed by dichotomic search since unfortunately SELECT MIN(minx)
// in a RTree is a slow operation
double minval = -1e10;
double maxval = 1e10;
val = 0.0;
double oldval = 0.0;
for ( int i = 0; i < 100 && maxval - minval > 1e-15; i++ )
{
val = ( minval + maxval ) / 2;
if ( i > 0 && val == oldval )
{
break;
}
oldval = val;
QByteArray sql = "SELECT 1 FROM ";
sql += rtreeName;
sql += " WHERE ";
sql += varName;
sql += isMin ? " < " : " > ";
sql += CPLSPrintf( "%.18g", val );
sql += " LIMIT 1";
auto hSqlLayer = GDALDatasetExecuteSQL(
hDS, sql, nullptr, nullptr );
GIntBig count = -1;
if ( hSqlLayer )
{
count = OGR_L_GetFeatureCount( hSqlLayer, true );
GDALDatasetReleaseResultSet( hDS, hSqlLayer );
}
if ( count < 0 )
{
return false;
}
if ( ( isMin && count == 0 ) || ( !isMin && count == 1 ) )
{
minval = val;
}
else
{
maxval = val;
}
}
return true;
}

OGRErr QgsOgrLayer::GetExtent( OGREnvelope *psExtent, bool bForce )
{
QMutexLocker locker( &ds->mutex );

// OGR_L_GetExtent() can be super slow on huge geopackage files
// so implement some approximation strategy that has reasonable runtime.
// Actually this should return a rather accurante answer.
QString driverName = GDALGetDriverShortName( GDALGetDatasetDriver( ds->hDS ) );
if ( driverName == QLatin1String( "GPKG" ) )
{
QByteArray layerName = OGR_L_GetName( hLayer );
QByteArray rtreeName =
QgsOgrProviderUtils::quotedIdentifier( "rtree_" + layerName + "_" + OGR_L_GetGeometryColumn( hLayer ), driverName );

// Check if there is a non-empty RTree
QByteArray sql( "SELECT 1 FROM " );
sql += rtreeName;
sql += " LIMIT 1";
CPLPushErrorHandler( CPLQuietErrorHandler );
OGRLayerH hSqlLayer = GDALDatasetExecuteSQL(
ds->hDS, sql, nullptr, nullptr );
CPLPopErrorHandler();
if ( !hSqlLayer )
{
return OGR_L_GetExtent( hLayer, psExtent, bForce );
}
bool hasFeatures = OGR_L_GetFeatureCount( hSqlLayer, true ) > 0;
GDALDatasetReleaseResultSet( ds->hDS, hSqlLayer );
if ( !hasFeatures )
{
return OGRERR_FAILURE;
}

double minx, miny, maxx, maxy;
if ( findMinOrMax( ds->hDS, rtreeName, "MINX", true, minx ) &&
findMinOrMax( ds->hDS, rtreeName, "MINY", true, miny ) &&
findMinOrMax( ds->hDS, rtreeName, "MAXX", false, maxx ) &&
findMinOrMax( ds->hDS, rtreeName, "MAXY", false, maxy ) )
{
psExtent->MinX = minx;
psExtent->MinY = miny;
psExtent->MaxX = maxx;
psExtent->MaxY = maxy;
return OGRERR_NONE;
}
}
return OGR_L_GetExtent( hLayer, psExtent, bForce );
}

Expand Down
8 changes: 7 additions & 1 deletion src/providers/ogr/qgsogrprovider.h
Expand Up @@ -100,6 +100,7 @@ class QgsOgrProvider : public QgsVectorDataProvider

QgsCoordinateReferenceSystem crs() const override;
QStringList subLayers() const override;
QStringList subLayersWithoutFeatureCount() const;
QString storageType() const override;
QgsFeatureIterator getFeatures( const QgsFeatureRequest &request ) const override;
QString subsetString() const override;
Expand Down Expand Up @@ -215,7 +216,9 @@ class QgsOgrProvider : public QgsVectorDataProvider
//! Does the real job of settings the subset string and adds an argument to disable update capabilities
bool _setSubsetString( const QString &theSQL, bool updateFeatureCount = true, bool updateCapabilities = true, bool hasExistingRef = true );

void addSubLayerDetailsToSubLayerList( int i, QgsOgrLayer *layer ) const;
void addSubLayerDetailsToSubLayerList( int i, QgsOgrLayer *layer, bool withFeatureCount ) const;

QStringList _subLayers( bool withFeatureCount ) const;

QgsFields mAttributeFields;

Expand Down Expand Up @@ -565,6 +568,9 @@ class QgsOgrLayer
//! Wrapper of OGR_L_GetLayerCount
GIntBig GetFeatureCount( bool force = false );

//! Return an approximate feature count
GIntBig GetApproxFeatureCount();

//! Wrapper of OGR_L_GetLayerCount
OGRErr GetExtent( OGREnvelope *psExtent, bool bForce );

Expand Down
34 changes: 34 additions & 0 deletions tests/src/python/test_provider_ogr_gpkg.py
Expand Up @@ -949,6 +949,40 @@ def testAddingTwoIntFieldsWithWidth(self):
self.assertTrue(vl.addAttribute(QgsField("c", QVariant.Int, "integer", 10)))
self.assertTrue(vl.commitChanges())

def testApproxFeatureCountAndExtent(self):
""" Test perf improvement for for https://issues.qgis.org/issues/18402 """

tmpfile = os.path.join(self.basetestpath, 'testApproxFeatureCountAndExtent.gpkg')
ds = ogr.GetDriverByName('GPKG').CreateDataSource(tmpfile)
lyr = ds.CreateLayer('test', geom_type=ogr.wkbPoint)
f = ogr.Feature(lyr.GetLayerDefn())
f.SetGeometry(ogr.CreateGeometryFromWkt('POINT(0 1)'))
lyr.CreateFeature(f)
f = ogr.Feature(lyr.GetLayerDefn())
f.SetGeometry(ogr.CreateGeometryFromWkt('POINT(2 3)'))
lyr.CreateFeature(f)
fid = f.GetFID()
f = ogr.Feature(lyr.GetLayerDefn())
f.SetGeometry(ogr.CreateGeometryFromWkt('POINT(4 5)'))
lyr.CreateFeature(f)
lyr.DeleteFeature(fid)
ds = None
ds = ogr.Open(tmpfile, update=1)
ds.ExecuteSQL('DROP TABLE gpkg_ogr_contents')
ds = None

os.environ['QGIS_GPKG_FC_THRESHOLD'] = '1'
vl = QgsVectorLayer(u'{}'.format(tmpfile) + "|layername=" + "test", 'test', u'ogr')
self.assertTrue(vl.isValid())
fc = vl.featureCount()
del os.environ['QGIS_GPKG_FC_THRESHOLD']
self.assertEqual(fc, 3) # didn't notice the hole

reference = QgsGeometry.fromRect(QgsRectangle(0, 1, 4, 5))
provider_extent = QgsGeometry.fromRect(vl.extent())
self.assertTrue(QgsGeometry.compare(provider_extent.asPolygon()[0], reference.asPolygon()[0], 0.00001),
provider_extent.asPolygon()[0])


if __name__ == '__main__':
unittest.main()

0 comments on commit 4f53135

Please sign in to comment.