Skip to content
Permalink
Browse files

Add helper method to read encoding from shapefile .cpg or LDID information

Ported from the GDAL logic, which unfortunately is not available
for re-use directly from GDAL.
  • Loading branch information
nyalldawson committed Feb 10, 2020
1 parent 550c8bf commit bb71b713ec98bd8061692c7a05f9790720eefb0e
Showing with 210 additions and 0 deletions.
  1. +153 −0 src/core/qgsogrutils.cpp
  2. +8 −0 src/core/qgsogrutils.h
  3. +31 −0 tests/src/python/test_provider_shapefile.py
  4. +1 −0 tests/testdata/shapefile/iso-8859-1.cpg
  5. BIN tests/testdata/shapefile/iso-8859-1.dbf
  6. +1 −0 tests/testdata/shapefile/iso-8859-1.prj
  7. +1 −0 tests/testdata/shapefile/iso-8859-1.qpj
  8. BIN tests/testdata/shapefile/iso-8859-1.shp
  9. BIN tests/testdata/shapefile/iso-8859-1.shx
  10. BIN tests/testdata/shapefile/iso-8859-1_ldid.dbf
  11. +1 −0 tests/testdata/shapefile/iso-8859-1_ldid.prj
  12. +1 −0 tests/testdata/shapefile/iso-8859-1_ldid.qpj
  13. BIN tests/testdata/shapefile/iso-8859-1_ldid.shp
  14. BIN tests/testdata/shapefile/iso-8859-1_ldid.shx
  15. +1 −0 tests/testdata/shapefile/latin1.cpg
  16. BIN tests/testdata/shapefile/latin1.dbf
  17. +1 −0 tests/testdata/shapefile/latin1.prj
  18. +1 −0 tests/testdata/shapefile/latin1.qpj
  19. BIN tests/testdata/shapefile/latin1.shp
  20. BIN tests/testdata/shapefile/latin1.shx
  21. BIN tests/testdata/shapefile/system_encoding.dbf
  22. +1 −0 tests/testdata/shapefile/system_encoding.prj
  23. +1 −0 tests/testdata/shapefile/system_encoding.qpj
  24. BIN tests/testdata/shapefile/system_encoding.shp
  25. BIN tests/testdata/shapefile/system_encoding.shx
  26. +1 −0 tests/testdata/shapefile/utf8.cpg
  27. BIN tests/testdata/shapefile/utf8.dbf
  28. +1 −0 tests/testdata/shapefile/utf8.prj
  29. +1 −0 tests/testdata/shapefile/utf8.qpj
  30. BIN tests/testdata/shapefile/utf8.shp
  31. BIN tests/testdata/shapefile/utf8.shx
  32. +1 −0 tests/testdata/shapefile/windows-1252.cpg
  33. BIN tests/testdata/shapefile/windows-1252.dbf
  34. +1 −0 tests/testdata/shapefile/windows-1252.prj
  35. +1 −0 tests/testdata/shapefile/windows-1252.qpj
  36. BIN tests/testdata/shapefile/windows-1252.shp
  37. BIN tests/testdata/shapefile/windows-1252.shx
  38. BIN tests/testdata/shapefile/windows-1252_ldid.dbf
  39. +1 −0 tests/testdata/shapefile/windows-1252_ldid.prj
  40. +1 −0 tests/testdata/shapefile/windows-1252_ldid.qpj
  41. BIN tests/testdata/shapefile/windows-1252_ldid.shp
  42. BIN tests/testdata/shapefile/windows-1252_ldid.shx
@@ -25,6 +25,10 @@
#include <QUuid>
#include <cpl_error.h>
#include <QJsonDocument>
#include <QFileInfo>
#include <QDir>
#include <QTextStream>
#include <QDataStream>

#include "ogr_srs_api.h"

@@ -744,3 +748,152 @@ QgsCoordinateReferenceSystem QgsOgrUtils::OGRSpatialReferenceToCrs( OGRSpatialRe

return QgsCoordinateReferenceSystem::fromWkt( wkt );
}

/**
 * Translates a DBF "language driver ID" (LDID) byte into an encoding name.
 *
 * Returns "ISO-8859-1" for LDID 87, "CP<code page>" for every other ID listed in the
 * table below, and an empty string for IDs which are not listed.
 *
 * The table is part of the shapefile encoding logic ported from GDAL, see also
 * http://www.autopark.ru/ASBProgrammerGuide/DBFSTRUC.HTM
 */
static QString shapefileEncodingFromLdid( quint8 ldid )
{
  int nCP = -1; // Windows code page, -1 = unknown LDID

  switch ( ldid )
  {
    case 1: nCP = 437; break;
    case 2: nCP = 850; break;
    case 3: nCP = 1252; break;
    case 4: nCP = 10000; break;
    case 8: nCP = 865; break;
    case 10: nCP = 850; break;
    case 11: nCP = 437; break;
    case 13: nCP = 437; break;
    case 14: nCP = 850; break;
    case 15: nCP = 437; break;
    case 16: nCP = 850; break;
    case 17: nCP = 437; break;
    case 18: nCP = 850; break;
    case 19: nCP = 932; break;
    case 20: nCP = 850; break;
    case 21: nCP = 437; break;
    case 22: nCP = 850; break;
    case 23: nCP = 865; break;
    case 24: nCP = 437; break;
    case 25: nCP = 437; break;
    case 26: nCP = 850; break;
    case 27: nCP = 437; break;
    case 28: nCP = 863; break;
    case 29: nCP = 850; break;
    case 31: nCP = 852; break;
    case 34: nCP = 852; break;
    case 35: nCP = 852; break;
    case 36: nCP = 860; break;
    case 37: nCP = 850; break;
    case 38: nCP = 866; break;
    case 55: nCP = 850; break;
    case 64: nCP = 852; break;
    case 77: nCP = 936; break;
    case 78: nCP = 949; break;
    case 79: nCP = 950; break;
    case 80: nCP = 874; break;
    case 87: return QStringLiteral( "ISO-8859-1" ); // the only LDID which is not reported as a "CP…" name
    case 88: nCP = 1252; break;
    case 89: nCP = 1252; break;
    case 100: nCP = 852; break;
    case 101: nCP = 866; break;
    case 102: nCP = 865; break;
    case 103: nCP = 861; break;
    case 104: nCP = 895; break;
    case 105: nCP = 620; break;
    case 106: nCP = 737; break;
    case 107: nCP = 857; break;
    case 108: nCP = 863; break;
    case 120: nCP = 950; break;
    case 121: nCP = 949; break;
    case 122: nCP = 936; break;
    case 123: nCP = 932; break;
    case 124: nCP = 874; break;
    case 134: nCP = 737; break;
    case 135: nCP = 852; break;
    case 136: nCP = 857; break;
    case 150: nCP = 10007; break;
    case 151: nCP = 10029; break;
    case 200: nCP = 1250; break;
    case 201: nCP = 1251; break;
    case 202: nCP = 1254; break;
    case 203: nCP = 1253; break;
    case 204: nCP = 1257; break;
    default: break;
  }

  if ( nCP == -1 )
    return QString();

  return QStringLiteral( "CP%1" ).arg( nCP );
}

/**
 * Reads the encoding of the shapefile at \a path (the location of the ".shp" file).
 *
 * The first line of the sidecar ".cpg" file is used when that file can be opened and the
 * line is not blank. Otherwise the LDID byte stored at offset 29 of the sidecar ".dbf"
 * file is translated to an encoding name.
 *
 * Returns an empty string if \a path does not exist or no encoding could be determined.
 */
QString QgsOgrUtils::readShapefileEncoding( const QString &path )
{
  // unfortunately OGR's routines for calculating the shapefile encoding aren't exposed anywhere in the GDAL c api, so
  // re-implement them here...
  if ( !QFileInfo::exists( path ) )
    return QString();

  const QFileInfo fi( path );
  const QString baseName = fi.completeBaseName();
  // sidecar extensions are upper-cased only when the shapefile suffix is exactly "SHP"
  const bool upperCaseSuffix = fi.suffix() == QLatin1String( "SHP" );

  // first try to read cpg file, if present
  const QString cpgPath = fi.dir().filePath( QStringLiteral( "%1.%2" ).arg( baseName, upperCaseSuffix ? QStringLiteral( "CPG" ) : QStringLiteral( "cpg" ) ) );
  QFile cpgFile( cpgPath );
  // opening read-only fails for a missing file, so no separate existence check is required
  if ( cpgFile.open( QIODevice::ReadOnly ) )
  {
    QTextStream cpgStream( &cpgFile );
    // only the first line is relevant. Surrounding whitespace is dropped so that it can neither
    // defeat the prefix checks below nor end up inside the returned encoding name
    const QString cpgString = cpgStream.readLine().trimmed();
    cpgFile.close();

    // a blank first line carries no information, in that case fall through to the LDID lookup
    if ( !cpgString.isEmpty() )
    {
      // from OGRShapeLayer::ConvertCodePage
      // https://github.com/OSGeo/gdal/blob/master/gdal/ogr/ogrsf_frmts/shape/ogrshapelayer.cpp#L342
      bool ok = false;
      const int cpgCodePage = cpgString.toInt( &ok );
      if ( ok && ( ( cpgCodePage >= 437 && cpgCodePage <= 950 )
                   || ( cpgCodePage >= 1250 && cpgCodePage <= 1258 ) ) )
      {
        return QStringLiteral( "CP%1" ).arg( cpgCodePage );
      }

      if ( cpgString.startsWith( QLatin1String( "8859" ) ) )
      {
        // accepts both "8859-1" and "88591" style values
        const bool hasDash = cpgString.length() > 4 && cpgString.at( 4 ) == QLatin1Char( '-' );
        return QStringLiteral( "ISO-8859-%1" ).arg( cpgString.mid( hasDash ? 5 : 4 ) );
      }

      if ( cpgString.startsWith( QLatin1String( "UTF-8" ), Qt::CaseInsensitive ) ||
           cpgString.startsWith( QLatin1String( "UTF8" ), Qt::CaseInsensitive ) )
        return QStringLiteral( "UTF-8" );

      if ( cpgString.startsWith( QLatin1String( "ANSI 1251" ), Qt::CaseInsensitive ) )
        return QStringLiteral( "CP1251" );

      // anything else is handed back as is
      return cpgString;
    }
  }

  // fallback to LDID value, read from DBF file
  const QString dbfPath = fi.dir().filePath( QStringLiteral( "%1.%2" ).arg( baseName, upperCaseSuffix ? QStringLiteral( "DBF" ) : QStringLiteral( "dbf" ) ) );
  QFile dbfFile( dbfPath );
  if ( !dbfFile.open( QIODevice::ReadOnly ) )
    return QString();

  // the LDID is the single byte which follows the first 29 bytes of the file
  if ( dbfFile.skip( 29 ) != 29 )
    return QString();

  QDataStream dbfIn( &dbfFile );
  dbfIn.setByteOrder( QDataStream::LittleEndian );
  quint8 ldid = 0;
  dbfIn >> ldid;
  if ( dbfIn.status() != QDataStream::Ok )
    return QString(); // file ended before the LDID byte

  // dbfFile is closed by its destructor
  return shapefileEncodingFromLdid( ldid );
}
@@ -285,6 +285,14 @@ class CORE_EXPORT QgsOgrUtils
* \since QGIS 3.10.1
*/
static QgsCoordinateReferenceSystem OGRSpatialReferenceToCrs( OGRSpatialReferenceH srs );

/**
 * Reads the encoding of the shapefile at the specified \a path (where \a path is the
 * location of the ".shp" file).
 *
 * The encoding is taken from the first line of the accompanying ".cpg" file when that
 * file can be read and its first line is not empty. Otherwise the language driver ID
 * (LDID) byte stored in the accompanying ".dbf" file is translated to an encoding name.
 *
 * \returns the encoding name (e.g. "UTF-8", "ISO-8859-1" or "CP1252"), or an empty string
 * if \a path does not exist or no encoding could be determined
 *
 * \since QGIS 3.12
 */
static QString readShapefileEncoding( const QString &path );
};

#endif // QGSOGRUTILS_H
@@ -625,6 +625,37 @@ def testOpenWithFilter(self):
# force close of data provider
vl.setDataSource('', 'test', 'ogr')

def testEncoding(self):
    """
    Checks the encoding reported by the data provider for the encoding test shapefiles
    """
    # (shapefile name, expected provider encoding) -- checked in this order
    expected_encodings = [
        ('iso-8859-1.shp', 'ISO-8859-1'),
        ('iso-8859-1_ldid.shp', 'ISO-8859-1'),
        ('latin1.shp', 'ISO-8859-1'),
        ('utf8.shp', 'UTF-8'),
        ('windows-1252.shp', 'windows-1252'),
        ('windows-1252_ldid.shp', 'windows-1252'),
    ]

    for shp_name, expected in expected_encodings:
        layer = QgsVectorLayer(os.path.join(TEST_DATA_DIR, 'shapefile', shp_name))
        self.assertTrue(layer.isValid())
        self.assertEqual(layer.dataProvider().encoding(), expected)

def testCreateAttributeIndex(self):
tmpdir = tempfile.mkdtemp()
self.dirs_to_cleanup.append(tmpdir)
@@ -0,0 +1 @@
88591
Binary file not shown.
@@ -0,0 +1 @@
GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137,298.257223563]],PRIMEM["Greenwich",0],UNIT["Degree",0.017453292519943295]]
@@ -0,0 +1 @@
GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4326"]]
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1 @@
GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137,298.257223563]],PRIMEM["Greenwich",0],UNIT["Degree",0.017453292519943295]]
@@ -0,0 +1 @@
GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4326"]]
Binary file not shown.
Binary file not shown.
@@ -0,0 +1 @@
latin1
Binary file not shown.
@@ -0,0 +1 @@
GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137,298.257223563]],PRIMEM["Greenwich",0],UNIT["Degree",0.017453292519943295]]
@@ -0,0 +1 @@
GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4326"]]
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1 @@
GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137,298.257223563]],PRIMEM["Greenwich",0],UNIT["Degree",0.017453292519943295]]
@@ -0,0 +1 @@
GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4326"]]
Binary file not shown.
Binary file not shown.
@@ -0,0 +1 @@
UTF-8
Binary file not shown.
@@ -0,0 +1 @@
GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137,298.257223563]],PRIMEM["Greenwich",0],UNIT["Degree",0.017453292519943295]]
@@ -0,0 +1 @@
GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4326"]]
Binary file not shown.
Binary file not shown.
@@ -0,0 +1 @@
1252
Binary file not shown.
@@ -0,0 +1 @@
GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137,298.257223563]],PRIMEM["Greenwich",0],UNIT["Degree",0.017453292519943295]]
@@ -0,0 +1 @@
GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4326"]]
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1 @@
GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137,298.257223563]],PRIMEM["Greenwich",0],UNIT["Degree",0.017453292519943295]]
@@ -0,0 +1 @@
GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4326"]]
Binary file not shown.
Binary file not shown.

0 comments on commit bb71b71

Please sign in to comment.
You can’t perform that action at this time.