diff --git a/python/core/auto_generated/qgsgml.sip.in b/python/core/auto_generated/qgsgml.sip.in index b610e6158e54..75219c33d90e 100644 --- a/python/core/auto_generated/qgsgml.sip.in +++ b/python/core/auto_generated/qgsgml.sip.in @@ -39,7 +39,6 @@ request is finished const QString &authcfg = QString() ) /PyName=getFeaturesUri/; %Docstring Does the Http GET request to the wfs server -Supports only UTF-8, UTF-16, ISO-8859-1, ISO-8859-1 XML encodings. :param uri: GML URL :param wkbType: wkbType to retrieve @@ -58,7 +57,6 @@ Supports only UTF-8, UTF-16, ISO-8859-1, ISO-8859-1 XML encodings. int getFeatures( const QByteArray &data, QgsWkbTypes::Type *wkbType, QgsRectangle *extent = 0 ); %Docstring Read from GML data. Constructor uri param is ignored -Supports only UTF-8, UTF-16, ISO-8859-1, ISO-8859-1 XML encodings. %End QMap featuresMap() const; diff --git a/src/core/qgsgml.cpp b/src/core/qgsgml.cpp index 5c3fdacbd064..8f05a3191f7e 100644 --- a/src/core/qgsgml.cpp +++ b/src/core/qgsgml.cpp @@ -31,6 +31,8 @@ #include #include #include +#include +#include #include "ogr_api.h" @@ -321,10 +323,7 @@ QgsGmlStreamingParser::QgsGmlStreamingParser( const QString &typeName, mTypeNameUTF8Len = strlen( mTypeNamePtr ); } - mParser = XML_ParserCreateNS( nullptr, NS_SEPARATOR ); - XML_SetUserData( mParser, this ); - XML_SetElementHandler( mParser, QgsGmlStreamingParser::start, QgsGmlStreamingParser::end ); - XML_SetCharacterDataHandler( mParser, QgsGmlStreamingParser::chars ); + createParser(); } static QString stripNS( const QString &string ) @@ -412,10 +411,7 @@ QgsGmlStreamingParser::QgsGmlStreamingParser( const QList &laye mEndian = QgsApplication::endian(); - mParser = XML_ParserCreateNS( nullptr, NS_SEPARATOR ); - XML_SetUserData( mParser, this ); - XML_SetElementHandler( mParser, QgsGmlStreamingParser::start, QgsGmlStreamingParser::end ); - XML_SetCharacterDataHandler( mParser, QgsGmlStreamingParser::chars ); + createParser(); } @@ -444,11 +440,40 @@ bool QgsGmlStreamingParser::processData( const QByteArray &data, bool atEnd ) return true; } -bool QgsGmlStreamingParser::processData( const QByteArray &data, bool atEnd, QString &errorMsg ) +bool QgsGmlStreamingParser::processData( const QByteArray &pdata, bool atEnd, QString &errorMsg ) { - if ( XML_Parse( mParser, data.data(), data.size(), atEnd ) == 0 ) + QByteArray data = pdata; + + if ( mCodec ) + { + // convert data to UTF-8 + QString strData = mCodec->toUnicode( pdata ); + data = strData.toUtf8(); + } + + if ( XML_Parse( mParser, data, data.size(), atEnd ) == XML_STATUS_ERROR ) { const XML_Error errorCode = XML_GetErrorCode( mParser ); + if ( !mCodec && errorCode == XML_ERROR_UNKNOWN_ENCODING ) + { + // Specified encoding is unknown, Expat only accepts UTF-8, UTF-16, ISO-8859-1 + // Try to get encoding string and convert data to utf-8 + QRegularExpression reEncoding( QStringLiteral( "" ), + QRegularExpression::CaseInsensitiveOption ); + QRegularExpressionMatch match = reEncoding.match( pdata ); + const QString encoding = match.hasMatch() ? match.captured( 1 ) : QString(); + mCodec = !encoding.isEmpty() ? QTextCodec::codecForName( encoding.toLatin1() ) : nullptr; + if ( mCodec ) + { + // recreate parser with UTF-8 encoding + XML_ParserFree( mParser ); + mParser = nullptr; + createParser( QByteArrayLiteral( "UTF-8" ) ); + + return processData( data, atEnd, errorMsg ); + } + } + errorMsg = QObject::tr( "Error: %1 on line %2, column %3" ) .arg( XML_ErrorString( errorCode ) ) .arg( XML_GetCurrentLineNumber( mParser ) ) @@ -1561,3 +1586,13 @@ int QgsGmlStreamingParser::totalWKBFragmentSize() const } return result; } + +void QgsGmlStreamingParser::createParser( const QByteArray &encoding ) +{ + Q_ASSERT( !mParser ); + + mParser = XML_ParserCreateNS( encoding.isEmpty() ? nullptr : encoding.data(), NS_SEPARATOR ); + XML_SetUserData( mParser, this ); + XML_SetElementHandler( mParser, QgsGmlStreamingParser::start, QgsGmlStreamingParser::end ); + XML_SetCharacterDataHandler( mParser, QgsGmlStreamingParser::chars ); +} diff --git a/src/core/qgsgml.h b/src/core/qgsgml.h index 990f43546df6..0072e3242e79 100644 --- a/src/core/qgsgml.h +++ b/src/core/qgsgml.h @@ -33,6 +33,7 @@ #include class QgsCoordinateReferenceSystem; +class QTextCodec; #ifndef SIP_RUN @@ -253,8 +254,11 @@ class CORE_EXPORT QgsGmlStreamingParser //! Safely (if empty) pop from mode stack ParseMode modeStackPop() { return mParseModeStack.isEmpty() ? None : mParseModeStack.pop(); } + //! create parser with specified encoding if any + void createParser( const QByteArray &encoding = QByteArray() ); + //! Expat parser - XML_Parser mParser; + XML_Parser mParser = nullptr; //! List of (feature, gml_id) pairs QVector mFeatureList; @@ -344,6 +348,8 @@ class CORE_EXPORT QgsGmlStreamingParser std::string mGeometryString; //! Whether we found a unhandled geometry element bool mFoundUnhandledGeometryElement; + //! text codec used to read data with an expat unsupported encoding + QTextCodec *mCodec = nullptr; }; #endif @@ -368,7 +374,6 @@ class CORE_EXPORT QgsGml : public QObject /** * Does the Http GET request to the wfs server - * Supports only UTF-8, UTF-16, ISO-8859-1, ISO-8859-1 XML encodings. * \param uri GML URL * \param wkbType wkbType to retrieve * \param extent retrieved extents @@ -387,7 +392,6 @@ class CORE_EXPORT QgsGml : public QObject /** * Read from GML data. Constructor uri param is ignored - * Supports only UTF-8, UTF-16, ISO-8859-1, ISO-8859-1 XML encodings. */ int getFeatures( const QByteArray &data, QgsWkbTypes::Type *wkbType, QgsRectangle *extent = nullptr ); diff --git a/tests/src/core/testqgsgml.cpp b/tests/src/core/testqgsgml.cpp index 3f645df9104a..7f4ed989fccc 100644 --- a/tests/src/core/testqgsgml.cpp +++ b/tests/src/core/testqgsgml.cpp @@ -17,6 +17,7 @@ #include "qgstest.h" #include #include +#include //qgis includes... #include @@ -82,6 +83,9 @@ class TestQgsGML : public QObject void testThroughOGRGeometry_urn_EPSG_4326(); void testAccents(); void testSameTypeameAsGeomName(); + void testUnknownEncoding_data(); + void testUnknownEncoding(); + void testUnhandledEncoding(); }; const QString data1( "( "xmlHeader" ); + QTest::addColumn( "encoding" ); + + QTest::newRow( "simple quote" ) << QStringLiteral( "" ) << QByteArrayLiteral( "ISO-8859-15" ); + QTest::newRow( "double quote" ) << QStringLiteral( "" ) << QByteArrayLiteral( "ISO-8859-15" ); + QTest::newRow( "UTF-8" ) << QStringLiteral( "" ) << QByteArrayLiteral( "UTF-8" ); + QTest::newRow( "No header" ) << QString() << QByteArrayLiteral( "UTF-8" ); +} + +void TestQgsGML::testUnknownEncoding() +{ + QFETCH( QString, xmlHeader ); + QFETCH( QByteArray, encoding ); + + QgsWkbTypes::Type wkbType; + + QTextCodec *codec = QTextCodec::codecForName( encoding ); + + QByteArray data = codec->fromUnicode( + QStringLiteral( + "%1" + "unknown" + "" + "" + "price: 10€" + "" + "" + "10,20" + "" + "" + "" + "" + "" ).arg( xmlHeader ) ); + + QgsFields fields; + fields.append( QgsField( QStringLiteral( "strfield" ), QVariant::String, QStringLiteral( "string" ) ) ); + + { + QgsGml gmlParser( QStringLiteral( "mytypename" ), QStringLiteral( "mygeom" ), fields ); + QCOMPARE( gmlParser.getFeatures( data, &wkbType ), 0 ); + QMap featureMaps = gmlParser.featuresMap(); + QCOMPARE( featureMaps.size(), 1 ); + QVERIFY( featureMaps.constFind( 0 ) != featureMaps.constEnd() ); + QCOMPARE( featureMaps[ 0 ]->attributes().size(), 1 ); + QCOMPARE( featureMaps[0]->attribute( QStringLiteral( "strfield" ) ).toString(), QString( "price: 10€" ) ); + delete featureMaps[ 0 ]; + } + + { + QgsGmlStreamingParser gmlParser( QStringLiteral( "mytypename" ), QStringLiteral( "mygeom" ), fields ); + QCOMPARE( gmlParser.processData( data.mid( 0, data.size() / 2 ), false ), true ); + QCOMPARE( gmlParser.getAndStealReadyFeatures().size(), 0 ); + QCOMPARE( gmlParser.processData( data.mid( data.size() / 2 ), true ), true ); + QCOMPARE( gmlParser.isException(), false ); + QVector features = gmlParser.getAndStealReadyFeatures(); + QCOMPARE( features.size(), 1 ); + QCOMPARE( features[ 0 ].first->attributes().size(), 1 ); + QCOMPARE( features[ 0 ].first->attribute( QStringLiteral( "strfield" ) ).toString(), QString( "price: 10€" ) ); + delete features[0].first; + } +} + +void TestQgsGML::testUnhandledEncoding() +{ + QgsWkbTypes::Type wkbType; + + QString data = QStringLiteral( + "" + "" + "unknown" + "" + "" + "price: 10€" + "" + "" + "10,20" + "" + "" + "" + "" + "" ); + + QgsFields fields; + fields.append( QgsField( QStringLiteral( "strfield" ), QVariant::String, QStringLiteral( "string" ) ) ); + + QgsGml gmlParser( QStringLiteral( "mytypename" ), QStringLiteral( "mygeom" ), fields ); + QCOMPARE( gmlParser.getFeatures( data.toUtf8(), &wkbType ), 0 ); + QMap featureMaps = gmlParser.featuresMap(); + QCOMPARE( featureMaps.size(), 0 ); +} + QGSTEST_MAIN( TestQgsGML ) #include "testqgsgml.moc"