Skip to content

Commit

Permalink
Work around an issue in Qt's detection of encoding for HTML QByteArrays
Browse files Browse the repository at this point in the history
  • Loading branch information
nyalldawson committed Oct 3, 2014
1 parent 2cd1770 commit ade5b65
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 2 deletions.
43 changes: 41 additions & 2 deletions src/core/qgsnetworkcontentfetcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,47 @@ QString QgsNetworkContentFetcher::contentAsString() const
QByteArray array = mReply->readAll();

//correctly encode reply as unicode
QString content = QTextCodec::codecForHtml( array )->toUnicode( array );
return content;
QTextCodec* codec = codecForHtml( array );
return codec->toUnicode( array );
}

QTextCodec* QgsNetworkContentFetcher::codecForHtml( QByteArray& array ) const
{
  //QTextCodec::codecForHtml fails to detect "<meta charset="utf-8"/>" type tags
  //see https://bugreports.qt-project.org/browse/QTBUG-41011
  //so test for that ourselves
  //
  //Detection order:
  // 1. UTF detection via QTextCodec::codecForUtfText
  // 2. a "meta charset=" tag within the first 1024 bytes of the content
  // 3. QTextCodec::codecForHtml
  // 4. UTF-8 as the final default

  //basic check (a null default codec is passed so that a failed detection returns null)
  QTextCodec* codec = QTextCodec::codecForUtfText( array, 0 );
  if ( codec )
  {
    return codec;
  }

  //check for meta charset tag. Only the start of the document is inspected, and it is
  //lower-cased so that the tag match is case insensitive.
  const QByteArray header = array.left( 1024 ).toLower();
  static const char metaTag[] = "meta charset=";
  int pos = header.indexOf( metaTag );
  if ( pos != -1 )
  {
    pos += int( sizeof( metaTag ) - 1 );

    //the charset value may be double quoted, single quoted or unquoted
    int end = -1;
    if ( pos < header.size() && ( header.at( pos ) == '\"' || header.at( pos ) == '\'' ) )
    {
      const char quote = header.at( pos );
      ++pos;
      //stays at -1 if the closing quote is missing (eg tag truncated by the 1024 byte limit)
      end = header.indexOf( quote, pos );
    }
    else
    {
      //unquoted value: runs until whitespace or the end of the tag
      end = pos;
      while ( end < header.size() )
      {
        const char c = header.at( end );
        if ( c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '>' || c == '/' )
        {
          break;
        }
        ++end;
      }
    }

    //only look up a codec when a non-empty, properly terminated value was found
    if ( end > pos )
    {
      codec = QTextCodec::codecForName( header.mid( pos, end - pos ) );
      if ( codec )
      {
        return codec;
      }
    }
  }

  //fallback to QTextCodec::codecForHtml
  //codec is still null at this point, so it is passed as the default codec and a failed
  //detection can be recognised by a null return value
  codec = QTextCodec::codecForHtml( array, codec );
  if ( codec )
  {
    return codec;
  }

  //no luck, default to utf-8
  return QTextCodec::codecForName( "UTF-8" );
}

void QgsNetworkContentFetcher::contentLoaded( bool ok )
Expand Down
6 changes: 6 additions & 0 deletions src/core/qgsnetworkcontentfetcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,12 @@ class CORE_EXPORT QgsNetworkContentFetcher : public QObject

bool mContentLoaded;

/**Tries to create a text codec for decoding html content. Works around bugs in Qt's built in method,
 * which fails to detect tags of the form <meta charset="utf-8"/> (see QTBUG-41011).
 * Detection is attempted first via QTextCodec::codecForUtfText, then by looking for a
 * "meta charset=" tag near the start of the content, then via QTextCodec::codecForHtml.
 * If none of these detect an encoding, the UTF-8 codec is used.
 * @param array input html byte array
 * @returns QTextCodec for html content. Defaults to the UTF-8 codec if no encoding was detected.
 */
QTextCodec *codecForHtml( QByteArray &array ) const;

private slots:

/**Called when fetchUrlContent has finished loading a url. If
Expand Down
19 changes: 19 additions & 0 deletions tests/src/core/testqgsnetworkcontentfetcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ class TestQgsNetworkContentFetcher: public QObject
void fetchEmptyUrl(); //test fetching blank url
void fetchBadUrl(); //test fetching bad url
void fetchUrlContent(); //test fetching url content
void fetchEncodedContent(); //test fetching url content encoded as utf-8

void contentLoaded();

Expand Down Expand Up @@ -107,6 +108,24 @@ void TestQgsNetworkContentFetcher::fetchUrlContent()
QVERIFY( mFetchedHtml.contains( QString( "QGIS" ) ) );
}

void TestQgsNetworkContentFetcher::fetchEncodedContent()
{
QgsNetworkContentFetcher fetcher;
//test fetching content from the QGIS homepage
mLoaded = false;
fetcher.fetchContent( QUrl::fromLocalFile( QString( TEST_DATA_DIR ) + QDir::separator() + "encoded_html.html" ) );
connect( &fetcher, SIGNAL( finished() ), this, SLOT( contentLoaded() ) );
while ( !mLoaded )
{
qApp->processEvents();
}
QVERIFY( fetcher.reply()->error() == QNetworkReply::NoError );

//test retrieved content and check for correct detection of encoding
QString mFetchedHtml = fetcher.contentAsString();
QVERIFY( mFetchedHtml.contains( QChar( 6040 ) ) );
}

void TestQgsNetworkContentFetcher::contentLoaded()
{
mLoaded = true;
Expand Down
11 changes: 11 additions & 0 deletions tests/testdata/encoded_html.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<!doctype html>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html;">
<meta charset="UTF-8">
<title>test</title>
</head>
<body>
<p>សាលា ម៉ាត</p>
</body></html>

0 comments on commit ade5b65

Please sign in to comment.