Skip to content
Permalink
Browse files

QgsDelimitedTextFile: fix parsing of files with CR end of line

Fixes #36392
Fixes #21976
Fixes #17190

We are obliged to do 'at hand' parsing due to QT not handling CR-only
end of lines.
As we are at it, also limit each line to 1 MB to avoid potential denial
of service (which was what close to what happened here before the CR-only
parsing fix)

Add tests for parsing CR-only end of lines, and exercising the at-hand
buffering logic
  • Loading branch information
rouault committed May 28, 2020
1 parent a261a06 commit 644a5647dbdef395993cf406678addfd6978e781
@@ -39,6 +39,10 @@ QgsDelimitedTextFile::QgsDelimitedTextFile( const QString &url )
// The default type is CSV
setTypeCSV();
if ( ! url.isNull() ) setFromUrl( url );

// For tests
QString bufferSizeStr( getenv( "QGIS_DELIMITED_TEXT_FILE_BUFFER_SIZE" ) );
mMaxBufferSize = bufferSizeStr.isEmpty() ? 1024 * 1024 : bufferSizeStr.toInt();
}


@@ -533,7 +537,6 @@ QgsDelimitedTextFile::Status QgsDelimitedTextFile::nextRecord( QStringList &reco
return status;
}


QgsDelimitedTextFile::Status QgsDelimitedTextFile::reset()
{
// Make sure the file is valid open
@@ -544,12 +547,14 @@ QgsDelimitedTextFile::Status QgsDelimitedTextFile::reset()
mLineNumber = 0;
mRecordNumber = -1;
mRecordLineNumber = -1;
mBuffer = QString();
mPosInBuffer = 0;

// Skip header lines
for ( int i = mSkipLines; i-- > 0; )
{
if ( mStream->readLine().isNull() ) return RecordEOF;
mLineNumber++;
QString ignoredContent;
if ( nextLine( ignoredContent ) == RecordEOF ) return RecordEOF;
}
// Read the column names
Status result = RecordOk;
@@ -570,11 +575,79 @@ QgsDelimitedTextFile::Status QgsDelimitedTextFile::nextLine( QString &buffer, bo
Status status = reset();
if ( status != RecordOk ) return status;
}
if ( mLineNumber == 0 )
{
mPosInBuffer = 0;
mBuffer = mStream->read( mMaxBufferSize );
}

while ( ! mStream->atEnd() )
while ( !mBuffer.isEmpty() )
{
buffer = mStream->readLine();
if ( buffer.isNull() ) break;
// Identify position of \r , \n or \r\n
// We should rather use mStream->readLine(), but it fails to detect \r
// line endings.
int eolPos = mBuffer.indexOf( '\r', mPosInBuffer );
int nextPos = 0;
if ( eolPos >= 0 )
{
nextPos = eolPos + 1;
// Check if there is a \n just afterwards
if ( eolPos + 1 < mBuffer.size() )
{
if ( mBuffer[eolPos + 1] == '\n' )
{
nextPos = eolPos + 2;
}
}
else
{
// If we are just at the end of the buffer, read an extra character
// from the stream
QString newChar = mStream->read( 1 );
mBuffer += newChar;
if ( newChar == '\n' )
{
nextPos = eolPos + 2;
}
}
}
else
{
eolPos = mBuffer.indexOf( '\n', mPosInBuffer );
if ( eolPos >= 0 )
{
nextPos = eolPos + 1;
}
}
if ( eolPos < 0 )
{
if ( mPosInBuffer == 0 )
{
// If our current position was the beginning of the buffer and we
// didn't find any end of line character, then return the whole buffer
// (to avoid unbounded line sizes)
// and set the buffer to null so that we don't iterate any more.
buffer = mBuffer;
mBuffer = QString();
}
else
{
// Read more bytes from file to have up to mMaxBufferSize characters
// in our buffer (after having subset it from mPosInBuffer)
mBuffer = mBuffer.mid( mPosInBuffer );
mBuffer += mStream->read( mMaxBufferSize - mBuffer.size() );
mPosInBuffer = 0;
continue;
}
}
else
{
// Extract the current line from the buffer
buffer = mBuffer.mid( mPosInBuffer, eolPos - mPosInBuffer );
// Update current position in buffer to be the one next to the end of
// line character(s)
mPosInBuffer = nextPos;
}
mLineNumber++;
if ( skipBlank && buffer.isEmpty() ) continue;
return RecordOk;
@@ -428,6 +428,9 @@ class QgsDelimitedTextFile : public QObject
long mLineNumber = -1;
long mRecordLineNumber = -1;
long mRecordNumber = -1;
QString mBuffer;
int mPosInBuffer = 0;
int mMaxBufferSize = 0;
QStringList mCurrentRecord;
bool mHoldCurrentRecord = false;
// Maximum number of record (ie maximum record number visited)
@@ -933,6 +933,62 @@ def testEncodeuri(self):
uri = registry.encodeUri('delimitedtext', parts)
self.assertEqual(uri, 'file://' + filename)

def testCREndOfLineAndWorkingBuffer(self):
# Test CSV file with \r (CR) endings
# Test also that the logic to refill the buffer works properly
os.environ['QGIS_DELIMITED_TEXT_FILE_BUFFER_SIZE'] = '17'
try:
basetestfile = os.path.join(unitTestDataPath("delimitedtext"), 'test_cr_end_of_line.csv')

url = MyUrl.fromLocalFile(basetestfile)
url.addQueryItem("type", "csv")
url.addQueryItem("geomType", "none")

vl = QgsVectorLayer(url.toString(), 'test', 'delimitedtext')
assert vl.isValid(), "{} is invalid".format(basetestfile)

fields = vl.fields()
self.assertEqual(len(fields), 2)
self.assertEqual(fields[0].name(), 'col0')
self.assertEqual(fields[1].name(), 'col1')

features = [f for f in vl.getFeatures()]
self.assertEqual(len(features), 2)
self.assertEqual(features[0]['col0'], 'value00')
self.assertEqual(features[0]['col1'], 'value01')
self.assertEqual(features[1]['col0'], 'value10')
self.assertEqual(features[1]['col1'], 'value11')

finally:
del os.environ['QGIS_DELIMITED_TEXT_FILE_BUFFER_SIZE']

def testSaturationOfWorkingBuffer(self):
# 10 bytes is sufficient to detect the header line, but not enough for the
# first record
os.environ['QGIS_DELIMITED_TEXT_FILE_BUFFER_SIZE'] = '10'
try:
basetestfile = os.path.join(unitTestDataPath("delimitedtext"), 'test_cr_end_of_line.csv')

url = MyUrl.fromLocalFile(basetestfile)
url.addQueryItem("type", "csv")
url.addQueryItem("geomType", "none")

vl = QgsVectorLayer(url.toString(), 'test', 'delimitedtext')
assert vl.isValid(), "{} is invalid".format(basetestfile)

fields = vl.fields()
self.assertEqual(len(fields), 2)
self.assertEqual(fields[0].name(), 'col0')
self.assertEqual(fields[1].name(), 'col1')

features = [f for f in vl.getFeatures()]
self.assertEqual(len(features), 1)
self.assertEqual(features[0]['col0'], 'value00')
self.assertEqual(features[0]['col1'], 'va') # truncated

finally:
del os.environ['QGIS_DELIMITED_TEXT_FILE_BUFFER_SIZE']


if __name__ == '__main__':
unittest.main()
@@ -0,0 +1 @@
col0,col1value00,value01value10,value11

0 comments on commit 644a564

Please sign in to comment.
You can’t perform that action at this time.