Skip to content
Permalink
Browse files

[DelimitedText provider] Fix performance issue with files with Unix e…

…nd-of-line characters

Fixes #38068

The performance regression was introduced per commit 644a564
It resulted in quadratic performance in the size of the buffer for CSV files
with Unix '\n' end of line character, due to scanning repeatedly for `\r`
  • Loading branch information
rouault authored and nyalldawson committed Sep 14, 2020
1 parent 9924b15 commit a66784465a9dd8e9d344e37dbd0f304325ac1f6e
@@ -586,40 +586,63 @@ QgsDelimitedTextFile::Status QgsDelimitedTextFile::nextLine( QString &buffer, bo
// Identify position of \r , \n or \r\n
// We should rather use mStream->readLine(), but it fails to detect \r
// line endings.
int eolPos = mBuffer.indexOf( '\r', mPosInBuffer );
int nextPos = 0;
if ( eolPos >= 0 )
int eolPos = -1;
{
nextPos = eolPos + 1;
// Check if there is a \n just afterwards
if ( eolPos + 1 < mBuffer.size() )
if ( mLineNumber == 0 )
{
if ( mBuffer[eolPos + 1] == '\n' )
// For the first line we don't know yet the end of line character, so
// manually scan for the first we find
const QChar *charBuffer = mBuffer.constData();
const int bufferSize = mBuffer.size();
for ( int pos = mPosInBuffer; pos < bufferSize; ++pos )
{
nextPos = eolPos + 2;
if ( charBuffer[pos] == '\r' || charBuffer[pos] == '\n' )
{
mFirstEOLChar = charBuffer[pos];
eolPos = pos;
break;
}
}
}
else
{
// If we are just at the end of the buffer, read an extra character
// from the stream
QString newChar = mStream->read( 1 );
mBuffer += newChar;
if ( newChar == '\n' )
{
nextPos = eolPos + 2;
}
// Once we know the end of line character, use optimized indexOf()
eolPos = mBuffer.indexOf( mFirstEOLChar, mPosInBuffer );
}
}
else
if ( eolPos >= 0 )
{
eolPos = mBuffer.indexOf( '\n', mPosInBuffer );
if ( eolPos >= 0 )
int nextPos = eolPos + 1;
if ( mBuffer[eolPos] == '\r' )
{
nextPos = eolPos + 1;
// Check if there is a \n just afterwards
if ( eolPos + 1 < mBuffer.size() )
{
if ( mBuffer[eolPos + 1] == '\n' )
{
nextPos = eolPos + 2;
}
}
else
{
// If we are just at the end of the buffer, read an extra character
// from the stream
QString newChar = mStream->read( 1 );
mBuffer += newChar;
if ( newChar == '\n' )
{
nextPos = eolPos + 2;
}
}
}

// Extract the current line from the buffer
buffer = mBuffer.mid( mPosInBuffer, eolPos - mPosInBuffer );
// Update current position in buffer to be the one next to the end of
// line character(s)
mPosInBuffer = nextPos;
}
if ( eolPos < 0 )
else
{
if ( mPosInBuffer == 0 )
{
@@ -640,14 +663,6 @@ QgsDelimitedTextFile::Status QgsDelimitedTextFile::nextLine( QString &buffer, bo
continue;
}
}
else
{
// Extract the current line from the buffer
buffer = mBuffer.mid( mPosInBuffer, eolPos - mPosInBuffer );
// Update current position in buffer to be the one next to the end of
// line character(s)
mPosInBuffer = nextPos;
}
mLineNumber++;
if ( skipBlank && buffer.isEmpty() ) continue;
return RecordOk;
@@ -433,6 +433,7 @@ class QgsDelimitedTextFile : public QObject
QString mBuffer;
int mPosInBuffer = 0;
int mMaxBufferSize = 0;
QChar mFirstEOLChar = 0; // '\r' if EOL is "\r" or "\r\n", or `\n' if EOL is "\n"
QStringList mCurrentRecord;
bool mHoldCurrentRecord = false;
// Maximum number of record (ie maximum record number visited)

0 comments on commit a667844

Please sign in to comment.
You can’t perform that action at this time.