Optimise the CSV import performance
This commit bundles a number of smaller optimisations in the CSV parser
and import code. They do add up to a noticeable speed gain though (at
least on some systems and configurations).
MKleusberg committed Sep 13, 2017
1 parent 6ed8080 commit 0eb1f65
Showing 5 changed files with 93 additions and 90 deletions.
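
The biggest win here is in how parsed fields reach SQLite: the parser now hands each row over as a QVector<QByteArray> that is already UTF-8 encoded, so every field is converted exactly once, and the import loop binds those bytes with SQLITE_STATIC instead of SQLITE_TRANSIENT, which spares SQLite its own defensive copy. A minimal sketch of that idea (simplified, not the exact commit code):

#include <sqlite3.h>
#include <QByteArray>
#include <QVector>

// Bind one parsed CSV row to a prepared INSERT statement. The QByteArrays
// arrive from the parser already UTF-8 encoded, so there is no per-bind
// QString -> UTF-8 conversion, and SQLITE_STATIC tells SQLite not to copy
// the buffers; safe here because the row outlives the sqlite3_step() call.
static bool insertRow(sqlite3_stmt* stmt, const QVector<QByteArray>& row)
{
    for(int i = 0; i < row.size(); ++i)
        sqlite3_bind_text(stmt, i + 1, row.at(i).constData(), row.at(i).size(), SQLITE_STATIC);

    bool done = (sqlite3_step(stmt) == SQLITE_DONE);
    sqlite3_reset(stmt);
    return done;
}
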
34 changes: 17 additions & 17 deletions src/ImportCsvDialog.cpp
@@ -104,7 +104,7 @@ void rollback(
class CSVImportProgress : public CSVProgress
{
public:
explicit CSVImportProgress(size_t filesize)
explicit CSVImportProgress(qint64 filesize)
{
m_pProgressDlg = new QProgressDialog(
QObject::tr("Importing CSV file..."),
@@ -124,7 +124,7 @@ class CSVImportProgress : public CSVProgress
m_pProgressDlg->show();
}

bool update(size_t pos)
bool update(qint64 pos)
{
m_pProgressDlg->setValue(pos);
qApp->processEvents();
@@ -203,7 +203,7 @@ void ImportCsvDialog::updatePreview()
ui->tablePreview->setHorizontalHeaderLabels(horizontalHeader);

// Parse file
parseCSV(selectedFile, [this](size_t rowNum, const QStringList& data) -> bool {
parseCSV(selectedFile, [this](size_t rowNum, const QVector<QByteArray>& data) -> bool {
// Skip first row if it is to be used as header
if(rowNum == 0 && ui->checkboxHeader->isChecked())
return true;
@@ -215,7 +215,7 @@

// Fill data section
ui->tablePreview->setRowCount(ui->tablePreview->rowCount() + 1);
for(QStringList::const_iterator it=data.begin();it!=data.end();++it)
for(auto it=data.constBegin();it!=data.constEnd();++it)
{
// Generate vertical header items
if(it == data.begin())
@@ -225,7 +225,7 @@
ui->tablePreview->setItem(
rowNum,
std::distance(data.begin(), it),
new QTableWidgetItem(*it));
new QTableWidgetItem(QString(*it)));
}

return true;
@@ -320,7 +320,7 @@ void ImportCsvDialog::matchSimilar()
checkInput();
}

CSVParser::ParserResult ImportCsvDialog::parseCSV(const QString &fileName, std::function<bool(size_t, QStringList)> rowFunction, qint64 count)
CSVParser::ParserResult ImportCsvDialog::parseCSV(const QString &fileName, std::function<bool(size_t, QVector<QByteArray>)> rowFunction, size_t count)
{
// Parse all csv data
QFile file(fileName);
@@ -329,7 +329,7 @@ CSVParser::ParserResult ImportCsvDialog::parseCSV(const QString &fileName, std::
CSVParser csv(ui->checkBoxTrimFields->isChecked(), currentSeparatorChar(), currentQuoteChar());

// Only show progress dialog if we parse all rows. The assumption here is that if a row count limit has been set, it won't be a very high one.
if(count == -1)
if(count == 0)
csv.setCSVProgress(new CSVImportProgress(file.size()));

QTextStream tstream(&file);
@@ -343,7 +343,7 @@ sqlb::FieldVector ImportCsvDialog::generateFieldList(const QString& filename)
sqlb::FieldVector fieldList; // List of fields in the file

// Parse the first couple of records of the CSV file and only analyse them
parseCSV(filename, [this, &fieldList](size_t rowNum, const QStringList& data) -> bool {
parseCSV(filename, [this, &fieldList](size_t rowNum, const QVector<QByteArray>& data) -> bool {
// Has this row more columns than the previous one? Then add more fields to the field list as necessary.
for(int i=fieldList.size();i<data.size();i++)
{
@@ -436,7 +436,7 @@ void ImportCsvDialog::importCsv(const QString& fileName, const QString &name)
return rollback(this, pdb, restorepointName, 0, tr("Creating restore point failed: %1").arg(pdb->lastError()));

// Create table
QStringList nullValues;
QVector<QByteArray> nullValues;
if(!importToExistingTable)
{
if(!pdb->createTable(sqlb::ObjectIdentifier("main", tableName), fieldList))
@@ -454,7 +454,7 @@ void ImportCsvDialog::importCsv(const QString& fileName, const QString &name)
if(f->isInteger() && f->notnull()) // If this is an integer column but NULL isn't allowed, insert 0
nullValues << "0";
else if(f->isInteger() && !f->notnull()) // If this is an integer column and NULL is allowed, insert NULL
nullValues << QString();
nullValues << QByteArray();
else // Otherwise (i.e. if this isn't an integer column), insert an empty string
nullValues << "";
}
@@ -472,7 +472,7 @@ void ImportCsvDialog::importCsv(const QString& fileName, const QString &name)

// Parse entire file
size_t lastRowNum = 0;
CSVParser::ParserResult result = parseCSV(fileName, [&](size_t rowNum, const QStringList& data) -> bool {
CSVParser::ParserResult result = parseCSV(fileName, [&](size_t rowNum, const QVector<QByteArray>& data) -> bool {
// Process the parser results row by row

#ifdef CSV_BENCHMARK
@@ -487,20 +487,20 @@ void ImportCsvDialog::importCsv(const QString& fileName, const QString &name)
return true;

// Bind all values
unsigned int bound_fields = 0;
for(int i=0;i<data.size();i++,bound_fields++)
int bound_fields = 0;
for(auto it=data.constBegin();it!=data.constEnd();++it,bound_fields++)
{
// Empty values need special treatment, but only when importing into an existing table where we could find out something about
// its table definition
if(importToExistingTable && data.at(i).isEmpty() && nullValues.size() > i)
if(importToExistingTable && it->isEmpty() && nullValues.size() > bound_fields)
{
// This is an empty value. We'll need to look up how to handle it depending on the field to be inserted into.
QString val = nullValues.at(i);
const QByteArray& val = nullValues.at(bound_fields);
if(!val.isNull()) // No need to bind NULL values here as that is the default bound value in SQLite
sqlite3_bind_text(stmt, i+1, val.toUtf8(), val.toUtf8().size(), SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, bound_fields+1, val, val.size(), SQLITE_STATIC);
} else {
// This is a non-empty value. Just add it to the statement
sqlite3_bind_text(stmt, i+1, static_cast<const char*>(data.at(i).toUtf8()), data.at(i).toUtf8().size(), SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, bound_fields+1, *it, it->size(), SQLITE_STATIC);
}
}

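
One detail worth calling out from the hunk above: the replacement values for empty fields are now pre-computed once per column as QByteArrays instead of being rebuilt per row. The policy, restated as a small standalone helper (illustrative only, not code from the commit):

#include <QByteArray>

// Replacement value for an empty CSV field, by target column type.
// A null QByteArray means "bind nothing", which leaves the parameter at
// SQLite's default bound value, i.e. NULL.
static QByteArray nullValueFor(bool isInteger, bool notNull)
{
    if(isInteger && notNull)
        return "0";              // integer NOT NULL: insert 0
    if(isInteger)
        return QByteArray();     // nullable integer: insert NULL
    return "";                   // any other column: empty string
}
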
2 changes: 1 addition & 1 deletion src/ImportCsvDialog.h
@@ -39,7 +39,7 @@ private slots:
DBBrowserDB* pdb;
QCompleter* encodingCompleter;

CSVParser::ParserResult parseCSV(const QString& fileName, std::function<bool(size_t, QStringList)> rowFunction, qint64 count = -1);
CSVParser::ParserResult parseCSV(const QString& fileName, std::function<bool(size_t, QVector<QByteArray>)> rowFunction, size_t count = 0);
sqlb::FieldVector generateFieldList(const QString& filename);

void importCsv(const QString& f, const QString &n = QString());
79 changes: 48 additions & 31 deletions src/csvparser.cpp
@@ -3,7 +3,7 @@
#include <QTextStream>
#include <algorithm>

CSVParser::CSVParser(bool trimfields, const QChar& fieldseparator, const QChar& quotechar)
CSVParser::CSVParser(bool trimfields, char16_t fieldseparator, char16_t quotechar)
: m_bTrimFields(trimfields)
, m_cFieldSeparator(fieldseparator)
, m_cQuoteChar(quotechar)
@@ -18,34 +18,49 @@ CSVParser::~CSVParser()
}

namespace {
inline void addColumn(QStringList& r, QString& field, bool trim)
inline void addColumn(QVector<QByteArray>& r, QString& field, bool trim)
{
if(trim)
r << field.trimmed();
r.push_back(field.trimmed().toUtf8());
else
r << field;
r.push_back(field.toUtf8());

field.clear();
field.reserve(128);
}

inline bool addRow(CSVParser::csvRowFunction& f, QVector<QByteArray>& r, size_t& rowCount)
{
if(!f(rowCount, r))
return false;

r.clear();
rowCount++;
return true;
}
}

CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStream& stream, qint64 nMaxRecords)
CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStream& stream, size_t nMaxRecords)
{
m_iParsedRows = 0;
m_insertFunction = insertFunction;
ParseStates state = StateNormal;
QString fieldbuf;
QStringList record;
ParseStates state = StateNormal; // State of the parser
QString sBuffer; // Buffer for reading in the file
QString fieldbuf; // Buffer for parsing the current field
QVector<QByteArray> record; // Buffer for parsing the current row
size_t parsedRows = 0; // Number of rows parsed so far

if(m_pCSVProgress)
m_pCSVProgress->start();

while(!stream.atEnd())
{
QString sBuffer = stream.read(m_nBufferSize);
sBuffer = stream.read(m_nBufferSize);
auto sBufferEnd = sBuffer.constEnd();

for(QString::iterator it = sBuffer.begin(); it != sBuffer.end(); ++it)
for(auto it = sBuffer.constBegin(); it != sBufferEnd; ++it)
{
QChar c = *it;
// Get next char
char16_t c = it->unicode();

switch(state)
{
case StateNormal:
@@ -61,38 +61,39 @@ CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStr
else if(c == '\r')
{
// look ahead to check for linefeed
QString::iterator nit = it + 1;
auto nit = it + 1;

// In order to check what the next byte is we must make sure that that byte is already loaded. Assume we're at an m_nBufferSize
// boundary but not at the end of the file when we hit a \r character. Now we're going to be at the end of the sBuffer string
// because of the m_nBufferSize boundary. But this means that the following check won't work properly because we can't check the
// next byte when we really should be able to do so because there's more data coming. To fix this we'll check for this particular
// case and, if this is what's happening, we'll just load an extra byte.
if(nit == sBuffer.end() && !stream.atEnd())
if(nit == sBufferEnd && !stream.atEnd())
{
// Load one more byte
sBuffer.append(stream.read(1));
sBufferEnd = sBuffer.constEnd();

// Restore both iterators. sBuffer.end() points to the imagined char after the last one in the string. So the extra byte we've
// Restore both iterators. sBufferEnd points to the imagined char after the last one in the string. So the extra byte we've
// just loaded is the one before that, i.e. the actual last one, and the original last char is the one before that.
it = sBuffer.end() - 2;
nit = sBuffer.end() - 1;
it = sBufferEnd - 2;
nit = sBufferEnd - 1;
}

// no linefeed, so assume that CR represents a newline
if(nit != sBuffer.end() && *nit != '\n')
if(nit != sBufferEnd && *nit != '\n')
{
addColumn(record, fieldbuf, m_bTrimFields);

if(!addRow(record))
if(!addRow(insertFunction, record, parsedRows))
return ParserResult::ParserResultError;
}
}
else if(c == '\n')
{
addColumn(record, fieldbuf, m_bTrimFields);

if(!addRow(record))
if(!addRow(insertFunction, record, parsedRows))
return ParserResult::ParserResultError;
}
else
@@ -130,28 +146,29 @@ CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStr
state = StateNormal;
addColumn(record, fieldbuf, m_bTrimFields);

if(!addRow(record))
if(!addRow(insertFunction, record, parsedRows))
return ParserResult::ParserResultError;
}
else if(c == '\r')
{
// look ahead to check for linefeed
QString::iterator nit = it + 1;
auto nit = it + 1;

// See above for details on this.
if(nit == sBuffer.end() && !stream.atEnd())
if(nit == sBufferEnd && !stream.atEnd())
{
sBuffer.append(stream.read(1));
it = sBuffer.end() - 2;
nit = sBuffer.end() - 1;
sBufferEnd = sBuffer.constEnd();
it = sBufferEnd - 2;
nit = sBufferEnd - 1;
}

// no linefeed, so assume that CR represents a newline
if(nit != sBuffer.end() && *nit != '\n')
if(nit != sBufferEnd && *nit != '\n')
{
addColumn(record, fieldbuf, m_bTrimFields);

if(!addRow(record))
if(!addRow(insertFunction, record, parsedRows))
return ParserResult::ParserResultError;
}
}
@@ -164,11 +181,11 @@ CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStr
break;
}

if(nMaxRecords != -1 && m_iParsedRows >= nMaxRecords)
if(nMaxRecords > 0 && parsedRows >= nMaxRecords)
return ParserResult::ParserResultSuccess;
}

if(m_pCSVProgress && m_iParsedRows % 100 == 0)
if(m_pCSVProgress && parsedRows % 100 == 0)
{
if(!m_pCSVProgress->update(stream.pos()))
return ParserResult::ParserResultCancelled;
@@ -179,7 +196,7 @@ CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStr
{
addColumn(record, fieldbuf, m_bTrimFields);

if(!addRow(record))
if(!addRow(insertFunction, record, parsedRows))
return ParserResult::ParserResultError;
}

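
The subtlest piece of the parser rewrite is the carriage-return look-ahead at buffer boundaries, explained in the long comment above. The same technique in isolation (a simplified sketch, not the commit's code):

#include <QString>
#include <QTextStream>

// We just read a '\r' at position `it`. Decide whether it is a bare CR
// newline or the first half of a CRLF pair. If the '\r' happens to be the
// last character of the current read buffer while the stream still has
// data, load one extra character first so the look-ahead can see it.
static bool carriageReturnIsNewline(QString& buffer, QString::const_iterator& it,
                                    QString::const_iterator& end, QTextStream& stream)
{
    auto next = it + 1;
    if(next == end && !stream.atEnd())
    {
        buffer.append(stream.read(1)); // fetch the character after the '\r'
        end = buffer.constEnd();       // the append may have reallocated
        it = end - 2;                  // re-point at the '\r'
        next = end - 1;                // the freshly loaded character
    }

    // Only a CR that is visibly not followed by '\n' counts as a newline
    // here; a trailing CR at end of input is handled after the main loop.
    return next != end && *next != QLatin1Char('\n');
}
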
32 changes: 9 additions & 23 deletions src/csvparser.h
@@ -1,8 +1,7 @@
#ifndef CSVPARSER_H
#define CSVPARSER_H

#include <QChar>
#include <QStringList>
#include <QVector>
#include <functional>

class QTextStream;
@@ -18,16 +17,16 @@ class CSVProgress
virtual ~CSVProgress() { }

virtual void start() = 0;
virtual bool update(size_t pos) = 0;
virtual bool update(qint64 pos) = 0;
virtual void end() = 0;
};

class CSVParser
{
public:
typedef std::function<bool(size_t, QStringList)> csvRowFunction;
typedef std::function<bool(size_t, QVector<QByteArray>)> csvRowFunction;

CSVParser(bool trimfields = true, const QChar& fieldseparator = ',', const QChar& quotechar = '"');
CSVParser(bool trimfields = true, char16_t fieldseparator = ',', char16_t quotechar = '"');
~CSVParser();

enum ParserResult
@@ -42,10 +41,10 @@ class CSVParser
* @param insertFunction A function pointer that is called for each parsed row. It is passed two parameters, the row number and a list of all parsed columns
* in the row. The called function may return false if an error occurred to stop the import process. Otherwise it should return true.
* \param stream Stream with the CSV parser
* \param nMaxRecords Max records to read, -1 if unlimited
* \param nMaxRecords Max records to read, 0 if unlimited
* \return ParserResult value that indicates whether the action finished normally, was cancelled or errored.
*/
ParserResult parse(csvRowFunction insertFunction, QTextStream& stream, qint64 nMaxRecords = -1);
ParserResult parse(csvRowFunction insertFunction, QTextStream& stream, size_t nMaxRecords = 0);

void setCSVProgress(CSVProgress* csvp) { m_pCSVProgress = csvp; }

@@ -57,26 +56,13 @@ class CSVParser
StateEndQuote
};

inline bool addRow(QStringList& r)
{
if(!m_insertFunction(m_iParsedRows, r))
return false;

r.clear();
m_iParsedRows++;
return true;
}

private:
bool m_bTrimFields;
QChar m_cFieldSeparator;
QChar m_cQuoteChar;
char16_t m_cFieldSeparator;
char16_t m_cQuoteChar;
CSVProgress* m_pCSVProgress;
csvRowFunction m_insertFunction;

qint64 m_iParsedRows; // Number of rows parsed so far

size_t m_nBufferSize; //! internal buffer read size
qint64 m_nBufferSize; //! internal buffer read size
};

#endif
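
For reference, a minimal caller of the reworked interface (the file name is made up and error handling is trimmed; a usage sketch, not code from this commit):

#include <QByteArray>
#include <QDebug>
#include <QFile>
#include <QTextStream>
#include <QVector>
#include "csvparser.h"

int main()
{
    QFile file("example.csv");              // hypothetical input file
    if(!file.open(QIODevice::ReadOnly))
        return 1;

    QTextStream stream(&file);
    CSVParser csv(true, ',', '"');          // trim fields, default separator/quote

    // The row callback receives the row number and the parsed fields;
    // returning false aborts the parse with ParserResultError.
    CSVParser::ParserResult result = csv.parse(
        [](size_t rowNum, const QVector<QByteArray>& row) -> bool {
            qDebug() << "row" << static_cast<qulonglong>(rowNum)
                     << "has" << row.size() << "fields";
            return true;
        }, stream, 20);                     // read at most 20 rows; 0 means no limit

    return result == CSVParser::ParserResultSuccess ? 0 : 1;
}
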

4 comments on commit 0eb1f65

@MKleusberg (Member Author)

This commit is worth benchmarking, too 😄 As far as I can see, we've reduced the import time from 3 minutes in DB4S 3.9.1 to around 2 minutes with the memory consumption patch. With any luck this should help reduce it even more.

@justinclift (Member)

No worries. I'll get to this pretty soon. Just finished the ToDo list item I was working on, so good timing. 😄

@MKleusberg (Member Author)

No need to hurry though - I'm taking the rest of the day off anyway 😄

@justinclift (Member) commented on 0eb1f65 Sep 13, 2017

Looks good. This has knocked another 20 seconds off the import time for the UK postcode CSV file. 😄

Interestingly, the Trim fields option seems to be noticeable now (~1 sec difference).

With Trim fields enabled:

Importing the file '/home/jc/Databases/National_Statistics_Postcode_Lookup_UK.csv' took 100947ms. Of this 11512ms were spent in the row function.
Importing the file '/home/jc/Databases/National_Statistics_Postcode_Lookup_UK.csv' took 100742ms. Of this 11961ms were spent in the row function.
Importing the file '/home/jc/Databases/National_Statistics_Postcode_Lookup_UK.csv' took 101307ms. Of this 11845ms were spent in the row function.

With Trim fields disabled:

Importing the file '/home/jc/Databases/National_Statistics_Postcode_Lookup_UK.csv' took 99397ms. Of this 11533ms were spent in the row function.
Importing the file '/home/jc/Databases/National_Statistics_Postcode_Lookup_UK.csv' took 99306ms. Of this 11565ms were spent in the row function.
Importing the file '/home/jc/Databases/National_Statistics_Postcode_Lookup_UK.csv' took 99901ms. Of this 12040ms were spent in the row function.
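
For context on where these numbers come from: the timings are printed by a CSV_BENCHMARK build (see the #ifdef in ImportCsvDialog.cpp above). The macro's implementation isn't part of this diff, but instrumentation along these lines would produce output of this shape (a hypothetical sketch; every name in it is invented):

#include <QByteArray>
#include <QDebug>
#include <QElapsedTimer>
#include <QString>
#include <QVector>

// Time a whole import and, separately, the time spent inside the row
// callback. `runImport` stands in for the real parse call.
template <typename ImportFn, typename RowFn>
void timedImport(const QString& fileName, ImportFn runImport, RowFn rowFn)
{
    QElapsedTimer total;
    qint64 nsInRowFn = 0;
    total.start();

    runImport([&](size_t rowNum, const QVector<QByteArray>& data) -> bool {
        QElapsedTimer t;
        t.start();
        bool ok = rowFn(rowNum, data);
        nsInRowFn += t.nsecsElapsed();
        return ok;
    });

    qDebug().nospace().noquote() << "Importing the file '" << fileName << "' took "
                                 << total.elapsed() << "ms. Of this "
                                 << nsInRowFn / 1000000 << "ms were spent in the row function.";
}
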
