Permalink
Browse files

Optimise the CSV import performance

This commit bundles a number of smaller optimisations in the CSV parser
and import code. They do add up to a noticeable speed gain though (at
least on some systems and configurations).
  • Loading branch information...
MKleusberg committed Sep 13, 2017
1 parent 6ed8080 commit 0eb1f6579815ee1148323ce928d24eab9f0f8002
Showing with 93 additions and 90 deletions.
  1. +17 −17 src/ImportCsvDialog.cpp
  2. +1 −1 src/ImportCsvDialog.h
  3. +48 −31 src/csvparser.cpp
  4. +9 −23 src/csvparser.h
  5. +18 −18 src/tests/TestImport.cpp
@@ -104,7 +104,7 @@ void rollback(
class CSVImportProgress : public CSVProgress
{
public:
explicit CSVImportProgress(size_t filesize)
explicit CSVImportProgress(qint64 filesize)
{
m_pProgressDlg = new QProgressDialog(
QObject::tr("Importing CSV file..."),
@@ -124,7 +124,7 @@ class CSVImportProgress : public CSVProgress
m_pProgressDlg->show();
}

bool update(size_t pos)
bool update(qint64 pos)
{
m_pProgressDlg->setValue(pos);
qApp->processEvents();
@@ -203,7 +203,7 @@ void ImportCsvDialog::updatePreview()
ui->tablePreview->setHorizontalHeaderLabels(horizontalHeader);

// Parse file
parseCSV(selectedFile, [this](size_t rowNum, const QStringList& data) -> bool {
parseCSV(selectedFile, [this](size_t rowNum, const QVector<QByteArray>& data) -> bool {
// Skip first row if it is to be used as header
if(rowNum == 0 && ui->checkboxHeader->isChecked())
return true;
@@ -215,7 +215,7 @@ void ImportCsvDialog::updatePreview()

// Fill data section
ui->tablePreview->setRowCount(ui->tablePreview->rowCount() + 1);
for(QStringList::const_iterator it=data.begin();it!=data.end();++it)
for(auto it=data.constBegin();it!=data.constEnd();++it)
{
// Generate vertical header items
if(it == data.begin())
@@ -225,7 +225,7 @@ void ImportCsvDialog::updatePreview()
ui->tablePreview->setItem(
rowNum,
std::distance(data.begin(), it),
new QTableWidgetItem(*it));
new QTableWidgetItem(QString(*it)));
}

return true;
@@ -320,7 +320,7 @@ void ImportCsvDialog::matchSimilar()
checkInput();
}

CSVParser::ParserResult ImportCsvDialog::parseCSV(const QString &fileName, std::function<bool(size_t, QStringList)> rowFunction, qint64 count)
CSVParser::ParserResult ImportCsvDialog::parseCSV(const QString &fileName, std::function<bool(size_t, QVector<QByteArray>)> rowFunction, size_t count)
{
// Parse all csv data
QFile file(fileName);
@@ -329,7 +329,7 @@ CSVParser::ParserResult ImportCsvDialog::parseCSV(const QString &fileName, std::
CSVParser csv(ui->checkBoxTrimFields->isChecked(), currentSeparatorChar(), currentQuoteChar());

// Only show progress dialog if we parse all rows. The assumption here is that if a row count limit has been set, it won't be a very high one.
if(count == -1)
if(count == 0)
csv.setCSVProgress(new CSVImportProgress(file.size()));

QTextStream tstream(&file);
@@ -343,7 +343,7 @@ sqlb::FieldVector ImportCsvDialog::generateFieldList(const QString& filename)
sqlb::FieldVector fieldList; // List of fields in the file

// Parse the first couple of records of the CSV file and only analyse them
parseCSV(filename, [this, &fieldList](size_t rowNum, const QStringList& data) -> bool {
parseCSV(filename, [this, &fieldList](size_t rowNum, const QVector<QByteArray>& data) -> bool {
// Has this row more columns than the previous one? Then add more fields to the field list as necessary.
for(int i=fieldList.size();i<data.size();i++)
{
@@ -436,7 +436,7 @@ void ImportCsvDialog::importCsv(const QString& fileName, const QString &name)
return rollback(this, pdb, restorepointName, 0, tr("Creating restore point failed: %1").arg(pdb->lastError()));

// Create table
QStringList nullValues;
QVector<QByteArray> nullValues;
if(!importToExistingTable)
{
if(!pdb->createTable(sqlb::ObjectIdentifier("main", tableName), fieldList))
@@ -454,7 +454,7 @@ void ImportCsvDialog::importCsv(const QString& fileName, const QString &name)
if(f->isInteger() && f->notnull()) // If this is an integer column but NULL isn't allowed, insert 0
nullValues << "0";
else if(f->isInteger() && !f->notnull()) // If this is an integer column and NULL is allowed, insert NULL
nullValues << QString();
nullValues << QByteArray();
else // Otherwise (i.e. if this isn't an integer column), insert an empty string
nullValues << "";
}
@@ -472,7 +472,7 @@ void ImportCsvDialog::importCsv(const QString& fileName, const QString &name)

// Parse entire file
size_t lastRowNum = 0;
CSVParser::ParserResult result = parseCSV(fileName, [&](size_t rowNum, const QStringList& data) -> bool {
CSVParser::ParserResult result = parseCSV(fileName, [&](size_t rowNum, const QVector<QByteArray>& data) -> bool {
// Process the parser results row by row

#ifdef CSV_BENCHMARK
@@ -487,20 +487,20 @@ void ImportCsvDialog::importCsv(const QString& fileName, const QString &name)
return true;

// Bind all values
unsigned int bound_fields = 0;
for(int i=0;i<data.size();i++,bound_fields++)
int bound_fields = 0;
for(auto it=data.constBegin();it!=data.constEnd();++it,bound_fields++)
{
// Empty values need special treatment, but only when importing into an existing table where we could find out something about
// its table definition
if(importToExistingTable && data.at(i).isEmpty() && nullValues.size() > i)
if(importToExistingTable && it->isEmpty() && nullValues.size() > bound_fields)
{
// This is an empty value. We'll need to look up how to handle it depending on the field to be inserted into.
QString val = nullValues.at(i);
const QByteArray& val = nullValues.at(bound_fields);
if(!val.isNull()) // No need to bind NULL values here as that is the default bound value in SQLite
sqlite3_bind_text(stmt, i+1, val.toUtf8(), val.toUtf8().size(), SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, bound_fields+1, val, val.size(), SQLITE_STATIC);
} else {
// This is a non-empty value. Just add it to the statement
sqlite3_bind_text(stmt, i+1, static_cast<const char*>(data.at(i).toUtf8()), data.at(i).toUtf8().size(), SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, bound_fields+1, *it, it->size(), SQLITE_STATIC);
}
}

@@ -39,7 +39,7 @@ private slots:
DBBrowserDB* pdb;
QCompleter* encodingCompleter;

CSVParser::ParserResult parseCSV(const QString& fileName, std::function<bool(size_t, QStringList)> rowFunction, qint64 count = -1);
CSVParser::ParserResult parseCSV(const QString& fileName, std::function<bool(size_t, QVector<QByteArray>)> rowFunction, size_t count = 0);
sqlb::FieldVector generateFieldList(const QString& filename);

void importCsv(const QString& f, const QString &n = QString());
@@ -3,7 +3,7 @@
#include <QTextStream>
#include <algorithm>

CSVParser::CSVParser(bool trimfields, const QChar& fieldseparator, const QChar& quotechar)
CSVParser::CSVParser(bool trimfields, char16_t fieldseparator, char16_t quotechar)
: m_bTrimFields(trimfields)
, m_cFieldSeparator(fieldseparator)
, m_cQuoteChar(quotechar)
@@ -18,34 +18,49 @@ CSVParser::~CSVParser()
}

namespace {
inline void addColumn(QStringList& r, QString& field, bool trim)
inline void addColumn(QVector<QByteArray>& r, QString& field, bool trim)
{
if(trim)
r << field.trimmed();
r.push_back(field.trimmed().toUtf8());
else
r << field;
r.push_back(field.toUtf8());

field.clear();
field.reserve(128);
}

inline bool addRow(CSVParser::csvRowFunction& f, QVector<QByteArray>& r, size_t& rowCount)
{
if(!f(rowCount, r))
return false;

r.clear();
rowCount++;
return true;
}
}

CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStream& stream, qint64 nMaxRecords)
CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStream& stream, size_t nMaxRecords)
{
m_iParsedRows = 0;
m_insertFunction = insertFunction;
ParseStates state = StateNormal;
QString fieldbuf;
QStringList record;
ParseStates state = StateNormal; // State of the parser
QString sBuffer; // Buffer for reading in the file
QString fieldbuf; // Buffer for parsing the current field
QVector<QByteArray> record; // Buffer for parsing the current row
size_t parsedRows = 0; // Number of rows parsed so far

if(m_pCSVProgress)
m_pCSVProgress->start();

while(!stream.atEnd())
{
QString sBuffer = stream.read(m_nBufferSize);
sBuffer = stream.read(m_nBufferSize);
auto sBufferEnd = sBuffer.constEnd();

for(QString::iterator it = sBuffer.begin(); it != sBuffer.end(); ++it)
for(auto it = sBuffer.constBegin(); it != sBufferEnd; ++it)
{
QChar c = *it;
// Get next char
char16_t c = it->unicode();

switch(state)
{
case StateNormal:
@@ -61,38 +76,39 @@ CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStr
else if(c == '\r')
{
// look ahead to check for linefeed
QString::iterator nit = it + 1;
auto nit = it + 1;

// In order to check what the next byte is we must make sure that that byte is already loaded. Assume we're at an m_nBufferSize
// boundary but not at the end of the file when we hit a \r character. Now we're going to be at the end of the sBuffer string
// because of the m_nBufferSize boundary. But this means that the following check won't work properly because we can't check the
// next byte when we really should be able to do so because there's more data coming. To fix this we'll check for this particular
// case and, if this is what's happening, we'll just load an extra byte.
if(nit == sBuffer.end() && !stream.atEnd())
if(nit == sBufferEnd && !stream.atEnd())
{
// Load one more byte
sBuffer.append(stream.read(1));
sBufferEnd = sBuffer.constEnd();

// Restore both iterators. sBuffer.end() points to the imagined char after the last one in the string. So the extra byte we've
// Restore both iterators. sBufferEnd points to the imagined char after the last one in the string. So the extra byte we've
// just loaded is the one before that, i.e. the actual last one, and the original last char is the one before that.
it = sBuffer.end() - 2;
nit = sBuffer.end() - 1;
it = sBufferEnd - 2;
nit = sBufferEnd - 1;
}

// no linefeed, so assume that CR represents a newline
if(nit != sBuffer.end() && *nit != '\n')
if(nit != sBufferEnd && *nit != '\n')
{
addColumn(record, fieldbuf, m_bTrimFields);

if(!addRow(record))
if(!addRow(insertFunction, record, parsedRows))
return ParserResult::ParserResultError;
}
}
else if(c == '\n')
{
addColumn(record, fieldbuf, m_bTrimFields);

if(!addRow(record))
if(!addRow(insertFunction, record, parsedRows))
return ParserResult::ParserResultError;
}
else
@@ -130,28 +146,29 @@ CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStr
state = StateNormal;
addColumn(record, fieldbuf, m_bTrimFields);

if(!addRow(record))
if(!addRow(insertFunction, record, parsedRows))
return ParserResult::ParserResultError;
}
else if(c == '\r')
{
// look ahead to check for linefeed
QString::iterator nit = it + 1;
auto nit = it + 1;

// See above for details on this.
if(nit == sBuffer.end() && !stream.atEnd())
if(nit == sBufferEnd && !stream.atEnd())
{
sBuffer.append(stream.read(1));
it = sBuffer.end() - 2;
nit = sBuffer.end() - 1;
sBufferEnd = sBuffer.constEnd();
it = sBufferEnd - 2;
nit = sBufferEnd - 1;
}

// no linefeed, so assume that CR represents a newline
if(nit != sBuffer.end() && *nit != '\n')
if(nit != sBufferEnd && *nit != '\n')
{
addColumn(record, fieldbuf, m_bTrimFields);

if(!addRow(record))
if(!addRow(insertFunction, record, parsedRows))
return ParserResult::ParserResultError;
}
}
@@ -164,11 +181,11 @@ CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStr
break;
}

if(nMaxRecords != -1 && m_iParsedRows >= nMaxRecords)
if(nMaxRecords > 0 && parsedRows >= nMaxRecords)
return ParserResult::ParserResultSuccess;
}

if(m_pCSVProgress && m_iParsedRows % 100 == 0)
if(m_pCSVProgress && parsedRows % 100 == 0)
{
if(!m_pCSVProgress->update(stream.pos()))
return ParserResult::ParserResultCancelled;
@@ -179,7 +196,7 @@ CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStr
{
addColumn(record, fieldbuf, m_bTrimFields);

if(!addRow(record))
if(!addRow(insertFunction, record, parsedRows))
return ParserResult::ParserResultError;
}

@@ -1,8 +1,7 @@
#ifndef CSVPARSER_H
#define CSVPARSER_H

#include <QChar>
#include <QStringList>
#include <QVector>
#include <functional>

class QTextStream;
@@ -18,16 +17,16 @@ class CSVProgress
virtual ~CSVProgress() { }

virtual void start() = 0;
virtual bool update(size_t pos) = 0;
virtual bool update(qint64 pos) = 0;
virtual void end() = 0;
};

class CSVParser
{
public:
typedef std::function<bool(size_t, QStringList)> csvRowFunction;
typedef std::function<bool(size_t, QVector<QByteArray>)> csvRowFunction;

CSVParser(bool trimfields = true, const QChar& fieldseparator = ',', const QChar& quotechar = '"');
CSVParser(bool trimfields = true, char16_t fieldseparator = ',', char16_t quotechar = '"');
~CSVParser();

enum ParserResult
@@ -42,10 +41,10 @@ class CSVParser
* @param insertFunction A function pointer that is called for each parsed row. It is passed two parameters, the row number and a list of all parsed columns
* in the row. The called function may return false if an error occurred to stop the import process. Otherwise it should return true.
* \param stream Stream with the CSV parser
* \param nMaxRecords Max records to read, -1 if unlimited
* \param nMaxRecords Max records to read, 0 if unlimited
* \return ParserResult value that indicates whether the action finished normally, was cancelled or errored.
*/
ParserResult parse(csvRowFunction insertFunction, QTextStream& stream, qint64 nMaxRecords = -1);
ParserResult parse(csvRowFunction insertFunction, QTextStream& stream, size_t nMaxRecords = 0);

void setCSVProgress(CSVProgress* csvp) { m_pCSVProgress = csvp; }

@@ -57,26 +56,13 @@ class CSVParser
StateEndQuote
};

inline bool addRow(QStringList& r)
{
if(!m_insertFunction(m_iParsedRows, r))
return false;

r.clear();
m_iParsedRows++;
return true;
}

private:
bool m_bTrimFields;
QChar m_cFieldSeparator;
QChar m_cQuoteChar;
char16_t m_cFieldSeparator;
char16_t m_cQuoteChar;
CSVProgress* m_pCSVProgress;
csvRowFunction m_insertFunction;

qint64 m_iParsedRows; // Number of rows parsed so far

size_t m_nBufferSize; //! internal buffer read size
qint64 m_nBufferSize; //! internal buffer read size
};

#endif
Oops, something went wrong.

4 comments on commit 0eb1f65

@MKleusberg

This comment has been minimized.

Copy link
Member Author

MKleusberg replied Sep 13, 2017

This commit is worth benchmarking, too 😄 As far as I can see, we've reduced the import time from 3 minutes in DB4S 3.9.1 to around 2 minutes with the memory consumption patch. With any luck this should help reducing it even more.

@justinclift

This comment has been minimized.

Copy link
Member

justinclift replied Sep 13, 2017

No worries. I'll get to this pretty soon. Just finished the ToDo list item I was working on, so good timing. 😄

@MKleusberg

This comment has been minimized.

Copy link
Member Author

MKleusberg replied Sep 13, 2017

No need to hurry though - I'm taking the rest of the day off anyway 😄

@justinclift

This comment has been minimized.

Copy link
Member

justinclift replied Sep 13, 2017

Looks good. This has knocked another 20 seconds off the import time for the UK postcode CSV file. 😄

Interestingly, the Trim fields option seems to be noticeable now (~1 sec difference).

With Trim fields enabled:

Importing the file '/home/jc/Databases/National_Statistics_Postcode_Lookup_UK.csv' took
100947ms. Of this 11512ms were spent in the row function.
Importing the file '/home/jc/Databases/National_Statistics_Postcode_Lookup_UK.csv' took
100742ms. Of this 11961ms were spent in the row function.
Importing the file '/home/jc/Databases/National_Statistics_Postcode_Lookup_UK.csv' took
101307ms. Of this 11845ms were spent in the row function.

With Trim fields disabled:

Importing the file '/home/jc/Databases/National_Statistics_Postcode_Lookup_UK.csv'
took 99397ms. Of this 11533ms were spent in the row function.
Importing the file '/home/jc/Databases/National_Statistics_Postcode_Lookup_UK.csv'
took 99306ms. Of this 11565ms were spent in the row function.
Importing the file '/home/jc/Databases/National_Statistics_Postcode_Lookup_UK.csv'
took 99901ms. Of this 12040ms were spent in the row function.
Please sign in to comment.