Skip to content
Permalink
Browse files

Optimise the CSV import performance

This commit bundles a number of smaller optimisations in the CSV parser
and import code. They do add up to a noticeable speed gain though (at
least on some systems and configurations).
  • Loading branch information
MKleusberg committed Sep 13, 2017
1 parent 6ed8080 commit 0eb1f6579815ee1148323ce928d24eab9f0f8002
Showing with 93 additions and 90 deletions.
  1. +17 −17 src/ImportCsvDialog.cpp
  2. +1 −1 src/ImportCsvDialog.h
  3. +48 −31 src/csvparser.cpp
  4. +9 −23 src/csvparser.h
  5. +18 −18 src/tests/TestImport.cpp
@@ -104,7 +104,7 @@ void rollback(
class CSVImportProgress : public CSVProgress
{
public:
explicit CSVImportProgress(size_t filesize)
explicit CSVImportProgress(qint64 filesize)
{
m_pProgressDlg = new QProgressDialog(
QObject::tr("Importing CSV file..."),
@@ -124,7 +124,7 @@ class CSVImportProgress : public CSVProgress
m_pProgressDlg->show();
}

bool update(size_t pos)
bool update(qint64 pos)
{
m_pProgressDlg->setValue(pos);
qApp->processEvents();
@@ -203,7 +203,7 @@ void ImportCsvDialog::updatePreview()
ui->tablePreview->setHorizontalHeaderLabels(horizontalHeader);

// Parse file
parseCSV(selectedFile, [this](size_t rowNum, const QStringList& data) -> bool {
parseCSV(selectedFile, [this](size_t rowNum, const QVector<QByteArray>& data) -> bool {
// Skip first row if it is to be used as header
if(rowNum == 0 && ui->checkboxHeader->isChecked())
return true;
@@ -215,7 +215,7 @@ void ImportCsvDialog::updatePreview()

// Fill data section
ui->tablePreview->setRowCount(ui->tablePreview->rowCount() + 1);
for(QStringList::const_iterator it=data.begin();it!=data.end();++it)
for(auto it=data.constBegin();it!=data.constEnd();++it)
{
// Generate vertical header items
if(it == data.begin())
@@ -225,7 +225,7 @@ void ImportCsvDialog::updatePreview()
ui->tablePreview->setItem(
rowNum,
std::distance(data.begin(), it),
new QTableWidgetItem(*it));
new QTableWidgetItem(QString(*it)));
}

return true;
@@ -320,7 +320,7 @@ void ImportCsvDialog::matchSimilar()
checkInput();
}

CSVParser::ParserResult ImportCsvDialog::parseCSV(const QString &fileName, std::function<bool(size_t, QStringList)> rowFunction, qint64 count)
CSVParser::ParserResult ImportCsvDialog::parseCSV(const QString &fileName, std::function<bool(size_t, QVector<QByteArray>)> rowFunction, size_t count)
{
// Parse all csv data
QFile file(fileName);
@@ -329,7 +329,7 @@ CSVParser::ParserResult ImportCsvDialog::parseCSV(const QString &fileName, std::
CSVParser csv(ui->checkBoxTrimFields->isChecked(), currentSeparatorChar(), currentQuoteChar());

// Only show progress dialog if we parse all rows. The assumption here is that if a row count limit has been set, it won't be a very high one.
if(count == -1)
if(count == 0)
csv.setCSVProgress(new CSVImportProgress(file.size()));

QTextStream tstream(&file);
@@ -343,7 +343,7 @@ sqlb::FieldVector ImportCsvDialog::generateFieldList(const QString& filename)
sqlb::FieldVector fieldList; // List of fields in the file

// Parse the first couple of records of the CSV file and only analyse them
parseCSV(filename, [this, &fieldList](size_t rowNum, const QStringList& data) -> bool {
parseCSV(filename, [this, &fieldList](size_t rowNum, const QVector<QByteArray>& data) -> bool {
// Has this row more columns than the previous one? Then add more fields to the field list as necessary.
for(int i=fieldList.size();i<data.size();i++)
{
@@ -436,7 +436,7 @@ void ImportCsvDialog::importCsv(const QString& fileName, const QString &name)
return rollback(this, pdb, restorepointName, 0, tr("Creating restore point failed: %1").arg(pdb->lastError()));

// Create table
QStringList nullValues;
QVector<QByteArray> nullValues;
if(!importToExistingTable)
{
if(!pdb->createTable(sqlb::ObjectIdentifier("main", tableName), fieldList))
@@ -454,7 +454,7 @@ void ImportCsvDialog::importCsv(const QString& fileName, const QString &name)
if(f->isInteger() && f->notnull()) // If this is an integer column but NULL isn't allowed, insert 0
nullValues << "0";
else if(f->isInteger() && !f->notnull()) // If this is an integer column and NULL is allowed, insert NULL
nullValues << QString();
nullValues << QByteArray();
else // Otherwise (i.e. if this isn't an integer column), insert an empty string
nullValues << "";
}
@@ -472,7 +472,7 @@ void ImportCsvDialog::importCsv(const QString& fileName, const QString &name)

// Parse entire file
size_t lastRowNum = 0;
CSVParser::ParserResult result = parseCSV(fileName, [&](size_t rowNum, const QStringList& data) -> bool {
CSVParser::ParserResult result = parseCSV(fileName, [&](size_t rowNum, const QVector<QByteArray>& data) -> bool {
// Process the parser results row by row

#ifdef CSV_BENCHMARK
@@ -487,20 +487,20 @@ void ImportCsvDialog::importCsv(const QString& fileName, const QString &name)
return true;

// Bind all values
unsigned int bound_fields = 0;
for(int i=0;i<data.size();i++,bound_fields++)
int bound_fields = 0;
for(auto it=data.constBegin();it!=data.constEnd();++it,bound_fields++)
{
// Empty values need special treatment, but only when importing into an existing table where we could find out something about
// its table definition
if(importToExistingTable && data.at(i).isEmpty() && nullValues.size() > i)
if(importToExistingTable && it->isEmpty() && nullValues.size() > bound_fields)
{
// This is an empty value. We'll need to look up how to handle it depending on the field to be inserted into.
QString val = nullValues.at(i);
const QByteArray& val = nullValues.at(bound_fields);
if(!val.isNull()) // No need to bind NULL values here as that is the default bound value in SQLite
sqlite3_bind_text(stmt, i+1, val.toUtf8(), val.toUtf8().size(), SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, bound_fields+1, val, val.size(), SQLITE_STATIC);
} else {
// This is a non-empty value. Just add it to the statement
sqlite3_bind_text(stmt, i+1, static_cast<const char*>(data.at(i).toUtf8()), data.at(i).toUtf8().size(), SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, bound_fields+1, *it, it->size(), SQLITE_STATIC);
}
}

@@ -39,7 +39,7 @@ private slots:
DBBrowserDB* pdb;
QCompleter* encodingCompleter;

CSVParser::ParserResult parseCSV(const QString& fileName, std::function<bool(size_t, QStringList)> rowFunction, qint64 count = -1);
CSVParser::ParserResult parseCSV(const QString& fileName, std::function<bool(size_t, QVector<QByteArray>)> rowFunction, size_t count = 0);
sqlb::FieldVector generateFieldList(const QString& filename);

void importCsv(const QString& f, const QString &n = QString());
@@ -3,7 +3,7 @@
#include <QTextStream>
#include <algorithm>

CSVParser::CSVParser(bool trimfields, const QChar& fieldseparator, const QChar& quotechar)
CSVParser::CSVParser(bool trimfields, char16_t fieldseparator, char16_t quotechar)
: m_bTrimFields(trimfields)
, m_cFieldSeparator(fieldseparator)
, m_cQuoteChar(quotechar)
@@ -18,34 +18,49 @@ CSVParser::~CSVParser()
}

namespace {
inline void addColumn(QStringList& r, QString& field, bool trim)
inline void addColumn(QVector<QByteArray>& r, QString& field, bool trim)
{
if(trim)
r << field.trimmed();
r.push_back(field.trimmed().toUtf8());
else
r << field;
r.push_back(field.toUtf8());

field.clear();
field.reserve(128);
}

inline bool addRow(CSVParser::csvRowFunction& f, QVector<QByteArray>& r, size_t& rowCount)
{
if(!f(rowCount, r))
return false;

r.clear();
rowCount++;
return true;
}
}

CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStream& stream, qint64 nMaxRecords)
CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStream& stream, size_t nMaxRecords)
{
m_iParsedRows = 0;
m_insertFunction = insertFunction;
ParseStates state = StateNormal;
QString fieldbuf;
QStringList record;
ParseStates state = StateNormal; // State of the parser
QString sBuffer; // Buffer for reading in the file
QString fieldbuf; // Buffer for parsing the current field
QVector<QByteArray> record; // Buffer for parsing the current row
size_t parsedRows = 0; // Number of rows parsed so far

if(m_pCSVProgress)
m_pCSVProgress->start();

while(!stream.atEnd())
{
QString sBuffer = stream.read(m_nBufferSize);
sBuffer = stream.read(m_nBufferSize);
auto sBufferEnd = sBuffer.constEnd();

for(QString::iterator it = sBuffer.begin(); it != sBuffer.end(); ++it)
for(auto it = sBuffer.constBegin(); it != sBufferEnd; ++it)
{
QChar c = *it;
// Get next char
char16_t c = it->unicode();

switch(state)
{
case StateNormal:
@@ -61,38 +76,39 @@ CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStr
else if(c == '\r')
{
// look ahead to check for linefeed
QString::iterator nit = it + 1;
auto nit = it + 1;

// In order to check what the next byte is we must make sure that that byte is already loaded. Assume we're at an m_nBufferSize
// boundary but not at the end of the file when we hit a \r character. Now we're going to be at the end of the sBuffer string
// because of the m_nBufferSize boundary. But this means that the following check won't work properly because we can't check the
// next byte when we really should be able to do so because there's more data coming. To fix this we'll check for this particular
// case and, if this is what's happening, we'll just load an extra byte.
if(nit == sBuffer.end() && !stream.atEnd())
if(nit == sBufferEnd && !stream.atEnd())
{
// Load one more byte
sBuffer.append(stream.read(1));
sBufferEnd = sBuffer.constEnd();

// Restore both iterators. sBuffer.end() points to the imagined char after the last one in the string. So the extra byte we've
// Restore both iterators. sBufferEnd points to the imagined char after the last one in the string. So the extra byte we've
// just loaded is the one before that, i.e. the actual last one, and the original last char is the one before that.
it = sBuffer.end() - 2;
nit = sBuffer.end() - 1;
it = sBufferEnd - 2;
nit = sBufferEnd - 1;
}

// no linefeed, so assume that CR represents a newline
if(nit != sBuffer.end() && *nit != '\n')
if(nit != sBufferEnd && *nit != '\n')
{
addColumn(record, fieldbuf, m_bTrimFields);

if(!addRow(record))
if(!addRow(insertFunction, record, parsedRows))
return ParserResult::ParserResultError;
}
}
else if(c == '\n')
{
addColumn(record, fieldbuf, m_bTrimFields);

if(!addRow(record))
if(!addRow(insertFunction, record, parsedRows))
return ParserResult::ParserResultError;
}
else
@@ -130,28 +146,29 @@ CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStr
state = StateNormal;
addColumn(record, fieldbuf, m_bTrimFields);

if(!addRow(record))
if(!addRow(insertFunction, record, parsedRows))
return ParserResult::ParserResultError;
}
else if(c == '\r')
{
// look ahead to check for linefeed
QString::iterator nit = it + 1;
auto nit = it + 1;

// See above for details on this.
if(nit == sBuffer.end() && !stream.atEnd())
if(nit == sBufferEnd && !stream.atEnd())
{
sBuffer.append(stream.read(1));
it = sBuffer.end() - 2;
nit = sBuffer.end() - 1;
sBufferEnd = sBuffer.constEnd();
it = sBufferEnd - 2;
nit = sBufferEnd - 1;
}

// no linefeed, so assume that CR represents a newline
if(nit != sBuffer.end() && *nit != '\n')
if(nit != sBufferEnd && *nit != '\n')
{
addColumn(record, fieldbuf, m_bTrimFields);

if(!addRow(record))
if(!addRow(insertFunction, record, parsedRows))
return ParserResult::ParserResultError;
}
}
@@ -164,11 +181,11 @@ CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStr
break;
}

if(nMaxRecords != -1 && m_iParsedRows >= nMaxRecords)
if(nMaxRecords > 0 && parsedRows >= nMaxRecords)
return ParserResult::ParserResultSuccess;
}

if(m_pCSVProgress && m_iParsedRows % 100 == 0)
if(m_pCSVProgress && parsedRows % 100 == 0)
{
if(!m_pCSVProgress->update(stream.pos()))
return ParserResult::ParserResultCancelled;
@@ -179,7 +196,7 @@ CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStr
{
addColumn(record, fieldbuf, m_bTrimFields);

if(!addRow(record))
if(!addRow(insertFunction, record, parsedRows))
return ParserResult::ParserResultError;
}

@@ -1,8 +1,7 @@
#ifndef CSVPARSER_H
#define CSVPARSER_H

#include <QChar>
#include <QStringList>
#include <QVector>
#include <functional>

class QTextStream;
@@ -18,16 +17,16 @@ class CSVProgress
virtual ~CSVProgress() { }

virtual void start() = 0;
virtual bool update(size_t pos) = 0;
virtual bool update(qint64 pos) = 0;
virtual void end() = 0;
};

class CSVParser
{
public:
typedef std::function<bool(size_t, QStringList)> csvRowFunction;
typedef std::function<bool(size_t, QVector<QByteArray>)> csvRowFunction;

CSVParser(bool trimfields = true, const QChar& fieldseparator = ',', const QChar& quotechar = '"');
CSVParser(bool trimfields = true, char16_t fieldseparator = ',', char16_t quotechar = '"');
~CSVParser();

enum ParserResult
@@ -42,10 +41,10 @@ class CSVParser
* @param insertFunction A function pointer that is called for each parsed row. It is passed two parameters, the row number and a list of all parsed columns
* in the row. The called function may return false if an error occurred to stop the import process. Otherwise it should return true.
* \param stream Stream with the CSV parser
* \param nMaxRecords Max records to read, -1 if unlimited
* \param nMaxRecords Max records to read, 0 if unlimited
* \return ParserResult value that indicates whether the action finished normally, was cancelled or errored.
*/
ParserResult parse(csvRowFunction insertFunction, QTextStream& stream, qint64 nMaxRecords = -1);
ParserResult parse(csvRowFunction insertFunction, QTextStream& stream, size_t nMaxRecords = 0);

void setCSVProgress(CSVProgress* csvp) { m_pCSVProgress = csvp; }

@@ -57,26 +56,13 @@ class CSVParser
StateEndQuote
};

inline bool addRow(QStringList& r)
{
if(!m_insertFunction(m_iParsedRows, r))
return false;

r.clear();
m_iParsedRows++;
return true;
}

private:
bool m_bTrimFields;
QChar m_cFieldSeparator;
QChar m_cQuoteChar;
char16_t m_cFieldSeparator;
char16_t m_cQuoteChar;
CSVProgress* m_pCSVProgress;
csvRowFunction m_insertFunction;

qint64 m_iParsedRows; // Number of rows parsed so far

size_t m_nBufferSize; //! internal buffer read size
qint64 m_nBufferSize; //! internal buffer read size
};

#endif

4 comments on commit 0eb1f65

@MKleusberg

This comment has been minimized.

Copy link
Member Author

@MKleusberg MKleusberg replied Sep 13, 2017

This commit is worth benchmarking, too 😄 As far as I can see, we've reduced the import time from 3 minutes in DB4S 3.9.1 to around 2 minutes with the memory consumption patch. With any luck this should help reducing it even more.

@justinclift

This comment has been minimized.

Copy link
Member

@justinclift justinclift replied Sep 13, 2017

No worries. I'll get to this pretty soon. Just finished the ToDo list item I was working on, so good timing. 😄

@MKleusberg

This comment has been minimized.

Copy link
Member Author

@MKleusberg MKleusberg replied Sep 13, 2017

No need to hurry though - I'm taking the rest of the day off anyway 😄

@justinclift

This comment has been minimized.

Copy link
Member

@justinclift justinclift replied Sep 13, 2017

Looks good. This has knocked another 20 seconds off the import time for the UK postcode CSV file. 😄

Interestingly, the Trim fields option seems to be noticeable now (~1 sec difference).

With Trim fields enabled:

Importing the file '/home/jc/Databases/National_Statistics_Postcode_Lookup_UK.csv' took
100947ms. Of this 11512ms were spent in the row function.
Importing the file '/home/jc/Databases/National_Statistics_Postcode_Lookup_UK.csv' took
100742ms. Of this 11961ms were spent in the row function.
Importing the file '/home/jc/Databases/National_Statistics_Postcode_Lookup_UK.csv' took
101307ms. Of this 11845ms were spent in the row function.

With Trim fields disabled:

Importing the file '/home/jc/Databases/National_Statistics_Postcode_Lookup_UK.csv'
took 99397ms. Of this 11533ms were spent in the row function.
Importing the file '/home/jc/Databases/National_Statistics_Postcode_Lookup_UK.csv'
took 99306ms. Of this 11565ms were spent in the row function.
Importing the file '/home/jc/Databases/National_Statistics_Postcode_Lookup_UK.csv'
took 99901ms. Of this 12040ms were spent in the row function.
Please sign in to comment.