Optimise the CSV import performance
This commit bundles a number of smaller optimisations in the CSV parser
and import code. They do add up to a noticeable speed gain though (at
least on some systems and configurations).
MKleusberg committed Sep 13, 2017
1 parent 6ed8080 commit 0eb1f65
Showing 5 changed files with 93 additions and 90 deletions.
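
The biggest win here is in how parsed fields reach SQLite: the parser now hands each row over as a QVector<QByteArray> that is already UTF-8 encoded, so every field is converted exactly once, and the import loop binds those bytes with SQLITE_STATIC instead of SQLITE_TRANSIENT, which spares SQLite its own defensive copy. A minimal sketch of that idea (simplified, not the exact commit code):

#include <sqlite3.h>
#include <QByteArray>
#include <QVector>

// Bind one parsed CSV row to a prepared INSERT statement. The QByteArrays
// arrive from the parser already UTF-8 encoded, so there is no per-bind
// QString -> UTF-8 conversion, and SQLITE_STATIC tells SQLite not to copy
// the buffers; safe here because the row outlives the sqlite3_step() call.
static bool insertRow(sqlite3_stmt* stmt, const QVector<QByteArray>& row)
{
    for(int i = 0; i < row.size(); ++i)
        sqlite3_bind_text(stmt, i + 1, row.at(i).constData(), row.at(i).size(), SQLITE_STATIC);

    bool done = (sqlite3_step(stmt) == SQLITE_DONE);
    sqlite3_reset(stmt);
    return done;
}
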
34 changes: 17 additions & 17 deletions src/ImportCsvDialog.cpp
@@ -104,7 +104,7 @@ void rollback(
class CSVImportProgress : public CSVProgress
{
public:
explicit CSVImportProgress(size_t filesize)
explicit CSVImportProgress(qint64 filesize)
{
m_pProgressDlg = new QProgressDialog(
QObject::tr("Importing CSV file..."),
@@ -124,7 +124,7 @@ class CSVImportProgress : public CSVProgress
m_pProgressDlg->show();
}

bool update(size_t pos)
bool update(qint64 pos)
{
m_pProgressDlg->setValue(pos);
qApp->processEvents();
@@ -203,7 +203,7 @@ void ImportCsvDialog::updatePreview()
ui->tablePreview->setHorizontalHeaderLabels(horizontalHeader);

// Parse file
parseCSV(selectedFile, [this](size_t rowNum, const QStringList& data) -> bool {
parseCSV(selectedFile, [this](size_t rowNum, const QVector<QByteArray>& data) -> bool {
// Skip first row if it is to be used as header
if(rowNum == 0 && ui->checkboxHeader->isChecked())
return true;
@@ -215,7 +215,7 @@

// Fill data section
ui->tablePreview->setRowCount(ui->tablePreview->rowCount() + 1);
for(QStringList::const_iterator it=data.begin();it!=data.end();++it)
for(auto it=data.constBegin();it!=data.constEnd();++it)
{
// Generate vertical header items
if(it == data.begin())
@@ -225,7 +225,7 @@
ui->tablePreview->setItem(
rowNum,
std::distance(data.begin(), it),
new QTableWidgetItem(*it));
new QTableWidgetItem(QString(*it)));
}

return true;
@@ -320,7 +320,7 @@ void ImportCsvDialog::matchSimilar()
checkInput();
}

CSVParser::ParserResult ImportCsvDialog::parseCSV(const QString &fileName, std::function<bool(size_t, QStringList)> rowFunction, qint64 count)
CSVParser::ParserResult ImportCsvDialog::parseCSV(const QString &fileName, std::function<bool(size_t, QVector<QByteArray>)> rowFunction, size_t count)
{
// Parse all csv data
QFile file(fileName);
@@ -329,7 +329,7 @@ CSVParser::ParserResult ImportCsvDialog::parseCSV(const QString &fileName, std::
CSVParser csv(ui->checkBoxTrimFields->isChecked(), currentSeparatorChar(), currentQuoteChar());

// Only show progress dialog if we parse all rows. The assumption here is that if a row count limit has been set, it won't be a very high one.
if(count == -1)
if(count == 0)
csv.setCSVProgress(new CSVImportProgress(file.size()));

QTextStream tstream(&file);
@@ -343,7 +343,7 @@ sqlb::FieldVector ImportCsvDialog::generateFieldList(const QString& filename)
sqlb::FieldVector fieldList; // List of fields in the file

// Parse the first couple of records of the CSV file and only analyse them
parseCSV(filename, [this, &fieldList](size_t rowNum, const QStringList& data) -> bool {
parseCSV(filename, [this, &fieldList](size_t rowNum, const QVector<QByteArray>& data) -> bool {
// Has this row more columns than the previous one? Then add more fields to the field list as necessary.
for(int i=fieldList.size();i<data.size();i++)
{
@@ -436,7 +436,7 @@ void ImportCsvDialog::importCsv(const QString& fileName, const QString &name)
return rollback(this, pdb, restorepointName, 0, tr("Creating restore point failed: %1").arg(pdb->lastError()));

// Create table
QStringList nullValues;
QVector<QByteArray> nullValues;
if(!importToExistingTable)
{
if(!pdb->createTable(sqlb::ObjectIdentifier("main", tableName), fieldList))
@@ -454,7 +454,7 @@ void ImportCsvDialog::importCsv(const QString& fileName, const QString &name)
if(f->isInteger() && f->notnull()) // If this is an integer column but NULL isn't allowed, insert 0
nullValues << "0";
else if(f->isInteger() && !f->notnull()) // If this is an integer column and NULL is allowed, insert NULL
nullValues << QString();
nullValues << QByteArray();
else // Otherwise (i.e. if this isn't an integer column), insert an empty string
nullValues << "";
}
@@ -472,7 +472,7 @@ void ImportCsvDialog::importCsv(const QString& fileName, const QString &name)

// Parse entire file
size_t lastRowNum = 0;
CSVParser::ParserResult result = parseCSV(fileName, [&](size_t rowNum, const QStringList& data) -> bool {
CSVParser::ParserResult result = parseCSV(fileName, [&](size_t rowNum, const QVector<QByteArray>& data) -> bool {
// Process the parser results row by row

#ifdef CSV_BENCHMARK
@@ -487,20 +487,20 @@ void ImportCsvDialog::importCsv(const QString& fileName, const QString &name)
return true;

// Bind all values
unsigned int bound_fields = 0;
for(int i=0;i<data.size();i++,bound_fields++)
int bound_fields = 0;
for(auto it=data.constBegin();it!=data.constEnd();++it,bound_fields++)
{
// Empty values need special treatment, but only when importing into an existing table where we could find out something about
// its table definition
if(importToExistingTable && data.at(i).isEmpty() && nullValues.size() > i)
if(importToExistingTable && it->isEmpty() && nullValues.size() > bound_fields)
{
// This is an empty value. We'll need to look up how to handle it depending on the field to be inserted into.
QString val = nullValues.at(i);
const QByteArray& val = nullValues.at(bound_fields);
if(!val.isNull()) // No need to bind NULL values here as that is the default bound value in SQLite
sqlite3_bind_text(stmt, i+1, val.toUtf8(), val.toUtf8().size(), SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, bound_fields+1, val, val.size(), SQLITE_STATIC);
} else {
// This is a non-empty value. Just add it to the statement
sqlite3_bind_text(stmt, i+1, static_cast<const char*>(data.at(i).toUtf8()), data.at(i).toUtf8().size(), SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, bound_fields+1, *it, it->size(), SQLITE_STATIC);
}
}

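
One detail worth calling out from the hunk above: the replacement values for empty fields are now pre-computed once per column as QByteArrays instead of being rebuilt per row. The policy, restated as a small standalone helper (illustrative only, not code from the commit):

#include <QByteArray>

// Replacement value for an empty CSV field, by target column type.
// A null QByteArray means "bind nothing", which leaves the parameter at
// SQLite's default bound value, i.e. NULL.
static QByteArray nullValueFor(bool isInteger, bool notNull)
{
    if(isInteger && notNull)
        return "0";              // integer NOT NULL: insert 0
    if(isInteger)
        return QByteArray();     // nullable integer: insert NULL
    return "";                   // any other column: empty string
}
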
2 changes: 1 addition & 1 deletion src/ImportCsvDialog.h
@@ -39,7 +39,7 @@ private slots:
DBBrowserDB* pdb;
QCompleter* encodingCompleter;

CSVParser::ParserResult parseCSV(const QString& fileName, std::function<bool(size_t, QStringList)> rowFunction, qint64 count = -1);
CSVParser::ParserResult parseCSV(const QString& fileName, std::function<bool(size_t, QVector<QByteArray>)> rowFunction, size_t count = 0);
sqlb::FieldVector generateFieldList(const QString& filename);

void importCsv(const QString& f, const QString &n = QString());
79 changes: 48 additions & 31 deletions src/csvparser.cpp
@@ -3,7 +3,7 @@
#include <QTextStream>
#include <algorithm>

CSVParser::CSVParser(bool trimfields, const QChar& fieldseparator, const QChar& quotechar)
CSVParser::CSVParser(bool trimfields, char16_t fieldseparator, char16_t quotechar)
: m_bTrimFields(trimfields)
, m_cFieldSeparator(fieldseparator)
, m_cQuoteChar(quotechar)
@@ -18,34 +18,49 @@ CSVParser::~CSVParser()
}

namespace {
inline void addColumn(QStringList& r, QString& field, bool trim)
inline void addColumn(QVector<QByteArray>& r, QString& field, bool trim)
{
if(trim)
r << field.trimmed();
r.push_back(field.trimmed().toUtf8());
else
r << field;
r.push_back(field.toUtf8());

field.clear();
field.reserve(128);
}

inline bool addRow(CSVParser::csvRowFunction& f, QVector<QByteArray>& r, size_t& rowCount)
{
if(!f(rowCount, r))
return false;

r.clear();
rowCount++;
return true;
}
}

CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStream& stream, qint64 nMaxRecords)
CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStream& stream, size_t nMaxRecords)
{
m_iParsedRows = 0;
m_insertFunction = insertFunction;
ParseStates state = StateNormal;
QString fieldbuf;
QStringList record;
ParseStates state = StateNormal; // State of the parser
QString sBuffer; // Buffer for reading in the file
QString fieldbuf; // Buffer for parsing the current field
QVector<QByteArray> record; // Buffer for parsing the current row
size_t parsedRows = 0; // Number of rows parsed so far

if(m_pCSVProgress)
m_pCSVProgress->start();

while(!stream.atEnd())
{
QString sBuffer = stream.read(m_nBufferSize);
sBuffer = stream.read(m_nBufferSize);
auto sBufferEnd = sBuffer.constEnd();

for(QString::iterator it = sBuffer.begin(); it != sBuffer.end(); ++it)
for(auto it = sBuffer.constBegin(); it != sBufferEnd; ++it)
{
QChar c = *it;
// Get next char
char16_t c = it->unicode();

switch(state)
{
case StateNormal:
@@ -61,38 +61,39 @@ CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStr
else if(c == '\r')
{
// look ahead to check for linefeed
QString::iterator nit = it + 1;
auto nit = it + 1;

// In order to check what the next byte is we must make sure that that byte is already loaded. Assume we're at an m_nBufferSize
// boundary but not at the end of the file when we hit a \r character. Now we're going to be at the end of the sBuffer string
// because of the m_nBufferSize boundary. But this means that the following check won't work properly because we can't check the
// next byte when we really should be able to do so because there's more data coming. To fix this we'll check for this particular
// case and, if this is what's happening, we'll just load an extra byte.
if(nit == sBuffer.end() && !stream.atEnd())
if(nit == sBufferEnd && !stream.atEnd())
{
// Load one more byte
sBuffer.append(stream.read(1));
sBufferEnd = sBuffer.constEnd();

// Restore both iterators. sBuffer.end() points to the imagined char after the last one in the string. So the extra byte we've
// Restore both iterators. sBufferEnd points to the imagined char after the last one in the string. So the extra byte we've
// just loaded is the one before that, i.e. the actual last one, and the original last char is the one before that.
it = sBuffer.end() - 2;
nit = sBuffer.end() - 1;
it = sBufferEnd - 2;
nit = sBufferEnd - 1;
}

// no linefeed, so assume that CR represents a newline
if(nit != sBuffer.end() && *nit != '\n')
if(nit != sBufferEnd && *nit != '\n')
{
addColumn(record, fieldbuf, m_bTrimFields);

if(!addRow(record))
if(!addRow(insertFunction, record, parsedRows))
return ParserResult::ParserResultError;
}
}
else if(c == '\n')
{
addColumn(record, fieldbuf, m_bTrimFields);

if(!addRow(record))
if(!addRow(insertFunction, record, parsedRows))
return ParserResult::ParserResultError;
}
else
@@ -130,28 +146,29 @@ CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStr
state = StateNormal;
addColumn(record, fieldbuf, m_bTrimFields);

if(!addRow(record))
if(!addRow(insertFunction, record, parsedRows))
return ParserResult::ParserResultError;
}
else if(c == '\r')
{
// look ahead to check for linefeed
QString::iterator nit = it + 1;
auto nit = it + 1;

// See above for details on this.
if(nit == sBuffer.end() && !stream.atEnd())
if(nit == sBufferEnd && !stream.atEnd())
{
sBuffer.append(stream.read(1));
it = sBuffer.end() - 2;
nit = sBuffer.end() - 1;
sBufferEnd = sBuffer.constEnd();
it = sBufferEnd - 2;
nit = sBufferEnd - 1;
}

// no linefeed, so assume that CR represents a newline
if(nit != sBuffer.end() && *nit != '\n')
if(nit != sBufferEnd && *nit != '\n')
{
addColumn(record, fieldbuf, m_bTrimFields);

if(!addRow(record))
if(!addRow(insertFunction, record, parsedRows))
return ParserResult::ParserResultError;
}
}
@@ -164,11 +181,11 @@ CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStr
break;
}

if(nMaxRecords != -1 && m_iParsedRows >= nMaxRecords)
if(nMaxRecords > 0 && parsedRows >= nMaxRecords)
return ParserResult::ParserResultSuccess;
}

if(m_pCSVProgress && m_iParsedRows % 100 == 0)
if(m_pCSVProgress && parsedRows % 100 == 0)
{
if(!m_pCSVProgress->update(stream.pos()))
return ParserResult::ParserResultCancelled;
@@ -179,7 +196,7 @@ CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStr
{
addColumn(record, fieldbuf, m_bTrimFields);

if(!addRow(record))
if(!addRow(insertFunction, record, parsedRows))
return ParserResult::ParserResultError;
}

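
The subtlest piece of the parser rewrite is the carriage-return look-ahead at buffer boundaries, explained in the long comment above. The same technique in isolation (a simplified sketch, not the commit's code):

#include <QString>
#include <QTextStream>

// We just read a '\r' at position `it`. Decide whether it is a bare CR
// newline or the first half of a CRLF pair. If the '\r' happens to be the
// last character of the current read buffer while the stream still has
// data, load one extra character first so the look-ahead can see it.
static bool carriageReturnIsNewline(QString& buffer, QString::const_iterator& it,
                                    QString::const_iterator& end, QTextStream& stream)
{
    auto next = it + 1;
    if(next == end && !stream.atEnd())
    {
        buffer.append(stream.read(1)); // fetch the character after the '\r'
        end = buffer.constEnd();       // the append may have reallocated
        it = end - 2;                  // re-point at the '\r'
        next = end - 1;                // the freshly loaded character
    }

    // Only a CR that is visibly not followed by '\n' counts as a newline
    // here; a trailing CR at end of input is handled after the main loop.
    return next != end && *next != QLatin1Char('\n');
}
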
32 changes: 9 additions & 23 deletions src/csvparser.h
@@ -1,8 +1,7 @@
#ifndef CSVPARSER_H
#define CSVPARSER_H

#include <QChar>
#include <QStringList>
#include <QVector>
#include <functional>

class QTextStream;
@@ -18,16 +17,16 @@ class CSVProgress
virtual ~CSVProgress() { }

virtual void start() = 0;
virtual bool update(size_t pos) = 0;
virtual bool update(qint64 pos) = 0;
virtual void end() = 0;
};

class CSVParser
{
public:
typedef std::function<bool(size_t, QStringList)> csvRowFunction;
typedef std::function<bool(size_t, QVector<QByteArray>)> csvRowFunction;

CSVParser(bool trimfields = true, const QChar& fieldseparator = ',', const QChar& quotechar = '"');
CSVParser(bool trimfields = true, char16_t fieldseparator = ',', char16_t quotechar = '"');
~CSVParser();

enum ParserResult
@@ -42,10 +41,10 @@ class CSVParser
* @param insertFunction A function pointer that is called for each parsed row. It is passed two parameters, the row number and a list of all parsed columns
* in the row. The called function may return false if an error occurred to stop the import process. Otherwise it should return true.
* \param stream Stream with the CSV parser
* \param nMaxRecords Max records to read, -1 if unlimited
* \param nMaxRecords Max records to read, 0 if unlimited
* \return ParserResult value that indicates whether the action finished normally, was cancelled or errored.
*/
ParserResult parse(csvRowFunction insertFunction, QTextStream& stream, qint64 nMaxRecords = -1);
ParserResult parse(csvRowFunction insertFunction, QTextStream& stream, size_t nMaxRecords = 0);

void setCSVProgress(CSVProgress* csvp) { m_pCSVProgress = csvp; }

@@ -57,26 +56,13 @@ class CSVParser
StateEndQuote
};

inline bool addRow(QStringList& r)
{
if(!m_insertFunction(m_iParsedRows, r))
return false;

r.clear();
m_iParsedRows++;
return true;
}

private:
bool m_bTrimFields;
QChar m_cFieldSeparator;
QChar m_cQuoteChar;
char16_t m_cFieldSeparator;
char16_t m_cQuoteChar;
CSVProgress* m_pCSVProgress;
csvRowFunction m_insertFunction;

qint64 m_iParsedRows; // Number of rows parsed so far

size_t m_nBufferSize; //! internal buffer read size
qint64 m_nBufferSize; //! internal buffer read size
};

#endif
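
For reference, a minimal caller of the reworked interface (the file name is made up and error handling is trimmed; a usage sketch, not code from this commit):

#include <QByteArray>
#include <QDebug>
#include <QFile>
#include <QTextStream>
#include <QVector>
#include "csvparser.h"

int main()
{
    QFile file("example.csv");              // hypothetical input file
    if(!file.open(QIODevice::ReadOnly))
        return 1;

    QTextStream stream(&file);
    CSVParser csv(true, ',', '"');          // trim fields, default separator/quote

    // The row callback receives the row number and the parsed fields;
    // returning false aborts the parse with ParserResultError.
    CSVParser::ParserResult result = csv.parse(
        [](size_t rowNum, const QVector<QByteArray>& row) -> bool {
            qDebug() << "row" << static_cast<qulonglong>(rowNum)
                     << "has" << row.size() << "fields";
            return true;
        }, stream, 20);                     // read at most 20 rows; 0 means no limit

    return result == CSVParser::ParserResultSuccess ? 0 : 1;
}
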

4 comments on commit 0eb1f65

@MKleusberg (Member Author)

This commit is worth benchmarking, too 😄 As far as I can see, we've reduced the import time from 3 minutes in DB4S 3.9.1 to around 2 minutes with the memory consumption patch. With any luck this should help reduce it even more.

@justinclift (Member)

No worries. I'll get to this pretty soon. Just finished the ToDo list item I was working on, so good timing. 😄

@MKleusberg (Member Author)

No need to hurry though - I'm taking the rest of the day off anyway 😄

@justinclift (Member) commented on 0eb1f65 Sep 13, 2017

Looks good. This has knocked another 20 seconds off the import time for the UK postcode CSV file. 😄

Interestingly, the Trim fields option seems to be noticeable now (~1 sec difference).

With Trim fields enabled:

Importing the file '/home/jc/Databases/National_Statistics_Postcode_Lookup_UK.csv' took 100947ms. Of this 11512ms were spent in the row function.
Importing the file '/home/jc/Databases/National_Statistics_Postcode_Lookup_UK.csv' took 100742ms. Of this 11961ms were spent in the row function.
Importing the file '/home/jc/Databases/National_Statistics_Postcode_Lookup_UK.csv' took 101307ms. Of this 11845ms were spent in the row function.

With Trim fields disabled:

Importing the file '/home/jc/Databases/National_Statistics_Postcode_Lookup_UK.csv' took 99397ms. Of this 11533ms were spent in the row function.
Importing the file '/home/jc/Databases/National_Statistics_Postcode_Lookup_UK.csv' took 99306ms. Of this 11565ms were spent in the row function.
Importing the file '/home/jc/Databases/National_Statistics_Postcode_Lookup_UK.csv' took 99901ms. Of this 12040ms were spent in the row function.
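
For context on where these numbers come from: the timings are printed by a CSV_BENCHMARK build (see the #ifdef in ImportCsvDialog.cpp above). The macro's implementation isn't part of this diff, but instrumentation along these lines would produce output of this shape (a hypothetical sketch; every name in it is invented):

#include <QByteArray>
#include <QDebug>
#include <QElapsedTimer>
#include <QString>
#include <QVector>

// Time a whole import and, separately, the time spent inside the row
// callback. `runImport` stands in for the real parse call.
template <typename ImportFn, typename RowFn>
void timedImport(const QString& fileName, ImportFn runImport, RowFn rowFn)
{
    QElapsedTimer total;
    qint64 nsInRowFn = 0;
    total.start();

    runImport([&](size_t rowNum, const QVector<QByteArray>& data) -> bool {
        QElapsedTimer t;
        t.start();
        bool ok = rowFn(rowNum, data);
        nsInRowFn += t.nsecsElapsed();
        return ok;
    });

    qDebug().nospace().noquote() << "Importing the file '" << fileName << "' took "
                                 << total.elapsed() << "ms. Of this "
                                 << nsInRowFn / 1000000 << "ms were spent in the row function.";
}
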
