Permalink
Browse files

Don't track column count when parsing CSV files

When parsing a CSV file we used to check the column count for each row
and track the highest number of columns that we found. This information
could then be used to create an INSERT statement large enough for all
the data.

This commit removes the column count tracking code. Instead, only the
first 20 rows are analysed to determine the column count, and this is
done while generating the field list.

Performance-wise this should take a (very) little longer, but it makes
it easier to improve the performance in other ways later, which should
more than compensate for the cost of this commit.

Feature-wise this should fix some (technically invalid) corner-case CSV
files with fewer fields in the title row than in the other rows. It
should also break some other (technically invalid) corner-case CSV files
if they are imported into an existing table and have fewer columns than
the existing table in their first 20 rows but exactly the same number
later on. Both cases, I think, don't matter too much.
  • Loading branch information...
MKleusberg committed Sep 10, 2017
1 parent 67adb99 commit b7a00d301a2a469ba4a4b430e7b3f1b13fdc2842
Showing with 52 additions and 44 deletions.
  1. +50 −31 src/ImportCsvDialog.cpp
  2. +1 −1 src/ImportCsvDialog.h
  3. +0 −2 src/csvparser.cpp
  4. +1 −9 src/csvparser.h
  5. +0 −1 src/tests/TestImport.cpp
@@ -194,12 +194,15 @@ void ImportCsvDialog::updatePreview()
csv.parse(tstream, 20);
file.close();

// Analyse CSV file
sqlb::FieldVector fieldList = generateFieldList(selectedFile);

// Reset preview widget
ui->tablePreview->clear();
ui->tablePreview->setColumnCount(csv.columns());
ui->tablePreview->setColumnCount(fieldList.size());

// Exit if there are no lines to preview at all
if(csv.columns() == 0)
if(fieldList.size() == 0)
return;

// Use first row as header if necessary
@@ -293,12 +296,12 @@ void ImportCsvDialog::updateSelection(bool selected)
void ImportCsvDialog::matchSimilar()
{
auto item = ui->filePicker->currentItem();
auto selectedHeader = generateFieldList(parseCSV(item->data(Qt::DisplayRole).toString(), 1));
auto selectedHeader = generateFieldList(item->data(Qt::DisplayRole).toString());

for (int i = 0; i < ui->filePicker->count(); i++)
{
auto item = ui->filePicker->item(i);
auto header = generateFieldList(parseCSV(item->data(Qt::DisplayRole).toString(), 1));
auto header = generateFieldList(item->data(Qt::DisplayRole).toString());
bool matchingHeader = false;

if (selectedHeader.count() == header.count())
@@ -340,36 +343,50 @@ CSVParser ImportCsvDialog::parseCSV(const QString &fileName, qint64 count)
return csv;
}

sqlb::FieldVector ImportCsvDialog::generateFieldList(const CSVParser &parser)
sqlb::FieldVector ImportCsvDialog::generateFieldList(const QString& filename)
{
if (parser.csv().size() == 0) return sqlb::FieldVector();
// Parse the first couple of records of the CSV file and only analyse them
CSVParser parser = parseCSV(filename, 20);

// If there is no data, we don't return any fields
if(parser.csv().size() == 0)
return sqlb::FieldVector();

// How many columns are there in the CSV file?
int columns = 0;
for(int i=0;i<parser.csv().size();i++)
{
if(parser.csv().at(i).size() > columns)
columns = parser.csv().at(i).size();
}

// Generate field names. These are either taken from the first CSV row or are generated in the format of "fieldXY" depending on the user input
sqlb::FieldVector fieldList;
if(ui->checkboxHeader->isChecked())
for(int i=0;i<columns;i++)
{
for(QStringList::const_iterator it = parser.csv().at(0).begin();
it != parser.csv().at(0).end();
++it)
QString fieldname;

// Only take the names from the CSV file if the user wants that and if the first row in the CSV file has enough columns
if(ui->checkboxHeader->isChecked() && i < parser.csv().at(0).size())
{
// Remove invalid characters
QString thisfield = *it;
thisfield.replace("`", "");
thisfield.replace(" ", "");
thisfield.replace('"', "");
thisfield.replace("'","");
thisfield.replace(",","");
thisfield.replace(";","");

// Avoid empty field names
if(thisfield.isEmpty())
thisfield = QString("field%1").arg(std::distance(parser.csv().at(0).begin(), it) + 1);

fieldList.push_back(sqlb::FieldPtr(new sqlb::Field(thisfield, "")));
// Take field name from CSV and remove invalid characters
fieldname = parser.csv().at(0).at(i);
fieldname.replace("`", "");
fieldname.replace(" ", "");
fieldname.replace('"', "");
fieldname.replace("'","");
fieldname.replace(",","");
fieldname.replace(";","");
}
} else {
for(size_t i=0; i < parser.columns(); ++i)
fieldList.push_back(sqlb::FieldPtr(new sqlb::Field(QString("field%1").arg(i+1), "")));

// If we don't have a field name by now, generate one
if(fieldname.isEmpty())
fieldname = QString("field%1").arg(i+1);

// TODO Here's also the place to do some sort of data type analysation of the CSV data

// Add field to the column list
fieldList.push_back(sqlb::FieldPtr(new sqlb::Field(fieldname, "")));
}

return fieldList;
@@ -396,11 +413,13 @@ void ImportCsvDialog::importCsv(const QString& fileName, const QString &name)
tableName = ui->editName->text();
}

// Analyse CSV file
sqlb::FieldVector fieldList = generateFieldList(fileName);

// Parse entire file
CSVParser csv = parseCSV(fileName);
if (csv.csv().size() == 0) return;

sqlb::FieldVector fieldList = generateFieldList(csv);

#ifdef CSV_BENCHMARK
qint64 timer_after_parsing = timer.elapsed();
#endif
@@ -415,7 +434,7 @@ void ImportCsvDialog::importCsv(const QString& fileName, const QString &name)
const sqlb::ObjectPtr obj = pdb->getObjectByName(sqlb::ObjectIdentifier("main", tableName));
if(obj && obj->type() == sqlb::Object::Types::Table)
{
if((size_t)obj.dynamicCast<sqlb::Table>()->fields().size() != csv.columns())
if(obj.dynamicCast<sqlb::Table>()->fields().size() != fieldList.size())
{
QMessageBox::warning(this, QApplication::applicationName(),
tr("There is already a table of that name and an import into an existing table is only possible if the number of columns match."));
@@ -471,7 +490,7 @@ void ImportCsvDialog::importCsv(const QString& fileName, const QString &name)

// Prepare the INSERT statement. The prepared statement can then be reused for each row to insert
QString sQuery = QString("INSERT INTO %1 VALUES(").arg(sqlb::escapeIdentifier(tableName));
for(size_t i=1;i<=csv.columns();i++)
for(int i=1;i<=fieldList.size();i++)
sQuery.append(QString("?%1,").arg(i));
sQuery.chop(1); // Remove last comma
sQuery.append(")");
@@ -38,7 +38,7 @@ private slots:
QCompleter* encodingCompleter;

CSVParser parseCSV(const QString &f, qint64 count = -1);
sqlb::FieldVector generateFieldList(const CSVParser& parser);
sqlb::FieldVector generateFieldList(const QString& filename);

void importCsv(const QString& f, const QString &n = QString());

@@ -8,7 +8,6 @@ CSVParser::CSVParser(bool trimfields, const QChar& fieldseparator, const QChar&
, m_cFieldSeparator(fieldseparator)
, m_cQuoteChar(quotechar)
, m_pCSVProgress(0)
, m_nColumns(0)
, m_nBufferSize(4096)
{
}
@@ -32,7 +31,6 @@ inline void addColumn(QStringList& r, QString& field, bool trim)
bool CSVParser::parse(QTextStream& stream, qint64 nMaxRecords)
{
m_vCSVData.clear();
m_nColumns = 0;
ParseStates state = StateNormal;
QString fieldbuf;
QStringList record;
@@ -44,12 +44,6 @@ class CSVParser
*/
const TCSVResult& csv() const { return m_vCSVData; }

/*!
* \brief columns
* \return Number of columns parsed
*/
size_t columns() const { return m_nColumns; }

void setCSVProgress(CSVProgress* csvp) { m_pCSVProgress = csvp; }

private:
@@ -63,7 +57,6 @@ class CSVParser
inline void addRow(QStringList& r)
{
m_vCSVData.append(r);
m_nColumns = std::max<size_t>(r.size(), m_nColumns);
r.clear();
}

@@ -74,9 +67,8 @@ class CSVParser
CSVProgress* m_pCSVProgress;

TCSVResult m_vCSVData;
size_t m_nColumns;

size_t m_nBufferSize; //! internal buffer read size
};

#endif // CSVPARSER_H
#endif
@@ -48,7 +48,6 @@ void TestImport::csvImport()

// Check return values
QCOMPARE(csvparser.csv(), result);
QCOMPARE((int)csvparser.columns(), numfields);
}

void TestImport::csvImport_data()

0 comments on commit b7a00d3

Please sign in to comment.