Skip to content

Commit

Permalink
Don't track column count when parsing CSV files
Browse files Browse the repository at this point in the history
When parsing a CSV file, we used to check the column count of each row
and track the highest number of columns that we found. This information
could then be used to create an INSERT statement large enough for all
the data.

This commit removes the column count tracking code. Instead, only the
first 20 rows of the file are analysed to determine the number of
columns, and this is done while generating the field list.

Performance-wise this should take a (very) little longer, but it makes it
easier to improve the performance in other ways later, which should more
than compensate for the extra time this commit costs.

Feature-wise this should fix some (technically invalid) corner-case CSV
files with fewer fields in the title row than in the other rows. It
should also break some other (technically invalid) corner-case CSV files:
those which are imported into an existing table and have fewer columns
than the existing table in their first 20 rows but exactly the same
number later on. Neither case, I think, matters too much.
  • Loading branch information
MKleusberg committed Sep 10, 2017
1 parent 67adb99 commit b7a00d3
Show file tree
Hide file tree
Showing 5 changed files with 52 additions and 44 deletions.
81 changes: 50 additions & 31 deletions src/ImportCsvDialog.cpp
Expand Up @@ -194,12 +194,15 @@ void ImportCsvDialog::updatePreview()
csv.parse(tstream, 20);
file.close();

// Analyse CSV file
sqlb::FieldVector fieldList = generateFieldList(selectedFile);

// Reset preview widget
ui->tablePreview->clear();
ui->tablePreview->setColumnCount(csv.columns());
ui->tablePreview->setColumnCount(fieldList.size());

// Exit if there are no lines to preview at all
if(csv.columns() == 0)
if(fieldList.size() == 0)
return;

// Use first row as header if necessary
Expand Down Expand Up @@ -293,12 +296,12 @@ void ImportCsvDialog::updateSelection(bool selected)
void ImportCsvDialog::matchSimilar()
{
auto item = ui->filePicker->currentItem();
auto selectedHeader = generateFieldList(parseCSV(item->data(Qt::DisplayRole).toString(), 1));
auto selectedHeader = generateFieldList(item->data(Qt::DisplayRole).toString());

for (int i = 0; i < ui->filePicker->count(); i++)
{
auto item = ui->filePicker->item(i);
auto header = generateFieldList(parseCSV(item->data(Qt::DisplayRole).toString(), 1));
auto header = generateFieldList(item->data(Qt::DisplayRole).toString());
bool matchingHeader = false;

if (selectedHeader.count() == header.count())
Expand Down Expand Up @@ -340,36 +343,50 @@ CSVParser ImportCsvDialog::parseCSV(const QString &fileName, qint64 count)
return csv;
}

sqlb::FieldVector ImportCsvDialog::generateFieldList(const CSVParser &parser)
sqlb::FieldVector ImportCsvDialog::generateFieldList(const QString& filename)
{
if (parser.csv().size() == 0) return sqlb::FieldVector();
// Parse the first couple of records of the CSV file and only analyse them
CSVParser parser = parseCSV(filename, 20);

// If there is no data, we don't return any fields
if(parser.csv().size() == 0)
return sqlb::FieldVector();

// How many columns are there in the CSV file?
int columns = 0;
for(int i=0;i<parser.csv().size();i++)
{
if(parser.csv().at(i).size() > columns)
columns = parser.csv().at(i).size();
}

// Generate field names. These are either taken from the first CSV row or are generated in the format of "fieldXY" depending on the user input
sqlb::FieldVector fieldList;
if(ui->checkboxHeader->isChecked())
for(int i=0;i<columns;i++)
{
for(QStringList::const_iterator it = parser.csv().at(0).begin();
it != parser.csv().at(0).end();
++it)
QString fieldname;

// Only take the names from the CSV file if the user wants that and if the first row in the CSV file has enough columns
if(ui->checkboxHeader->isChecked() && i < parser.csv().at(0).size())
{
// Remove invalid characters
QString thisfield = *it;
thisfield.replace("`", "");
thisfield.replace(" ", "");
thisfield.replace('"', "");
thisfield.replace("'","");
thisfield.replace(",","");
thisfield.replace(";","");

// Avoid empty field names
if(thisfield.isEmpty())
thisfield = QString("field%1").arg(std::distance(parser.csv().at(0).begin(), it) + 1);

fieldList.push_back(sqlb::FieldPtr(new sqlb::Field(thisfield, "")));
// Take field name from CSV and remove invalid characters
fieldname = parser.csv().at(0).at(i);
fieldname.replace("`", "");
fieldname.replace(" ", "");
fieldname.replace('"', "");
fieldname.replace("'","");
fieldname.replace(",","");
fieldname.replace(";","");
}
} else {
for(size_t i=0; i < parser.columns(); ++i)
fieldList.push_back(sqlb::FieldPtr(new sqlb::Field(QString("field%1").arg(i+1), "")));

// If we don't have a field name by now, generate one
if(fieldname.isEmpty())
fieldname = QString("field%1").arg(i+1);

// TODO Here's also the place to do some sort of data type analysation of the CSV data

// Add field to the column list
fieldList.push_back(sqlb::FieldPtr(new sqlb::Field(fieldname, "")));
}

return fieldList;
Expand All @@ -396,11 +413,13 @@ void ImportCsvDialog::importCsv(const QString& fileName, const QString &name)
tableName = ui->editName->text();
}

// Analyse CSV file
sqlb::FieldVector fieldList = generateFieldList(fileName);

// Parse entire file
CSVParser csv = parseCSV(fileName);
if (csv.csv().size() == 0) return;

sqlb::FieldVector fieldList = generateFieldList(csv);

#ifdef CSV_BENCHMARK
qint64 timer_after_parsing = timer.elapsed();
#endif
Expand All @@ -415,7 +434,7 @@ void ImportCsvDialog::importCsv(const QString& fileName, const QString &name)
const sqlb::ObjectPtr obj = pdb->getObjectByName(sqlb::ObjectIdentifier("main", tableName));
if(obj && obj->type() == sqlb::Object::Types::Table)
{
if((size_t)obj.dynamicCast<sqlb::Table>()->fields().size() != csv.columns())
if(obj.dynamicCast<sqlb::Table>()->fields().size() != fieldList.size())
{
QMessageBox::warning(this, QApplication::applicationName(),
tr("There is already a table of that name and an import into an existing table is only possible if the number of columns match."));
Expand Down Expand Up @@ -471,7 +490,7 @@ void ImportCsvDialog::importCsv(const QString& fileName, const QString &name)

// Prepare the INSERT statement. The prepared statement can then be reused for each row to insert
QString sQuery = QString("INSERT INTO %1 VALUES(").arg(sqlb::escapeIdentifier(tableName));
for(size_t i=1;i<=csv.columns();i++)
for(int i=1;i<=fieldList.size();i++)
sQuery.append(QString("?%1,").arg(i));
sQuery.chop(1); // Remove last comma
sQuery.append(")");
Expand Down
2 changes: 1 addition & 1 deletion src/ImportCsvDialog.h
Expand Up @@ -38,7 +38,7 @@ private slots:
QCompleter* encodingCompleter;

CSVParser parseCSV(const QString &f, qint64 count = -1);
sqlb::FieldVector generateFieldList(const CSVParser& parser);
sqlb::FieldVector generateFieldList(const QString& filename);

void importCsv(const QString& f, const QString &n = QString());

Expand Down
2 changes: 0 additions & 2 deletions src/csvparser.cpp
Expand Up @@ -8,7 +8,6 @@ CSVParser::CSVParser(bool trimfields, const QChar& fieldseparator, const QChar&
, m_cFieldSeparator(fieldseparator)
, m_cQuoteChar(quotechar)
, m_pCSVProgress(0)
, m_nColumns(0)
, m_nBufferSize(4096)
{
}
Expand All @@ -32,7 +31,6 @@ inline void addColumn(QStringList& r, QString& field, bool trim)
bool CSVParser::parse(QTextStream& stream, qint64 nMaxRecords)
{
m_vCSVData.clear();
m_nColumns = 0;
ParseStates state = StateNormal;
QString fieldbuf;
QStringList record;
Expand Down
10 changes: 1 addition & 9 deletions src/csvparser.h
Expand Up @@ -44,12 +44,6 @@ class CSVParser
*/
const TCSVResult& csv() const { return m_vCSVData; }

/*!
* \brief columns
* \return Number of columns parsed
*/
size_t columns() const { return m_nColumns; }

void setCSVProgress(CSVProgress* csvp) { m_pCSVProgress = csvp; }

private:
Expand All @@ -63,7 +57,6 @@ class CSVParser
inline void addRow(QStringList& r)
{
m_vCSVData.append(r);
m_nColumns = std::max<size_t>(r.size(), m_nColumns);
r.clear();
}

Expand All @@ -74,9 +67,8 @@ class CSVParser
CSVProgress* m_pCSVProgress;

TCSVResult m_vCSVData;
size_t m_nColumns;

size_t m_nBufferSize; //! internal buffer read size
};

#endif // CSVPARSER_H
#endif
1 change: 0 additions & 1 deletion src/tests/TestImport.cpp
Expand Up @@ -48,7 +48,6 @@ void TestImport::csvImport()

// Check return values
QCOMPARE(csvparser.csv(), result);
QCOMPARE((int)csvparser.columns(), numfields);
}

void TestImport::csvImport_data()
Expand Down

0 comments on commit b7a00d3

Please sign in to comment.