Skip to content

Commit

Permalink
Merge pull request #1235 from PawelJastrzebski/issue-972-csv-import-fix2
Browse files Browse the repository at this point in the history
Issue 972 csv import - fix2
  • Loading branch information
ddimaria committed Apr 25, 2024
2 parents 6d4432f + 7fda4c1 commit fded55e
Show file tree
Hide file tree
Showing 7 changed files with 140 additions and 19 deletions.
61 changes: 55 additions & 6 deletions quadratic-core/src/controller/operations/import.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,20 @@ impl GridController {
insert_at: Pos,
) -> Result<Vec<Operation>> {
let error = |message: String| anyhow!("Error parsing CSV file {}: {}", file_name, message);
let width = csv::ReaderBuilder::new().from_reader(file).headers()?.len() as u32;

if width == 0 {
bail!("Empty CSV files cannot be processed");
}
let file = match String::from_utf8_lossy(file) {
std::borrow::Cow::Borrowed(_) => file,
std::borrow::Cow::Owned(_) => {
if let Some(utf) = read_utf16(file) {
return self.import_csv_operations(
sheet_id,
utf.as_bytes(),
file_name,
insert_at,
);
}
file
}
};

// first get the total number of lines so we can provide progress
let mut reader = csv::ReaderBuilder::new()
Expand All @@ -37,8 +46,14 @@ impl GridController {

let mut reader = csv::ReaderBuilder::new()
.has_headers(false)
.flexible(true)
.from_reader(file);

let width = reader.headers()?.len() as u32;
if width == 0 {
bail!("empty files cannot be processed");
}

// then create operations using MAXIMUM_IMPORT_LINES to break up the SetCellValues operations
let mut ops = vec![] as Vec<Operation>;
let mut cell_values = CellValues::new(width, height);
Expand Down Expand Up @@ -259,12 +274,46 @@ impl GridController {
}
}

/// Attempts to decode `bytes` as UTF-16 text.
///
/// Each pair of bytes is read as one little-endian 16-bit code unit, so the
/// result does not depend on the endianness of the host.
///
/// Returns `None` when:
/// - `bytes` is empty,
/// - `bytes` has an odd length (it cannot be a whole number of code units), or
/// - the code units are not valid UTF-16 (e.g. an unpaired surrogate).
///
/// After decoding, every character whose UTF-8 encoding needs more than two
/// bytes (anything above U+07FF) is dropped. This removes a leading byte-order
/// mark (U+FEFF) as well as any other character in that range.
fn read_utf16(bytes: &[u8]) -> Option<String> {
    // An empty or odd-length buffer cannot be a UTF-16 document; reject it
    // instead of silently discarding the trailing byte.
    if bytes.is_empty() || bytes.len() % 2 != 0 {
        return None;
    }

    // Reassemble the byte stream into 16-bit code units without copying it.
    let code_units: Vec<u16> = bytes
        .chunks_exact(2)
        .map(|pair| u16::from_le_bytes([pair[0], pair[1]]))
        .collect();

    // Invalid UTF-16 (unpaired surrogates) means this is not a UTF-16 file.
    let decoded = String::from_utf16(&code_units).ok()?;

    // Keep only characters that fit in at most two UTF-8 bytes.
    Some(decoded.chars().filter(|c| c.len_utf8() <= 2).collect())
}

#[cfg(test)]
mod test {
use super::read_utf16;
use crate::CellValue;

use super::*;

const INVALID_ENCODING_FILE: &[u8] =
include_bytes!("../../../../quadratic-rust-shared/data/csv/encoding_issue.csv");

/// Decodes the bundled `encoding_issue.csv` fixture with `read_utf16` and
/// compares the result against the expected plain-text CSV.
#[test]
fn transmute_u8_to_u16() {
    let decoded = read_utf16(INVALID_ENCODING_FILE).unwrap();
    let expected = "issue, test, value\r\n0, 1, Invalid\r\n0, 2, Valid";
    assert_eq!(expected, decoded);
}

#[test]
fn imports_a_simple_csv() {
let mut gc = GridController::test();
Expand Down
71 changes: 58 additions & 13 deletions quadratic-core/src/controller/user_actions/import.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,18 +69,10 @@ mod tests {

use super::*;

const SIMPLE_CSV: &str = r#"city,region,country,population
Southborough,MA,United States,9686
Northbridge,MA,United States,14061
Westborough,MA,United States,29313
Marlborough,MA,United States,38334
Springfield,MA,United States,152227
Springfield,MO,United States,150443
Springfield,NJ,United States,14976
Springfield,OH,United States,64325
Springfield,OR,United States,56032
Concord,NH,United States,42605
"#;
/// Reads a CSV fixture from `../quadratic-rust-shared/data/csv/` and returns
/// its raw bytes.
///
/// # Panics
///
/// Panics with `test csv file not found <file_name>` if the file cannot be
/// read for any reason.
fn read_test_csv_file(file_name: &str) -> Vec<u8> {
    let fixture_path = format!("../quadratic-rust-shared/data/csv/{file_name}");
    match std::fs::read(fixture_path) {
        Ok(contents) => contents,
        Err(_) => panic!("test csv file not found {}", file_name),
    }
}

// const EXCEL_FILE: &str = "../quadratic-rust-shared/data/excel/temperature.xlsx";
const EXCEL_FILE: &str = "../quadratic-rust-shared/data/excel/basic.xlsx";
Expand All @@ -92,12 +84,13 @@ Concord,NH,United States,42605

#[test]
fn imports_a_simple_csv() {
let scv_file = read_test_csv_file("simple.csv");
let mut grid_controller = GridController::test();
let sheet_id = grid_controller.grid.sheets()[0].id;
let pos = Pos { x: 0, y: 0 };

let _ =
grid_controller.import_csv(sheet_id, SIMPLE_CSV.as_bytes(), "smallpop.csv", pos, None);
grid_controller.import_csv(sheet_id, scv_file.as_slice(), "smallpop.csv", pos, None);

print_table(
&grid_controller,
Expand Down Expand Up @@ -338,6 +331,58 @@ Concord,NH,United States,42605
// expect_js_call_count("jsRenderCellSheets", 33026, true);
// }

/// Imports the `title_row.csv` fixture at the sheet origin and checks the
/// first three columns of rows 0, 2 and 5.
#[test]
fn should_import_with_title_header() {
    let csv_bytes = read_test_csv_file("title_row.csv");
    let mut gc = GridController::test();
    let sheet_id = gc.grid.sheets()[0].id;
    let origin = Pos { x: 0, y: 0 };

    gc.import_csv(sheet_id, csv_bytes.as_slice(), "test.csv", origin, None)
        .expect("import_csv");

    // Dump the imported region to aid debugging when an assertion fails.
    let printed_area = Rect::new_span(origin, Pos { x: 3, y: 4 });
    print_table(&gc, sheet_id, printed_area);

    let title_row = vec!["Sample report ", "", ""];
    assert_cell_value_row(&gc, sheet_id, 0, 2, 0, title_row);

    let column_names = vec!["c1", " c2", " Sample column3"];
    assert_cell_value_row(&gc, sheet_id, 0, 2, 2, column_names);

    let last_data_row = vec!["7", "8", "9"];
    assert_cell_value_row(&gc, sheet_id, 0, 2, 5, last_data_row);
}

/// Imports the `title_row_empty_first.csv` fixture at the sheet origin and
/// checks the first three columns of rows 0, 2 and 5.
#[test]
fn should_import_with_title_header_and_empty_first_row() {
    let csv_bytes = read_test_csv_file("title_row_empty_first.csv");
    let mut gc = GridController::test();
    let sheet_id = gc.grid.sheets()[0].id;
    let origin = Pos { x: 0, y: 0 };

    gc.import_csv(sheet_id, csv_bytes.as_slice(), "test.csv", origin, None)
        .expect("import_csv");

    // Dump the imported region to aid debugging when an assertion fails.
    let printed_area = Rect::new_span(origin, Pos { x: 3, y: 4 });
    print_table(&gc, sheet_id, printed_area);

    let title_row = vec!["Sample report ", "", ""];
    assert_cell_value_row(&gc, sheet_id, 0, 2, 0, title_row);

    let column_names = vec!["c1", " c2", " Sample column3"];
    assert_cell_value_row(&gc, sheet_id, 0, 2, 2, column_names);

    let last_data_row = vec!["7", "8", "9"];
    assert_cell_value_row(&gc, sheet_id, 0, 2, 5, last_data_row);
}

/// Imports the `encoding_issue.csv` fixture at the sheet origin and checks
/// the first three columns of rows 0, 1 and 2.
#[test]
fn should_import_utf16_with_invalid_characters() {
    let csv_bytes = read_test_csv_file("encoding_issue.csv");

    let mut gc = GridController::test();
    let sheet_id = gc.grid.sheets()[0].id;
    let origin = Pos { x: 0, y: 0 };

    gc.import_csv(sheet_id, csv_bytes.as_slice(), "test.csv", origin, None)
        .expect("import_csv");

    // Dump the imported region to aid debugging when an assertion fails.
    let printed_area = Rect::new_span(origin, Pos { x: 2, y: 3 });
    print_table(&gc, sheet_id, printed_area);

    let header_row = vec!["issue", " test", " value"];
    assert_cell_value_row(&gc, sheet_id, 0, 2, 0, header_row);

    let first_data_row = vec!["0", " 1", " Invalid"];
    assert_cell_value_row(&gc, sheet_id, 0, 2, 1, first_data_row);

    let second_data_row = vec!["0", " 2", " Valid"];
    assert_cell_value_row(&gc, sheet_id, 0, 2, 2, second_data_row);
}

// #[test]
// fn imports_a_large_parquet() {
// let mut grid_controller = GridController::test();
Expand Down
Binary file added quadratic-rust-shared/data/csv/encoding_issue.csv
Binary file not shown.
11 changes: 11 additions & 0 deletions quadratic-rust-shared/data/csv/simple.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
city,region,country,population
Southborough,MA,United States,9686
Northbridge,MA,United States,14061
Westborough,MA,United States,29313
Marlborough,MA,United States,38334
Springfield,MA,United States,152227
Springfield,MO,United States,150443
Springfield,NJ,United States,14976
Springfield,OH,United States,64325
Springfield,OR,United States,56032
Concord,NH,United States,42605
3 changes: 3 additions & 0 deletions quadratic-rust-shared/data/csv/simple_space_separator.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
city region country population
Southborough MA "United States" 9686
Northbridge MA "United States" 14061
6 changes: 6 additions & 0 deletions quadratic-rust-shared/data/csv/title_row.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Sample report
"December 25, 2023 - January 7, 2024"
c1, c2, Sample column3
1,2,3
4,5,6
7,8,9
7 changes: 7 additions & 0 deletions quadratic-rust-shared/data/csv/title_row_empty_first.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@

Sample report
"December 25, 2023 - January 7, 2024"
c1, c2, Sample column3
1,2,3
4,5,6
7,8,9

0 comments on commit fded55e

Please sign in to comment.