Skip to content

Commit

Permalink
fix csv quote escaping empty fields
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Feb 15, 2022
1 parent 8bddbd7 commit da96209
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 24 deletions.
40 changes: 31 additions & 9 deletions polars/polars-io/src/csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -672,7 +672,8 @@ mod test {
.with_path(Some(FOODS_CSV.to_string()))
.finish()
.unwrap();
dbg!(df);

assert_eq!(df.shape(), (27, 4));
}

#[test]
Expand Down Expand Up @@ -731,7 +732,6 @@ mod test {
.unwrap();

let col = df.column("variety").unwrap();
dbg!(&df);
assert_eq!(col.get(0), AnyValue::Utf8("Setosa"));
assert_eq!(col.get(2), AnyValue::Utf8("Setosa"));

Expand Down Expand Up @@ -786,8 +786,6 @@ mod test {
.with_ignore_parser_errors(true)
.finish()
.unwrap();

dbg!(df);
}

#[test]
Expand All @@ -797,7 +795,6 @@ mod test {
.with_projection(Some(vec![0, 2]))
.finish()
.unwrap();
dbg!(&df);
let col_1 = df.select_at_idx(0).unwrap();
assert_eq!(col_1.get(0), AnyValue::Utf8("vegetables"));
assert_eq!(col_1.get(1), AnyValue::Utf8("seafood"));
Expand Down Expand Up @@ -1410,7 +1407,7 @@ A3,\"B4_\"\"with_embedded_double_quotes\"\"\",C4,4";
assert_eq!(df.dtypes(), &[DataType::Utf8, DataType::Int64]);
let a = df.column("foo")?;
let a = a.utf8()?;
assert_eq!(a.get(0), Some(""));
assert_eq!(a.get(0), None);

Ok(())
}
Expand Down Expand Up @@ -1544,18 +1541,43 @@ foo,bar
let s = df.column("column_1")?;
let ca = s.utf8()?;
assert_eq!(
ca.into_no_null_iter().collect::<Vec<_>>(),
&["", "abc", "", "xyz"]
ca.into_iter().collect::<Vec<_>>(),
&[None, Some("abc"), None, Some("xyz")]
);

let csv = ",\nabc,333\n,666\nxyz,999";
let file = Cursor::new(csv);
let df = CsvReader::new(file).has_header(false).finish()?;
let expected = df![
"column_1" => ["", "abc", "", "xyz"],
"column_1" => [None, Some("abc"), None, Some("xyz")],
"column_2" => [None, Some(333i64), Some(666), Some(999)]
]?;
assert!(df.frame_equal_missing(&expected));
Ok(())
}

/// Checks how empty fields on the last row are read: the quoted empty field
/// (`""`) is expected back as an empty string from the no-null iterator,
/// whereas a final row of unquoted empty fields is expected as nulls.
#[test]
fn test_trailing_empty_string_cols() -> Result<()> {
    // Single column whose final row is an explicitly quoted empty field.
    let quoted_empty = "colx\nabc\nxyz\n\"\"";
    let frame = CsvReader::new(Cursor::new(quoted_empty)).finish()?;
    let strings = frame.column("colx")?.utf8()?;
    let values = strings.into_no_null_iter().collect::<Vec<_>>();
    assert_eq!(values, &["abc", "xyz", ""]);

    // Two columns whose final row consists solely of the delimiter.
    let unquoted_empty = "colx,coly\nabc,def\nxyz,mno\n,";
    let frame = CsvReader::new(Cursor::new(unquoted_empty)).finish()?;

    let second_row = frame.get(1).unwrap();
    assert_eq!(second_row, &[AnyValue::Utf8("xyz"), AnyValue::Utf8("mno")]);

    let last_row = frame.get(2).unwrap();
    assert_eq!(last_row, &[AnyValue::Null, AnyValue::Null]);

    Ok(())
}
}
7 changes: 7 additions & 0 deletions polars/polars-io/src/csv_core/buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,13 @@ impl ParsedBuffer<Utf8Type> for Utf8Field {
ignore_errors: bool,
needs_escaping: bool,
) -> Result<()> {
if bytes.is_empty() {
// append null
self.offsets.push(self.data.len() as i64);
self.validity.push(false);
return Ok(());
}

// Only for lossy utf8 we check utf8 now. Otherwise we check all utf8 at the end.
let parse_result = if delay_utf8_validation(self.encoding, ignore_errors) {
true
Expand Down
3 changes: 1 addition & 2 deletions polars/polars-io/src/csv_core/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -421,8 +421,7 @@ pub(crate) fn parse_lines(
return Ok(end - start);
}

let b = skipwh(bytes, delimiter);
bytes = b;
bytes = skipwh(bytes, delimiter);
if bytes.is_empty() {
return Ok(original_bytes_len);
}
Expand Down
30 changes: 17 additions & 13 deletions polars/polars-io/src/csv_core/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -368,25 +368,29 @@ pub(crate) fn decompress(bytes: &[u8]) -> Option<Vec<u8>> {

/// Replace double quotes by single ones.
///
/// Copies `bytes` into `buf`: every byte that is not `quote` is copied as-is, and
/// of two consecutive `quote` bytes only one is written (a lone `quote` is dropped).
/// In the variant that starts with the `bytes == [quote, quote]` check, a field
/// consisting of exactly two `quote` bytes produces no output and a length of 0.
///
/// Returns the number of bytes written to the front of `buf`.
///
/// # Safety
///
/// Writes go through `get_unchecked_mut`, so no bounds check is performed. The
/// caller must pass a `buf` large enough for the output; at most one byte is
/// written per input byte, so `buf.len() >= bytes.len()` is sufficient.
///
/// NOTE(review): in this listing the removed and the added lines of the change are
/// interleaved without `+`/`-` markers; the `if bytes == [quote, quote] { 0 } else { .. }`
/// wrapper appears to be the added form -- confirm against the real file.
pub(super) unsafe fn escape_field(bytes: &[u8], quote: u8, buf: &mut [u8]) -> usize {
let mut prev_quote = false;

let mut count = 0;
for c in bytes {
if *c == quote {
if prev_quote {
if bytes == [quote, quote] {
0
} else {
let mut prev_quote = false;

let mut count = 0;
for c in bytes {
if *c == quote {
if prev_quote {
prev_quote = false;
*buf.get_unchecked_mut(count) = *c;
count += 1;
} else {
prev_quote = true;
}
} else {
prev_quote = false;
*buf.get_unchecked_mut(count) = *c;
count += 1;
} else {
prev_quote = true;
}
} else {
prev_quote = false;
*buf.get_unchecked_mut(count) = *c;
count += 1;
}
count
}
count
}

#[cfg(test)]
Expand Down

0 comments on commit da96209

Please sign in to comment.