Skip to content

Commit

Permalink
csv-parser optimization
Browse files Browse the repository at this point in the history
First check whether the data is valid ASCII; if that check fails,
continue with UTF-8 validation from the position where the ASCII check failed.
  • Loading branch information
ritchie46 committed Oct 19, 2021
1 parent 4e8c647 commit 4e8bb87
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 8 deletions.
15 changes: 15 additions & 0 deletions polars/polars-io/src/csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1168,4 +1168,19 @@ linenum,last_name,first_name

Ok(())
}

#[test]
fn test_utf8() -> Result<()> {
    // Newline-separated input for the CSV reader:
    //   * the leading rows ("o" .. "lam") are plain ASCII,
    //   * `\xcf\x80` is a complete two-byte UTF-8 sequence,
    //   * `\xf0\x9f\x89` is a four-byte emoji sequence with its last byte
    //     removed, which makes the buffer invalid UTF-8,
    //   * `\xf0\x9f\x90\xac` is a complete four-byte sequence.
    let corrupted_bytes = *b"o\nbar\nham\nspam\njam\nkam\nlam\n\xcf\x80\npam\nstamp\n\xf0\x9f\x89\nascii\n\xf0\x9f\x90\xac\nciao";

    // Reading must surface the invalid UTF-8 as an error.
    let reader = Cursor::new(corrupted_bytes);
    let outcome = CsvReader::new(reader).finish();
    assert!(outcome.is_err());

    Ok(())
}
}
52 changes: 44 additions & 8 deletions polars/polars-io/src/csv_core/buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -341,18 +341,54 @@ impl Buffer {
v.offsets.shrink_to_fit();
v.data.shrink_to_fit();

let mut is_valid = true;
if delay_utf8_validation(v.encoding, v.ignore_errors) {
let mut start = 0usize;
// We first check valid ascii as that is much cheaper.
// if we find an invalid ascii char, we continue from that offset
// with utf8 checking.
let mut valid_utf8 = true;
let mut valid_ascii = true;
let mut offset = 0;

for &end in &v.offsets[1..] {
let slice = v.data.get_unchecked(start..end as usize);
start = end as usize;
is_valid &= simdutf8::basic::from_utf8(slice).is_ok();
// first we scan through the data checking ascii
// if a byte is invalid we break the loop
// and use that loop idx to find the offsets in the utf8 buffer
// the offsets in the buffer are needed to recreate the substrings
// in utf8 checking
for (i, &v) in v.data.iter().enumerate() {
if v > 127 {
valid_ascii = false;
offset = i as i64;
break;
}
}

if !is_valid {
return Err(PolarsError::ComputeError("invalid utf8 data in csv".into()));
// if valid ascii, we are done here.
if !valid_ascii {
let mut idx = 0usize;

// find offset in offset buffer that matched the failing ascii bytes
for (i, &v) in v.offsets.iter().enumerate() {
if v > offset {
idx = i;
break;
}
}

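// Step back one entry so that `start` is the beginning of the field that
// contains the first non-ascii byte; the intent is to clamp at index 0.
// NOTE(review): `idx` is a usize, so `idx - 1` underflows when `idx == 0`
// before `max` can clamp it; `idx.saturating_sub(1)` would express the intent.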
let mut start = v.offsets[std::cmp::max(0, idx - 1)] as usize;

// create substrings and check utf8 validity
for &end in &v.offsets[idx..] {
let slice = v.data.get_unchecked(start..end as usize);
start = end as usize;
valid_utf8 &= simdutf8::basic::from_utf8(slice).is_ok();
}

if !valid_utf8 {
return Err(PolarsError::ComputeError(
"invalid utf8 data in csv".into(),
));
}
}
}

Expand Down

0 comments on commit 4e8bb87

Please sign in to comment.