Skip to content

Commit

Permalink
fix invalid utf8 check
Browse files Browse the repository at this point in the history
This is quite a perf regression.
But because skip_this_line checked
from start of line instead of the
read bytes ptr, we could win most
of lossed regression
  • Loading branch information
ritchie46 committed Oct 19, 2021
1 parent 85aa974 commit 4e8c647
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 10 deletions.
15 changes: 12 additions & 3 deletions polars/polars-io/src/csv_core/buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -341,10 +341,19 @@ impl Buffer {
v.offsets.shrink_to_fit();
v.data.shrink_to_fit();

let mut is_valid = true;
if delay_utf8_validation(v.encoding, v.ignore_errors) {
simdutf8::basic::from_utf8(v.data.as_slice()).map_err(|_| {
PolarsError::ComputeError("invalid utf8 data in csv".into())
})?;
let mut start = 0usize;

for &end in &v.offsets[1..] {
let slice = v.data.get_unchecked(start..end as usize);
start = end as usize;
is_valid &= simdutf8::basic::from_utf8(slice).is_ok();
}

if !is_valid {
return Err(PolarsError::ComputeError("invalid utf8 data in csv".into()));
}
}

let arr = Utf8Array::<i64>::from_data_unchecked_default(
Expand Down
21 changes: 14 additions & 7 deletions polars/polars-io/src/csv_core/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -362,14 +362,14 @@ impl<'a> Iterator for SplitFields<'a> {
}
}

fn skip_this_line(bytes: &[u8], quote: Option<u8>) -> (&[u8], usize) {
fn skip_this_line(bytes: &[u8], quote: Option<u8>, offset: usize) -> (&[u8], usize) {
let pos = match quote {
Some(quote) => find_quoted(bytes, quote, b'\n'),
None => bytes.iter().position(|x| *x == b'\n'),
};
match pos {
None => (&[], bytes.len()),
Some(pos) => (&bytes[pos + 1..], pos + 1),
None => (&[], bytes.len() + offset),
Some(pos) => (&bytes[pos + 1..], pos + 1 + offset),
}
}

Expand Down Expand Up @@ -416,7 +416,7 @@ pub(crate) fn parse_lines(
if let Some(c) = comment_char {
// line is a comment -> skip
if bytes[0] == c {
let (bytes_rem, len) = skip_this_line(bytes, quote_char);
let (bytes_rem, len) = skip_this_line(bytes, quote_char, 0);
bytes = bytes_rem;
read += len;
continue;
Expand Down Expand Up @@ -503,9 +503,16 @@ pub(crate) fn parse_lines(
next_projected = p
}
None => {
let (bytes_rem, len) = skip_this_line(bytes, quote_char);
bytes = bytes_rem;
read += len;
let offset = read_sol - 1;
if let Some(b'\n') = bytes.get(0) {
bytes = &bytes[read_sol..];
read += read_sol
} else {
let (bytes_rem, len) =
skip_this_line(&bytes[offset..], quote_char, offset);
bytes = bytes_rem;
read += len;
}
break;
}
}
Expand Down

0 comments on commit 4e8c647

Please sign in to comment.