Skip to content

Commit

Permalink
csv parser improve performance (#2373)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Jan 14, 2022
1 parent 8dd4574 commit 927d78a
Showing 1 changed file with 28 additions and 14 deletions.
42 changes: 28 additions & 14 deletions polars/polars-io/src/csv_core/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,7 @@ impl<'a> SplitFields<'a> {
}
}

#[inline]
fn find_quoted(bytes: &[u8], quote_char: u8, needle: u8) -> Option<usize> {
let mut in_field = false;

Expand Down Expand Up @@ -370,6 +371,7 @@ impl<'a> Iterator for SplitFields<'a> {
}
}

#[inline]
fn skip_this_line(bytes: &[u8], quote: Option<u8>, offset: usize) -> (&[u8], usize) {
let pos = match quote {
Some(quote) => find_quoted(bytes, quote, b'\n'),
Expand All @@ -381,6 +383,16 @@ fn skip_this_line(bytes: &[u8], quote: Option<u8>, offset: usize) -> (&[u8], usi
}
}

/// Move `bytes` forward past the part of the current line that was already read.
///
/// `read_sol` is the number of bytes consumed so far, counted from the start
/// of `bytes`. It must be at least 1, because the byte at `read_sol - 1` is
/// inspected.
///
/// * If that byte is a newline, the line is fully consumed and `bytes` is
///   advanced by exactly `read_sol`.
/// * Otherwise the slice starting at `read_sol - 1` is handed to
///   `skip_this_line` (with `quote_char`), and `bytes` becomes the remainder it
///   returns. NOTE(review): presumably that remainder begins at the next
///   line; confirm against `skip_this_line`.
#[inline]
fn update_bytes_ptr(bytes: &mut &[u8], read_sol: usize, quote_char: Option<u8>) {
    // Index of the last byte that was consumed.
    let last_read = read_sol - 1;
    match bytes.get(last_read) {
        // The consumed part already ends in a line break.
        Some(b'\n') => *bytes = &bytes[read_sol..],
        // Either mid-line or past the end of the buffer: let the line
        // skipper find where to continue.
        _ => {
            let (remainder, _) = skip_this_line(&bytes[last_read..], quote_char, 0);
            *bytes = remainder;
        }
    }
}

/// Parse CSV.
///
/// # Arguments
Expand Down Expand Up @@ -408,15 +420,6 @@ pub(crate) fn parse_lines(
let original_bytes_len = bytes.len();
let n_lines = n_lines as u32;

let update_bytes_ptr = |bytes: &mut &[u8], read_sol: usize| {
if let Some(b'\n') = bytes.get(0) {
*bytes = &bytes[read_sol..];
} else {
let (bytes_rem, _) = skip_this_line(bytes, quote_char, 0);
*bytes = bytes_rem;
}
};

let mut line_count = 0u32;
loop {
if line_count > n_lines {
Expand Down Expand Up @@ -451,13 +454,12 @@ pub(crate) fn parse_lines(

let mut iter = SplitFields::new(bytes, delimiter, quote_char);
let mut idx = 0u32;
let mut read_sol = 0;
loop {
let mut read_sol = 0;

match iter.next() {
// end of line
None => {
update_bytes_ptr(&mut bytes, read_sol);
bytes = &bytes[read_sol..];
break;
}
Some((mut field, needs_escaping)) => {
Expand Down Expand Up @@ -520,9 +522,21 @@ pub(crate) fn parse_lines(

// if we have all projected columns we are done with this line
match next_projection {
Some(p) => next_projected = p,
Some(p) => {
// benchmarking showed it is 6% faster to take the min of these two
// not needed for correctness.
// bytes = &bytes[std::cmp::min(read_sol, bytes.len())..];
next_projected = p
}
None => {
update_bytes_ptr(&mut bytes, read_sol);
// if let Some(b'\n') = bytes.get(0) {
// bytes = &bytes[read_sol..];
// } else {
// let (bytes_rem, _) = skip_this_line(bytes, quote_char, 0);
// bytes = bytes_rem;
// }

update_bytes_ptr(&mut bytes, read_sol, quote_char);
break;
}
}
Expand Down

0 comments on commit 927d78a

Please sign in to comment.