Skip to content

Commit

Permalink
fix null row skipping in csv parsing (#2622)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Feb 12, 2022
1 parent ee8b623 commit 55d7149
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 31 deletions.
29 changes: 25 additions & 4 deletions polars/polars-io/src/csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -953,8 +953,7 @@ id090,id048,id0000067778,24,2,51862,4,9,

#[test]
fn test_new_line_escape() {
let s = r#"
"sepal.length","sepal.width","petal.length","petal.width","variety"
let s = r#""sepal.length","sepal.width","petal.length","petal.width","variety"
5.1,3.5,1.4,.2,"Setosa
texts after new line character"
4.9,3,1.4,.2,"Setosa"
Expand Down Expand Up @@ -1228,8 +1227,7 @@ bar,bar";

#[test]
fn test_no_quotes() -> Result<()> {
let rolling_stones = r#"
linenum,last_name,first_name
let rolling_stones = r#"linenum,last_name,first_name
1,Jagger,Mick
2,O"Brian,Mary
3,Richards,Keith
Expand Down Expand Up @@ -1537,4 +1535,27 @@ foo,bar
);
Ok(())
}

#[test]
fn test_empty_string_cols() -> Result<()> {
    // Single column, no header: the leading blank line and the blank line
    // between "abc" and "xyz" are each expected to come back as an
    // empty-string row rather than being dropped.
    let single_col = CsvReader::new(Cursor::new("\nabc\n\nxyz\n"))
        .has_header(false)
        .finish()?;
    let values: Vec<_> = single_col
        .column("column_1")?
        .utf8()?
        .into_no_null_iter()
        .collect();
    assert_eq!(values, &["", "abc", "", "xyz"]);

    // Two columns, no header: an empty field in the first column is expected
    // as an empty string, while the empty field in the integer column is
    // expected as a missing value (`None`).
    let two_cols = CsvReader::new(Cursor::new(",\nabc,333\n,666\nxyz,999"))
        .has_header(false)
        .finish()?;
    let expected = df![
        "column_1" => ["", "abc", "", "xyz"],
        "column_2" => [None, Some(333i64), Some(666), Some(999)]
    ]?;
    // NOTE(review): `frame_equal_missing` presumably treats missing values as
    // equal to each other -- confirm against the DataFrame API.
    assert!(two_cols.frame_equal_missing(&expected));
    Ok(())
}
}
7 changes: 6 additions & 1 deletion polars/polars-io/src/csv_core/csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,12 @@ impl<'a> CoreReader<'a> {

fn find_starting_point<'b>(&self, mut bytes: &'b [u8]) -> Result<&'b [u8]> {
// Skip all leading white space and the occasional utf8-bom
bytes = skip_line_ending(skip_whitespace(skip_bom(bytes)).0).0;
bytes = skip_whitespace(skip_bom(bytes));
// \n\n can be an empty string row of a single column
// in other cases we skip it.
if self.schema.fields().len() > 1 {
bytes = skip_line_ending(bytes)
}

// If there is a header we skip it.
if self.has_header {
Expand Down
33 changes: 10 additions & 23 deletions polars/polars-io/src/csv_core/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,12 +65,12 @@ pub(crate) fn is_whitespace(b: u8) -> bool {
}

#[inline]
fn skip_condition<F>(input: &[u8], f: F) -> (&[u8], usize)
fn skip_condition<F>(input: &[u8], f: F) -> &[u8]
where
F: Fn(u8) -> bool,
{
if input.is_empty() {
return (input, 0);
return input;
}
let mut read = 0;
let len = input.len();
Expand All @@ -81,7 +81,7 @@ where
}
read += 1;
}
(&input[read..], read)
&input[read..]
}

/// Makes sure that the bytes stream starts with
Expand All @@ -96,18 +96,16 @@ pub(crate) fn skip_header(input: &[u8]) -> (&[u8], usize) {
(&input[pos..], pos)
}

/// Remove whitespace and line endings from the start of file.
/// Remove whitespace from the start of buffer.
#[inline]
pub(crate) fn skip_whitespace(input: &[u8]) -> (&[u8], usize) {
skip_condition(input, |b| is_whitespace(b) || is_line_ending(b))
pub(crate) fn skip_whitespace(input: &[u8]) -> &[u8] {
skip_condition(input, is_whitespace)
}

#[inline]
/// Can be used to skip whitespace, but exclude the delimiter
pub(crate) fn skip_whitespace_exclude(input: &[u8], exclude: u8) -> (&[u8], usize) {
skip_condition(input, |b| {
b != exclude && (is_whitespace(b) || is_line_ending(b))
})
pub(crate) fn skip_whitespace_exclude(input: &[u8], exclude: u8) -> &[u8] {
skip_condition(input, |b| b != exclude && (is_whitespace(b)))
}

/// Local version of slice::starts_with (as it won't inline)
Expand All @@ -128,7 +126,7 @@ pub(crate) fn drop_quotes(input: &[u8]) -> &[u8] {
}

#[inline]
pub(crate) fn skip_line_ending(input: &[u8]) -> (&[u8], usize) {
pub(crate) fn skip_line_ending(input: &[u8]) -> &[u8] {
skip_condition(input, is_line_ending)
}

Expand Down Expand Up @@ -406,7 +404,7 @@ pub(crate) fn parse_lines(
return Ok(end - start);
}

let (b, _) = skip_whitespace_exclude(bytes, delimiter);
let b = skip_whitespace_exclude(bytes, delimiter);
bytes = b;
if bytes.is_empty() {
return Ok(original_bytes_len);
Expand Down Expand Up @@ -536,17 +534,6 @@ pub(crate) fn parse_lines(
mod test {
use super::*;

#[test]
fn test_skip() {
let input = b" hello";
assert_eq!(skip_whitespace(input).0, b"hello");
let input = b"\n hello";
assert_eq!(skip_whitespace(input).0, b"hello");
let input = b"\t\n\r
hello";
assert_eq!(skip_whitespace(input).0, b"hello");
}

#[test]
fn test_splitfields() {
let input = "\"foo\",\"bar\"";
Expand Down
12 changes: 9 additions & 3 deletions polars/polars-io/src/csv_core/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ pub fn infer_file_schema(
// It may later.
let encoding = CsvEncoding::LossyUtf8;

let bytes = skip_line_ending(skip_bom(reader_bytes)).0;
let bytes = skip_line_ending(skip_bom(reader_bytes));
let mut lines = SplitLines::new(bytes, b'\n').skip(*skip_rows);

// get or create header names
Expand Down Expand Up @@ -183,10 +183,16 @@ pub fn infer_file_schema(
})
.collect::<Result<_>>()?
} else {
byterecord
let mut column_names: Vec<String> = byterecord
.enumerate()
.map(|(i, _s)| format!("column_{}", i + 1))
.collect()
.collect();
// needed because SplitLines does not return the \n char, so SplitFields does not catch
// the last (empty) value if the line ends with ','
if header_line.ends_with(b",") {
column_names.push(format!("column_{}", column_names.len() + 1))
}
column_names
}
} else {
return Err(PolarsError::NoData("empty csv".into()));
Expand Down

0 comments on commit 55d7149

Please sign in to comment.