fix null row skipping in csv parsing (#2622)

pola-rs · Feb 12, 2022 · 55d7149 · 55d7149
1 parent ee8b623
commit 55d7149
Show file tree

Hide file tree

Showing 4 changed files with 50 additions and 31 deletions.
diff --git a/polars/polars-io/src/csv.rs b/polars/polars-io/src/csv.rs
@@ -953,8 +953,7 @@ id090,id048,id0000067778,24,2,51862,4,9,
 
     #[test]
     fn test_new_line_escape() {
-        let s = r#"
- "sepal.length","sepal.width","petal.length","petal.width","variety"
+        let s = r#""sepal.length","sepal.width","petal.length","petal.width","variety"
  5.1,3.5,1.4,.2,"Setosa
  texts after new line character"
  4.9,3,1.4,.2,"Setosa"
@@ -1228,8 +1227,7 @@ bar,bar";
 
     #[test]
     fn test_no_quotes() -> Result<()> {
-        let rolling_stones = r#"
-linenum,last_name,first_name
+        let rolling_stones = r#"linenum,last_name,first_name
 1,Jagger,Mick
 2,O"Brian,Mary
 3,Richards,Keith
@@ -1537,4 +1535,27 @@ foo,bar
         );
         Ok(())
     }
+
+    #[test]
+    fn test_empty_string_cols() -> Result<()> {
+        let csv = "\nabc\n\nxyz\n";
+        let file = Cursor::new(csv);
+        let df = CsvReader::new(file).has_header(false).finish()?;
+        let s = df.column("column_1")?;
+        let ca = s.utf8()?;
+        assert_eq!(
+            ca.into_no_null_iter().collect::<Vec<_>>(),
+            &["", "abc", "", "xyz"]
+        );
+
+        let csv = ",\nabc,333\n,666\nxyz,999";
+        let file = Cursor::new(csv);
+        let df = CsvReader::new(file).has_header(false).finish()?;
+        let expected = df![
+            "column_1" => ["", "abc", "", "xyz"],
+            "column_2" => [None, Some(333i64), Some(666), Some(999)]
+        ]?;
+        assert!(df.frame_equal_missing(&expected));
+        Ok(())
+    }
 }
diff --git a/polars/polars-io/src/csv_core/csv.rs b/polars/polars-io/src/csv_core/csv.rs
@@ -278,7 +278,12 @@ impl<'a> CoreReader<'a> {
 
     fn find_starting_point<'b>(&self, mut bytes: &'b [u8]) -> Result<&'b [u8]> {
         // Skip all leading white space and the occasional utf8-bom
-        bytes = skip_line_ending(skip_whitespace(skip_bom(bytes)).0).0;
+        bytes = skip_whitespace(skip_bom(bytes));
+        // \n\n can be an empty-string row of a single-column file;
+        // in all other cases we skip it.
+        if self.schema.fields().len() > 1 {
+            bytes = skip_line_ending(bytes)
+        }
 
         // If there is a header we skip it.
         if self.has_header {

diff --git a/polars/polars-io/src/csv_core/parser.rs b/polars/polars-io/src/csv_core/parser.rs
@@ -65,12 +65,12 @@ pub(crate) fn is_whitespace(b: u8) -> bool {
 }
 
 #[inline]
-fn skip_condition<F>(input: &[u8], f: F) -> (&[u8], usize)
+fn skip_condition<F>(input: &[u8], f: F) -> &[u8]
 where
     F: Fn(u8) -> bool,
 {
     if input.is_empty() {
-        return (input, 0);
+        return input;
     }
     let mut read = 0;
     let len = input.len();
@@ -81,7 +81,7 @@ where
         }
         read += 1;
     }
-    (&input[read..], read)
+    &input[read..]
 }
 
 /// Makes sure that the bytes stream starts with
@@ -96,18 +96,16 @@ pub(crate) fn skip_header(input: &[u8]) -> (&[u8], usize) {
     (&input[pos..], pos)
 }
 
-/// Remove whitespace and line endings from the start of file.
+/// Remove whitespace from the start of the buffer.
 #[inline]
-pub(crate) fn skip_whitespace(input: &[u8]) -> (&[u8], usize) {
-    skip_condition(input, |b| is_whitespace(b) || is_line_ending(b))
+pub(crate) fn skip_whitespace(input: &[u8]) -> &[u8] {
+    skip_condition(input, is_whitespace)
 }
 
 #[inline]
 /// Can be used to skip whitespace, but exclude the delimiter
-pub(crate) fn skip_whitespace_exclude(input: &[u8], exclude: u8) -> (&[u8], usize) {
-    skip_condition(input, |b| {
-        b != exclude && (is_whitespace(b) || is_line_ending(b))
-    })
+pub(crate) fn skip_whitespace_exclude(input: &[u8], exclude: u8) -> &[u8] {
+    skip_condition(input, |b| b != exclude && (is_whitespace(b)))
 }
 
 /// Local version of slice::starts_with (as it won't inline)
@@ -128,7 +126,7 @@ pub(crate) fn drop_quotes(input: &[u8]) -> &[u8] {
 }
 
 #[inline]
-pub(crate) fn skip_line_ending(input: &[u8]) -> (&[u8], usize) {
+pub(crate) fn skip_line_ending(input: &[u8]) -> &[u8] {
     skip_condition(input, is_line_ending)
 }
 
@@ -406,7 +404,7 @@ pub(crate) fn parse_lines(
             return Ok(end - start);
         }
 
-        let (b, _) = skip_whitespace_exclude(bytes, delimiter);
+        let b = skip_whitespace_exclude(bytes, delimiter);
         bytes = b;
         if bytes.is_empty() {
             return Ok(original_bytes_len);
@@ -536,17 +534,6 @@ pub(crate) fn parse_lines(
 mod test {
     use super::*;
 
-    #[test]
-    fn test_skip() {
-        let input = b"    hello";
-        assert_eq!(skip_whitespace(input).0, b"hello");
-        let input = b"\n        hello";
-        assert_eq!(skip_whitespace(input).0, b"hello");
-        let input = b"\t\n\r
-        hello";
-        assert_eq!(skip_whitespace(input).0, b"hello");
-    }
-
     #[test]
     fn test_splitfields() {
         let input = "\"foo\",\"bar\"";

diff --git a/polars/polars-io/src/csv_core/utils.rs b/polars/polars-io/src/csv_core/utils.rs
@@ -136,7 +136,7 @@ pub fn infer_file_schema(
     // It may later.
     let encoding = CsvEncoding::LossyUtf8;
 
-    let bytes = skip_line_ending(skip_bom(reader_bytes)).0;
+    let bytes = skip_line_ending(skip_bom(reader_bytes));
     let mut lines = SplitLines::new(bytes, b'\n').skip(*skip_rows);
 
     // get or create header names
@@ -183,10 +183,16 @@ pub fn infer_file_schema(
                 })
                 .collect::<Result<_>>()?
         } else {
-            byterecord
+            let mut column_names: Vec<String> = byterecord
                 .enumerate()
                 .map(|(i, _s)| format!("column_{}", i + 1))
-                .collect()
+                .collect();
+            // Needed because SplitLines does not return the \n char, so SplitFields does not catch
+            // the last value when the line ends with ','.
+            if header_line.ends_with(b",") {
+                column_names.push(format!("column_{}", column_names.len() + 1))
+            }
+            column_names
         }
     } else {
         return Err(PolarsError::NoData("empty csv".into()));