csv-parser optimization

First check whether the data is valid ASCII; if that check fails, continue with UTF-8 validation from the point where the ASCII check failed.
pola-rs · Oct 19, 2021 · 4e8bb87 · 4e8bb87
1 parent 4e8c647
commit 4e8bb87
Show file tree

Hide file tree

Showing 2 changed files with 59 additions and 8 deletions.
diff --git a/polars/polars-io/src/csv.rs b/polars/polars-io/src/csv.rs
@@ -1168,4 +1168,19 @@ linenum,last_name,first_name
 
         Ok(())
     }
+
+    #[test]
+    fn test_utf8() -> Result<()> {
+        // Input starts as valid ASCII; a later emoji is truncated to 240, 159, 137, so it is invalid UTF-8.
+        let invalid_utf8 = [
+            111, 10, 98, 97, 114, 10, 104, 97, 109, 10, 115, 112, 97, 109, 10, 106, 97, 109, 10,
+            107, 97, 109, 10, 108, 97, 109, 10, 207, 128, 10, 112, 97, 109, 10, 115, 116, 97, 109,
+            112, 10, 240, 159, 137, 10, 97, 115, 99, 105, 105, 10, 240, 159, 144, 172, 10, 99, 105,
+            97, 111,
+        ];
+        let file = Cursor::new(invalid_utf8);
+        assert!(CsvReader::new(file).finish().is_err());
+
+        Ok(())
+    }
 }
diff --git a/polars/polars-io/src/csv_core/buffer.rs b/polars/polars-io/src/csv_core/buffer.rs
@@ -341,18 +341,54 @@ impl Buffer {
                 v.offsets.shrink_to_fit();
                 v.data.shrink_to_fit();
 
-                let mut is_valid = true;
                 if delay_utf8_validation(v.encoding, v.ignore_errors) {
-                    let mut start = 0usize;
+                    // We first check valid ascii as that is much cheaper.
+                    // if we find an invalid ascii char, we continue from that offset
+                    // with utf8 checking.
+                    let mut valid_utf8 = true;
+                    let mut valid_ascii = true;
+                    let mut offset = 0;
 
-                    for &end in &v.offsets[1..] {
-                        let slice = v.data.get_unchecked(start..end as usize);
-                        start = end as usize;
-                        is_valid &= simdutf8::basic::from_utf8(slice).is_ok();
+                    // first we scan through the data checking ascii
+                    // if a byte is invalid we break the loop
+                    // and use that loop idx to find the offsets in the utf8 buffer
+                    // the offsets in the buffer are needed to recreate the substrings
+                    // in utf8 checking
+                    for (i, &v) in v.data.iter().enumerate() {
+                        if v > 127 {
+                            valid_ascii = false;
+                            offset = i as i64;
+                            break;
+                        }
                     }
 
-                    if !is_valid {
-                        return Err(PolarsError::ComputeError("invalid utf8 data in csv".into()));
+                    // if valid ascii, we are done here.
+                    if !valid_ascii {
+                        let mut idx = 0usize;
+
+                        // find offset in offset buffer that matched the failing ascii bytes
+                        for (i, &v) in v.offsets.iter().enumerate() {
+                            if v > offset {
+                                idx = i;
+                                break;
+                            }
+                        }
+
+                        // if the first byte was already invalid, we maximize by an offset of 0.
+                        let mut start = v.offsets[std::cmp::max(0, idx - 1)] as usize;
+
+                        // create substrings and check utf8 validity
+                        for &end in &v.offsets[idx..] {
+                            let slice = v.data.get_unchecked(start..end as usize);
+                            start = end as usize;
+                            valid_utf8 &= simdutf8::basic::from_utf8(slice).is_ok();
+                        }
+
+                        if !valid_utf8 {
+                            return Err(PolarsError::ComputeError(
+                                "invalid utf8 data in csv".into(),
+                            ));
+                        }
                     }
                 }