Skip to content

Commit

Permalink
csv check quoted null values (#2873)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Mar 11, 2022
1 parent 10948a5 commit ef66b85
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 20 deletions.
10 changes: 7 additions & 3 deletions polars/polars-io/src/csv_core/buffer.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
use crate::csv::CsvEncoding;
use crate::csv_core::csv::RunningSize;
use crate::csv_core::parser::drop_quotes;
use crate::csv_core::utils::escape_field;
use arrow::array::Utf8Array;
use arrow::bitmap::MutableBitmap;
Expand Down Expand Up @@ -67,12 +66,17 @@ where
&mut self,
bytes: &[u8],
ignore_errors: bool,
_needs_escaping: bool,
needs_escaping: bool,
) -> Result<()> {
if bytes.is_empty() {
self.append_null()
} else {
let bytes = drop_quotes(bytes);
let bytes = if needs_escaping {
&bytes[1..bytes.len() - 1]
} else {
bytes
};

// legacy comment (remember this if you decide to use Results again):
// it's faster to work on options.
// if we need to throw an error, we parse again to be able to throw the error
Expand Down
22 changes: 5 additions & 17 deletions polars/polars-io/src/csv_core/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -116,23 +116,6 @@ pub(crate) fn skip_whitespace_line_ending_exclude(input: &[u8], exclude: u8) ->
})
}

/// Local version of `slice::starts_with` for a single byte (as it won't inline).
///
/// Returns `true` when `bytes` is non-empty and its first byte equals `needle`.
#[inline]
fn starts_with(bytes: &[u8], needle: u8) -> bool {
    bytes.first() == Some(&needle)
}

/// Slice `"100"` to `100`.
///
/// If the slice starts with `"`, the first and last bytes are dropped. The last byte is
/// *assumed* to be the closing `"`; this is not checked. Be aware of this.
///
/// Input that does not start with `"` is returned unchanged. A lone `"` (a single byte) has no
/// closing quote to drop, so it is also returned unchanged instead of panicking on an
/// inverted slice range.
#[inline]
pub(crate) fn drop_quotes(input: &[u8]) -> &[u8] {
    // At least two bytes are required so that `1..len - 1` is a valid (possibly empty) range.
    if input.len() >= 2 && starts_with(input, b'"') {
        &input[1..input.len() - 1]
    } else {
        input
    }
}

#[inline]
pub(crate) fn skip_line_ending(input: &[u8]) -> &[u8] {
skip_condition(input, is_line_ending)
Expand Down Expand Up @@ -479,6 +462,11 @@ pub(crate) fn parse_lines(
// if we have null values argument, check if this field equal null value
if let Some(null_values) = &null_values {
if let Some(null_value) = null_values.get(processed_fields) {
let field = if needs_escaping && !field.is_empty() {
&field[1..field.len() - 1]
} else {
field
};
if field == null_value.as_bytes() {
add_null = true;
}
Expand Down
17 changes: 17 additions & 0 deletions py-polars/tests/io/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,3 +365,20 @@ def test_write_csv_delimiter() -> None:
df.to_csv(f, sep="\t")
f.seek(0)
assert f.read() == b"a\tb\n1\t1\n2\t2\n3\t3\n"


def test_escaped_null_values() -> None:
    """Quoted fields equal to a column's configured null value are read as null."""
    payload = """
"a","b","c"
"a","n/a","NA"
"None","2","3.0"
"""
    null_markers = {"a": "None", "b": "n/a", "c": "NA"}
    schema = {"a": pl.Utf8, "b": pl.Int64, "c": pl.Float64}
    df = pl.read_csv(
        io.StringIO(payload),
        null_values=null_markers,
        dtypes=schema,
    )

    # (row, column) cells whose quoted content matches that column's null marker.
    for row, column in ((1, "a"), (0, "b"), (0, "c")):
        assert df[row, column] is None

0 comments on commit ef66b85

Please sign in to comment.