Skip to content

Commit

Permalink
fix(rust, python): csv, read escaped "" as missing (#5912)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Dec 27, 2022
1 parent 52295bb commit 486c7e7
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 6 deletions.
13 changes: 7 additions & 6 deletions polars/polars-io/src/csv/buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ where
let bytes = skip_whitespace(bytes);
return self.parse_bytes(bytes, ignore_errors, needs_escaping);
}
if ignore_errors {
if ignore_errors || bytes.is_empty() {
self.append_null()
} else {
return Err(PolarsError::ComputeError("".into()));
Expand Down Expand Up @@ -349,16 +349,17 @@ impl ParsedBuffer for BooleanChunkedBuilder {
&mut self,
bytes: &[u8],
ignore_errors: bool,
_needs_escaping: bool,
needs_escaping: bool,
) -> PolarsResult<()> {
let bytes = if needs_escaping {
&bytes[1..bytes.len() - 1]
} else {
bytes
};
if bytes.eq_ignore_ascii_case(b"false") {
self.append_value(false);
} else if bytes.eq_ignore_ascii_case(b"true") {
self.append_value(true);
} else if bytes.eq_ignore_ascii_case(b"\"false\"") {
self.append_value(false);
} else if bytes.eq_ignore_ascii_case(b"\"true\"") {
self.append_value(true);
} else if ignore_errors || bytes.is_empty() {
self.append_null();
} else {
Expand Down
22 changes: 22 additions & 0 deletions py-polars/tests/unit/io/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -961,3 +961,25 @@ def test_csv_single_categorical_null() -> None:

assert df.dtypes == [pl.Utf8, pl.Categorical, pl.Utf8]
assert df.to_dict(False) == {"x": ["A"], "y": [None], "z": ["A"]}


def test_csv_quoted_missing() -> None:
# Regression test: a quoted-but-empty field ("") in a column that is read
# with a non-string dtype must come back as a missing value (None).
# The input is pipe-separated with every field quoted; col2 contains
# line breaks inside the quotes, and the second data row has "" for col3.
csv = '''"col1"|"col2"|"col3"|"col4"
"0"|"Free text with a line
break"|"123"|"456"
"1"|"Free text without a linebreak"|""|"789"
"0"|"Free text with
two
linebreaks"|"101112"|"131415"''' # noqa: W291
# col3 is forced to Int32 via `dtypes`, so its empty quoted value cannot be
# kept as an empty string; the expected result below has None in that slot.
assert pl.read_csv(csv.encode(), sep="|", dtypes={"col3": pl.Int32}).to_dict(
False
) == {
"col1": [0, 1, 0],
"col2": [
"Free text with a line\nbreak",
"Free text without a linebreak",
"Free text with \ntwo \nlinebreaks",
],
"col3": [123, None, 101112],
"col4": [456, 789, 131415],
}

0 comments on commit 486c7e7

Please sign in to comment.