Skip to content

Commit

Permalink
First check ASCII before checking UTF-8 in lossy encoding
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Jan 8, 2022
1 parent 838f05a commit 92cf2d2
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 5 deletions.
2 changes: 1 addition & 1 deletion polars/polars-io/src/csv_core/buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ impl ParsedBuffer<Utf8Type> for Utf8Field {
let parse_result = if delay_utf8_validation(self.encoding, ignore_errors) {
true
} else {
simdutf8::basic::from_utf8(bytes).is_ok()
bytes.is_ascii() || simdutf8::basic::from_utf8(bytes).is_ok()
};
let data_len = self.data.len();

Expand Down
9 changes: 5 additions & 4 deletions py-polars/tests/test_strings.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import polars as pl

# Builds a one-column frame, keeps the rows whose first capture group of '^(a)'
# equals 'a', and asserts that the first remaining value is "aron".
# NOTE(review): this looks like the pre-change side of the diff (single quotes,
# no return annotation); body indentation was lost in extraction — confirm.
def test_extract_binary():
df = pl.DataFrame({'foo': ['aron', 'butler', 'charly', 'david']})
out = df.filter(pl.col('foo').str.extract('^(a)', 1) == 'a').to_series()
assert out[0] == "aron"

def test_extract_binary() -> None:
    """Check that filtering on capture group 1 of ``str.extract`` yields "aron" first."""
    frame = pl.DataFrame({"foo": ["aron", "butler", "charly", "david"]})
    # Group 1 of "^(a)" is "a" only for values that begin with "a".
    starts_with_a = pl.col("foo").str.extract("^(a)", 1) == "a"
    matched = frame.filter(starts_with_a).to_series()
    assert matched[0] == "aron"

0 comments on commit 92cf2d2

Please sign in to comment.