Skip to content

Commit

Permalink
fix(rust, python): Accept quote_char in SplitLines constructor to pro…
Browse files Browse the repository at this point in the history
…perly parse newlines (#5143)
  • Loading branch information
dannyvankooten committed Oct 8, 2022
1 parent cd46de7 commit ba92cdf
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 7 deletions.
16 changes: 12 additions & 4 deletions polars/polars-io/src/csv/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ pub(crate) fn next_line_position(
}
debug_assert!(pos <= input.len());
let new_input = unsafe { input.get_unchecked(pos..) };
let line = SplitLines::new(new_input, eol_char).next();
let line = SplitLines::new(new_input, quote_char.unwrap_or(b'"'), eol_char).next();

match (line, expected_fields) {
// count the fields, and determine if they are equal to what we expect from the schema
Expand Down Expand Up @@ -214,13 +214,15 @@ pub(crate) fn get_line_stats(
/// For instance: "This is a valid field\nI have multiple lines" is a valid string field that contains multiple lines.
pub(crate) struct SplitLines<'a> {
/// Byte slice handed to `new`; the lines yielded by the iterator borrow from it.
v: &'a [u8],
/// Byte that opens and closes a quoted field. The iterator toggles its
/// "inside a field" state every time it encounters this byte.
quote_char: u8,
/// Byte that marks the end of a line (the tests use `b'\n'`).
/// NOTE(review): presumably only honoured outside quoted fields — the part of
/// the iterator that consumes it is not visible here; confirm.
end_line_char: u8,
}

impl<'a> SplitLines<'a> {
/// Creates a line splitter over `slice`.
///
/// `quote_char` is the byte that delimits quoted fields and `end_line_char`
/// is the end-of-line byte. The arguments are stored on the struct unchanged;
/// nothing is scanned or validated at construction time.
pub(crate) fn new(slice: &'a [u8], end_line_char: u8) -> Self {
pub(crate) fn new(slice: &'a [u8], quote_char: u8, end_line_char: u8) -> Self {
Self {
v: slice,
quote_char,
end_line_char,
}
}
Expand All @@ -244,7 +246,7 @@ impl<'a> Iterator for SplitLines<'a> {
Some(&c) => {
pos += 1;

if c == b'"' {
if c == self.quote_char {
// toggle between string field enclosure
// if we encounter a starting quote char -> in_field = true;
// if we encounter a closing quote char -> in_field = false;
Expand Down Expand Up @@ -641,9 +643,15 @@ mod test {
#[test]
fn test_splitlines() {
// Two records whose second field is double-quoted and contains an embedded
// newline; that newline must stay inside the yielded line.
let input = "1,\"foo\n\"\n2,\"foo\n\"\n";
let mut lines = SplitLines::new(input.as_bytes(), b'\n');
let mut lines = SplitLines::new(input.as_bytes(), b'"', b'\n');
// Each yielded line excludes the terminating end-of-line byte.
assert_eq!(lines.next(), Some("1,\"foo\n\"".as_bytes()));
assert_eq!(lines.next(), Some("2,\"foo\n\"".as_bytes()));
assert_eq!(lines.next(), None);

// Same shape of input, but quoted with single quotes and split with a
// matching custom quote character.
let input2 = "1,'foo\n'\n2,'foo\n'\n";
let mut lines2 = SplitLines::new(input2.as_bytes(), b'\'', b'\n');
assert_eq!(lines2.next(), Some("1,'foo\n'".as_bytes()));
assert_eq!(lines2.next(), Some("2,'foo\n'".as_bytes()));
assert_eq!(lines2.next(), None);
}
}
4 changes: 2 additions & 2 deletions polars/polars-io/src/csv/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ pub fn infer_file_schema(
if bytes.is_empty() {
return Err(PolarsError::NoData("empty csv".into()));
}
let mut lines = SplitLines::new(bytes, eol_char).skip(*skip_rows);
let mut lines = SplitLines::new(bytes, quote_char.unwrap_or(b'"'), eol_char).skip(*skip_rows);
// it can be that we have a single line without eol char
let has_eol = bytes.contains(&eol_char);

Expand Down Expand Up @@ -295,7 +295,7 @@ pub fn infer_file_schema(
};
if !has_header {
// re-init lines so that the header is included in type inference.
lines = SplitLines::new(bytes, eol_char).skip(*skip_rows);
lines = SplitLines::new(bytes, quote_char.unwrap_or(b'"'), eol_char).skip(*skip_rows);
}

let header_length = headers.len();
Expand Down
2 changes: 1 addition & 1 deletion polars/polars-io/src/ndjson_core/ndjson.rs
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,7 @@ fn parse_lines<'a>(

let total_bytes = bytes.len();
let mut offset = 0;
for line in SplitLines::new(bytes, NEWLINE) {
for line in SplitLines::new(bytes, QUOTE_CHAR, NEWLINE) {
offset += 1; // the newline
offset += parse_impl(line, buffers, &mut buf)?;
}
Expand Down
17 changes: 17 additions & 0 deletions polars/tests/it/io/csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,23 @@ fn test_escape_double_quotes() {
)));
}

#[test]
fn test_newline_in_custom_quote_char() {
    // Row 1's second field is wrapped in single quotes and contains a line break.
    // With `'` configured as the quote char (the default is `"`), that break
    // belongs to the field, so the frame must come out as 2 rows by 2 columns.
    let data = r#"column_1,column_2
1,'foo
bar'
2,'bar'
"#;

    let reader = CsvReader::new(Cursor::new(data)).with_quote_char(Some(b'\''));
    let df = reader.finish().unwrap();
    assert_eq!(df.shape(), (2, 2));
}

#[test]
fn test_escape_2() {
// this is harder than it looks.
Expand Down

0 comments on commit ba92cdf

Please sign in to comment.