Skip to content

Commit

Permalink
CSV parser: fall back on chrono if the datetime pattern fails (#3436)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed May 19, 2022
1 parent 58b17b0 commit 2d3727e
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 34 deletions.
87 changes: 53 additions & 34 deletions polars/polars-io/src/csv_core/buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,50 @@ impl<T: PolarsNumericType> DatetimeField<T> {
}
}

#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]
/// Slow-path parser for a single datetime/date field.
///
/// Validates the raw field, tries to infer a pattern for it with
/// `infer_pattern_single`, and, when a `DatetimeInfer` can be built from that
/// pattern, parses the value, stores the new inferrer in `buf.compiled` and
/// appends the parse result to `buf.builder`.
///
/// Outcomes:
/// * non-ASCII input is never parsed: a null is appended, unless the bytes are
///   invalid UTF-8 and `ignore_errors` is `false`, in which case a
///   `PolarsError::ComputeError` is returned and nothing is appended;
/// * no pattern inferred, or the pattern cannot be converted into a
///   `DatetimeInfer`: a null is appended;
/// * otherwise the (possibly null) parse result is appended.
fn slow_datetime_parser<T>(
    buf: &mut DatetimeField<T>,
    bytes: &[u8],
    ignore_errors: bool,
) -> Result<()>
where
    T: PolarsNumericType,
    DatetimeInfer<T::Native>: TryFrom<Pattern>,
{
    if !bytes.is_ascii() {
        // The UTF-8 validation only runs when errors must be reported; with
        // `ignore_errors` set the field simply becomes null.
        if !ignore_errors && std::str::from_utf8(bytes).is_err() {
            return Err(PolarsError::ComputeError("invalid utf8".into()));
        }
        buf.builder.append_null();
        return Ok(());
    }

    // Safety:
    // the guard above returned for every non-ASCII input, and ASCII is always
    // valid UTF-8.
    let text = unsafe { std::str::from_utf8_unchecked(bytes) };

    // Collapse "no pattern found" and "pattern not convertible" into one `None`;
    // both end up as a null value.
    let inferred = infer_pattern_single(text)
        .and_then(|pattern| DatetimeInfer::<T::Native>::try_from(pattern).ok());

    match inferred {
        Some(mut infer) => {
            let parsed = infer.parse(text);
            // Keep the freshly built inferrer on the buffer, even when this
            // particular value failed to parse.
            buf.compiled = Some(infer);
            buf.builder.append_option(parsed);
        }
        None => {
            buf.builder.append_null();
        }
    }
    Ok(())
}

#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]
impl<T> ParsedBuffer for DatetimeField<T>
where
Expand All @@ -293,44 +337,19 @@ where
_needs_escaping: bool,
) -> Result<()> {
match &mut self.compiled {
None => {
let val = if bytes.is_ascii() {
// Safety:
// we just checked it is ascii
unsafe { std::str::from_utf8_unchecked(bytes) }
} else if ignore_errors {
self.builder.append_null();
return Ok(());
} else if !ignore_errors && std::str::from_utf8(bytes).is_err() {
return Err(PolarsError::ComputeError("invalid utf8".into()));
} else {
self.builder.append_null();
return Ok(());
};

match infer_pattern_single(val) {
None => {
self.builder.append_null();
None => slow_datetime_parser(self, bytes, ignore_errors),
Some(compiled) => {
match compiled.parse_bytes(bytes) {
Some(parsed) => {
self.builder.append_value(parsed);
Ok(())
}
Some(pattern) => match DatetimeInfer::<T::Native>::try_from(pattern) {
Ok(mut infer) => {
let parsed = infer.parse(val);
self.compiled = Some(infer);
self.builder.append_option(parsed);
Ok(())
}
Err(_) => {
self.builder.append_null();
Ok(())
}
},
// fall back on chrono parser
// this is a lot slower, we need to do utf8 checking and use
// the slower parser
None => slow_datetime_parser(self, bytes, ignore_errors),
}
}
Some(compiled) => {
self.builder.append_option(compiled.parse_bytes(bytes));
Ok(())
}
}
}
}
Expand Down
8 changes: 8 additions & 0 deletions py-polars/tests/io/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,3 +414,11 @@ def quoting_round_trip() -> None:
read_df = pl.read_csv(f)

assert read_df.frame_equal(df)


def fallback_chrono_parser() -> None:
    """Check that date parsing leaves no nulls in either column.

    ``date_2`` mixes non-zero-padded values (``2021-1-1``) with a zero-padded
    one (``2021-10-10``), while ``date_1`` is zero-padded throughout.
    """
    csv_text = """date_1,date_2
2021-01-01,2021-1-1
2021-02-02,2021-2-2
2021-10-10,2021-10-10"""
    frame = pl.read_csv(csv_text.encode(), parse_dates=True)
    # One null count per column, read from the single-row summary frame.
    null_counts = frame.null_count().row(0)
    assert null_counts == (0, 0)

0 comments on commit 2d3727e

Please sign in to comment.