Skip to content

Commit

Permalink
CSV datetime inference 3x performance improvement (#2950)
Browse files Browse the repository at this point in the history
* 3x csv parse dates performance improvement
  • Loading branch information
ritchie46 committed Apr 7, 2022
1 parent 02fa05f commit 02c080a
Show file tree
Hide file tree
Showing 7 changed files with 264 additions and 30 deletions.
44 changes: 21 additions & 23 deletions polars/polars-io/src/csv_core/buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -292,13 +292,23 @@ where
ignore_errors: bool,
_needs_escaping: bool,
) -> Result<()> {
// we only check ascii, as we don't expect non ascii values in dates
if bytes.is_ascii() {
// Safety:
// we just checked it is ascii
let val = unsafe { std::str::from_utf8_unchecked(bytes) };
match &mut self.compiled {
None => match infer_pattern_single(val) {
match &mut self.compiled {
None => {
let val = if bytes.is_ascii() {
// Safety:
// we just checked it is ascii
unsafe { std::str::from_utf8_unchecked(bytes) }
} else if ignore_errors {
self.builder.append_null();
return Ok(());
} else if !ignore_errors && std::str::from_utf8(bytes).is_err() {
return Err(PolarsError::ComputeError("invalid utf8".into()));
} else {
self.builder.append_null();
return Ok(());
};

match infer_pattern_single(val) {
None => {
self.builder.append_null();
Ok(())
Expand All @@ -315,24 +325,12 @@ where
Ok(())
}
},
},
Some(compiled) => {
let parsed = compiled.parse(val);
self.builder.append_option(parsed);
Ok(())
}
}
} else if ignore_errors {
self.builder.append_null();
Ok(())
} else {
Err(PolarsError::ComputeError(
format!(
"could not parse {:?} as date/time",
std::str::from_utf8(bytes)
)
.into(),
))
Some(compiled) => {
self.builder.append_option(compiled.parse_bytes(bytes));
Ok(())
}
}
}
}
Expand Down
1 change: 1 addition & 0 deletions polars/polars-time/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ description = "Time related code for the polars dataframe library"

[dependencies]
chrono = "0.4"
lexical = { version = "6", default-features = false, features = ["std", "parse-floats", "parse-integers"] }
polars-arrow = { version = "0.20.0", path = "../polars-arrow", features = ["compute"] }
polars-core = { version = "0.20.0", path = "../polars-core", features = ["temporal", "dtype-date", "dtype-datetime"] }

Expand Down
44 changes: 44 additions & 0 deletions polars/polars-time/src/chunkedarray/utf8/infer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ use super::patterns;
#[cfg(feature = "dtype-date")]
use crate::chunkedarray::date::naive_date_to_date;
use crate::chunkedarray::utf8::patterns::Pattern;
use crate::chunkedarray::utf8::strptime;
use chrono::{NaiveDate, NaiveDateTime};
use polars_arrow::export::arrow::array::{ArrayRef, PrimitiveArray};
use polars_core::prelude::*;
Expand All @@ -12,6 +13,8 @@ pub struct DatetimeInfer<T> {
patterns: &'static [&'static str],
latest: &'static str,
transform: fn(&str, &str) -> Option<T>,
transform_bytes: fn(&[u8], &[u8], u16) -> Option<T>,
fmt_len: u16,
logical_type: DataType,
}

Expand All @@ -24,12 +27,16 @@ impl TryFrom<Pattern> for DatetimeInfer<i64> {
patterns: patterns::DATETIME_D_M_Y,
latest: patterns::DATETIME_D_M_Y[0],
transform: transform_datetime_us,
transform_bytes: transform_datetime_us_bytes,
fmt_len: 0,
logical_type: DataType::Datetime(TimeUnit::Microseconds, None),
}),
Pattern::DatetimeYMD => Ok(DatetimeInfer {
patterns: patterns::DATETIME_Y_M_D,
latest: patterns::DATETIME_Y_M_D[0],
transform: transform_datetime_us,
transform_bytes: transform_datetime_us_bytes,
fmt_len: 0,
logical_type: DataType::Datetime(TimeUnit::Microseconds, None),
}),
_ => Err(PolarsError::ComputeError(
Expand All @@ -49,12 +56,16 @@ impl TryFrom<Pattern> for DatetimeInfer<i32> {
patterns: patterns::DATE_D_M_Y,
latest: patterns::DATE_D_M_Y[0],
transform: transform_date,
transform_bytes: transform_date_bytes,
fmt_len: 0,
logical_type: DataType::Date,
}),
Pattern::DateYMD => Ok(DatetimeInfer {
patterns: patterns::DATE_Y_M_D,
latest: patterns::DATE_Y_M_D[0],
transform: transform_date,
transform_bytes: transform_date_bytes,
fmt_len: 0,
logical_type: DataType::Date,
}),
_ => Err(PolarsError::ComputeError(
Expand All @@ -71,6 +82,7 @@ impl<T: NativeType> DatetimeInfer<T> {
// try other patterns
None => {
for fmt in self.patterns {
self.fmt_len = 0;
if let Some(parsed) = (self.transform)(val, fmt) {
self.latest = fmt;
return Some(parsed);
Expand All @@ -81,6 +93,29 @@ impl<T: NativeType> DatetimeInfer<T> {
}
}

/// Parse `val` (raw bytes) with the byte-level parser.
///
/// The pattern that matched most recently (`self.latest`) is tried first.
/// If it fails, every pattern in `self.patterns` is tried in order and the
/// first one that parses becomes the new `latest`.
///
/// `self.fmt_len` caches the result of `strptime::fmt_len` for `latest`;
/// the value `0` means "not computed yet".
///
/// Returns `None` when no pattern parses `val`, or when
/// `strptime::fmt_len` yields `None` for the current `latest` pattern.
pub fn parse_bytes(&mut self, val: &[u8]) -> Option<T> {
    if self.fmt_len == 0 {
        self.fmt_len = strptime::fmt_len(self.latest.as_bytes())?;
    }
    if let Some(parsed) = (self.transform_bytes)(val, self.latest.as_bytes(), self.fmt_len) {
        return Some(parsed);
    }

    // try other patterns
    for fmt in self.patterns {
        // The length must be computed per candidate pattern: the cached
        // `self.fmt_len` belongs to `self.latest` and must not be handed to
        // the parser together with a different format string.
        let fmt_len = match strptime::fmt_len(fmt.as_bytes()) {
            Some(len) => len,
            // A pattern without a computable length is skipped instead of
            // aborting the search over the remaining patterns.
            None => continue,
        };
        if let Some(parsed) = (self.transform_bytes)(val, fmt.as_bytes(), fmt_len) {
            // Keep the cached pattern and its cached length in sync.
            self.latest = fmt;
            self.fmt_len = fmt_len;
            return Some(parsed);
        }
    }
    None
}

fn coerce_utf8(&mut self, ca: &Utf8Chunked) -> Series {
let chunks = ca
.downcast_iter()
Expand Down Expand Up @@ -115,6 +150,11 @@ fn transform_date(val: &str, fmt: &str) -> Option<i32> {
.map(naive_date_to_date)
}

/// Parse `val` according to `fmt` with the byte-level parser and convert the
/// date part of the result with `naive_date_to_date`.
///
/// Returns `None` when `strptime::parse` does.
#[cfg(feature = "dtype-date")]
fn transform_date_bytes(val: &[u8], fmt: &[u8], fmt_len: u16) -> Option<i32> {
    // NOTE(review): `strptime::parse` is unsafe; presumably `fmt_len` must be
    // the length computed for this exact `fmt` — confirm against `strptime`.
    let datetime = unsafe { strptime::parse(val, fmt, fmt_len) };
    match datetime {
        Some(ndt) => Some(naive_date_to_date(ndt.date())),
        None => None,
    }
}

fn transform_datetime_ns(val: &str, fmt: &str) -> Option<i64> {
let out = NaiveDateTime::parse_from_str(val, fmt)
.ok()
Expand All @@ -139,6 +179,10 @@ fn transform_datetime_us(val: &str, fmt: &str) -> Option<i64> {
}
}

/// Parse `val` according to `fmt` with the byte-level parser and convert the
/// result with `datetime_to_timestamp_us`.
///
/// Returns `None` when `strptime::parse` does.
fn transform_datetime_us_bytes(val: &[u8], fmt: &[u8], fmt_len: u16) -> Option<i64> {
    // NOTE(review): `strptime::parse` is unsafe; presumably `fmt_len` must be
    // the length computed for this exact `fmt` — confirm against `strptime`.
    let datetime = unsafe { strptime::parse(val, fmt, fmt_len) };
    datetime.map(datetime_to_timestamp_us)
}

fn transform_datetime_ms(val: &str, fmt: &str) -> Option<i64> {
let out = NaiveDateTime::parse_from_str(val, fmt)
.ok()
Expand Down
8 changes: 5 additions & 3 deletions polars/polars-time/src/chunkedarray/utf8/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
pub mod infer;
mod patterns;
mod strptime;

use super::*;
#[cfg(feature = "dtype-date")]
Expand Down Expand Up @@ -47,12 +48,13 @@ where
// 20210319 23:58:50
"%Y%m%d %H:%M:%S",
// 2019-04-18T02:45:55
"%FT%H:%M:%S",
// %F cannot be parsed by the polars native parser
"%Y-%m-%dT%H:%M:%S",
// 2019-04-18T02:45:55.555000000
// microseconds
"%FT%H:%M:%S.%6f",
"%Y-%m-%dT%H:%M:%S.%6f",
// nanoseconds
"%FT%H:%M:%S.%9f",
"%Y-%m-%dT%H:%M:%S.%9f",
] {
if convert(val, fmt).is_ok() {
return Some(fmt);
Expand Down
12 changes: 8 additions & 4 deletions polars/polars-time/src/chunkedarray/utf8/patterns.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ pub(super) static DATE_Y_M_D: &[&str] = &[
"%y_%m_%d",
];

/// NOTE: don't use single-letter date specifiers like %F;
/// the polars parser does not support them, so parsing will be slower.
pub(super) static DATETIME_D_M_Y: &[&str] = &[
// 31/12/21 12:54:98
"%d/%m/%y %H:%M:%S",
Expand All @@ -38,6 +40,8 @@ pub(super) static DATETIME_D_M_Y: &[&str] = &[
"%d-%m-%y",
];

/// NOTE: don't use single-letter date specifiers like %F;
/// the polars parser does not support them, so parsing will be slower.
pub(super) static DATETIME_Y_M_D: &[&str] = &[
// 21/12/31 12:54:98
"%y/%m/%d %H:%M:%S",
Expand All @@ -57,14 +61,14 @@ pub(super) static DATETIME_Y_M_D: &[&str] = &[
// 20210319 23:58:50
"%Y%m%d %H:%M:%S",
// 2019-04-18T02:45:55
"%FT%H:%M:%S",
"%Y-%m-%dT%H:%M:%S",
// 2019-04-18T02:45:55.555000000
// microseconds
"%FT%H:%M:%S.%6f",
"%Y-%m-%dT%H:%M:%S.%6f",
// nanoseconds
"%FT%H:%M:%S.%9f",
"%Y-%m-%dT%H:%M:%S.%9f",
// no times
"%F",
"%Y-%m-%d",
"%Y/%m/%d",
];

Expand Down

0 comments on commit 02c080a

Please sign in to comment.