Skip to content

Commit

Permalink
improve date/datetime inference (#2925)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Mar 18, 2022
1 parent 331aa0d commit 407a4e2
Show file tree
Hide file tree
Showing 6 changed files with 138 additions and 26 deletions.
4 changes: 2 additions & 2 deletions polars/polars-io/src/csv_core/buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use polars_core::prelude::*;
#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]
use polars_time::chunkedarray::utf8::Pattern;
#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]
use polars_time::prelude::utf8::infer::{compile_single, DatetimeInfer};
use polars_time::prelude::utf8::infer::{infer_pattern_single, DatetimeInfer};

pub(crate) trait PrimitiveParser: PolarsNumericType {
fn parse(bytes: &[u8]) -> Option<Self::Native>;
Expand Down Expand Up @@ -287,7 +287,7 @@ where
// we just checked it is ascii
let val = unsafe { std::str::from_utf8_unchecked(bytes) };
match &mut self.compiled {
None => match compile_single(val) {
None => match infer_pattern_single(val) {
None => {
self.builder.append_null();
Ok(())
Expand Down
4 changes: 2 additions & 2 deletions polars/polars-io/src/csv_core/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ fn infer_field_schema(string: &str, parse_dates: bool) -> DataType {
// Utf8 for them
if string.starts_with('"') {
if parse_dates {
match date_infer::compile_single(&string[1..string.len() - 1]) {
match date_infer::infer_pattern_single(&string[1..string.len() - 1]) {
Some(Pattern::DatetimeYMD | Pattern::DatetimeDMY) => {
DataType::Datetime(TimeUnit::Microseconds, None)
}
Expand All @@ -109,7 +109,7 @@ fn infer_field_schema(string: &str, parse_dates: bool) -> DataType {
} else if INTEGER_RE.is_match(string) {
DataType::Int64
} else if parse_dates {
match date_infer::compile_single(string) {
match date_infer::infer_pattern_single(string) {
Some(Pattern::DatetimeYMD | Pattern::DatetimeDMY) => {
DataType::Datetime(TimeUnit::Microseconds, None)
}
Expand Down
134 changes: 115 additions & 19 deletions polars/polars-time/src/chunkedarray/utf8/infer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,13 @@ impl TryFrom<Pattern> for DatetimeInfer<i64> {
Pattern::DatetimeDMY => Ok(DatetimeInfer {
patterns: patterns::DATETIME_D_M_Y,
latest: patterns::DATETIME_D_M_Y[0],
transform: transform_datetime,
transform: transform_datetime_us,
logical_type: DataType::Datetime(TimeUnit::Microseconds, None),
}),
Pattern::DatetimeYMD => Ok(DatetimeInfer {
patterns: patterns::DATETIME_Y_M_D,
latest: patterns::DATETIME_Y_M_D[0],
transform: transform_datetime,
transform: transform_datetime_us,
logical_type: DataType::Datetime(TimeUnit::Microseconds, None),
}),
_ => Err(PolarsError::ComputeError(
Expand Down Expand Up @@ -81,7 +81,7 @@ impl<T: NativeType> DatetimeInfer<T> {
}
}

pub fn coerce_utf8(&mut self, ca: &Utf8Chunked) -> Series {
fn coerce_utf8(&mut self, ca: &Utf8Chunked) -> Series {
let chunks = ca
.downcast_iter()
.into_iter()
Expand All @@ -92,7 +92,7 @@ impl<T: NativeType> DatetimeInfer<T> {
Arc::new(PrimitiveArray::from_trusted_len_iter(iter)) as ArrayRef
})
.collect();
match self.logical_type {
let mut out = match self.logical_type {
DataType::Date => Int32Chunked::from_chunks(ca.name(), chunks)
.into_series()
.cast(&self.logical_type)
Expand All @@ -102,7 +102,9 @@ impl<T: NativeType> DatetimeInfer<T> {
.cast(&self.logical_type)
.unwrap(),
_ => unreachable!(),
}
};
out.rename(ca.name());
out
}
}

Expand All @@ -113,13 +115,67 @@ fn transform_date(val: &str, fmt: &str) -> Option<i32> {
.map(naive_date_to_date)
}

fn transform_datetime(val: &str, fmt: &str) -> Option<i64> {
NaiveDateTime::parse_from_str(val, fmt)
/// Parse `val` with the format string `fmt` and convert it with
/// `datetime_to_timestamp_ns`.
///
/// A full datetime parse is attempted first. When that fails, `val` is parsed
/// as a bare date and converted at `00:00:00`. Returns `None` when neither
/// parse succeeds.
fn transform_datetime_ns(val: &str, fmt: &str) -> Option<i64> {
    // Preferred path: the value carries a time component.
    if let Ok(dt) = NaiveDateTime::parse_from_str(val, fmt) {
        return Some(datetime_to_timestamp_ns(dt));
    }
    // Fallback: date-only value, taken at midnight.
    let date = NaiveDate::parse_from_str(val, fmt).ok()?;
    Some(datetime_to_timestamp_ns(date.and_hms(0, 0, 0)))
}

/// Parse `val` with the format string `fmt` and convert it with
/// `datetime_to_timestamp_us`.
///
/// Tries a datetime parse first and falls back to a date-only parse taken at
/// `00:00:00`. Returns `None` when both parses fail.
fn transform_datetime_us(val: &str, fmt: &str) -> Option<i64> {
    NaiveDateTime::parse_from_str(val, fmt)
        .ok()
        .map(datetime_to_timestamp_us)
        .or_else(|| {
            // Date-only input: extend it to midnight before converting.
            NaiveDate::parse_from_str(val, fmt)
                .ok()
                .map(|date| datetime_to_timestamp_us(date.and_hms(0, 0, 0)))
        })
}

/// Parse `val` with the format string `fmt` and convert it with
/// `datetime_to_timestamp_ms`.
///
/// Tries a datetime parse first and falls back to a date-only parse taken at
/// `00:00:00`. Returns `None` when both parses fail.
fn transform_datetime_ms(val: &str, fmt: &str) -> Option<i64> {
    NaiveDateTime::parse_from_str(val, fmt)
        .ok()
        // Convert exactly once, straight from the parsed datetime. Chaining a
        // second timestamp conversion here would feed an integer into a
        // function that expects a `NaiveDateTime`.
        .map(datetime_to_timestamp_ms)
        .or_else(|| {
            // Date-only input: extend it to midnight before converting.
            NaiveDate::parse_from_str(val, fmt)
                .ok()
                .map(|nd| datetime_to_timestamp_ms(nd.and_hms(0, 0, 0)))
        })
}

/// Infer which date or datetime pattern family matches `val`.
///
/// Date patterns are tried before datetime patterns, because datetimes are
/// treated as a superset of dates. Returns `None` when neither family matches.
pub fn infer_pattern_single(val: &str) -> Option<Pattern> {
    infer_pattern_date_single(val).or_else(|| infer_pattern_datetime_single(val))
}

/// Infer the datetime pattern family of `val`.
///
/// The day-month-year formats are checked before the year-month-day formats.
/// A family matches when any of its formats parses `val` as a
/// `NaiveDateTime`. Returns `None` when no format in either family parses.
fn infer_pattern_datetime_single(val: &str) -> Option<Pattern> {
    // True when at least one format of the given table parses `val`.
    let parses_with = |formats: &[&str]| {
        formats
            .iter()
            .any(|fmt| NaiveDateTime::parse_from_str(val, fmt).is_ok())
    };

    if parses_with(patterns::DATETIME_D_M_Y) {
        Some(Pattern::DatetimeDMY)
    } else if parses_with(patterns::DATETIME_Y_M_D) {
        Some(Pattern::DatetimeYMD)
    } else {
        None
    }
}

pub fn compile_single(val: &str) -> Option<Pattern> {
fn infer_pattern_date_single(val: &str) -> Option<Pattern> {
if patterns::DATE_D_M_Y
.iter()
.any(|fmt| NaiveDate::parse_from_str(val, fmt).is_ok())
Expand All @@ -130,17 +186,57 @@ pub fn compile_single(val: &str) -> Option<Pattern> {
.any(|fmt| NaiveDate::parse_from_str(val, fmt).is_ok())
{
Some(Pattern::DateYMD)
} else if patterns::DATETIME_D_M_Y
.iter()
.any(|fmt| NaiveDateTime::parse_from_str(val, fmt).is_ok())
{
Some(Pattern::DatetimeDMY)
} else if patterns::DATETIME_Y_M_D
.iter()
.any(|fmt| NaiveDateTime::parse_from_str(val, fmt).is_ok())
{
Some(Pattern::DatetimeYMD)
} else {
None
}
}

/// Convert a string column to a datetime column with time unit `tu`,
/// inferring the format from the data.
///
/// The pattern family comes from the first value, at or after the first
/// non-null entry, that matches a known datetime pattern. A column with no
/// non-null value yields an all-null datetime column of the same length.
///
/// # Errors
/// Returns `PolarsError::ComputeError` when no value matches a known datetime
/// pattern, and propagates any error from the final `datetime()` downcast.
///
/// # Panics
/// Panics if `DatetimeInfer::<i64>::try_from` rejects the inferred pattern.
#[cfg(feature = "dtype-datetime")]
pub(crate) fn to_datetime(ca: &Utf8Chunked, tu: TimeUnit) -> Result<DatetimeChunked> {
    let first_valid = match ca.first_non_null() {
        Some(idx) => idx,
        None => return Ok(Int64Chunked::full_null(ca.name(), ca.len()).into_datetime(tu, None)),
    };

    // Only the values from the first non-null entry onwards are scanned.
    let candidates = ca.slice(first_valid as i64, ca.len());
    let pattern = candidates
        .into_iter()
        .find_map(|opt_val| opt_val.and_then(infer_pattern_datetime_single))
        .ok_or_else(|| {
            PolarsError::ComputeError(
                "Could not find an appropriate format to parse dates, please define a fmt"
                    .into(),
            )
        })?;

    let mut infer = DatetimeInfer::<i64>::try_from(pattern).unwrap();
    // Swap in the conversion routine that matches the requested time unit.
    infer.transform = match tu {
        TimeUnit::Nanoseconds => transform_datetime_ns,
        TimeUnit::Microseconds => transform_datetime_us,
        TimeUnit::Milliseconds => transform_datetime_ms,
    };

    // The whole input column is coerced, not just the scanned remainder.
    let parsed = infer.coerce_utf8(ca);
    let mut out = parsed.datetime()?.clone();
    out.set_time_unit(tu);
    Ok(out)
}
/// Convert a string column to a date column, inferring the format from the
/// data.
///
/// The pattern family comes from the first value, at or after the first
/// non-null entry, that matches a known date pattern. A column with no
/// non-null value yields an all-null date column of the same length.
///
/// # Errors
/// Returns `PolarsError::ComputeError` when no value matches a known date
/// pattern, and propagates any error from the final `date()` downcast.
///
/// # Panics
/// Panics if `DatetimeInfer::<i32>::try_from` rejects the inferred pattern.
#[cfg(feature = "dtype-date")]
pub(crate) fn to_date(ca: &Utf8Chunked) -> Result<DateChunked> {
    let first_valid = match ca.first_non_null() {
        Some(idx) => idx,
        None => return Ok(Int32Chunked::full_null(ca.name(), ca.len()).into_date()),
    };

    // Only the values from the first non-null entry onwards are scanned.
    let candidates = ca.slice(first_valid as i64, ca.len());
    let pattern = candidates
        .into_iter()
        .find_map(|opt_val| opt_val.and_then(infer_pattern_date_single))
        .ok_or_else(|| {
            PolarsError::ComputeError(
                "Could not find an appropriate format to parse dates, please define a fmt"
                    .into(),
            )
        })?;

    let mut infer = DatetimeInfer::<i32>::try_from(pattern).unwrap();
    // The whole input column is coerced, not just the scanned remainder.
    let parsed = infer.coerce_utf8(ca);
    let out = parsed.date()?.clone();
    Ok(out)
}
4 changes: 2 additions & 2 deletions polars/polars-time/src/chunkedarray/utf8/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,7 @@ impl Utf8Methods for Utf8Chunked {
fn as_date(&self, fmt: Option<&str>) -> Result<DateChunked> {
let fmt = match fmt {
Some(fmt) => fmt,
None => sniff_fmt_date(self)?,
None => return infer::to_date(self),
};

let mut ca: Int32Chunked = match self.has_validity() {
Expand Down Expand Up @@ -348,7 +348,7 @@ impl Utf8Methods for Utf8Chunked {
fn as_datetime(&self, fmt: Option<&str>, tu: TimeUnit) -> Result<DatetimeChunked> {
let fmt = match fmt {
Some(fmt) => fmt,
None => sniff_fmt_datetime(self)?,
None => return infer::to_datetime(self, tu),
};

let func = match tu {
Expand Down
10 changes: 9 additions & 1 deletion polars/polars-time/src/chunkedarray/utf8/patterns.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@ pub(super) static DATETIME_D_M_Y: &[&str] = &[
// nanoseconds
"%d-%m-%YT%H:%M:%S.%9f",
"%d-%m-%yT%H:%M:%S.%9f",
"%d/%m/%Y 00:00:00",
"%d-%m-%Y 00:00:00",
// no times
"%d-%m-%Y",
"%d-%m-%y",
];

pub(super) static DATETIME_Y_M_D: &[&str] = &[
Expand Down Expand Up @@ -58,9 +63,12 @@ pub(super) static DATETIME_Y_M_D: &[&str] = &[
"%FT%H:%M:%S.%6f",
// nanoseconds
"%FT%H:%M:%S.%9f",
// no times
"%F",
"%Y/%m/%d",
];

#[derive(Eq, Hash, PartialEq)]
#[derive(Eq, Hash, PartialEq, Clone, Copy, Debug)]
pub enum Pattern {
DateDMY,
DateYMD,
Expand Down
8 changes: 8 additions & 0 deletions py-polars/tests/test_datelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -565,3 +565,11 @@ def test_default_negative_every_offset_dynamic_groupby() -> None:
}
)
assert out.frame_equal(expected)


def test_strptime_dates_datetimes() -> None:
    """A column mixing a date-only string and a datetime string parses as Datetime."""
    raw = pl.Series("date", ["2021-04-22", "2022-01-04 00:00:00"])
    expected = [
        datetime(2021, 4, 22, 0, 0),
        datetime(2022, 1, 4, 0, 0),
    ]
    parsed = raw.str.strptime(pl.Datetime).to_list()
    assert parsed == expected

0 comments on commit 407a4e2

Please sign in to comment.