From f3b6c1429135a76cfc420dd23b20ce2b75626417 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sun, 9 Apr 2023 12:23:37 +0100 Subject: [PATCH] feat(rust, python): auto-infer fmt for tz-aware date strings (#7405) Co-authored-by: MarcoGorelli <> --- polars/polars-arrow/src/kernels/time.rs | 6 +- .../src/chunked_array/temporal/datetime.rs | 2 +- polars/polars-io/src/csv/buffer.rs | 24 ++- polars/polars-io/src/csv/utils.rs | 22 +- polars/polars-io/src/ndjson_core/buffer.rs | 6 +- .../src/dsl/function_expr/strings.rs | 8 +- .../src/chunkedarray/utf8/infer.rs | 196 ++++++++++++++---- .../polars-time/src/chunkedarray/utf8/mod.rs | 24 ++- .../src/chunkedarray/utf8/patterns.rs | 10 + polars/polars-time/src/date_range.rs | 2 +- .../tests/unit/datatypes/test_temporal.py | 8 +- .../tests/unit/namespaces/test_strptime.py | 65 +++++- 12 files changed, 289 insertions(+), 84 deletions(-) diff --git a/polars/polars-arrow/src/kernels/time.rs b/polars/polars-arrow/src/kernels/time.rs index 83b88176278f..6cdf9d182a46 100644 --- a/polars/polars-arrow/src/kernels/time.rs +++ b/polars/polars-arrow/src/kernels/time.rs @@ -90,7 +90,7 @@ pub fn replace_timezone( Ok(to_tz) => convert_to_timestamp(from_tz, to_tz, arr, tu)?, Err(_) => match parse_offset(to) { Ok(to_tz) => convert_to_timestamp(from_tz, to_tz, arr, tu)?, - Err(_) => polars_bail!(ComputeError: "unable to parse time zone: {}", to), + Err(_) => polars_bail!(ComputeError: "unable to parse time zone: '{}'", to), }, }, Err(_) => match parse_offset(from) { @@ -98,10 +98,10 @@ pub fn replace_timezone( Ok(to_tz) => convert_to_timestamp(from_tz, to_tz, arr, tu)?, Err(_) => match parse_offset(to) { Ok(to_tz) => convert_to_timestamp(from_tz, to_tz, arr, tu)?, - Err(_) => polars_bail!(ComputeError: "unable to parse time zone: {}", to), + Err(_) => polars_bail!(ComputeError: "unable to parse time zone: '{}'", to), }, }, - Err(_) => polars_bail!(ComputeError: "unable to parse time zone: {}", from), + Err(_) => polars_bail!(ComputeError: "unable to parse time zone: '{}'", from), }, }) } diff --git a/polars/polars-core/src/chunked_array/temporal/datetime.rs b/polars/polars-core/src/chunked_array/temporal/datetime.rs index 706296738e66..c374060284ab 100644 --- a/polars/polars-core/src/chunked_array/temporal/datetime.rs +++ b/polars/polars-core/src/chunked_array/temporal/datetime.rs @@ -26,7 +26,7 @@ fn validate_time_zone(tz: TimeZone) -> PolarsResult<()> { Ok(_) => Ok(()), Err(_) => match tz.parse::() { Ok(_) => Ok(()), - Err(_) => polars_bail!(ComputeError: "unable to parse timezone: '{}'", tz), + Err(_) => polars_bail!(ComputeError: "unable to parse time zone: '{}'", tz), }, } } diff --git a/polars/polars-io/src/csv/buffer.rs b/polars/polars-io/src/csv/buffer.rs index ebef67eb09cd..c3e69c9fe127 100644 --- a/polars/polars-io/src/csv/buffer.rs +++ b/polars/polars-io/src/csv/buffer.rs @@ -428,18 +428,20 @@ where buf.builder.append_null(); Ok(()) } - Some(pattern) => match DatetimeInfer::::try_from(pattern) { - Ok(mut infer) => { - let parsed = infer.parse(val); - buf.compiled = Some(infer); - buf.builder.append_option(parsed); - Ok(()) - } - Err(_) => { - buf.builder.append_null(); - Ok(()) + Some(pattern_with_offset) => { + match DatetimeInfer::::try_from(pattern_with_offset.pattern) { + Ok(mut infer) => { + let parsed = infer.parse(val, pattern_with_offset.offset); + buf.compiled = Some(infer); + buf.builder.append_option(parsed); + Ok(()) + } + Err(_) => { + buf.builder.append_null(); + Ok(()) + } } - }, + } } } diff --git a/polars/polars-io/src/csv/utils.rs b/polars/polars-io/src/csv/utils.rs index a017079c424b..4364c19f10c5 100644 --- a/polars/polars-io/src/csv/utils.rs +++ b/polars/polars-io/src/csv/utils.rs @@ -110,10 +110,13 @@ fn infer_field_schema(string: &str, try_parse_dates: bool) -> DataType { #[cfg(feature = "polars-time")] { match date_infer::infer_pattern_single(&string[1..string.len() - 1]) { - Some(Pattern::DatetimeYMD | Pattern::DatetimeDMY) => { - DataType::Datetime(TimeUnit::Microseconds, None) - } - Some(Pattern::DateYMD | Pattern::DateDMY) => DataType::Date, + Some(pattern_with_offset) => match pattern_with_offset.pattern { + Pattern::DatetimeYMD | Pattern::DatetimeDMY => { + DataType::Datetime(TimeUnit::Microseconds, None) + } + Pattern::DateYMD | Pattern::DateDMY => DataType::Date, + _ => DataType::Utf8, // TODO: support tz-aware patterns + }, None => DataType::Utf8, } } @@ -136,10 +139,13 @@ fn infer_field_schema(string: &str, try_parse_dates: bool) -> DataType { #[cfg(feature = "polars-time")] { match date_infer::infer_pattern_single(string) { - Some(Pattern::DatetimeYMD | Pattern::DatetimeDMY) => { - DataType::Datetime(TimeUnit::Microseconds, None) - } - Some(Pattern::DateYMD | Pattern::DateDMY) => DataType::Date, + Some(pattern_with_offset) => match pattern_with_offset.pattern { + Pattern::DatetimeYMD | Pattern::DatetimeDMY => { + DataType::Datetime(TimeUnit::Microseconds, None) + } + Pattern::DateYMD | Pattern::DateDMY => DataType::Date, + _ => DataType::Utf8, // TODO: support tz-aware patterns + }, None => DataType::Utf8, } } diff --git a/polars/polars-io/src/ndjson_core/buffer.rs b/polars/polars-io/src/ndjson_core/buffer.rs index aff9adce04f6..06a4e055c113 100644 --- a/polars/polars-io/src/ndjson_core/buffer.rs +++ b/polars/polars-io/src/ndjson_core/buffer.rs @@ -154,9 +154,9 @@ where Value::String(s) => s, _ => return None, }; - infer_pattern_single(val).and_then(|pattern| { - match DatetimeInfer::::try_from(pattern) { - Ok(mut infer) => infer.parse(val), + infer_pattern_single(val).and_then(|pattern_with_offset| { + match DatetimeInfer::::try_from(pattern_with_offset.pattern) { + Ok(mut infer) => infer.parse(val, pattern_with_offset.offset), Err(_) => None, } }) diff --git a/polars/polars-lazy/polars-plan/src/dsl/function_expr/strings.rs b/polars/polars-lazy/polars-plan/src/dsl/function_expr/strings.rs index 00e40f59eb04..8393f1966c6e 100644 --- a/polars/polars-lazy/polars-plan/src/dsl/function_expr/strings.rs +++ b/polars/polars-lazy/polars-plan/src/dsl/function_expr/strings.rs @@ -338,8 +338,7 @@ pub(super) fn strptime(s: &Series, options: &StrpTimeOptions) -> PolarsResult { - let tz = match (tz, tz_aware, options.utc) { - (Some(tz), false, false) => Some(tz.clone()), + match (tz, tz_aware, options.utc) { (Some(_), true, _) => polars_bail!( ComputeError: "cannot use strptime with both a tz-aware format and a tz-aware dtype, \ @@ -347,11 +346,10 @@ pub(super) fn strptime(s: &Series, options: &StrpTimeOptions) -> PolarsResult polars_bail!( ComputeError: - "cannot use strptime with both 'utc=True' and tz-aware datetime, \ + "cannot use strptime with both 'utc=True' and tz-aware dtype, \ please drop time zone from the dtype" ), - (None, _, true) => Some("UTC".to_string()), - (None, _, false) => None, + _ => (), }; if options.exact { ca.as_datetime( diff --git a/polars/polars-time/src/chunkedarray/utf8/infer.rs b/polars/polars-time/src/chunkedarray/utf8/infer.rs index f516a5571c64..b847c4bd1b51 100644 --- a/polars/polars-time/src/chunkedarray/utf8/infer.rs +++ b/polars/polars-time/src/chunkedarray/utf8/infer.rs @@ -1,9 +1,9 @@ -use chrono::{NaiveDate, NaiveDateTime}; +use chrono::{DateTime, FixedOffset, NaiveDate, NaiveDateTime}; use polars_arrow::export::arrow::array::PrimitiveArray; use polars_core::prelude::*; use polars_core::utils::arrow::types::NativeType; -use super::patterns; +use super::patterns::{self, PatternWithOffset}; #[cfg(feature = "dtype-date")] use crate::chunkedarray::date::naive_date_to_date; use crate::chunkedarray::utf8::patterns::Pattern; @@ -82,10 +82,11 @@ impl StrpTimeParser for DatetimeInfer { pub struct DatetimeInfer { patterns: &'static [&'static str], latest_fmt: &'static str, - transform: fn(&str, &str) -> Option, + transform: fn(&str, &str, Option, bool) -> Option, transform_bytes: StrpTimeState, fmt_len: u16, pub logical_type: DataType, + utc: bool, } #[cfg(feature = "dtype-datetime")] @@ -101,6 +102,7 @@ impl TryFrom for DatetimeInfer { transform_bytes: StrpTimeState::default(), fmt_len: 0, logical_type: DataType::Datetime(TimeUnit::Microseconds, None), + utc: false, }), Pattern::DatetimeYMD => Ok(DatetimeInfer { patterns: patterns::DATETIME_Y_M_D, @@ -109,6 +111,16 @@ impl TryFrom for DatetimeInfer { transform_bytes: StrpTimeState::default(), fmt_len: 0, logical_type: DataType::Datetime(TimeUnit::Microseconds, None), + utc: false, + }), + Pattern::DatetimeYMDZ => Ok(DatetimeInfer { + patterns: patterns::DATETIME_Y_M_D_Z, + latest_fmt: patterns::DATETIME_Y_M_D_Z[0], + transform: transform_tzaware_datetime_us, + transform_bytes: StrpTimeState::default(), + fmt_len: 0, + logical_type: DataType::Datetime(TimeUnit::Microseconds, None), + utc: false, }), _ => polars_bail!(ComputeError: "could not convert pattern"), } @@ -128,6 +140,7 @@ impl TryFrom for DatetimeInfer { transform_bytes: StrpTimeState::default(), fmt_len: 0, logical_type: DataType::Date, + utc: false, }), Pattern::DateYMD => Ok(DatetimeInfer { patterns: patterns::DATE_Y_M_D, @@ -136,6 +149,7 @@ impl TryFrom for DatetimeInfer { transform_bytes: StrpTimeState::default(), fmt_len: 0, logical_type: DataType::Date, + utc: false, }), _ => polars_bail!(ComputeError: "could not convert pattern"), } @@ -143,14 +157,14 @@ impl TryFrom for DatetimeInfer { } impl DatetimeInfer { - pub fn parse(&mut self, val: &str) -> Option { - match (self.transform)(val, self.latest_fmt) { + pub fn parse(&mut self, val: &str, offset: Option) -> Option { + match (self.transform)(val, self.latest_fmt, offset, self.utc) { Some(parsed) => Some(parsed), // try other patterns None => { for fmt in self.patterns { self.fmt_len = 0; - if let Some(parsed) = (self.transform)(val, fmt) { + if let Some(parsed) = (self.transform)(val, fmt, offset, self.utc) { self.latest_fmt = fmt; return Some(parsed); } @@ -160,13 +174,13 @@ impl DatetimeInfer { } } - fn coerce_utf8(&mut self, ca: &Utf8Chunked) -> Series { + fn coerce_utf8(&mut self, ca: &Utf8Chunked, offset: Option) -> Series { let chunks = ca .downcast_iter() .map(|array| { let iter = array .into_iter() - .map(|opt_val| opt_val.and_then(|val| self.parse(val))); + .map(|opt_val| opt_val.and_then(|val| self.parse(val, offset))); Box::new(PrimitiveArray::from_trusted_len_iter(iter)) as ArrayRef }) .collect(); @@ -187,14 +201,19 @@ impl DatetimeInfer { } #[cfg(feature = "dtype-date")] -fn transform_date(val: &str, fmt: &str) -> Option { +fn transform_date(val: &str, fmt: &str, _offset: Option, _utc: bool) -> Option { NaiveDate::parse_from_str(val, fmt) .ok() .map(naive_date_to_date) } #[cfg(feature = "dtype-datetime")] -pub(crate) fn transform_datetime_ns(val: &str, fmt: &str) -> Option { +pub(crate) fn transform_datetime_ns( + val: &str, + fmt: &str, + _offset: Option, + _utc: bool, +) -> Option { let out = NaiveDateTime::parse_from_str(val, fmt) .ok() .map(datetime_to_timestamp_ns); @@ -205,8 +224,29 @@ pub(crate) fn transform_datetime_ns(val: &str, fmt: &str) -> Option { }) } +fn transform_tzaware_datetime_ns( + val: &str, + fmt: &str, + offset: Option, + utc: bool, +) -> Option { + let dt = DateTime::parse_from_str(val, fmt); + match utc { + true => dt.ok().map(|dt| datetime_to_timestamp_ns(dt.naive_utc())), + false => match Some(dt.ok()?.timezone()) == offset { + true => dt.ok().map(|dt| datetime_to_timestamp_ns(dt.naive_utc())), + false => None, + }, + } +} + #[cfg(feature = "dtype-datetime")] -pub(crate) fn transform_datetime_us(val: &str, fmt: &str) -> Option { +pub(crate) fn transform_datetime_us( + val: &str, + fmt: &str, + _offset: Option, + _utc: bool, +) -> Option { let out = NaiveDateTime::parse_from_str(val, fmt) .ok() .map(datetime_to_timestamp_us); @@ -217,8 +257,29 @@ pub(crate) fn transform_datetime_us(val: &str, fmt: &str) -> Option { }) } +fn transform_tzaware_datetime_us( + val: &str, + fmt: &str, + offset: Option, + utc: bool, +) -> Option { + let dt = DateTime::parse_from_str(val, fmt); + match utc { + true => dt.ok().map(|dt| datetime_to_timestamp_us(dt.naive_utc())), + false => match Some(dt.ok()?.timezone()) == offset { + true => dt.ok().map(|dt| datetime_to_timestamp_us(dt.naive_utc())), + false => None, + }, + } +} + #[cfg(feature = "dtype-datetime")] -pub(crate) fn transform_datetime_ms(val: &str, fmt: &str) -> Option { +pub(crate) fn transform_datetime_ms( + val: &str, + fmt: &str, + _offset: Option, + _utc: bool, +) -> Option { let out = NaiveDateTime::parse_from_str(val, fmt) .ok() .map(datetime_to_timestamp_ms); @@ -229,38 +290,72 @@ pub(crate) fn transform_datetime_ms(val: &str, fmt: &str) -> Option { }) } -pub fn infer_pattern_single(val: &str) -> Option { +fn transform_tzaware_datetime_ms( + val: &str, + fmt: &str, + offset: Option, + utc: bool, +) -> Option { + let dt = DateTime::parse_from_str(val, fmt); + match utc { + true => dt.ok().map(|dt| datetime_to_timestamp_ms(dt.naive_utc())), + false => match Some(dt.ok()?.timezone()) == offset { + true => dt.ok().map(|dt| datetime_to_timestamp_ms(dt.naive_utc())), + false => None, + }, + } +} + +pub fn infer_pattern_single(val: &str) -> Option { // Dates come first, because we see datetimes as superset of dates infer_pattern_date_single(val).or_else(|| infer_pattern_datetime_single(val)) } -fn infer_pattern_datetime_single(val: &str) -> Option { +fn infer_pattern_datetime_single(val: &str) -> Option { if patterns::DATETIME_D_M_Y.iter().any(|fmt| { NaiveDateTime::parse_from_str(val, fmt).is_ok() || NaiveDate::parse_from_str(val, fmt).is_ok() }) { - Some(Pattern::DatetimeDMY) + Some(PatternWithOffset { + pattern: Pattern::DatetimeDMY, + offset: None, + }) } else if patterns::DATETIME_Y_M_D.iter().any(|fmt| { NaiveDateTime::parse_from_str(val, fmt).is_ok() || NaiveDate::parse_from_str(val, fmt).is_ok() }) { - Some(Pattern::DatetimeYMD) + Some(PatternWithOffset { + pattern: Pattern::DatetimeYMD, + offset: None, + }) } else { - None + patterns::DATETIME_Y_M_D_Z + .iter() + .find_map(|fmt| DateTime::parse_from_str(val, fmt).ok()) + .map(|dt| PatternWithOffset { + pattern: Pattern::DatetimeYMDZ, + offset: Some(dt.timezone()), + }) } } -fn infer_pattern_date_single(val: &str) -> Option { +fn infer_pattern_date_single(val: &str) -> Option { if patterns::DATE_D_M_Y .iter() .any(|fmt| NaiveDate::parse_from_str(val, fmt).is_ok()) { - Some(Pattern::DateDMY) + Some(PatternWithOffset { + pattern: Pattern::DateDMY, + offset: None, + }) } else if patterns::DATE_Y_M_D .iter() .any(|fmt| NaiveDate::parse_from_str(val, fmt).is_ok()) { - Some(Pattern::DateYMD) + Some(PatternWithOffset { + pattern: Pattern::DateYMD, + offset: None, + }) } else { None } @@ -271,30 +366,55 @@ pub(crate) fn to_datetime( ca: &Utf8Chunked, tu: TimeUnit, tz: Option<&TimeZone>, + utc: bool, ) -> PolarsResult { match ca.first_non_null() { None => Ok(Int64Chunked::full_null(ca.name(), ca.len()).into_datetime(tu, tz.cloned())), Some(idx) => { let subset = ca.slice(idx as i64, ca.len()); - let pattern = subset + let pattern_with_offset = subset .into_iter() .find_map(|opt_val| opt_val.and_then(infer_pattern_datetime_single)) .ok_or_else(|| polars_err!(parse_fmt_idk = "date"))?; - let mut infer = DatetimeInfer::::try_from(pattern).unwrap(); - match tu { - TimeUnit::Nanoseconds => infer.transform = transform_datetime_ns, - TimeUnit::Microseconds => infer.transform = transform_datetime_us, - TimeUnit::Milliseconds => infer.transform = transform_datetime_ms, + let mut infer = DatetimeInfer::::try_from(pattern_with_offset.pattern)?; + match (tu, pattern_with_offset.offset) { + (TimeUnit::Nanoseconds, None) => infer.transform = transform_datetime_ns, + (TimeUnit::Microseconds, None) => infer.transform = transform_datetime_us, + (TimeUnit::Milliseconds, None) => infer.transform = transform_datetime_ms, + (TimeUnit::Nanoseconds, _) => infer.transform = transform_tzaware_datetime_ns, + (TimeUnit::Microseconds, _) => infer.transform = transform_tzaware_datetime_us, + (TimeUnit::Milliseconds, _) => infer.transform = transform_tzaware_datetime_ms, + } + infer.utc = utc; + if tz.is_some() && pattern_with_offset.offset.is_some() { + polars_bail!(ComputeError: "cannot parse tz-aware values with tz-aware dtype - please drop the time zone from the dtype.") + } + match pattern_with_offset.offset { + #[cfg(feature = "timezones")] + Some(offset) => infer.coerce_utf8(ca, Some(offset)).datetime().map(|ca| { + let mut ca = ca.clone(); + ca.set_time_unit(tu); + match utc { + true => Ok(ca.replace_time_zone(Some("UTC"))?), + false => Ok(ca + .replace_time_zone(Some("UTC"))? + .convert_time_zone(offset.to_string())?), + } + })?, + _ => infer.coerce_utf8(ca, None).datetime().map(|ca| { + let mut ca = ca.clone(); + ca.set_time_unit(tu); + match (tz, utc) { + #[cfg(feature = "timezones")] + (Some(tz), false) => Ok(ca.replace_time_zone(Some(tz))?), + #[cfg(feature = "timezones")] + (None, true) => Ok(ca.replace_time_zone(Some("UTC"))?), + #[cfg(feature = "timezones")] + (Some(_), true) => unreachable!(), // has already been validated in strptime + _ => Ok(ca), + } + })?, } - infer.coerce_utf8(ca).datetime().map(|ca| { - let mut ca = ca.clone(); - ca.set_time_unit(tu); - match tz { - #[cfg(feature = "timezones")] - Some(tz) => Ok(ca.replace_time_zone(Some(tz))?), - _ => Ok(ca), - } - })? } } } @@ -304,12 +424,12 @@ pub(crate) fn to_date(ca: &Utf8Chunked) -> PolarsResult { None => Ok(Int32Chunked::full_null(ca.name(), ca.len()).into_date()), Some(idx) => { let subset = ca.slice(idx as i64, ca.len()); - let pattern = subset + let pattern_with_offset = subset .into_iter() .find_map(|opt_val| opt_val.and_then(infer_pattern_date_single)) .ok_or_else(|| polars_err!(parse_fmt_idk = "date"))?; - let mut infer = DatetimeInfer::::try_from(pattern).unwrap(); - infer.coerce_utf8(ca).date().cloned() + let mut infer = DatetimeInfer::::try_from(pattern_with_offset.pattern).unwrap(); + infer.coerce_utf8(ca, None).date().cloned() } } } diff --git a/polars/polars-time/src/chunkedarray/utf8/mod.rs b/polars/polars-time/src/chunkedarray/utf8/mod.rs index f81b1fe52a8e..160e5b08c415 100644 --- a/polars/polars-time/src/chunkedarray/utf8/mod.rs +++ b/polars/polars-time/src/chunkedarray/utf8/mod.rs @@ -386,13 +386,13 @@ pub trait Utf8Methods: AsUtf8 { tu: TimeUnit, cache: bool, tz_aware: bool, - _utc: bool, + utc: bool, tz: Option<&TimeZone>, ) -> PolarsResult { let utf8_ca = self.as_utf8(); let fmt = match fmt { Some(fmt) => fmt, - None => return infer::to_datetime(utf8_ca, tu, tz), + None => return infer::to_datetime(utf8_ca, tu, tz, utc), }; let fmt = strptime::compile_fmt(fmt); let cache = cache && utf8_ca.len() > 50; @@ -415,7 +415,7 @@ pub trait Utf8Methods: AsUtf8 { DateTime::parse_from_str(s, &fmt) .ok() .map(|dt| { - if !_utc { + if !utc { if let Some(tz_found) = tz { polars_ensure!( tz_found == dt.timezone(), @@ -457,7 +457,7 @@ pub trait Utf8Methods: AsUtf8 { .collect::>()?; ca.rename(utf8_ca.name()); - if !_utc { + if !utc { let tz = tz.map(|of| format!("{of}")); Ok(ca.into_datetime(tu, tz)) } else { @@ -484,7 +484,7 @@ pub trait Utf8Methods: AsUtf8 { // Safety: // fmt_len is correct, it was computed with this `fmt` str. match unsafe { strptime_cache.parse(s.as_bytes(), fmt.as_bytes(), fmt_len) } { - None => transform(s, &fmt), + None => transform(s, &fmt, None, utc), Some(ndt) => Some(func(ndt)), } }; @@ -520,18 +520,24 @@ pub trait Utf8Methods: AsUtf8 { .map(|opt_s| { opt_s.and_then(|s| { if cache { - *cache_map.entry(s).or_insert_with(|| transform(s, &fmt)) + *cache_map + .entry(s) + .or_insert_with(|| transform(s, &fmt, None, false)) } else { - transform(s, &fmt) + transform(s, &fmt, None, false) } }) }) .collect_trusted() }; ca.rename(utf8_ca.name()); - match tz { + match (tz, utc) { #[cfg(feature = "timezones")] - Some(tz) => ca.into_datetime(tu, None).replace_time_zone(Some(tz)), + (Some(tz), false) => ca.into_datetime(tu, None).replace_time_zone(Some(tz)), + #[cfg(feature = "timezones")] + (None, true) => ca.into_datetime(tu, None).replace_time_zone(Some("UTC")), + #[cfg(feature = "timezones")] + (Some(_), true) => unreachable!(), // has already been validated in strptime _ => Ok(ca.into_datetime(tu, None)), } } diff --git a/polars/polars-time/src/chunkedarray/utf8/patterns.rs b/polars/polars-time/src/chunkedarray/utf8/patterns.rs index fbd388bbfb62..65fac4016a18 100644 --- a/polars/polars-time/src/chunkedarray/utf8/patterns.rs +++ b/polars/polars-time/src/chunkedarray/utf8/patterns.rs @@ -1,6 +1,8 @@ //! Patterns are grouped together by order of month, day, year. This is to prevent //! parsing different orders of dates in a single column. +use chrono::FixedOffset; + pub(super) static DATE_D_M_Y: &[&str] = &[ // 8-Jul-2001 "%v", // 8-Jul-2001 @@ -140,10 +142,18 @@ pub(super) static DATETIME_Y_M_D: &[&str] = &[ "%FT%H:%M:%S%.f", ]; +pub(super) static DATETIME_Y_M_D_Z: &[&str] = &["%+"]; + #[derive(Eq, Hash, PartialEq, Clone, Copy, Debug)] pub enum Pattern { DateDMY, DateYMD, DatetimeYMD, DatetimeDMY, + DatetimeYMDZ, +} + +pub struct PatternWithOffset { + pub pattern: Pattern, + pub offset: Option, } diff --git a/polars/polars-time/src/date_range.rs b/polars/polars-time/src/date_range.rs index 76be8f67bb39..801a311a3eb7 100644 --- a/polars/polars-time/src/date_range.rs +++ b/polars/polars-time/src/date_range.rs @@ -52,7 +52,7 @@ pub fn date_range_impl( ) .into_datetime(tu, _tz.cloned()) } - _ => polars_bail!(ComputeError: "unable to parse time zone: {}", tz), + _ => polars_bail!(ComputeError: "unable to parse time zone: '{}'", tz), }, }, _ => Int64Chunked::new_vec( diff --git a/py-polars/tests/unit/datatypes/test_temporal.py b/py-polars/tests/unit/datatypes/test_temporal.py index f5b86bc50728..c8f0e607cb58 100644 --- a/py-polars/tests/unit/datatypes/test_temporal.py +++ b/py-polars/tests/unit/datatypes/test_temporal.py @@ -647,7 +647,7 @@ def test_date_range_lazy_with_expressions( def test_date_range_invalid_time_zone() -> None: - with pytest.raises(ComputeError, match="unable to parse time zone: foo"): + with pytest.raises(ComputeError, match="unable to parse time zone: 'foo'"): pl.date_range( datetime(2001, 1, 1), datetime(2001, 1, 3), interval="1d", time_zone="foo" ) @@ -1949,7 +1949,7 @@ def test_strptime_empty(time_unit: TimeUnit, time_zone: str | None) -> None: def test_strptime_with_invalid_tz() -> None: - with pytest.raises(ComputeError, match="unable to parse time zone: foo"): + with pytest.raises(ComputeError, match="unable to parse time zone: 'foo'"): pl.Series(["2020-01-01 03:00:00"]).str.strptime(pl.Datetime("us", "foo")) with pytest.raises( ComputeError, @@ -1960,7 +1960,7 @@ def test_strptime_with_invalid_tz() -> None: ) with pytest.raises( ComputeError, - match="cannot use strptime with both 'utc=True' and tz-aware datetime", + match="cannot use strptime with both 'utc=True' and tz-aware dtype", ): pl.Series(["2020-01-01 03:00:00"]).str.strptime( pl.Datetime("us", "foo"), "%Y-%m-%d %H:%M:%S", utc=True @@ -1977,7 +1977,7 @@ def test_strptime_unguessable_format() -> None: def test_convert_time_zone_invalid() -> None: ts = pl.Series(["2020-01-01"]).str.strptime(pl.Datetime) - with pytest.raises(ComputeError, match="unable to parse timezone: 'foo'"): + with pytest.raises(ComputeError, match="unable to parse time zone: 'foo'"): ts.dt.replace_time_zone("UTC").dt.convert_time_zone("foo") diff --git a/py-polars/tests/unit/namespaces/test_strptime.py b/py-polars/tests/unit/namespaces/test_strptime.py index e8a168b9b47b..9923d662ec9e 100644 --- a/py-polars/tests/unit/namespaces/test_strptime.py +++ b/py-polars/tests/unit/namespaces/test_strptime.py @@ -164,6 +164,36 @@ def test_strptime_dates_datetimes() -> None: ("2018-09-05T14:24:02.123", datetime(2018, 9, 5, 14, 24, 2, 123000)), ("2019-04-18T02:45:55.555000000", datetime(2019, 4, 18, 2, 45, 55, 555000)), ("2019-04-18T22:45:55.555123", datetime(2019, 4, 18, 22, 45, 55, 555123)), + ( + "2018-09-05T04:05:01+01:00", + datetime(2018, 9, 5, 4, 5, 1, tzinfo=timezone(timedelta(hours=1))), + ), + ( + "2018-09-05T04:24:01.9+01:00", + datetime(2018, 9, 5, 4, 24, 1, 900000, tzinfo=timezone(timedelta(hours=1))), + ), + ( + "2018-09-05T04:24:02.11+01:00", + datetime(2018, 9, 5, 4, 24, 2, 110000, tzinfo=timezone(timedelta(hours=1))), + ), + ( + "2018-09-05T14:24:02.123+01:00", + datetime( + 2018, 9, 5, 14, 24, 2, 123000, tzinfo=timezone(timedelta(hours=1)) + ), + ), + ( + "2019-04-18T02:45:55.555000000+01:00", + datetime( + 2019, 4, 18, 2, 45, 55, 555000, tzinfo=timezone(timedelta(hours=1)) + ), + ), + ( + "2019-04-18T22:45:55.555123+01:00", + datetime( + 2019, 4, 18, 22, 45, 55, 555123, tzinfo=timezone(timedelta(hours=1)) + ), + ), ], ) def test_datetime_strptime_patterns_single(time_string: str, expected: str) -> None: @@ -171,6 +201,39 @@ def test_datetime_strptime_patterns_single(time_string: str, expected: str) -> N assert result == expected +@pytest.mark.parametrize("time_unit", ["ms", "us", "ns"]) +def test_infer_tz_aware_time_unit(time_unit: TimeUnit) -> None: + result = pl.Series(["2020-01-02T04:00:00+02:00"]).str.strptime( + pl.Datetime(time_unit) + ) + assert result.dtype == pl.Datetime(time_unit, "+02:00") + assert result.item() == datetime( + 2020, 1, 2, 4, 0, tzinfo=timezone(timedelta(hours=2)) + ) + + +@pytest.mark.parametrize("time_unit", ["ms", "us", "ns"]) +def test_infer_tz_aware_with_utc(time_unit: TimeUnit) -> None: + result = pl.Series(["2020-01-02T04:00:00+02:00"]).str.strptime( + pl.Datetime(time_unit), utc=True + ) + assert result.dtype == pl.Datetime(time_unit, "UTC") + assert result.item() == datetime(2020, 1, 2, 2, 0, tzinfo=timezone.utc) + + +def test_infer_tz_aware_raises() -> None: + msg = "cannot parse tz-aware values with tz-aware dtype - please drop the time zone from the dtype" + with pytest.raises(ComputeError, match=msg): + pl.Series(["2020-01-02T04:00:00+02:00"]).str.strptime( + pl.Datetime("us", "Europe/Vienna") + ) + msg = "cannot use strptime with both 'utc=True' and tz-aware dtype, please drop time zone from the dtype" + with pytest.raises(ComputeError, match=msg): + pl.Series(["2020-01-02T04:00:00+02:00"]).str.strptime( + pl.Datetime("us", "Europe/Vienna"), utc=True + ) + + def test_datetime_strptime_patterns_consistent() -> None: # note that all should be year first df = pl.Series( @@ -320,7 +383,7 @@ def test_replace_timezone_invalid_timezone() -> None: ts = pl.Series(["2020-01-01 00:00:00+01:00"]).str.strptime( pl.Datetime, "%Y-%m-%d %H:%M:%S%z" ) - with pytest.raises(ComputeError, match=r"unable to parse time zone: foo"): + with pytest.raises(ComputeError, match=r"unable to parse time zone: 'foo'"): ts.dt.replace_time_zone("foo")