Skip to content

Commit

Permalink
feat(rust, python): tz-aware strptime (#5736)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Dec 7, 2022
1 parent c01032a commit efcb15f
Show file tree
Hide file tree
Showing 13 changed files with 216 additions and 62 deletions.
18 changes: 14 additions & 4 deletions polars/polars-core/src/fmt.rs
Original file line number Diff line number Diff line change
Expand Up @@ -699,10 +699,20 @@ impl Display for AnyValue<'_> {
Some(_tz) => {
#[cfg(feature = "timezones")]
{
let tz = _tz.parse::<chrono_tz::Tz>().unwrap();
let dt_utc = chrono::Utc.from_local_datetime(&ndt).unwrap();
let dt_tz_aware = dt_utc.with_timezone(&tz);
write!(f, "{}", dt_tz_aware)
match _tz.parse::<chrono_tz::Tz>() {
Ok(tz) => {
let dt_utc = chrono::Utc.from_local_datetime(&ndt).unwrap();
let dt_tz_aware = dt_utc.with_timezone(&tz);
write!(f, "{}", dt_tz_aware)
}
Err(_) => match parse_offset(_tz) {
Ok(offset) => {
let dt_tz_aware = offset.from_utc_datetime(&ndt);
write!(f, "{}", dt_tz_aware)
}
Err(_) => write!(f, "invalid timezone"),
},
}
}
#[cfg(not(feature = "timezones"))]
{
Expand Down
2 changes: 1 addition & 1 deletion polars/polars-io/src/csv/read_impl/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ pub(crate) fn cast_columns(
(Utf8, Datetime(tu, _)) => s
.utf8()
.unwrap()
.as_datetime(None, *tu, false)
.as_datetime(None, *tu, false, false)
.map(|ca| ca.into_series()),
(_, dt) => s.cast(dt),
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ pub(super) fn strptime(s: &Series, options: &StrpTimeOptions) -> PolarsResult<Se
}
DataType::Datetime(tu, _) => {
if options.exact {
ca.as_datetime(options.fmt.as_deref(), *tu, options.cache)?
ca.as_datetime(options.fmt.as_deref(), *tu, options.cache, options.tz_aware)?
.into_series()
} else {
ca.as_datetime_not_exact(options.fmt.as_deref(), *tu)?
Expand Down
3 changes: 3 additions & 0 deletions polars/polars-lazy/polars-plan/src/dsl/options.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ pub struct StrpTimeOptions {
pub exact: bool,
/// use a cache of unique, converted dates to apply the datetime conversion.
pub cache: bool,
/// Parse a timezone aware timestamp
pub tz_aware: bool,
}

impl Default for StrpTimeOptions {
Expand All @@ -29,6 +31,7 @@ impl Default for StrpTimeOptions {
strict: false,
exact: false,
cache: true,
tz_aware: false,
}
}
}
Expand Down
156 changes: 110 additions & 46 deletions polars/polars-time/src/chunkedarray/utf8/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -388,12 +388,17 @@ pub trait Utf8Methods: AsUtf8 {
fmt: Option<&str>,
tu: TimeUnit,
cache: bool,
mut tz_aware: bool,
) -> PolarsResult<DatetimeChunked> {
let utf8_ca = self.as_utf8();
let fmt = match fmt {
Some(fmt) => fmt,
None => return infer::to_datetime(utf8_ca, tu),
};
// todo! use regex?
if fmt.contains("%z") || fmt.contains("%:z") || fmt.contains("%#z") {
tz_aware = true;
}
let fmt = self::strptime::compile_fmt(fmt);
let cache = cache && utf8_ca.len() > 50;

Expand All @@ -403,64 +408,123 @@ pub trait Utf8Methods: AsUtf8 {
TimeUnit::Milliseconds => datetime_to_timestamp_ms,
};

let mut cache_map = PlHashMap::new();

// we can use the fast parser
let mut ca: Int64Chunked = if let Some(fmt_len) = self::strptime::fmt_len(fmt.as_bytes()) {
let convert = |s: &str| {
// Safety:
// fmt_len is correct, it was computed with this `fmt` str.
match unsafe { self::strptime::parse(s.as_bytes(), fmt.as_bytes(), fmt_len) } {
// fallback to chrono
None => NaiveDateTime::parse_from_str(s, &fmt).ok(),
Some(v) => Some(v),
}
.map(func)
};
if utf8_ca.null_count() == 0 {
utf8_ca
.into_no_null_iter()
.map(|val| {
if cache {
*cache_map.entry(val).or_insert_with(|| convert(val))
} else {
convert(val)
if tz_aware {
#[cfg(feature = "timezones")]
{
use chrono::DateTime;
use polars_arrow::export::hashbrown::hash_map::Entry;
let mut cache_map = PlHashMap::new();
let mut tz = None;

let mut convert = |s: &str| {
DateTime::parse_from_str(s, &fmt).ok().map(|dt| {
match tz {
None => tz = Some(dt.timezone()),
Some(tz_found) => {
if tz_found != dt.timezone() {
return Err(PolarsError::ComputeError(
"Different timezones found during 'strptime' operation.".into(),
));
}
}
}
Ok(func(dt.naive_utc()))
}).transpose()
};

let mut ca: Int64Chunked = utf8_ca
.into_iter()
.map(|opt_s| {
opt_s
.map(|s| {
let out = if cache {
match cache_map.entry(s) {
Entry::Vacant(entry) => {
let value = convert(s)?;
entry.insert(value);
value
}
Entry::Occupied(val) => *val.get(),
}
} else {
convert(s)?
};
Ok(out)
})
.transpose()
.map(|options| options.flatten())
})
.collect_trusted()
.collect::<PolarsResult<_>>()?;

let tz = tz.map(|of| format!("{}", of));
ca.rename(utf8_ca.name());
Ok(ca.into_datetime(tu, tz))
}
#[cfg(not(feature = "timezones"))]
{
panic!("activate 'timezones' feature")
}
} else {
let mut cache_map = PlHashMap::new();
// we can use the fast parser
let mut ca: Int64Chunked = if let Some(fmt_len) =
self::strptime::fmt_len(fmt.as_bytes())
{
let convert = |s: &str| {
// Safety:
// fmt_len is correct, it was computed with this `fmt` str.
match unsafe { self::strptime::parse(s.as_bytes(), fmt.as_bytes(), fmt_len) } {
// fallback to chrono
None => NaiveDateTime::parse_from_str(s, &fmt).ok(),
Some(v) => Some(v),
}
.map(func)
};
if utf8_ca.null_count() == 0 {
utf8_ca
.into_no_null_iter()
.map(|val| {
if cache {
*cache_map.entry(val).or_insert_with(|| convert(val))
} else {
convert(val)
}
})
.collect_trusted()
} else {
utf8_ca
.into_iter()
.map(|opt_s| {
opt_s.and_then(|val| {
if cache {
*cache_map.entry(val).or_insert_with(|| convert(val))
} else {
convert(val)
}
})
})
.collect_trusted()
}
} else {
let mut cache_map = PlHashMap::new();
utf8_ca
.into_iter()
.map(|opt_s| {
opt_s.and_then(|val| {
opt_s.and_then(|s| {
if cache {
*cache_map.entry(val).or_insert_with(|| convert(val))
*cache_map.entry(s).or_insert_with(|| {
NaiveDateTime::parse_from_str(s, &fmt).ok().map(func)
})
} else {
convert(val)
NaiveDateTime::parse_from_str(s, &fmt).ok().map(func)
}
})
})
.collect_trusted()
}
} else {
utf8_ca
.into_iter()
.map(|opt_s| {
opt_s.and_then(|s| {
if cache {
*cache_map.entry(s).or_insert_with(|| {
NaiveDateTime::parse_from_str(s, &fmt).ok().map(func)
})
} else {
NaiveDateTime::parse_from_str(s, &fmt).ok().map(func)
}
})
})
.collect_trusted()
};

ca.rename(utf8_ca.name());
Ok(ca.into_datetime(tu, None))
};
ca.rename(utf8_ca.name());
Ok(ca.into_datetime(tu, None))
}
}
}

Expand Down
4 changes: 2 additions & 2 deletions polars/polars-time/src/groupby/dynamic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -506,7 +506,7 @@ mod test {
"2020-01-08 23:16:43",
],
)
.as_datetime(None, tu, false)?
.as_datetime(None, tu, false, false)?
.into_series();
let a = Series::new("a", [3, 7, 5, 9, 2, 1]);
let df = DataFrame::new(vec![date, a.clone()])?;
Expand Down Expand Up @@ -544,7 +544,7 @@ mod test {
"2020-01-08 23:16:43",
],
)
.as_datetime(None, TimeUnit::Milliseconds, false)?
.as_datetime(None, TimeUnit::Milliseconds, false, false)?
.into_series();
let a = Series::new("a", [3, 7, 5, 9, 2, 1]);
let df = DataFrame::new(vec![date, a.clone()])?;
Expand Down
6 changes: 5 additions & 1 deletion py-polars/polars/internals/expr/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def strptime(
strict: bool = True,
exact: bool = True,
cache: bool = True,
tz_aware: bool = False,
) -> pli.Expr:
"""
Parse a Utf8 expression to a Date/Datetime/Time type.
Expand All @@ -51,6 +52,9 @@ def strptime(
- If False, allow the format to match anywhere in the target string.
cache
Use a cache of unique, converted dates to apply the datetime conversion.
tz_aware
Parse timezone-aware datetimes. This is enabled automatically when 'fmt'
contains a UTC offset directive (``%z``, ``%:z`` or ``%#z``).
Notes
-----
Expand Down Expand Up @@ -106,7 +110,7 @@ def strptime(
elif datatype == Datetime:
tu = datatype.tu # type: ignore[union-attr]
dtcol = pli.wrap_expr(
self._pyexpr.str_parse_datetime(fmt, strict, exact, cache)
self._pyexpr.str_parse_datetime(fmt, strict, exact, cache, tz_aware)
)
return dtcol if (tu is None) else dtcol.dt.cast_time_unit(tu)
elif datatype == Time:
Expand Down
4 changes: 4 additions & 0 deletions py-polars/polars/internals/series/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def strptime(
strict: bool = True,
exact: bool = True,
cache: bool = True,
tz_aware: bool = False,
) -> pli.Series:
"""
Parse a Series of dtype Utf8 to a Date/Datetime Series.
Expand All @@ -48,6 +49,9 @@ def strptime(
- If False, allow the format to match anywhere in the target string.
cache
Use a cache of unique, converted dates to apply the datetime conversion.
tz_aware
Parse timezone-aware datetimes. This is enabled automatically when 'fmt'
contains a UTC offset directive (``%z``, ``%:z`` or ``%#z``).
Returns
-------
Expand Down
40 changes: 34 additions & 6 deletions py-polars/polars/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,13 +256,41 @@ def _to_python_datetime(
raise NotImplementedError # pragma: no cover


# cache here as we have a single tz per column
# and this function will be called on every conversion
@functools.lru_cache(16)
def _localize_offset(dt: datetime, offset: str) -> datetime:
try:
import pytz
except ImportError:
raise ImportError("pytz needs to be installed to handle datetimes with offsets")
import re

if offset.startswith("-"):
g = re.search(r"-(\d\d):(\d\d)", offset)
if g is None:
raise ValueError(f"Offset: {offset} not understood.")
hours = -int(g.group(1))
minutes = -int(g.group(2))
elif offset.startswith("+"):
g = re.search(r"\+(\d\d):(\d\d)", offset)
if g is None:
raise ValueError(f"Offset: {offset} not understood.")
hours = int(g.group(1))
minutes = int(g.group(2))
else:
raise ValueError(f"Offset: {offset} not understood.")

tz = pytz.FixedOffset(hours * 60 + minutes)
return dt.astimezone(tz)


def _localize(dt: datetime, tz: str) -> datetime:
if not _ZONEINFO_AVAILABLE:
raise ImportError(
"backports.zoneinfo is not installed. Please run "
"`pip install backports.zoneinfo`."
)
return dt.astimezone(zoneinfo.ZoneInfo(tz))
# zone info installation should already be checked
try:
return dt.astimezone(zoneinfo.ZoneInfo(tz))
except zoneinfo.ZoneInfoNotFoundError:
return _localize_offset(dt, tz)


def _in_notebook() -> bool:
Expand Down
12 changes: 11 additions & 1 deletion py-polars/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,17 @@ disable_error_code = [
]

[[tool.mypy.overrides]]
module = ["backports", "pyarrow.*", "polars.polars", "matplotlib.*", "fsspec.*", "connectorx", "IPython.*", "zoneinfo"]
module = [
"backports",
"pyarrow.*",
"polars.polars",
"matplotlib.*",
"fsspec.*",
"connectorx",
"IPython.*",
"zoneinfo",
"pytz",
]
ignore_missing_imports = true

[[tool.mypy.overrides]]
Expand Down
1 change: 1 addition & 0 deletions py-polars/requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
numpy
pandas
pyarrow
pytz
backports.zoneinfo; python_version < '3.9'
tzdata; platform_system == 'Windows'
xlsx2csv
Expand Down

0 comments on commit efcb15f

Please sign in to comment.