Skip to content

Commit

Permalink
add exact kwarg to strptime
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Jan 10, 2022
1 parent 2f26a0b commit 9171649
Show file tree
Hide file tree
Showing 5 changed files with 158 additions and 7 deletions.
113 changes: 113 additions & 0 deletions polars/polars-core/src/chunked_array/temporal/utf8.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use super::*;
#[cfg(feature = "dtype-time")]
use crate::chunked_array::temporal::time::time_to_time64ns;
use crate::export::chrono::ParseError;
use crate::prelude::*;
use polars_time::export::chrono;

Expand Down Expand Up @@ -74,6 +75,28 @@ where
None
}

/// Local wrapper that exposes the kind of a chrono `ParseError` so it can be matched on.
///
/// Values are produced by the `From<ParseError>` impl in this file, which reinterprets
/// the chrono error's bytes as this type.
struct ParseErrorByteCopy(ParseErrorKind);

impl From<ParseError> for ParseErrorByteCopy {
/// Reinterprets a chrono `ParseError` as a `ParseErrorByteCopy` so callers can
/// inspect which kind of parse failure occurred.
fn from(e: ParseError) -> Self {
// we need to do this until chrono ParseErrorKind is public
// blocked by https://github.com/chronotope/chrono/pull/588
//
// SAFETY (NOTE(review) - confirm against the pinned chrono version): this is only
// sound if `ParseError` is a single-field wrapper around an enum with the same
// variants, in the same order, as the local `ParseErrorKind`. `transmute` checks
// at compile time that both types have the same size, but not that the
// discriminants line up; neither type carries an explicit `#[repr]` here.
unsafe { std::mem::transmute(e) }
}
}

/// Local copy of chrono's (currently private) `ParseErrorKind`.
///
/// Values of this enum are obtained by transmuting a chrono `ParseError` (see
/// `ParseErrorByteCopy`), so the variants and their order presumably have to stay
/// in sync with chrono's own definition - verify when bumping chrono.
// `dead_code` is allowed because most variants are never named in this file;
// they only exist so the transmuted value has a matching variant.
#[allow(dead_code)]
enum ParseErrorKind {
OutOfRange,
Impossible,
NotEnough,
Invalid,
/// The input string has been prematurely ended.
TooShort,
// Matched on by the `*_not_exact` parsers in this file: on this kind they trim
// the end of the input instead of the start.
TooLong,
BadFormat,
}

impl Utf8Chunked {
fn get_first_val(&self) -> Result<&str> {
let idx = match self.first_non_null() {
Expand Down Expand Up @@ -159,6 +182,96 @@ impl Utf8Chunked {
Ok(ca.into())
}

/// Parse every string as a date, accepting a match of `fmt` anywhere inside the string.
///
/// Each value is scanned as follows: the current slice is parsed with `fmt`; if chrono
/// reports `TooLong` (the format matched but input is left over) one character is
/// trimmed from the end, on any other error one character is trimmed from the start.
/// The first successful parse wins. Values for which the slice runs empty without a
/// successful parse become null, as do null inputs.
///
/// # Arguments
/// * `fmt` - chrono format string; when `None` the format is taken from
///   `self.sniff_fmt_date()`.
///
/// # Errors
/// Propagates the error of `self.sniff_fmt_date()` when no format is given.
/// Individual values that fail to parse do not produce an error.
#[cfg(feature = "dtype-date")]
pub fn as_date_not_exact(&self, fmt: Option<&str>) -> Result<DateChunked> {
    let fmt = match fmt {
        Some(fmt) => fmt,
        None => self.sniff_fmt_date()?,
    };
    let mut ca: Int32Chunked = self
        .into_iter()
        .map(|opt_s| {
            let mut s = opt_s?;
            // Every failed attempt removes exactly one character, so the loop ends after
            // at most `s.chars().count()` attempts. The length of `fmt` is deliberately
            // not used as a bound: a format such as "%Y" is shorter than the text it
            // matches, and subtracting it from `s.len()` could underflow.
            while !s.is_empty() {
                match NaiveDate::parse_from_str(s, fmt).map(naive_date_to_date) {
                    Ok(nd) => return Some(nd),
                    Err(e) => {
                        let e: ParseErrorByteCopy = e.into();
                        // Trim through `Chars` so a multi-byte character is never split.
                        let mut chars = s.chars();
                        match e.0 {
                            // The format matched a prefix; drop trailing input.
                            ParseErrorKind::TooLong => {
                                chars.next_back();
                            }
                            // No match at this start position; advance by one character.
                            _ => {
                                chars.next();
                            }
                        }
                        s = chars.as_str();
                    }
                }
            }
            None
        })
        .collect_trusted();
    ca.rename(self.name());
    Ok(ca.into())
}

/// Parse every string as a datetime, accepting a match of `fmt` anywhere inside the string.
///
/// Each value is scanned as follows: the current slice is parsed with `fmt`; if chrono
/// reports `TooLong` (the format matched but input is left over) one character is
/// trimmed from the end, on any other error one character is trimmed from the start.
/// The first successful parse wins. Values for which the slice runs empty without a
/// successful parse become null, as do null inputs.
///
/// # Arguments
/// * `fmt` - chrono format string; when `None` the format is taken from
///   `self.sniff_fmt_datetime()`.
/// * `tu` - time unit of the resulting datetime column; selects the conversion
///   applied to each parsed `NaiveDateTime`.
///
/// # Errors
/// Propagates the error of `self.sniff_fmt_datetime()` when no format is given.
/// Individual values that fail to parse do not produce an error.
#[cfg(feature = "dtype-datetime")]
pub fn as_datetime_not_exact(
    &self,
    fmt: Option<&str>,
    tu: TimeUnit,
) -> Result<DatetimeChunked> {
    let fmt = match fmt {
        Some(fmt) => fmt,
        None => self.sniff_fmt_datetime()?,
    };

    let func = match tu {
        TimeUnit::Nanoseconds => naive_datetime_to_datetime_ns,
        TimeUnit::Milliseconds => naive_datetime_to_datetime_ms,
    };

    let mut ca: Int64Chunked = self
        .into_iter()
        .map(|opt_s| {
            let mut s = opt_s?;
            // Every failed attempt removes exactly one character, so the loop ends after
            // at most `s.chars().count()` attempts. The length of `fmt` is deliberately
            // not used as a bound: a format such as "%Y" is shorter than the text it
            // matches, and subtracting it from `s.len()` could underflow.
            while !s.is_empty() {
                match NaiveDateTime::parse_from_str(s, fmt).map(|dt| func(&dt)) {
                    Ok(nd) => return Some(nd),
                    Err(e) => {
                        let e: ParseErrorByteCopy = e.into();
                        // Trim through `Chars` so a multi-byte character is never split.
                        let mut chars = s.chars();
                        match e.0 {
                            // The format matched a prefix; drop trailing input.
                            ParseErrorKind::TooLong => {
                                chars.next_back();
                            }
                            // No match at this start position; advance by one character.
                            _ => {
                                chars.next();
                            }
                        }
                        s = chars.as_str();
                    }
                }
            }
            None
        })
        .collect_trusted();
    ca.rename(self.name());
    Ok(ca.into_datetime(tu, None))
}

#[cfg(feature = "dtype-date")]
pub fn as_date(&self, fmt: Option<&str>) -> Result<DateChunked> {
let fmt = match fmt {
Expand Down
8 changes: 6 additions & 2 deletions py-polars/polars/internals/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -2437,6 +2437,7 @@ def strptime(
datatype: Union[Type[Date], Type[Datetime]],
fmt: Optional[str] = None,
strict: bool = True,
exact: bool = True,
) -> Expr:
"""
Parse utf8 expression as a Date/Datetime type.
Expand All @@ -2452,15 +2453,18 @@ def strptime(
example: "%y-%m-%d".
strict
raise an error if any conversion fails
exact
- If True, require an exact format match.
- If False, allow the format to match anywhere in the target string.
"""
if not issubclass(datatype, DataType):
raise ValueError(
f"expected: {DataType} got: {datatype}"
) # pragma: no cover
if datatype == Date:
return wrap_expr(self._pyexpr.str_parse_date(fmt, strict))
return wrap_expr(self._pyexpr.str_parse_date(fmt, strict, exact))
elif datatype == Datetime:
return wrap_expr(self._pyexpr.str_parse_datetime(fmt, strict))
return wrap_expr(self._pyexpr.str_parse_datetime(fmt, strict, exact))
else:
raise ValueError(
"dtype should be of type {Date, Datetime}"
Expand Down
6 changes: 5 additions & 1 deletion py-polars/polars/internals/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3420,6 +3420,7 @@ def strptime(
datatype: Union[Type[Date], Type[Datetime]],
fmt: Optional[str] = None,
strict: bool = True,
exact: bool = True,
) -> Series:
"""
Parse a Series of dtype Utf8 to a Date/Datetime Series.
Expand All @@ -3435,6 +3436,9 @@ def strptime(
example: "%y-%m-%d".
strict
raise an error if any conversion fails
exact
- If True, require an exact format match.
- If False, allow the format to match anywhere in the target string.
Returns
-------
Expand All @@ -3443,7 +3447,7 @@ def strptime(
s = wrap_s(self._s)
return (
s.to_frame()
.select(pli.col(s.name).str.strptime(datatype, fmt, strict))
.select(pli.col(s.name).str.strptime(datatype, fmt, strict, exact))
.to_series()
)

Expand Down
16 changes: 12 additions & 4 deletions py-polars/src/lazy/dsl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -336,10 +336,14 @@ impl PyExpr {
self.clone().inner.product().into()
}

pub fn str_parse_date(&self, fmt: Option<String>, strict: bool) -> PyExpr {
pub fn str_parse_date(&self, fmt: Option<String>, strict: bool, exact: bool) -> PyExpr {
let function = move |s: Series| {
let ca = s.utf8()?;
let out = ca.as_date(fmt.as_deref())?;
let out = if exact {
ca.as_date(fmt.as_deref())
} else {
ca.as_date_not_exact(fmt.as_deref())
}?;
if strict {
if out.null_count() != ca.null_count() {
Err(PolarsError::ComputeError(
Expand All @@ -358,10 +362,14 @@ impl PyExpr {
.into()
}

pub fn str_parse_datetime(&self, fmt: Option<String>, strict: bool) -> PyExpr {
pub fn str_parse_datetime(&self, fmt: Option<String>, strict: bool, exact: bool) -> PyExpr {
let function = move |s: Series| {
let ca = s.utf8()?;
let out = ca.as_datetime(fmt.as_deref(), TimeUnit::Milliseconds)?;
let out = if exact {
ca.as_datetime(fmt.as_deref(), TimeUnit::Milliseconds)
} else {
ca.as_datetime_not_exact(fmt.as_deref(), TimeUnit::Milliseconds)
}?;

if strict {
if out.null_count() != ca.null_count() {
Expand Down
22 changes: 22 additions & 0 deletions py-polars/tests/test_datelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import numpy as np
import pyarrow as pa
import pytest
from test_series import verify_series_and_expr_api

import polars as pl

Expand Down Expand Up @@ -343,3 +344,24 @@ def test_to_arrow() -> None:
)
arr = date_series.to_arrow()
assert arr.type == pa.date32()


def test_non_exact_strptime() -> None:
    """Compare `str.strptime` with `exact=True` and `exact=False` on padded date strings."""
    date_fmt = "%Y-%m-%d"
    raw = pl.Series(
        "a", ["2022-01-16", "2022-01-17", "foo2022-01-18", "b2022-01-19ar"]
    )

    # Exact, non-strict parsing: the two values with surrounding text are expected as null.
    exact_result = pl.Series("a", [date(2022, 1, 16), date(2022, 1, 17), None, None])
    verify_series_and_expr_api(
        raw, exact_result, "str.strptime", pl.Date, date_fmt, strict=False, exact=True
    )

    # Non-exact parsing: the date is expected to be found inside every value.
    anywhere_result = pl.Series(
        "a", [date(2022, 1, day) for day in (16, 17, 18, 19)]
    )
    verify_series_and_expr_api(
        raw,
        anywhere_result,
        "str.strptime",
        pl.Date,
        date_fmt,
        strict=False,
        exact=False,
    )

    # Exact and strict: the padded values must make the call raise.
    with pytest.raises(Exception):
        raw.str.strptime(pl.Date, date_fmt, strict=True, exact=True)

0 comments on commit 9171649

Please sign in to comment.