Skip to content

Commit

Permalink
default to native strptime (#3093)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Apr 8, 2022
1 parent 90c2d85 commit f48dc2f
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 38 deletions.
88 changes: 53 additions & 35 deletions polars/polars-time/src/chunkedarray/utf8/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -315,32 +315,40 @@ impl Utf8Methods for Utf8Chunked {
Some(fmt) => fmt,
None => return infer::to_date(self),
};
let fmt = self::strptime::compile_fmt(fmt);

let mut ca: Int32Chunked = match self.has_validity() {
false => self
.into_no_null_iter()
.map(|s| {
NaiveDate::parse_from_str(s, fmt)
.ok()
.map(naive_date_to_date)
})
.collect_trusted(),
_ => self
.into_iter()
// we can use the fast parser
let mut ca: Int32Chunked = if let Some(fmt_len) = self::strptime::fmt_len(fmt.as_bytes()) {
let convert = |s: &str| {
// Safety:
// fmt_len is correct, it was computed with this `fmt` str.
match unsafe { self::strptime::parse(s.as_bytes(), fmt.as_bytes(), fmt_len) } {
// fallback to chrono
None => NaiveDate::parse_from_str(s, &fmt).ok(),
Some(ndt) => Some(ndt.date()),
}
.map(naive_date_to_date)
};

if self.null_count() == 0 {
self.into_no_null_iter().map(convert).collect_trusted()
} else {
self.into_iter()
.map(|opt_s| opt_s.and_then(convert))
.collect_trusted()
}
} else {
self.into_iter()
.map(|opt_s| {
let opt_nd = opt_s.map(|s| {
NaiveDate::parse_from_str(s, fmt)
opt_s.and_then(|s| {
NaiveDate::parse_from_str(s, &fmt)
.ok()
.map(naive_date_to_date)
});
match opt_nd {
None => None,
Some(None) => None,
Some(Some(nd)) => Some(nd),
}
})
})
.collect_trusted(),
.collect_trusted()
};

ca.rename(self.name());
Ok(ca.into())
}
Expand All @@ -352,31 +360,41 @@ impl Utf8Methods for Utf8Chunked {
Some(fmt) => fmt,
None => return infer::to_datetime(self, tu),
};
let fmt = self::strptime::compile_fmt(fmt);

let func = match tu {
TimeUnit::Nanoseconds => datetime_to_timestamp_ns,
TimeUnit::Microseconds => datetime_to_timestamp_us,
TimeUnit::Milliseconds => datetime_to_timestamp_ms,
};

let mut ca: Int64Chunked = match self.has_validity() {
false => self
.into_no_null_iter()
.map(|s| NaiveDateTime::parse_from_str(s, fmt).ok().map(func))
.collect_trusted(),
_ => self
.into_iter()
// we can use the fast parser
let mut ca: Int64Chunked = if let Some(fmt_len) = self::strptime::fmt_len(fmt.as_bytes()) {
let convert = |s: &str| {
// Safety:
// fmt_len is correct, it was computed with this `fmt` str.
match unsafe { self::strptime::parse(s.as_bytes(), fmt.as_bytes(), fmt_len) } {
// fallback to chrono
None => NaiveDateTime::parse_from_str(s, &fmt).ok(),
Some(v) => Some(v),
}
.map(func)
};
if self.null_count() == 0 {
self.into_no_null_iter().map(convert).collect_trusted()
} else {
self.into_iter()
.map(|opt_s| opt_s.and_then(convert))
.collect_trusted()
}
} else {
self.into_iter()
.map(|opt_s| {
let opt_nd =
opt_s.map(|s| NaiveDateTime::parse_from_str(s, fmt).ok().map(func));
match opt_nd {
None => None,
Some(None) => None,
Some(Some(nd)) => Some(nd),
}
opt_s.and_then(|s| NaiveDateTime::parse_from_str(s, &fmt).ok().map(func))
})
.collect_trusted(),
.collect_trusted()
};

ca.rename(self.name());
Ok(ca.into_datetime(tu, None))
}
Expand Down
22 changes: 19 additions & 3 deletions polars/polars-time/src/chunkedarray/utf8/strptime.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,21 @@ unsafe fn update_and_parse<T: lexical::FromLexical>(
.map(|v| (v, new_offset))
}

/// Converts a chrono `fmt` into a `fmt` that the polars parser consumes.
///
/// chrono supports composite, single-letter specifiers such as `%F`, whereas the polars
/// parser only consumes the primitive ones (year, month and day as `%Y`, `%m`, `%d`, ...).
/// The following composites are expanded:
///
/// | composite | expansion    |
/// |-----------|--------------|
/// | `%D`      | `%m/%d/%y`   |
/// | `%R`      | `%H:%M`      |
/// | `%T`      | `%H:%M:%S`   |
/// | `%X`      | `%H:%M:%S`   |
/// | `%F`      | `%Y-%m-%d`   |
///
/// Every other character, and every other `%`-sequence, is copied through unchanged.
///
/// The format is scanned once from left to right and a `%` is always consumed together
/// with the character that follows it. This keeps the escaped percent sign `%%` intact:
/// `%%D` (a literal `%D`) is not mistaken for the `%D` specifier, which a plain
/// substring replacement would do.
pub(super) fn compile_fmt(fmt: &str) -> String {
    let mut compiled = String::with_capacity(fmt.len());
    let mut chars = fmt.chars();

    while let Some(c) = chars.next() {
        if c != '%' {
            compiled.push(c);
            continue;
        }
        // `c` starts a specifier; the next char decides which one.
        match chars.next() {
            Some('D') => compiled.push_str("%m/%d/%y"),
            Some('R') => compiled.push_str("%H:%M"),
            Some('T' | 'X') => compiled.push_str("%H:%M:%S"),
            Some('F') => compiled.push_str("%Y-%m-%d"),
            // Not a composite (this includes the `%%` escape): keep the pair verbatim.
            Some(other) => {
                compiled.push('%');
                compiled.push(other);
            }
            // Dangling `%` at the end of the format: keep it and let the parser reject it.
            None => compiled.push('%'),
        }
    }
    compiled
}

#[inline]
// # Safety
// Caller must ensure that fmt adheres to the fmt rules of chrono and `fmt_len` is correct.
pub unsafe fn parse(val: &[u8], fmt: &[u8], fmt_len: u16) -> Option<NaiveDateTime> {
pub(super) unsafe fn parse(val: &[u8], fmt: &[u8], fmt_len: u16) -> Option<NaiveDateTime> {
const ESCAPE: u8 = b'%';
if val.len() < fmt_len as usize {
return None;
Expand Down Expand Up @@ -61,7 +72,12 @@ pub unsafe fn parse(val: &[u8], fmt: &[u8], fmt_len: u16) -> Option<NaiveDateTim
}
b'y' => {
let new_offset = offset + 2;
year = 2000 + lexical::parse::<i32, _>(&val[offset..new_offset]).ok()?;
let decade = lexical::parse::<i32, _>(&val[offset..new_offset]).ok()?;
if decade < 50 {
year = 2000 + decade;
} else {
year = 1900 + decade;
}
offset = new_offset;
}
b'9' => {
Expand Down Expand Up @@ -92,7 +108,7 @@ pub unsafe fn parse(val: &[u8], fmt: &[u8], fmt_len: u16) -> Option<NaiveDateTim
Some(NaiveDate::from_ymd(year, month, day).and_hms_nano(hour, min, sec, nano))
}

pub fn fmt_len(fmt: &[u8]) -> Option<u16> {
pub(super) fn fmt_len(fmt: &[u8]) -> Option<u16> {
let mut iter = fmt.iter();
let mut cnt = 0;

Expand Down

0 comments on commit f48dc2f

Please sign in to comment.