Skip to content

Commit

Permalink
improve csv to date/datetime ergonomics
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Dec 22, 2021
1 parent 83c7d80 commit d6fe3b9
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 14 deletions.
28 changes: 20 additions & 8 deletions polars/polars-core/src/chunked_array/temporal/utf8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,12 @@ where
None
}

fn date_pattern<F, K>(val: &str, convert: F) -> Option<&'static str>
fn datetime_pattern<F, K>(val: &str, convert: F) -> Option<&'static str>
// (string, fmt) -> result
where
F: Fn(&str, &str) -> chrono::ParseResult<K>,
{
for fmt in [
// 2021-12-31
"%Y-%m-%d",
// 31-12-2021
"%d-%m-%Y",
// 21/12/31 12:54:98
"%y/%m/%d %H:%M:%S",
// 2021-12-31 24:58:01
Expand All @@ -45,8 +41,6 @@ where
"%Y/%m/%d %H:%M:%S",
// 20210319 23:58:50
"%Y%m%d %H:%M:%S",
// 2021319 (2021-03-19)
"%Y%m%d",
// 2019-04-18T02:45:55
"%FT%H:%M:%S",
// 2019-04-18T02:45:55.555000000
Expand All @@ -62,6 +56,24 @@ where
None
}

/// Finds the first date-only format string that `convert` accepts for `val`.
///
/// `convert` is invoked as `convert(val, fmt)` (string first, format second) for each
/// candidate format, in the order listed below. The first format for which it returns
/// `Ok` is returned; if every candidate fails, the result is `None`.
fn date_pattern<F, K>(val: &str, convert: F) -> Option<&'static str>
where
    F: Fn(&str, &str) -> chrono::ParseResult<K>,
{
    // Candidate formats; order matters because the first match wins.
    const DATE_FORMATS: [&str; 3] = [
        // 2021-12-31
        "%Y-%m-%d",
        // 31-12-2021
        "%d-%m-%Y",
        // 20210319 (2021-03-19)
        "%Y%m%d",
    ];

    DATE_FORMATS
        .iter()
        .copied()
        .find(|&candidate| convert(val, candidate).is_ok())
}

impl Utf8Chunked {
fn get_first_val(&self) -> Result<&str> {
let idx = match self.first_non_null() {
Expand All @@ -79,7 +91,7 @@ impl Utf8Chunked {
#[cfg(feature = "dtype-datetime")]
fn sniff_fmt_datetime(&self) -> Result<&'static str> {
let val = self.get_first_val()?;
if let Some(pattern) = date_pattern(val, NaiveDateTime::parse_from_str) {
if let Some(pattern) = datetime_pattern(val, NaiveDateTime::parse_from_str) {
return Ok(pattern);
}
Err(PolarsError::ComputeError(
Expand Down
18 changes: 14 additions & 4 deletions polars/polars-io/src/csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -513,7 +513,18 @@ where

// cast to the original dtypes in the schema
for fld in to_cast {
df.may_apply(fld.name(), |s| s.cast(fld.data_type()))?;
use DataType::*;
df.may_apply(fld.name(), |s| match (s.dtype(), fld.data_type()) {
#[cfg(feature = "temporal")]
(Utf8, Date) => s.utf8().unwrap().as_date(None).map(|ca| ca.into_series()),
#[cfg(feature = "temporal")]
(Utf8, Datetime) => s
.utf8()
.unwrap()
.as_datetime(None)
.map(|ca| ca.into_series()),
(_, dt) => s.cast(dt),
})?;
}
df
} else {
Expand Down Expand Up @@ -592,10 +603,9 @@ fn parse_dates(df: DataFrame, fixed_schema: &Schema) -> DataFrame {
*s = ca.into_series();
continue;
}
// the order is important. A datetime can always be parsed as date.
if let Ok(ca) = ca.as_datetime(None) {
if let Ok(ca) = ca.as_date(None) {
*s = ca.into_series()
} else if let Ok(ca) = ca.as_date(None) {
} else if let Ok(ca) = ca.as_datetime(None) {
*s = ca.into_series()
}
}
Expand Down
5 changes: 4 additions & 1 deletion py-polars/polars/internals/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
Iterable,
Iterator,
List,
Mapping,
Optional,
Sequence,
TextIO,
Expand Down Expand Up @@ -365,7 +366,9 @@ def read_csv(
comment_char: Optional[str] = None,
quote_char: Optional[str] = r'"',
skip_rows: int = 0,
dtypes: Optional[Union[Dict[str, Type[DataType]], List[Type[DataType]]]] = None,
dtypes: Optional[
Union[Mapping[str, Type[DataType]], List[Type[DataType]]]
] = None,
null_values: Optional[Union[str, List[str], Dict[str, str]]] = None,
ignore_errors: bool = False,
parse_dates: bool = False,
Expand Down
3 changes: 2 additions & 1 deletion py-polars/polars/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
Dict,
Iterator,
List,
Mapping,
Optional,
TextIO,
Tuple,
Expand Down Expand Up @@ -142,7 +143,7 @@ def read_csv(
comment_char: Optional[str] = None,
quote_char: Optional[str] = r'"',
skip_rows: int = 0,
dtypes: Optional[Union[Dict[str, Type[DataType]], List[Type[DataType]]]] = None,
dtypes: Optional[Union[Mapping[str, Type[DataType]], List[Type[DataType]]]] = None,
null_values: Optional[Union[str, List[str], Dict[str, str]]] = None,
ignore_errors: bool = False,
parse_dates: bool = False,
Expand Down
28 changes: 28 additions & 0 deletions py-polars/tests/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -459,3 +459,31 @@ def test_read_sql() -> None:

except ImportError:
pass # if connectorx not installed on test machine


def test_csv_date_handling() -> None:
    """Check that a CSV date column is read as dates via `parse_dates` and via `dtypes`.

    The input has one quoted-empty row (`""`), which is expected to come back as a null.
    """
    csv = """date
1745-04-02
1742-03-21
1743-06-16
1730-07-22
""
1739-03-16
"""
    raw = csv.encode()

    # (year, month, day) per data row; None marks the row expected to be null.
    ymd_rows = [
        (1745, 4, 2),
        (1742, 3, 21),
        (1743, 6, 16),
        (1730, 7, 22),
        None,
        (1739, 3, 16),
    ]
    expected = pl.DataFrame(
        {"date": [None if ymd is None else date(*ymd) for ymd in ymd_rows]}
    )

    # Reader is asked to parse dates itself.
    parsed = pl.read_csv(raw, parse_dates=True)
    assert parsed.frame_equal(expected, null_equal=True)

    # Caller requests the Date dtype explicitly for the column.
    typed = pl.read_csv(raw, dtypes={"date": pl.Date})
    assert typed.frame_equal(expected, null_equal=True)

0 comments on commit d6fe3b9

Please sign in to comment.