Skip to content

Commit

Permalink
csv-parsing automatically parse dates
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Sep 18, 2021
1 parent ef911b4 commit 4e1013d
Show file tree
Hide file tree
Showing 8 changed files with 84 additions and 1 deletion.
2 changes: 1 addition & 1 deletion polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ rows = ["polars-core/rows"]
simd = ["polars-core/simd"]
avx512 = ["polars-core/avx512"]
docs = ["polars-core/docs"]
temporal = ["polars-core/temporal", "polars-lazy/temporal"]
temporal = ["polars-core/temporal", "polars-lazy/temporal", "polars-io/temporal"]
random = ["polars-core/random"]
default = ["docs",
"zip_with",
Expand Down
6 changes: 6 additions & 0 deletions polars/polars-core/src/frame/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1812,6 +1812,12 @@ impl Default for DataFrame {
}
}

impl From<DataFrame> for Vec<Series> {
fn from(df: DataFrame) -> Self {
df.columns
}
}

/// Conversion from Vec<RecordBatch> into DataFrame
///
///
Expand Down
1 change: 1 addition & 0 deletions polars/polars-io/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ csv-file = ["csv-core", "memmap", "lexical", "arrow/io_csv"]
fmt = ["polars-core/plain_fmt"]
decompress = ["flate2/miniz_oxide"]
decompress-fast = ["flate2/zlib-ng-compat"]
temporal = ["polars-core/dtype-date32", "polars-core/dtype-date64"]
# don't use this
private = []

Expand Down
52 changes: 52 additions & 0 deletions polars/polars-io/src/csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,8 @@ where
null_values: Option<NullValues>,
predicate: Option<Arc<dyn PhysicalIoExpr>>,
aggregate: Option<&'a [ScanAggregation]>,
#[cfg(feature = "temporal")]
parse_dates: bool,
}

impl<'a, R> CsvReader<'a, R>
Expand Down Expand Up @@ -359,6 +361,13 @@ where
self
}

/// Toggle automatic parsing of dates / datetimes.
///
/// Stores `toggle` in the reader's `parse_dates` flag and returns the reader so that
/// builder calls can be chained. If parsing fails, columns remain of dtype Utf8.
#[cfg(feature = "temporal")]
pub fn with_parse_dates(self, toggle: bool) -> Self {
    let mut reader = self;
    reader.parse_dates = toggle;
    reader
}

#[cfg(feature = "private")]
pub fn with_predicate(mut self, predicate: Option<Arc<dyn PhysicalIoExpr>>) -> Self {
self.predicate = predicate;
Expand Down Expand Up @@ -410,6 +419,8 @@ where
null_values: None,
predicate: None,
aggregate: None,
#[cfg(feature = "temporal")]
parse_dates: false,
}
}

Expand Down Expand Up @@ -510,10 +521,32 @@ where
if rechunk && df.n_chunks()? > 1 {
df.as_single_chunk();
}
#[cfg(feature = "temporal")]
if self.parse_dates {
df = parse_dates(df)
}
Ok(df)
}
}

/// Try to turn every Utf8 column of `df` into a temporal column.
///
/// For each column that can be viewed as Utf8, a `Date64` conversion is attempted first and a
/// `Date32` conversion second. The first conversion that succeeds replaces the column; when
/// both fail, or when the column is not Utf8, the column is left as it was. The frame is
/// rebuilt from the resulting columns with `DataFrame::new_no_checks`.
#[cfg(feature = "temporal")]
fn parse_dates(df: DataFrame) -> DataFrame {
    let mut columns: Vec<Series> = df.into();

    for column in columns.iter_mut() {
        let replacement = match column.utf8() {
            // The order is important: a datetime can always be parsed as a date,
            // so the datetime (Date64) attempt has to come first.
            Ok(utf8) => utf8
                .as_date64(None)
                .map(|parsed| parsed.into_series())
                .or_else(|_| utf8.as_date32(None).map(|parsed| parsed.into_series()))
                .ok(),
            Err(_) => None,
        };

        if let Some(parsed) = replacement {
            *column = parsed;
        }
    }

    DataFrame::new_no_checks(columns)
}

#[cfg(test)]
mod test {
use crate::prelude::*;
Expand Down Expand Up @@ -1043,4 +1076,23 @@ bar,bar";
assert!(df.frame_equal(&expect));
Ok(())
}

/// With `with_parse_dates(true)` the `timestamp` column of the sample CSV must be read as
/// `Date64` and must not contain nulls.
#[test]
fn test_automatic_datetime_parsing() -> Result<()> {
    let csv = r"timestamp,open,high
2021-01-01 00:00:00,0.00305500,0.00306000
2021-01-01 00:15:00,0.00298800,0.00300400
2021-01-01 00:30:00,0.00298300,0.00300100
2021-01-01 00:45:00,0.00299400,0.00304000
";

    let reader = CsvReader::new(Cursor::new(csv)).with_parse_dates(true);
    let df = reader.finish()?;

    let timestamps = df.column("timestamp")?;
    assert_eq!(timestamps.dtype(), &DataType::Date64);
    assert_eq!(timestamps.null_count(), 0);

    Ok(())
}
}
2 changes: 2 additions & 0 deletions py-polars/polars/eager/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,7 @@ def read_csv(
low_memory: bool = False,
comment_char: Optional[str] = None,
null_values: Optional[Union[str, tp.List[str], Dict[str, str]]] = None,
parse_dates: bool = True,
) -> "DataFrame":
"""
Read a CSV file into a Dataframe.
Expand Down Expand Up @@ -558,6 +559,7 @@ def read_csv(
low_memory,
comment_char,
processed_null_values,
parse_dates,
)
return self

Expand Down
6 changes: 6 additions & 0 deletions py-polars/polars/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ def read_csv(
comment_char: Optional[str] = None,
storage_options: Optional[Dict] = None,
null_values: Optional[Union[str, List[str], Dict[str, str]]] = None,
parse_dates: bool = True,
) -> "pl.DataFrame":
"""
Read into a DataFrame from a csv file.
Expand Down Expand Up @@ -220,6 +221,9 @@ def read_csv(
- str -> all values encountered equal to this string will be null
- List[str] -> A null value per column.
- Dict[str, str] -> A dictionary that maps column name to a null value string.
parse_dates
Try to automatically parse dates. If this does not succeed, the column remains
of data type Utf8.
Returns
-------
Expand All @@ -246,6 +250,7 @@ def read_csv(
and encoding == "utf8"
and not low_memory
and null_values is None
and parse_dates
):
include_columns = None

Expand Down Expand Up @@ -305,6 +310,7 @@ def read_csv(
low_memory=low_memory,
comment_char=comment_char,
null_values=null_values,
parse_dates=parse_dates,
)

if new_columns:
Expand Down
2 changes: 2 additions & 0 deletions py-polars/src/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ impl PyDataFrame {
low_memory: bool,
comment_char: Option<&str>,
null_values: Option<Wrap<NullValues>>,
parse_dates: bool
) -> PyResult<Self> {
let null_values = null_values.map(|w| w.0);
let comment_char = comment_char.map(|s| s.as_bytes()[0]);
Expand Down Expand Up @@ -153,6 +154,7 @@ impl PyDataFrame {
.low_memory(low_memory)
.with_comment_char(comment_char)
.with_null_values(null_values)
.with_parse_dates(parse_dates)
.finish()
.map_err(PyPolarsEr::from)?;
Ok(df.into())
Expand Down
14 changes: 14 additions & 0 deletions py-polars/tests/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,20 @@ def test_csv_null_values():
assert df[1, "b"] is None


def test_datetime_parsing():
    # Reading the sample CSV with read_csv's default arguments must yield a
    # Date64 timestamp column followed by two Float64 columns.
    csv = """
timestamp,open,high
2021-01-01 00:00:00,0.00305500,0.00306000
2021-01-01 00:15:00,0.00298800,0.00300400
2021-01-01 00:30:00,0.00298300,0.00300100
2021-01-01 00:45:00,0.00299400,0.00304000
"""

    frame = pl.read_csv(io.StringIO(csv))
    expected_dtypes = [pl.Date64, pl.Float64, pl.Float64]
    assert frame.dtypes == expected_dtypes


def test_partial_dtype_overwrite():
csv = """
a,b,c
Expand Down

0 comments on commit 4e1013d

Please sign in to comment.