Skip to content

Commit

Permalink
improve datetime inference (#2923)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Mar 17, 2022
1 parent c4e5b5d commit 04f3dfa
Show file tree
Hide file tree
Showing 19 changed files with 489 additions and 85 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ impl CategoricalChunked {
}

/// Retrieve the indexes needed to sort this and the other arrays.
#[cfg(feature = "sort_multiple")]
pub(crate) fn argsort_multiple(&self, other: &[Series], reverse: &[bool]) -> Result<IdxCa> {
if self.use_lexical_sort() {
args_validate(self.logical(), other, reverse)?;
Expand Down
2 changes: 1 addition & 1 deletion polars/polars-io/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ decompress = ["flate2/miniz_oxide"]
decompress-fast = ["flate2/zlib-ng-compat"]
temporal = ["dtype-datetime", "dtype-date", "dtype-time"]
# don't use this
private = []
private = ["polars-time/private"]

[dependencies]
ahash = "0.7"
Expand Down
30 changes: 11 additions & 19 deletions polars/polars-io/src/csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,10 @@ use crate::utils::resolve_homedir;
use crate::{RowCount, SerReader, SerWriter};
pub use arrow::io::csv::write;
use polars_core::prelude::*;
#[cfg(feature = "temporal")]
use polars_time::prelude::*;
#[cfg(feature = "temporal")]
use rayon::prelude::*;
#[cfg(feature = "temporal")]
use std::borrow::Cow;
use std::fs::File;
use std::io::Write;
Expand Down Expand Up @@ -225,7 +225,6 @@ where
aggregate: Option<&'a [ScanAggregation]>,
quote_char: Option<u8>,
skip_rows_after_header: usize,
#[cfg(feature = "temporal")]
parse_dates: bool,
row_count: Option<RowCount>,
}
Expand Down Expand Up @@ -393,7 +392,6 @@ where
}

/// Automatically try to parse dates, datetimes and times. If parsing fails, columns remain of dtype [`DataType::Utf8`].
#[cfg(feature = "temporal")]
pub fn with_parse_dates(mut self, toggle: bool) -> Self {
self.parse_dates = toggle;
self
Expand Down Expand Up @@ -452,7 +450,6 @@ where
aggregate: None,
quote_char: Some(b'"'),
skip_rows_after_header: 0,
#[cfg(feature = "temporal")]
parse_dates: false,
row_count: None,
}
Expand Down Expand Up @@ -534,6 +531,7 @@ where
&to_cast,
self.skip_rows_after_header,
self.row_count,
self.parse_dates,
)?;
csv_reader.as_df()?
} else {
Expand Down Expand Up @@ -564,6 +562,7 @@ where
&[],
self.skip_rows_after_header,
self.row_count,
self.parse_dates,
)?;
csv_reader.as_df()?
};
Expand All @@ -577,7 +576,9 @@ where
df.as_single_chunk_par();
}
}

#[cfg(feature = "temporal")]
// only needed until we can also parse time columns in place
if self.parse_dates {
// determine the schema that's given by the user. That should not be changed
let fixed_schema = match (self.schema_overwrite, self.dtype_overwrite) {
Expand All @@ -602,32 +603,23 @@ where
}

#[cfg(feature = "temporal")]
fn parse_dates(df: DataFrame, fixed_schema: &Schema) -> DataFrame {
let cols = df
.get_columns()
.par_iter()
fn parse_dates(mut df: DataFrame, fixed_schema: &Schema) -> DataFrame {
let cols = std::mem::take(df.get_columns_mut())
.into_par_iter()
.map(|s| {
if let Ok(ca) = s.utf8() {
// don't change columns that are in the fixed schema.
if fixed_schema.index_of(s.name()).is_some() {
return s.clone();
return s;
}

#[cfg(feature = "dtype-time")]
if let Ok(ca) = ca.as_time(None) {
return ca.into_series();
}
#[cfg(feature = "dtype-date")]
if let Ok(ca) = ca.as_date(None) {
return ca.into_series();
}
#[cfg(feature = "dtype-datetime")]
if let Ok(ca) = ca.as_datetime(None, TimeUnit::Milliseconds) {
return ca.into_series();
}
s.clone()
s
} else {
s.clone()
s
}
})
.collect::<Vec<_>>();
Expand Down

0 comments on commit 04f3dfa

Please sign in to comment.