Skip to content

Commit

Permalink
check for null values in csv schema inference (#2203)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Dec 28, 2021
1 parent 4ae88c1 commit 47695c4
Show file tree
Hide file tree
Showing 5 changed files with 50 additions and 1 deletion.
16 changes: 16 additions & 0 deletions polars/polars-io/src/csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -601,6 +601,7 @@ fn parse_dates(df: DataFrame, fixed_schema: &Schema) -> DataFrame {

#[cfg(test)]
mod test {
use crate::csv_core::utils::infer_file_schema;
use crate::prelude::*;
use polars_core::datatypes::AnyValue;
use polars_core::prelude::*;
Expand Down Expand Up @@ -1364,4 +1365,19 @@ A3,\"B4_\"\"with_embedded_double_quotes\"\"\",C4,4";

Ok(())
}

/// Regression test for #2203: a value registered as a null marker must be
/// ignored while inferring the CSV schema.
///
/// Column `b` holds the marker `NA` next to integer values; the test passes
/// when both columns of the resulting frame are reported as `Int64`.
#[test]
fn test_null_values_infer_schema() -> Result<()> {
    // The continuation lines of the raw string stay at column 0 so that no
    // leading whitespace ends up inside the CSV payload.
    let csv = r#"a,b
1,2
3,NA
5,6"#;

    // Treat `NA` as null in every column.
    let null_values = Some(NullValues::AllColumns("NA".into()));
    let reader = CsvReader::new(Cursor::new(csv)).with_null_values(null_values);
    let df = reader.finish()?;

    let expected = &[DataType::Int64, DataType::Int64];
    assert_eq!(df.dtypes(), expected);
    Ok(())
}
}
2 changes: 2 additions & 0 deletions polars/polars-io/src/csv_core/csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@ impl<'a> CoreReader<'a> {
&mut skip_rows,
comment_char,
quote_char,
null_values.as_ref(),
)?;
Cow::Owned(inferred_schema)
}
Expand All @@ -191,6 +192,7 @@ impl<'a> CoreReader<'a> {
&mut skip_rows,
comment_char,
quote_char,
null_values.as_ref(),
)?;
Cow::Owned(inferred_schema)
}
Expand Down
31 changes: 30 additions & 1 deletion polars/polars-io/src/csv_core/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ use crate::csv_core::parser::{
next_line_position, skip_bom, skip_line_ending, SplitFields, SplitLines,
};
use crate::mmap::{MmapBytesReader, ReaderBytes};
use crate::prelude::NullValues;
use lazy_static::lazy_static;
use polars_core::datatypes::PlHashSet;
use polars_core::prelude::*;
Expand Down Expand Up @@ -128,6 +129,7 @@ pub fn infer_file_schema(
skip_rows: &mut usize,
comment_char: Option<u8>,
quote_char: Option<u8>,
null_values: Option<&NullValues>,
) -> Result<(Schema, usize)> {
// We use lossy utf8 here because we don't want the schema inference to fail on utf8.
// It may later.
Expand Down Expand Up @@ -237,7 +239,33 @@ pub fn infer_file_schema(
slice
};
let s = parse_bytes_with_encoding(slice_escaped, encoding)?;
column_types[i].insert(infer_field_schema(&s));
match &null_values {
None => {
column_types[i].insert(infer_field_schema(&s));
}
Some(NullValues::Columns(names)) => {
if !names.iter().any(|name| name == s.as_ref()) {
column_types[i].insert(infer_field_schema(&s));
}
}
Some(NullValues::AllColumns(name)) => {
if s.as_ref() != name {
column_types[i].insert(infer_field_schema(&s));
}
}
Some(NullValues::Named(names)) => {
let current_name = &headers[i];
let null_name = &names.iter().find(|name| &name.0 == current_name);

if let Some(null_name) = null_name {
if null_name.1 != s.as_ref() {
column_types[i].insert(infer_field_schema(&s));
}
} else {
column_types[i].insert(infer_field_schema(&s));
}
}
}
}
}
}
Expand Down Expand Up @@ -293,6 +321,7 @@ pub fn infer_file_schema(
skip_rows,
comment_char,
quote_char,
null_values,
);
}

Expand Down
1 change: 1 addition & 0 deletions polars/polars-lazy/src/frame.rs
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ impl<'a> LazyCsvReader<'a> {
&mut self.skip_rows,
self.comment_char,
self.quote_char,
None,
)?;
let schema = f(schema)?;
Ok(self.with_schema(Arc::new(schema)))
Expand Down
1 change: 1 addition & 0 deletions polars/polars-lazy/src/logical_plan/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ impl LogicalPlanBuilder {
&mut skip_rows,
comment_char,
quote_char,
null_values.as_ref(),
)
.expect("could not read schema");
Arc::new(schema)
Expand Down

0 comments on commit 47695c4

Please sign in to comment.