Skip to content

Commit

Permalink
improve csv error messages
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Nov 26, 2021
1 parent 0fadc09 commit 4f921e2
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 47 deletions.
70 changes: 29 additions & 41 deletions polars/polars-io/src/csv_core/buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,69 +6,46 @@ use arrow::array::Utf8Array;
use arrow::bitmap::MutableBitmap;
use polars_arrow::prelude::FromDataUtf8;
use polars_core::prelude::*;
use std::fmt::Debug;

/// Turns a low-level parse failure into a user-facing [`PolarsError`].
///
/// Any `Debug` type can opt in with an empty `impl`; the default method
/// supplies the message.
trait ToPolarsError: Debug {
    /// Builds a `PolarsError::ComputeError` whose message embeds the `Debug`
    /// rendering of `self`, followed by a hint about the most common cause
    /// and the available workarounds.
    fn to_polars_err(&self) -> PolarsError {
        // A trailing `\` in a string literal swallows the newline AND the
        // leading whitespace of the next line, so every sentence must carry
        // its own separating space before the continuation.
        PolarsError::ComputeError(
            format!(
                "Could not parse primitive type during csv parsing: {:?}. \
                 This can occur when a column was inferred as integer type but we stumbled upon a floating point value. \
                 You could pass a predefined schema or set `with_ignore_parser_errors` to `true`.",
                self
            )
            .into(),
        )
    }
}

// The default `to_polars_err` is sufficient for the lexical parser's error type.
impl ToPolarsError for lexical::Error {}

pub(crate) trait PrimitiveParser: PolarsNumericType {
fn parse(bytes: &[u8]) -> Result<Self::Native>;
fn parse(bytes: &[u8]) -> Option<Self::Native>;
}

impl PrimitiveParser for Float32Type {
#[inline]
fn parse(bytes: &[u8]) -> Result<f32> {
let a = lexical::parse(bytes).map_err(|e| e.to_polars_err())?;
Ok(a)
fn parse(bytes: &[u8]) -> Option<f32> {
lexical::parse(bytes).ok()
}
}
impl PrimitiveParser for Float64Type {
#[inline]
fn parse(bytes: &[u8]) -> Result<f64> {
let a = lexical::parse(bytes).map_err(|e| e.to_polars_err())?;
Ok(a)
fn parse(bytes: &[u8]) -> Option<f64> {
lexical::parse(bytes).ok()
}
}

impl PrimitiveParser for UInt32Type {
#[inline]
fn parse(bytes: &[u8]) -> Result<u32> {
let a = lexical::parse(bytes).map_err(|e| e.to_polars_err())?;
Ok(a)
fn parse(bytes: &[u8]) -> Option<u32> {
lexical::parse(bytes).ok()
}
}
impl PrimitiveParser for UInt64Type {
#[inline]
fn parse(bytes: &[u8]) -> Result<u64> {
let a = lexical::parse(bytes).map_err(|e| e.to_polars_err())?;
Ok(a)
fn parse(bytes: &[u8]) -> Option<u64> {
lexical::parse(bytes).ok()
}
}
impl PrimitiveParser for Int32Type {
#[inline]
fn parse(bytes: &[u8]) -> Result<i32> {
let a = lexical::parse(bytes).map_err(|e| e.to_polars_err())?;
Ok(a)
fn parse(bytes: &[u8]) -> Option<i32> {
lexical::parse(bytes).ok()
}
}
impl PrimitiveParser for Int64Type {
#[inline]
fn parse(bytes: &[u8]) -> Result<i64> {
let a = lexical::parse(bytes).map_err(|e| e.to_polars_err())?;
Ok(a)
fn parse(bytes: &[u8]) -> Option<i64> {
lexical::parse(bytes).ok()
}
}

Expand Down Expand Up @@ -96,16 +73,14 @@ where
self.append_null()
} else {
let bytes = drop_quotes(bytes);
// legacy comment (remember this if you decide to use Results again):
// it's faster to work on options.
// if we need to throw an error, we parse again to be able to throw the error
let result = T::parse(bytes).ok();

match (result, ignore_errors) {
match (T::parse(bytes), ignore_errors) {
(Some(value), _) => self.append_value(value),
(None, true) => self.append_null(),
(None, _) => {
T::parse(bytes)?;
}
(None, _) => return Err(PolarsError::ComputeError("".into())),
};
}
Ok(())
Expand Down Expand Up @@ -390,6 +365,19 @@ impl Buffer {
};
}

/// Returns the `DataType` that corresponds to this buffer's variant.
///
/// Each `Buffer` variant maps to the `DataType` variant of the same name.
pub(crate) fn dtype(&self) -> DataType {
    // Patterns use `_` only, so matching on the dereferenced value moves nothing.
    match *self {
        Buffer::Boolean(_) => DataType::Boolean,
        Buffer::Int32(_) => DataType::Int32,
        Buffer::Int64(_) => DataType::Int64,
        Buffer::UInt32(_) => DataType::UInt32,
        Buffer::UInt64(_) => DataType::UInt64,
        Buffer::Float32(_) => DataType::Float32,
        Buffer::Float64(_) => DataType::Float64,
        Buffer::Utf8(_) => DataType::Utf8,
    }
}

#[inline]
pub(crate) fn add(
&mut self,
Expand Down
20 changes: 14 additions & 6 deletions polars/polars-io/src/csv_core/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -377,7 +377,7 @@ fn skip_this_line(bytes: &[u8], quote: Option<u8>, offset: usize) -> (&[u8], usi
///
/// # Arguments
/// * `bytes` - input to parse
/// * `offset` - offset in bytes in total input. This is 0 if single threaded. If multithreaded every
/// * `offset` - offset in bytes in total input. This is 0 if single threaded. If multi-threaded every
/// thread has a different offset.
/// * `projection` - Indices of the columns to project.
/// * `buffers` - Parsed output will be written to these buffers. Except for UTF8 data. The offsets of the
Expand Down Expand Up @@ -477,13 +477,21 @@ pub(crate) fn parse_lines(
buf.add_null()
} else {
buf.add(field, ignore_parser_errors, needs_escaping)
.map_err(|e| {
.map_err(|_| {
let bytes_offset = offset + field.as_ptr() as usize - start;
let unparsable = String::from_utf8_lossy(field);
PolarsError::ComputeError(
format!(
"{:?} on thread line {}; on input: {}",
e,
r#"Could not parse {} as dtype {:?} at column {}.
The total offset in the file is {} bytes.
Consider running the parser `with_ignore_parser_errors=true`
or consider adding {} to the `null_values` list."#,
&unparsable,
buf.dtype(),
idx,
String::from_utf8_lossy(field)
bytes_offset,
&unparsable,
)
.into(),
)
Expand All @@ -504,7 +512,7 @@ pub(crate) fn parse_lines(
if let Some(b'\n') = bytes.get(0) {
bytes = &bytes[read_sol..];
} else {
let (bytes_rem, _) = skip_this_line(bytes, quote_char, offset);
let (bytes_rem, _) = skip_this_line(bytes, quote_char, 0);
bytes = bytes_rem;
}
break;
Expand Down

0 comments on commit 4f921e2

Please sign in to comment.