Skip to content

Commit

Permalink
Improve parse_lines error message. (#3569)
Browse files Browse the repository at this point in the history
Improve parse_lines error message:
  - Fix file offset at which the error was encountered
    (header and skipped lines were not included).
  - Remove leading spaces in the error message that were
    caused by the use of a raw string.
  - Add other suggestions (specifying the correct dtype or
    increasing the number of records used to infer the
    schema) to potentially avoid the error, as the error
    also shows up quite frequently when an incorrect dtype
    was inferred.

Previous error message:

ComputeError: Could not parse 3.0 as dtype Int64 at column 3.
                                            The total offset in the file is 37 bytes.

                                            Consider running the parser `with_ignore_parser_errors=true`
                                            or consider adding 3.0 to the `null_values` list.

Current error message:

ComputeError: Could not parse `3.0` as dtype Int64 at column 3.
The current offset in the file is 69 bytes.

Consider specifying the correct dtype, increasing
the number of records used to infer the schema,
running the parser with `ignore_parser_errors=true`
or adding `3.0` to the `null_values` list.
  • Loading branch information
ghuls committed Jun 4, 2022
1 parent 4fe2345 commit a3a7fdf
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 10 deletions.
18 changes: 13 additions & 5 deletions polars/polars-io/src/csv_core/csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,9 @@ impl<'a> CoreReader<'a> {
})
}

fn find_starting_point<'b>(&self, mut bytes: &'b [u8]) -> Result<&'b [u8]> {
fn find_starting_point<'b>(&self, mut bytes: &'b [u8]) -> Result<(&'b [u8], usize)> {
let starting_point_offset = bytes.as_ptr() as usize;

// Skip all leading white space and the occasional utf8-bom
bytes = skip_whitespace(skip_bom(bytes));
// \n\n can be a empty string row of a single column
Expand All @@ -304,7 +306,10 @@ impl<'a> CoreReader<'a> {
bytes = &bytes[pos..];
}
}
Ok(bytes)

let starting_point_offset = bytes.as_ptr() as usize - starting_point_offset;

Ok((bytes, starting_point_offset))
}

fn parse_csv(
Expand All @@ -314,8 +319,9 @@ impl<'a> CoreReader<'a> {
predicate: Option<&Arc<dyn PhysicalIoExpr>>,
) -> Result<DataFrame> {
let logging = std::env::var("POLARS_VERBOSE").is_ok();

// Make the variable mutable so that we can reassign the sliced file to this variable.
let mut bytes = self.find_starting_point(bytes)?;
let (mut bytes, starting_point_offset) = self.find_starting_point(bytes)?;

// initial row guess. We use the line statistic to guess the number of rows to allocate
let mut total_rows = 128;
Expand Down Expand Up @@ -485,9 +491,10 @@ impl<'a> CoreReader<'a> {
let local_bytes = &bytes[read..stop_at_nbytes];

last_read = read;
let offset = read + starting_point_offset;
read += parse_lines(
local_bytes,
read,
offset,
delimiter,
self.comment_char,
self.quote_char,
Expand Down Expand Up @@ -602,9 +609,10 @@ impl<'a> CoreReader<'a> {
let local_bytes = &bytes[read..stop_at_nbytes];

last_read = read;
let offset = read + starting_point_offset;
read += parse_lines(
local_bytes,
read,
offset,
delimiter,
self.comment_char,
self.quote_char,
Expand Down
12 changes: 7 additions & 5 deletions polars/polars-io/src/csv_core/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -486,11 +486,13 @@ pub(crate) fn parse_lines(
let unparsable = String::from_utf8_lossy(field);
PolarsError::ComputeError(
format!(
r#"Could not parse {} as dtype {:?} at column {}.
The total offset in the file is {} bytes.
Consider running the parser `with_ignore_parser_errors=true`
or consider adding {} to the `null_values` list."#,
"Could not parse `{}` as dtype {:?} at column {}.\n\
The current offset in the file is {} bytes.\n\
\n\
Consider specifying the correct dtype, increasing\n\
the number of records used to infer the schema,\n\
running the parser with `ignore_parser_errors=true`\n\
or adding `{}` to the `null_values` list.",
&unparsable,
buf.dtype(),
idx,
Expand Down

0 comments on commit a3a7fdf

Please sign in to comment.