Skip to content

Commit

Permalink
csv: fix comment before header
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Oct 30, 2021
1 parent 0853add commit 3283158
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 9 deletions.
14 changes: 14 additions & 0 deletions polars/polars-io/src/csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1208,4 +1208,18 @@ linenum,last_name,first_name
assert_eq!(df.dtypes(), vec![DataType::Utf8; 4]);
Ok(())
}

/// A comment line that precedes the header must be ignored, so that the
/// header is read from the first non-comment line.
#[test]
fn test_header_with_comments() -> Result<()> {
    // line 1: comment, line 2: header, line 3: the only data row
    let data = "# ignore me\na,b,c\nd,e,f";

    let reader = CsvReader::new(Cursor::new(data)).with_comment_char(Some(b'#'));
    let df = reader.finish()?;

    // header `a,b,c` plus one data row `d,e,f` => 1 row, 3 columns
    assert_eq!(df.shape(), (1, 3));

    Ok(())
}
}
6 changes: 3 additions & 3 deletions polars/polars-io/src/csv_core/csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ impl<'a> CoreReader<'a> {
pub(crate) fn new(
reader_bytes: ReaderBytes<'a>,
n_rows: Option<usize>,
skip_rows: usize,
mut skip_rows: usize,
mut projection: Option<Vec<usize>>,
max_records: Option<usize>,
delimiter: Option<u8>,
Expand Down Expand Up @@ -148,7 +148,7 @@ impl<'a> CoreReader<'a> {
max_records,
has_header,
schema_overwrite,
skip_rows,
&mut skip_rows,
comment_char,
quote_char,
)?;
Expand All @@ -162,7 +162,7 @@ impl<'a> CoreReader<'a> {
max_records,
has_header,
schema_overwrite,
skip_rows,
&mut skip_rows,
comment_char,
quote_char,
)?;
Expand Down
28 changes: 24 additions & 4 deletions polars/polars-io/src/csv_core/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,9 @@ pub fn infer_file_schema(
max_read_records: Option<usize>,
has_header: bool,
schema_overwrite: Option<&Schema>,
skip_rows: usize,
// we take `&mut` because schema inference may need to skip more rows
// (e.g. leading comment lines) and must report that back to the caller
skip_rows: &mut usize,
comment_char: Option<u8>,
quote_char: Option<u8>,
) -> Result<(Schema, usize)> {
Expand All @@ -131,11 +133,29 @@ pub fn infer_file_schema(
let encoding = CsvEncoding::LossyUtf8;

let bytes = &skip_line_ending(skip_whitespace(skip_bom(reader_bytes)).0).0;
let mut lines = SplitLines::new(bytes, b'\n').skip(skip_rows);
let mut lines = SplitLines::new(bytes, b'\n').skip(*skip_rows);

// get or create header names
// when has_header is false, creates default column names with column_ prefix
let headers: Vec<String> = if let Some(mut header_line) = lines.next() {

// skip lines that are comments
// `first_line` ends up holding the first line that is neither a comment nor
// empty; it stays `None` when no such line exists.
let mut first_line = None;
if let Some(comment_ch) = comment_char {
// iterate through `&mut lines` so the iterator is borrowed, not moved, and
// stays usable after the loop
for (i, line) in (&mut lines).enumerate() {
// an empty line has no first byte and is passed over like a comment
if let Some(ch) = line.get(0) {
if *ch != comment_ch {
first_line = Some(line);
// `i` lines were passed over before this one; add them to the
// caller-visible skip count
*skip_rows += i;
break;
}
}
}
} else {
// no comment char configured: take the next line as-is
first_line = lines.next();
}

// now that we've found the first non-comment line we parse the headers, or we create a header
let headers: Vec<String> = if let Some(mut header_line) = first_line {
let len = header_line.len();
if len > 1 {
// remove carriage return
Expand Down Expand Up @@ -169,7 +189,7 @@ pub fn infer_file_schema(
};
if !has_header {
// re-init lines so that the header is included in type inference.
lines = SplitLines::new(bytes, b'\n').skip(skip_rows);
lines = SplitLines::new(bytes, b'\n').skip(*skip_rows);
}

let header_length = headers.len();
Expand Down
4 changes: 2 additions & 2 deletions polars/polars-lazy/src/logical_plan/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -761,7 +761,7 @@ impl LogicalPlanBuilder {
delimiter: u8,
has_header: bool,
ignore_errors: bool,
skip_rows: usize,
mut skip_rows: usize,
stop_after_n_rows: Option<usize>,
cache: bool,
schema: Option<Arc<Schema>>,
Expand All @@ -782,7 +782,7 @@ impl LogicalPlanBuilder {
Some(100),
has_header,
schema_overwrite,
skip_rows,
&mut skip_rows,
comment_char,
quote_char,
)
Expand Down

0 comments on commit 3283158

Please sign in to comment.