Skip to content

Commit

Permalink
csv allow only header (#3423)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed May 18, 2022
1 parent 7911011 commit 558a179
Show file tree
Hide file tree
Showing 7 changed files with 79 additions and 26 deletions.
6 changes: 3 additions & 3 deletions polars/polars-core/src/chunked_array/strings/json_path.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
use crate::prelude::*;
use jsonpath_lib::Compiled;
use jsonpath_lib::PathCompiled;
use serde_json::Value;
use std::borrow::Cow;

#[cfg(feature = "extract_jsonpath")]
fn extract_json<'a>(expr: &Compiled, json_str: &'a str) -> Option<Cow<'a, str>> {
fn extract_json<'a>(expr: &PathCompiled, json_str: &'a str) -> Option<Cow<'a, str>> {
serde_json::from_str(json_str).ok().and_then(|value| {
// TODO: a lot of heap allocations here. Improve json path by adding a take?
let result = expr.select(&value).ok()?;
Expand All @@ -23,7 +23,7 @@ impl Utf8Chunked {
/// Refer to <https://goessner.net/articles/JsonPath/>
#[cfg(feature = "extract_jsonpath")]
pub fn json_path_match(&self, json_path: &str) -> Result<Utf8Chunked> {
match Compiled::compile(json_path) {
match PathCompiled::compile(json_path) {
Ok(pat) => Ok(self.apply_on_opt(|opt_s| opt_s.and_then(|s| extract_json(&pat, s)))),
Err(e) => Err(PolarsError::ComputeError(
format!("error compiling JSONpath expression {:?}", e).into(),
Expand Down
13 changes: 9 additions & 4 deletions polars/polars-io/src/csv_core/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -89,11 +89,16 @@ where
/// and not with
/// '\nfield_1,field_1'
pub(crate) fn skip_header(input: &[u8]) -> (&[u8], usize) {
let mut pos = next_line_position_naive(input).expect("no lines in the file");
if input[pos] == b'\n' {
pos += 1;
match next_line_position_naive(input) {
Some(mut pos) => {
if input[pos] == b'\n' {
pos += 1;
}
(&input[pos..], pos)
}
// no lines in the file, so skipping the header is skipping all.
None => (&[], input.len()),
}
(&input[pos..], pos)
}

/// Remove whitespace from the start of buffer.
Expand Down
19 changes: 19 additions & 0 deletions polars/polars-io/src/csv_core/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,25 @@ pub fn infer_file_schema(
}
column_names
}
} else if has_header && !bytes.is_empty() {
// there was no newline char, so we copy the whole buf and append one.
// this is likely to be cheap as there are no rows.
let mut buf = Vec::with_capacity(bytes.len() + 2);
buf.extend_from_slice(bytes);
buf.push(b'\n');

return infer_file_schema(
&ReaderBytes::Owned(buf),
delimiter,
max_read_lines,
has_header,
schema_overwrite,
skip_rows,
comment_char,
quote_char,
null_values,
parse_dates,
);
} else {
return Err(PolarsError::NoData("empty csv".into()));
};
Expand Down
7 changes: 5 additions & 2 deletions polars/polars-lazy/src/frame/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -315,12 +315,15 @@ impl LazyFrame {

fn rename_impl_swapping(self, mut existing: Vec<String>, mut new: Vec<String>) -> Self {
assert_eq!(new.len(), existing.len());
for idx in 0..existing.len() {
// remove "name" -> "name"
let mut removed = 0;
for mut idx in 0..existing.len() {
// remove "name" -> "name"
// these are no ops.
idx -= removed;
if existing[idx] == new[idx] {
existing.swap_remove(idx);
new.swap_remove(idx);
removed += 1;
}
}

Expand Down
15 changes: 15 additions & 0 deletions polars/tests/it/io/csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -949,8 +949,23 @@ fn test_escaping_quotes() -> Result<()> {
/// Checks that a CSV consisting of a single line can be read both as data
/// (`has_header(false)`) and as a header with zero rows (`has_header(true)`).
fn test_header_only() -> Result<()> {
    // Without a header, the single line is read as one row of three columns.
    let headerless = CsvReader::new(Cursor::new("a,b,c"))
        .has_header(false)
        .finish()?;
    assert_eq!(headerless.shape(), (1, 3));

    // With a header, the line only names the columns: zero rows, three columns,
    // regardless of whether the input ends in a trailing newline.
    let header_only_inputs = ["x,y,z", "x,y,z\n"];
    for input in header_only_inputs.iter() {
        let parsed = CsvReader::new(Cursor::new(input))
            .has_header(true)
            .finish()?;

        assert_eq!(parsed.shape(), (0, 3));
        // The test expects every column of the empty frame to be Utf8.
        assert_eq!(
            parsed.dtypes(),
            &[DataType::Utf8, DataType::Utf8, DataType::Utf8]
        );
    }

    Ok(())
}

Expand Down
34 changes: 17 additions & 17 deletions py-polars/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 11 additions & 0 deletions py-polars/tests/test_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -1684,6 +1684,17 @@ def test_rename_same_name() -> None:
df = df.rename({"groups": "groups"})
df = df.select(["groups"])
assert df.collect().to_dict(False) == {"groups": ["A", "A", "B", "C", "B"]}
df = pl.DataFrame(
{
"nrs": [1, 2, 3, 4, 5],
"groups": ["A", "A", "B", "C", "B"],
"test": [1, 2, 3, 4, 5],
}
).lazy()
df = df.rename({"nrs": "nrs", "groups": "groups"})
df = df.select(["groups"])
df.collect()
assert df.collect().to_dict(False) == {"groups": ["A", "A", "B", "C", "B"]}


def test_fill_null() -> None:
Expand Down

0 comments on commit 558a179

Please sign in to comment.