Skip to content

Commit

Permalink
feat: Add option to disable globbing in csv
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Apr 27, 2024
1 parent f1846a9 commit 27a609c
Show file tree
Hide file tree
Showing 5 changed files with 55 additions and 9 deletions.
29 changes: 21 additions & 8 deletions crates/polars-lazy/src/scan/csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,29 +13,30 @@ pub struct LazyCsvReader {
path: PathBuf,
paths: Arc<[PathBuf]>,
separator: u8,
has_header: bool,
ignore_errors: bool,
skip_rows: usize,
n_rows: Option<usize>,
cache: bool,
schema: Option<SchemaRef>,
schema_overwrite: Option<SchemaRef>,
low_memory: bool,
comment_prefix: Option<CommentPrefix>,
quote_char: Option<u8>,
eol_char: u8,
null_values: Option<NullValues>,
missing_is_null: bool,
truncate_ragged_lines: bool,
infer_schema_length: Option<usize>,
rechunk: bool,
skip_rows_after_header: usize,
encoding: CsvEncoding,
row_index: Option<RowIndex>,
try_parse_dates: bool,
raise_if_empty: bool,
n_threads: Option<usize>,
cache: bool,
has_header: bool,
ignore_errors: bool,
low_memory: bool,
missing_is_null: bool,
truncate_ragged_lines: bool,
decimal_comma: bool,
try_parse_dates: bool,
raise_if_empty: bool,
glob: bool,
}

#[cfg(feature = "csv")]
Expand Down Expand Up @@ -72,6 +73,7 @@ impl LazyCsvReader {
truncate_ragged_lines: false,
n_threads: None,
decimal_comma: false,
glob: true,
}
}

Expand Down Expand Up @@ -238,6 +240,13 @@ impl LazyCsvReader {
self
}

#[must_use]
/// Toggle expansion of the path via globbing rules.
///
/// Globbing is enabled by default. Pass `false` to have the path used as-is,
/// e.g. to read a file whose name literally contains `*`.
///
/// Consumes the reader and returns it with the flag updated, so the call can
/// be chained with the other builder methods.
pub fn with_glob(mut self, toggle: bool) -> Self {
self.glob = toggle;
self
}

/// Modify a schema before we run the lazy scanning.
///
/// Important! Run this function latest in the builder!
Expand Down Expand Up @@ -322,6 +331,10 @@ impl LazyFileListReader for LazyCsvReader {
Ok(lf)
}

/// Whether the path of this reader should be expanded via globbing rules.
///
/// Returns the flag configured through `with_glob`.
fn glob(&self) -> bool {
self.glob
}

/// Borrow the path stored on this reader.
fn path(&self) -> &Path {
&self.path
}
Expand Down
7 changes: 7 additions & 0 deletions crates/polars-lazy/src/scan/file_list_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ fn polars_glob(pattern: &str, cloud_options: Option<&CloudOptions>) -> PolarsRes
pub trait LazyFileListReader: Clone {
/// Get the final [LazyFrame].
fn finish(self) -> PolarsResult<LazyFrame> {
if !self.glob() {
return self.finish_no_glob();
}
if let Some(paths) = self.iter_paths()? {
let lfs = paths
.map(|r| {
Expand Down Expand Up @@ -89,6 +92,10 @@ pub trait LazyFileListReader: Clone {
/// It is recommended to always use [LazyFileListReader::finish] method.
fn finish_no_glob(self) -> PolarsResult<LazyFrame>;

/// Whether [LazyFileListReader::finish] should expand the path via globbing.
///
/// Defaults to `true`. When an implementor returns `false`, `finish` skips
/// path expansion and delegates directly to `finish_no_glob`.
fn glob(&self) -> bool {
true
}

/// Path of the scanned file.
/// It can potentially be a glob pattern.
fn path(&self) -> &Path;
Expand Down
12 changes: 12 additions & 0 deletions py-polars/polars/io/csv/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ def read_csv(
raise_if_empty: bool = True,
truncate_ragged_lines: bool = False,
decimal_comma: bool = False,
glob: bool = True,
) -> DataFrame:
r"""
Read a CSV file into a DataFrame.
Expand Down Expand Up @@ -188,6 +189,8 @@ def read_csv(
Truncate lines that are longer than the schema.
decimal_comma
Parse floats using a comma as the decimal separator instead of a period.
glob
Expand path given via globbing rules.
Returns
-------
Expand Down Expand Up @@ -442,6 +445,7 @@ def read_csv(
raise_if_empty=raise_if_empty,
truncate_ragged_lines=truncate_ragged_lines,
decimal_comma=decimal_comma,
glob=glob,
)

if new_columns:
Expand Down Expand Up @@ -479,6 +483,7 @@ def _read_csv_impl(
raise_if_empty: bool = True,
truncate_ragged_lines: bool = False,
decimal_comma: bool = False,
glob: bool = True,
) -> DataFrame:
path: str | None
if isinstance(source, (str, Path)):
Expand Down Expand Up @@ -542,6 +547,7 @@ def _read_csv_impl(
raise_if_empty=raise_if_empty,
truncate_ragged_lines=truncate_ragged_lines,
decimal_comma=decimal_comma,
glob=glob,
)
if columns is None:
return scan.collect()
Expand Down Expand Up @@ -925,6 +931,7 @@ def scan_csv(
raise_if_empty: bool = True,
truncate_ragged_lines: bool = False,
decimal_comma: bool = False,
glob: bool = True,
) -> LazyFrame:
r"""
Lazily read from a CSV file or multiple files via glob patterns.
Expand Down Expand Up @@ -1019,6 +1026,8 @@ def scan_csv(
Truncate lines that are longer than the schema.
decimal_comma
Parse floats using a comma as the decimal separator instead of a period.
glob
Expand path given via globbing rules.
Returns
-------
Expand Down Expand Up @@ -1138,6 +1147,7 @@ def with_column_names(cols: list[str]) -> list[str]:
raise_if_empty=raise_if_empty,
truncate_ragged_lines=truncate_ragged_lines,
decimal_comma=decimal_comma,
glob=glob,
)


Expand Down Expand Up @@ -1169,6 +1179,7 @@ def _scan_csv_impl(
raise_if_empty: bool = True,
truncate_ragged_lines: bool = True,
decimal_comma: bool = False,
glob: bool = True,
) -> LazyFrame:
dtype_list: list[tuple[str, PolarsDataType]] | None = None
if dtypes is not None:
Expand Down Expand Up @@ -1210,5 +1221,6 @@ def _scan_csv_impl(
truncate_ragged_lines=truncate_ragged_lines,
decimal_comma=decimal_comma,
schema=schema,
glob=glob,
)
return wrap_ldf(pylf)
4 changes: 3 additions & 1 deletion py-polars/src/lazyframe/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ impl PyLazyFrame {
#[pyo3(signature = (path, paths, separator, has_header, ignore_errors, skip_rows, n_rows, cache, overwrite_dtype,
low_memory, comment_prefix, quote_char, null_values, missing_utf8_is_empty_string,
infer_schema_length, with_schema_modify, rechunk, skip_rows_after_header,
encoding, row_index, try_parse_dates, eol_char, raise_if_empty, truncate_ragged_lines, decimal_comma, schema
encoding, row_index, try_parse_dates, eol_char, raise_if_empty, truncate_ragged_lines, decimal_comma, glob, schema
)
)]
fn new_from_csv(
Expand Down Expand Up @@ -170,6 +170,7 @@ impl PyLazyFrame {
raise_if_empty: bool,
truncate_ragged_lines: bool,
decimal_comma: bool,
glob: bool,
schema: Option<Wrap<Schema>>,
) -> PyResult<Self> {
let null_values = null_values.map(|w| w.0);
Expand Down Expand Up @@ -214,6 +215,7 @@ impl PyLazyFrame {
.with_missing_is_null(!missing_utf8_is_empty_string)
.truncate_ragged_lines(truncate_ragged_lines)
.with_decimal_comma(decimal_comma)
.with_glob(glob)
.raise_if_empty(raise_if_empty);

if let Some(lambda) = with_schema_modify {
Expand Down
12 changes: 12 additions & 0 deletions py-polars/tests/unit/io/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -2081,3 +2081,15 @@ def test_fsspec_not_available(monkeypatch: pytest.MonkeyPatch) -> None:
pl.read_csv(
"s3://foods/cabbage.csv", storage_options={"key": "key", "secret": "secret"}
)


@pytest.mark.write_disk()
@pytest.mark.skipif(
    os.environ.get("POLARS_FORCE_ASYNC") == "1" or os.name == "nt",
    reason="only local; `*` is not a valid file name character on Windows",
)
def test_no_glob(tmp_path: Path) -> None:
    """With `glob=False`, a path containing `*` is read as a literal file name."""
    df = pl.DataFrame({"foo": 1})

    # The file we actually want to read: its name is literally "*.csv".
    literal_path = tmp_path / "*.csv"
    df.write_csv(str(literal_path))

    # A decoy that the pattern "*.csv" would also match if globbing were
    # applied; reading both files would yield two rows instead of one.
    df.write_csv(str(tmp_path / "*1.csv"))

    assert_frame_equal(pl.read_csv(str(literal_path), glob=False), df)

0 comments on commit 27a609c

Please sign in to comment.