Skip to content

Commit

Permalink
feat[rust, python]: support custom null value for csv output (#4714)
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander-beedie committed Sep 3, 2022
1 parent 3408c52 commit 0b155a8
Show file tree
Hide file tree
Showing 5 changed files with 39 additions and 20 deletions.
6 changes: 6 additions & 0 deletions polars/polars-io/src/csv/write.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,4 +100,10 @@ where
self.options.quote = char;
self
}

/// Set the string that is written to the CSV output in place of null values.
///
/// The value is stored in the writer's serialization options; the writer is
/// returned by value so that further builder calls can be chained.
pub fn with_null_value(self, null_value: String) -> Self {
    let mut writer = self;
    writer.options.null = null_value;
    writer
}
}
8 changes: 4 additions & 4 deletions polars/polars-io/src/csv/write_impl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ fn fast_float_write<N: ToLexical>(f: &mut Vec<u8>, n: N, write_size: usize) -> s

fn write_anyvalue(f: &mut Vec<u8>, value: AnyValue, options: &SerializeOptions) {
match value {
AnyValue::Null => write!(f, ""),
AnyValue::Null => write!(f, "{}", &options.null),
AnyValue::Int8(v) => write!(f, "{}", v),
AnyValue::Int16(v) => write!(f, "{}", v),
AnyValue::Int32(v) => write!(f, "{}", v),
Expand Down Expand Up @@ -133,6 +133,8 @@ pub struct SerializeOptions {
pub delimiter: u8,
/// quoting character
pub quote: u8,
/// null value representation
pub null: String,
}

impl Default for SerializeOptions {
Expand All @@ -144,6 +146,7 @@ impl Default for SerializeOptions {
float_precision: None,
delimiter: b',',
quote: b'"',
null: String::new(),
}
}
}
Expand Down Expand Up @@ -171,9 +174,7 @@ pub(crate) fn write<W: Write>(

let len = df.height();
let n_threads = POOL.current_num_threads();

let total_rows_per_pool_iter = n_threads * chunk_size;

let any_value_iter_pool = LowContentionPool::<Vec<_>>::new(n_threads);
let write_buffer_pool = LowContentionPool::<Vec<_>>::new(n_threads);

Expand All @@ -186,7 +187,6 @@ pub(crate) fn write<W: Write>(
let thread_offset = thread_no * chunk_size;
let total_offset = n_rows_finished + thread_offset;
let df = df.slice(total_offset as i64, chunk_size);

let cols = df.get_columns();

// Safety:
Expand Down
10 changes: 9 additions & 1 deletion py-polars/polars/internals/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1794,6 +1794,7 @@ def write_csv(
date_format: str | None = None,
time_format: str | None = None,
float_precision: int | None = None,
null_value: str | None = None,
) -> str | None:
"""
Write to comma-separated values (CSV) file.
Expand Down Expand Up @@ -1825,6 +1826,8 @@ def write_csv(
float_precision
Number of decimal places to write, applied to both ``Float32`` and
``Float64`` datatypes.
null_value
A string representing null values (defaulting to the empty string).
Examples
--------
Expand All @@ -1843,8 +1846,11 @@ def write_csv(
"""
if len(sep) > 1:
raise ValueError("only single byte separator is allowed")
if len(quote) > 1:
elif len(quote) > 1:
raise ValueError("only single byte quote char is allowed")
elif null_value == "":
null_value = None

if file is None:
buffer = BytesIO()
self._df.write_csv(
Expand All @@ -1857,6 +1863,7 @@ def write_csv(
date_format,
time_format,
float_precision,
null_value,
)
return str(buffer.getvalue(), encoding="utf-8")

Expand All @@ -1873,6 +1880,7 @@ def write_csv(
date_format,
time_format,
float_precision,
null_value,
)
return None

Expand Down
4 changes: 4 additions & 0 deletions py-polars/src/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,9 @@ impl PyDataFrame {
date_format: Option<String>,
time_format: Option<String>,
float_precision: Option<usize>,
null_value: Option<String>,
) -> PyResult<()> {
let null = null_value.unwrap_or(String::new());
if let Ok(s) = py_f.extract::<&str>(py) {
let f = std::fs::File::create(s).unwrap();
// no need for a buffered writer, because the csv writer does internal buffering
Expand All @@ -443,6 +445,7 @@ impl PyDataFrame {
.with_date_format(date_format)
.with_time_format(time_format)
.with_float_precision(float_precision)
.with_null_value(null)
.finish(&mut self.df)
.map_err(PyPolarsErr::from)?;
} else {
Expand All @@ -456,6 +459,7 @@ impl PyDataFrame {
.with_date_format(date_format)
.with_time_format(time_format)
.with_float_precision(float_precision)
.with_null_value(null)
.finish(&mut self.df)
.map_err(PyPolarsErr::from)?;
}
Expand Down
31 changes: 16 additions & 15 deletions py-polars/tests/unit/io/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,12 @@ def test_csv_null_values() -> None:
"""
)
f = io.StringIO(csv)

df = pl.read_csv(f, null_values="na")
assert df[0, "a"] is None
assert df[1, "b"] is None
assert df.rows() == [(None, "b", "c"), ("a", None, "c")]

out = io.BytesIO()
df.write_csv(out, null_value="na")
assert csv == out.getvalue().decode("ascii")

csv = textwrap.dedent(
"""\
Expand All @@ -86,20 +88,18 @@ def test_csv_null_values() -> None:
)
f = io.StringIO(csv)
df = pl.read_csv(f, null_values=["na", "n/a"])
assert df[0, "a"] is None
assert df[1, "b"] is None
assert df.rows() == [(None, "b", "c"), ("a", None, "c")]

csv = textwrap.dedent(
"""\
r"""
a,b,c
na,b,c
a,n/a,c
a,\N,c
"""
)
f = io.StringIO(csv)
df = pl.read_csv(f, null_values={"a": "na", "b": "n/a"})
assert df[0, "a"] is None
assert df[1, "b"] is None
df = pl.read_csv(f, null_values={"a": "na", "b": r"\N"})
assert df.rows() == [(None, "b", "c"), ("a", None, "c")]


def test_datetime_parsing() -> None:
Expand Down Expand Up @@ -517,11 +517,12 @@ def test_csv_schema_offset(foods_csv: str) -> None:

def test_empty_string_missing_round_trip() -> None:
df = pl.DataFrame({"varA": ["A", "", None], "varB": ["B", "", None]})
f = io.BytesIO()
df.write_csv(f)
f.seek(0)
df_read = pl.read_csv(f)
assert df.frame_equal(df_read)
for null in (None, "NA", "NULL", r"\N"):
f = io.BytesIO()
df.write_csv(f, null_value=null)
f.seek(0)
df_read = pl.read_csv(f, null_values=null)
assert df.frame_equal(df_read)


def test_write_csv_delimiter() -> None:
Expand Down

0 comments on commit 0b155a8

Please sign in to comment.