Skip to content

Commit

Permalink
Update zfill to match Python output (#11634)
Browse files Browse the repository at this point in the history
Fixes `cudf::strings::zfill` to match Python's `zfill` behavior. This will match Pandas 1.5 `zfill` as well.
The new behavior correctly skips the leading sign character when applying the '0' character fill.
Updates gtests and added more test data.
The pytest was updated to xfail for test data with leading sign characters until Pandas 1.5 is supported.
The Java tests did not include any test data with sign characters.

Closes #11632

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Nghia Truong (https://github.com/ttnghia)

URL: #11634
  • Loading branch information
davidwendt committed Sep 2, 2022
1 parent f382403 commit dc0d8d1
Show file tree
Hide file tree
Showing 5 changed files with 59 additions and 56 deletions.
15 changes: 8 additions & 7 deletions cpp/include/cudf/strings/padding.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,27 +64,28 @@ std::unique_ptr<column> pad(
/**
* @brief Add '0' as padding to the left of each string.
*
* This is equivalent to `pad(width,left,'0')` but preserves the sign character
* if it appears in the first position.
*
* If the string is already width or more characters, no padding is performed.
* No strings are truncated.
*
* This equivalent to `pad(width,left,'0')` but is more optimized for this special case.
*
* Null string entries result in null entries in the output column.
* Null rows in the input result in corresponding null rows in the output column.
*
* @code{.pseudo}
* Example:
* s = ['1234','-9876','+0.34','-342567']
* s = ['1234','-9876','+0.34','-342567', '2+2']
* r = zfill(s,6)
* r is now ['001234','0-9876','0+0.34','-342567']
* r is now ['001234','-09876','+00.34','-342567', '0002+2']
* @endcode
*
* @param strings Strings instance for this operation.
* @param input Strings instance for this operation.
* @param width The minimum number of characters for each string.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New column of strings.
*/
std::unique_ptr<column> zfill(
strings_column_view const& strings,
strings_column_view const& input,
size_type width,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

Expand Down
42 changes: 19 additions & 23 deletions cpp/src/strings/padding.cu
Original file line number Diff line number Diff line change
Expand Up @@ -146,58 +146,54 @@ std::unique_ptr<column> pad(
std::move(null_mask));
}

//
// Although zfill is identical to pad(width,'left','0') this implementation is a little
// more optimized since it does not need to calculate the size of the fillchar and can
// directly write it to the output buffer without extra logic for multi-byte UTF-8 chars.
//
std::unique_ptr<column> zfill(
strings_column_view const& strings,
strings_column_view const& input,
size_type width,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
size_type strings_count = strings.size();
if (strings_count == 0) return make_empty_column(type_id::STRING);
if (input.is_empty()) return make_empty_column(type_id::STRING);

auto strings_column = column_device_view::create(strings.parent(), stream);
auto strings_column = column_device_view::create(input.parent(), stream);
auto d_strings = *strings_column;

// copy bitmask
rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr);

// build offsets column
auto offsets_transformer_itr = thrust::make_transform_iterator(
thrust::make_counting_iterator<int32_t>(0),
compute_pad_output_length_fn{d_strings, width, 1}); // fillchar is 1 byte
auto offsets_column = make_offsets_child_column(
offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr);
auto d_offsets = offsets_column->view().data<int32_t>();
offsets_transformer_itr, offsets_transformer_itr + input.size(), stream, mr);
auto const d_offsets = offsets_column->view().data<int32_t>();

// build chars column
auto const bytes =
cudf::detail::get_value<int32_t>(offsets_column->view(), strings_count, stream);
auto const bytes = cudf::detail::get_value<int32_t>(offsets_column->view(), input.size(), stream);
auto chars_column = strings::detail::create_chars_child_column(bytes, stream, mr);
auto d_chars = chars_column->mutable_view().data<char>();

thrust::for_each_n(rmm::exec_policy(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
strings_count,
input.size(),
[d_strings, width, d_offsets, d_chars] __device__(size_type idx) {
if (d_strings.is_null(idx)) return;
string_view d_str = d_strings.element<string_view>(idx);
auto length = d_str.length();
char* out_ptr = d_chars + d_offsets[idx];
auto d_str = d_strings.element<string_view>(idx);
auto length = d_str.length();
auto in_ptr = d_str.data();
auto out_ptr = d_chars + d_offsets[idx];
// if the string starts with a sign, output the sign first
if (!d_str.empty() && (*in_ptr == '-' || *in_ptr == '+')) {
*out_ptr++ = *in_ptr++;
d_str = string_view{in_ptr, d_str.size_bytes() - 1};
}
while (length++ < width)
*out_ptr++ = '0'; // prepend zero char
copy_string(out_ptr, d_str);
});

return make_strings_column(strings_count,
return make_strings_column(input.size(),
std::move(offsets_column),
std::move(chars_column),
strings.null_count(),
std::move(null_mask));
input.null_count(),
cudf::detail::copy_bitmask(input.parent(), stream, mr));
}

} // namespace detail
Expand Down
28 changes: 18 additions & 10 deletions cpp/tests/strings/pad_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -136,19 +136,27 @@ INSTANTIATE_TEST_CASE_P(StringsPadTest,
TEST_F(StringsPadTest, ZFill)
{
std::vector<const char*> h_strings{
"654321", "-12345", nullptr, "", "-5", "0987", "4", "+8.5", "éé"};
cudf::test::strings_column_wrapper strings(
"654321", "-12345", nullptr, "", "-5", "0987", "4", "+8.5", "éé", "+abé", "é+a", "100-"};
cudf::test::strings_column_wrapper input(
h_strings.begin(),
h_strings.end(),
thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
cudf::size_type width = 6;
std::string phil = "+";
auto strings_view = cudf::strings_column_view(strings);

auto results = cudf::strings::zfill(strings_view, width);

std::vector<const char*> h_expected{
"654321", "-12345", nullptr, "000000", "0000-5", "000987", "000004", "00+8.5", "0000éé"};
auto strings_view = cudf::strings_column_view(input);

auto results = cudf::strings::zfill(strings_view, 6);

std::vector<const char*> h_expected{"654321",
"-12345",
nullptr,
"000000",
"-00005",
"000987",
"000004",
"+008.5",
"0000éé",
"+00abé",
"000é+a",
"00100-"};
cudf::test::strings_column_wrapper expected(
h_expected.begin(),
h_expected.end(),
Expand Down
21 changes: 7 additions & 14 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -2924,7 +2924,7 @@ def pad(
Equivalent to ``Series.str.pad(side='right')``.
center
Fills boths sides of strings with an arbitrary character.
Fills both sides of strings with an arbitrary character.
Equivalent to ``Series.str.pad(side='both')``.
zfill
Expand Down Expand Up @@ -2984,6 +2984,9 @@ def zfill(self, width: int) -> SeriesOrIndex:
width. Strings in the Series/Index with length greater
or equal to width are unchanged.
The sign character is preserved if it appears in the first
position of the string.
Parameters
----------
width : int
Expand All @@ -3008,13 +3011,7 @@ def zfill(self, width: int) -> SeriesOrIndex:
Fills the specified sides of strings with an arbitrary character.
center
Fills boths sides of strings with an arbitrary character.
Notes
-----
Differs from `str.zfill()
<https://docs.python.org/3/library/stdtypes.html#str.zfill>`_
which has special handling for ‘+’/’-‘ in the string.
Fills both sides of strings with an arbitrary character.
Examples
--------
Expand All @@ -3028,15 +3025,11 @@ def zfill(self, width: int) -> SeriesOrIndex:
dtype: object
Note that ``None`` is not string, therefore it is converted
to ``None``. The minus sign in ``'-1'`` is treated as a
regular character and the zero is added to the left
of it (`str.zfill()
<https://docs.python.org/3/library/stdtypes.html#str.zfill>`_
would have moved it to the left). ``1000`` remains unchanged as
to ``None``. ``1000`` remains unchanged as
it is longer than width.
>>> s.str.zfill(3)
0 0-1
0 -01
1 001
2 1000
3 <NA>
Expand Down
9 changes: 7 additions & 2 deletions python/cudf/cudf/tests/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -1765,8 +1765,13 @@ def test_strings_filling_tests(data, width, fillchar):
[
["A,,B", "1,,5", "3,00,0"],
["Linda van der Berg", "George Pitt-Rivers"],
["+23", "³", "⅕", ""],
["hello", "there", "world", "+1234", "-1234", None, "accént", ""],
["³", "⅕", ""],
pytest.param(
["hello", "there", "world", "+1234", "-1234", None, "accént", ""],
marks=pytest.mark.xfail(
reason="pandas 1.5 upgrade TODO",
),
),
[" ", "\t\r\n ", ""],
["1. Ant. ", "2. Bee!\n", "3. Cat?\t", None],
],
Expand Down

0 comments on commit dc0d8d1

Please sign in to comment.