Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add nvtext hash_character_ngrams function #13654

Merged
merged 9 commits into from
Jul 12, 2023
3 changes: 2 additions & 1 deletion cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,8 @@ ConfigureBench(BINARYOP_BENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.c
ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp)

# nvbench-based text benchmarks; the superseded source list shown by the diff is dropped so that
# each source file (including the new text/hash_ngrams.cpp) is listed exactly once.
ConfigureNVBench(
  TEXT_NVBENCH text/hash_ngrams.cpp text/minhash.cpp text/normalize.cpp text/replace.cpp
  text/tokenize.cpp
)

# ##################################################################################################
Expand Down
61 changes: 61 additions & 0 deletions cpp/benchmarks/text/hash_ngrams.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/rmm_pool_raii.hpp>

#include <cudf/strings/strings_column_view.hpp>

#include <nvtext/generate_ngrams.hpp>

#include <nvbench/nvbench.cuh>

#include <rmm/device_buffer.hpp>

#include <limits>

/**
 * @brief Benchmarks nvtext::hash_character_ngrams on a randomly generated strings column
 *
 * Reads the `num_rows`, `row_width` and `ngrams` axes from the benchmark state, builds a
 * random strings column from them and times hashing the character ngrams of that column.
 * Configurations where `num_rows * row_width` reaches the `cudf::size_type` limit are skipped.
 *
 * @param state NVBench state that supplies the axis values and records the measurements
 */
static void bench_hash_ngrams(nvbench::state& state)
{
  auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
  auto const ngrams    = static_cast<cudf::size_type>(state.get_int64("ngrams"));

  // The product is evaluated in std::size_t so the guard itself cannot overflow size_type.
  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
    state.skip("Skip benchmarks greater than size_type limit");
    // skip() does not leave the function; return explicitly so no data is generated
    // and nothing is executed for a configuration that was just marked as skipped.
    return;
  }

  // Random strings generated with a NORMAL distribution parameterized by (0, row_width);
  // presumably these are the bounds of the generated string lengths -- TODO confirm.
  data_profile const strings_profile = data_profile_builder().distribution(
    cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
  auto const strings_table =
    create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile);
  cudf::strings_column_view input(strings_table->view().column(0));

  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));

  auto const chars_size = input.chars_size();
  state.add_global_memory_reads<nvbench::int8_t>(chars_size);
  // output are hashes: approximate total number of hashes
  // (widened before multiplying so the estimate cannot overflow a 32-bit size_type)
  state.add_global_memory_writes<nvbench::int32_t>(static_cast<std::size_t>(num_rows) *
                                                   static_cast<std::size_t>(ngrams));

  // The launch object is not needed here, so the parameter is left unnamed.
  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) {
    auto result = nvtext::hash_character_ngrams(input, ngrams);
  });
}

// Registers the benchmark and the axes it sweeps: row count, row width and ngram size.
// The row counts are powers of two; 16384 replaces the previous 16364, which did not fit
// the progression and looks like a transposition typo.
NVBENCH_BENCH(bench_hash_ngrams)
  .set_name("hash_ngrams")
  .add_int64_axis("num_rows", {1024, 4096, 8192, 16384, 32768, 262144})
  .add_int64_axis("row_width", {128, 512, 2048})
  .add_int64_axis("ngrams", {5, 10});
37 changes: 37 additions & 0 deletions cpp/include/nvtext/detail/generate_ngrams.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <nvtext/generate_ngrams.hpp>

#include <rmm/cuda_stream_view.hpp>

namespace nvtext {
namespace detail {

/**
 * @copydoc hash_character_ngrams(cudf::strings_column_view const&,
 * cudf::size_type, rmm::mr::device_memory_resource*)
 *
 * This is the internal (detail) overload: it takes the CUDA stream explicitly instead of
 * relying on a default stream, and it has no default arguments.
 *
 * @param stream CUDA stream used for allocating/copying device memory and launching kernels
 */
std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view const& strings,
cudf::size_type ngrams,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

} // namespace detail
} // namespace nvtext
35 changes: 34 additions & 1 deletion cpp/include/nvtext/generate_ngrams.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -90,5 +90,38 @@ std::unique_ptr<cudf::column> generate_character_ngrams(
cudf::size_type ngrams = 2,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
 * @brief Hashes ngrams of characters within each string
 *
 * Each character of a string is used to build the ngrams, and ngrams are not
 * produced across adjacent string rows.
 *
 * ```
 * "abcdefg" with ngrams=5 would generate ["abcde", "bcdef", "cdefg"]
 * ```
 *
 * The ngrams for each string are hashed and returned in a lists column where
 * the offsets specify rows of hash values for each string.
 *
 * The size of the child column will be the total number of ngrams generated from
 * the input strings column.
 *
 * Null rows and rows with fewer than `ngrams` characters generate no ngrams; the
 * corresponding output row is an empty list and the output column contains no nulls.
 *
 * The hash algorithm uses MurmurHash3_32 on each ngram.
 *
 * @throw cudf::logic_error if `ngrams < 5`
 * @throw cudf::logic_error if there are not enough characters to generate any ngrams
 *
 * @param strings Strings column to produce ngrams from.
 * @param ngrams The ngram number to generate. Default is 5.
 * @param mr Device memory resource used to allocate the returned column's device memory.
 * @return A lists column of hash values
 */
std::unique_ptr<cudf::column> hash_character_ngrams(
cudf::strings_column_view const& strings,
cudf::size_type ngrams = 5,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of group
} // namespace nvtext
150 changes: 137 additions & 13 deletions cpp/src/text/generate_ngrams.cu
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,17 @@
* limitations under the License.
*/

#include <nvtext/generate_ngrams.hpp>
#include <nvtext/detail/generate_ngrams.hpp>

#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/copy_if.cuh>
#include <cudf/detail/hashing.hpp>
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/sizes_to_offsets_iterator.cuh>
#include <cudf/detail/utilities/hash_functions.cuh>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/strings/string_view.cuh>
Expand Down Expand Up @@ -156,31 +160,79 @@ std::unique_ptr<cudf::column> generate_ngrams(cudf::strings_column_view const& s
namespace detail {
namespace {

struct character_ngram_generator_fn {
/**
* @brief Base class for generating character ngrams
*
* The ngrams are produced for each string and the derived class's
* `process_ngram` function is called for each ngram/substring.
*
* @tparam Derived class uses the CRTP pattern to reuse code logic.
*/
template <typename Derived>
struct base_character_ngram_fn {
cudf::column_device_view const d_strings;
cudf::size_type ngrams;
int32_t const* d_ngram_offsets{};
cudf::size_type* d_offsets{};
char* d_chars{};
cudf::size_type const* d_ngram_offsets{};

__device__ void operator()(cudf::size_type idx)
base_character_ngram_fn(cudf::column_device_view const& d_strings,
cudf::size_type ngrams,
cudf::size_type const* d_ngram_offsets)
: d_strings(d_strings), ngrams(ngrams), d_ngram_offsets(d_ngram_offsets)
{
}

__device__ void operator()(cudf::size_type idx) const
{
if (d_strings.is_null(idx)) return;
auto const d_str = d_strings.element<cudf::string_view>(idx);
if (d_str.empty()) return;
auto const& derived = static_cast<Derived const&>(*this);
auto itr = d_str.begin();
auto const ngram_offset = d_ngram_offsets[idx];
auto const ngram_count = d_ngram_offsets[idx + 1] - ngram_offset;
auto d_sizes = d_offsets + ngram_offset;
auto out_ptr = d_chars ? d_chars + *d_sizes : nullptr;
for (cudf::size_type n = 0; n < ngram_count; ++n, ++itr) {
auto const begin = itr.byte_offset();
auto const end = (itr + ngrams).byte_offset();
if (out_ptr)
out_ptr =
cudf::strings::detail::copy_and_increment(out_ptr, d_str.data() + begin, (end - begin));
else
*d_sizes++ = end - begin;
auto const ngram = cudf::string_view(d_str.data() + begin, end - begin);
derived.process_ngram(ngram, n + ngram_offset);
}
}
};

/**
 * @brief Functor that emits each character ngram of a string as a new string
 *
 * A single input string yields several output strings; how many depends on the
 * ngram width and the string's length. The functor is intended for use with
 * `make_strings_children`, which builds the offsets and chars child columns.
 */
struct character_ngram_generator_fn : base_character_ngram_fn<character_ngram_generator_fn> {
  cudf::size_type* d_offsets{};
  char* d_chars{};

  character_ngram_generator_fn(cudf::column_device_view const& d_strings,
                               cudf::size_type ngrams,
                               cudf::size_type const* d_ngram_offsets)
    : base_character_ngram_fn(d_strings, ngrams, d_ngram_offsets)
  {
  }

  /**
   * @brief Invoked by the base class once per ngram
   *
   * While `d_chars` is null only the byte size of the ngram is recorded;
   * otherwise the ngram bytes are copied to their place in the output.
   *
   * @param d_str The ngram substring to process
   * @param offset The output position relative to d_offsets
   */
  __device__ void process_ngram(cudf::string_view d_str, cudf::size_type offset) const
  {
    auto const d_entry = d_offsets + offset;
    if (d_chars == nullptr) {
      // no chars buffer: store the size of this ngram in its slot
      *d_entry = d_str.size_bytes();
      return;
    }
    // chars buffer present: the slot is read as the byte position to copy the ngram to
    cudf::strings::detail::copy_string(d_chars + *d_entry, d_str);
  }
};
Expand Down Expand Up @@ -229,6 +281,70 @@ std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_vie
total_ngrams, std::move(offsets_column), std::move(chars_column), 0, rmm::device_buffer{});
}

namespace {
/**
 * @brief Functor that stores the hash of every ngram visited by the base class
 *
 * Each ngram is hashed with MurmurHash3_32 (seed 0) and the result is written
 * to `d_hashes` at the ngram's output position.
 */
struct character_ngram_hash_fn : base_character_ngram_fn<character_ngram_hash_fn> {
  cudf::hash_value_type* d_hashes;

  character_ngram_hash_fn(cudf::column_device_view const& d_strings,
                          cudf::size_type ngrams,
                          cudf::size_type const* d_ngram_offsets,
                          cudf::hash_value_type* d_hashes)
    : base_character_ngram_fn(d_strings, ngrams, d_ngram_offsets), d_hashes(d_hashes)
  {
  }

  /**
   * @brief Invoked by the base class once per ngram
   *
   * @param d_str The ngram substring to hash
   * @param offset Index into `d_hashes` where the hash value is stored
   */
  __device__ void process_ngram(cudf::string_view d_str, cudf::size_type offset) const
  {
    d_hashes[offset] = cudf::detail::MurmurHash3_32<cudf::string_view>{0}(d_str);
  }
};
} // namespace

/**
 * @brief Builds a lists column holding the hash of every character ngram of each string
 *
 * @throw cudf::logic_error if `ngrams` is less than 5
 * @throw cudf::logic_error if no row is long enough to produce an ngram
 *
 * @param input Strings column to produce ngrams from
 * @param ngrams Number of characters in each ngram
 * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate the returned column's device memory
 * @return Lists column with one row of hash values per input row
 */
std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view const& input,
                                                    cudf::size_type ngrams,
                                                    rmm::cuda_stream_view stream,
                                                    rmm::mr::device_memory_resource* mr)
{
  CUDF_EXPECTS(ngrams > 4, "Parameter ngrams should be an integer value of 5 or greater");

  auto const hash_type = cudf::data_type{cudf::type_to_id<cudf::hash_value_type>()};
  // NOTE(review): an empty input returns an empty column of the hash type, not an empty
  // lists column like the non-empty path -- confirm this is what callers expect.
  if (input.is_empty()) { return cudf::make_empty_column(hash_type); }

  auto const d_input = cudf::column_device_view::create(input.parent(), stream);

  // number of ngrams per row: 0 for null rows, otherwise (length + 1 - ngrams) clamped at 0
  auto const count_fn = [d_strings = *d_input, ngrams] __device__(auto idx) {
    if (d_strings.is_null(idx)) { return 0; }
    auto const length = d_strings.element<cudf::string_view>(idx).length();
    auto const count  = static_cast<cudf::size_type>(length + 1 - ngrams);
    return count > 0 ? count : 0;
  };
  auto const counts_begin = cudf::detail::make_counting_transform_iterator(0, count_fn);

  // turn the per-row counts into the offsets child of the output lists column
  auto [offsets_column, total_ngrams] = cudf::detail::make_offsets_child_column(
    counts_begin, counts_begin + input.size(), stream, mr);

  CUDF_EXPECTS(total_ngrams > 0,
               "Insufficient number of characters in each string to generate ngrams");

  // one hash value per ngram; no null mask is allocated for the child column
  auto hashes_column =
    cudf::make_numeric_column(hash_type, total_ngrams, cudf::mask_state::UNALLOCATED, stream, mr);

  // visit every row and write the hash of each of its ngrams
  thrust::for_each_n(
    rmm::exec_policy(stream),
    thrust::counting_iterator<cudf::size_type>(0),
    input.size(),
    character_ngram_hash_fn{*d_input,
                            ngrams,
                            offsets_column->view().data<cudf::size_type>(),
                            hashes_column->mutable_view().data<cudf::hash_value_type>()});

  return cudf::make_lists_column(input.size(),
                                 std::move(offsets_column),
                                 std::move(hashes_column),
                                 0,
                                 rmm::device_buffer{},
                                 stream,
                                 mr);
}

} // namespace detail

std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_view const& strings,
Expand All @@ -239,4 +355,12 @@ std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_vie
return detail::generate_character_ngrams(strings, ngrams, cudf::get_default_stream(), mr);
}

/**
 * @brief Public entry point for hashing the character ngrams of each string
 *
 * Forwards to detail::hash_character_ngrams using cudf's default stream.
 *
 * @param strings Strings column to produce ngrams from
 * @param ngrams Number of characters in each ngram
 * @param mr Device memory resource passed through to the detail implementation
 * @return The column produced by the detail implementation
 */
std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view const& strings,
cudf::size_type ngrams,
rmm::mr::device_memory_resource* mr)
{
// presumably opens an NVTX profiling range for this function -- the macro is defined elsewhere
CUDF_FUNC_RANGE();
return detail::hash_character_ngrams(strings, ngrams, cudf::get_default_stream(), mr);
}

} // namespace nvtext
33 changes: 33 additions & 0 deletions cpp/tests/text/ngrams_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,4 +132,37 @@ TEST_F(TextGenerateNgramsTest, Errors)
cudf::logic_error);
}

// Checks the hash values produced by nvtext::hash_character_ngrams when it is called
// without an explicit ngrams argument on two non-null strings.
TEST_F(TextGenerateNgramsTest, NgramsHash)
{
auto input =
cudf::test::strings_column_wrapper({"the quick brown fox", "jumped over the lazy dog."});

auto view = cudf::strings_column_view(input);
// ngrams argument omitted, so the API's default value is used
auto results = nvtext::hash_character_ngrams(view);

// One list of hashes per input row: 15 values for the 19-character first string and
// 21 values for the 25-character second string (length - 4 values per row).
using LCW = cudf::test::lists_column_wrapper<uint32_t>;
// clang-format off
LCW expected({LCW{2169381797u, 3924065905u, 1634753325u, 3766025829u, 771291085u,
2286480985u, 2815102125u, 2383213292u, 1587939873u, 3417728802u,
741580288u, 1721912990u, 3322339040u, 2530504717u, 1448945146u},
LCW{3542029734u, 2351937583u, 2373822151u, 2610417165u, 1303810911u,
2541942822u, 1736466351u, 3466558519u, 408633648u, 1698719372u,
620653030u, 16851044u, 608863326u, 948572753u, 3672211877u,
4097451013u, 1444462157u, 3762829398u, 743082018u, 2953783152u,
2319357747u}});
// clang-format on
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}

// Verifies that hash_character_ngrams throws cudf::logic_error for an invalid ngrams
// value and for input whose strings are too short to yield any ngram.
TEST_F(TextGenerateNgramsTest, NgramsHashErrors)
{
  auto single_chars = cudf::test::strings_column_wrapper({"1", "2", "3"});
  auto strings_view = cudf::strings_column_view(single_chars);

  // an ngrams value of 1 is rejected as an invalid parameter
  EXPECT_THROW(nvtext::hash_character_ngrams(strings_view, 1), cudf::logic_error);
  // with the default ngrams value, one-character strings cannot produce any ngram
  EXPECT_THROW(nvtext::hash_character_ngrams(strings_view), cudf::logic_error);
}

CUDF_TEST_PROGRAM_MAIN()
Loading