[REVIEW] Port of clx subword tokenizer to cudf #5511

Merged Jul 9, 2020

Changes from all commits (68 commits)
650b730
initial port of clx subword tokenizer
davidwendt Jun 18, 2020
d2f0028
Merge branch 'branch-0.15' into port-subword-tokenizer
davidwendt Jun 22, 2020
b8dbc5f
update changelog
davidwendt Jun 22, 2020
6b12e6c
fix style violations
davidwendt Jun 22, 2020
5a9a7a0
move some source to details
davidwendt Jun 22, 2020
cbf6118
Merge branch 'branch-0.15' into port-subword-tokenizer
davidwendt Jun 22, 2020
0bf1527
pass stream down
davidwendt Jun 22, 2020
4f4c92d
fix kernel name
davidwendt Jun 23, 2020
aa7277d
Merge branch 'branch-0.15' into port-subword-tokenizer
davidwendt Jun 23, 2020
431fc56
refactor 3 tokenizer classes to one
davidwendt Jun 23, 2020
c0e4d8c
rename basic-tokenizer to normalizer
davidwendt Jun 23, 2020
e759147
rename full-tokenizer to wordpiece-tokenizer
davidwendt Jun 23, 2020
17b7c7a
meant to rename to data-normalizer
davidwendt Jun 23, 2020
14ec5a9
create bigger test data
davidwendt Jun 23, 2020
4968b50
Merge branch 'branch-0.15' into port-subword-tokenizer
davidwendt Jun 23, 2020
eccaa2a
rename TokenizerResult to tokenizer_result
davidwendt Jun 23, 2020
5aa3ef5
add cython for subword_tokenizer
davidwendt Jun 23, 2020
c689f91
declare a cython interface
davidwendt Jun 23, 2020
ad2aa6f
Merge branch 'branch-0.15' into port-subword-tokenizer
davidwendt Jun 23, 2020
650d0c1
fix style violation
davidwendt Jun 24, 2020
1c66503
Merge branch 'branch-0.15' into port-subword-tokenizer
davidwendt Jun 24, 2020
01c468e
return columns instead of device-buffers
davidwendt Jun 24, 2020
94fe0ba
update cython/python for new return type
davidwendt Jun 24, 2020
bb0de69
move kernels to eliminate a header file
davidwendt Jun 24, 2020
a1b542d
add consts to various declarations
davidwendt Jun 24, 2020
27a34e8
Merge branch 'branch-0.15' into port-subword-tokenizer
davidwendt Jun 24, 2020
dad537c
rename tokenIDS to token_ids
davidwendt Jun 24, 2020
9a9576e
fix style violation
davidwendt Jun 24, 2020
8af4a0e
add some nvtx ranges
davidwendt Jun 24, 2020
d57a497
fix some comments; add some TODOs
davidwendt Jun 25, 2020
b677996
reduce cp-data to 1MB header
davidwendt Jun 26, 2020
9a59675
Merge branch 'branch-0.15' into port-subword-tokenizer
davidwendt Jun 26, 2020
f31bcf2
try thrust in place of cub in normalize
davidwendt Jun 26, 2020
fecd5e5
add load-hashed-vocab api declaration
davidwendt Jun 26, 2020
676d39d
use thrust for internal update function
davidwendt Jun 26, 2020
26ddf70
add load-hash-vocab api
davidwendt Jun 26, 2020
d3e0120
Merge branch 'branch-0.15' into port-subword-tokenizer
davidwendt Jun 28, 2020
b04cf58
move per-context-cache utility to detail header
davidwendt Jun 29, 2020
4241b3e
make cp/aux tables singletons
davidwendt Jun 29, 2020
a5477e4
change device_vector to uvector
davidwendt Jun 29, 2020
14694ac
compute row2log values in device code
davidwendt Jun 29, 2020
36462a2
add more doxygen; removed commented out code
davidwendt Jun 29, 2020
b8f3c40
add more consts
davidwendt Jun 29, 2020
8685412
Merge branch 'branch-0.15' into port-subword-tokenizer
davidwendt Jun 29, 2020
a4c5ef7
Merge branch 'branch-0.15' into port-subword-tokenizer
davidwendt Jun 30, 2020
e5934cb
Merge branch 'branch-0.15' into port-subword-tokenizer
davidwendt Jun 30, 2020
e36346e
decl types in python subword_tokenize def
davidwendt Jun 30, 2020
61de291
Merge branch 'branch-0.15' into port-subword-tokenizer
davidwendt Jul 2, 2020
d516907
add pytest
davidwendt Jul 6, 2020
c12de54
fix copydoc
davidwendt Jul 6, 2020
6aa298a
use std::generate in place of for
davidwendt Jul 6, 2020
cc6f31f
remove forceinline decl
davidwendt Jul 6, 2020
15ac715
change define to constexpr
davidwendt Jul 6, 2020
aea146c
fix doxygen param order
davidwendt Jul 6, 2020
15c3c77
change log to tensor
davidwendt Jul 6, 2020
47a4af7
use grid_1d for kernel launch parms
davidwendt Jul 6, 2020
3ac75bc
Merge branch 'branch-0.15' into port-subword-tokenizer
davidwendt Jul 6, 2020
107a042
add more @params
davidwendt Jul 7, 2020
6b64fbe
add comments about hashed vocab values
davidwendt Jul 7, 2020
fa49311
add more comments explaining numbers used
davidwendt Jul 7, 2020
0357beb
add more consts
davidwendt Jul 7, 2020
aaaf986
remove commented out code
davidwendt Jul 7, 2020
f1ab8da
minor fixes like east consts
davidwendt Jul 7, 2020
e022568
add more gtests varying stride and do_truncate parms
davidwendt Jul 7, 2020
5ac2faf
rework tensor-output kernel to use fixed block-size
davidwendt Jul 7, 2020
b4bff95
Merge branch 'branch-0.15' into port-subword-tokenizer
davidwendt Jul 7, 2020
91bd0e2
Merge branch 'branch-0.15' into port-subword-tokenizer
davidwendt Jul 8, 2020
08a33ed
return cupy arrays instead of Series
davidwendt Jul 8, 2020
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -28,6 +28,7 @@
- PR #5488 Add plumbings for `.str.replace_tokens`
- PR #5502 Add Unsigned int types support in dlpack
- PR #5497 Add `.str.isinteger` & `.str.isfloat`
- PR #5511 Port of clx subword tokenizer to cudf
- PR #5528 Add unsigned int reading and writing support to parquet
- PR #5510 Add support for `cudf.Index` to create Indexes
- PR #5536 Parquet reader - add support for multiple sources
4 changes: 4 additions & 0 deletions cpp/CMakeLists.txt
@@ -566,6 +566,10 @@ add_library(cudf
src/text/tokenize.cu
src/text/ngrams_tokenize.cu
src/text/replace.cu
src/text/subword/load_hash_file.cu
src/text/subword/data_normalizer.cu
src/text/subword/wordpiece_tokenizer.cu
src/text/subword/subword_tokenize.cu
src/scalar/scalar.cpp
src/scalar/scalar_factories.cpp
src/dictionary/add_keys.cu
8 changes: 8 additions & 0 deletions cpp/benchmarks/CMakeLists.txt
@@ -213,3 +213,11 @@ set(CSV_WRITER_BENCH_SRC
"${CMAKE_CURRENT_SOURCE_DIR}/io/csv/csv_writer_benchmark.cpp")

ConfigureBench(CSV_WRITER_BENCH "${CSV_WRITER_BENCH_SRC}")

###################################################################################################
# - subword tokenizer benchmark -------------------------------------------------------------------

set(SUBWORD_TOKENIZER_BENCH_SRC
"${CMAKE_CURRENT_SOURCE_DIR}/text/subword_benchmark.cpp")

ConfigureBench(SUBWORD_TOKENIZER_BENCH "${SUBWORD_TOKENIZER_BENCH_SRC}")
81 changes: 81 additions & 0 deletions cpp/benchmarks/text/subword_benchmark.cpp
@@ -0,0 +1,81 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmark/benchmark.h>
#include <cudf/strings/strings_column_view.hpp>
#include <nvtext/subword_tokenize.hpp>

#include <tests/utilities/column_utilities.hpp>
#include <tests/utilities/column_wrapper.hpp>

#include <fstream>
#include <iostream>
#include <vector>

#define MAX_NUM_SENTENCES 101
#define MAX_NUM_CHARS 150000
#define MAX_ROWS_TENSOR 300

static std::string create_hash_vocab_file()
{
std::string dir_template("/tmp");
if (const char* env_p = std::getenv("WORKSPACE")) dir_template = env_p;
std::string hash_file = dir_template + "/hash_vocab.txt";
// create a fake hashed vocab text file for this test
// this only works with words in the strings in the benchmark code below
std::vector<std::pair<int, int>> coefficients(23, {65559, 0});
std::ofstream outfile(hash_file, std::ofstream::out);
outfile << "1\n0\n" << coefficients.size() << "\n";
for (auto c : coefficients) outfile << c.first << " " << c.second << "\n";
std::vector<uint64_t> hash_table(23, 0);
outfile << hash_table.size() << "\n";
hash_table[0] = 3015668L;
hash_table[1] = 6205475701751155871L;
hash_table[5] = 6358029;
hash_table[16] = 451412625363L;
hash_table[20] = 6206321707968235495L;
for (auto h : hash_table) outfile << h << "\n";
outfile << "100\n101\n102\n\n";
return hash_file;
}

static void BM_cuda_tokenizer_cudf(benchmark::State& state)
{
uint32_t nrows = MAX_NUM_SENTENCES - 1;
std::vector<const char*> h_strings(nrows, "This is a test ");
cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end());
// cudf::test::strings_column_wrapper strings{"This is a test."};
std::string hash_file = create_hash_vocab_file();
std::vector<uint32_t> offsets{14};
uint32_t max_sequence_length = 64;
uint32_t stride = 48;
uint32_t do_truncate = 0;
uint32_t do_lower = 1;
for (auto _ : state) {
auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings},
hash_file,
max_sequence_length,
stride,
do_lower,
do_truncate,
MAX_NUM_SENTENCES,
MAX_NUM_CHARS,
MAX_ROWS_TENSOR);
}
}
BENCHMARK(BM_cuda_tokenizer_cudf);

BENCHMARK_MAIN();
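As a reviewer aid, here is a hedged sketch of how the benchmark above could be parameterized over the input row count using Google Benchmark's argument mechanism. The function name and the max_* sizing arithmetic are illustrative assumptions only and are not part of this PR's diff; it reuses create_hash_vocab_file from the file above.

// Hypothetical variant: vary the number of input rows via benchmark arguments.
// The sizing below assumes the same 15-character test string used above, so each
// row produces a single tensor row when do_truncate is false.
static void BM_cuda_tokenizer_cudf_rows(benchmark::State& state)
{
  auto const nrows = static_cast<uint32_t>(state.range(0));
  std::vector<const char*> h_strings(nrows, "This is a test ");
  cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end());
  std::string hash_file = create_hash_vocab_file();
  for (auto _ : state) {
    auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings},
                                           hash_file,
                                           64,          // max_sequence_length
                                           48,          // stride
                                           true,        // do_lower_case
                                           false,       // do_truncate
                                           nrows + 1,   // max_num_strings
                                           nrows * 16,  // max_num_chars
                                           nrows + 1);  // max_rows_tensor
  }
}
BENCHMARK(BM_cuda_tokenizer_cudf_rows)->Arg(100)->Arg(1000);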
43 changes: 43 additions & 0 deletions cpp/include/cudf/strings/detail/utilities.cuh
@@ -21,6 +21,8 @@

#include <rmm/thrust_rmm_allocator.h>
#include <thrust/scan.h>
#include <mutex>
#include <unordered_map>

namespace cudf {
namespace strings {
@@ -60,6 +62,47 @@ std::unique_ptr<column> make_offsets_child_column(
return offsets_column;
}

// This template is a thin wrapper around per-context singleton objects.
// It maintains a single object for each CUDA context.
template <typename TableType>
class per_context_cache {
public:
// Find the object cached for the current CUDA context.
// If there is no object available in the cache, the initializer `init` is
// called to create a new one and cache it for later use.
template <typename Initializer>
TableType* find_or_initialize(const Initializer& init)
{
CUcontext c;
cuCtxGetCurrent(&c);
auto finder = cache_.find(c);
if (finder == cache_.end()) {
TableType* result = init();
cache_[c] = result;
return result;
} else
return finder->second;
}

private:
std::unordered_map<CUcontext, TableType*> cache_;
};

// This template is a thread-safe version of per_context_cache.
template <typename TableType>
class thread_safe_per_context_cache : public per_context_cache<TableType> {
public:
template <typename Initializer>
TableType* find_or_initialize(const Initializer& init)
{
std::lock_guard<std::mutex> guard(mutex);
return per_context_cache<TableType>::find_or_initialize(init);
}

private:
std::mutex mutex;
};

} // namespace detail
} // namespace strings
} // namespace cudf
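To show how the cache templates above are meant to be consumed, here is a minimal usage sketch; `aux_table` and `load_aux_table()` are hypothetical placeholders and not part of this PR.

#include <cudf/strings/detail/utilities.cuh>

// Hypothetical device-resident table with an expensive one-time initializer.
struct aux_table {
  // device pointers, sizes, etc.
};
aux_table* load_aux_table();  // expensive: file I/O plus host-to-device copies

aux_table* get_aux_table()
{
  // One cached instance per CUDA context; the initializer lambda runs only the
  // first time this is called for a given context, guarded by a mutex.
  static cudf::strings::detail::thread_safe_per_context_cache<aux_table> cache;
  return cache.find_or_initialize([] { return load_aux_table(); });
}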
45 changes: 45 additions & 0 deletions cpp/include/nvtext/detail/load_hash_file.hpp
@@ -0,0 +1,45 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <cudf/column/column.hpp>
#include <nvtext/subword_tokenize.hpp>

#include <stdint.h>
#include <string.h>

namespace nvtext {
namespace detail {

/**
* @brief Load the hashed vocabulary file into device memory.
*
* The returned object can be used to call subword_tokenize without
* incurring the cost of loading the same file each time.
*
* @param filename_hashed_vocabulary A path to the preprocessed vocab.txt file.
* Note that this is the file AFTER python/perfect_hash.py has been used
* for preprocessing.
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Memory resource to allocate any returned objects.
* @return vocabulary hash-table elements
*/
hashed_vocabulary load_vocabulary_file(std::string const& filename_hashed_vocabulary,
cudaStream_t stream,
rmm::mr::device_memory_resource* mr);

} // namespace detail
} // namespace nvtext
174 changes: 174 additions & 0 deletions cpp/include/nvtext/subword_tokenize.hpp
@@ -0,0 +1,174 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <stdint.h>
#include <string.h>

namespace nvtext {

/**
* @brief The vocabulary data for use with the subword_tokenize function.
*/
struct hashed_vocabulary {
uint16_t first_token_id{};
uint16_t separator_token_id{};
uint16_t unknown_token_id{};
uint32_t outer_hash_a{};
uint32_t outer_hash_b{};
uint16_t num_bins{};
std::unique_ptr<cudf::column> table; // uint64
std::unique_ptr<cudf::column> bin_coefficients; // uint64
std::unique_ptr<cudf::column> bin_offsets; // uint16
};

/**
* @brief Load the hashed vocabulary file into device memory.
*
* The returned object can be used to call subword_tokenize without
* incurring the cost of loading the same file each time.
*
* @throw cudf::logic_error if the `filename_hashed_vocabulary` could not be opened.
*
* @param filename_hashed_vocabulary A path to the preprocessed vocab.txt file.
* Note that this is the file AFTER python/perfect_hash.py has been used
* for preprocessing.
* @param mr Memory resource to allocate any returned objects.
* @return vocabulary hash-table elements
*/
hashed_vocabulary load_vocabulary_file(
std::string const& filename_hashed_vocabulary,
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

/**
* @brief Result object for the subword_tokenize functions.
*/
struct tokenizer_result {
/**
* @brief The number of rows for the output token-ids.
*/
uint32_t nrows_tensor{};
/**
* @brief The number of token-ids in each row.
*/
uint32_t sequence_length{};
/**
* @brief A vector of token-ids for each row.
*
* The data is a flat matrix (nrows_tensor x sequence_length) of token-ids.
* This column is of type UINT32 with no null entries.
*/
std::unique_ptr<cudf::column> tensor_token_ids;
/**
* @brief This mask identifies which tensor-token-ids are valid.
*
* This column is of type UINT32 with no null entries.
*/
std::unique_ptr<cudf::column> tensor_attention_mask;
/**
* @brief The metadata for each tensor row.
*
* There are three elements per tensor row: [row-id, start_pos, stop_pos].
* This column is of type UINT32 with no null entries.
*/
std::unique_ptr<cudf::column> tensor_metadata;
};

/**
* @brief Creates a tokenizer that cleans the text, splits it into tokens and
* returns token-ids from an input vocabulary.
*
* The strings are first normalized by converting them to lower-case, removing
* punctuation, and replacing a select set of multi-byte characters and
* whitespace characters.
*
* The strings are then tokenized by using whitespace as a delimiter.
* Consecutive delimiters are ignored. Each token is then assigned
* a 4-byte token-id mapped from the provided vocabulary table.
*
* Essentially, each string is converted into one or more vectors of token-ids
* in the output column. The total number of these vectors multiplied by
* `max_sequence_length` is the size of the output column.
*
* @throw cudf::logic_error if `stride > max_sequence_length`
* @throw cudf::logic_error if `max_sequence_length * max_rows_tensor` is
* larger than the max value for cudf::size_type
*
* @param strings The input strings to tokenize.
* @param filename_hashed_vocabulary A path to the preprocessed vocab.txt file.
* Note that this is the file AFTER python/perfect_hash.py has been used
* for preprocessing.
* @param max_sequence_length Limit of the number of token-ids per row in final tensor
* for each string.
* @param stride Each row in the output token-ids will replicate `max_sequence_length - stride`
* of the token-ids from the previous row, unless it is the first string.
* @param do_lower_case If true, the tokenizer will convert uppercase characters in the
* input stream to lower-case and strip accents from those characters.
* If false, accented and uppercase characters are not transformed.
* @param do_truncate If true, the tokenizer will discard all the token-ids after
* `max_sequence_length` for each input string. If false, it will use a new row
* in the output token-ids to continue generating the output.
* @param max_num_strings Maximum number of input strings for instantiating the tokenizer.
* Used for allocating temporary working memory on the GPU.
* If the input contains a larger number of strings, behavior is undefined.
* @param max_num_chars Maximum number of characters for instantiating the tokenizer.
* Used for allocating temporary working memory on the GPU.
* If the input contains a larger number of characters, behavior is undefined.
* @param max_rows_tensor Maximum number of rows for the output token-ids expected
* to be generated by the tokenizer.
* Used for allocating temporary working memory on the GPU device.
* If the output generates a larger number of rows, behavior is undefined.
* @param mr Memory resource to allocate any returned objects.
* @return token-ids, attention-mask, and metadata
*/
tokenizer_result subword_tokenize(
cudf::strings_column_view const& strings,
std::string const& filename_hashed_vocabulary,
uint32_t max_sequence_length,
uint32_t stride,
bool do_lower_case,
bool do_truncate,
uint32_t max_num_strings,
uint32_t max_num_chars,
uint32_t max_rows_tensor,
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

/**
* @copydoc subword_tokenize()
*
* This function differs from the one above only in the hashed vocabulary parameter.
* The file can be pre-loaded using the @ref load_vocabulary_file API and then
* passed in place of the file name in a call to this API.
*
* @param vocabulary_table The vocabulary table pre-loaded into this object.
*/
tokenizer_result subword_tokenize(
cudf::strings_column_view const& strings,
hashed_vocabulary const& vocabulary_table,
uint32_t max_sequence_length,
uint32_t stride,
bool do_lower_case,
bool do_truncate,
uint32_t max_num_strings,
uint32_t max_num_chars,
uint32_t max_rows_tensor,
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

} // namespace nvtext
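For context when reviewing, a minimal end-to-end sketch of the two-step API declared above; the vocabulary path and the three max_* sizing values are illustrative assumptions, not values taken from this PR.

#include <cudf/strings/strings_column_view.hpp>
#include <nvtext/subword_tokenize.hpp>

nvtext::tokenizer_result tokenize_example(cudf::strings_column_view const& input)
{
  // Load the preprocessed (perfect-hash) vocabulary once and reuse it so the
  // file is not re-read on every tokenize call.
  auto vocabulary = nvtext::load_vocabulary_file("/tmp/hashed_vocab.txt");

  uint32_t const max_sequence_length = 64;
  uint32_t const stride              = 48;     // must not exceed max_sequence_length
  bool const do_lower_case           = true;
  bool const do_truncate             = false;
  auto const num_strings             = static_cast<uint32_t>(input.size());

  auto result = nvtext::subword_tokenize(input,
                                         vocabulary,
                                         max_sequence_length,
                                         stride,
                                         do_lower_case,
                                         do_truncate,
                                         num_strings + 1,   // max_num_strings
                                         1000000,           // max_num_chars
                                         num_strings * 2);  // max_rows_tensor
  // result.tensor_token_ids is a flat (nrows_tensor x sequence_length) UINT32 column:
  // the token-id for tensor row r, position s is at index r * sequence_length + s.
  // result.tensor_attention_mask flags the valid positions, and result.tensor_metadata
  // stores [row-id, start_pos, stop_pos] for each tensor row.
  return result;
}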