API for JSON unquoted whitespace normalization (#15033)

This work is a follow-up to PR #14931 which provided a proof-of-concept for using an FST to normalize unquoted whitespace. This PR implements the pre-processing FST in cuIO and adds a JSON reader option that needs to be set to true to invoke the normalizer. Addresses feature request #14865. Authors: - Shruti Shivakumar (https://github.com/shrshi) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - Vukasin Milovanovic (https://github.com/vuule) - Robert Maynard (https://github.com/robertmaynard) - Bradley Dice (https://github.com/bdice) URL: #15033
rapidsai · Mar 4, 2024 · d158ccd · d158ccd
1 parent 0ff5a2c
commit d158ccd
Show file tree

Hide file tree

Showing 11 changed files with 314 additions and 184 deletions.
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -376,7 +376,7 @@ add_library(
   src/io/functions.cpp
   src/io/json/byte_range_info.cu
   src/io/json/json_column.cu
-  src/io/json/json_quote_normalization.cu
+  src/io/json/json_normalization.cu
   src/io/json/json_tree.cu
   src/io/json/nested_json_gpu.cu
   src/io/json/read_json.cu

diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp
@@ -63,4 +63,14 @@ rmm::device_uvector<char> normalize_single_quotes(rmm::device_uvector<char>&& in
                                                   rmm::cuda_stream_view stream,
                                                   rmm::mr::device_memory_resource* mr);
 
+/**
+ * @brief Normalize unquoted whitespace (space and tab characters) using FST
+ *
+ * Normalization here means removal: space and tab characters that are not enclosed in
+ * double quotes are dropped, every other character is copied to the output unchanged.
+ *
+ * @param inbuf Input device buffer
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource to use for device memory allocation
+ * @return Device buffer holding the normalized data
+ */
+rmm::device_uvector<char> normalize_whitespace(rmm::device_uvector<char>&& inbuf,
+                                               rmm::cuda_stream_view stream,
+                                               rmm::mr::device_memory_resource* mr);
 }  // namespace cudf::io::json::detail
diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
@@ -118,6 +118,9 @@ class json_reader_options {
   // Normalize single quotes
   bool _normalize_single_quotes = false;
 
+  // Normalize (remove) unquoted spaces and tabs; disabled by default
+  bool _normalize_whitespace = false;
+
   // Whether to recover after an invalid JSON line
   json_recovery_mode_t _recovery_mode = json_recovery_mode_t::FAIL;
 
@@ -265,6 +268,13 @@ class json_reader_options {
    */
   bool is_enabled_normalize_single_quotes() const { return _normalize_single_quotes; }
 
+  /**
+   * @brief Whether the reader should normalize unquoted whitespace characters
+   *
+   * The option is off unless it has been explicitly enabled.
+   *
+   * @returns true if the reader should normalize whitespace, false otherwise
+   */
+  bool is_enabled_normalize_whitespace() const { return _normalize_whitespace; }
+
   /**
    * @brief Queries the JSON reader's behavior on invalid JSON lines.
    *
@@ -358,6 +368,14 @@ class json_reader_options {
    */
   void enable_normalize_single_quotes(bool val) { _normalize_single_quotes = val; }
 
+  /**
+   * @brief Set whether the reader should enable normalization of unquoted whitespace
+   *
+   * When enabled, the reader pre-processes the input buffer with the whitespace
+   * normalization FST before the JSON is parsed.
+   *
+   * @param val Boolean value to indicate whether the reader should normalize unquoted whitespace
+   * characters i.e. tabs and spaces
+   */
+  void enable_normalize_whitespace(bool val) { _normalize_whitespace = val; }
+
   /**
    * @brief Specifies the JSON reader's behavior on invalid JSON lines.
    *
@@ -533,6 +551,19 @@ class json_reader_options_builder {
     return *this;
   }
 
+  /**
+   * @brief Set whether the reader should normalize unquoted whitespace
+   *
+   * Unquoted whitespace refers to space and tab characters that are not enclosed in
+   * double quotes.
+   *
+   * @param val Boolean value to indicate whether the reader should normalize unquoted
+   * whitespace
+   * @return this for chaining
+   */
+  json_reader_options_builder& normalize_whitespace(bool val)
+  {
+    options._normalize_whitespace = val;
+    return *this;
+  }
+
   /**
    * @brief Specifies the JSON reader's behavior on invalid JSON lines.
    *

diff --git a/cpp/src/io/json/json_quote_normalization.cu → cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_quote_normalization.cu → cpp/src/io/json/json_normalization.cu
@@ -32,13 +32,15 @@
 
 namespace cudf::io::json {
 
-using SymbolT       = char;
-using StateT        = char;
+// Type used to represent the atomic symbols processed by the finite-state machine
+using SymbolT = char;
+using StateT  = char;  // Underlying type of the dfa_states enumerations
+
+// Type sufficiently large to index symbols within the input and output (may be unsigned)
 using SymbolOffsetT = uint32_t;
 
 namespace normalize_quotes {
 
-// Type sufficiently large to index symbols within the input and output (may be unsigned)
 enum class dfa_states : StateT { TT_OOS = 0U, TT_DQS, TT_SQS, TT_DEC, TT_SEC, TT_NUM_STATES };
 enum class dfa_symbol_group_id : uint32_t {
   DOUBLE_QUOTE_CHAR,  ///< Quote character SG: "
@@ -172,6 +174,116 @@ struct TransduceToNormalizedQuotes {
 
 }  // namespace normalize_quotes
 
+namespace normalize_whitespace {
+
+enum class dfa_symbol_group_id : uint32_t {
+  DOUBLE_QUOTE_CHAR,   ///< Quote character SG: "
+  ESCAPE_CHAR,         ///< Escape character SG: '\\'
+  NEWLINE_CHAR,        ///< Newline character SG: '\n'
+  WHITESPACE_SYMBOLS,  ///< Whitespace characters SG: '\t' or ' '
+  OTHER_SYMBOLS,       ///< SG implicitly matching all other characters
+  NUM_SYMBOL_GROUPS    ///< Total number of symbol groups
+};
+// Number of symbol groups as a plain integer, used to size the tables below
+constexpr auto NUM_SYMBOL_GROUPS = static_cast<uint32_t>(dfa_symbol_group_id::NUM_SYMBOL_GROUPS);
+// The i-th entry lists the characters of the i-th symbol group; OTHER_SYMBOLS has no entry
+std::array<std::vector<SymbolT>, NUM_SYMBOL_GROUPS - 1> const wna_sgs{
+  {{'"'}, {'\\'}, {'\n'}, {' ', '\t'}}};
+
+/**
+ * -------- FST states ---------
+ * -----------------------------
+ * TT_OOS | Out-of-string state handling whitespace and non-whitespace chars outside double
+ *        |   quotes as well as any other character not enclosed by a string. Also handles
+ *        |   newline character present within a string
+ * TT_DQS | Double-quoted string state handling all characters within double quotes except
+ *        |   newline character
+ * TT_DEC | State handling escaped characters inside double-quoted string. Note that this
+ *        |   state is necessary to process escaped double-quote characters. Without this
+ *        |   state, whitespaces following escaped double quotes inside strings may be removed.
+ *
+ * NOTE: An important case NOT handled by this FST is that of whitespace following newline
+ * characters within a string. Consider the following example
+ * Input:           {"a":"x\n y"}
+ * FST output:      {"a":"x\ny"}
+ * Expected output: {"a":"x\n y"}
+ * (In this example \n denotes a raw newline character, not the two-character escape
+ * sequence; an escape sequence is consumed via TT_DEC and leaves the FST in TT_DQS.)
+ * The space is dropped because a newline read in TT_DQS moves the FST back to TT_OOS,
+ * where space and tab characters are removed.
+ * Such strings are not part of the JSON standard (characters allowed within quotes should
+ * have ASCII at least 0x20 i.e. space character and above) but may be encountered while
+ * reading JSON files
+ */
+enum class dfa_states : StateT { TT_OOS = 0U, TT_DQS, TT_DEC, TT_NUM_STATES };
+// Aliases for readability of the transition table
+constexpr auto TT_OOS        = dfa_states::TT_OOS;
+constexpr auto TT_DQS        = dfa_states::TT_DQS;
+constexpr auto TT_DEC        = dfa_states::TT_DEC;
+constexpr auto TT_NUM_STATES = static_cast<StateT>(dfa_states::TT_NUM_STATES);
+
+// Transition table: one row per current state, one column per symbol group of the read symbol
+std::array<std::array<dfa_states, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const wna_state_tt{
+  {/* IN_STATE      "       \       \n    <SPC>   OTHER  */
+   /* TT_OOS */ {{TT_DQS, TT_OOS, TT_OOS, TT_OOS, TT_OOS}},  // only a quote leaves TT_OOS
+   /* TT_DQS */ {{TT_OOS, TT_DEC, TT_OOS, TT_DQS, TT_DQS}},  // quote or newline ends the string
+   /* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_DQS, TT_DQS}}}};  // any symbol returns to TT_DQS
+
+// The DFA's starting state: processing begins outside of any string
+constexpr StateT start_state = static_cast<StateT>(TT_OOS);
+
+struct TransduceToNormalizedWS {  // FST translation functor: drops unquoted spaces and tabs
+  /**
+   * @brief Returns the <relative_offset>-th output symbol on the transition (state_id, match_id).
+   *
+   * @param state_id State identifying the transition (not inspected by this overload)
+   * @param match_id Symbol group identifying the transition (not inspected by this overload)
+   * @param relative_offset Index of the requested output symbol (not inspected by this overload)
+   * @param read_symbol The input symbol being processed
+   * @return The input symbol itself, unchanged
+   */
+  template <typename StateT, typename SymbolGroupT, typename RelativeOffsetT, typename SymbolT>
+  constexpr CUDF_HOST_DEVICE SymbolT operator()(StateT const state_id,
+                                                SymbolGroupT const match_id,
+                                                RelativeOffsetT const relative_offset,
+                                                SymbolT const read_symbol) const
+  {
+    // -------- TRANSLATION TABLE ------------
+    //      Let the alphabet set be Sigma
+    // ---------------------------------------
+    // ---------- NON-SPECIAL CASES: ----------
+    //      Output symbol same as input symbol <s>
+    // state | read_symbol <s>  -> output_symbol <s>
+    // DQS   | Sigma            -> Sigma
+    // OOS   | Sigma\{<SPC>,\t} -> Sigma\{<SPC>,\t}
+    // DEC   | Sigma            -> Sigma
+    // ---------- SPECIAL CASES: --------------
+    //    Input symbol translates to output symbol
+    // OOS   | {<SPC>}          -> <nop>
+    // OOS   | {\t}             -> <nop>
+
+    // Case when read symbol is a space or tab but is unquoted
+    // This will be the same condition as in `operator()(state_id, match_id, read_symbol)` function
+    // However, since there is no output in this case i.e. the count returned by
+    // operator()(state_id, match_id, read_symbol) is zero, this function is never called.
+    // So skipping the check for this case.
+
+    // In all other cases, we have an output symbol for the input symbol.
+    // We simply output the input symbol
+    return read_symbol;
+  }
+
+  /**
+   * @brief Returns the number of output characters for a given transition.
+   * During whitespace normalization, we always emit one output character i.e., the input
+   * character, except when we need to remove the space/tab character
+   *
+   * @param state_id State identifying the transition
+   * @param match_id Symbol group identifying the transition
+   * @param read_symbol The input symbol being processed (not inspected by this overload)
+   * @return 0 if the symbol group is WHITESPACE_SYMBOLS and the state is TT_OOS, 1 otherwise
+   */
+  template <typename StateT, typename SymbolGroupT, typename SymbolT>
+  constexpr CUDF_HOST_DEVICE uint32_t operator()(StateT const state_id,
+                                                 SymbolGroupT const match_id,
+                                                 SymbolT const read_symbol) const
+  {
+    // Case when read symbol is a space or tab but is unquoted
+    if (match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::WHITESPACE_SYMBOLS) &&
+        state_id == static_cast<StateT>(dfa_states::TT_OOS)) {
+      return 0;
+    }
+    return 1;
+  }
+};
+
+}  // namespace normalize_whitespace
+
 namespace detail {
 
 rmm::device_uvector<SymbolT> normalize_single_quotes(rmm::device_uvector<SymbolT>&& inbuf,
@@ -198,5 +310,29 @@ rmm::device_uvector<SymbolT> normalize_single_quotes(rmm::device_uvector<SymbolT
   return outbuf;
 }
 
+rmm::device_uvector<SymbolT> normalize_whitespace(rmm::device_uvector<SymbolT>&& inbuf,
+                                                  rmm::cuda_stream_view stream,
+                                                  rmm::mr::device_memory_resource* mr)
+{  // Runs the whitespace-normalization FST over inbuf and returns the transduced buffer
+  auto parser = fst::detail::make_fst(
+    fst::detail::make_symbol_group_lut(normalize_whitespace::wna_sgs),
+    fst::detail::make_transition_table(normalize_whitespace::wna_state_tt),
+    fst::detail::make_translation_functor(normalize_whitespace::TransduceToNormalizedWS{}),
+    stream);
+
+  rmm::device_uvector<SymbolT> outbuf(inbuf.size(), stream, mr);  // worst case: nothing removed
+  rmm::device_scalar<SymbolOffsetT> outbuf_size(stream, mr);
+  parser.Transduce(inbuf.data(),
+                   static_cast<SymbolOffsetT>(inbuf.size()),
+                   outbuf.data(),
+                   thrust::make_discard_iterator(),  // values written here are discarded
+                   outbuf_size.data(),
+                   normalize_whitespace::start_state,
+                   stream);
+
+  outbuf.resize(outbuf_size.value(stream), stream);  // trim to the reported output size
+  return outbuf;
+}
+
 }  // namespace detail
 }  // namespace cudf::io::json
diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
@@ -235,6 +235,13 @@ table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
       normalize_single_quotes(std::move(buffer), stream, rmm::mr::get_current_device_resource());
   }
 
+  // If input JSON buffer has unquoted spaces and tabs and option to normalize whitespaces is
+  // enabled, invoke pre-processing FST
+  if (reader_opts.is_enabled_normalize_whitespace()) {
+    buffer =
+      normalize_whitespace(std::move(buffer), stream, rmm::mr::get_current_device_resource());
+  }
+
   return device_parse_nested_json(buffer, reader_opts, stream, mr);
   // For debug purposes, use host_parse_nested_json()
 }