Skip to content

Commit

Permalink
API for JSON unquoted whitespace normalization (#15033)
Browse files Browse the repository at this point in the history
This work is a follow-up to PR #14931, which provided a proof of concept for using an FST (finite-state transducer) to normalize unquoted whitespace. This PR implements the pre-processing FST in cuIO and adds a JSON reader option that must be set to true to invoke the normalizer.
Addresses feature request #14865

Authors:
  - Shruti Shivakumar (https://github.com/shrshi)
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Robert (Bobby) Evans (https://github.com/revans2)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Robert Maynard (https://github.com/robertmaynard)
  - Bradley Dice (https://github.com/bdice)

URL: #15033
  • Loading branch information
shrshi committed Mar 4, 2024
1 parent 0ff5a2c commit d158ccd
Show file tree
Hide file tree
Showing 11 changed files with 314 additions and 184 deletions.
2 changes: 1 addition & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -376,7 +376,7 @@ add_library(
src/io/functions.cpp
src/io/json/byte_range_info.cu
src/io/json/json_column.cu
src/io/json/json_quote_normalization.cu
src/io/json/json_normalization.cu
src/io/json/json_tree.cu
src/io/json/nested_json_gpu.cu
src/io/json/read_json.cu
Expand Down
10 changes: 10 additions & 0 deletions cpp/include/cudf/io/detail/json.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,4 +63,14 @@ rmm::device_uvector<char> normalize_single_quotes(rmm::device_uvector<char>&& in
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

/**
* @brief Normalize unquoted whitespace (space and tab characters) using FST
*
* Space and tab characters that are not enclosed in a double-quoted string are dropped from the
* input; every other character is copied to the output unchanged. A newline character ends any
* open string as far as the FST is concerned, so whitespace that follows a literal newline inside
* a quoted string is dropped as well.
*
* @param inbuf Input device buffer, taken by rvalue reference
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource to use for device memory allocation
* @returns Device buffer holding the normalized input
*/
rmm::device_uvector<char> normalize_whitespace(rmm::device_uvector<char>&& inbuf,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);
} // namespace cudf::io::json::detail
31 changes: 31 additions & 0 deletions cpp/include/cudf/io/json.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,9 @@ class json_reader_options {
// Normalize single quotes
bool _normalize_single_quotes = false;

// Normalize unquoted spaces and tabs
bool _normalize_whitespace = false;

// Whether to recover after an invalid JSON line
json_recovery_mode_t _recovery_mode = json_recovery_mode_t::FAIL;

Expand Down Expand Up @@ -265,6 +268,13 @@ class json_reader_options {
*/
bool is_enabled_normalize_single_quotes() const
{
  // Plain accessor for the `_normalize_single_quotes` option flag
  return _normalize_single_quotes;
}

/**
 * @brief Reports whether normalization of unquoted whitespace characters is enabled
 *
 * @returns true if the reader should normalize unquoted whitespace, false otherwise
 */
bool is_enabled_normalize_whitespace() const
{
  // Plain accessor for the `_normalize_whitespace` option flag
  return _normalize_whitespace;
}

/**
* @brief Queries the JSON reader's behavior on invalid JSON lines.
*
Expand Down Expand Up @@ -358,6 +368,14 @@ class json_reader_options {
*/
void enable_normalize_single_quotes(bool val)
{
  // Record the caller's choice in the `_normalize_single_quotes` option flag
  _normalize_single_quotes = val;
}

/**
 * @brief Turns normalization of unquoted whitespace on or off
 *
 * @param val true to have the reader normalize unquoted whitespace characters (spaces and
 * tabs), false to leave the input untouched by this step
 */
void enable_normalize_whitespace(bool val)
{
  // Record the caller's choice in the `_normalize_whitespace` option flag
  _normalize_whitespace = val;
}

/**
* @brief Specifies the JSON reader's behavior on invalid JSON lines.
*
Expand Down Expand Up @@ -533,6 +551,19 @@ class json_reader_options_builder {
return *this;
}

/**
* @brief Set whether the reader should normalize unquoted whitespace
*
* Writes the flag directly into the options object being built.
*
* @param val Boolean value to indicate whether the reader should normalize unquoted
* whitespace characters i.e. tabs and spaces
* @return this for chaining
*/
json_reader_options_builder& normalize_whitespace(bool val)
{
options._normalize_whitespace = val;
return *this;
}

/**
* @brief Specifies the JSON reader's behavior on invalid JSON lines.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,15 @@

namespace cudf::io::json {

using SymbolT = char;
using StateT = char;
// Type used to represent the atomic symbol type used within the finite-state machine
using SymbolT = char;
using StateT = char;

// Type sufficiently large to index symbols within the input and output (may be unsigned)
using SymbolOffsetT = uint32_t;

namespace normalize_quotes {

// Type sufficiently large to index symbols within the input and output (may be unsigned)
enum class dfa_states : StateT { TT_OOS = 0U, TT_DQS, TT_SQS, TT_DEC, TT_SEC, TT_NUM_STATES };
enum class dfa_symbol_group_id : uint32_t {
DOUBLE_QUOTE_CHAR, ///< Quote character SG: "
Expand Down Expand Up @@ -172,6 +174,116 @@ struct TransduceToNormalizedQuotes {

} // namespace normalize_quotes

namespace normalize_whitespace {

// Symbol groups (SG) distinguished by the whitespace-normalization DFA. The enumerator values
// are consecutive and start at zero, so NUM_SYMBOL_GROUPS equals the number of groups before it.
enum class dfa_symbol_group_id : uint32_t {
  DOUBLE_QUOTE_CHAR  = 0,  ///< Quote character SG: "
  ESCAPE_CHAR        = 1,  ///< Escape character SG: '\\'
  NEWLINE_CHAR       = 2,  ///< Newline character SG: '\n'
  WHITESPACE_SYMBOLS = 3,  ///< Whitespace characters SG: '\t' or ' '
  OTHER_SYMBOLS      = 4,  ///< SG implicitly matching all other characters
  NUM_SYMBOL_GROUPS  = 5   ///< Total number of symbol groups
};
// Number of symbol groups as a plain integer, usable as an array extent
constexpr auto NUM_SYMBOL_GROUPS = static_cast<uint32_t>(dfa_symbol_group_id::NUM_SYMBOL_GROUPS);
// The i-th entry lists all the characters of the symbol group with id i, in the order of
// dfa_symbol_group_id. There are only NUM_SYMBOL_GROUPS - 1 entries because OTHER_SYMBOLS
// has no character list of its own: it matches every character not listed here.
std::array<std::vector<SymbolT>, NUM_SYMBOL_GROUPS - 1> const wna_sgs{
{{'"'}, {'\\'}, {'\n'}, {' ', '\t'}}};

/**
* -------- FST states ---------
* -----------------------------
* TT_OOS | Out-of-string state handling whitespace and non-whitespace chars outside double
* | quotes as well as any other character not enclosed by a string. Also handles
* | newline character present within a string
* TT_DQS | Double-quoted string state handling all characters within double quotes except
* | newline character
* TT_DEC | State handling escaped characters inside double-quoted string. Note that this
* | state is necessary to process escaped double-quote characters. Without this
* | state, whitespaces following escaped double quotes inside strings may be removed.
*
* NOTE: An important case NOT handled by this FST is that of whitespace following newline
* characters within a string. Consider the following example
* Input: {"a":"x\n y"}
* FST output: {"a":"x\ny"}
* Expected output: {"a":"x\n y"}
* Such strings are not part of the JSON standard (characters allowed within quotes should
* have ASCII at least 0x20 i.e. space character and above) but may be encountered while
* reading JSON files
*/
enum class dfa_states : StateT { TT_OOS = 0U, TT_DQS, TT_DEC, TT_NUM_STATES };
// Aliases for readability of the transition table
constexpr auto TT_OOS = dfa_states::TT_OOS;
constexpr auto TT_DQS = dfa_states::TT_DQS;
constexpr auto TT_DEC = dfa_states::TT_DEC;
constexpr auto TT_NUM_STATES = static_cast<StateT>(dfa_states::TT_NUM_STATES);

// Transition table: one row per current state and one column per symbol group, with the columns
// in the order of dfa_symbol_group_id. Each entry is the state entered after reading a symbol
// of that group.
//  - TT_OOS: only a double quote changes the state (it opens a string).
//  - TT_DQS: a double quote or a newline goes back to TT_OOS; a backslash enters TT_DEC.
//  - TT_DEC: whatever symbol is read, the next state is TT_DQS again.
std::array<std::array<dfa_states, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const wna_state_tt{
{/* IN_STATE " \ \n <SPC> OTHER */
/* TT_OOS */ {{TT_DQS, TT_OOS, TT_OOS, TT_OOS, TT_OOS}},
/* TT_DQS */ {{TT_OOS, TT_DEC, TT_OOS, TT_DQS, TT_DQS}},
/* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_DQS, TT_DQS}}}};

// The DFA's starting state (the out-of-string state)
constexpr StateT start_state = static_cast<StateT>(TT_OOS);

/**
 * @brief Translation functor of the whitespace-normalization FST.
 *
 * The three-argument overload reports how many symbols a transition emits; the four-argument
 * overload produces those symbols.
 */
struct TransduceToNormalizedWS {
  /**
   * @brief Returns the <relative_offset>-th output symbol on the transition (state_id, match_id).
   *
   * Translation rules:
   *
   *   state | read symbol                 | output symbol
   *   ------+-----------------------------+------------------
   *   DQS   | any                         | same as read
   *   DEC   | any                         | same as read
   *   OOS   | any except space and tab    | same as read
   *   OOS   | space or tab                | <nop>
   *
   * The last row never reaches this overload: the counting overload returns zero for it, so
   * there is no output symbol to ask for. Every remaining case is the identity, hence no
   * condition is evaluated here.
   */
  template <typename StateIdT, typename SymbolGroupIdT, typename RelativeOffsetT, typename CharT>
  constexpr CUDF_HOST_DEVICE CharT operator()(StateIdT const state_id,
                                              SymbolGroupIdT const match_id,
                                              RelativeOffsetT const relative_offset,
                                              CharT const read_symbol) const
  {
    // Pass the input symbol through unchanged
    return read_symbol;
  }

  /**
   * @brief Returns the number of output characters for a given transition.
   *
   * Exactly one character (the input character itself) is emitted, except for a space or tab
   * read in the out-of-string state, which is removed and therefore emits nothing.
   */
  template <typename StateIdT, typename SymbolGroupIdT, typename CharT>
  constexpr CUDF_HOST_DEVICE uint32_t operator()(StateIdT const state_id,
                                                 SymbolGroupIdT const match_id,
                                                 CharT const read_symbol) const
  {
    // Anything that is not a space or tab is always kept
    if (match_id != static_cast<SymbolGroupIdT>(dfa_symbol_group_id::WHITESPACE_SYMBOLS)) {
      return 1;
    }
    // A space or tab is kept only when it is not read in the out-of-string state
    if (state_id != static_cast<StateIdT>(dfa_states::TT_OOS)) { return 1; }
    // Unquoted space or tab: emit nothing
    return 0;
  }
};

} // namespace normalize_whitespace

namespace detail {

rmm::device_uvector<SymbolT> normalize_single_quotes(rmm::device_uvector<SymbolT>&& inbuf,
Expand All @@ -198,5 +310,29 @@ rmm::device_uvector<SymbolT> normalize_single_quotes(rmm::device_uvector<SymbolT
return outbuf;
}

/**
 * @brief Runs the whitespace-normalization FST over a device buffer.
 *
 * @param inbuf Input device buffer; it is only read by this function
 * @param stream CUDA stream passed to the FST and used for the buffer operations
 * @param mr Device memory resource used for the output buffer and the output-size scalar
 * @returns Device buffer holding the FST output, resized to the number of symbols written
 */
rmm::device_uvector<SymbolT> normalize_whitespace(rmm::device_uvector<SymbolT>&& inbuf,
                                                  rmm::cuda_stream_view stream,
                                                  rmm::mr::device_memory_resource* mr)
{
  auto const num_input_symbols = static_cast<SymbolOffsetT>(inbuf.size());

  // Assemble the FST from the symbol groups, the transition table and the translation functor
  auto whitespace_fst = fst::detail::make_fst(
    fst::detail::make_symbol_group_lut(normalize_whitespace::wna_sgs),
    fst::detail::make_transition_table(normalize_whitespace::wna_state_tt),
    fst::detail::make_translation_functor(normalize_whitespace::TransduceToNormalizedWS{}),
    stream);

  // Each input symbol yields at most one output symbol, so the input size is an upper bound
  rmm::device_uvector<SymbolT> normalized(inbuf.size(), stream, mr);
  rmm::device_scalar<SymbolOffsetT> num_output_symbols(stream, mr);

  // Output indices are not needed, hence the discard iterator
  whitespace_fst.Transduce(inbuf.data(),
                           num_input_symbols,
                           normalized.data(),
                           thrust::make_discard_iterator(),
                           num_output_symbols.data(),
                           normalize_whitespace::start_state,
                           stream);

  // Shrink to the number of symbols actually written by the FST
  normalized.resize(num_output_symbols.value(stream), stream);
  return normalized;
}

} // namespace detail
} // namespace cudf::io::json
7 changes: 7 additions & 0 deletions cpp/src/io/json/read_json.cu
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,13 @@ table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
normalize_single_quotes(std::move(buffer), stream, rmm::mr::get_current_device_resource());
}

// If input JSON buffer has unquoted spaces and tabs and option to normalize whitespaces is
// enabled, invoke pre-processing FST
if (reader_opts.is_enabled_normalize_whitespace()) {
buffer =
normalize_whitespace(std::move(buffer), stream, rmm::mr::get_current_device_resource());
}

return device_parse_nested_json(buffer, reader_opts, stream, mr);
// For debug purposes, use host_parse_nested_json()
}
Expand Down

0 comments on commit d158ccd

Please sign in to comment.