Skip to content

Commit

Permalink
ARROW-17847: [C++] Support unquoted decimal in JSON parser (apache#14242)
Browse files Browse the repository at this point in the history

Automatically support both quoted and unquoted decimal values in the JSON parser.

Merge conflicts to 8.0.0:
Replace std::string_view with util::string_view

Authored-by: Jin Shang <shangjin1997@gmail.com>
Signed-off-by: Antoine Pitrou <antoine@python.org>
  • Loading branch information
js8544 authored and stiga-huang committed Oct 11, 2022
1 parent 0d30a05 commit 4c46cb1
Show file tree
Hide file tree
Showing 5 changed files with 108 additions and 8 deletions.
42 changes: 35 additions & 7 deletions cpp/src/arrow/json/parser.cc
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,9 @@ static Status ParseError(T&&... t) {
}

/// \brief Return the lower-case name of a JSON value kind.
///
/// The lookup table is indexed directly by the numeric value of `kind`, so its
/// entries must stay in the same order as the enumerators of Kind::type;
/// "number_or_string" is the seventh entry, matching Kind::kNumberOrString.
///
/// \param kind the kind to name; used unchecked as an index into the table
/// \return a reference to a function-local static string
const std::string& Kind::Name(Kind::type kind) {
  // A single table only: declaring `names` twice in this scope is ill-formed.
  static const std::string names[] = {
      "null", "boolean", "number", "string", "array", "object", "number_or_string",
  };

  return names[kind];
}
Expand All @@ -71,14 +72,15 @@ const std::shared_ptr<const KeyValueMetadata>& Kind::Tag(Kind::type kind) {
key_value_metadata({{"json_kind", Kind::Name(Kind::kString)}}),
key_value_metadata({{"json_kind", Kind::Name(Kind::kArray)}}),
key_value_metadata({{"json_kind", Kind::Name(Kind::kObject)}}),
key_value_metadata({{"json_kind", Kind::Name(Kind::kNumberOrString)}}),
};
return tags[kind];
}

static arrow::internal::Trie MakeFromTagTrie() {
arrow::internal::TrieBuilder builder;
for (auto kind : {Kind::kNull, Kind::kBoolean, Kind::kNumber, Kind::kString,
Kind::kArray, Kind::kObject}) {
Kind::kArray, Kind::kObject, Kind::kNumberOrString}) {
DCHECK_OK(builder.Append(Kind::Name(kind)));
}
auto name_to_kind = builder.Finish();
Expand All @@ -104,7 +106,7 @@ Status Kind::ForType(const DataType& type, Kind::type* kind) {
// Each overload records, via SetKind, the JSON kind used for a given Arrow type.
// Binary, large-binary and timestamp types are recorded as Kind::kString.
Status Visit(const BinaryType&) { return SetKind(Kind::kString); }
Status Visit(const LargeBinaryType&) { return SetKind(Kind::kString); }
Status Visit(const TimestampType&) { return SetKind(Kind::kString); }
// NOTE(review): this listing shows diff lines without +/- markers; the hunk
// header (7 lines before, 7 after) suggests this FixedSizeBinaryType overload is
// the removed line and the DecimalType overload below replaces it -- confirm.
Status Visit(const FixedSizeBinaryType&) { return SetKind(Kind::kString); }
// Decimal types are recorded as Kind::kNumberOrString.
Status Visit(const DecimalType&) { return SetKind(Kind::kNumberOrString); }
// A dictionary type defers to the kind of its value type, written to kind_.
Status Visit(const DictionaryType& dict_type) {
return Kind::ForType(*dict_type.value_type(), kind_);
}
Expand Down Expand Up @@ -393,6 +395,12 @@ class RawArrayBuilder<Kind::kObject> {
TypedBufferBuilder<bool> null_bitmap_builder_;
};

// Raw builder specialization for Kind::kNumberOrString. It declares no members
// of its own: it derives publicly from ScalarBuilder and inherits that class's
// constructors.
template <>
class RawArrayBuilder<Kind::kNumberOrString> : public ScalarBuilder {
public:
// Inherit the ScalarBuilder constructors unchanged.
using ScalarBuilder::ScalarBuilder;
};

class RawBuilderSet {
public:
explicit RawBuilderSet(MemoryPool* pool) : pool_(pool) {}
Expand Down Expand Up @@ -432,6 +440,9 @@ class RawBuilderSet {
case Kind::kString:
return MakeBuilder<Kind::kString>(leading_nulls, builder);

case Kind::kNumberOrString:
return MakeBuilder<Kind::kNumberOrString>(leading_nulls, builder);

case Kind::kArray: {
RETURN_NOT_OK(MakeBuilder<Kind::kArray>(leading_nulls, builder));
const auto& list_type = checked_cast<const ListType&>(t);
Expand Down Expand Up @@ -493,6 +504,10 @@ class RawBuilderSet {
case Kind::kString:
return Cast<Kind::kString>(builder)->AppendNull();

case Kind::kNumberOrString: {
return Cast<Kind::kNumberOrString>(builder)->AppendNull();
}

case Kind::kArray:
return Cast<Kind::kArray>(builder)->AppendNull();

Expand All @@ -506,6 +521,7 @@ class RawBuilderSet {
}
return Status::OK();
}

default:
return Status::NotImplemented("invalid builder Kind");
}
Expand All @@ -532,6 +548,9 @@ class RawBuilderSet {
case Kind::kString:
return FinishScalar(scalar_values, Cast<Kind::kString>(builder), out);

case Kind::kNumberOrString:
return FinishScalar(scalar_values, Cast<Kind::kNumberOrString>(builder), out);

case Kind::kArray:
return Cast<Kind::kArray>(builder)->Finish(std::move(finish_children), out);

Expand Down Expand Up @@ -565,7 +584,8 @@ class RawBuilderSet {
std::vector<RawArrayBuilder<Kind::kNumber>>,
std::vector<RawArrayBuilder<Kind::kString>>,
std::vector<RawArrayBuilder<Kind::kArray>>,
std::vector<RawArrayBuilder<Kind::kObject>>>
std::vector<RawArrayBuilder<Kind::kObject>>,
std::vector<RawArrayBuilder<Kind::kNumberOrString>>>
arenas_;
};

Expand Down Expand Up @@ -612,12 +632,20 @@ class HandlerBase : public BlockParser,
}

/// Record a number token given as `size` bytes starting at `data`.
///
/// The text is appended exactly once: to the number-or-string builder when the
/// current builder has kind Kind::kNumberOrString, otherwise to the number
/// builder. The append result is stored in status_.
///
/// \return true if the append succeeded (status_.ok()), false otherwise
bool RawNumber(const char* data, rj::SizeType size, ...) {
  // Dispatch first; an unconditional kNumber append ahead of this branch would
  // append the value a second time and target the wrong builder kind.
  if (builder_.kind == Kind::kNumberOrString) {
    status_ = AppendScalar<Kind::kNumberOrString>(builder_, string_view(data, size));
  } else {
    status_ = AppendScalar<Kind::kNumber>(builder_, string_view(data, size));
  }
  return status_.ok();
}

/// Record a string token given as `size` bytes starting at `data`.
///
/// The text is appended exactly once: to the number-or-string builder when the
/// current builder has kind Kind::kNumberOrString, otherwise to the string
/// builder. The append result is stored in status_.
///
/// \return true if the append succeeded (status_.ok()), false otherwise
bool String(const char* data, rj::SizeType size, ...) {
  // Dispatch first; an unconditional kString append ahead of this branch would
  // append the value a second time and target the wrong builder kind.
  if (builder_.kind == Kind::kNumberOrString) {
    status_ = AppendScalar<Kind::kNumberOrString>(builder_, string_view(data, size));
  } else {
    status_ = AppendScalar<Kind::kString>(builder_, string_view(data, size));
  }
  return status_.ok();
}

Expand Down
10 changes: 9 additions & 1 deletion cpp/src/arrow/json/parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,15 @@ class ResizableBuffer;
namespace json {

struct Kind {
/// Kinds of JSON value, stored in a single byte. Enumerators take their values
/// from declaration order (kNull == 0 through kNumberOrString == 6).
enum type : uint8_t {
  kNull,
  kBoolean,
  kNumber,
  kString,
  kArray,
  kObject,
  // Declared last so the values of the six kinds above are unchanged.
  kNumberOrString
};

static const std::string& Name(Kind::type);

Expand Down
19 changes: 19 additions & 0 deletions cpp/src/arrow/json/parser_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include "arrow/json/test_common.h"
#include "arrow/status.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/type_fwd.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/string_view.h"

Expand Down Expand Up @@ -136,6 +137,24 @@ TEST(BlockParserWithSchema, SkipFieldsOutsideSchema) {
"[\"thing\", null, \"\xe5\xbf\x8d\", null]"});
}

// With decimal fields in the explicit schema, unquoted JSON numbers must be
// parsed into utf8 columns holding the numbers' text.
TEST(BlockParserWithSchema, UnquotedDecimal) {
  const auto decimal_schema =
      schema({field("price", decimal(9, 2)), field("cost", decimal(9, 3))});
  auto parse_options = ParseOptions::Defaults();
  parse_options.explicit_schema = decimal_schema;

  AssertParseColumns(parse_options, unquoted_decimal_src(),
                     {field("price", utf8()), field("cost", utf8())},
                     {R"(["30.04", "1.23"])", R"(["30.001", "1.229"])"});
}

// With decimal fields in the explicit schema, a mix of quoted and unquoted
// values must be parsed into utf8 columns holding the values' text.
TEST(BlockParserWithSchema, MixedDecimal) {
  const auto decimal_schema =
      schema({field("price", decimal(9, 2)), field("cost", decimal(9, 3))});
  auto parse_options = ParseOptions::Defaults();
  parse_options.explicit_schema = decimal_schema;

  AssertParseColumns(parse_options, mixed_decimal_src(),
                     {field("price", utf8()), field("cost", utf8())},
                     {R"(["30.04", "1.23"])", R"(["30.001", "1.229"])"});
}

class BlockParserTypeError : public ::testing::TestWithParam<UnexpectedFieldBehavior> {
public:
ParseOptions Options(std::shared_ptr<Schema> explicit_schema) {
Expand Down
31 changes: 31 additions & 0 deletions cpp/src/arrow/json/reader_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include "arrow/json/test_common.h"
#include "arrow/table.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/type_fwd.h"

namespace arrow {
namespace json {
Expand Down Expand Up @@ -203,6 +204,36 @@ TEST_P(ReaderTest, MultipleChunks) {
AssertTablesEqual(*expected_table, *table_);
}

// Reading unquoted decimal values with an explicit decimal schema must give the
// same table as building it from the quoted equivalents.
TEST_P(ReaderTest, UnquotedDecimal) {
  const auto decimal_schema =
      ::arrow::schema({field("price", decimal(9, 2)), field("cost", decimal(9, 3))});
  parse_options_.explicit_schema = decimal_schema;

  // Kept in a named local so the text stays alive for the whole read.
  const auto json_input = unquoted_decimal_src();
  SetUpReader(json_input);
  ASSERT_OK_AND_ASSIGN(table_, reader_->Read());

  const auto expected = TableFromJSON(decimal_schema, {R"([
{ "price": "30.04", "cost":"30.001" },
{ "price": "1.23", "cost":"1.229" }
])"});
  AssertTablesEqual(*expected, *table_);
}

// Reading a mix of quoted and unquoted decimal values with an explicit decimal
// schema must give the same table as building it from all-quoted values.
TEST_P(ReaderTest, MixedDecimal) {
  const auto decimal_schema =
      ::arrow::schema({field("price", decimal(9, 2)), field("cost", decimal(9, 3))});
  parse_options_.explicit_schema = decimal_schema;

  // Kept in a named local so the text stays alive for the whole read.
  const auto json_input = mixed_decimal_src();
  SetUpReader(json_input);
  ASSERT_OK_AND_ASSIGN(table_, reader_->Read());

  const auto expected = TableFromJSON(decimal_schema, {R"([
{ "price": "30.04", "cost":"30.001" },
{ "price": "1.23", "cost":"1.229" }
])"});
  AssertTablesEqual(*expected, *table_);
}

TEST(ReaderTest, MultipleChunksParallel) {
int64_t count = 1 << 10;

Expand Down
14 changes: 14 additions & 0 deletions cpp/src/arrow/json/test_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -259,5 +259,19 @@ inline static std::string null_src() {
)";
}

/// Test input: two JSON objects, one per line, whose "price" and "cost" values
/// are written as bare (unquoted) numbers.
inline static std::string unquoted_decimal_src() {
  static constexpr char kUnquotedRows[] = R"(
{ "price": 30.04, "cost":30.001 }
{ "price": 1.23, "cost":1.229 }
)";
  return std::string(kUnquotedRows);
}

/// Test input: two JSON objects, one per line; the first writes "price" and
/// "cost" as bare numbers, the second writes them as quoted strings.
inline static std::string mixed_decimal_src() {
  static constexpr char kMixedRows[] = R"(
{ "price": 30.04, "cost": 30.001 }
{ "price": "1.23", "cost": "1.229" }
)";
  return std::string(kMixedRows);
}

} // namespace json
} // namespace arrow

0 comments on commit 4c46cb1

Please sign in to comment.