Skip to content

Commit

Permalink
Introduce GetJsonObjectOptions in getJSONObject Java API (#14956)
Browse files Browse the repository at this point in the history
Resolves [10219](NVIDIA/spark-rapids#10219)

This PR introduces a new class named `GetJsonObjectOptions` that holds the configuration controlling the behavior of the underlying `cudf::get_json_object` function. It adds this class to the `getJSONObject` Java API as an additional argument, while keeping the previous single-argument API for backwards compatibility. It also includes a test case, `testGetJSONObjectWithSingleQuotes`, validating the behavior of `getJSONObject` when single quotes are enabled.

Authors:
  - Suraj Aralihalli (https://github.com/SurajAralihalli)

Approvers:
  - Robert (Bobby) Evans (https://github.com/revans2)
  - MithunR (https://github.com/mythrocks)
  - Karthikeyan (https://github.com/karthikeyann)

URL: #14956
  • Loading branch information
SurajAralihalli committed Feb 12, 2024
1 parent daa63d2 commit 49c2995
Show file tree
Hide file tree
Showing 4 changed files with 119 additions and 6 deletions.
22 changes: 20 additions & 2 deletions java/src/main/java/ai/rapids/cudf/ColumnView.java
Original file line number Diff line number Diff line change
Expand Up @@ -2978,6 +2978,24 @@ public final ColumnVector repeatStrings(ColumnView repeatTimes) {
repeatTimes.getNativeView()));
}

/**
 * Apply a JSONPath string to all rows in an input strings column.
 *
 * Applies a JSONPath string to an incoming strings column where each row in the column
 * is a valid json string. The output is returned by row as a strings column.
 *
 * For reference, https://tools.ietf.org/id/draft-goessner-dispatch-jsonpath-00.html
 * Note: Only implements the operators: $ . [] *
 *
 * @param path The JSONPath string to be applied to each row
 * @param options The GetJsonObjectOptions to control get_json_object behaviour
 * @return new strings ColumnVector containing the retrieved json object strings
 */
public final ColumnVector getJSONObject(Scalar path, GetJsonObjectOptions options) {
  assert(type.equals(DType.STRING)) : "column type must be a String";
  // The options object is flattened into individual booleans because the native
  // method takes each flag as a separate argument.
  return new ColumnVector(getJSONObject(
      getNativeView(),
      path.getScalarHandle(),
      options.isAllowSingleQuotes(),
      options.isStripQuotesFromSingleStrings(),
      options.isMissingFieldsAsNulls()));
}

/**
* Apply a JSONPath string to all rows in an input strings column.
*
Expand All @@ -2992,7 +3010,7 @@ public final ColumnVector repeatStrings(ColumnView repeatTimes) {
*/
public final ColumnVector getJSONObject(Scalar path) {
  assert(type.equals(DType.STRING)) : "column type must be a String";
  // Delegate to the options-aware overload with the default options so that both
  // entry points share a single call into the native method. The former direct
  // two-argument native call is dropped: it preceded this return, making this
  // statement unreachable, and it targeted a native signature that was replaced.
  return getJSONObject(path, GetJsonObjectOptions.DEFAULT);
}

/**
Expand Down Expand Up @@ -4194,7 +4212,7 @@ private static native long repeatStringsWithColumnRepeatTimes(long stringsHandle
long repeatTimesHandle);


private static native long getJSONObject(long viewHandle, long scalarHandle) throws CudfException;
/**
 * Native method backing the public {@code getJSONObject} overloads.
 *
 * @param viewHandle native handle of the input column view
 * @param scalarHandle native handle of the scalar holding the JSONPath string
 * @param allowSingleQuotes value of {@code GetJsonObjectOptions.isAllowSingleQuotes()}
 * @param stripQuotesFromSingleStrings value of
 *        {@code GetJsonObjectOptions.isStripQuotesFromSingleStrings()}
 * @param missingFieldsAsNulls value of {@code GetJsonObjectOptions.isMissingFieldsAsNulls()}
 * @return native handle that the caller wraps in a new {@code ColumnVector}
 * @throws CudfException as declared by this signature
 */
private static native long getJSONObject(long viewHandle, long scalarHandle, boolean allowSingleQuotes, boolean stripQuotesFromSingleStrings, boolean missingFieldsAsNulls) throws CudfException;

/**
* Native method to parse and convert a timestamp column vector to string column vector. A unix
Expand Down
75 changes: 75 additions & 0 deletions java/src/main/java/ai/rapids/cudf/GetJsonObjectOptions.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
/*
*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/

package ai.rapids.cudf;

/**
 * Immutable set of flags passed to {@code getJSONObject} to control the behavior of the
 * underlying {@code cudf::get_json_object} function.
 *
 * Instances are created through {@link #builder()}; {@link #DEFAULT} holds the options
 * produced by a builder on which no setter has been called.
 */
public final class GetJsonObjectOptions {

  /**
   * Options built from an untouched {@link Builder}: single quotes not allowed, quotes
   * stripped from single strings, missing fields not reported as nulls.
   *
   * Declared {@code final} so that the shared default cannot be reassigned by callers.
   */
  public static final GetJsonObjectOptions DEFAULT = new GetJsonObjectOptions.Builder().build();

  private final boolean allowSingleQuotes;
  private final boolean stripQuotesFromSingleStrings;
  private final boolean missingFieldsAsNulls;

  // Only reachable through Builder.build(); copies the builder's current flag values.
  private GetJsonObjectOptions(Builder builder) {
    this.allowSingleQuotes = builder.allowSingleQuotes;
    this.stripQuotesFromSingleStrings = builder.stripQuotesFromSingleStrings;
    this.missingFieldsAsNulls = builder.missingFieldsAsNulls;
  }

  /**
   * @return the value of the allow-single-quotes flag
   */
  public boolean isAllowSingleQuotes() {
    return allowSingleQuotes;
  }

  /**
   * @return the value of the strip-quotes-from-single-strings flag
   */
  public boolean isStripQuotesFromSingleStrings() {
    return stripQuotesFromSingleStrings;
  }

  /**
   * @return the value of the missing-fields-as-nulls flag
   */
  public boolean isMissingFieldsAsNulls() {
    return missingFieldsAsNulls;
  }

  /**
   * @return a new {@link Builder} initialized with the default flag values
   */
  public static Builder builder() {
    return new Builder();
  }

  /**
   * Fluent builder for {@link GetJsonObjectOptions}. Each setter returns this builder so
   * that calls can be chained.
   */
  public static final class Builder {
    private boolean allowSingleQuotes = false;
    private boolean stripQuotesFromSingleStrings = true;
    private boolean missingFieldsAsNulls = false;

    /**
     * @param allowSingleQuotes new value of the allow-single-quotes flag (default false)
     * @return this builder
     */
    public Builder allowSingleQuotes(boolean allowSingleQuotes) {
      this.allowSingleQuotes = allowSingleQuotes;
      return this;
    }

    /**
     * @param stripQuotesFromSingleStrings new value of the strip-quotes-from-single-strings
     *        flag (default true)
     * @return this builder
     */
    public Builder stripQuotesFromSingleStrings(boolean stripQuotesFromSingleStrings) {
      this.stripQuotesFromSingleStrings = stripQuotesFromSingleStrings;
      return this;
    }

    /**
     * @param missingFieldsAsNulls new value of the missing-fields-as-nulls flag
     *        (default false)
     * @return this builder
     */
    public Builder missingFieldsAsNulls(boolean missingFieldsAsNulls) {
      this.missingFieldsAsNulls = missingFieldsAsNulls;
      return this;
    }

    /**
     * @return a new immutable {@link GetJsonObjectOptions} holding the current flag values
     */
    public GetJsonObjectOptions build() {
      return new GetJsonObjectOptions(this);
    }
  }
}
12 changes: 8 additions & 4 deletions java/src/main/native/src/ColumnViewJni.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2436,9 +2436,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_copyColumnViewToCV(JNIEnv
CATCH_STD(env, 0)
}

JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getJSONObject(JNIEnv *env, jclass,
jlong j_view_handle,
jlong j_scalar_handle) {
JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getJSONObject(
JNIEnv *env, jclass, jlong j_view_handle, jlong j_scalar_handle, jboolean allow_single_quotes,
jboolean strip_quotes_from_single_strings, jboolean missing_fields_as_nulls) {

JNI_NULL_CHECK(env, j_view_handle, "view cannot be null", 0);
JNI_NULL_CHECK(env, j_scalar_handle, "path cannot be null", 0);
Expand All @@ -2448,7 +2448,11 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getJSONObject(JNIEnv *env
cudf::column_view *n_column_view = reinterpret_cast<cudf::column_view *>(j_view_handle);
cudf::strings_column_view n_strings_col_view(*n_column_view);
cudf::string_scalar *n_scalar_path = reinterpret_cast<cudf::string_scalar *>(j_scalar_handle);
return release_as_jlong(cudf::get_json_object(n_strings_col_view, *n_scalar_path));
auto options = cudf::get_json_object_options{};
options.set_allow_single_quotes(allow_single_quotes);
options.set_strip_quotes_from_single_strings(strip_quotes_from_single_strings);
options.set_missing_fields_as_nulls(missing_fields_as_nulls);
return release_as_jlong(cudf::get_json_object(n_strings_col_view, *n_scalar_path, options));
}
CATCH_STD(env, 0)
}
Expand Down
16 changes: 16 additions & 0 deletions java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -6379,6 +6379,7 @@ void testGetJSONObject() {
" }\n" +
"}";


try (ColumnVector json = ColumnVector.fromStrings(jsonString, jsonString);
ColumnVector expectedAuthors = ColumnVector.fromStrings("[\"Nigel Rees\",\"Evelyn " +
"Waugh\",\"Herman Melville\",\"J. R. R. Tolkien\"]", "[\"Nigel Rees\",\"Evelyn " +
Expand All @@ -6389,6 +6390,21 @@ void testGetJSONObject() {
}
}

@Test
void testGetJSONObjectWithSingleQuotes() {
  // Input document is {'a': 'A"'}: single-quoted key and value, with a double quote
  // embedded in the value.
  final String singleQuotedJson = "{'a': 'A\"'}";
  final GetJsonObjectOptions singleQuoteOptions =
      GetJsonObjectOptions.builder().allowSingleQuotes(true).build();

  // With single quotes allowed, the path $.a is expected to yield A" for every row.
  try (ColumnVector input = ColumnVector.fromStrings(singleQuotedJson, singleQuotedJson);
       ColumnVector expected = ColumnVector.fromStrings("A\"", "A\"");
       Scalar jsonPath = Scalar.fromString("$.a");
       ColumnVector actual = input.getJSONObject(jsonPath, singleQuoteOptions)) {
    assertColumnsAreEqual(expected, actual);
  }
}

@Test
void testMakeStructEmpty() {
final int numRows = 10;
Expand Down

0 comments on commit 49c2995

Please sign in to comment.