Skip to content

Commit

Permalink
Add regex_program strings extract java APIs and tests (#12699)
Browse files Browse the repository at this point in the history
This PR adds `regex_program`-based Java APIs and unit tests for [extract, extract_all_record](https://docs.rapids.ai/api/libcudf/nightly/strings_2extract_8hpp.html).
Part of work for NVIDIA/spark-rapids#7295.

Authors:
  - Cindy Jiang (https://github.com/cindyyuanjiang)

Approvers:
  - Jason Lowe (https://github.com/jlowe)

URL: #12699
  • Loading branch information
cindyyuanjiang authored Feb 6, 2023
1 parent 8286001 commit c7db81a
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 19 deletions.
31 changes: 27 additions & 4 deletions java/src/main/java/ai/rapids/cudf/ColumnView.java
Original file line number Diff line number Diff line change
Expand Up @@ -3252,10 +3252,28 @@ public final ColumnVector containsRe(RegexProgram regexProg) {
* @throws CudfException if any error happens including if the RE does
* not contain any capture groups.
*/
@Deprecated
public final Table extractRe(String pattern) throws CudfException {
// Deprecated String-pattern overload: wraps the pattern in a RegexProgram and
// delegates to the extractRe(RegexProgram) overload.
return extractRe(new RegexProgram(pattern));
}

/**
* For each captured group specified in the given regex program
* return a column in the table. Null entries are added if the string
* does not match. Any null inputs also result in null output entries.
*
* The column type and the non-null regex program are validated with
* {@code assert}, so those checks only run when assertions are enabled.
*
* For supported regex patterns refer to:
* @link https://docs.rapids.ai/api/libcudf/nightly/md_regex.html
* @param regexProg the regex program to use
* @return the table of extracted matches
* @throws CudfException if any error happens including if the regex
* program does not contain any capture groups.
*/
public final Table extractRe(RegexProgram regexProg) throws CudfException {
assert type.equals(DType.STRING) : "column type must be a String";
// NOTE(review): the next two lines reference `pattern`, which is not a parameter of
// this overload. They appear to be the pre-change lines of this diff hunk, rendered
// without +/- markers — confirm against the actual commit.
assert pattern != null : "pattern may not be null";
return new Table(extractRe(this.getNativeView(), pattern));
// Forward the program's pattern, combined flags and capture-group native id to the
// native extractRe call, and wrap the returned handles in a Table.
assert regexProg != null : "regex program may not be null";
return new Table(extractRe(this.getNativeView(), regexProg.pattern(),
regexProg.combinedFlags(), regexProg.capture().nativeId));
}

/**
Expand Down Expand Up @@ -4100,9 +4118,14 @@ private static native long stringReplaceWithBackrefs(long columnView, String pat
private static native long stringContains(long cudfViewHandle, long compString) throws CudfException;

/**
* Native method for extracting results from an regular expressions. Returns a table handle.
* Native method for extracting results from a regex program pattern. Returns a table handle.
*
* @param cudfViewHandle Native handle of the cudf::column_view being operated on.
* @param pattern String regex pattern.
* @param flags Regex flags setting.
* @param capture Capture groups setting.
* @return array of native handles; the Java caller passes it directly to the Table
*         constructor (presumably one handle per extracted column — TODO confirm).
*/
// NOTE(review): two declarations are shown because this diff hunk is rendered without
// +/- markers; the two-argument form appears to be the pre-change line — confirm.
private static native long[] extractRe(long cudfViewHandle, String pattern) throws CudfException;
private static native long[] extractRe(long cudfViewHandle, String pattern, int flags, int capture) throws CudfException;

/**
* Native method for extracting all results corresponding to group idx from a regex program pattern.
Expand Down
20 changes: 12 additions & 8 deletions java/src/main/native/src/ColumnViewJni.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1674,18 +1674,22 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringStrip(JNIEnv *env,

// JNI entry point backing ColumnView.extractRe. It builds a cudf regex_program from the
// pattern string, the regex flags and the capture-group setting, runs
// cudf::strings::extract on the strings column, and returns the result through
// convert_table_for_return.
// NOTE(review): this diff hunk is rendered without +/- markers, so pre-change and
// post-change lines appear interleaved (e.g. both `patternObj` and `pattern_obj`, and
// two definitions of `strings_column` and `pattern`) — confirm against the actual commit.
JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_extractRe(JNIEnv *env, jclass,
jlong j_view_handle,
jstring patternObj) {
jstring pattern_obj,
jint regex_flags,
jint capture_groups) {
// Reject a zero column handle or a null pattern string before doing any work.
JNI_NULL_CHECK(env, j_view_handle, "column is null", nullptr);
JNI_NULL_CHECK(env, patternObj, "pattern is null", nullptr);
JNI_NULL_CHECK(env, pattern_obj, "pattern is null", nullptr);

try {
cudf::jni::auto_set_device(env);
// NOTE(review): presumably the pre-change body, which passed the pattern string
// straight to cudf::strings::extract — confirm.
cudf::strings_column_view const strings_column{
*reinterpret_cast<cudf::column_view *>(j_view_handle)};
cudf::jni::native_jstring pattern(env, patternObj);

return cudf::jni::convert_table_for_return(
env, cudf::strings::extract(strings_column, pattern.get()));
// Reinterpret the jlong handle as a column_view and view it as a strings column.
auto const column_view = reinterpret_cast<cudf::column_view const *>(j_view_handle);
auto const strings_column = cudf::strings_column_view{*column_view};
auto const pattern = cudf::jni::native_jstring(env, pattern_obj);
// Flags and capture setting arrive as plain ints; cast them back to the cudf enums.
auto const flags = static_cast<cudf::strings::regex_flags>(regex_flags);
auto const groups = static_cast<cudf::strings::capture_groups>(capture_groups);
auto const regex_prog = cudf::strings::regex_program::create(pattern.get(), flags, groups);
return cudf::jni::convert_table_for_return(env,
cudf::strings::extract(strings_column, *regex_prog));
}
CATCH_STD(env, 0);
}
Expand Down
18 changes: 11 additions & 7 deletions java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -4040,14 +4040,18 @@ void testStringFindOperations() {

// Checks extractRe with two capture groups, ([ab]) and (\d): rows "a1" and "b2" yield
// the captured letter and digit, while the non-matching "c3" and the null input yield
// nulls in both columns.
@Test
void testExtractRe() {
// NOTE(review): the following try-with-resources (through the first
// assertTablesAreEqual) appears to be the pre-change version of this test, shown
// because the diff hunk is rendered without +/- markers — confirm.
try (ColumnVector input = ColumnVector.fromStrings("a1", "b2", "c3", null);
Table expected = new Table.TestBuilder()
.column("a", "b", null, null)
.column("1", "2", null, null)
.build();
Table found = input.extractRe("([ab])(\\d)")) {
assertTablesAreEqual(expected, found);
// Input and expected table are shared; each overload gets its own nested
// try-with-resources holding its result table.
try (ColumnVector input = ColumnVector.fromStrings("a1", "b2", "c3", null);
Table expected = new Table.TestBuilder()
.column("a", "b", null, null)
.column("1", "2", null, null)
.build()) {
// Deprecated String-pattern overload.
try (Table found = input.extractRe("([ab])(\\d)")) {
assertTablesAreEqual(expected, found);
}
// RegexProgram overload; must produce the same table.
try (Table found = input.extractRe(new RegexProgram("([ab])(\\d)"))) {
assertTablesAreEqual(expected, found);
}
}
}

@Test
Expand Down

0 comments on commit c7db81a

Please sign in to comment.