From e2dec2ce58bf77fbc9c083e55d8ed49cf7712e3c Mon Sep 17 00:00:00 2001
From: HuJiayin
Date: Wed, 22 Jul 2015 09:19:03 +0800
Subject: [PATCH] support soundex rebase code

---
 .../catalyst/analysis/FunctionRegistry.scala  |  1 +
 .../expressions/stringOperations.scala        | 15 +++++
 .../org/apache/spark/sql/functions.scala      | 16 ++++++
 .../spark/sql/StringFunctionsSuite.scala      |  9 +++
 .../apache/spark/unsafe/types/UTF8String.java | 55 +++++++++++++++++++
 5 files changed, 96 insertions(+)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index e3d8d2adf2135..a7de61ea31937 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -164,6 +164,7 @@ object FunctionRegistry {
     expression[Levenshtein]("levenshtein"),
     expression[RegExpExtract]("regexp_extract"),
     expression[RegExpReplace]("regexp_replace"),
+    expression[SoundEx]("soundex"),
     expression[StringInstr]("instr"),
     expression[StringLocate]("locate"),
     expression[StringLPad]("lpad"),
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 1f18a6e9ff8a5..b2b05558a3d0e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -784,6 +784,21 @@ case class Levenshtein(left: Expression, right: Expression) extends BinaryExpres
   }
 }
 
+/**
+ * A function that returns the soundex code of the given string expression.
+ */
+case class SoundEx(child: Expression) extends UnaryExpression with ExpectsInputTypes
+  with CodegenFallback {
+
+  override def dataType: DataType = StringType
+
+  override def inputTypes: Seq[DataType] = Seq(StringType)
+
+  override def nullSafeEval(input: Any): Any = {
+    input.asInstanceOf[UTF8String].soundex()
+  }
+}
+
 /**
  * Returns the numeric value of the first character of str.
  */
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index e5ff8ae7e3179..c44571615d58a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -1891,6 +1891,22 @@ object functions {
     StringRepeat(str.expr, lit(n).expr)
   }
 
+  /**
+   * Return the soundex code for the specified expression.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def soundex(e: Column): Column = SoundEx(e.expr)
+
+  /**
+   * Return the soundex for the specified column.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def soundex(columnName: String): Column = soundex(Column(columnName))
+
   /**
    * Splits str around pattern (pattern is a regular expression).
    * NOTE: pattern is a string represent the regular expression.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
index 3702e73b4e74f..6494e60b6ff6f 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -144,6 +144,15 @@ class StringFunctionsSuite extends QueryTest {
       Row("aa123cc"))
   }
 
+  test("soundex function") {
+    val df = Seq(("MARY", "SU")).toDF("l", "r")
+    checkAnswer(
+      df.select(soundex("l"), soundex($"r")), Row("M600", "S000"))
+
+    checkAnswer(
+      df.selectExpr("SoundEx(l)", "SoundEx(r)"), Row("M600", "S000"))
+  }
+
   test("string instr function") {
     val df = Seq(("aaads", "aa", "zz")).toDF("a", "b", "c")
 
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index 946d355f1fc28..be57e1a94e9d4 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -50,6 +50,12 @@ public final class UTF8String implements Comparable, Serializable {
     5, 5, 5, 5,
     6, 6, 6, 6};
 
+  /**
+   * Soundex mapping table
+   */
+  private static final char[] US_ENGLISH_MAPPING = {'0', '1', '2', '3', '0', '1', '2', '0',
+    '0', '2', '2', '4', '5', '5', '0', '1', '2', '6', '2', '3', '0', '1', '0', '2', '0', '2'};
+
   public static final UTF8String EMPTY_UTF8 = UTF8String.fromString("");
 
   /**
@@ -628,4 +634,53 @@ public int hashCode() {
     }
     return result;
   }
+
+  /**
+   * Encodes this string into its American Soundex code: the upper-cased first letter followed
+   * by three digits, padded with '0' (for example "MARY" becomes "M600"). Soundex is an
+   * encoding used to relate similar names, but can also be used as a general purpose scheme to
+   * find words with similar phonemes.
+   * https://en.wikipedia.org/wiki/Soundex
+   *
+   * Only ASCII letters are encoded. An empty string yields an empty string, a string whose
+   * first byte is not an ASCII letter is returned unchanged, and any later byte that is not an
+   * ASCII letter is ignored.
+   *
+   * @return the Soundex code, or this string itself if it does not start with an ASCII letter
+   */
+  public UTF8String soundex() {
+    if (numBytes == 0) {
+      return EMPTY_UTF8;
+    }
+    byte b = getByte(0);
+    if ('a' <= b && b <= 'z') {
+      // Soundex codes always start with an upper-case letter.
+      b -= 32;
+    } else if (b < 'A' || b > 'Z') {
+      // Not an ASCII letter; this also rejects the lead byte of a multi-byte character.
+      return this;
+    }
+    byte[] code = {b, '0', '0', '0'};
+    int filled = 1;
+    // Digit of the last letter seen; '0' is a vowel, which lets the same digit repeat.
+    char lastDigit = US_ENGLISH_MAPPING[b - 'A'];
+    for (int i = 1; i < numBytes && filled < code.length; i++) {
+      b = getByte(i);
+      if ('a' <= b && b <= 'z') {
+        b -= 32;
+      } else if (b < 'A' || b > 'Z') {
+        continue;
+      }
+      if (b == 'H' || b == 'W') {
+        // H and W are transparent: equal digits on both sides of them are coded only once.
+        continue;
+      }
+      char digit = US_ENGLISH_MAPPING[b - 'A'];
+      if (digit != '0' && digit != lastDigit) {
+        code[filled++] = (byte) digit;
+      }
+      lastDigit = digit;
+    }
+    return UTF8String.fromBytes(code);
+  }
 }