Skip to content

Commit

Permalink
support soundex rebase code
Browse files Browse the repository at this point in the history
  • Loading branch information
hujy committed Jul 22, 2015
1 parent d4c7a7a commit e2dec2c
Show file tree
Hide file tree
Showing 5 changed files with 136 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,7 @@ object FunctionRegistry {
expression[Levenshtein]("levenshtein"),
expression[RegExpExtract]("regexp_extract"),
expression[RegExpReplace]("regexp_replace"),
expression[SoundEx]("soundex"),
expression[StringInstr]("instr"),
expression[StringLocate]("locate"),
expression[StringLPad]("lpad"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -784,6 +784,21 @@ case class Levenshtein(left: Expression, right: Expression) extends BinaryExpres
}
}

/**
 * Expression that evaluates to the soundex code of its string-typed child expression.
 * Evaluation delegates to `UTF8String.soundex()`.
 */
case class SoundEx(child: Expression)
  extends UnaryExpression with ExpectsInputTypes with CodegenFallback {

  // The single input is expected to be a string.
  override def inputTypes: Seq[DataType] = Seq(StringType)

  // The soundex code is itself produced as a string.
  override def dataType: DataType = StringType

  override def nullSafeEval(input: Any): Any = {
    val str = input.asInstanceOf[UTF8String]
    str.soundex()
  }
}

/**
* Returns the numeric value of the first character of str.
*/
Expand Down
16 changes: 16 additions & 0 deletions sql/core/src/main/scala/org/apache/spark/sql/functions.scala
Original file line number Diff line number Diff line change
Expand Up @@ -1891,6 +1891,22 @@ object functions {
StringRepeat(str.expr, lit(n).expr)
}

/**
 * Returns the soundex code for the specified string expression.
 *
 * @group string_funcs
 * @since 1.5.0
 */
def soundex(e: Column): Column = SoundEx(e.expr)

/**
 * Returns the soundex code for the column with the given name.
 *
 * @group string_funcs
 * @since 1.5.0
 */
def soundex(columnName: String): Column = soundex(Column(columnName))

/**
* Splits str around pattern (pattern is a regular expression).
* NOTE: pattern is a string represent the regular expression.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,15 @@ class StringFunctionsSuite extends QueryTest {
Row("aa123cc"))
}

test("soundex function") {
  val df = Seq(("MARY", "SU")).toDF("l", "r")
  // Both the DataFrame API and the SQL expression must produce the same codes.
  val expected = Row("M600", "S000")

  // DataFrame API: column-name overload and Column overload.
  checkAnswer(df.select(soundex("l"), soundex($"r")), expected)

  // SQL expression resolved by name through the function registry.
  checkAnswer(df.selectExpr("SoundEx(l)", "SoundEx(r)"), expected)
}

test("string instr function") {
val df = Seq(("aaads", "aa", "zz")).toDF("a", "b", "c")

Expand Down
95 changes: 95 additions & 0 deletions unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,12 @@ public final class UTF8String implements Comparable<UTF8String>, Serializable {
5, 5, 5, 5,
6, 6, 6, 6};

/**
 * Soundex mapping table for the 26 letters of the English alphabet.
 * Entry {@code i} holds the Soundex digit (as a char) for the letter {@code 'A' + i}.
 * The digit '0' marks letters that contribute no digit to the code
 * (A, E, H, I, O, U, W, Y).
 */
private static final char[] US_ENGLISH_MAPPING = {'0', '1', '2', '3', '0', '1', '2', '0',
'0', '2', '2', '4', '5', '5', '0', '1', '2', '6', '2', '3', '0', '1', '0', '2', '0', '2'};

public static final UTF8String EMPTY_UTF8 = UTF8String.fromString("");

/**
Expand Down Expand Up @@ -628,4 +634,93 @@ public int hashCode() {
}
return result;
}

/**
 * Encodes this string into a Soundex value. Soundex is an encoding used to relate similar names,
 * but can also be used as a general purpose scheme to find words with similar phonemes.
 * https://en.wikipedia.org/wiki/Soundex
 *
 * The result is always four bytes: the first letter in upper case followed by three digits,
 * padded with '0' when fewer than three digits are produced (e.g. "MARY" -> "M600").
 *
 * Special cases:
 * - an empty string yields an empty string;
 * - if the first byte, or any byte examined before three digits have been collected, is not
 *   an ASCII letter (this includes every byte of a multi-byte UTF-8 character), the string
 *   itself is returned unchanged.
 *
 * @return the Soundex code, or this string itself when it cannot be encoded
 */
public UTF8String soundex() {
  if (numBytes == 0) {
    return UTF8String.fromBytes(new byte[0]);
  }
  final byte first = getByte(0);
  // Bytes of multi-byte UTF-8 characters have the high bit set, i.e. they are negative as
  // Java bytes, so the letter range check alone rejects them.
  if (!isAsciiLetter(first)) {
    return this;
  }

  final byte[] data = {'0', '0', '0', '0'};
  // Soundex codes conventionally start with the upper-case form of the first letter.
  data[0] = (byte) toUpperAscii(first);

  // j is the next free slot in data; stop as soon as the three digits are filled.
  int j = 1;
  for (int i = 1; i < numBytes && j <= 3; i++) {
    final byte cur = getByte(i);
    if (!isAsciiLetter(cur)) {
      return this;
    }
    final byte prev = getByte(i - 1);

    // The same letter repeated (ignoring case) is encoded only once.
    if (toUpperAscii(cur) == toUpperAscii(prev)) {
      continue;
    }

    final char ch = soundexDigit(cur);

    // The second letter is dropped when it has the same digit as the first letter.
    // Later letters are dropped when they follow an H or W that itself follows a letter
    // with the same digit (H and W do not separate equal codes).
    final boolean sameAsAnchor;
    if (i == 1) {
      sameAsAnchor = ch == soundexDigit(first);
    } else {
      sameAsAnchor = isHOrW(prev) && ch == soundexDigit(getByte(i - 2));
    }
    if (sameAsAnchor) {
      continue;
    }

    // Directly adjacent letters with the same digit are encoded once. A zero-coded letter
    // in between (prev maps to '0') resets this, so the digit may repeat.
    if (soundexDigit(prev) != '0' && ch == data[j - 1]) {
      continue;
    }

    // Letters mapped to '0' contribute nothing to the code.
    if (ch != '0') {
      data[j] = (byte) ch;
      j++;
    }
  }
  return UTF8String.fromBytes(data);
}

/** Returns true if the byte is an ASCII letter, 'A'-'Z' or 'a'-'z'. */
private static boolean isAsciiLetter(byte b) {
  return (b >= 'A' && b <= 'Z') || (b >= 'a' && b <= 'z');
}

/** Returns the upper-case form of an ASCII letter; other values are returned as-is. */
private static int toUpperAscii(byte b) {
  return (b >= 'a' && b <= 'z') ? b - 32 : b;
}

/** Returns the Soundex digit for an ASCII letter; the caller must pass a letter. */
private static char soundexDigit(byte letter) {
  return US_ENGLISH_MAPPING[toUpperAscii(letter) - 'A'];
}

/** Returns true if the byte is 'H' or 'W' in either case. */
private static boolean isHOrW(byte b) {
  final int upper = toUpperAscii(b);
  return upper == 'H' || upper == 'W';
}
}

0 comments on commit e2dec2c

Please sign in to comment.