From e2dec2ce58bf77fbc9c083e55d8ed49cf7712e3c Mon Sep 17 00:00:00 2001
From: HuJiayin <jiayin.hu@intel.com>
Date: Wed, 22 Jul 2015 09:19:03 +0800
Subject: [PATCH] support soundex rebase code

---
 .../catalyst/analysis/FunctionRegistry.scala  |  1 +
 .../expressions/stringOperations.scala        | 15 +++
 .../org/apache/spark/sql/functions.scala      | 16 ++++
 .../spark/sql/StringFunctionsSuite.scala      |  9 ++
 .../apache/spark/unsafe/types/UTF8String.java | 95 +++++++++++++++++++
 5 files changed, 136 insertions(+)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index e3d8d2adf2135..a7de61ea31937 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -164,6 +164,7 @@ object FunctionRegistry {
     expression[Levenshtein]("levenshtein"),
     expression[RegExpExtract]("regexp_extract"),
     expression[RegExpReplace]("regexp_replace"),
+    expression[SoundEx]("soundex"),
     expression[StringInstr]("instr"),
     expression[StringLocate]("locate"),
     expression[StringLPad]("lpad"),
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 1f18a6e9ff8a5..b2b05558a3d0e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -784,6 +784,21 @@ case class Levenshtein(left: Expression, right: Expression) extends BinaryExpres
   }
 }
 
+/**
+ * A function that returns the soundex code of the given string expression.
+ */
+case class SoundEx(child: Expression) extends UnaryExpression with ExpectsInputTypes
+  with CodegenFallback {
+
+  override def inputTypes: Seq[DataType] = Seq(StringType)
+
+  override def dataType: DataType = StringType
+
+  // The encoding itself is implemented by UTF8String.soundex().
+  override def nullSafeEval(input: Any): Any =
+    input.asInstanceOf[UTF8String].soundex()
+}
+
 /**
  * Returns the numeric value of the first character of str.
  */
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index e5ff8ae7e3179..c44571615d58a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -1891,6 +1891,22 @@ object functions {
     StringRepeat(str.expr, lit(n).expr)
   }
 
+  /**
+   * Returns the soundex code for the specified expression.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def soundex(e: Column): Column = SoundEx(e.expr)
+
+  /**
+   * Returns the soundex code for the column with the given name.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def soundex(columnName: String): Column = soundex(Column(columnName))
+
   /**
    * Splits str around pattern (pattern is a regular expression).
    * NOTE: pattern is a string represent the regular expression.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
index 3702e73b4e74f..6494e60b6ff6f 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -144,6 +144,15 @@ class StringFunctionsSuite extends QueryTest {
       Row("aa123cc"))
   }
 
+  test("soundex function") {
+    val expected = Row("M600", "S000")
+    val df = Seq(("MARY", "SU")).toDF("l", "r")
+    // DataFrame API, using both the column-name and the Column overloads.
+    checkAnswer(df.select(soundex("l"), soundex($"r")), expected)
+    // The same function called by name from expression strings.
+    checkAnswer(df.selectExpr("SoundEx(l)", "SoundEx(r)"), expected)
+  }
+
   test("string instr function") {
     val df = Seq(("aaads", "aa", "zz")).toDF("a", "b", "c")
 
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index 946d355f1fc28..be57e1a94e9d4 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -50,6 +50,12 @@ public final class UTF8String implements Comparable<UTF8String>, Serializable {
     5, 5, 5, 5,
     6, 6, 6, 6};
 
+  /**
+   * Soundex digit for each letter, indexed by (letter - 'A'); '0' marks letters with no digit.
+   */
+  private static final char[] US_ENGLISH_MAPPING = {'0', '1', '2', '3', '0', '1', '2', '0',
+          '0', '2', '2', '4', '5', '5', '0', '1', '2', '6', '2', '3', '0', '1', '0', '2', '0', '2'};
+
   public static final UTF8String EMPTY_UTF8 = UTF8String.fromString("");
 
   /**
@@ -628,4 +634,73 @@ public int hashCode() {
     }
     return result;
   }
+
+  /**
+   * Encodes this string into its Soundex code. Soundex is an encoding used to relate similar
+   * names, but can also be used as a general purpose scheme to find words with similar phonemes.
+   * https://en.wikipedia.org/wiki/Soundex
+   *
+   * The result is the first letter, upper-cased, followed by three digits padded with '0'.
+   * An empty string yields an empty string. If the first byte, or any byte examined before three
+   * digits have been produced, is not an ASCII letter, this string is returned unchanged.
+   */
+  public UTF8String soundex() {
+    if (numBytes == 0) {
+      return UTF8String.fromBytes(new byte[0]);
+    }
+    byte first = getByte(0);
+    if (!isAsciiLetter(first)) {
+      return this;
+    }
+    // Digit positions that are never filled keep their '0' padding.
+    byte[] data = {'0', '0', '0', '0'};
+    // Always emit an upper-case leading letter so that "mary" and "MARY" get the same code.
+    data[0] = (byte) (first >= 'a' ? first - 32 : first);
+    // Next digit slot to fill; the loop stops once slots 1..3 are filled.
+    int j = 1;
+    for (int i = 1; i < numBytes && j <= 3; i++) {
+      byte cur = getByte(i);
+      if (!isAsciiLetter(cur)) {
+        return this;
+      }
+      byte prev = getByte(i - 1);
+      // A letter repeated back to back (ignoring case) is encoded only once.
+      if (cur == prev || cur - 32 == prev || cur == prev - 32) {
+        continue;
+      }
+      char ch = soundexCode(cur);
+      // The second letter is dropped when it shares the first letter's code.
+      boolean sameAsFirst = i == 1 && ch == soundexCode(first);
+      // Letters with the same code separated by a single 'H' or 'W' are encoded once.
+      boolean sameAcrossHW = i > 1
+        && (prev == 'H' || prev == 'W' || prev == 'h' || prev == 'w')
+        && ch == soundexCode(getByte(i - 2));
+      // Right after a coded letter, a code equal to the last emitted digit is dropped.
+      boolean sameAsLastDigit = soundexCode(prev) != '0' && ch == data[j - 1];
+      if (sameAsFirst || sameAcrossHW || sameAsLastDigit) {
+        continue;
+      }
+      // Code '0' marks letters that contribute no digit.
+      if (ch > '0') {
+        data[j] = (byte) ch;
+        j += 1;
+      }
+    }
+    return UTF8String.fromBytes(data);
+  }
+
+  /**
+   * Returns true if the given byte is an ASCII letter ('A'-'Z' or 'a'-'z').
+   */
+  private static boolean isAsciiLetter(byte b) {
+    return (b >= 'A' && b <= 'Z') || (b >= 'a' && b <= 'z');
+  }
+
+  /**
+   * Returns the Soundex digit for an ASCII letter, ignoring case. Callers must first check the
+   * byte with isAsciiLetter, which keeps the table index within bounds.
+   */
+  private static char soundexCode(byte b) {
+    return US_ENGLISH_MAPPING[b >= 'a' ? b - 'a' : b - 'A'];
+  }
 }