Skip to content

Commit

Permalink
fix soundex
Browse files Browse the repository at this point in the history
  • Loading branch information
Davies Liu committed Jul 31, 2015
1 parent 2538908 commit a4bd6d8
Show file tree
Hide file tree
Showing 6 changed files with 105 additions and 94 deletions.
17 changes: 17 additions & 0 deletions python/pyspark/sql/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@
'year', 'quarter', 'month', 'hour', 'minute', 'second',
'dayofmonth', 'dayofyear', 'weekofyear']

__all__ += ['soundex']


def _create_function(name, doc=""):
""" Create a function for aggregator by name"""
Expand Down Expand Up @@ -830,6 +832,7 @@ def weekofyear(col):
def size(col):
"""
Collection function: returns the length of the array or map stored in the column.
:param col: name of column or expression
>>> df = sqlContext.createDataFrame([([1, 2, 3],),([1],),([],)], ['data'])
Expand All @@ -840,6 +843,20 @@ def size(col):
return Column(sc._jvm.functions.size(_to_java_column(col)))


@since(1.5)
@ignore_unicode_prefix
def soundex(col):
    """
    Returns the SoundEx encoding for a string.

    :param col: name of column or expression

    >>> df = sqlContext.createDataFrame([("Peters",),("Uhrbach",)], ['name'])
    >>> df.select(soundex(df.name).alias("soundex")).collect()
    [Row(soundex=u'P362'), Row(soundex=u'U612')]
    """
    sc = SparkContext._active_spark_context
    # Call the JVM-side `soundex` function (not `size`), so the result is the
    # SoundEx code shown in the doctest above rather than a collection length.
    return Column(sc._jvm.functions.soundex(_to_java_column(col)))


class UserDefinedFunction(object):
"""
User defined function in Python
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,6 @@ object FunctionRegistry {
expression[Levenshtein]("levenshtein"),
expression[RegExpExtract]("regexp_extract"),
expression[RegExpReplace]("regexp_replace"),
expression[SoundEx]("soundex"),
expression[StringInstr]("instr"),
expression[StringLocate]("locate"),
expression[StringLPad]("lpad"),
Expand All @@ -175,6 +174,7 @@ object FunctionRegistry {
expression[StringRepeat]("repeat"),
expression[StringReverse]("reverse"),
expression[StringTrimRight]("rtrim"),
expression[SoundEx]("soundex"),
expression[StringSpace]("space"),
expression[StringSplit]("split"),
expression[Substring]("substr"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ import java.util.Locale
import java.util.regex.{MatchResult, Pattern}

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.analysis.UnresolvedException
import org.apache.spark.sql.catalyst.expressions.codegen._
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String
Expand Down Expand Up @@ -784,18 +783,16 @@ case class Levenshtein(left: Expression, right: Expression) extends BinaryExpres
/**
 * A function that returns the soundex code of the given string expression.
 *
 * Both the interpreted path (`nullSafeEval`) and the generated-code path (`genCode`)
 * delegate to `UTF8String.soundex()`, so the two always agree on the result.
 */
case class SoundEx(child: Expression) extends UnaryExpression with ExpectsInputTypes {

  override def dataType: DataType = StringType

  override def inputTypes: Seq[DataType] = Seq(StringType)

  // NOTE(review): presumably only invoked with a non-null input, as the name
  // `nullSafeEval` suggests; confirm against UnaryExpression.
  override def nullSafeEval(input: Any): Any = input.asInstanceOf[UTF8String].soundex()

  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
    // `c` is the generated-code reference to the child's value.
    defineCodeGen(ctx, ev, c => s"$c.soundex()")
  }
}

Expand Down
8 changes: 0 additions & 8 deletions sql/core/src/main/scala/org/apache/spark/sql/functions.scala
Original file line number Diff line number Diff line change
Expand Up @@ -1887,14 +1887,6 @@ object functions {
*/
def soundex(e: Column): Column = SoundEx(e.expr)

/**
 * Returns the soundex code for the column with the given name, by delegating to the
 * `soundex` overload that takes a [[Column]].
 *
 * @param columnName name of the column to encode
 * @group string_funcs
 * @since 1.5.0
 */
def soundex(columnName: String): Column = soundex(Column(columnName))

/**
* Splits str around pattern (pattern is a regular expression).
* NOTE: pattern is a string represent the regular expression.
Expand Down
113 changes: 35 additions & 78 deletions unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,6 @@ public final class UTF8String implements Comparable<UTF8String>, Serializable {
5, 5, 5, 5,
6, 6, 6, 6};

/**
* Soundex mapping table
*/
private static final char[] US_ENGLISH_MAPPING = {'0', '1', '2', '3', '0', '1', '2', '0',
'0', '2', '2', '4', '5', '5', '0', '1', '2', '6', '2', '3', '0', '1', '0', '2', '0', '2'};

public static final UTF8String EMPTY_UTF8 = UTF8String.fromString("");

/**
Expand Down Expand Up @@ -635,93 +629,56 @@ public int hashCode() {
return result;
}

/**
* Soundex mapping table
*/
private static final byte[] US_ENGLISH_MAPPING = {'0', '1', '2', '3', '0', '1', '2', '7',
'0', '2', '2', '4', '5', '5', '0', '1', '2', '6', '2', '3', '0', '1', '7', '2', '0', '2'};

/**
* Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names,
* but can also be used as a general purpose scheme to find word with similar phonemes.
* https://en.wikipedia.org/wiki/Soundex
*/
public UTF8String soundex() {
if(this == null) {
return null;
}
if (numBytes == 0) {
return UTF8String.fromBytes(new byte[0]);
return EMPTY_UTF8;
}
byte data[] = {'0', '0', '0', '0'};
char ch;
int idx = 0;
int idxp0 = 0;
int idxp = 0;
int idxpp = 0;
int offset = numBytesForFirstByte(getByte(0));
if (offset > 1 || getByte(0) < 97 && getByte(0) > 90
|| getByte(0) < 65 || getByte(0) > 122) {

byte b = getByte(0);
if ('a' <= b && b <= 'z') {
b -= 32;
} else if (b < 'A' || 'Z' < b) {
// first character must be a letter
return this;
}
int i = 1;
int j = 1;
data[0] = getByte(0);

while (i < numBytes) {

if (j > 3) break;
if (numBytesForFirstByte(getByte(i)) > 1 || getByte(i) < 97 && getByte(i) > 90
|| getByte(i) < 65 || getByte(i) > 122) {
return this;
}
if (getByte(i) == getByte(i - 1)
|| (getByte(i) - 32 == getByte(i - 1)
|| (getByte(i) == getByte(i - 1) - 32))) {
i += 1;
byte sx[] = {'0', '0', '0', '0'};
sx[0] = b;
int sxi = 1;
int idx = b - 'A';
byte lastCode = US_ENGLISH_MAPPING[idx];

for (int i = 1; i < numBytes; i++) {
b = getByte(i);
if ('a' <= b && b <= 'z') {
b -= 32;
} else if (b < 'A' || 'Z' < b) {
// not a letter, skip it
lastCode = '0';
continue;
}
if (getByte(i) <= 122 && getByte(i) >= 97) {
idx = getByte(i) - 'A' - 32;
} else {
idx = getByte(i) - 'A';
}
if (i > 1) {
if (getByte(i - 2) <= 122 && getByte(i - 2) >= 97) {
idxpp = getByte(i - 2) - 'A' - 32;
} else {
idxpp = getByte(i - 2) - 'A';
}
}
if (getByte(i - 1) <= 122 && getByte(i - 1) >= 97) {
idxp = getByte(i - 1) - 'A' - 32;
idx = b - 'A';
byte code = US_ENGLISH_MAPPING[idx];
if (code == '7') {
// ignore it
} else {
idxp = getByte(i - 1) - 'A';
}
if (getByte(0) <= 122 && getByte(0) >= 97) {
idxp0 = getByte(0) - 'A' - 32;
} else {
idxp0 = getByte(0) - 'A';
}

if (idx >= 0 && idx <= US_ENGLISH_MAPPING.length) {
ch = US_ENGLISH_MAPPING[idx];
if (i > 1 && (getByte(i - 1) - 'H' == 0 || getByte(i - 1) - 'W' == 0
|| getByte(i - 1) - 'H' - 32 == 0 || getByte(i - 1) - 'W' - 32 == 0)
&& idxpp >= 0 && idxpp <= US_ENGLISH_MAPPING.length
&& (ch == US_ENGLISH_MAPPING[idxpp])
|| i == 1 && idxp0 >= 0 && idxp0 <= US_ENGLISH_MAPPING.length
&& (ch == US_ENGLISH_MAPPING[idxp0])) {
i += 1;
continue;
}
if (idxp >= 0 && idxp <= US_ENGLISH_MAPPING.length
&& US_ENGLISH_MAPPING[idxp] - '0' != 0
&& ch - data[j - 1] == 0) {
i += 1;
continue;
}
if (ch - '0' > 0) {
data[j] = (byte)ch;
j += 1;
if (code != '0' && code != lastCode) {
sx[sxi++] = code;
if (sxi > 3) break;
}
lastCode = code;
}
i += 1;
}
return UTF8String.fromBytes(data);
return UTF8String.fromBytes(sx);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -322,4 +322,52 @@ public void createBlankString() {
assertEquals(fromString(" "), blankString(3));
assertEquals(fromString(""), blankString(0));
}

@Test
public void soundex() {
  // assertEquals takes (expected, actual); keeping that order makes failure messages
  // report the Soundex code as "expected" and the computed value as "actual".

  // Names, grouped so that similar-sounding spellings share a code.
  assertEquals(fromString("R163"), fromString("Robert").soundex());
  assertEquals(fromString("R163"), fromString("Rupert").soundex());
  assertEquals(fromString("R150"), fromString("Rubin").soundex());
  assertEquals(fromString("A261"), fromString("Ashcraft").soundex());
  assertEquals(fromString("A261"), fromString("Ashcroft").soundex());
  assertEquals(fromString("B620"), fromString("Burroughs").soundex());
  assertEquals(fromString("B620"), fromString("Burrows").soundex());
  assertEquals(fromString("E251"), fromString("Ekzampul").soundex());
  assertEquals(fromString("E251"), fromString("Example").soundex());
  assertEquals(fromString("E460"), fromString("Ellery").soundex());
  assertEquals(fromString("E460"), fromString("Euler").soundex());
  assertEquals(fromString("G200"), fromString("Ghosh").soundex());
  assertEquals(fromString("G200"), fromString("Gauss").soundex());
  assertEquals(fromString("G362"), fromString("Gutierrez").soundex());
  assertEquals(fromString("H416"), fromString("Heilbronn").soundex());
  assertEquals(fromString("H416"), fromString("Hilbert").soundex());
  assertEquals(fromString("J250"), fromString("Jackson").soundex());
  assertEquals(fromString("K530"), fromString("Kant").soundex());
  assertEquals(fromString("K530"), fromString("Knuth").soundex());
  assertEquals(fromString("L000"), fromString("Lee").soundex());
  assertEquals(fromString("L222"), fromString("Lukasiewicz").soundex());
  assertEquals(fromString("L222"), fromString("Lissajous").soundex());
  assertEquals(fromString("L300"), fromString("Ladd").soundex());
  assertEquals(fromString("L300"), fromString("Lloyd").soundex());
  assertEquals(fromString("M220"), fromString("Moses").soundex());
  assertEquals(fromString("O600"), fromString("O'Hara").soundex());
  assertEquals(fromString("P236"), fromString("Pfister").soundex());
  assertEquals(fromString("S532"), fromString("Soundex").soundex());
  assertEquals(fromString("S532"), fromString("Sownteks").soundex());
  assertEquals(fromString("T522"), fromString("Tymczak").soundex());
  assertEquals(fromString("V532"), fromString("VanDeusen").soundex());
  assertEquals(fromString("W252"), fromString("Washington").soundex());
  assertEquals(fromString("W350"), fromString("Wheaton").soundex());

  // Short inputs are upper-cased and padded with '0' to four characters.
  assertEquals(fromString("A000"), fromString("a").soundex());
  assertEquals(fromString("A100"), fromString("ab").soundex());
  assertEquals(fromString("A120"), fromString("abc").soundex());
  assertEquals(fromString("A123"), fromString("abcd").soundex());

  // Empty input and input not starting with an ASCII letter come back unchanged.
  assertEquals(fromString(""), fromString("").soundex());
  assertEquals(fromString("123"), fromString("123").soundex());
  assertEquals(fromString("世界千世"), fromString("世界千世").soundex());
}
}

0 comments on commit a4bd6d8

Please sign in to comment.