Add a bunch of optimised fuzzy string matching algorithms.

A new QgsStringUtils class has been added containing some common fuzzy matching algorithms, including Levenshtein edit distance and Soundex. These can be used for finding "similar" strings in a table. Expression functions for these algorithms have also been added to a new "Fuzzy Matching" group.
qgis · Jun 27, 2015 · feb3bee · feb3bee
1 parent 79305b2
commit feb3bee
Show file tree

Hide file tree

Showing 13 changed files with 661 additions and 2 deletions.
diff --git a/python/core/core.sip b/python/core/core.sip
@@ -104,6 +104,7 @@
 %Include qgssnappingutils.sip
 %Include qgsspatialindex.sip
 %Include qgsstatisticalsummary.sip
+%Include qgsstringutils.sip
 %Include qgstolerance.sip
 %Include qgsvectordataprovider.sip
 %Include qgsvectorfilewriter.sip

diff --git a/python/core/qgsstringutils.sip b/python/core/qgsstringutils.sip
@@ -0,0 +1,51 @@
+/** \ingroup core
+ * \class QgsStringUtils
+ * \brief Utility functions for working with strings.
+ * \note Added in version 2.11
+ */
+
+class QgsStringUtils
+{
+%TypeHeaderCode
+#include <qgsstringutils.h>
+%End
+
+  public:
+    /** Returns the Levenshtein edit distance between two strings. This equates to the minimum
+     * number of character edits (insertions, deletions or substitutions) required to change
+     * one string to another.
+     * @param string1 first string
+     * @param string2 second string
+     * @param caseSensitive set to true for case sensitive comparison
+     * @returns edit distance. Lower distances indicate more similiar strings.
+     */
+    static int levenshteinDistance( const QString &string1, const QString &string2, bool caseSensitive = false );
+
+    /** Returns the longest common substring between two strings. This substring is the longest
+     * string that is a substring of the two input strings. Eg, the longest common substring
+     * of "ABABC" and "BABCA" is "ABC".
+     * @param string1 first string
+     * @param string2 second string
+     * @param caseSensitive set to true for case sensitive comparison
+     * @returns longest common substring
+     */
+    static QString longestCommonSubstring( const QString &string1, const QString &string2, bool caseSensitive = false );
+
+    /** Returns the Hamming distance between two strings. This equates to the number of characters at
+     * corresponding positions within the input strings where the characters are different. The input
+     * strings must be the same length.
+     * @param string1 first string
+     * @param string2 second string
+     * @param caseSensitive set to true for case sensitive comparison
+     * @returns Hamming distance between strings, or -1 if strings are different lengths.
+     */
+    static int hammingDistance( const QString &string1, const QString &string2, bool caseSensitive = false );
+
+    /** Returns the Soundex representation of a string. Soundex is a phonetic matching algorithm,
+     * so strings with similar sounds should be represented by the same Soundex code.
+     * @param string input string
+     * @returns 4 letter Soundex code
+     */
+    static QString soundex( const QString &string );
+
+};
diff --git a/resources/function_help/hamming_distance b/resources/function_help/hamming_distance
@@ -0,0 +1,17 @@
+<h3>hamming_distance function</h3>
+Returns the Hamming distance between two strings. This equates to the number of characters at
+corresponding positions within the input strings where the characters are different. The input
+strings must be the same length, and the comparison is case-sensitive.
+
+<h4>Syntax</h4>
+<pre>hamming_distance(string1,string2)</pre>
+
+<h4>Arguments</h4>
+string1 &rarr; a string<br />
+string2 &rarr; a string<br />
+
+<h4>Example</h4>
+<pre> hamming_distance('abc','xec') &rarr; 2</pre><br />
+<pre> hamming_distance('abc','ABc') &rarr; 2</pre><br />
+<pre> hamming_distance(upper('abc'),upper('ABC')) &rarr; 0</pre>
+
diff --git a/resources/function_help/levenshtein b/resources/function_help/levenshtein
@@ -0,0 +1,20 @@
+<h3>levenshtein function</h3>
+Returns the Levenshtein edit distance between two strings. This equates to the minimum
+number of character edits (insertions, deletions or substitutions) required to change
+one string to another.<br />
+The Levenshtein distance is a measure of the similarity between two strings. Smaller
+distances mean the strings are more similar, and larger distances indicate more
+different strings. The distance is case sensitive.
+
+<h4>Syntax</h4>
+<pre>levenshtein(string1,string2)</pre>
+
+<h4>Arguments</h4>
+string1 &rarr; a string<br />
+string2 &rarr; a string<br />
+
+<h4>Example</h4>
+<pre> levenshtein('kittens','mitten') &rarr; 2</pre><br />
+<pre> levenshtein('Kitten','kitten') &rarr; 1</pre><br />
+<pre> levenshtein(upper('Kitten'),upper('kitten')) &rarr; 0</pre>
+
diff --git a/resources/function_help/longest_common_substring b/resources/function_help/longest_common_substring
@@ -0,0 +1,17 @@
+<h3>longest_common_substring function</h3>
+Returns the longest common substring between two strings. This substring is the longest
+string that is a substring of the two input strings. Eg, the longest common substring
+of "ABABC" and "BABCA" is "ABC". The substring is case sensitive.
+
+<h4>Syntax</h4>
+<pre>longest_common_substring(string1,string2)</pre>
+
+<h4>Arguments</h4>
+string1 &rarr; a string<br />
+string2 &rarr; a string<br />
+
+<h4>Example</h4>
+<pre> longest_common_substring('ABABC','BABCA') &rarr; 'ABC'</pre><br />
+<pre> longest_common_substring('abcDeF','abcdef') &rarr; 'abc'</pre><br />
+<pre> longest_common_substring(upper('abcDeF'),upper('abcdex')) &rarr; 'ABCDE'</pre>
+
diff --git a/resources/function_help/soundex b/resources/function_help/soundex
@@ -0,0 +1,15 @@
+<h3>soundex function</h3>
+Returns the Soundex representation of a string. Soundex is a phonetic matching algorithm,
+so strings with similar sounds should be represented by the same Soundex code.
+
+<h4>Syntax</h4>
+<pre>soundex(string)</pre>
+
+<h4>Arguments</h4>
+string &rarr; a string
+
+<h4>Example</h4>
+<pre> soundex('robert') &rarr; 'R163'</pre><br />
+<pre> soundex('rupert') &rarr; 'R163'</pre><br />
+<pre> soundex('rubin') &rarr; 'R150'</pre>
+
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
@@ -169,6 +169,7 @@ SET(QGIS_CORE_SRCS
   qgssnappingutils.cpp
   qgsspatialindex.cpp
   qgsstatisticalsummary.cpp
+  qgsstringutils.cpp
   qgstransaction.cpp
   qgstolerance.cpp
   qgsvectordataprovider.cpp
@@ -590,6 +591,7 @@ SET(QGIS_CORE_HDRS
   qgssnappingutils.h
   qgsspatialindex.h
   qgsstatisticalsummary.h
+  qgsstringutils.h
   qgstolerance.h
   qgstransaction.h
   qgsvectordataprovider.h

diff --git a/src/core/qgsexpression.cpp b/src/core/qgsexpression.cpp
@@ -35,6 +35,7 @@
 #include "qgssymbollayerv2utils.h"
 #include "qgsvectorcolorrampv2.h"
 #include "qgsstylev2.h"
+#include "qgsstringutils.h"
 
 // from parser
 extern QgsExpression::Node* parseExpression( const QString& str, QString& parserErrorMsg );
@@ -657,6 +658,34 @@ static QVariant fcnTrim( const QVariantList& values, const QgsFeature*, QgsExpre
   return QVariant( str.trimmed() );
 }
 
+static QVariant fcnLevenshtein( const QVariantList& values, const QgsFeature*, QgsExpression* parent )
+{
+  QString string1 = getStringValue( values.at( 0 ), parent );
+  QString string2 = getStringValue( values.at( 1 ), parent );
+  return QVariant( QgsStringUtils::levenshteinDistance( string1, string2, true) );
+}
+
+static QVariant fcnLCS( const QVariantList& values, const QgsFeature*, QgsExpression* parent )
+{
+  QString string1 = getStringValue( values.at( 0 ), parent );
+  QString string2 = getStringValue( values.at( 1 ), parent );
+  return QVariant( QgsStringUtils::longestCommonSubstring( string1, string2, true ) );
+}
+
+static QVariant fcnHamming( const QVariantList& values, const QgsFeature*, QgsExpression* parent )
+{
+  QString string1 = getStringValue( values.at( 0 ), parent );
+  QString string2 = getStringValue( values.at( 1 ), parent );
+  int dist = QgsStringUtils::hammingDistance( string1, string2 );
+  return ( dist < 0 ? QVariant() : QVariant( QgsStringUtils::hammingDistance( string1, string2, true ) ) );
+}
+
+static QVariant fcnSoundex( const QVariantList& values, const QgsFeature*, QgsExpression* parent )
+{
+  QString string = getStringValue( values.at( 0 ), parent );
+  return QVariant( QgsStringUtils::soundex( string ) );
+}
+
 static QVariant fcnWordwrap( const QVariantList& values, const QgsFeature*, QgsExpression* parent )
 {
   if ( values.length() == 2 || values.length() == 3 )
@@ -1697,7 +1726,8 @@ const QStringList& QgsExpression::BuiltinFunctions()
     << "distance" << "intersection" << "sym_difference" << "combine"
     << "union" << "geom_to_wkt" << "geomToWKT" << "geometry"
     << "transform" << "get_feature" << "getFeature"
-    << "attribute"
+    << "attribute" << "levenshtein" << "longest_common_substring" << "hamming_distance"
+    << "soundex"
     << "$rownum" << "$id" << "$scale" << "_specialcol_";
   }
   return gmBuiltinFunctions;
@@ -1757,6 +1787,10 @@ const QList<QgsExpression::Function*>& QgsExpression::Functions()
     << new StaticFunction( "upper", 1, fcnUpper, "String" )
     << new StaticFunction( "title", 1, fcnTitle, "String" )
     << new StaticFunction( "trim", 1, fcnTrim, "String" )
+    << new StaticFunction( "levenshtein", 2, fcnLevenshtein, "Fuzzy Matching" )
+    << new StaticFunction( "longest_common_substring", 2, fcnLCS, "Fuzzy Matching" )
+    << new StaticFunction( "hamming_distance", 2, fcnHamming, "Fuzzy Matching" )
+    << new StaticFunction( "soundex", 1, fcnSoundex, "Fuzzy Matching" )
     << new StaticFunction( "wordwrap", -1, fcnWordwrap, "String" )
     << new StaticFunction( "length", 1, fcnLength, "String" )
     << new StaticFunction( "replace", 3, fcnReplace, "String" )