Skip to content

Commit

Permalink
Add a bunch of optimised fuzzy string matching algorithms.
Browse files Browse the repository at this point in the history
A new QgsStringUtils class has been added containing some
common fuzzy matching algorithms, including Levenshtein edit
distance and Soundex. These can be used for finding "similar"
strings in a table.

Expression functions for these algorithms have also been
added to a new "Fuzzy Matching" group.
  • Loading branch information
nyalldawson committed Jun 27, 2015
1 parent 79305b2 commit feb3bee
Show file tree
Hide file tree
Showing 13 changed files with 661 additions and 2 deletions.
1 change: 1 addition & 0 deletions python/core/core.sip
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@
%Include qgssnappingutils.sip
%Include qgsspatialindex.sip
%Include qgsstatisticalsummary.sip
%Include qgsstringutils.sip
%Include qgstolerance.sip
%Include qgsvectordataprovider.sip
%Include qgsvectorfilewriter.sip
Expand Down
51 changes: 51 additions & 0 deletions python/core/qgsstringutils.sip
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/** \ingroup core
* \class QgsStringUtils
* \brief Utility functions for working with strings.
* \note Added in version 2.11
*/

class QgsStringUtils
{
%TypeHeaderCode
#include <qgsstringutils.h>
%End

public:
/** Returns the Levenshtein edit distance between two strings. This equates to the minimum
* number of character edits (insertions, deletions or substitutions) required to change
* one string to another.
* @param string1 first string
* @param string2 second string
* @param caseSensitive set to true for case sensitive comparison
* @returns edit distance. Lower distances indicate more similiar strings.
*/
static int levenshteinDistance( const QString &string1, const QString &string2, bool caseSensitive = false );

/** Returns the longest common substring between two strings. This substring is the longest
* string that is a substring of the two input strings. Eg, the longest common substring
* of "ABABC" and "BABCA" is "ABC".
* @param string1 first string
* @param string2 second string
* @param caseSensitive set to true for case sensitive comparison
* @returns longest common substring
*/
static QString longestCommonSubstring( const QString &string1, const QString &string2, bool caseSensitive = false );

/** Returns the Hamming distance between two strings. This equates to the number of characters at
* corresponding positions within the input strings where the characters are different. The input
* strings must be the same length.
* @param string1 first string
* @param string2 second string
* @param caseSensitive set to true for case sensitive comparison
* @returns Hamming distance between strings, or -1 if strings are different lengths.
*/
static int hammingDistance( const QString &string1, const QString &string2, bool caseSensitive = false );

/** Returns the Soundex representation of a string. Soundex is a phonetic matching algorithm,
* so strings with similar sounds should be represented by the same Soundex code.
* @param string input string
* @returns 4 letter Soundex code
*/
static QString soundex( const QString &string );

};
17 changes: 17 additions & 0 deletions resources/function_help/hamming_distance
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<h3>hamming_distance function</h3>
Returns the Hamming distance between two strings. This equates to the number of characters at
corresponding positions within the input strings where the characters are different. The input
strings must be the same length, and the comparison is case-sensitive.

<h4>Syntax</h4>
<pre>hamming_distance(string1,string2)</pre>

<h4>Arguments</h4>
string1 &rarr; a string<br />
string2 &rarr; a string<br />

<h4>Example</h4>
<pre> hamming_distance('abc','xec') &rarr; 2</pre><br />
<pre> hamming_distance('abc','ABc') &rarr; 2</pre><br />
<pre> hamming_distance(upper('abc'),upper('ABC')) &rarr; 0</pre>

20 changes: 20 additions & 0 deletions resources/function_help/levenshtein
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
<h3>levenshtein function</h3>
Returns the Levenshtein edit distance between two strings. This equates to the minimum
number of character edits (insertions, deletions or substitutions) required to change
one string to another.<br />
The Levenshtein distance is a measure of the similarity between two strings. Smaller
distances mean the strings are more similar, and larger distances indicate more
different strings. The distance is case sensitive.

<h4>Syntax</h4>
<pre>levenshtein(string1,string2)</pre>

<h4>Arguments</h4>
string1 &rarr; a string<br />
string2 &rarr; a string<br />

<h4>Example</h4>
<pre> levenshtein('kittens','mitten') &rarr; 2</pre><br />
<pre> levenshtein('Kitten','kitten') &rarr; 1</pre><br />
<pre> levenshtein(upper('Kitten'),upper('kitten')) &rarr; 0</pre>

17 changes: 17 additions & 0 deletions resources/function_help/longest_common_substring
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<h3>longest_common_substring function</h3>
Returns the longest common substring between two strings. This substring is the longest
string that is a substring of the two input strings. Eg, the longest common substring
of "ABABC" and "BABCA" is "ABC". The substring is case sensitive.

<h4>Syntax</h4>
<pre>longest_common_substring(string1,string2)</pre>

<h4>Arguments</h4>
string1 &rarr; a string<br />
string2 &rarr; a string<br />

<h4>Example</h4>
<pre> longest_common_substring('ABABC','BABCA') &rarr; 'ABC'</pre><br />
<pre> longest_common_substring('abcDeF','abcdef') &rarr; 'abc'</pre><br />
<pre> longest_common_substring(upper('abcDeF'),upper('abcdex')) &rarr; 'ABCDE'</pre>

15 changes: 15 additions & 0 deletions resources/function_help/soundex
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
<h3>soundex function</h3>
Returns the Soundex representation of a string. Soundex is a phonetic matching algorithm,
so strings with similar sounds should be represented by the same Soundex code.

<h4>Syntax</h4>
<pre>soundex(string)</pre>

<h4>Arguments</h4>
string &rarr; a string

<h4>Example</h4>
<pre> soundex('robert') &rarr; 'R163'</pre><br />
<pre> soundex('rupert') &rarr; 'R163'</pre><br />
<pre> soundex('rubin') &rarr; 'R150'</pre>

2 changes: 2 additions & 0 deletions src/core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ SET(QGIS_CORE_SRCS
qgssnappingutils.cpp
qgsspatialindex.cpp
qgsstatisticalsummary.cpp
qgsstringutils.cpp
qgstransaction.cpp
qgstolerance.cpp
qgsvectordataprovider.cpp
Expand Down Expand Up @@ -590,6 +591,7 @@ SET(QGIS_CORE_HDRS
qgssnappingutils.h
qgsspatialindex.h
qgsstatisticalsummary.h
qgsstringutils.h
qgstolerance.h
qgstransaction.h
qgsvectordataprovider.h
Expand Down
36 changes: 35 additions & 1 deletion src/core/qgsexpression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
#include "qgssymbollayerv2utils.h"
#include "qgsvectorcolorrampv2.h"
#include "qgsstylev2.h"
#include "qgsstringutils.h"

// from parser
extern QgsExpression::Node* parseExpression( const QString& str, QString& parserErrorMsg );
Expand Down Expand Up @@ -657,6 +658,34 @@ static QVariant fcnTrim( const QVariantList& values, const QgsFeature*, QgsExpre
return QVariant( str.trimmed() );
}

static QVariant fcnLevenshtein( const QVariantList& values, const QgsFeature*, QgsExpression* parent )
{
QString string1 = getStringValue( values.at( 0 ), parent );
QString string2 = getStringValue( values.at( 1 ), parent );
return QVariant( QgsStringUtils::levenshteinDistance( string1, string2, true) );
}

static QVariant fcnLCS( const QVariantList& values, const QgsFeature*, QgsExpression* parent )
{
QString string1 = getStringValue( values.at( 0 ), parent );
QString string2 = getStringValue( values.at( 1 ), parent );
return QVariant( QgsStringUtils::longestCommonSubstring( string1, string2, true ) );
}

static QVariant fcnHamming( const QVariantList& values, const QgsFeature*, QgsExpression* parent )
{
QString string1 = getStringValue( values.at( 0 ), parent );
QString string2 = getStringValue( values.at( 1 ), parent );
int dist = QgsStringUtils::hammingDistance( string1, string2 );
return ( dist < 0 ? QVariant() : QVariant( QgsStringUtils::hammingDistance( string1, string2, true ) ) );
}

static QVariant fcnSoundex( const QVariantList& values, const QgsFeature*, QgsExpression* parent )
{
QString string = getStringValue( values.at( 0 ), parent );
return QVariant( QgsStringUtils::soundex( string ) );
}

static QVariant fcnWordwrap( const QVariantList& values, const QgsFeature*, QgsExpression* parent )
{
if ( values.length() == 2 || values.length() == 3 )
Expand Down Expand Up @@ -1697,7 +1726,8 @@ const QStringList& QgsExpression::BuiltinFunctions()
<< "distance" << "intersection" << "sym_difference" << "combine"
<< "union" << "geom_to_wkt" << "geomToWKT" << "geometry"
<< "transform" << "get_feature" << "getFeature"
<< "attribute"
<< "attribute" << "levenshtein" << "longest_common_substring" << "hamming_distance"
<< "soundex"
<< "$rownum" << "$id" << "$scale" << "_specialcol_";
}
return gmBuiltinFunctions;
Expand Down Expand Up @@ -1757,6 +1787,10 @@ const QList<QgsExpression::Function*>& QgsExpression::Functions()
<< new StaticFunction( "upper", 1, fcnUpper, "String" )
<< new StaticFunction( "title", 1, fcnTitle, "String" )
<< new StaticFunction( "trim", 1, fcnTrim, "String" )
<< new StaticFunction( "levenshtein", 2, fcnLevenshtein, "Fuzzy Matching" )
<< new StaticFunction( "longest_common_substring", 2, fcnLCS, "Fuzzy Matching" )
<< new StaticFunction( "hamming_distance", 2, fcnHamming, "Fuzzy Matching" )
<< new StaticFunction( "soundex", 1, fcnSoundex, "Fuzzy Matching" )
<< new StaticFunction( "wordwrap", -1, fcnWordwrap, "String" )
<< new StaticFunction( "length", 1, fcnLength, "String" )
<< new StaticFunction( "replace", 3, fcnReplace, "String" )
Expand Down
Loading

0 comments on commit feb3bee

Please sign in to comment.