Skip to content
Permalink
Browse files

Add a bunch of optimised fuzzy string matching algorithms.

A new QgsStringUtils class has been added containing some
common fuzzy matching algorithms, including Levenshtein edit
distance and Soundex. These can be used for finding "similar"
strings in a table.

Expression functions for these algorithms have also been
added to a new "Fuzzy Matching" group.
  • Loading branch information
nyalldawson committed Jun 27, 2015
1 parent 79305b2 commit feb3bee85837d707b1b10c14064fcb55364e282d
@@ -104,6 +104,7 @@
%Include qgssnappingutils.sip
%Include qgsspatialindex.sip
%Include qgsstatisticalsummary.sip
%Include qgsstringutils.sip
%Include qgstolerance.sip
%Include qgsvectordataprovider.sip
%Include qgsvectorfilewriter.sip
@@ -0,0 +1,51 @@
/** \ingroup core
* \class QgsStringUtils
* \brief Utility functions for working with strings.
* \note Added in version 2.11
*/

class QgsStringUtils
{
%TypeHeaderCode
#include <qgsstringutils.h>
%End

public:
/** Returns the Levenshtein edit distance between two strings. This equates to the minimum
* number of character edits (insertions, deletions or substitutions) required to change
* one string to another.
* @param string1 first string
* @param string2 second string
* @param caseSensitive set to true for case sensitive comparison
* @returns edit distance. Lower distances indicate more similiar strings.
*/
static int levenshteinDistance( const QString &string1, const QString &string2, bool caseSensitive = false );

/** Returns the longest common substring between two strings. This substring is the longest
* string that is a substring of the two input strings. Eg, the longest common substring
* of "ABABC" and "BABCA" is "ABC".
* @param string1 first string
* @param string2 second string
* @param caseSensitive set to true for case sensitive comparison
* @returns longest common substring
*/
static QString longestCommonSubstring( const QString &string1, const QString &string2, bool caseSensitive = false );

/** Returns the Hamming distance between two strings. This equates to the number of characters at
* corresponding positions within the input strings where the characters are different. The input
* strings must be the same length.
* @param string1 first string
* @param string2 second string
* @param caseSensitive set to true for case sensitive comparison
* @returns Hamming distance between strings, or -1 if strings are different lengths.
*/
static int hammingDistance( const QString &string1, const QString &string2, bool caseSensitive = false );

/** Returns the Soundex representation of a string. Soundex is a phonetic matching algorithm,
* so strings with similar sounds should be represented by the same Soundex code.
* @param string input string
* @returns 4 letter Soundex code
*/
static QString soundex( const QString &string );

};
@@ -0,0 +1,17 @@
<h3>hamming_distance function</h3>
Returns the Hamming distance between two strings. This equates to the number of characters at
corresponding positions within the input strings where the characters are different. The input
strings must be the same length, and the comparison is case-sensitive.

<h4>Syntax</h4>
<pre>hamming_distance(string1,string2)</pre>

<h4>Arguments</h4>
string1 &rarr; a string<br />
string2 &rarr; a string<br />

<h4>Example</h4>
<pre> hamming_distance('abc','xec') &rarr; 2</pre><br />
<pre> hamming_distance('abc','ABc') &rarr; 2</pre><br />
<pre> hamming_distance(upper('abc'),upper('ABC')) &rarr; 0</pre>

@@ -0,0 +1,20 @@
<h3>levenshtein function</h3>
Returns the Levenshtein edit distance between two strings. This equates to the minimum
number of character edits (insertions, deletions or substitutions) required to change
one string to another.<br />
The Levenshtein distance is a measure of the similarity between two strings. Smaller
distances mean the strings are more similar, and larger distances indicate more
different strings. The distance is case sensitive.

<h4>Syntax</h4>
<pre>levenshtein(string1,string2)</pre>

<h4>Arguments</h4>
string1 &rarr; a string<br />
string2 &rarr; a string<br />

<h4>Example</h4>
<pre> levenshtein('kittens','mitten') &rarr; 2</pre><br />
<pre> levenshtein('Kitten','kitten') &rarr; 1</pre><br />
<pre> levenshtein(upper('Kitten'),upper('kitten')) &rarr; 0</pre>

@@ -0,0 +1,17 @@
<h3>longest_common_substring function</h3>
Returns the longest common substring between two strings. This substring is the longest
string that is a substring of the two input strings. Eg, the longest common substring
of "ABABC" and "BABCA" is "ABC". The substring is case sensitive.

<h4>Syntax</h4>
<pre>longest_common_substring(string1,string2)</pre>

<h4>Arguments</h4>
string1 &rarr; a string<br />
string2 &rarr; a string<br />

<h4>Example</h4>
<pre> longest_common_substring('ABABC','BABCA') &rarr; 'ABC'</pre><br />
<pre> longest_common_substring('abcDeF','abcdef') &rarr; 'abc'</pre><br />
<pre> longest_common_substring(upper('abcDeF'),upper('abcdex')) &rarr; 'ABCDE'</pre>

@@ -0,0 +1,15 @@
<h3>soundex function</h3>
Returns the Soundex representation of a string. Soundex is a phonetic matching algorithm,
so strings with similar sounds should be represented by the same Soundex code.

<h4>Syntax</h4>
<pre>soundex(string)</pre>

<h4>Arguments</h4>
string &rarr; a string

<h4>Example</h4>
<pre> soundex('robert') &rarr; 'R163'</pre><br />
<pre> soundex('rupert') &rarr; 'R163'</pre><br />
<pre> soundex('rubin') &rarr; 'R150'</pre>

@@ -169,6 +169,7 @@ SET(QGIS_CORE_SRCS
qgssnappingutils.cpp
qgsspatialindex.cpp
qgsstatisticalsummary.cpp
qgsstringutils.cpp
qgstransaction.cpp
qgstolerance.cpp
qgsvectordataprovider.cpp
@@ -590,6 +591,7 @@ SET(QGIS_CORE_HDRS
qgssnappingutils.h
qgsspatialindex.h
qgsstatisticalsummary.h
qgsstringutils.h
qgstolerance.h
qgstransaction.h
qgsvectordataprovider.h
@@ -35,6 +35,7 @@
#include "qgssymbollayerv2utils.h"
#include "qgsvectorcolorrampv2.h"
#include "qgsstylev2.h"
#include "qgsstringutils.h"

// from parser
extern QgsExpression::Node* parseExpression( const QString& str, QString& parserErrorMsg );
@@ -657,6 +658,34 @@ static QVariant fcnTrim( const QVariantList& values, const QgsFeature*, QgsExpre
return QVariant( str.trimmed() );
}

static QVariant fcnLevenshtein( const QVariantList& values, const QgsFeature*, QgsExpression* parent )
{
QString string1 = getStringValue( values.at( 0 ), parent );
QString string2 = getStringValue( values.at( 1 ), parent );
return QVariant( QgsStringUtils::levenshteinDistance( string1, string2, true) );
}

static QVariant fcnLCS( const QVariantList& values, const QgsFeature*, QgsExpression* parent )
{
QString string1 = getStringValue( values.at( 0 ), parent );
QString string2 = getStringValue( values.at( 1 ), parent );
return QVariant( QgsStringUtils::longestCommonSubstring( string1, string2, true ) );
}

static QVariant fcnHamming( const QVariantList& values, const QgsFeature*, QgsExpression* parent )
{
QString string1 = getStringValue( values.at( 0 ), parent );
QString string2 = getStringValue( values.at( 1 ), parent );
int dist = QgsStringUtils::hammingDistance( string1, string2 );
return ( dist < 0 ? QVariant() : QVariant( QgsStringUtils::hammingDistance( string1, string2, true ) ) );
}

static QVariant fcnSoundex( const QVariantList& values, const QgsFeature*, QgsExpression* parent )
{
QString string = getStringValue( values.at( 0 ), parent );
return QVariant( QgsStringUtils::soundex( string ) );
}

static QVariant fcnWordwrap( const QVariantList& values, const QgsFeature*, QgsExpression* parent )
{
if ( values.length() == 2 || values.length() == 3 )
@@ -1697,7 +1726,8 @@ const QStringList& QgsExpression::BuiltinFunctions()
<< "distance" << "intersection" << "sym_difference" << "combine"
<< "union" << "geom_to_wkt" << "geomToWKT" << "geometry"
<< "transform" << "get_feature" << "getFeature"
<< "attribute"
<< "attribute" << "levenshtein" << "longest_common_substring" << "hamming_distance"
<< "soundex"
<< "$rownum" << "$id" << "$scale" << "_specialcol_";
}
return gmBuiltinFunctions;
@@ -1757,6 +1787,10 @@ const QList<QgsExpression::Function*>& QgsExpression::Functions()
<< new StaticFunction( "upper", 1, fcnUpper, "String" )
<< new StaticFunction( "title", 1, fcnTitle, "String" )
<< new StaticFunction( "trim", 1, fcnTrim, "String" )
<< new StaticFunction( "levenshtein", 2, fcnLevenshtein, "Fuzzy Matching" )
<< new StaticFunction( "longest_common_substring", 2, fcnLCS, "Fuzzy Matching" )
<< new StaticFunction( "hamming_distance", 2, fcnHamming, "Fuzzy Matching" )
<< new StaticFunction( "soundex", 1, fcnSoundex, "Fuzzy Matching" )
<< new StaticFunction( "wordwrap", -1, fcnWordwrap, "String" )
<< new StaticFunction( "length", 1, fcnLength, "String" )
<< new StaticFunction( "replace", 3, fcnReplace, "String" )

0 comments on commit feb3bee

Please sign in to comment.
You can’t perform that action at this time.