-
-
Notifications
You must be signed in to change notification settings - Fork 3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add a bunch of optimised fuzzy string matching algorithms.
A new QgsStringUtils class has been added containing some common fuzzy matching algorithms, including Levenshtein edit distance and Soundex. These can be used for finding "similar" strings in a table. Expression functions for these algorithms have also been added to a new "Fuzzy Matching" group.
- Loading branch information
1 parent
79305b2
commit feb3bee
Showing
13 changed files
with
661 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
/** \ingroup core | ||
* \class QgsStringUtils | ||
* \brief Utility functions for working with strings. | ||
* \note Added in version 2.11 | ||
*/ | ||
|
||
class QgsStringUtils | ||
{ | ||
%TypeHeaderCode | ||
#include <qgsstringutils.h> | ||
%End | ||
|
||
public: | ||
/** Returns the Levenshtein edit distance between two strings. This equates to the minimum | ||
* number of character edits (insertions, deletions or substitutions) required to change | ||
* one string to another. | ||
* @param string1 first string | ||
* @param string2 second string | ||
* @param caseSensitive set to true for case sensitive comparison (defaults to false) | ||
* @returns edit distance. Lower distances indicate more similar strings. | ||
*/ | ||
static int levenshteinDistance( const QString &string1, const QString &string2, bool caseSensitive = false ); | ||
|
||
/** Returns the longest common substring between two strings. This substring is the longest | ||
* string that is a substring of the two input strings. Eg, the longest common substring | ||
* of "ABABC" and "BABCA" is "BABC". | ||
* @param string1 first string | ||
* @param string2 second string | ||
* @param caseSensitive set to true for case sensitive comparison (defaults to false) | ||
* @returns longest common substring | ||
*/ | ||
static QString longestCommonSubstring( const QString &string1, const QString &string2, bool caseSensitive = false ); | ||
|
||
/** Returns the Hamming distance between two strings. This equates to the number of characters at | ||
* corresponding positions within the input strings where the characters are different. The input | ||
* strings must be the same length. | ||
* @param string1 first string | ||
* @param string2 second string | ||
* @param caseSensitive set to true for case sensitive comparison (defaults to false) | ||
* @returns Hamming distance between strings, or -1 if strings are different lengths. | ||
*/ | ||
static int hammingDistance( const QString &string1, const QString &string2, bool caseSensitive = false ); | ||
|
||
/** Returns the Soundex representation of a string. Soundex is a phonetic matching algorithm, | ||
* so strings with similar sounds should be represented by the same Soundex code. | ||
* @param string input string | ||
* @returns 4 character Soundex code (a letter followed by three digits, eg "R163") | ||
*/ | ||
static QString soundex( const QString &string ); | ||
|
||
}; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
<h3>hamming_distance function</h3> | ||
Returns the Hamming distance between two strings. This equates to the number of characters at | ||
corresponding positions within the input strings where the characters are different. The input | ||
strings must be the same length, and the comparison is case-sensitive. | ||
|
||
<h4>Syntax</h4> | ||
<pre>hamming_distance(string1,string2)</pre> | ||
|
||
<h4>Arguments</h4> | ||
string1 → a string<br /> | ||
string2 → a string<br /> | ||
|
||
<h4>Example</h4> | ||
<pre> hamming_distance('abc','xec') → 2</pre><br /> | ||
<pre> hamming_distance('abc','ABc') → 2</pre><br /> | ||
<pre> hamming_distance(upper('abc'),upper('ABC')) → 0</pre> | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
<h3>levenshtein function</h3> | ||
Returns the Levenshtein edit distance between two strings. This equates to the minimum | ||
number of character edits (insertions, deletions or substitutions) required to change | ||
one string to another.<br /> | ||
The Levenshtein distance is a measure of the similarity between two strings. Smaller | ||
distances mean the strings are more similar, and larger distances indicate more | ||
different strings. The distance is case sensitive. | ||
|
||
<h4>Syntax</h4> | ||
<pre>levenshtein(string1,string2)</pre> | ||
|
||
<h4>Arguments</h4> | ||
string1 → a string<br /> | ||
string2 → a string<br /> | ||
|
||
<h4>Example</h4> | ||
<pre> levenshtein('kittens','mitten') → 2</pre><br /> | ||
<pre> levenshtein('Kitten','kitten') → 1</pre><br /> | ||
<pre> levenshtein(upper('Kitten'),upper('kitten')) → 0</pre> | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
<h3>longest_common_substring function</h3> | ||
Returns the longest common substring between two strings. This substring is the longest | ||
string that is a substring of the two input strings. Eg, the longest common substring | ||
of "ABABC" and "BABCA" is "BABC". The comparison is case sensitive. | ||
|
||
<h4>Syntax</h4> | ||
<pre>longest_common_substring(string1,string2)</pre> | ||
|
||
<h4>Arguments</h4> | ||
string1 → a string<br /> | ||
string2 → a string<br /> | ||
|
||
<h4>Example</h4> | ||
<pre> longest_common_substring('ABABC','BABCA') → 'BABC'</pre><br /> | ||
<pre> longest_common_substring('abcDeF','abcdef') → 'abc'</pre><br /> | ||
<pre> longest_common_substring(upper('abcDeF'),upper('abcdex')) → 'ABCDE'</pre> | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
<h3>soundex function</h3> | ||
Returns the Soundex representation of a string. Soundex is a phonetic matching algorithm, | ||
so strings with similar sounds should be represented by the same Soundex code. | ||
|
||
<h4>Syntax</h4> | ||
<pre>soundex(string)</pre> | ||
|
||
<h4>Arguments</h4> | ||
string → a string | ||
|
||
<h4>Example</h4> | ||
<pre> soundex('robert') → 'R163'</pre><br /> | ||
<pre> soundex('rupert') → 'R163'</pre><br /> | ||
<pre> soundex('rubin') → 'R150'</pre> | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.