Skip to content

Commit

Permalink
add optimal string alignment distance
Browse files Browse the repository at this point in the history
  • Loading branch information
maxbachmann committed Sep 15, 2022
1 parent f3a9fee commit 06c5821
Show file tree
Hide file tree
Showing 12 changed files with 716 additions and 30 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
## Changelog

### [1.6.0] - 2022-09-16
#### Added
- add optimal string alignment (OSA) distance

### [1.5.0] - 2022-09-11
#### Fix
- `fuzz::partial_ratio` did not find the optimal alignment in some edge cases
Expand Down
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ if (CMAKE_BINARY_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
message(FATAL_ERROR "Building in-source is not supported! Create a build dir and remove ${CMAKE_SOURCE_DIR}/CMakeCache.txt")
endif()

project(rapidfuzz LANGUAGES CXX VERSION 1.5.0)
project(rapidfuzz LANGUAGES CXX VERSION 1.6.0)

list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake")
include(GNUInstallDirs)
Expand Down
311 changes: 310 additions & 1 deletion extras/rapidfuzz_amalgamated.hpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// Licensed under the MIT License <http://opensource.org/licenses/MIT>.
// SPDX-License-Identifier: MIT
// RapidFuzz v1.0.2
// Generated: 2022-09-11 21:50:18.972945
// Generated: 2022-09-15 23:23:54.058287
// ----------------------------------------------------------
// This file is an amalgamation of multiple different files.
// You probably shouldn't edit it directly.
Expand Down Expand Up @@ -4771,6 +4771,315 @@ CachedLevenshtein(InputIt1 first1, InputIt1 last1, LevenshteinWeightTable aWeigh

} // namespace rapidfuzz

#include <cmath>
#include <numeric>

#include <stdexcept>

namespace rapidfuzz::detail {

/**
 * @brief Bit-parallel implementation of the OSA distance for short patterns.
 *
 * The whole of s1 is tracked in a single 64-bit word, so this implementation
 * requires the first string to have a length <= 64. An empty s1 is handled
 * up front, because the bit mask for the last pattern position cannot be
 * formed for it.
 * The algorithm used is described @cite hyrro_2002 and has a time complexity
 * of O(N). Comments and variable names in the implementation follow the
 * paper. This implementation is used internally when the strings are short enough.
 *
 * @tparam PM_Vec   pattern match vector type; must provide `get(block, ch)`
 *                  returning the 64-bit match mask of `ch` for the given block
 * @tparam InputIt1 iterator type of the first sequence
 * @tparam InputIt2 iterator type of the second sequence
 *
 * @param PM   match masks built from s1 (only block 0 is read)
 * @param s1   pattern, at most 64 elements
 * @param s2   sequence compared against s1
 * @param max  largest distance that is reported exactly
 *
 * @return the OSA distance between s1 and s2, or max + 1 if it exceeds max
 */
template <typename PM_Vec, typename InputIt1, typename InputIt2>
int64_t osa_hyrroe2003(const PM_Vec& PM, Range<InputIt1> s1, Range<InputIt2> s2, int64_t max)
{
    /* An empty pattern would make the mask below shift by -1, which is undefined
     * behavior. The distance is then simply the number of insertions, |s2|. */
    if (s1.size() == 0) {
        const int64_t dist = static_cast<int64_t>(s2.size());
        return (dist <= max) ? dist : max + 1;
    }

    /* VP is set to 1^m. Shifting by bitwidth would be undefined behavior */
    uint64_t VP = ~UINT64_C(0);
    uint64_t VN = 0;
    uint64_t D0 = 0;
    /* match mask of the previous character of s2, needed for transpositions */
    uint64_t PM_j_old = 0;
    int64_t currDist = s1.size();

    /* mask used when computing D[m,j] in the paper 10^(m-1) */
    uint64_t mask = UINT64_C(1) << (s1.size() - 1);

    /* Searching */
    for (const auto& ch : s2) {
        /* Step 1: Computing D0 */
        uint64_t PM_j = PM.get(0, ch);
        /* TR marks positions where the current and previous characters match
         * in swapped order, i.e. an adjacent transposition */
        uint64_t TR = (((~D0) & PM_j) << 1) & PM_j_old;
        D0 = (((PM_j & VP) + VP) ^ VP) | PM_j | VN;
        D0 = D0 | TR;

        /* Step 2: Computing HP and HN */
        uint64_t HP = VN | ~(D0 | VP);
        uint64_t HN = D0 & VP;

        /* Step 3: Computing the value D[m,j] */
        currDist += bool(HP & mask);
        currDist -= bool(HN & mask);

        /* Step 4: Computing Vp and VN */
        HP = (HP << 1) | 1;
        HN = (HN << 1);

        VP = HN | ~(D0 | HP);
        VN = HP & D0;
        PM_j_old = PM_j;
    }

    return (currDist <= max) ? currDist : max + 1;
}

/**
* @brief Multi-word (blocked) bit-parallel OSA distance.
*
* Same recurrence as osa_hyrroe2003, but the pattern s1 is spread over
* PM.size() 64-bit words and carries are propagated from one word to the next.
* Two rows of per-word state are kept (previous and current character of s2)
* because the transposition term needs values from the previous row.
*
* NOTE(review): s1 is assumed to be non-empty; for an empty s1 the shift that
* builds `Last` would use a negative count. The callers visible in this file
* only use this function for s1.size() >= 64.
*
* @param PM   match masks built from s1, one 64-bit mask per word and character
* @param s1   pattern the match masks were built from
* @param s2   sequence compared against s1
* @param max  largest distance that is reported exactly
*
* @return the OSA distance between s1 and s2, or max + 1 if it exceeds max
*/
template <typename InputIt1, typename InputIt2>
int64_t osa_hyrroe2003_block(const BlockPatternMatchVector& PM, Range<InputIt1> s1, Range<InputIt2> s2,
int64_t max = std::numeric_limits<int64_t>::max())
{
/* per-word state that has to survive from one character of s2 to the next */
struct Row {
uint64_t VP;
uint64_t VN;
uint64_t D0;
uint64_t PM;

/* same start values as the single word variant: VP all ones, everything else 0 */
Row() : VP(~UINT64_C(0)), VN(0), D0(0), PM(0)
{}
};

/* number of bits per word (64) */
ptrdiff_t word_size = sizeof(uint64_t) * 8;
auto words = PM.size();
/* bit of the last character of s1 inside the last word */
uint64_t Last = UINT64_C(1) << ((s1.size() - 1) % word_size);

int64_t currDist = s1.size();
/* Index 0 is a sentinel for "the word before the first word": it is read for
* word == 0 but never written, so it always holds the default Row. The state
* of word w is stored at index w + 1. */
std::vector<Row> old_vecs(words + 1);
std::vector<Row> new_vecs(words + 1);

/* Searching */
for (ptrdiff_t row = 0; row < s2.size(); ++row) {
/* carries into the lowest word: 1 for HP and 0 for HN, matching
* `(HP << 1) | 1` and `(HN << 1)` in the single word variant */
uint64_t HP_carry = 1;
uint64_t HN_carry = 0;

for (size_t word = 0; word < words; word++) {
/* retrieve bit vectors from last iterations */
uint64_t VN = old_vecs[word + 1].VN;
uint64_t VP = old_vecs[word + 1].VP;
uint64_t D0 = old_vecs[word + 1].D0;
/* D0 of the previous word, taken from the previous row */
uint64_t D0_last = old_vecs[word].D0;

/* PM of the previous character of s2 for the same word */
uint64_t PM_j_old = old_vecs[word + 1].PM;
/* PM of the previous word as stored in new_vecs during this row */
uint64_t PM_last = new_vecs[word].PM;

uint64_t PM_j = PM.get(word, s2[row]);
uint64_t X = PM_j;
/* transposition term: the `>> 63` moves the top bit of the previous word
* into bit 0, which is the bit that `<< 1` shifts out of this word's view */
uint64_t TR = ((((~D0) & X) << 1) | (((~D0_last) & PM_last) >> 63)) & PM_j_old;

X |= HN_carry;
D0 = (((X & VP) + VP) ^ VP) | X | VN | TR;

uint64_t HP = VN | ~(D0 | VP);
uint64_t HN = D0 & VP;

/* only the last word contains the bit for D[m,j] */
if (word == words - 1) {
currDist += bool(HP & Last);
currDist -= bool(HN & Last);
}

/* shift HP/HN by one and hand the bit that falls out to the next word */
uint64_t HP_carry_temp = HP_carry;
HP_carry = HP >> 63;
HP = (HP << 1) | HP_carry_temp;
uint64_t HN_carry_temp = HN_carry;
HN_carry = HN >> 63;
HN = (HN << 1) | HN_carry_temp;

new_vecs[word + 1].VP = HN | ~(D0 | HP);
new_vecs[word + 1].VN = HP & D0;
new_vecs[word + 1].D0 = D0;
new_vecs[word + 1].PM = PM_j;
}

/* the row just computed becomes the "previous" row of the next character */
std::swap(new_vecs, old_vecs);
}

return (currDist <= max) ? currDist : max + 1;
}

/**
 * @brief Metric implementation of the optimal string alignment (OSA) distance.
 *
 * All members are private; they are reached through the befriended
 * DistanceBase / NormalizedMetricBase base templates.
 */
class OSA : public DistanceBase<OSA> {
    friend DistanceBase<OSA>;
    friend NormalizedMetricBase<OSA>;

    /** @brief Largest value the distance can take: the length of the longer sequence. */
    template <typename InputIt1, typename InputIt2>
    static int64_t maximum(Range<InputIt1> s1, Range<InputIt2> s2)
    {
        return std::max(s1.size(), s2.size());
    }

    /**
     * @brief Computes the OSA distance between s1 and s2.
     *
     * @param s1            first sequence
     * @param s2            second sequence
     * @param score_cutoff  largest distance that is reported exactly
     *
     * @return the distance, or score_cutoff + 1 if it exceeds score_cutoff
     */
    template <typename InputIt1, typename InputIt2>
    static int64_t _distance(Range<InputIt1> s1, Range<InputIt2> s2, int64_t score_cutoff)
    {
        /* make s1 the shorter sequence, since it is the one encoded as bit pattern */
        if (s2.size() < s1.size()) return _distance(s2, s1, score_cutoff);

        /* The single word kernel builds a mask with a shift by s1.size() - 1, which is
         * undefined for an empty pattern. Against an empty sequence the distance is
         * simply |s2| insertions. */
        if (s1.size() == 0) {
            const int64_t dist = static_cast<int64_t>(s2.size());
            return (dist <= score_cutoff) ? dist : score_cutoff + 1;
        }

        if (s1.size() < 64) return osa_hyrroe2003(PatternMatchVector(s1), s1, s2, score_cutoff);

        return osa_hyrroe2003_block(BlockPatternMatchVector(s1), s1, s2, score_cutoff);
    }
};

} // namespace rapidfuzz::detail

namespace rapidfuzz {

/**
* @brief Calculates the optimal string alignment (OSA) distance between two strings.
*
* @details
* Iterator overload; forwards to detail::OSA::distance. The sequences may have
* different lengths.
*
* @tparam InputIt1 iterator type of the first sequence
* @tparam InputIt2 iterator type of the second sequence
*
* @param first1
* begin of the first sequence
* @param last1
* end of the first sequence
* @param first2
* begin of the second sequence
* @param last2
* end of the second sequence
* @param score_cutoff
* Maximum OSA distance between s1 and s2, that is
* considered as a result. If the distance is bigger than score_cutoff,
* score_cutoff + 1 is returned instead. Default is std::numeric_limits<int64_t>::max(),
* which deactivates this behaviour.
*
* @return OSA distance between s1 and s2
*/
template <typename InputIt1, typename InputIt2>
int64_t osa_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
int64_t score_cutoff = std::numeric_limits<int64_t>::max())
{
return detail::OSA::distance(first1, last1, first2, last2, score_cutoff);
}

/**
* @brief OSA distance between two sentences; forwards to detail::OSA::distance.
*
* @param s1 first sentence
* @param s2 second sentence
* @param score_cutoff maximum distance of interest, defaults to
* std::numeric_limits<int64_t>::max()
*
* @return OSA distance between s1 and s2
*/
template <typename Sentence1, typename Sentence2>
int64_t osa_distance(const Sentence1& s1, const Sentence2& s2,
int64_t score_cutoff = std::numeric_limits<int64_t>::max())
{
return detail::OSA::distance(s1, s2, score_cutoff);
}

/**
* @brief OSA similarity of two iterator ranges; forwards to detail::OSA::similarity.
*
* NOTE(review): the similarity itself is computed by the base class, which is
* not part of this section; presumably it is maximum - distance. Confirm there.
*
* @param score_cutoff similarity threshold, defaults to 0
*/
template <typename InputIt1, typename InputIt2>
int64_t osa_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
int64_t score_cutoff = 0)
{
return detail::OSA::similarity(first1, last1, first2, last2, score_cutoff);
}

/**
* @brief OSA similarity of two sentences; forwards to detail::OSA::similarity.
*
* @param score_cutoff similarity threshold, defaults to 0
*/
template <typename Sentence1, typename Sentence2>
int64_t osa_similarity(const Sentence1& s1, const Sentence2& s2, int64_t score_cutoff = 0)
{
return detail::OSA::similarity(s1, s2, score_cutoff);
}

/**
* @brief Normalized OSA distance of two iterator ranges; forwards to
* detail::OSA::normalized_distance.
*
* @param score_cutoff distance threshold as a float, defaults to 1.0
*/
template <typename InputIt1, typename InputIt2>
double osa_normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff = 1.0)
{
return detail::OSA::normalized_distance(first1, last1, first2, last2, score_cutoff);
}

/**
* @brief Normalized OSA distance of two sentences; forwards to
* detail::OSA::normalized_distance.
*
* @param score_cutoff distance threshold as a float, defaults to 1.0
*/
template <typename Sentence1, typename Sentence2>
double osa_normalized_distance(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 1.0)
{
return detail::OSA::normalized_distance(s1, s2, score_cutoff);
}

/**
* @brief Calculates a normalized OSA similarity
*
* @details
* Iterator overload; forwards to detail::OSA::normalized_similarity.
*
* @tparam InputIt1 iterator type of the first sequence
* @tparam InputIt2 iterator type of the second sequence
*
* @param first1
* begin of the first sequence
* @param last1
* end of the first sequence
* @param first2
* begin of the second sequence
* @param last2
* end of the second sequence
* @param score_cutoff
* Optional argument for a score threshold as a float between 0 and 1.0.
* For ratio < score_cutoff 0 is returned instead. Default is 0,
* which deactivates this behaviour.
*
* @return Normalized OSA similarity between s1 and s2
* as a float between 0 and 1.0
*/
template <typename InputIt1, typename InputIt2>
double osa_normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff = 0.0)
{
return detail::OSA::normalized_similarity(first1, last1, first2, last2, score_cutoff);
}

/**
* @brief Normalized OSA similarity of two sentences; forwards to
* detail::OSA::normalized_similarity.
*
* @param score_cutoff score threshold as a float, defaults to 0.0
*/
template <typename Sentence1, typename Sentence2>
double osa_normalized_similarity(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0.0)
{
return detail::OSA::normalized_similarity(s1, s2, score_cutoff);
}

/**
 * @brief OSA scorer that caches the first sequence.
 *
 * The constructor copies s1 and builds its block pattern match vector once, so
 * that the same s1 can be compared against many s2 without redoing that work.
 *
 * @tparam CharT1 character type used to store the cached sequence
 */
template <typename CharT1>
struct CachedOSA : public detail::CachedDistanceBase<CachedOSA<CharT1>> {
    /** @brief Caches the sentence s1_; delegates to the iterator constructor. */
    template <typename Sentence1>
    CachedOSA(const Sentence1& s1_) : CachedOSA(detail::to_begin(s1_), detail::to_end(s1_))
    {}

    /** @brief Caches the range [first1, last1) and precomputes its match masks. */
    template <typename InputIt1>
    CachedOSA(InputIt1 first1, InputIt1 last1) : s1(first1, last1), PM(detail::Range(first1, last1))
    {}

private:
    friend detail::CachedDistanceBase<CachedOSA<CharT1>>;
    friend detail::CachedNormalizedMetricBase<CachedOSA<CharT1>>;

    /** @brief Largest value the distance can take: the length of the longer sequence. */
    template <typename InputIt2>
    int64_t maximum(detail::Range<InputIt2> s2) const
    {
        return std::max(static_cast<ptrdiff_t>(s1.size()), s2.size());
    }

    /**
     * @brief Computes the OSA distance between the cached sequence and s2.
     *
     * @param s2            sequence compared against the cached sequence
     * @param score_cutoff  largest distance that is reported exactly
     *
     * @return the distance, or score_cutoff + 1 if it exceeds score_cutoff
     */
    template <typename InputIt2>
    int64_t _distance(detail::Range<InputIt2> s2, int64_t score_cutoff) const
    {
        /* An empty cached sequence must not reach the single word kernel: it would
         * shift by s1.size() - 1 == -1 (undefined behavior) and query block 0 of a
         * match vector that was built from an empty string. The distance is |s2|. */
        if (s1.empty()) {
            const int64_t dist = static_cast<int64_t>(s2.size());
            return (dist <= score_cutoff) ? dist : score_cutoff + 1;
        }

        if (s1.size() < 64) return detail::osa_hyrroe2003(PM, detail::Range(s1), s2, score_cutoff);

        return detail::osa_hyrroe2003_block(PM, detail::Range(s1), s2, score_cutoff);
    }

    std::basic_string<CharT1> s1;       /* owned copy of the cached sequence */
    detail::BlockPatternMatchVector PM; /* match masks precomputed from s1 */
};

/* Deduction guides: CachedOSA(sentence) deduces CharT1 from char_type<Sentence1>,
* CachedOSA(first, last) deduces it from the iterator's value type. */
template <typename Sentence1>
CachedOSA(const Sentence1& s1_) -> CachedOSA<char_type<Sentence1>>;

template <typename InputIt1>
CachedOSA(InputIt1 first1, InputIt1 last1) -> CachedOSA<iter_value_t<InputIt1>>;
/**@}*/

} // namespace rapidfuzz

namespace rapidfuzz {

template <typename CharT, typename InputIt1, typename InputIt2>
Expand Down
1 change: 1 addition & 0 deletions rapidfuzz/distance.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <rapidfuzz/distance/Indel.hpp>
#include <rapidfuzz/distance/LCSseq.hpp>
#include <rapidfuzz/distance/Levenshtein.hpp>
#include <rapidfuzz/distance/OSA.hpp>

namespace rapidfuzz {

Expand Down
Loading

0 comments on commit 06c5821

Please sign in to comment.