Skip to content

Commit

Permalink
feat: spelling correction (#228)
Browse files Browse the repository at this point in the history
* correction

* sym delete search

* reverts

* edit distance

* You know nothing Jon Snow

* syllabify with correction

* shit OOP

* load corrector from dictionary

* perform syllabifier with correction in script translator

* comfort compiler

* add test

* more test cases

* fix edge case

* add test case for multiple edge

* allow correction edges exist

* BFS approximate NN search

* fix a wild pointer

* limit the correction candidates' showing up

* change the order of SpellingType

* only take normal spelling corrections

* fix syllabifier

* refactors on corrections

* address style issues

* chore(test/corrector_test): DISABLE_ non-passing test

* chore(dict/corrector): move to src/rime/dict/
  • Loading branch information
nameoverflow authored and lotem committed Dec 14, 2018
1 parent edf6a0b commit ad3638a
Show file tree
Hide file tree
Showing 16 changed files with 863 additions and 96 deletions.
57 changes: 49 additions & 8 deletions src/rime/algo/syllabifier.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,13 @@
//
#include <queue>
#include <boost/range/adaptor/reversed.hpp>
#include <rime/dict/prism.h>
#include <rime/algo/syllabifier.h>
#include <rime/dict/corrector.h>
#include <rime/dict/prism.h>
#include "syllabifier.h"

namespace rime {
using namespace corrector;

using Vertex = pair<size_t, SpellingType>;
using VertexQueue = std::priority_queue<Vertex,
Expand All @@ -35,16 +38,36 @@ int Syllabifier::BuildSyllableGraph(const string &input,
// record a visit to the vertex
if (graph->vertices.find(current_pos) == graph->vertices.end())
graph->vertices.insert(vertex); // preferred spelling type comes first
else
else {
// graph->vertices[current_pos] = std::min(vertex.second, graph->vertices[current_pos]);
continue; // discard worse spelling types
}

if (current_pos > farthest)
farthest = current_pos;
DLOG(INFO) << "current_pos: " << current_pos;

// see where we can go by advancing a syllable
vector<Prism::Match> matches;
prism.CommonPrefixSearch(input.substr(current_pos), &matches);
set<SyllableId> match_set;
auto current_input = input.substr(current_pos);
prism.CommonPrefixSearch(current_input, &matches);
for (auto &m : matches) {
match_set.insert(m.value);
}
if (enable_correction_) {
Corrections corrections;
corrector_->ToleranceSearch(prism, current_input, &corrections, 5);
for (const auto &m : corrections) {
for (auto accessor = prism.QuerySpelling(m.first); !accessor.exhausted(); accessor.Next()) {
if (accessor.properties().type == kNormalSpelling) {
matches.push_back({ m.first, m.second.length });
break;
}
}
}
}

if (!matches.empty()) {
auto& end_vertices(graph->edges[current_pos]);
for (const auto& m : matches) {
Expand All @@ -56,15 +79,15 @@ int Syllabifier::BuildSyllableGraph(const string &input,
++end_pos;
DLOG(INFO) << "end_pos: " << end_pos;
bool matches_input = (current_pos == 0 && end_pos == input.length());
SpellingMap spellings;
SpellingMap& spellings(end_vertices[end_pos]);
SpellingType end_vertex_type = kInvalidSpelling;
// when spelling algebra is enabled,
// a spelling evaluates to a set of syllables;
// otherwise, it resembles exactly the syllable itself.
SpellingAccessor accessor(prism.QuerySpelling(m.value));
while (!accessor.exhausted()) {
SyllableId syllable_id = accessor.syllable_id();
SpellingProperties props = accessor.properties();
EdgeProperties props(accessor.properties());
if (strict_spelling_ &&
matches_input &&
props.type != kNormalSpelling) {
Expand All @@ -74,20 +97,29 @@ int Syllabifier::BuildSyllableGraph(const string &input,
props.end_pos = end_pos;
// add a syllable with properties to the edge's
// spelling-to-syllable map
spellings.insert({syllable_id, props});
if (match_set.find(m.value) == match_set.end()) {
props.is_correction = true;
props.credibility = 0.01;
}
auto it = spellings.find(syllable_id);
if (it == spellings.end()) {
spellings.insert({syllable_id, props});
} else {
it->second.type = std::min(it->second.type, props.type);
}
// let end_vertex_type be the best (smaller) type of spelling
// that ends at the vertex
if (end_vertex_type > props.type) {
if (end_vertex_type > props.type && !props.is_correction) {
end_vertex_type = props.type;
}
}
accessor.Next();
}
if (spellings.empty()) {
DLOG(INFO) << "not spelt.";
end_vertices.erase(end_pos);
continue;
}
end_vertices[end_pos].swap(spellings);
// find the best common type in a path up to the end vertex
// eg. pinyin "shurfa" has vertex type kNormalSpelling at position 3,
// kAbbreviation at position 4 and kAbbreviation at position 6
Expand Down Expand Up @@ -121,6 +153,10 @@ int Syllabifier::BuildSyllableGraph(const string &input,
// when there is a path of more favored type
SpellingType edge_type = kInvalidSpelling;
for (auto k = j->second.begin(); k != j->second.end(); ) {
if (k->second.is_correction) {
++k;
continue; // Don't care correction edges
}
if (k->second.type > last_type) {
j->second.erase(k++);
}
Expand Down Expand Up @@ -245,4 +281,9 @@ void Syllabifier::Transpose(SyllableGraph* graph) {
}
}

// Installs `corrector` as the source of spelling corrections and switches the
// correction pass on.
//
// BuildSyllableGraph calls through corrector_ whenever enable_correction_ is
// set, so the flag is derived from the installed pointer: passing a null
// corrector leaves (or turns) correction off instead of arming a null
// dereference on the next graph build.
void Syllabifier::EnableCorrection(an<Corrector> corrector) {
  corrector_ = std::move(corrector);
  enable_correction_ = (corrector_ != nullptr);
}

} // namespace rime
14 changes: 12 additions & 2 deletions src/rime/algo/syllabifier.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,22 @@
namespace rime {

class Prism;
class Corrector;

using SyllableId = int32_t;

using SpellingMap = map<SyllableId, SpellingProperties>;
struct EdgeProperties : SpellingProperties {
EdgeProperties(SpellingProperties sup): SpellingProperties(sup) {};
EdgeProperties() = default;
bool is_correction = false;
};

using SpellingMap = map<SyllableId, EdgeProperties>;
using VertexMap = map<size_t, SpellingType>;
using EndVertexMap = map<size_t, SpellingMap>;
using EdgeMap = map<size_t, EndVertexMap>;

using SpellingPropertiesList = vector<const SpellingProperties*>;
using SpellingPropertiesList = vector<const EdgeProperties*>;
using SpellingIndex = map<SyllableId, SpellingPropertiesList>;
using SpellingIndices = map<size_t, SpellingIndex>;

Expand All @@ -49,6 +56,7 @@ class Syllabifier {
RIME_API int BuildSyllableGraph(const string &input,
Prism &prism,
SyllableGraph *graph);
RIME_API void EnableCorrection(an<Corrector> corrector);

protected:
void CheckOverlappedSpellings(SyllableGraph *graph,
Expand All @@ -58,6 +66,8 @@ class Syllabifier {
string delimiters_;
bool enable_completion_ = false;
bool strict_spelling_ = false;
an<Corrector> corrector_ = nullptr;
bool enable_correction_ = false;
};

} // namespace rime
Expand Down
2 changes: 2 additions & 0 deletions src/rime/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <unordered_set>
#include <utility>
#include <vector>
#include <boost/optional.hpp>
#define BOOST_BIND_NO_PLACEHOLDERS
#ifdef BOOST_SIGNALS2
#include <boost/signals2/connection.hpp>
Expand Down Expand Up @@ -47,6 +48,7 @@ using std::pair;
using std::set;
using std::string;
using std::vector;
using boost::optional;

template <class Key, class T>
using hash_map = std::unordered_map<Key, T>;
Expand Down
Loading

0 comments on commit ad3638a

Please sign in to comment.