diff --git a/dawg_python/dawgs.py b/dawg_python/dawgs.py index 9c1bdf4..b5665b0 100644 --- a/dawg_python/dawgs.py +++ b/dawg_python/dawgs.py @@ -38,15 +38,15 @@ def _similar_keys(self, current_prefix, key, index, replace_chars): b_step = key[word_pos].encode('utf8') if b_step in replace_chars: - next_index = index - b_replace_char, u_replace_char = replace_chars[b_step] + for (b_replace_char, u_replace_char) in replace_chars[b_step]: + next_index = index - next_index = self.dct.follow_bytes(b_replace_char, next_index) + next_index = self.dct.follow_bytes(b_replace_char, next_index) - if next_index is not None: - prefix = current_prefix + key[start_pos:word_pos] + u_replace_char - extra_keys = self._similar_keys(prefix, key, next_index, replace_chars) - res += extra_keys + if next_index: + prefix = current_prefix + key[start_pos:word_pos] + u_replace_char + extra_keys = self._similar_keys(prefix, key, next_index, replace_chars) + res += extra_keys index = self.dct.follow_bytes(b_step, index) if index is None: @@ -67,7 +67,7 @@ def similar_keys(self, key, replaces): ``replaces`` is an object obtained from ``DAWG.compile_replaces(mapping)`` where mapping is a dict - that maps single-char unicode sitrings to another single-char + that maps single-char unicode strings to (one or more) single-char unicode strings. This may be useful e.g. for handling single-character umlauts. @@ -77,14 +77,18 @@ def similar_keys(self, key, replaces): @classmethod def compile_replaces(cls, replaces): - for k, v in replaces.items(): - if len(k) != 1 or len(v) != 1: - raise ValueError("Keys and values must be single-char unicode strings.") + for k,v in replaces.items(): + if len(k) != 1: + raise ValueError("Keys must be single-char unicode strings.") + if (isinstance(v, str) and len(v) != 1): + raise ValueError("Values must be single-char unicode strings or non-empty lists of such.") + if isinstance(v, list) and (any(len(v_entry) != 1 for v_entry in v) or len(v) < 1): + raise ValueError("Values must be single-char unicode strings or non-empty lists of such.") return dict( ( k.encode('utf8'), - (v.encode('utf8'), v), + [(v_entry.encode('utf8'), v_entry) for v_entry in v] ) for k, v in replaces.items() ) @@ -326,14 +330,15 @@ def _similar_items(self, current_prefix, key, index, replace_chars): b_step = key[word_pos].encode('utf8') if b_step in replace_chars: - next_index = index - b_replace_char, u_replace_char = replace_chars[b_step] + for (b_replace_char, u_replace_char) in replace_chars[b_step]: + next_index = index - next_index = self.dct.follow_bytes(b_replace_char, next_index) - if next_index: - prefix = current_prefix + key[start_pos:word_pos] + u_replace_char - extra_items = self._similar_items(prefix, key, next_index, replace_chars) - res += extra_items + next_index = self.dct.follow_bytes(b_replace_char, next_index) + + if next_index: + prefix = current_prefix + key[start_pos:word_pos] + u_replace_char + extra_items = self._similar_items(prefix, key, next_index, replace_chars) + res += extra_items index = self.dct.follow_bytes(b_step, index) if not index: @@ -356,7 +361,7 @@ def similar_items(self, key, replaces): ``replaces`` is an object obtained from ``DAWG.compile_replaces(mapping)`` where mapping is a dict - that maps single-char unicode sitrings to another single-char + that maps single-char unicode strings to (one or more) single-char unicode strings. """ return self._similar_items("", key, self.dct.ROOT, replaces) @@ -370,13 +375,14 @@ def _similar_item_values(self, start_pos, key, index, replace_chars): b_step = key[word_pos].encode('utf8') if b_step in replace_chars: - next_index = index - b_replace_char, u_replace_char = replace_chars[b_step] + for (b_replace_char, u_replace_char) in replace_chars[b_step]: + next_index = index + + next_index = self.dct.follow_bytes(b_replace_char, next_index) - next_index = self.dct.follow_bytes(b_replace_char, next_index) - if next_index: - extra_items = self._similar_item_values(word_pos + 1, key, next_index, replace_chars) - res += extra_items + if next_index: + extra_items = self._similar_item_values(word_pos + 1, key, next_index, replace_chars) + res += extra_items index = self.dct.follow_bytes(b_step, index) if not index: @@ -398,7 +404,7 @@ def similar_item_values(self, key, replaces): ``replaces`` is an object obtained from ``DAWG.compile_replaces(mapping)`` where mapping is a dict - that maps single-char unicode sitrings to another single-char + that maps single-char unicode strings to (one or more) single-char unicode strings. """ return self._similar_item_values(0, key, self.dct.ROOT, replaces) diff --git a/dev_data/small/prediction1917-record.csv b/dev_data/small/prediction1917-record.csv new file mode 100644 index 0000000..d35e369 --- /dev/null +++ b/dev_data/small/prediction1917-record.csv @@ -0,0 +1,14 @@ +,0,1,2,3 +хлѣб,98,51,54,49 +ёлка,54,99,99,98 +ель,51,53,101,101 +лѣс,101,56,102,48 +лѣсное,57,99,53,56 +всё,50,99,55,53 +всѣ,49,99,54,48 +бѣлёная,97,49,56,97 +изобрѣтён,51,99,99,99 +лев,98,50,52,56 +лёв,50,101,51,99 +лѣв,99,100,102,50 +вѣнскій,100,57,101,57 diff --git a/dev_data/small/prediction1917-record.dawg b/dev_data/small/prediction1917-record.dawg new file mode 100644 index 0000000..6f7510f Binary files /dev/null and b/dev_data/small/prediction1917-record.dawg differ diff --git a/dev_data/small/prediction1917.dawg b/dev_data/small/prediction1917.dawg new file mode 100644 index 0000000..bbee0ce Binary files /dev/null and b/dev_data/small/prediction1917.dawg differ diff --git a/dev_data/small/prediction1917.txt b/dev_data/small/prediction1917.txt new file mode 100644 index 0000000..17409bc --- /dev/null +++ b/dev_data/small/prediction1917.txt @@ -0,0 +1 @@ +хлѣб ёлка ель лѣс лѣсное всё всѣ бѣлёная изобрѣтён лев лёв лѣв вѣнскій \ No newline at end of file diff --git a/tests/test_prediction.py b/tests/test_prediction.py index b948580..506ae4a 100644 --- a/tests/test_prediction.py +++ b/tests/test_prediction.py @@ -2,6 +2,12 @@ import dawg_python from .utils import data_path +from hashlib import md5 + + +def encode(w): + code = md5(w.encode('utf8')) + return tuple([ord(c) for c in code.hexdigest()])[:4] class TestPrediction: @@ -62,3 +68,74 @@ def test_record_dawg_items(self, word, prediction): def test_record_dawg_items_values(self, word, prediction): d = self.record_dawg() assert d.similar_item_values(word, self.REPLACES) == prediction + + +class TestMultiValuedPrediction(object): + + REPLACES = dawg_python.DAWG.compile_replaces({'е': ['ё', 'ѣ'], 'и': 'і'}) + + DATA = "хлѣб ёлка ель лѣс лѣсное всё всѣ бѣлёная изобрѣтён лев лёв лѣв вѣнскій".split(" ") + + SUITE = [ + ('осел', []), + ('ель', ['ель']), + ('ёль', []), + ('хлеб', ['хлѣб']), + ('елка', ['ёлка']), + ('лесное', ['лѣсное']), + ('лесноё', []), + ('лёсное', []), + ('изобретен', ['изобрѣтён']), + ('беленая', ['бѣлёная']), + ('белёная', ['бѣлёная']), + ('бѣленая', ['бѣлёная']), + ('бѣлёная', ['бѣлёная']), + ('белѣная', []), + ('бѣлѣная', []), + ('все', ['всё', 'всѣ']), + ('лев', ['лев', 'лёв', 'лѣв']), + ('венский', ['вѣнскій']), + ] + + SUITE_ITEMS = [ + ( + it[0], # key + [ + (w, [encode(w)]) # item, value pair + for w in it[1] + ] + ) + for it in SUITE + ] + + SUITE_VALUES = [ + ( + it[0], # key + [[encode(w)] for w in it[1]] + ) + for it in SUITE + ] + + def record_dawg(self): + path = data_path("small", "prediction1917-record.dawg") + return dawg_python.RecordDAWG(str("=HHHH")).load(path) + + @pytest.mark.parametrize(("word", "prediction"), SUITE) + def test_dawg_prediction(self, word, prediction): + d = dawg_python.DAWG().load(data_path("small", "prediction1917.dawg")) + assert d.similar_keys(word, self.REPLACES) == prediction + + @pytest.mark.parametrize(("word", "prediction"), SUITE) + def test_record_dawg_prediction(self, word, prediction): + d = self.record_dawg() + assert d.similar_keys(word, self.REPLACES) == prediction + + @pytest.mark.parametrize(("word", "prediction"), SUITE_ITEMS) + def test_record_dawg_items(self, word, prediction): + d = self.record_dawg() + assert d.similar_items(word, self.REPLACES) == prediction + + @pytest.mark.parametrize(("word", "prediction"), SUITE_VALUES) + def test_record_dawg_items_values(self, word, prediction): + d = self.record_dawg() + assert d.similar_item_values(word, self.REPLACES) == prediction