Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 33 additions & 27 deletions dawg_python/dawgs.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,15 +38,15 @@ def _similar_keys(self, current_prefix, key, index, replace_chars):
b_step = key[word_pos].encode('utf8')

if b_step in replace_chars:
next_index = index
b_replace_char, u_replace_char = replace_chars[b_step]
for (b_replace_char, u_replace_char) in replace_chars[b_step]:
next_index = index

next_index = self.dct.follow_bytes(b_replace_char, next_index)
next_index = self.dct.follow_bytes(b_replace_char, next_index)

if next_index is not None:
prefix = current_prefix + key[start_pos:word_pos] + u_replace_char
extra_keys = self._similar_keys(prefix, key, next_index, replace_chars)
res += extra_keys
if next_index:
prefix = current_prefix + key[start_pos:word_pos] + u_replace_char
extra_keys = self._similar_keys(prefix, key, next_index, replace_chars)
res += extra_keys

index = self.dct.follow_bytes(b_step, index)
if index is None:
Expand All @@ -67,7 +67,7 @@ def similar_keys(self, key, replaces):

``replaces`` is an object obtained from
``DAWG.compile_replaces(mapping)`` where mapping is a dict
that maps single-char unicode sitrings to another single-char
that maps single-char unicode strings to (one or more) single-char
unicode strings.

This may be useful e.g. for handling single-character umlauts.
Expand All @@ -77,14 +77,18 @@ def similar_keys(self, key, replaces):
@classmethod
def compile_replaces(cls, replaces):
    """Compile a replacement mapping for the ``similar_*`` lookups.

    ``replaces`` maps a single-char unicode string either to one
    single-char unicode string or to a non-empty sequence of such
    strings (every alternative is tried during the lookup).

    Returns a dict whose keys are the UTF-8 encoded source characters
    and whose values are lists of ``(encoded_replacement, replacement)``
    pairs, i.e. the structure the ``_similar_*`` helpers iterate over.

    Raises ``ValueError`` if a key is not exactly one character long, or
    if a value is empty or contains an entry that is not exactly one
    character long.
    """
    compiled = {}
    for key, value in replaces.items():
        if len(key) != 1:
            raise ValueError("Keys must be single-char unicode strings.")

        # A bare string is shorthand for a one-element list.  Every other
        # iterable (list, tuple, ...) is materialised once so that all of
        # them go through the same validation below instead of only lists.
        candidates = [value] if isinstance(value, str) else list(value)

        if not candidates or any(len(entry) != 1 for entry in candidates):
            raise ValueError("Values must be single-char unicode strings or non-empty lists of such.")

        compiled[key.encode('utf8')] = [
            (entry.encode('utf8'), entry) for entry in candidates
        ]

    return compiled
Expand Down Expand Up @@ -326,14 +330,15 @@ def _similar_items(self, current_prefix, key, index, replace_chars):
b_step = key[word_pos].encode('utf8')

if b_step in replace_chars:
next_index = index
b_replace_char, u_replace_char = replace_chars[b_step]
for (b_replace_char, u_replace_char) in replace_chars[b_step]:
next_index = index

next_index = self.dct.follow_bytes(b_replace_char, next_index)
if next_index:
prefix = current_prefix + key[start_pos:word_pos] + u_replace_char
extra_items = self._similar_items(prefix, key, next_index, replace_chars)
res += extra_items
next_index = self.dct.follow_bytes(b_replace_char, next_index)

if next_index:
prefix = current_prefix + key[start_pos:word_pos] + u_replace_char
extra_items = self._similar_items(prefix, key, next_index, replace_chars)
res += extra_items

index = self.dct.follow_bytes(b_step, index)
if not index:
Expand All @@ -356,7 +361,7 @@ def similar_items(self, key, replaces):

``replaces`` is an object obtained from
``DAWG.compile_replaces(mapping)`` where mapping is a dict
that maps single-char unicode sitrings to another single-char
that maps single-char unicode strings to (one or more) single-char
unicode strings.
"""
return self._similar_items("", key, self.dct.ROOT, replaces)
Expand All @@ -370,13 +375,14 @@ def _similar_item_values(self, start_pos, key, index, replace_chars):
b_step = key[word_pos].encode('utf8')

if b_step in replace_chars:
next_index = index
b_replace_char, u_replace_char = replace_chars[b_step]
for (b_replace_char, u_replace_char) in replace_chars[b_step]:
next_index = index

next_index = self.dct.follow_bytes(b_replace_char, next_index)

next_index = self.dct.follow_bytes(b_replace_char, next_index)
if next_index:
extra_items = self._similar_item_values(word_pos + 1, key, next_index, replace_chars)
res += extra_items
if next_index:
extra_items = self._similar_item_values(word_pos + 1, key, next_index, replace_chars)
res += extra_items

index = self.dct.follow_bytes(b_step, index)
if not index:
Expand All @@ -398,7 +404,7 @@ def similar_item_values(self, key, replaces):

``replaces`` is an object obtained from
``DAWG.compile_replaces(mapping)`` where mapping is a dict
that maps single-char unicode sitrings to another single-char
that maps single-char unicode strings to (one or more) single-char
unicode strings.
"""
return self._similar_item_values(0, key, self.dct.ROOT, replaces)
Expand Down
14 changes: 14 additions & 0 deletions dev_data/small/prediction1917-record.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
,0,1,2,3
хлѣб,98,51,54,49
ёлка,54,99,99,98
ель,51,53,101,101
лѣс,101,56,102,48
лѣсное,57,99,53,56
всё,50,99,55,53
всѣ,49,99,54,48
бѣлёная,97,49,56,97
изобрѣтён,51,99,99,99
лев,98,50,52,56
лёв,50,101,51,99
лѣв,99,100,102,50
вѣнскій,100,57,101,57
Binary file added dev_data/small/prediction1917-record.dawg
Binary file not shown.
Binary file added dev_data/small/prediction1917.dawg
Binary file not shown.
1 change: 1 addition & 0 deletions dev_data/small/prediction1917.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
хлѣб ёлка ель лѣс лѣсное всё всѣ бѣлёная изобрѣтён лев лёв лѣв вѣнскій
77 changes: 77 additions & 0 deletions tests/test_prediction.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@

import dawg_python
from .utils import data_path
from hashlib import md5


def encode(w):
    """Return a 4-tuple of ints derived from the MD5 digest of ``w``.

    ``w`` is encoded as UTF-8 and hashed with MD5; the result holds the
    code points (``ord``) of the first four characters of the hex digest.
    The tests below use it as the expected record value for a word.
    """
    # Only the first four hex digits are needed, so slice before converting
    # instead of building the full 32-element sequence and discarding most.
    prefix = md5(w.encode('utf8')).hexdigest()[:4]
    return tuple(ord(c) for c in prefix)


class TestPrediction:
Expand Down Expand Up @@ -62,3 +68,74 @@ def test_record_dawg_items(self, word, prediction):
def test_record_dawg_items_values(self, word, prediction):
d = self.record_dawg()
assert d.similar_item_values(word, self.REPLACES) == prediction


class TestMultiValuedPrediction(object):
    """Exercise the ``similar_*`` lookups with multi-valued replacements.

    The replacement table maps one character to a list of alternatives and
    another character to a plain single string, so both accepted value
    forms of ``compile_replaces`` are covered.
    """

    # 'е' has two alternatives (list form); 'и' uses the single-string form.
    REPLACES = dawg_python.DAWG.compile_replaces({'е': ['ё', 'ѣ'], 'и': 'і'})

    # Same word list as dev_data/small/prediction1917.txt.
    DATA = "хлѣб ёлка ель лѣс лѣсное всё всѣ бѣлёная изобрѣтён лев лёв лѣв вѣнскій".split(" ")

    # (query word, expected result of ``similar_keys``) pairs.
    SUITE = [
        ('осел', []),
        ('ель', ['ель']),
        ('ёль', []),
        ('хлеб', ['хлѣб']),
        ('елка', ['ёлка']),
        ('лесное', ['лѣсное']),
        ('лесноё', []),
        ('лёсное', []),
        ('изобретен', ['изобрѣтён']),
        ('беленая', ['бѣлёная']),
        ('белёная', ['бѣлёная']),
        ('бѣленая', ['бѣлёная']),
        ('бѣлёная', ['бѣлёная']),
        ('белѣная', []),
        ('бѣлѣная', []),
        ('все', ['всё', 'всѣ']),
        ('лев', ['лев', 'лёв', 'лѣв']),
        ('венский', ['вѣнскій']),
    ]

    # Expected ``similar_items`` output: every matched key paired with a
    # one-element list holding its encoded record.
    SUITE_ITEMS = [
        (query, [(match, [encode(match)]) for match in matches])
        for query, matches in SUITE
    ]

    # Expected ``similar_item_values`` output: only the record lists,
    # in the same order as the matched keys.
    SUITE_VALUES = [
        (query, [[encode(match)] for match in matches])
        for query, matches in SUITE
    ]

    def record_dawg(self):
        """Load and return the RecordDAWG fixture (format ``"=HHHH"``)."""
        dawg = dawg_python.RecordDAWG(str("=HHHH"))
        return dawg.load(data_path("small", "prediction1917-record.dawg"))

    @pytest.mark.parametrize(("word", "prediction"), SUITE)
    def test_dawg_prediction(self, word, prediction):
        """``similar_keys`` on the plain DAWG fixture returns the expected keys."""
        fixture = data_path("small", "prediction1917.dawg")
        dawg = dawg_python.DAWG().load(fixture)
        found = dawg.similar_keys(word, self.REPLACES)
        assert found == prediction

    @pytest.mark.parametrize(("word", "prediction"), SUITE)
    def test_record_dawg_prediction(self, word, prediction):
        """``similar_keys`` on the record DAWG returns the expected keys."""
        found = self.record_dawg().similar_keys(word, self.REPLACES)
        assert found == prediction

    @pytest.mark.parametrize(("word", "prediction"), SUITE_ITEMS)
    def test_record_dawg_items(self, word, prediction):
        """``similar_items`` returns the expected (key, records) pairs."""
        found = self.record_dawg().similar_items(word, self.REPLACES)
        assert found == prediction

    @pytest.mark.parametrize(("word", "prediction"), SUITE_VALUES)
    def test_record_dawg_items_values(self, word, prediction):
        """``similar_item_values`` returns the expected record lists."""
        found = self.record_dawg().similar_item_values(word, self.REPLACES)
        assert found == prediction