From 3f072c35511c55b2ee2c2b0a2cedba9ece874f45 Mon Sep 17 00:00:00 2001
From: Viktor Bulatov <bt.uytya@gmail.com>
Date: Mon, 11 Sep 2023 15:42:53 +0300
Subject: [PATCH 1/7] try to port to a fork in a less fancy way

---
 dawg_python/dawgs.py     | 100 ++++++++++++++++++++++-----------------
 tests/test_prediction.py |  69 +++++++++++++++++++++++++++
 2 files changed, 125 insertions(+), 44 deletions(-)

diff --git a/dawg_python/dawgs.py b/dawg_python/dawgs.py
index 9c1bdf4..80a76c9 100644
--- a/dawg_python/dawgs.py
+++ b/dawg_python/dawgs.py
@@ -1,14 +1,16 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, unicode_literals
+
 import struct
 from binascii import a2b_base64
 
 from . import wrapper
+from .compat import int_from_byte
 
-
-class DAWG:
+class DAWG(object):
     """
     Base DAWG wrapper.
     """
-
     def __init__(self):
         self.dct = None
 
@@ -38,15 +40,15 @@ def _similar_keys(self, current_prefix, key, index, replace_chars):
             b_step = key[word_pos].encode('utf8')
 
             if b_step in replace_chars:
-                next_index = index
-                b_replace_char, u_replace_char = replace_chars[b_step]
+                for (b_replace_char, u_replace_char) in replace_chars[b_step]:
+                    next_index = index
 
-                next_index = self.dct.follow_bytes(b_replace_char, next_index)
+                    next_index = self.dct.follow_bytes(b_replace_char, next_index)
 
-                if next_index is not None:
-                    prefix = current_prefix + key[start_pos:word_pos] + u_replace_char
-                    extra_keys = self._similar_keys(prefix, key, next_index, replace_chars)
-                    res += extra_keys
+                    if next_index:
+                        prefix = current_prefix + key[start_pos:word_pos] + u_replace_char
+                        extra_keys = self._similar_keys(prefix, key, next_index, replace_chars)
+                        res += extra_keys
 
             index = self.dct.follow_bytes(b_step, index)
             if index is None:
@@ -67,7 +69,7 @@ def similar_keys(self, key, replaces):
 
         ``replaces`` is an object obtained from
         ``DAWG.compile_replaces(mapping)`` where mapping is a dict
-        that maps single-char unicode sitrings to another single-char
+        that maps single-char unicode strings to (one or more) single-char
         unicode strings.
 
         This may be useful e.g. for handling single-character umlauts.
@@ -77,22 +79,26 @@ def similar_keys(self, key, replaces):
     @classmethod
     def compile_replaces(cls, replaces):
 
-        for k, v in replaces.items():
-            if len(k) != 1 or len(v) != 1:
-                raise ValueError("Keys and values must be single-char unicode strings.")
+        for k, v in replaces.items():
+            if len(k) != 1:
+                raise ValueError("Keys must be single-char unicode strings.")
+            # A bare string is one replacement; any other value is a sequence of them (list, tuple, ...).
+            entries = [v] if isinstance(v, str) else v
+            if len(entries) < 1 or any(len(v_entry) != 1 for v_entry in entries):
+                raise ValueError("Values must be single-char unicode strings or non-empty lists of such.")
 
         return dict(
             (
                 k.encode('utf8'),
-                (v.encode('utf8'), v),
+                [(v_entry.encode('utf8'), v_entry) for v_entry in v]
             )
             for k, v in replaces.items()
         )
 
     def prefixes(self, key):
-        """
+        '''
         Returns a list with keys of this DAWG that are prefixes of the ``key``.
-        """
+        '''
         res = []
         index = self.dct.ROOT
         if not isinstance(key, bytes):
@@ -101,7 +107,7 @@ def prefixes(self, key):
         pos = 1
 
         for ch in key:
-            index = self.dct.follow_char(ch, index)
+            index = self.dct.follow_char(int_from_byte(ch), index)
             if not index:
                 break
 
@@ -112,13 +118,14 @@ def prefixes(self, key):
         return res
 
 
+
 class CompletionDAWG(DAWG):
     """
     DAWG with key completion support.
     """
 
     def __init__(self):
-        super().__init__()
+        super(CompletionDAWG, self).__init__()
         self.guide = None
 
     def keys(self, prefix=""):
@@ -150,6 +157,7 @@ def iterkeys(self, prefix=""):
         while completer.next():
             yield completer.key.decode('utf8')
 
+
     def load(self, path):
         """
         Loads DAWG from a file.
@@ -167,7 +175,6 @@ def load(self, path):
 PAYLOAD_SEPARATOR = b'\x01'
 MAX_VALUE_SIZE = 32768
 
-
 class BytesDAWG(CompletionDAWG):
     """
     DAWG that is able to transparently store extra binary payload in keys;
@@ -178,7 +185,6 @@ class BytesDAWG(CompletionDAWG):
     """
 
     def __init__(self, payload_separator=PAYLOAD_SEPARATOR):
-        super().__init__()
         self._payload_separator = payload_separator
 
     def __contains__(self, key):
@@ -186,8 +192,8 @@ def __contains__(self, key):
             key = key.encode('utf8')
         return bool(self._follow_key(key))
 
-    # def b_has_key(self, key):
-    #     return bool(self._follow_key(key))
+#    def b_has_key(self, key):
+#        return bool(self._follow_key(key))
 
     def __getitem__(self, key):
         res = self.get(key)
@@ -223,7 +229,9 @@ def _value_for_index(self, index):
 
         completer.start(index)
         while completer.next():
-            b64_data = completer.key
+            # a2b_base64 doesn't support bytearray in python 2.6
+            # so it is converted (and copied) to bytes
+            b64_data = bytes(completer.key)
             res.append(a2b_base64(b64_data))
 
         return res
@@ -290,7 +298,9 @@ def items(self, prefix=""):
 
         while completer.next():
             key, value = completer.key.split(self._payload_separator)
-            res.append((key.decode('utf8'), a2b_base64(value)))
+            res.append(
+                (key.decode('utf8'), a2b_base64(bytes(value))) # bytes() cast is a python 2.6 fix
+            )
 
         return res
 
@@ -309,9 +319,10 @@ def iteritems(self, prefix=""):
 
         while completer.next():
             key, value = completer.key.split(self._payload_separator)
-            item = (key.decode('utf8'), a2b_base64(value))
+            item = (key.decode('utf8'), a2b_base64(bytes(value))) # bytes() cast is a python 2.6 fix
             yield item
 
+
     def _has_value(self, index):
         return self.dct.follow_bytes(PAYLOAD_SEPARATOR, index)
 
@@ -326,14 +337,15 @@ def _similar_items(self, current_prefix, key, index, replace_chars):
             b_step = key[word_pos].encode('utf8')
 
             if b_step in replace_chars:
-                next_index = index
-                b_replace_char, u_replace_char = replace_chars[b_step]
+                for (b_replace_char, u_replace_char) in replace_chars[b_step]:
+                    next_index = index
 
-                next_index = self.dct.follow_bytes(b_replace_char, next_index)
-                if next_index:
-                    prefix = current_prefix + key[start_pos:word_pos] + u_replace_char
-                    extra_items = self._similar_items(prefix, key, next_index, replace_chars)
-                    res += extra_items
+                    next_index = self.dct.follow_bytes(b_replace_char, next_index)
+
+                    if next_index:
+                        prefix = current_prefix + key[start_pos:word_pos] + u_replace_char
+                        extra_items = self._similar_items(prefix, key, next_index, replace_chars)
+                        res += extra_items
 
             index = self.dct.follow_bytes(b_step, index)
             if not index:
@@ -356,11 +368,12 @@ def similar_items(self, key, replaces):
 
         ``replaces`` is an object obtained from
         ``DAWG.compile_replaces(mapping)`` where mapping is a dict
-        that maps single-char unicode sitrings to another single-char
+        that maps single-char unicode strings to (one or more) single-char
         unicode strings.
         """
         return self._similar_items("", key, self.dct.ROOT, replaces)
 
+
     def _similar_item_values(self, start_pos, key, index, replace_chars):
         res = []
         end_pos = len(key)
@@ -375,7 +388,7 @@ def _similar_item_values(self, start_pos, key, index, replace_chars):
 
                 next_index = self.dct.follow_bytes(b_replace_char, next_index)
                 if next_index:
-                    extra_items = self._similar_item_values(word_pos + 1, key, next_index, replace_chars)
+                    extra_items = self._similar_item_values(word_pos+1, key, next_index, replace_chars)
                     res += extra_items
 
             index = self.dct.follow_bytes(b_step, index)
@@ -398,7 +411,7 @@ def similar_item_values(self, key, replaces):
 
         ``replaces`` is an object obtained from
         ``DAWG.compile_replaces(mapping)`` where mapping is a dict
-        that maps single-char unicode sitrings to another single-char
+        that maps single-char unicode strings to (one or more) single-char
         unicode strings.
         """
         return self._similar_item_values(0, key, self.dct.ROOT, replaces)
@@ -406,32 +419,30 @@ def similar_item_values(self, key, replaces):
 
 class RecordDAWG(BytesDAWG):
     def __init__(self, fmt, payload_separator=PAYLOAD_SEPARATOR):
-        super().__init__(payload_separator)
+        super(RecordDAWG, self).__init__(payload_separator)
         self._struct = struct.Struct(str(fmt))
         self.fmt = fmt
 
     def _value_for_index(self, index):
-        value = super()._value_for_index(index)
+        value = super(RecordDAWG, self)._value_for_index(index)
         return [self._struct.unpack(val) for val in value]
 
     def items(self, prefix=""):
-        res = super().items(prefix)
+        res = super(RecordDAWG, self).items(prefix)
         return [(key, self._struct.unpack(val)) for (key, val) in res]
 
     def iteritems(self, prefix=""):
-        res = super().iteritems(prefix)
+        res = super(RecordDAWG, self).iteritems(prefix)
         return ((key, self._struct.unpack(val)) for (key, val) in res)
 
 
 LOOKUP_ERROR = -1
 
-
 class IntDAWG(DAWG):
     """
     Dict-like class based on DAWG.
     It can store integer values for unicode keys.
     """
-
     def __getitem__(self, key):
         res = self.get(key, LOOKUP_ERROR)
         if res == LOOKUP_ERROR:
@@ -458,7 +469,6 @@ class IntCompletionDAWG(CompletionDAWG, IntDAWG):
     Dict-like class based on DAWG.
     It can store integer values for unicode keys and support key completion.
     """
-
     def items(self, prefix=""):
         if not isinstance(prefix, bytes):
             prefix = prefix.encode('utf8')
@@ -474,7 +484,9 @@ def items(self, prefix=""):
         completer.start(index, prefix)
 
         while completer.next():
-            res.append((completer.key.decode('utf8'), completer.value()))
+            res.append(
+                (completer.key.decode('utf8'), completer.value())
+            )
 
         return res
 
diff --git a/tests/test_prediction.py b/tests/test_prediction.py
index b948580..69fc6cf 100644
--- a/tests/test_prediction.py
+++ b/tests/test_prediction.py
@@ -62,3 +62,72 @@ def test_record_dawg_items(self, word, prediction):
     def test_record_dawg_items_values(self, word, prediction):
         d = self.record_dawg()
         assert d.similar_item_values(word, self.REPLACES) == prediction
+
+class TestMultiValuedPrediction(object):
+
+    REPLACES = dawg_python.DAWG.compile_replaces({'е': ['ё', 'ѣ'], 'и': 'і'})
+
+    DATA = "хлѣб ёлка ель лѣс лѣсное всё всѣ бѣлёная изобрѣтён лев лёв лѣв вѣнскій".split(" ")
+    SUITE = [
+        ('осел', []),
+        ('ель', ['ель']),
+        ('ёль', []),
+        ('хлеб', ['хлѣб']),
+        ('елка', ['ёлка']),
+        ('лесное', ['лѣсное']),
+        ('лесноё', []),
+        ('лёсное', []),
+        ('изобретен', ['изобрѣтён']),
+        ('беленая', ['бѣлёная']),
+        ('белёная', ['бѣлёная']),
+        ('бѣленая', ['бѣлёная']),
+        ('бѣлёная', ['бѣлёная']),
+        ('белѣная', []),
+        ('бѣлѣная', []),
+        ('все', ['всё', 'всѣ']),
+        ('лев', ['лев', 'лёв', 'лѣв']),
+        ('венский', ['вѣнскій']),
+    ]
+
+    SUITE_ITEMS = [
+        (
+            it[0], # key
+            [
+                (w, [(len(w),)]) # item, value pair
+                for w in it[1]
+            ]
+        )
+        for it in SUITE
+    ]
+
+    SUITE_VALUES = [
+        (
+            it[0], # key
+            [[(len(w),)] for w in it[1]]
+        )
+        for it in SUITE
+    ]
+
+    def record_dawg(self):
+        path = data_path("small", "prediction-record.dawg")
+        return dawg_python.RecordDAWG(str("=H")).load(path)
+
+    @pytest.mark.parametrize(("word", "prediction"), SUITE)
+    def test_dawg_prediction(self, word, prediction):
+        d = dawg_python.DAWG().load(data_path("small", "prediction.dawg"))
+        assert d.similar_keys(word, self.REPLACES) == prediction
+
+    @pytest.mark.parametrize(("word", "prediction"), SUITE)
+    def test_record_dawg_prediction(self, word, prediction):
+        d = self.record_dawg()
+        assert d.similar_keys(word, self.REPLACES) == prediction
+
+    @pytest.mark.parametrize(("word", "prediction"), SUITE_ITEMS)
+    def test_record_dawg_items(self, word, prediction):
+        d = self.record_dawg()
+        assert d.similar_items(word, self.REPLACES) == prediction
+
+    @pytest.mark.parametrize(("word", "prediction"), SUITE_VALUES)
+    def test_record_dawg_items_values(self, word, prediction):
+        d = self.record_dawg()
+        assert d.similar_item_values(word, self.REPLACES) == prediction

From 2fead760e2651b123b6be7b7e356237efbd91122 Mon Sep 17 00:00:00 2001
From: Viktor Bulatov <bt.uytya@gmail.com>
Date: Mon, 11 Sep 2023 15:53:17 +0300
Subject: [PATCH 2/7] fix merge artifacts

---
 dawg_python/dawgs.py | 46 ++++++++++++++++++++------------------------
 1 file changed, 21 insertions(+), 25 deletions(-)

diff --git a/dawg_python/dawgs.py b/dawg_python/dawgs.py
index 80a76c9..d6e6dc1 100644
--- a/dawg_python/dawgs.py
+++ b/dawg_python/dawgs.py
@@ -1,11 +1,8 @@
-# -*- coding: utf-8 -*-
-from __future__ import absolute_import, unicode_literals
-
 import struct
 from binascii import a2b_base64
 
 from . import wrapper
-from .compat import int_from_byte
+
 
 class DAWG(object):
     """
@@ -96,9 +93,9 @@ def compile_replaces(cls, replaces):
         )
 
     def prefixes(self, key):
-        '''
+        """
         Returns a list with keys of this DAWG that are prefixes of the ``key``.
-        '''
+        """
         res = []
         index = self.dct.ROOT
         if not isinstance(key, bytes):
@@ -107,7 +104,7 @@ def prefixes(self, key):
         pos = 1
 
         for ch in key:
-            index = self.dct.follow_char(int_from_byte(ch), index)
+            index = self.dct.follow_char(ch, index)
             if not index:
                 break
 
@@ -125,7 +122,7 @@ class CompletionDAWG(DAWG):
     """
 
     def __init__(self):
-        super(CompletionDAWG, self).__init__()
+        super().__init__()
         self.guide = None
 
     def keys(self, prefix=""):
@@ -175,6 +172,7 @@ def load(self, path):
 PAYLOAD_SEPARATOR = b'\x01'
 MAX_VALUE_SIZE = 32768
 
+
 class BytesDAWG(CompletionDAWG):
     """
     DAWG that is able to transparently store extra binary payload in keys;
@@ -185,6 +183,7 @@ class BytesDAWG(CompletionDAWG):
     """
 
     def __init__(self, payload_separator=PAYLOAD_SEPARATOR):
+        super().__init__()
         self._payload_separator = payload_separator
 
     def __contains__(self, key):
@@ -192,8 +191,8 @@ def __contains__(self, key):
             key = key.encode('utf8')
         return bool(self._follow_key(key))
 
-#    def b_has_key(self, key):
-#        return bool(self._follow_key(key))
+    # def b_has_key(self, key):
+    #    return bool(self._follow_key(key))
 
     def __getitem__(self, key):
         res = self.get(key)
@@ -229,9 +228,7 @@ def _value_for_index(self, index):
 
         completer.start(index)
         while completer.next():
-            # a2b_base64 doesn't support bytearray in python 2.6
-            # so it is converted (and copied) to bytes
-            b64_data = bytes(completer.key)
+            b64_data = completer.key
             res.append(a2b_base64(b64_data))
 
         return res
@@ -298,9 +295,7 @@ def items(self, prefix=""):
 
         while completer.next():
             key, value = completer.key.split(self._payload_separator)
-            res.append(
-                (key.decode('utf8'), a2b_base64(bytes(value))) # bytes() cast is a python 2.6 fix
-            )
+            res.append((key.decode('utf8'), a2b_base64(value)))
 
         return res
 
@@ -319,7 +314,7 @@ def iteritems(self, prefix=""):
 
         while completer.next():
             key, value = completer.key.split(self._payload_separator)
-            item = (key.decode('utf8'), a2b_base64(bytes(value))) # bytes() cast is a python 2.6 fix
+            item = (key.decode('utf8'), a2b_base64(value))
             yield item
 
 
@@ -388,7 +383,7 @@ def _similar_item_values(self, start_pos, key, index, replace_chars):
 
                 next_index = self.dct.follow_bytes(b_replace_char, next_index)
                 if next_index:
-                    extra_items = self._similar_item_values(word_pos+1, key, next_index, replace_chars)
+                    extra_items = self._similar_item_values(word_pos + 1, key, next_index, replace_chars)
                     res += extra_items
 
             index = self.dct.follow_bytes(b_step, index)
@@ -419,30 +414,32 @@ def similar_item_values(self, key, replaces):
 
 class RecordDAWG(BytesDAWG):
     def __init__(self, fmt, payload_separator=PAYLOAD_SEPARATOR):
-        super(RecordDAWG, self).__init__(payload_separator)
+        super().__init__(payload_separator)
         self._struct = struct.Struct(str(fmt))
         self.fmt = fmt
 
     def _value_for_index(self, index):
-        value = super(RecordDAWG, self)._value_for_index(index)
+        value = super()._value_for_index(index)
         return [self._struct.unpack(val) for val in value]
 
     def items(self, prefix=""):
-        res = super(RecordDAWG, self).items(prefix)
+        res = super().items(prefix)
         return [(key, self._struct.unpack(val)) for (key, val) in res]
 
     def iteritems(self, prefix=""):
-        res = super(RecordDAWG, self).iteritems(prefix)
+        res = super().iteritems(prefix)
         return ((key, self._struct.unpack(val)) for (key, val) in res)
 
 
 LOOKUP_ERROR = -1
 
+
 class IntDAWG(DAWG):
     """
     Dict-like class based on DAWG.
     It can store integer values for unicode keys.
     """
+
     def __getitem__(self, key):
         res = self.get(key, LOOKUP_ERROR)
         if res == LOOKUP_ERROR:
@@ -469,6 +466,7 @@ class IntCompletionDAWG(CompletionDAWG, IntDAWG):
     Dict-like class based on DAWG.
     It can store integer values for unicode keys and support key completion.
     """
+
     def items(self, prefix=""):
         if not isinstance(prefix, bytes):
             prefix = prefix.encode('utf8')
@@ -484,9 +482,7 @@ def items(self, prefix=""):
         completer.start(index, prefix)
 
         while completer.next():
-            res.append(
-                (completer.key.decode('utf8'), completer.value())
-            )
+            res.append((completer.key.decode('utf8'), completer.value()))
 
         return res
 

From bb6351322075ae940905b0cad90de7dc04301025 Mon Sep 17 00:00:00 2001
From: Viktor Bulatov <bt.uytya@gmail.com>
Date: Mon, 11 Sep 2023 15:56:39 +0300
Subject: [PATCH 3/7] fix some more merge artifacts

---
 dawg_python/dawgs.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/dawg_python/dawgs.py b/dawg_python/dawgs.py
index d6e6dc1..0a0f7f0 100644
--- a/dawg_python/dawgs.py
+++ b/dawg_python/dawgs.py
@@ -4,10 +4,11 @@
 from . import wrapper
 
 
-class DAWG(object):
+class DAWG:
     """
     Base DAWG wrapper.
     """
+
     def __init__(self):
         self.dct = None
 
@@ -115,7 +116,6 @@ def prefixes(self, key):
         return res
 
 
-
 class CompletionDAWG(DAWG):
     """
     DAWG with key completion support.
@@ -154,7 +154,6 @@ def iterkeys(self, prefix=""):
         while completer.next():
             yield completer.key.decode('utf8')
 
-
     def load(self, path):
         """
         Loads DAWG from a file.
@@ -192,7 +191,7 @@ def __contains__(self, key):
         return bool(self._follow_key(key))
 
     # def b_has_key(self, key):
-    #    return bool(self._follow_key(key))
+    #     return bool(self._follow_key(key))
 
     def __getitem__(self, key):
         res = self.get(key)
@@ -317,7 +316,6 @@ def iteritems(self, prefix=""):
             item = (key.decode('utf8'), a2b_base64(value))
             yield item
 
-
     def _has_value(self, index):
         return self.dct.follow_bytes(PAYLOAD_SEPARATOR, index)
 
@@ -368,7 +366,6 @@ def similar_items(self, key, replaces):
         """
         return self._similar_items("", key, self.dct.ROOT, replaces)
 
-
     def _similar_item_values(self, start_pos, key, index, replace_chars):
         res = []
         end_pos = len(key)

From 10077c64a100fc3221fd4376ad6ee693f25fc31c Mon Sep 17 00:00:00 2001
From: Viktor Bulatov <bt.uytya@gmail.com>
Date: Mon, 11 Sep 2023 17:35:59 +0300
Subject: [PATCH 4/7] fix similar_item_values for multi-valued replaces

---
 dawg_python/dawgs.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/dawg_python/dawgs.py b/dawg_python/dawgs.py
index 0a0f7f0..b5665b0 100644
--- a/dawg_python/dawgs.py
+++ b/dawg_python/dawgs.py
@@ -375,13 +375,14 @@ def _similar_item_values(self, start_pos, key, index, replace_chars):
             b_step = key[word_pos].encode('utf8')
 
             if b_step in replace_chars:
-                next_index = index
-                b_replace_char, u_replace_char = replace_chars[b_step]
+                for (b_replace_char, u_replace_char) in replace_chars[b_step]:
+                    next_index = index
+
+                    next_index = self.dct.follow_bytes(b_replace_char, next_index)
 
-                next_index = self.dct.follow_bytes(b_replace_char, next_index)
-                if next_index:
-                    extra_items = self._similar_item_values(word_pos + 1, key, next_index, replace_chars)
-                    res += extra_items
+                    if next_index:
+                        extra_items = self._similar_item_values(word_pos + 1, key, next_index, replace_chars)
+                        res += extra_items
 
             index = self.dct.follow_bytes(b_step, index)
             if not index:

From a6537e182fea69a900e22528a83e3ecbda893f31 Mon Sep 17 00:00:00 2001
From: Viktor Bulatov <bt.uytya@gmail.com>
Date: Thu, 14 Sep 2023 23:00:09 +0300
Subject: [PATCH 5/7] add test data

---
 dev_data/small/prediction1917-record.dawg | Bin 0 -> 3080 bytes
 dev_data/small/prediction1917.dawg        | Bin 0 -> 1028 bytes
 tests/test_prediction.py                  |  19 ++++++++++++++-----
 3 files changed, 14 insertions(+), 5 deletions(-)
 create mode 100644 dev_data/small/prediction1917-record.dawg
 create mode 100644 dev_data/small/prediction1917.dawg

diff --git a/dev_data/small/prediction1917-record.dawg b/dev_data/small/prediction1917-record.dawg
new file mode 100644
index 0000000000000000000000000000000000000000..6f7510ff45a20539ab014cfb3057f41774dacaaf
GIT binary patch
literal 3080
zcmeIzdwkV%9LMqZ_xD>QVF*LWJ%nO06#b6FhE8iV+6<#u){yHe!<Hh9IwT>)aeEN9
z*|gf$g{XA<{TM=6_f_kDzk7Yp>VJRmpR>pF_xtVly?wr)@ArF-97$=ess=jp7dQ%P
zrJ&eRc{f)T^{!$?uKY$vHPi{ZIg0jnRh>lL?T+eFT-A)?dAg%ATb|-*M3Q83jk3~=
zGGyc}G#Y5{ZH>ZXjPjCrK2W~fsH-n|Eu~h6Fm9<!Fe(mwS0=?MV(TXxg~l7@C(?en
zQ6Kv8TB5s==|~;)pr6n<#!H}{?Y|9m`w@d3w>A1wk7s{9F%y3vM<v~vSCgYjWjxn2
zuTe%p*wMr`iqCCy*j08H#weh@?Pr11+1`=b(<sx%5T?&Dj?#;Wow?f_mD>5!&RZ`<
z78n&WekbatmQi=66OYZK&+IhDNpv)UaWXekMhvlbj*5G^nv8Or&o-<F@x@q+Ac^N3
z;xCY*lnr4XeuJYBZ6o!~xhO0$iVa{aJ3e`^bN1562{|gvWjw|Y8p+E{qbfW9IgTo}
zlWW@K5$B+`)NepL>XhzZKi-Gw*W?hR&s=A+FA}9ttx<vPf1pvcy{kO#g&i1^JXW-p
z!qtup`RPv%0``Z$o7i()g~l2sSJKydM_ElqlgGL85*d$pW8+<=7Lc26jGaQ?ytDhP
zL<t&}Og;8VY9f1e0J$AW%w1efVZ(UDZ-_b6m*<YOAuqAv#5z{0VgH&^)~D7XPp-n1
zQaNiJB_6{X)euL<R4IRslr@xfr7X|RYbx5)7jp~e5?@RH-z=5+>^J5ZTFCebMuQnQ
zzm3r};_&U7u#PtF)CSsCI+|hQV@|WUXLXlmbDz}D)m-lLFdvbnMjN<q#3nF|wqPr^
zVLNtUCw5^s_Fymep#h7q7)!7e%di|P(1?{-h1F=n8mz@S*pGqzDzGOo0WEM44hCr#
z$5X9vC|ct%9F8NvcF>VH3P*z!>KGi0<G_*7@i+k|qAgCs$v6e=a4Js2={N&t;w+qv
z_BaRU;yiS~`M3ZVq9Z!tB3z7QbjBsP6e+k2m*Wa_L04RftI!P|Lbw`XT!U+oiZon@
z>v02EM$VSh1L^3A4BUvD&<mNk8MmM}vXG5ik%QZCJMKU(?!;Z_gTA;M_aG1ba4+)F
z9|gD%_u~N!z=L=Q591L$ipNk0zo>-<aUYDwQG_S(B%VSsN-zXZV<?8<89a;Ocn;6w
z1(f1Nyo3=Li6~yiD;R}WF&eL73|_|@coSps7T(4?7>9T99%6VOAK*iLgpXlzTIdt*
zpJF^d!{_({Whh4lzC<Oe@D(PY8WS-IHK;`$CSwYwVj8~2H<*rZ@f~JhCT8J#%*Gtd
z#XQW1j{ra5N7Umd{ET0)0Keil{EmhA1ApQ#G++@HV+odG8J1%O8nF_quo_KRgSA+P
z_1J)o*o4j4g00ww?bv~x*oEELgT2@Xd+>ev)#br3@*h+LRX%wNSX18C)Y&UJ4p?ix
z2ifT!zvv#PB#&$J|E=QZ$?l;jCnJxOluucoc&#41$>1-ax3*Ol*9bh;AYkA5tV^IE
zW@4l1e<J^ToD`1}>~Z=sxuz3M2D>hxJi<vY<ef*!1j@51nH+CZ&NYZDwzFV2N2zb$
z)iHJ<Ekp74skF5*c&ZAby!Y8Rc3$?)p#8H-r`HJY?XP_S*RcFpi1zlEArh}!qKd%h
zG<u9~`>?(Gl-TS<*nNx~P3C`oC3s`MN79KR%uIah1eC<MXWDvvkNEN2*qR=vG|X;p
w&XR52oHyG?4Vg-%ce~%SkO#Z>?8lb>|M}+`u;>53{bV0ln*Vl~f2sGs0a8fH#Q*>R

literal 0
HcmV?d00001

diff --git a/dev_data/small/prediction1917.dawg b/dev_data/small/prediction1917.dawg
new file mode 100644
index 0000000000000000000000000000000000000000..bbee0ce6de89e86238a6aadff4dce61af8fce339
GIT binary patch
literal 1028
zcmXBROH_<u9LMqBJntswSg^Bkj)kR-U5geiEG%+tT7+D3zl1m;gqf6UbcT^T!?@%=
zlSVbu3=xVjuWJ!P2q6s0{rhI|InVR_pa11~&nuUtk(R=ZmJ%&S_5e$^+bEQ7sWQb<
zppbZ{QIs4z-N>6|siuoqk5MeuQq;%3!6@k5ry7+Frk~`tQ;a<HunVXeuK)SYc@Xgt
z)U-3pZxkU`?wpxX+e%(Hvz)v%qtYhM9Y%?yQ8hV%UMZeUZk3TcgPi{K;4YNLoim?v
z5%+;4`wZ%unN3|RB!x4nYazdam3ZZ0dh>3ZTyN0yy}2aUlBb^(qgOniJbH&mx%%Gb
zydyA#?;U0-l;cu#sHM?VMTi9lx@7Y`i8}aDY?R5J*O}R7G=_Z^#$hbRBO4Pj0Xdk2
zDVU6Dn2H&gj#-$AIhc*P$i;lj!vf?XkY}`zbrJHh1dFj01z3({Sb;*U!b*7H#%dIy
z1jSg3HCT^z*oY0-gi@4YGrUOnUE0dJ1>4}mc9df$c3?MlVK4SzKlb4O{0QJ6_%%9&
zqd0<NsK5yv$0?k|8Jq^cS3#UbCBmpeHO`?HHHadD80v5yan$1iF5x0BqXAcO1x;wg
zbzH*@G~*U-!bSpja2xk<7Y}eBt!Tj`Jj7$P;VGWr8QSpz&+!r+c#T(hgHF7|Tf9dX
fKH>vDp&MWD8DEjacYH%HdhioJ@EgDI7k}^%4eZ*9

literal 0
HcmV?d00001

diff --git a/tests/test_prediction.py b/tests/test_prediction.py
index 69fc6cf..0130141 100644
--- a/tests/test_prediction.py
+++ b/tests/test_prediction.py
@@ -2,6 +2,12 @@
 
 import dawg_python
 from .utils import data_path
+from hashlib import md5
+
+
+def encode(w):
+    digest = md5(w.encode('utf8')).hexdigest()  # deterministic 32-char hex string for the word
+    return tuple(ord(c) for c in digest[:4])  # code points of the first four hex digits (fits "=HHHH")
 
 
 class TestPrediction:
@@ -63,11 +69,14 @@ def test_record_dawg_items_values(self, word, prediction):
         d = self.record_dawg()
         assert d.similar_item_values(word, self.REPLACES) == prediction
 
+
 class TestMultiValuedPrediction(object):
 
     REPLACES = dawg_python.DAWG.compile_replaces({'е': ['ё', 'ѣ'], 'и': 'і'})
 
     DATA = "хлѣб ёлка ель лѣс лѣсное всё всѣ бѣлёная изобрѣтён лев лёв лѣв вѣнскій".split(" ")
+    # STORED_DATA = list(zip(DATA, (encode(w) for w in DATA)))
+
     SUITE = [
         ('осел', []),
         ('ель', ['ель']),
@@ -93,7 +102,7 @@ class TestMultiValuedPrediction(object):
         (
             it[0], # key
             [
-                (w, [(len(w),)]) # item, value pair
+                (w, [encode(w)]) # item, value pair
                 for w in it[1]
             ]
         )
@@ -103,18 +112,18 @@ class TestMultiValuedPrediction(object):
     SUITE_VALUES = [
         (
             it[0], # key
-            [[(len(w),)] for w in it[1]]
+            [[encode(w)] for w in it[1]]
         )
         for it in SUITE
     ]
 
     def record_dawg(self):
-        path = data_path("small", "prediction-record.dawg")
-        return dawg_python.RecordDAWG(str("=H")).load(path)
+        path = data_path("small", "prediction1917-record.dawg")
+        return dawg_python.RecordDAWG(str("=HHHH")).load(path)
 
     @pytest.mark.parametrize(("word", "prediction"), SUITE)
     def test_dawg_prediction(self, word, prediction):
-        d = dawg_python.DAWG().load(data_path("small", "prediction.dawg"))
+        d = dawg_python.DAWG().load(data_path("small", "prediction1917.dawg"))
         assert d.similar_keys(word, self.REPLACES) == prediction
 
     @pytest.mark.parametrize(("word", "prediction"), SUITE)

From 085d1060a669032cfcec04a6719170e25623b6c2 Mon Sep 17 00:00:00 2001
From: Viktor Bulatov <bt.uytya@gmail.com>
Date: Fri, 15 Sep 2023 15:32:40 +0300
Subject: [PATCH 6/7] remove commented code

---
 tests/test_prediction.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_prediction.py b/tests/test_prediction.py
index 0130141..506ae4a 100644
--- a/tests/test_prediction.py
+++ b/tests/test_prediction.py
@@ -75,7 +75,6 @@ class TestMultiValuedPrediction(object):
     REPLACES = dawg_python.DAWG.compile_replaces({'е': ['ё', 'ѣ'], 'и': 'і'})
 
     DATA = "хлѣб ёлка ель лѣс лѣсное всё всѣ бѣлёная изобрѣтён лев лёв лѣв вѣнскій".split(" ")
-    # STORED_DATA = list(zip(DATA, (encode(w) for w in DATA)))
 
     SUITE = [
         ('осел', []),

From a4af0744fd82c67b67fd90175d2bff08e2f2a261 Mon Sep 17 00:00:00 2001
From: Viktor Bulatov <bt.uytya@gmail.com>
Date: Wed, 20 Sep 2023 01:18:01 +0300
Subject: [PATCH 7/7] add sources for binary files

---
 dev_data/small/prediction1917-record.csv | 14 ++++++++++++++
 dev_data/small/prediction1917.txt        |  1 +
 2 files changed, 15 insertions(+)
 create mode 100644 dev_data/small/prediction1917-record.csv
 create mode 100644 dev_data/small/prediction1917.txt

diff --git a/dev_data/small/prediction1917-record.csv b/dev_data/small/prediction1917-record.csv
new file mode 100644
index 0000000..d35e369
--- /dev/null
+++ b/dev_data/small/prediction1917-record.csv
@@ -0,0 +1,14 @@
+,0,1,2,3
+хлѣб,98,51,54,49
+ёлка,54,99,99,98
+ель,51,53,101,101
+лѣс,101,56,102,48
+лѣсное,57,99,53,56
+всё,50,99,55,53
+всѣ,49,99,54,48
+бѣлёная,97,49,56,97
+изобрѣтён,51,99,99,99
+лев,98,50,52,56
+лёв,50,101,51,99
+лѣв,99,100,102,50
+вѣнскій,100,57,101,57
diff --git a/dev_data/small/prediction1917.txt b/dev_data/small/prediction1917.txt
new file mode 100644
index 0000000..17409bc
--- /dev/null
+++ b/dev_data/small/prediction1917.txt
@@ -0,0 +1 @@
+хлѣб ёлка ель лѣс лѣсное всё всѣ бѣлёная изобрѣтён лев лёв лѣв вѣнскій
\ No newline at end of file