From 3f072c35511c55b2ee2c2b0a2cedba9ece874f45 Mon Sep 17 00:00:00 2001 From: Viktor Bulatov Date: Mon, 11 Sep 2023 15:42:53 +0300 Subject: [PATCH 1/7] try to port to a fork in a less fancy way --- dawg_python/dawgs.py | 100 ++++++++++++++++++++++----------------- tests/test_prediction.py | 69 +++++++++++++++++++++++++++ 2 files changed, 125 insertions(+), 44 deletions(-) diff --git a/dawg_python/dawgs.py b/dawg_python/dawgs.py index 9c1bdf4..80a76c9 100644 --- a/dawg_python/dawgs.py +++ b/dawg_python/dawgs.py @@ -1,14 +1,16 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import, unicode_literals + import struct from binascii import a2b_base64 from . import wrapper +from .compat import int_from_byte - -class DAWG: +class DAWG(object): """ Base DAWG wrapper. """ - def __init__(self): self.dct = None @@ -38,15 +40,15 @@ def _similar_keys(self, current_prefix, key, index, replace_chars): b_step = key[word_pos].encode('utf8') if b_step in replace_chars: - next_index = index - b_replace_char, u_replace_char = replace_chars[b_step] + for (b_replace_char, u_replace_char) in replace_chars[b_step]: + next_index = index - next_index = self.dct.follow_bytes(b_replace_char, next_index) + next_index = self.dct.follow_bytes(b_replace_char, next_index) - if next_index is not None: - prefix = current_prefix + key[start_pos:word_pos] + u_replace_char - extra_keys = self._similar_keys(prefix, key, next_index, replace_chars) - res += extra_keys + if next_index: + prefix = current_prefix + key[start_pos:word_pos] + u_replace_char + extra_keys = self._similar_keys(prefix, key, next_index, replace_chars) + res += extra_keys index = self.dct.follow_bytes(b_step, index) if index is None: @@ -67,7 +69,7 @@ def similar_keys(self, key, replaces): ``replaces`` is an object obtained from ``DAWG.compile_replaces(mapping)`` where mapping is a dict - that maps single-char unicode sitrings to another single-char + that maps single-char unicode strings to (one or more) single-char unicode strings. This may be useful e.g. for handling single-character umlauts. @@ -77,22 +79,26 @@ def similar_keys(self, key, replaces): @classmethod def compile_replaces(cls, replaces): - for k, v in replaces.items(): - if len(k) != 1 or len(v) != 1: - raise ValueError("Keys and values must be single-char unicode strings.") + for k,v in replaces.items(): + if len(k) != 1: + raise ValueError("Keys must be single-char unicode strings.") + if (isinstance(v, str) and len(v) != 1): + raise ValueError("Values must be single-char unicode strings or non-empty lists of such.") + if isinstance(v, list) and (any(len(v_entry) != 1 for v_entry in v) or len(v) < 1): + raise ValueError("Values must be single-char unicode strings or non-empty lists of such.") return dict( ( k.encode('utf8'), - (v.encode('utf8'), v), + [(v_entry.encode('utf8'), v_entry) for v_entry in v] ) for k, v in replaces.items() ) def prefixes(self, key): - """ + ''' Returns a list with keys of this DAWG that are prefixes of the ``key``. - """ + ''' res = [] index = self.dct.ROOT if not isinstance(key, bytes): @@ -101,7 +107,7 @@ def prefixes(self, key): pos = 1 for ch in key: - index = self.dct.follow_char(ch, index) + index = self.dct.follow_char(int_from_byte(ch), index) if not index: break @@ -112,13 +118,14 @@ def prefixes(self, key): return res + class CompletionDAWG(DAWG): """ DAWG with key completion support. """ def __init__(self): - super().__init__() + super(CompletionDAWG, self).__init__() self.guide = None def keys(self, prefix=""): @@ -150,6 +157,7 @@ def iterkeys(self, prefix=""): while completer.next(): yield completer.key.decode('utf8') + def load(self, path): """ Loads DAWG from a file. @@ -167,7 +175,6 @@ def load(self, path): PAYLOAD_SEPARATOR = b'\x01' MAX_VALUE_SIZE = 32768 - class BytesDAWG(CompletionDAWG): """ DAWG that is able to transparently store extra binary payload in keys; @@ -178,7 +185,6 @@ class BytesDAWG(CompletionDAWG): """ def __init__(self, payload_separator=PAYLOAD_SEPARATOR): - super().__init__() self._payload_separator = payload_separator def __contains__(self, key): @@ -186,8 +192,8 @@ def __contains__(self, key): key = key.encode('utf8') return bool(self._follow_key(key)) - # def b_has_key(self, key): - # return bool(self._follow_key(key)) +# def b_has_key(self, key): +# return bool(self._follow_key(key)) def __getitem__(self, key): res = self.get(key) @@ -223,7 +229,9 @@ def _value_for_index(self, index): completer.start(index) while completer.next(): - b64_data = completer.key + # a2b_base64 doesn't support bytearray in python 2.6 + # so it is converted (and copied) to bytes + b64_data = bytes(completer.key) res.append(a2b_base64(b64_data)) return res @@ -290,7 +298,9 @@ def items(self, prefix=""): while completer.next(): key, value = completer.key.split(self._payload_separator) - res.append((key.decode('utf8'), a2b_base64(value))) + res.append( + (key.decode('utf8'), a2b_base64(bytes(value))) # bytes() cast is a python 2.6 fix + ) return res @@ -309,9 +319,10 @@ def iteritems(self, prefix=""): while completer.next(): key, value = completer.key.split(self._payload_separator) - item = (key.decode('utf8'), a2b_base64(value)) + item = (key.decode('utf8'), a2b_base64(bytes(value))) # bytes() cast is a python 2.6 fix yield item + def _has_value(self, index): return self.dct.follow_bytes(PAYLOAD_SEPARATOR, index) @@ -326,14 +337,15 @@ def _similar_items(self, current_prefix, key, index, replace_chars): b_step = key[word_pos].encode('utf8') if b_step in replace_chars: - next_index = index - b_replace_char, u_replace_char = replace_chars[b_step] + for (b_replace_char, u_replace_char) in replace_chars[b_step]: + next_index = index - next_index = self.dct.follow_bytes(b_replace_char, next_index) - if next_index: - prefix = current_prefix + key[start_pos:word_pos] + u_replace_char - extra_items = self._similar_items(prefix, key, next_index, replace_chars) - res += extra_items + next_index = self.dct.follow_bytes(b_replace_char, next_index) + + if next_index: + prefix = current_prefix + key[start_pos:word_pos] + u_replace_char + extra_items = self._similar_items(prefix, key, next_index, replace_chars) + res += extra_items index = self.dct.follow_bytes(b_step, index) if not index: @@ -356,11 +368,12 @@ def similar_items(self, key, replaces): ``replaces`` is an object obtained from ``DAWG.compile_replaces(mapping)`` where mapping is a dict - that maps single-char unicode sitrings to another single-char + that maps single-char unicode strings to (one or more) single-char unicode strings. """ return self._similar_items("", key, self.dct.ROOT, replaces) + def _similar_item_values(self, start_pos, key, index, replace_chars): res = [] end_pos = len(key) @@ -375,7 +388,7 @@ def _similar_item_values(self, start_pos, key, index, replace_chars): next_index = self.dct.follow_bytes(b_replace_char, next_index) if next_index: - extra_items = self._similar_item_values(word_pos + 1, key, next_index, replace_chars) + extra_items = self._similar_item_values(word_pos+1, key, next_index, replace_chars) res += extra_items index = self.dct.follow_bytes(b_step, index) @@ -398,7 +411,7 @@ def similar_item_values(self, key, replaces): ``replaces`` is an object obtained from ``DAWG.compile_replaces(mapping)`` where mapping is a dict - that maps single-char unicode sitrings to another single-char + that maps single-char unicode strings to (one or more) single-char unicode strings. """ return self._similar_item_values(0, key, self.dct.ROOT, replaces) @@ -406,32 +419,30 @@ def similar_item_values(self, key, replaces): class RecordDAWG(BytesDAWG): def __init__(self, fmt, payload_separator=PAYLOAD_SEPARATOR): - super().__init__(payload_separator) + super(RecordDAWG, self).__init__(payload_separator) self._struct = struct.Struct(str(fmt)) self.fmt = fmt def _value_for_index(self, index): - value = super()._value_for_index(index) + value = super(RecordDAWG, self)._value_for_index(index) return [self._struct.unpack(val) for val in value] def items(self, prefix=""): - res = super().items(prefix) + res = super(RecordDAWG, self).items(prefix) return [(key, self._struct.unpack(val)) for (key, val) in res] def iteritems(self, prefix=""): - res = super().iteritems(prefix) + res = super(RecordDAWG, self).iteritems(prefix) return ((key, self._struct.unpack(val)) for (key, val) in res) LOOKUP_ERROR = -1 - class IntDAWG(DAWG): """ Dict-like class based on DAWG. It can store integer values for unicode keys. """ - def __getitem__(self, key): res = self.get(key, LOOKUP_ERROR) if res == LOOKUP_ERROR: @@ -458,7 +469,6 @@ class IntCompletionDAWG(CompletionDAWG, IntDAWG): Dict-like class based on DAWG. It can store integer values for unicode keys and support key completion. """ - def items(self, prefix=""): if not isinstance(prefix, bytes): prefix = prefix.encode('utf8') @@ -474,7 +484,9 @@ def items(self, prefix=""): completer.start(index, prefix) while completer.next(): - res.append((completer.key.decode('utf8'), completer.value())) + res.append( + (completer.key.decode('utf8'), completer.value()) + ) return res diff --git a/tests/test_prediction.py b/tests/test_prediction.py index b948580..69fc6cf 100644 --- a/tests/test_prediction.py +++ b/tests/test_prediction.py @@ -62,3 +62,72 @@ def test_record_dawg_items(self, word, prediction): def test_record_dawg_items_values(self, word, prediction): d = self.record_dawg() assert d.similar_item_values(word, self.REPLACES) == prediction + +class TestMultiValuedPrediction(object): + + REPLACES = dawg_python.DAWG.compile_replaces({'е': ['ё', 'ѣ'], 'и': 'і'}) + + DATA = "хлѣб ёлка ель лѣс лѣсное всё всѣ бѣлёная изобрѣтён лев лёв лѣв вѣнскій".split(" ") + SUITE = [ + ('осел', []), + ('ель', ['ель']), + ('ёль', []), + ('хлеб', ['хлѣб']), + ('елка', ['ёлка']), + ('лесное', ['лѣсное']), + ('лесноё', []), + ('лёсное', []), + ('изобретен', ['изобрѣтён']), + ('беленая', ['бѣлёная']), + ('белёная', ['бѣлёная']), + ('бѣленая', ['бѣлёная']), + ('бѣлёная', ['бѣлёная']), + ('белѣная', []), + ('бѣлѣная', []), + ('все', ['всё', 'всѣ']), + ('лев', ['лев', 'лёв', 'лѣв']), + ('венский', ['вѣнскій']), + ] + + SUITE_ITEMS = [ + ( + it[0], # key + [ + (w, [(len(w),)]) # item, value pair + for w in it[1] + ] + ) + for it in SUITE + ] + + SUITE_VALUES = [ + ( + it[0], # key + [[(len(w),)] for w in it[1]] + ) + for it in SUITE + ] + + def record_dawg(self): + path = data_path("small", "prediction-record.dawg") + return dawg_python.RecordDAWG(str("=H")).load(path) + + @pytest.mark.parametrize(("word", "prediction"), SUITE) + def test_dawg_prediction(self, word, prediction): + d = dawg_python.DAWG().load(data_path("small", "prediction.dawg")) + assert d.similar_keys(word, self.REPLACES) == prediction + + @pytest.mark.parametrize(("word", "prediction"), SUITE) + def test_record_dawg_prediction(self, word, prediction): + d = self.record_dawg() + assert d.similar_keys(word, self.REPLACES) == prediction + + @pytest.mark.parametrize(("word", "prediction"), SUITE_ITEMS) + def test_record_dawg_items(self, word, prediction): + d = self.record_dawg() + assert d.similar_items(word, self.REPLACES) == prediction + + @pytest.mark.parametrize(("word", "prediction"), SUITE_VALUES) + def test_record_dawg_items_values(self, word, prediction): + d = self.record_dawg() + assert d.similar_item_values(word, self.REPLACES) == prediction From 2fead760e2651b123b6be7b7e356237efbd91122 Mon Sep 17 00:00:00 2001 From: Viktor Bulatov Date: Mon, 11 Sep 2023 15:53:17 +0300 Subject: [PATCH 2/7] fix merge artifacts --- dawg_python/dawgs.py | 46 ++++++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/dawg_python/dawgs.py b/dawg_python/dawgs.py index 80a76c9..d6e6dc1 100644 --- a/dawg_python/dawgs.py +++ b/dawg_python/dawgs.py @@ -1,11 +1,8 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import, unicode_literals - import struct from binascii import a2b_base64 from . import wrapper -from .compat import int_from_byte + class DAWG(object): """ @@ -96,9 +93,9 @@ def compile_replaces(cls, replaces): ) def prefixes(self, key): - ''' + """ Returns a list with keys of this DAWG that are prefixes of the ``key``. - ''' + """ res = [] index = self.dct.ROOT if not isinstance(key, bytes): @@ -107,7 +104,7 @@ def prefixes(self, key): pos = 1 for ch in key: - index = self.dct.follow_char(int_from_byte(ch), index) + index = self.dct.follow_char(ch, index) if not index: break @@ -125,7 +122,7 @@ class CompletionDAWG(DAWG): """ def __init__(self): - super(CompletionDAWG, self).__init__() + super().__init__() self.guide = None def keys(self, prefix=""): @@ -175,6 +172,7 @@ def load(self, path): PAYLOAD_SEPARATOR = b'\x01' MAX_VALUE_SIZE = 32768 + class BytesDAWG(CompletionDAWG): """ DAWG that is able to transparently store extra binary payload in keys; @@ -185,6 +183,7 @@ class BytesDAWG(CompletionDAWG): """ def __init__(self, payload_separator=PAYLOAD_SEPARATOR): + super().__init__() self._payload_separator = payload_separator def __contains__(self, key): @@ -192,8 +191,8 @@ def __contains__(self, key): key = key.encode('utf8') return bool(self._follow_key(key)) -# def b_has_key(self, key): -# return bool(self._follow_key(key)) + # def b_has_key(self, key): + # return bool(self._follow_key(key)) def __getitem__(self, key): res = self.get(key) @@ -229,9 +228,7 @@ def _value_for_index(self, index): completer.start(index) while completer.next(): - # a2b_base64 doesn't support bytearray in python 2.6 - # so it is converted (and copied) to bytes - b64_data = bytes(completer.key) + b64_data = completer.key res.append(a2b_base64(b64_data)) return res @@ -298,9 +295,7 @@ def items(self, prefix=""): while completer.next(): key, value = completer.key.split(self._payload_separator) - res.append( - (key.decode('utf8'), a2b_base64(bytes(value))) # bytes() cast is a python 2.6 fix - ) + res.append((key.decode('utf8'), a2b_base64(value))) return res @@ -319,7 +314,7 @@ def iteritems(self, prefix=""): while completer.next(): key, value = completer.key.split(self._payload_separator) - item = (key.decode('utf8'), a2b_base64(bytes(value))) # bytes() cast is a python 2.6 fix + item = (key.decode('utf8'), a2b_base64(value)) yield item @@ -388,7 +383,7 @@ def _similar_item_values(self, start_pos, key, index, replace_chars): next_index = self.dct.follow_bytes(b_replace_char, next_index) if next_index: - extra_items = self._similar_item_values(word_pos+1, key, next_index, replace_chars) + extra_items = self._similar_item_values(word_pos + 1, key, next_index, replace_chars) res += extra_items index = self.dct.follow_bytes(b_step, index) @@ -419,30 +414,32 @@ def similar_item_values(self, key, replaces): class RecordDAWG(BytesDAWG): def __init__(self, fmt, payload_separator=PAYLOAD_SEPARATOR): - super(RecordDAWG, self).__init__(payload_separator) + super().__init__(payload_separator) self._struct = struct.Struct(str(fmt)) self.fmt = fmt def _value_for_index(self, index): - value = super(RecordDAWG, self)._value_for_index(index) + value = super()._value_for_index(index) return [self._struct.unpack(val) for val in value] def items(self, prefix=""): - res = super(RecordDAWG, self).items(prefix) + res = super().items(prefix) return [(key, self._struct.unpack(val)) for (key, val) in res] def iteritems(self, prefix=""): - res = super(RecordDAWG, self).iteritems(prefix) + res = super().iteritems(prefix) return ((key, self._struct.unpack(val)) for (key, val) in res) LOOKUP_ERROR = -1 + class IntDAWG(DAWG): """ Dict-like class based on DAWG. It can store integer values for unicode keys. """ + def __getitem__(self, key): res = self.get(key, LOOKUP_ERROR) if res == LOOKUP_ERROR: @@ -469,6 +466,7 @@ class IntCompletionDAWG(CompletionDAWG, IntDAWG): Dict-like class based on DAWG. It can store integer values for unicode keys and support key completion. """ + def items(self, prefix=""): if not isinstance(prefix, bytes): prefix = prefix.encode('utf8') @@ -484,9 +482,7 @@ def items(self, prefix=""): completer.start(index, prefix) while completer.next(): - res.append( - (completer.key.decode('utf8'), completer.value()) - ) + res.append((completer.key.decode('utf8'), completer.value())) return res From bb6351322075ae940905b0cad90de7dc04301025 Mon Sep 17 00:00:00 2001 From: Viktor Bulatov Date: Mon, 11 Sep 2023 15:56:39 +0300 Subject: [PATCH 3/7] fix some more merge artifacts --- dawg_python/dawgs.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/dawg_python/dawgs.py b/dawg_python/dawgs.py index d6e6dc1..0a0f7f0 100644 --- a/dawg_python/dawgs.py +++ b/dawg_python/dawgs.py @@ -4,10 +4,11 @@ from . import wrapper -class DAWG(object): +class DAWG: """ Base DAWG wrapper. """ + def __init__(self): self.dct = None @@ -115,7 +116,6 @@ def prefixes(self, key): return res - class CompletionDAWG(DAWG): """ DAWG with key completion support. @@ -154,7 +154,6 @@ def iterkeys(self, prefix=""): while completer.next(): yield completer.key.decode('utf8') - def load(self, path): """ Loads DAWG from a file. @@ -192,7 +191,7 @@ def __contains__(self, key): return bool(self._follow_key(key)) # def b_has_key(self, key): - # return bool(self._follow_key(key)) + # return bool(self._follow_key(key)) def __getitem__(self, key): res = self.get(key) @@ -317,7 +316,6 @@ def iteritems(self, prefix=""): item = (key.decode('utf8'), a2b_base64(value)) yield item - def _has_value(self, index): return self.dct.follow_bytes(PAYLOAD_SEPARATOR, index) @@ -368,7 +366,6 @@ def similar_items(self, key, replaces): """ return self._similar_items("", key, self.dct.ROOT, replaces) - def _similar_item_values(self, start_pos, key, index, replace_chars): res = [] end_pos = len(key) From 10077c64a100fc3221fd4376ad6ee693f25fc31c Mon Sep 17 00:00:00 2001 From: Viktor Bulatov Date: Mon, 11 Sep 2023 17:35:59 +0300 Subject: [PATCH 4/7] fix similar_items_values --- dawg_python/dawgs.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/dawg_python/dawgs.py b/dawg_python/dawgs.py index 0a0f7f0..b5665b0 100644 --- a/dawg_python/dawgs.py +++ b/dawg_python/dawgs.py @@ -375,13 +375,14 @@ def _similar_item_values(self, start_pos, key, index, replace_chars): b_step = key[word_pos].encode('utf8') if b_step in replace_chars: - next_index = index - b_replace_char, u_replace_char = replace_chars[b_step] + for (b_replace_char, u_replace_char) in replace_chars[b_step]: + next_index = index + + next_index = self.dct.follow_bytes(b_replace_char, next_index) - next_index = self.dct.follow_bytes(b_replace_char, next_index) - if next_index: - extra_items = self._similar_item_values(word_pos + 1, key, next_index, replace_chars) - res += extra_items + if next_index: + extra_items = self._similar_item_values(word_pos + 1, key, next_index, replace_chars) + res += extra_items index = self.dct.follow_bytes(b_step, index) if not index: From a6537e182fea69a900e22528a83e3ecbda893f31 Mon Sep 17 00:00:00 2001 From: Viktor Bulatov Date: Thu, 14 Sep 2023 23:00:09 +0300 Subject: [PATCH 5/7] add test data --- dev_data/small/prediction1917-record.dawg | Bin 0 -> 3080 bytes dev_data/small/prediction1917.dawg | Bin 0 -> 1028 bytes tests/test_prediction.py | 19 ++++++++++++++----- 3 files changed, 14 insertions(+), 5 deletions(-) create mode 100644 dev_data/small/prediction1917-record.dawg create mode 100644 dev_data/small/prediction1917.dawg diff --git a/dev_data/small/prediction1917-record.dawg b/dev_data/small/prediction1917-record.dawg new file mode 100644 index 0000000000000000000000000000000000000000..6f7510ff45a20539ab014cfb3057f41774dacaaf GIT binary patch literal 3080 zcmeIzdwkV%9LMqZ_xD>QVF*LWJ%nO06#b6FhE8iV+6<#u){yHe!)aeEN9 z*|gf$g{XA<{TM=6_f_kDzk7Yp>VJRmpR>pF_xtVly?wr)@ArF-97$=ess=jp7dQ%P zrJ&eRc{f)T^{!$?uKY$vHPi{ZIg0jnRh>lL?T+eFT-A)?dAg%ATb|-*M3Q83jk3~= zGGyc}G#Y5{ZH>ZXjPjCrK2W~fsH-n|Eu~h6Fm9A1wk7s{9F%y3vM5!&RZ`< z78n&WekbatmQi=66OYZK&+IhDNpv)UaWXekMhvlbj*5G^nv8Or&o-XhzZKi-Gw*W?hR&s=A+FA}9ttx^J5ZTFCebMuQnQ zzm3r};_&U7u#PtF)CSsCI+|hQV@|WUXLXlmbDz}D)m-lLFdvbnMjNVH3P*z!>KGi0v02EM$VSh1L^3A4BUvD&-9T99%6VOAK*iLgpXlzTIdt* zpJF^d!{_({Whh4lzCev)#br3@*h+LRX%wNSX18C)Y&UJ4p?ix z2ifT!zvv#PB#&$J|E=QZ$?l;jCnJxOluucoc)$>1-ax3*Ol*9bh;AYkA5tV^IE zW@4l1e~Z=sxuz3M2D>hxJi?(Gl-TS<*nNx~P3C`oC3s`MN79KR%uIah1eC?8lb>|M}+`u;>53{bV0ln*Vl~f2sGs0a8fH#Q*>R literal 0 HcmV?d00001 diff --git a/dev_data/small/prediction1917.dawg b/dev_data/small/prediction1917.dawg new file mode 100644 index 0000000000000000000000000000000000000000..bbee0ce6de89e86238a6aadff4dce61af8fce339 GIT binary patch literal 1028 zcmXBROH_6|siuoqk5MeuQq;%3!6@k5ry7+Frk~`tQ;a3ZTyN0yy}2aUlBb^(qgOniJbH&mx%%Gb zydyA#?;U0-l;cu#sHM?VMTi9lx@7Y`i8}aDY?R5J*O}R7G=_Z^#$hbRBO4Pj0Xdk2 zDVU6Dn2H&gj#-$AIhc*P$i;lj!vf?XkY}`zbrJHh1dFj01z3({Sb;*U!b*7H#%dIy z1jSg3HCT^z*oY0-gi@4YGrUOnUE0dJ1>4}mc9df$c3?MlVK4SzKlb4O{0QJ6_%%9& zqd0vDp&MWD8DEjacYH%HdhioJ@EgDI7k}^%4eZ*9 literal 0 HcmV?d00001 diff --git a/tests/test_prediction.py b/tests/test_prediction.py index 69fc6cf..0130141 100644 --- a/tests/test_prediction.py +++ b/tests/test_prediction.py @@ -2,6 +2,12 @@ import dawg_python from .utils import data_path +from hashlib import md5 + + +def encode(w): + code = md5(w.encode('utf8')) + return tuple([ord(c) for c in code.hexdigest()])[:4] class TestPrediction: @@ -63,11 +69,14 @@ def test_record_dawg_items_values(self, word, prediction): d = self.record_dawg() assert d.similar_item_values(word, self.REPLACES) == prediction + class TestMultiValuedPrediction(object): REPLACES = dawg_python.DAWG.compile_replaces({'е': ['ё', 'ѣ'], 'и': 'і'}) DATA = "хлѣб ёлка ель лѣс лѣсное всё всѣ бѣлёная изобрѣтён лев лёв лѣв вѣнскій".split(" ") + # STORED_DATA = list(zip(DATA, (encode(w) for w in DATA))) + SUITE = [ ('осел', []), ('ель', ['ель']), @@ -93,7 +102,7 @@ class TestMultiValuedPrediction(object): ( it[0], # key [ - (w, [(len(w),)]) # item, value pair + (w, [encode(w)]) # item, value pair for w in it[1] ] ) @@ -103,18 +112,18 @@ class TestMultiValuedPrediction(object): SUITE_VALUES = [ ( it[0], # key - [[(len(w),)] for w in it[1]] + [[encode(w)] for w in it[1]] ) for it in SUITE ] def record_dawg(self): - path = data_path("small", "prediction-record.dawg") - return dawg_python.RecordDAWG(str("=H")).load(path) + path = data_path("small", "prediction1917-record.dawg") + return dawg_python.RecordDAWG(str("=HHHH")).load(path) @pytest.mark.parametrize(("word", "prediction"), SUITE) def test_dawg_prediction(self, word, prediction): - d = dawg_python.DAWG().load(data_path("small", "prediction.dawg")) + d = dawg_python.DAWG().load(data_path("small", "prediction1917.dawg")) assert d.similar_keys(word, self.REPLACES) == prediction @pytest.mark.parametrize(("word", "prediction"), SUITE) From 085d1060a669032cfcec04a6719170e25623b6c2 Mon Sep 17 00:00:00 2001 From: Viktor Bulatov Date: Fri, 15 Sep 2023 15:32:40 +0300 Subject: [PATCH 6/7] remove commented code --- tests/test_prediction.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_prediction.py b/tests/test_prediction.py index 0130141..506ae4a 100644 --- a/tests/test_prediction.py +++ b/tests/test_prediction.py @@ -75,7 +75,6 @@ class TestMultiValuedPrediction(object): REPLACES = dawg_python.DAWG.compile_replaces({'е': ['ё', 'ѣ'], 'и': 'і'}) DATA = "хлѣб ёлка ель лѣс лѣсное всё всѣ бѣлёная изобрѣтён лев лёв лѣв вѣнскій".split(" ") - # STORED_DATA = list(zip(DATA, (encode(w) for w in DATA))) SUITE = [ ('осел', []), From a4af0744fd82c67b67fd90175d2bff08e2f2a261 Mon Sep 17 00:00:00 2001 From: Viktor Bulatov Date: Wed, 20 Sep 2023 01:18:01 +0300 Subject: [PATCH 7/7] add sources for binary files --- dev_data/small/prediction1917-record.csv | 14 ++++++++++++++ dev_data/small/prediction1917.txt | 1 + 2 files changed, 15 insertions(+) create mode 100644 dev_data/small/prediction1917-record.csv create mode 100644 dev_data/small/prediction1917.txt diff --git a/dev_data/small/prediction1917-record.csv b/dev_data/small/prediction1917-record.csv new file mode 100644 index 0000000..d35e369 --- /dev/null +++ b/dev_data/small/prediction1917-record.csv @@ -0,0 +1,14 @@ +,0,1,2,3 +хлѣб,98,51,54,49 +ёлка,54,99,99,98 +ель,51,53,101,101 +лѣс,101,56,102,48 +лѣсное,57,99,53,56 +всё,50,99,55,53 +всѣ,49,99,54,48 +бѣлёная,97,49,56,97 +изобрѣтён,51,99,99,99 +лев,98,50,52,56 +лёв,50,101,51,99 +лѣв,99,100,102,50 +вѣнскій,100,57,101,57 diff --git a/dev_data/small/prediction1917.txt b/dev_data/small/prediction1917.txt new file mode 100644 index 0000000..17409bc --- /dev/null +++ b/dev_data/small/prediction1917.txt @@ -0,0 +1 @@ +хлѣб ёлка ель лѣс лѣсное всё всѣ бѣлёная изобрѣтён лев лёв лѣв вѣнскій \ No newline at end of file