/
test_graphtransliterator.py
621 lines (560 loc) · 18 KB
/
test_graphtransliterator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Tests for `graphtransliterator` package."""
# from click.testing import CliRunner
from graphtransliterator import process
from graphtransliterator.core import GraphTransliterator
from graphtransliterator.exceptions import (
IncorrectVersionException,
NoMatchingTransliterationRuleException,
UnrecognizableInputTokenException,
)
from graphtransliterator.graphs import DirectedGraph
from graphtransliterator.rules import OnMatchRule, TransliterationRule, WhitespaceRules
from itertools import combinations
from marshmallow import ValidationError
import graphtransliterator
import pytest
import re
import yaml
# Shared "easy reading" YAML fixture, parsed with yaml.safe_load() in
# test_graphtransliterator_process. Nested keys are indented two spaces so that
# `tokens`, `rules`, `whitespace` and `metadata` parse as mappings and
# `onmatch_rules` as a list of single-entry mappings.
# It is a raw string so the \N{...} sequences stay literal text in the YAML
# instead of being expanded by Python.
yaml_for_test = r"""
tokens:
  a: [token, class1]
  b: [token, class2]
  u: [token]
  ' ': [wb]
rules:
  a: A
  b: B
  <wb> u: \N{DEVANAGARI LETTER U}
onmatch_rules:
  -
    <class1> + <class2>: ","
  -
    <class1> + <token>: \N{DEVANAGARI SIGN VIRAMA}
whitespace:
  default: ' '
  token_class: 'wb'
  consolidate: true
metadata:
  author: Author Name
"""
def test_GraphTransliterator_from_YAML():
    """Test YAML loading of GraphTransliterator.

    One well-formed document must load; each malformed document must be
    rejected with a marshmallow ``ValidationError``.
    """
    good_yaml = """
    tokens:
      a: [class1]
      ' ': [wb]
    rules:
      a: A
    whitespace:
      default: ' '
      consolidate: true
      token_class: wb
    """
    assert GraphTransliterator.from_yaml(good_yaml)

    # tokens values are not lists
    bad_yaml = """
    tokens:
      a: class1
      ' ': wb
    rules:
      a: A
    whitespace:
      default: ' '
      consolidate: true
      token_class: wb
    """
    with pytest.raises(ValidationError):
        GraphTransliterator.from_yaml(bad_yaml)

    # no `tokens` section at all
    bad_yaml = """
    rules:
      a: A
    whitespace:
      default: ' '
      consolidate: true
      token_class: wb
    """
    with pytest.raises(ValidationError):
        GraphTransliterator.from_yaml(bad_yaml)

    # whitespace errors
    bad_yaml = """
    rules:
      a: A
    tokens:
      a: [token]
      ' ': [wb]
    whitespace:
      default: 'BAD'
      consolidate: true
      token_class: bad
    """
    with pytest.raises(ValidationError):
        GraphTransliterator.from_yaml(bad_yaml)

    # no `rules` section at all
    bad_yaml = """
    tokens:
      a: [class1]
      ' ': [wb]
    whitespace:
      default: ' '
      consolidate: true
      token_class: wb
    """
    with pytest.raises(ValidationError):
        GraphTransliterator.from_yaml(bad_yaml)

    # rule uses token `b`, which is not declared under `tokens`
    bad_yaml = """
    tokens:
      a: [class1]
      ' ': [wb]
    rules:
      b: A
    whitespace:
      default: ' '
      consolidate: true
      token_class: wb
    """
    with pytest.raises(ValidationError):
        GraphTransliterator.from_yaml(bad_yaml)

    # undeclared token `b` in the parenthesised part before the rule token
    bad_yaml = """
    tokens:
      a: [class1]
      ' ': [wb]
    rules:
      (b) a: A
    whitespace:
      default: ' '
      consolidate: true
      token_class: wb
    """
    with pytest.raises(ValidationError):
        GraphTransliterator.from_yaml(bad_yaml)

    # undeclared token `b` in the parenthesised part after the rule token
    bad_yaml = """
    tokens:
      a: [class1]
      ' ': [wb]
    rules:
      a (b): A
    whitespace:
      default: ' '
      consolidate: true
      token_class: wb
    """
    with pytest.raises(ValidationError):
        GraphTransliterator.from_yaml(bad_yaml)

    # rule refers to a token class that no token declares
    bad_yaml = """
    tokens:
      a: [class1]
      ' ': [wb]
    rules:
      a <class_nonexisting>: A
    whitespace:
      default: ' '
      consolidate: true
      token_class: wb
    """
    with pytest.raises(ValidationError):
        GraphTransliterator.from_yaml(bad_yaml)

    # test for bad tokens (`tokens` is a scalar rather than a mapping)
    bad_yaml = """
    tokens: '7'
    rules:
      a <class_nonexisting>: A
    whitespace:
      default: ' '
      consolidate: true
      token_class: wb
    """
    with pytest.raises(ValidationError):
        GraphTransliterator.from_yaml(bad_yaml)
def test_graphtransliterator_process():
    """Test graphtransliterator processing of rules."""
    settings = yaml.safe_load(yaml_for_test)

    # A one-entry rules mapping yields one processed rule.
    first_rule = process._process_rules({"a": "A"})[0]
    assert first_rule["tokens"] == ["a"]
    assert first_rule["production"] == "A"

    # First on-match rule of the shared fixture: "<class1> + <class2>".
    first_onmatch = process._process_onmatch_rules(settings["onmatch_rules"])[0]
    assert first_onmatch["prev_classes"][0] == "class1"
    assert first_onmatch["next_classes"][0] == "class2"
def test_graphtransliterator_models():
    """Test internal models."""
    rule_fields = {
        "production": "A",
        "prev_classes": None,
        "prev_tokens": None,
        "tokens": ["a"],
        "next_tokens": None,
        "next_classes": None,
        "cost": 1,
    }
    rule = TransliterationRule(**rule_fields)
    assert rule.cost == 1

    # The remaining models only need to construct to a truthy object.
    onmatch = OnMatchRule(
        prev_classes=["class1"], next_classes=["class2"], production=","
    )
    assert onmatch
    whitespace = WhitespaceRules(default=" ", token_class="wb", consolidate=False)
    assert whitespace
def test_graphtransliterator_structures():
    """Test DirectedGraph node/edge insertion and its argument validation."""
    dg = DirectedGraph()
    # a fresh graph is empty
    assert len(dg.node) == 0
    assert len(dg.edge) == 0

    # nodes with data
    for node_data in ({"type": "test1"}, {"type": "test2"}):
        dg.add_node(node_data)
    assert dg.node[0]["type"] == "test1"
    assert dg.node[1]["type"] == "test2"

    # node without data (becomes node 2)
    dg.add_node()

    # edge with data, then edge without data
    dg.add_edge(0, 1, {"type": "edge_test1"})
    assert dg.edge[0][1]["type"] == "edge_test1"
    dg.add_edge(1, 2)

    # every malformed add_edge call must raise ValueError
    invalid_edge_args = [
        (0, 7, {}),  # edge tail not in graph
        (7, 0, {}),  # edge head not in graph
        (0, 1, "not a dict"),  # invalid edge data
        ("zero", 1),  # invalid edge head type
        (1, "zero"),  # invalid edge tail type
    ]
    for args in invalid_edge_args:
        with pytest.raises(ValueError):
            dg.add_edge(*args)

    # invalid node data
    with pytest.raises(ValueError):
        dg.add_node("Not a dict")

    # edge_list reflects the edges added above
    assert len(dg.edge_list) > 1

    # a graph built from node and edge only (no edge_list) adds the edge_list
    rebuilt = DirectedGraph(node=dg.node, edge=dg.edge)
    assert rebuilt.edge_list == dg.edge_list
def test_GraphTransliterator_transliterate(tmpdir):
    """Test GraphTransliterator transliterate.

    The ``tmpdir`` fixture is requested but not used by the body.

    The final three assertions inspect state left by the last
    ``transliterate`` call ("abab"), so the order of the checks matters.
    """
    YAML = r"""
    tokens:
      a: [class_a]
      b: [class_b]
      c: [class_c]
      " ": [wb]
      d: []
      Aa: [contrained_rule]
    rules:
      a: A
      b: B
      <class_c> <class_c> a: A(AFTER_CLASS_C_AND_CLASS_C)
      (<class_c> b) a: A(AFTER_B_AND_CLASS_C)
      (<class_c> b b) a a: AA(AFTER_BB_AND_CLASS_C)
      a <class_c>: A(BEFORE_CLASS_C)
      a b (c <class_b>): AB(BEFORE_C_AND_CLASS_B)
      c: C
      c c: C*2
      a (b b b): A(BEFORE_B_B_B)
      d (c <class_a>): D(BEFORE_C_AND_CLASS_A)
      (b b) a: A(AFTER_B_B)
      <wb> Aa: A(ONLY_A_CONSTRAINED_RULE)
    onmatch_rules:
      -
        <class_a> <class_b> + <class_a> <class_b>: "!"
      -
        <class_a> + <class_b>: ","
    whitespace:
      default: ' '
      consolidate: True
      token_class: wb
    """
    gt = GraphTransliterator.from_yaml(YAML)
    # rules with single token
    assert gt.transliterate("a") == "A"
    # rules with multiple tokens
    assert gt.transliterate("aa") == "AA"
    # rules with multiple tokens (for rule_key)
    assert gt.transliterate("cc") == "C*2"
    # # rules with multiple tokens overlapping end of tokens
    # assert gt.transliterate('c') == 'C'
    # rules with prev class
    assert gt.transliterate("ca") == "CA"
    # rules with prev class and prev token
    assert gt.transliterate("dca") == "D(BEFORE_C_AND_CLASS_A)CA"
    # rules with prev class and prev tokens
    assert gt.transliterate("cbba") == "CBBA(AFTER_B_B)"
    # rules with next class
    assert gt.transliterate("ac") == "A(BEFORE_CLASS_C)C"
    # rules with next class and next tokens
    assert gt.transliterate("acb") == "A(BEFORE_CLASS_C)CB"
    # rules with onmatch rule of length 1
    assert gt.transliterate("ab") == "A,B"
    # rules that only have constraints on first element
    assert gt.transliterate("Aa") == "A(ONLY_A_CONSTRAINED_RULE)"
    # test whitespace consolidation
    assert gt.transliterate(" a") == "A"
    # test whitespace consolidation following
    assert gt.transliterate("a ") == "A"
    # rules with longer onmatch rules
    assert gt.transliterate("abab") == "A,B!A,B"
    # test last_input_tokens (includes the surrounding whitespace tokens)
    assert gt.last_input_tokens == [" ", "a", "b", "a", "b", " "]
    # test last_matched_rule_tokens
    assert gt.last_matched_rule_tokens == [["a"], ["b"], ["a"], ["b"]]
    # test last_matched_rules
    assert len(gt.last_matched_rules) == 4
def test_serialization():
    """Test serialization of graphtransliterator.

    Covers ``dump``/``dumps``/``load``/``loads``, the key order of the dumped
    settings, loading with optional fields removed, and rejection of settings
    whose recorded version has been altered.
    """
    import copy  # local import: only this test needs a deep snapshot

    # Field definitions
    required_fields = ["tokens", "rules", "whitespace"]
    optional_fields = [
        "onmatch_rules",
        "metadata",
        "ignore_errors",
        "onmatch_rules_lookup",
        "tokens_by_class",
        "graph",
        "tokenizer_pattern",
        "graphtransliterator_version",
    ]
    ordered_fields = required_fields + optional_fields
    yaml_ = """
    tokens:
      a: [vowel]
      ' ': [wb]
    rules:
      a: A
      ' ': ' '
    whitespace:
      default: " "
      consolidate: false
      token_class: wb
    onmatch_rules:
      - <vowel> + <vowel>: ',' # add a comma between vowels
    metadata:
      author: "Author McAuthorson"
    """
    gt = GraphTransliterator.from_yaml(yaml_)
    # test dump
    dump = gt.dump()
    assert dump["graph"]["edge"]
    # test ordering of dump fields
    assert list(dump.keys()) == ordered_fields
    # test dump version
    assert dump["graphtransliterator_version"] == graphtransliterator.__version__
    assert re.match(r"\d+\.\d+\.\d+$", gt.dump()["graphtransliterator_version"])
    # test dumps
    x = gt.dumps()
    assert "graph" in gt.dumps()
    assert isinstance(x, str)
    # test loads
    new_gt = GraphTransliterator.loads(x)
    assert GraphTransliterator.loads(gt.dumps()).dumps()
    assert isinstance(new_gt, GraphTransliterator)
    # test load
    settings = gt.dump()
    # Snapshot first: comparing `settings` with itself could never fail.
    settings_before_load = copy.deepcopy(settings)
    assert isinstance(GraphTransliterator.load(settings), GraphTransliterator)
    # confirm settings not affected by load
    assert settings == settings_before_load
    # confirm compacting (dropping) optional settings works
    # NOTE(review): range() stops before len(optional_fields), so dropping
    # every optional field at once is not exercised -- confirm this is intended.
    for length in range(1, len(optional_fields)):
        for to_drop in combinations(optional_fields, length):
            settings = gt.dump()
            for field_name in to_drop:
                settings.pop(field_name)
            has_lookup = settings.get("onmatch_rules_lookup")
            has_onmatch_rules = settings.get("onmatch_rules")
            if has_lookup and not has_onmatch_rules:
                # a lookup table without the rules it indexes must be rejected
                with pytest.raises(ValidationError):
                    GraphTransliterator.load(settings)
            else:
                assert GraphTransliterator.load(settings)
    # test IncorrectVersionException
    tampered = gt.dump()
    tampered["graphtransliterator_version"] += "1"  # add 1 e.g. 1.0.11
    with pytest.raises(IncorrectVersionException):
        GraphTransliterator.load(tampered)
def test_version():
    """Tests to make sure version is not a mess (e.g. due to Black formatting)"""
    # Three dot-separated digit groups, anchored at both ends (match() pins
    # the start, `$` pins the end).
    version_pattern = re.compile(r"\d+\.\d+\.\d+$")
    found = version_pattern.match(graphtransliterator.__version__)
    assert found
def test_match_all():
    """Test ``match_at`` with and without ``match_all``.

    With ``match_all=False`` a single rule index is returned; with
    ``match_all=True`` a list of rule indexes is returned.
    """
    YAML = r"""
    tokens:
      a: [class_a]
      " ": [wb]
    rules:
      a: A
      a a: A*2
    whitespace:
      default: ' '
      consolidate: True
      token_class: wb
    """
    gt = GraphTransliterator.from_yaml(YAML)
    # the expected indexes below rely on rules being ordered by ascending cost
    assert gt.rules[0].cost < gt.rules[1].cost
    tokens = gt.tokenize("aa")
    assert gt.match_at(1, tokens, match_all=False) == 0
    assert gt.match_at(1, tokens, match_all=True) == [0, 1]
def test_GraphTransliterator(tmpdir):
    """Test GraphTransliterator.

    Builds a transliterator from the same settings via ``from_yaml``,
    ``from_yaml_file`` (through a file written under ``tmpdir``) and
    ``from_easyreading_dict``, then checks its attributes and a sample
    transliteration.
    """
    yaml_str = r"""
    tokens:
      a: [token, class1]
      b: [token, class2]
      u: [token]
      ' ': [wb]
    rules:
      a: A
      b: B
      <wb> u: \N{DEVANAGARI LETTER U}
    onmatch_rules:
      -
        <class1> + <class2>: ","
      -
        <class1> + <token>: \N{DEVANAGARI SIGN VIRAMA}
    whitespace:
      default: ' '
      token_class: 'wb'
      consolidate: true
    metadata:
      author: Author
    """
    input_dict = yaml.safe_load(yaml_str)
    assert "a" in GraphTransliterator.from_easyreading_dict(input_dict).tokens.keys()
    gt = GraphTransliterator.from_easyreading_dict(input_dict)
    assert gt.onmatch_rules[0].production == ","
    assert gt.tokens
    assert gt.rules
    assert gt.whitespace
    assert gt.whitespace.default
    assert gt.whitespace.token_class
    assert gt.whitespace.consolidate
    assert gt.metadata["author"] == "Author"
    assert isinstance(gt.graph, DirectedGraph)

    # round-trip the YAML through a temporary file
    yaml_file = tmpdir.join("yaml_test.yaml")
    yaml_filename = str(yaml_file)
    yaml_file.write(yaml_str)
    assert yaml_file.read() == yaml_str

    assert GraphTransliterator.from_yaml_file(yaml_filename)
    assert len(set(GraphTransliterator.from_easyreading_dict(input_dict).tokens)) == 4
    assert GraphTransliterator.from_yaml(yaml_str).transliterate("ab") == "A,B"
    assert (
        GraphTransliterator.from_yaml_file(yaml_filename).transliterate("ab") == "A,B"
    )
    # the same on-match behaviour from a hand-written settings dict
    assert (
        GraphTransliterator.from_easyreading_dict(
            {
                "tokens": {"a": ["class_a"], "b": ["class_b"], " ": ["wb"]},
                "onmatch_rules": [{"<class_a> + <class_b>": ","}],
                "whitespace": {
                    "default": " ",
                    "token_class": "wb",
                    "consolidate": True,
                },
                "rules": {"a": "A", "b": "B"},
            }
        ).transliterate("ab")
        == "A,B"
    )
def test_GraphTransliterator_ignore_errors():
    """Test the ``ignore_errors`` option, property and setter.

    If ignore_errors is not set, input with no matching transliteration rule
    raises NoMatchingTransliterationRuleException and input that cannot be
    tokenized raises UnrecognizableInputTokenException.

    Each ``pytest.raises`` block wraps only the call expected to raise, so an
    exception thrown while building the transliterator cannot satisfy it.
    """
    yaml_str = """
    tokens:
      a: [class1]
      b: [class1]
      ' ': [wb]
    rules:
      a a: B2
      b: B
    whitespace:
      default: ' '
      consolidate: true
      token_class: wb
    """
    # check that ignore_errors works
    assert (
        GraphTransliterator.from_yaml(yaml_str, ignore_errors=True).transliterate("a")
        == ""
    )

    # a single "a" has no matching rule (only "a a" is defined)
    gt = GraphTransliterator.from_yaml(yaml_str, ignore_errors=False)
    assert gt.ignore_errors is False
    with pytest.raises(NoMatchingTransliterationRuleException):
        gt.transliterate("a")

    # "!" is not a declared token
    gt = GraphTransliterator.from_yaml(yaml_str, ignore_errors=False)
    assert gt.ignore_errors is False
    with pytest.raises(UnrecognizableInputTokenException):
        gt.transliterate("!")

    # with ignore_errors the unrecognized character is skipped
    gt = GraphTransliterator.from_yaml(yaml_str, ignore_errors=True)
    assert gt.ignore_errors is True
    assert gt.tokenize("b!b") == [" ", "b", "b", " "]
    assert gt.transliterate("b!b") == "BB"

    gt = GraphTransliterator.from_yaml(yaml_str, ignore_errors=False)
    assert gt.ignore_errors is False
    with pytest.raises(UnrecognizableInputTokenException):
        gt.transliterate("b!")

    # test ignore_errors setter and property
    gt.ignore_errors = True
    assert gt.ignore_errors is True
    gt.ignore_errors = False
    assert gt.ignore_errors is False
def test_GraphTransliterator_types():
    """Test internal types."""
    # rule models
    rule = TransliterationRule(
        cost=1,
        production="A",
        tokens=["a"],
        prev_classes=None,
        prev_tokens=None,
        next_tokens=None,
        next_classes=None,
    )
    assert rule.cost == 1
    assert OnMatchRule(production=",", prev_classes=["class1"], next_classes=["class2"])
    assert WhitespaceRules(consolidate=False, default=" ", token_class="wb")

    # directed graph: starts empty, then holds two nodes joined by one edge
    dg = DirectedGraph()
    assert len(dg.node) == 0
    assert len(dg.edge) == 0
    for node_data in ({"type": "test1"}, {"type": "test2"}):
        dg.add_node(node_data)
    assert dg.node[0]["type"] == "test1"
    assert dg.node[1]["type"] == "test2"
    dg.add_edge(0, 1, {"type": "edge_test1"})
    assert dg.edge[0][1]["type"] == "edge_test1"
def test_GraphTransliterator_productions():
    """Test productions."""
    settings = {
        "tokens": {"ab": ["class_ab"], " ": ["wb"]},
        "rules": {"ab": "AB", " ": "_"},
        "whitespace": {"default": " ", "token_class": "wb", "consolidate": True},
    }
    transliterator = GraphTransliterator.from_easyreading_dict(settings)
    # compared as a set, so the order of productions is not checked
    assert set(transliterator.productions) == {"AB", "_"}
def test_GraphTransliterator_pruned_of():
    """Test ``pruned_of``, which is called with one production or a list."""
    gt = GraphTransliterator.from_yaml(
        """
        tokens:
          a: [class1]
          b: [class2]
          ' ': [wb]
        rules:
          a: A
          b: B
        whitespace:
          default: ' '
          consolidate: true
          token_class: wb
        """
    )
    assert len(gt.rules) == 2
    # pruning a single production leaves only the other rule
    assert len(gt.pruned_of("B").rules) == 1
    assert gt.pruned_of("B").rules[0].production == "A"
    assert gt.pruned_of(["A", "B"])  # if no rules present will still work
def test_GraphTransliterator_graph():
    """Test graph."""
    settings = {
        "tokens": {"ab": ["class_ab"], " ": ["wb"]},
        "rules": {"ab": "AB", " ": "_"},
        "whitespace": {"default": " ", "token_class": "wb", "consolidate": True},
    }
    gt = GraphTransliterator.from_easyreading_dict(settings)
    assert gt._graph
    # node 0 is expected to be the Start node
    assert gt._graph.node[0]["type"] == "Start"  # test for Start
    assert gt