/
transliterator.py
85 lines (69 loc) · 2.98 KB
/
transliterator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# -*- coding: utf-8 -*-
"""The algorithm / machine for transliteration.
The simplest case of transliteration is when each character of input corresponds
to a unique string of output. When you see 'e' output <e>, when you see 'y'
output <y>, etc. In this case the code would be rather simple too: set up a
"replacement" lookup table, and
t = ''.join(replacement[c] for c in s)
The wrinkle that prevents such an approach from working is that in our case, you
may not be able to output until seeing multiple characters of input. When you
see 'a', you need to wait and see whether the next character is 'i'. And on
seeing 'k', you still need to check whether the next character is 'h'.
This can be solved by ordering and applying the substitution rules in, say,
descending order of length (so that 'ai' occurs before 'a').
Another solution (which we use) is to create and run a state machine. When you
see 'e' when in the default state, output <e> and return to the default state,
but when you see 'k', go into a state where
- if the next character is 'h', output <kh> and go to the default state (having
consumed both characters)
- if the next character is anything else, output <k>, and go to the default
state (having consumed only the 'k').
So each "state" is a dict containing two values for every "key" (character): on
seeing that character, which state to go to, and how many characters to consume.
"""
from __future__ import absolute_import, division, print_function, unicode_literals
def MakeStateMachine(table):
"""Makes SM from a dict like {'a':'अ', 'A':'आ', 'ai':'ऐ', 'au':'औ'}."""
root = {}
for (key, value) in table.iteritems():
# Follow characters of 'key' down the table
where = root
for c in key:
where = where.setdefault(c, {})
where[''] = value
return root
def _LongestRead(state_machine, text):
"""Finds longest match from state machine at start of the text."""
where = state_machine
num_seen = 0
num_matched = 0
replacement = None
for c in text:
num_seen += 1
where = where.get(c)
if not where:
return (num_matched, replacement)
if '' in where: # If there's a key that terminates here
num_matched = num_seen
replacement = where['']
return (num_matched, replacement)
def Transliterate(state_machine, text, pass_through=None):
"""Transliterates text using the state machine."""
transliterated = ''
unparsed_characters = set()
num_parsed = 0
while num_parsed < len(text):
# TODO(shreevatsa): Is text[num_parsed:] doing many copies for long text?
(num_matched, replacement) = _LongestRead(state_machine, text[num_parsed:])
if num_matched > 0:
transliterated += replacement
num_parsed += num_matched
else:
# Couldn't match anything; strip one char and retry.
char = text[num_parsed]
if pass_through and char in pass_through:
transliterated += char
else:
unparsed_characters.add(char)
num_parsed += 1
return (transliterated, unparsed_characters)