In [1]:
from graphtransliterator import GraphTransliterator

In [2]:
# Configuration in YAML: token definitions (each with its classes),
# transliteration rules, and whitespace settings.
yaml_ = """
  tokens:
    a: [vowel]               # type of token ("a") and its class (vowel)
    bb: [consonant, b_class] # type of token ("bb") and its classes (consonant, b_class)
    ' ': [wb]                # type of token (" ") and its class ("wb", for wordbreak)
  rules:
    a: A       # transliterate "a" to "A"
    bb: B      # transliterate "bb" to "B"
    a a: <2AS> # transliterate ("a", "a") to "<2AS>"
    ' ': ' '   # transliterate ' ' to ' '
  whitespace:
    default: " "        # default whitespace token
    consolidate: false  # whitespace should not be consolidated
    token_class: wb     # whitespace token class
"""
# Build the transliterator from the YAML string, then try the single token "a".
gt_one = GraphTransliterator.from_yaml(yaml_)
gt_one.transliterate('a')

'A'

In [3]:
# "bb" is declared as a single token, so the "bb" rule produces "B".
gt_one.transliterate('bb')

'B'

In [4]:
# The leading "aa" is consumed by the two-token rule "a a" rather than by two
# separate "a" matches; "bb" then yields "B".
gt_one.transliterate('aabb')

'<2AS>B'

In [5]:
# tokenize() splits the input into declared tokens; per the output, a
# whitespace token is added at the start and at the end.
gt_one.tokenize('abba')

[' ', 'a', 'bb', 'a', ' ']

In [6]:
# Two overlapping tokens, "a" and "aa", each with its own rule.
yaml_ = """
  tokens:
    a: []      # "a" token with no classes
    aa: []     # "aa" token with no classes
    ' ': [wb]  # " " token and its class ("wb", for wordbreak)
  rules:
    aa: <DOUBLE_A>  # transliterate "aa" to "<DOUBLE_A>"
    a: <SINGLE_A>   # transliterate "a" to "<SINGLE_A>"
  whitespace:
    default: " "        # default whitespace token
    consolidate: false  # whitespace should not be consolidated
    token_class: wb     # whitespace token class
"""
# A lone "a" is transliterated by the "a" rule.
gt_two = GraphTransliterator.from_yaml(yaml_)
gt_two.transliterate('a')

'<SINGLE_A>'

In [7]:
# "aa" is itself a token here, so it is handled by the "aa" rule rather than
# as two "a" tokens.
gt_two.transliterate('aa')

'<DOUBLE_A>'

In [8]:
# Per the output, "aaa" is handled as "aa" followed by "a".
gt_two.transliterate('aaa')

'<DOUBLE_A><SINGLE_A>'

In [9]:
# Mapping of each token to its set of classes; tokens declared with [] have an
# empty set.
gt_two.tokens

{'a': set(), 'aa': set(), ' ': {'wb'}}

In [10]:
# Rules may carry context: "<class_of_c> a b" applies to the tokens "a b" only
# when the preceding token has the class class_of_c.
yaml_ = """
  tokens:
    a: []
    b: []
    c: [class_of_c]
    ' ': [wb]
  rules:
    a: <<A>>
    a b: <<AB>>
    b: <<B>>
    c: <<C>>
    ' ': _
    <class_of_c> a b: <<AB_after_C>>
  whitespace:
    default: " "
    consolidate: false
    token_class: wb
"""
gt_three = GraphTransliterator.from_yaml(yaml_)
gt_three.transliterate("ab")  # no "c" precedes, so the plain "a b" rule matches

'<<AB>>'

In [11]:
gt_three.transliterate("cab") # "c" matches rule "c"; "ab" then matches "<class_of_c> a b" because it follows a class_of_c token

'<<C>><<AB_after_C>>'

In [12]:
# Inspect the compiled rules. In the output they appear in ascending cost
# order, with the context-constrained rule first.
gt_three.rules

[TransliterationRule(production='<<AB_after_C>>', prev_classes=['class_of_c'], prev_tokens=None, tokens=['a', 'b'], next_tokens=None, next_classes=None, cost=0.32192809488736235),
 TransliterationRule(production='<<AB>>', prev_classes=None, prev_tokens=None, tokens=['a', 'b'], next_tokens=None, next_classes=None, cost=0.41503749927884376),
 TransliterationRule(production='<<A>>', prev_classes=None, prev_tokens=None, tokens=['a'], next_tokens=None, next_classes=None, cost=0.5849625007211562),
 TransliterationRule(production='<<B>>', prev_classes=None, prev_tokens=None, tokens=['b'], next_tokens=None, next_classes=None, cost=0.5849625007211562),
 TransliterationRule(production='<<C>>', prev_classes=None, prev_tokens=None, tokens=['c'], next_tokens=None, next_classes=None, cost=0.5849625007211562),
 TransliterationRule(production='_', prev_classes=None, prev_tokens=None, tokens=[' '], next_tokens=None, next_classes=None, cost=0.5849625007211562)]

In [13]:
# Rules can require a whitespace-class (wb) token before and/or after "a".
yaml_ = """
  tokens:
    a: []
    ' ': [wb]
  rules:
    <wb> a: _A
    a <wb>: A_
    <wb> a <wb>: _A_
    a: a
    ' ': ' '
  whitespace:
    default: " "        # default whitespace token
    consolidate: true   # whitespace should be consolidated
    token_class: wb     # whitespace token class
"""
gt = GraphTransliterator.from_yaml(yaml_)
gt.transliterate('a')   # implicit whitespace at both start and end, so "<wb> a <wb>" matches

'_A_'

In [14]:
gt.transliterate('aa')  # first "a" follows the start whitespace ("_A"), second precedes the end whitespace ("A_")

'_AA_'

In [15]:
gt.transliterate(' a')  # consolidate removes whitespace at start of string, so the result is the same as for 'a'

'_A_'

In [16]:
gt.transliterate('a ')  # consolidate removes whitespace at end of string, so the result is the same as for 'a'

'_A_'

In [17]:
# Show the whitespace settings as parsed from the YAML configuration.
gt.whitespace

WhitespaceRules(default=' ', token_class='wb', consolidate=True)

In [18]:
# onmatch_rules add output between matches: here a "," is produced when a
# vowel-class token is followed by another vowel-class token.
yaml_ = """
  tokens:
    a: [vowel]
    ' ': [wb]
  rules:
    a: A
    ' ': ' '
  whitespace:
    default: " "
    consolidate: false
    token_class: wb
  onmatch_rules:
    - <vowel> + <vowel>: ',' # add a comma between vowels
 """
gt = GraphTransliterator.from_yaml(yaml_)
gt.transliterate('aa')

'A,A'

In [19]:
# The "<vowel> + <vowel>" entry is stored as prev_classes / next_classes plus
# its production.
gt.onmatch_rules

[OnMatchRule(prev_classes=['vowel'], next_classes=['vowel'], production=',')]

In [20]:
# A metadata section can be added alongside tokens, rules and whitespace
# (the earlier examples work without one).
yaml_ = """
  tokens:
    a: []
    ' ': [wb]
  rules:
    a: A
    ' ': ' '
  whitespace:
    default: " "
    consolidate: false
    token_class: wb
  metadata:
    author: Author McAuthorson
    version: 0.1.1
    description: A sample Graph Transliterator
  """
gt = GraphTransliterator.from_yaml(yaml_)
# Per the output, metadata comes back as a dict (version is the string '0.1.1').
gt.metadata

{'author': 'Author McAuthorson',
 'version': '0.1.1',
 'description': 'A sample Graph Transliterator'}

In [21]:
# Productions written with Unicode escapes (by character name and by code point).
# NOTE(review): yaml_ is a regular (non-raw) Python string, so Python itself
# expands \N{LATIN CAPITAL LETTER B} and \u0043 to "B" and "C" before
# from_yaml() sees the text. Whether the library also decodes such escapes when
# they reach it unexpanded (e.g. read from a file) is not shown here - confirm.
yaml_ = """
  tokens:
    b: []
    c: []
    ' ': [wb]
  rules:
    b: \N{LATIN CAPITAL LETTER B}
    c: \u0043    # hexadecimal Unicode character code for 'C'
    ' ': ' '
  whitespace:
    default: " "
    consolidate: false
    token_class: wb
  """
gt = GraphTransliterator.from_yaml(yaml_)
gt.transliterate('b')

'B'

In [22]:
# "c" maps to the character written as \u0043 in the rules, i.e. "C".
gt.transliterate('c')

'C'

In [23]:
# The same kind of configuration as the YAML examples, supplied directly as
# Python data structures: tokens with classes, rules, on-match rules,
# whitespace settings and metadata.
settings = dict(
    tokens={'a': ['vowel'], ' ': ['wb']},
    rules=[
        dict(production='A', tokens=['a']),
        dict(production=' ', tokens=[' ']),
    ],
    onmatch_rules=[
        dict(prev_classes=['vowel'], next_classes=['vowel'], production=','),
    ],
    whitespace=dict(default=' ', consolidate=False, token_class='wb'),
    metadata=dict(author='Author McAuthorson'),
)
# Build the transliterator from the dict instead of from a YAML string.
gt = GraphTransliterator.from_dict(settings)
gt.transliterate('a')

'A'

In [24]:
# Switch to minimal exception reporting so that the error-demonstration cells
# that follow show only the exception line.
%xmode Minimal

Exception reporting mode: Minimal


In [25]:
# Expected to fail: from_yaml() raises AmbiguousTransliterationRulesException,
# and the output lists the two pairs of conflicting rules.
yaml_ = """
tokens:
  a: [class1, class2]
  b: []
  ' ': [wb]
rules:
  <class1> a: A
  <class2> a: AA # ambiguous rule
  <class1> b: BB
  b <class2>: BB # also ambiguous
whitespace:
  default: ' '
  consolidate: True
  token_class: wb
"""
gt = GraphTransliterator.from_yaml(yaml_)

  <class1> a
  <class2> a



  <class1> b
  b <class2>



AmbiguousTransliterationRulesException: 

In [26]:
# The ' ' rule maps the whitespace token to '_', so the space in "a a" comes
# out as '_'.
GraphTransliterator.from_yaml(
'''
tokens:
  a: []
  ' ': [wb]
rules:
  a: A
  ' ': '_'
whitespace:
  default: ' '
  consolidate: True
  token_class: wb
''').transliterate("a a")

'A_A'

In [27]:
# Expected to fail: the input "a!a" contains "!", which is not a declared
# token. With the default settings (ignore_errors is not enabled),
# transliterate() raises UnrecognizableInputTokenException.
yaml_ = """
  tokens:
   a: []
   ' ': [wb]
  rules:
    a: A
    ' ': ' '
  whitespace:
    default: " "
    consolidate: true
    token_class: wb
"""
GraphTransliterator.from_yaml(yaml_).transliterate("a!a") # ignore_errors=False



UnrecognizableInputTokenException: 

In [28]:
GraphTransliterator.from_yaml(yaml_, ignore_errors=True).transliterate("a!a") # ignore_errors=True: the unrecognized "!" is dropped instead of raising



'AA'

In [29]:
# Expected to fail: transliterate("ab") raises
# NoMatchingTransliterationRuleException. The only rule for "b" is "b (a)".
# NOTE(review): "(a)" presumably requires a following "a" token, which the
# final "b" of "ab" does not have - confirm against the rule syntax reference.
yaml_='''
  tokens:
    a: []
    b: []
    ' ': [wb]
  rules:
    a: A
    b (a): B
  whitespace:
    default: ' '
    token_class: wb
    consolidate: False
'''
gt = GraphTransliterator.from_yaml(yaml_)
gt.transliterate("ab")



NoMatchingTransliterationRuleException: 

In [30]:
# ignore_errors can also be switched on for an existing instance; per the
# output, the unmatched "b" is then left out of the result.
gt.ignore_errors = True
gt.transliterate("ab")



'A'

In [31]:
# Transliterator with a one-token rule ("a") and a two-token rule ("a a"),
# used to explore rule matching on a tokenized input.
# NOTE(review): "a a" is declared under tokens as well as under rules; the
# tokenize() result only contains "a" and " " tokens - confirm the token entry
# is intentional.
gt = GraphTransliterator.from_yaml('''
        tokens:
            a: []
            a a: []
            ' ': [wb]
        rules:
            a: <A>
            a a: <AA>
        whitespace:
            default: ' '
            consolidate: True
            token_class: wb
''')
tokens = gt.tokenize("aa")
tokens # whitespace added to ends

[' ', 'a', 'a', ' ']

In [32]:
gt.match_at(1, tokens) # returns the index (into gt.rules) of the rule matched at token position 1

0

In [33]:
gt.rules[gt.match_at(1, tokens)] # the actual rule stored at that index

TransliterationRule(production='<AA>', prev_classes=None, prev_tokens=None, tokens=['a', 'a'], next_tokens=None, next_classes=None, cost=0.41503749927884376)

In [34]:
gt.match_at(1, tokens, match_all=True) # with match_all=True, the indexes of all rules matching at position 1 are returned

[0, 1]

In [35]:
# Resolve each matching rule index (match_all=True) to its rule object.
[gt.rules[rule_index] for rule_index in gt.match_at(1, tokens, match_all=True)] # actual rules, with match_all

[TransliterationRule(production='<AA>', prev_classes=None, prev_tokens=None, tokens=['a', 'a'], next_tokens=None, next_classes=None, cost=0.41503749927884376),
 TransliterationRule(production='<A>', prev_classes=None, prev_tokens=None, tokens=['a'], next_tokens=None, next_classes=None, cost=0.5849625007211562)]

In [36]:
# The first two "a" tokens match rule "a a"; the remaining "a" matches rule "a".
gt.transliterate("aaa")

'<AA><A>'

In [37]:
# Per the output, these are the rules applied by the preceding
# transliterate("aaa") call, in order.
gt.last_matched_rules

[TransliterationRule(production='<AA>', prev_classes=None, prev_tokens=None, tokens=['a', 'a'], next_tokens=None, next_classes=None, cost=0.41503749927884376),
 TransliterationRule(production='<A>', prev_classes=None, prev_tokens=None, tokens=['a'], next_tokens=None, next_classes=None, cost=0.5849625007211562)]

In [38]:
# Tokens consumed by each rule matched in the last transliterate() call.
gt.last_matched_rule_tokens

[['a', 'a'], ['a']]

In [39]:
# The full rule list of this transliterator.
gt.rules

[TransliterationRule(production='<AA>', prev_classes=None, prev_tokens=None, tokens=['a', 'a'], next_tokens=None, next_classes=None, cost=0.41503749927884376),
 TransliterationRule(production='<A>', prev_classes=None, prev_tokens=None, tokens=['a'], next_tokens=None, next_classes=None, cost=0.5849625007211562)]

In [40]:
# The result of pruned_of('<AA>') no longer contains the rule whose production
# is '<AA>'.
gt.pruned_of('<AA>').rules

[TransliterationRule(production='<A>', prev_classes=None, prev_tokens=None, tokens=['a'], next_tokens=None, next_classes=None, cost=0.5849625007211562)]

In [41]:
# A list of productions can be passed as well; here every rule is removed.
gt.pruned_of(['<A>', '<AA>']).rules

[]

In [42]:
# Build a small transliterator and look at its graph object (gt.graph).
# The repr shown in the output includes a memory address, which differs
# between runs.
gt = GraphTransliterator.from_yaml(
    """
    tokens:
      a: []
      ' ': [wb]
    rules:
      a: b
      <wb> a: B
      ' ': ' '
    whitespace:
      token_class: wb
      default: ' '
      consolidate: false
    """)
gt.graph

<graphtransliterator.graphs.DirectedGraph at 0x10a0f6a00>

In [43]:
# List of node dicts; a node's position in the list is the id referenced by
# ordered_children and by the edges.
gt.graph.node

[{'type': 'Start', 'ordered_children': {'a': [1], ' ': [4]}},
 {'type': 'token', 'token': 'a', 'ordered_children': {'__rules__': [2, 3]}},
 {'type': 'rule', 'rule_key': 0, 'accepting': True, 'ordered_children': {}},
 {'type': 'rule', 'rule_key': 1, 'accepting': True, 'ordered_children': {}},
 {'type': 'token', 'token': ' ', 'ordered_children': {'__rules__': [5]}},
 {'type': 'rule', 'rule_key': 2, 'accepting': True, 'ordered_children': {}}]

In [44]:
# Edges as a nested dict: edge[from_node][to_node] -> edge attributes
# (cost, and where present token or constraints).
gt.graph.edge

{0: {1: {'token': 'a', 'cost': 0.41503749927884376},
  4: {'token': ' ', 'cost': 0.5849625007211562}},
 1: {2: {'cost': 0.41503749927884376, 'constraints': {'prev_classes': ['wb']}},
  3: {'cost': 0.5849625007211562}},
 4: {5: {'cost': 0.5849625007211562}}}

In [45]:
# Edge from the Start node (0) to the "a" token node (1).
gt.graph.edge[0][1]

{'token': 'a', 'cost': 0.41503749927884376}

In [46]:
# Edge from the "a" token node (1) to rule node 2; it carries the
# prev_classes constraint ['wb'].
gt.graph.edge[1][2]

{'cost': 0.41503749927884376, 'constraints': {'prev_classes': ['wb']}}

In [47]:
# All edges as (from_node, to_node) pairs.
gt.graph.edge_list

[(0, 1), (1, 2), (1, 3), (0, 4), (4, 5)]

In [48]:
# Node 0 is the Start node; ordered_children maps each token to its child
# node ids.
gt.graph.node[0]

{'type': 'Start', 'ordered_children': {'a': [1], ' ': [4]}}

In [49]:
# Node 1 is the token node for "a"; its '__rules__' children are rule nodes
# 2 and 3.
gt.graph.node[1]

{'type': 'token', 'token': 'a', 'ordered_children': {'__rules__': [2, 3]}}