In [None]:
import marshmallow # requires 3.0

In [None]:
from marshmallow import Schema, fields, pprint, validates_schema, ValidationError

In [None]:
marshmallow.Schema

In [None]:
import re
RULE_RE = re.compile(
    r"(?:\(((?:\s?<.+?>\s+)+)?(.+?)\s?\) |((?:\s?<.+?>\s+)+)?)?"
    r"((?:\n|\r|.)+?)"
    r"(?:"
    r"\s+(?:\((.+?)((?:\s+<.+?>?)?)\)$)"
    r"|"
    r"((?:\s+<.+?>)+)"
    r"|"
    r"$)"
)

ONMATCH_RE = re.compile(
    r"^(" r"(?:<[^+< \s]+>\s*)+" r")" r"\+" r"(" r"(?:\s*<[^+<]+>)+)\s*$"
)

ONMATCH_CLASS_RE = re.compile(r"(?<=<)[^+< \s]+(?=>)")


from graphtransliterator import WhitespaceRules

from marshmallow import Schema, fields, pprint, post_load, validate

class WhitespaceSettingsSchema(Schema):
    default = fields.Str(required=True)
    token_class = fields.Str(required=True)
    consolidate = fields.Boolean(required=True)


#class EasyReadingRuleSchema(Schema):
    
class EasyReadingSettingsSchema(Schema):
    tokens = fields.Dict(
        keys=fields.Str(),
        values=fields.List(fields.Str()),
        required=True,
    )
    rules = fields.Dict(
        keys=fields.Str(validate=validate.Regexp(RULE_RE)),
        values=fields.Str(),
        required=True
    )
    onmatch_rules = fields.Dict(
        keys=fields.Str(validate=validate.Regexp(ONMATCH_RE)),
        values=fields.Str(),
        required=False
    )
    metadata = fields.Dict(
        keys=fields.Str(),
        # no restriction on values
        required=False
    )
    whitespace = fields.Nested(WhitespaceSettingsSchema)
    
class UserSchema(Schema):
    name = fields.Str()
    email = fields.Email()
    created_at = fields.DateTime()
    
EASYREADING_SETTINGS_SCHEMA = {
    "rules": {
        "type": "dict",
        "required": True,
        "keysrules": {"type": "string", "regex": RULE_RE.pattern},
        "valuesrules": {"type": "string"},
    },
    "onmatch_rules": {
        "type": "list",
        "required": False,
        "schema": {
            "type": "dict",
            "minlength": 1,
            "maxlength": 1,
            "keysrules": {"type": "string", "regex": ONMATCH_RE.pattern},
            "valuesrules": {"type": "string"},
        },
    },
    "tokens": {
        "type": "dict",
        "required": True,
        "keysrules": {"type": "string"},
        "valuesrules": {"type": "list", "schema": {"type": "string"}},
    },
    "whitespace": {
        "type": "dict",
        "required": True,
        "schema": {
            "default": {"required": True, "type": "string"},
            "token_class": {"required": True, "type": "string"},
            "consolidate": {"required": True, "type": "boolean"},
        },
    },
    "metadata": {"type": "dict", "required": False},
}


In [None]:
import graphtransliterator
import yaml
yaml_ = """
   tokens:
     a: [vowel]               # type of token ("a") and its class (vowel)
     bb: [consonant, b_class] # type of token ("bb") and its classes (consonant, b_class)
     ' ': [wb]                # type of token (" ") and its class ("wb", for wordbreak)
   rules:
     a: A       # transliterate "a" to "A"
     bb: B      # transliterate "bb" to "B"
     a a: <2AS> # transliterate ("a", "a") to "<2AS>"
     ' ': ' '   # transliterate ' ' to ' '
   whitespace:
     default: " "        # default whitespace token
     consolidate: false  # whitespace should not be consolidated
     token_class: wb     # whitespace token class
 """
data = yaml.safe_load(yaml_)
EasyReadingSettingsSchema().load(data)#['whitespace'])

In [None]:
    yaml_str = r"""
    tokens:
      a: [token, class1]
      b: [token, class2]
      u: [token]
      ' ': [wb]
    rules:
      a: A
      b: B
      <wb> u: \N{DEVANAGARI LETTER U}
    onmatch_rules:
      -
        <class1> + <class2>: ","
      -
        <class1> + <token>: \N{DEVANAGARI SIGN VIRAMA}
    whitespace:
      default: ' '
      token_class: 'wb'
      consolidate: true
    metadata:
      author: Author
    """

    input_dict = yaml.safe_load(yaml_str)
    assert "a" in GraphTransliterator.from_dict(input_dict).tokens.keys()

In [None]:

EASYREADING_SETTINGS_SCHEMA = {
    "rules": {
        "type": "dict",
        "required": True,
        "keysrules": {"type": "string", "regex": RULE_RE.pattern},
        "valuesrules": {"type": "string"},
    },
    "onmatch_rules": {
        "type": "list",
        "required": False,
        "schema": {
            "type": "dict",
            "minlength": 1,
            "maxlength": 1,
            "keysrules": {"type": "string", "regex": ONMATCH_RE.pattern},
            "valuesrules": {"type": "string"},
        },
    },
    "tokens": {
        "type": "dict",
        "required": True,
        "keysrules": {"type": "string"},
        "valuesrules": {"type": "list", "schema": {"type": "string"}},
    },
    "whitespace": {
        "type": "dict",
        "required": True,
        "schema": {
            "default": {"required": True, "type": "string"},
            "token_class": {"required": True, "type": "string"},
            "consolidate": {"required": True, "type": "boolean"},
        },
    },
    "metadata": {"type": "dict", "required": False},
}



In [None]:
    validator = Validator()

    tokens_schema = {
        "tokens": {
            "keysrules": {"type": "string"},
            "type": "dict",
            "valuesrules": {"schema": {"type": "string"}, "type": "list"},
        }
    }

    validator.validate({"tokens": tokens}, tokens_schema)

    if validator.errors:
        raise ValueError(
            "GraphTransliterator `tokens` contains invalid entries:\n %s"
            % validator.errors
        )

    token_keys = list(tokens.keys())
    token_classes = list(set().union(*tokens.values()))

    rules_schema = {
        "type": "list",
        "schema": {
            "type": "dict",
            "schema": {
                "tokens": {"required": True, "type": "list", "allowed": token_keys},
                "prev_classes": {
                    "required": False,
                    "type": "list",
                    "allowed": token_classes,
                },
                "prev_tokens": {
                    "required": False,
                    "type": "list",
                    "allowed": token_keys,
                },
                "next_tokens": {
                    "required": False,
                    "type": "list",
                    "allowed": token_keys,
                },
                "next_classes": {
                    "required": False,
                    "type": "list",
                    "allowed": token_classes,
                },
                "production": {"required": True, "type": "string"},
            },
        },
    }

    onmatch_rules_schema = {
        "type": "list",
        "required": False,
        "schema": {
            "type": "dict",
            "schema": {
                "prev_classes": {"type": "list", "schema": {"allowed": token_classes}},
                "production": {"type": "string"},
                "next_classes": {"type": "list", "schema": {"allowed": token_classes}},
            },
        },
    }

    whitespace_schema = {
        "type": "dict",
        "required": True,
        "schema": {
            "default": {"type": "string", "allowed": token_keys},
            "token_class": {"type": "string", "allowed": token_classes},
            "consolidate": {"type": "boolean"},
        },
    }

    metadata_schema = {"type": "dict", "required": False}

    schemas = {
        "whitespace": whitespace_schema,
        "onmatch_rules": onmatch_rules_schema,
        "rules": rules_schema,
        "metadata": metadata_schema,
    }

    document = {
        "whitespace": whitespace,
        "rules": rules,
        "onmatch_rules": onmatch_rules,
        "metadata": metadata,
    }  # Cerberus needs a dict

    validator.validate(document, schemas)

    if validator.errors:
        raise ValueError(
            "GraphTransliterator settings contain invalid entries:\n%s"
            % validator.errors
        )


In [2]:
import importlib
import graphtransliterator
importlib.reload(graphtransliterator)
bad_yaml = """
      tokens:
        a: class1
        ' ': wb\
      rules:
        a: A
      whitespace:
        default: BAD_TOKEN
        consolidate: true
        token_class: BAD_CLASS
    """
from graphtransliterator import GraphTransliterator
GraphTransliterator.from_yaml(bad_yaml)

ScannerError: mapping values are not allowed here
  in "<unicode string>", line 4, column 27:
            ' ': wb      rules:
                              ^

In [None]:
bad_yaml = """
          tokens:
            a: [class1]
            ' ': [wb]
          rules:
            a <class_nonexisting>: A
          whitespace:
            default: ' '
            consolidate: true
            token_class: wb
    """
import importlib
import graphtransliterator
importlib.reload(graphtransliterator)
graphtransliterator.GraphTransliterator.from_yaml(bad_yaml)
import yaml
x = graphtransliterator.process._process_easyreading_settings(graphtransliterator.validate.EasyReadingSettingsSchema().load(yaml.safe_load(bad_yaml)))
graphtransliterator.validate.SettingsSchema().load(x)

In [None]:
import importlib
import graphtransliterator
importlib.reload(graphtransliterator)
#from graphtransliterator import GraphTransliterator
YAML = r"""
tokens:
    a: [class_a]
    b: [class_b]
    c: [class_c]
    " ": [wb]
    d: []
    Aa: [contrained_rule]
rules:
    a: A
    b: B
    <class_c> <class_c> a: A(AFTER_CLASS_C_AND_CLASS_C)
    (<class_c> b) a: A(AFTER_B_AND_CLASS_C)
    (<class_c> b b) a a: AA(AFTER_BB_AND_CLASS_C)
    a <class_c>: A(BEFORE_CLASS_C)
    a b (c <class_b>): AB(BEFORE_C_AND_CLASS_B)
    c: C
    c c: C*2
    a (b b b): A(BEFORE_B_B_B)
    d (c <class_a>): D(BEFORE_C_AND_CLASS_A)
    (b b) a: A(AFTER_B_B)
    <wb> Aa: A(ONLY_A_CONSTRAINED_RULE)
onmatch_rules:
    -
        <class_a> <class_b> + <class_a> <class_b>: "!"
    -
        <class_a> + <class_b>: ","
whitespace:
    default: ' '
    consolidate: True
    token_class: wb
"""
gt = graphtransliterator.GraphTransliterator.from_yaml(YAML)
gt

In [None]:
class RuleSchema(Schema):
    production = fields.Str()
    tokens = fields.List(fields.Str())

class OnMatchRuleSchema(Schema):
    prev_classes = fields.List(fields.Str())
    next_classes = fields.List(fields.Str())
    production = fields.Str()
    
class SettingsSchema(Schema):
    tokens = fields.Dict(
        keys=fields.Str(),
        values=fields.List(fields.Str()),
        required=True,
    )
    rules = fields.Nested(RuleSchema, many=True, required=True)
    whitespace = fields.Nested(WhitespaceSettingsSchema, many=False, required=True)
    metadata = fields.Dict(
        keys=fields.Str(), # no restriction on values
        required=False
    )
    onmatch_rules = fields.Nested(OnMatchRuleSchema, many=True, required=False)
    whitespace = fields.Nested(WhitespaceSettingsSchema, required=True)

    from collections import defaultdict
    
    @validates_schema
    def validate_token_classes(self, data, **kwargs):
        errors = defaultdict(list)
        token_classes = list(set().union(*data['tokens'].values()))

        # validate onmatch_rules
        for onmatch_rule in data['onmatch_rules']:
            for _ in onmatch_rule['prev_classes']:
                if not _ in token_classes:
                    errors['onmatch_rules'].append('Invalid token class "{}" in prev_classes of OnMatchRule {}'.format(_, onmatch_rule))
            for _ in onmatch_rule['next_classes']:
                if not _ in token_classes:
                    errors['onmatch_rules'].append('Invalid token class "{}" in next_classes of OnMatchRule {}'.format(_, onmatch_rule))
        # validate whitespace token_class
        
        whitespace = data['whitespace']
        whitespace_token_class = whitespace['token_class']
        if not whitespace_token_class in token_classes:
            errors['whitespace'].append('Invalid token class "{}" in whitespace.'.format(whitespace_token_class, whitespace))
        if errors:
            raise ValidationError(dict(errors))


    @validates_schema
    def validate_tokens(self, data, **kwargs):
        errors = defaultdict(list)
        token_types = data['tokens'].keys()
        
        # validate whitespace
        whitespace = data['whitespace']
        default_whitespace = whitespace['default']
        if default_whitespace not in token_types:
            errors['whitespace'].append('Invalid default token "{}" in whitespace.'.format(default_whitespace))

        # validate rules
        rules = data['rules']
        for rule in rules:
            for _ in rule['tokens']:
                if not _ in token_types:
                    errors['rules'].append('Invalid token "{}" in rule {}'.format(_, rule))
        if errors:
            raise ValidationError(dict(errors))

settings = {
'tokens': {'a': ['vowel'],
           ' ': ['wb']},
'rules': [
{'production': 'A', 'tokens': ['a']},
 {'production': ' ', 'tokens': [' ']}],
  'onmatch_rules': [
       {'prev_classes': ['vozwel'],
        'next_classes': ['vowel'],
        'production': ','}],
   'whitespace': {
       'default': ' x',
       'consolidate': False,
       'token_class': 'wb'},
   'metadata': {
       'author': 'Author McAuthorson'}
 }

x=SettingsSchema().load(settings)
x

In [None]:
SettingsSchema.validate

In [None]:
token_classes = list(set().union(*data['tokens'].values()))

In [None]:
token_classes

In [None]:
from collections import defaultdict

In [None]:
defaultdict(list)