In [18]:
import re
from pregex.core.classes import AnyFrom
from pregex.core.quantifiers import AtLeastAtMost, OneOrMore
from pregex.core.operators import Either
from pregex.core.groups import Capture
from pregex.core.pre import Pregex


In [24]:
# Vowel inventory: each name is a pregex pattern that later cells compose.
SIMPLE_VOWELS = AnyFrom("a", "e", "i", "o", "u")
STRESSED_VOWELS = AnyFrom("á", "é", "í", "ó", "ú")
# A long vowel is a simple or stressed vowel followed by the length mark "ː".
LONG_VOWELS = Either(SIMPLE_VOWELS + Pregex("ː"), STRESSED_VOWELS + Pregex("ː"))
# Regex alternation picks the FIRST alternative that matches, not the longest.
# LONG_VOWELS must therefore come before the bare vowel classes; otherwise a
# standalone match on "aː" stops after "a" and the length mark is left behind.
ANY_SINGLE_VOWEL = Either(LONG_VOWELS, SIMPLE_VOWELS, STRESSED_VOWELS)
# A run of two or three vowels. The name keeps its existing spelling because
# other cells in this notebook refer to it.
DIPTHONGS = AtLeastAtMost(ANY_SINGLE_VOWEL, 2, 3)


# --- Consonant inventory ---------------------------------------------------
# Single-symbol classes are spelled as one string unpacked into AnyFrom, so
# each character of the string becomes one member of the class.

# Laryngeals: "h" followed by one of the subscript indices.
LARYNGEALS = Pregex("h") + AnyFrom(*"₁₂₃")

# Sonorants.
RHOTICS = AnyFrom(*"rɹɾ")
LIQUIDS = AnyFrom(*"lɭʎɫ")
NASALS = AnyFrom(*"mnŋ")
GLIDES = AnyFrom(*"yw")

# Obstruents.
SIBILANTS = AnyFrom(*"szʃʒ")
STOPS = AnyFrom(*"pbtdkgq")
FRICATIVES = AnyFrom(*"fvθðxɬɮ")
# Affricates are two-symbol sequences, so they are alternatives rather than
# members of a character class.
AFFRICATES = Either(
    "ts", "dz",
    "tʃ", "dʒ",
    "tɬ", "dɮ",
    "tç", "dʝ",
)

# Stops carrying a secondary-articulation mark: the stop, then the modifier.
PALATALIZED = STOPS + Pregex("ʲ")
VELARIZED = STOPS + Pregex("ˠ")
ASPIRATED = STOPS + Pregex("ʰ")
LABIALIZED = STOPS + Pregex("ʷ")

# One capture group per sound class, so a match reports which class each
# segment belongs to.
#
# Alternation order matters: the regex engine takes the first alternative that
# matches, not the longest. Multi-symbol alternatives are therefore listed
# before the single-symbol ones they start with.

# Vowel runs (DIPTHONGS) before a lone vowel, otherwise only the first vowel
# of a run is captured.
VOWEL_LIKE = Capture(Either(DIPTHONGS, ANY_SINGLE_VOWEL))
SIBILANT = Capture(SIBILANTS)
GLIDE = Capture(GLIDES)
NASAL = Capture(NASALS)
LIQUID = Capture(LIQUIDS)
LARYNGEAL = Capture(LARYNGEALS)
RHOTIC = Capture(RHOTICS)
# Modified stops and affricates before plain STOPS: "tʰ" and "ts" both begin
# with a plain stop and would otherwise be captured as just "t".
CONSONANT = Capture(
    Either(
        VELARIZED,
        PALATALIZED,
        ASPIRATED,
        LABIALIZED,
        AFFRICATES,
        STOPS,
        FRICATIVES,
    )
)
# Presumably a literal "." (relies on Pregex escaping its string argument by
# default) -- TODO confirm against the installed pregex version.
SYLLABLE_SEPARATOR = Pregex(".")
# A syllable is any non-empty sequence of segments from the classes above.
SYLLABLE = OneOrMore(
    Either(VOWEL_LIKE, SIBILANT, GLIDE, NASAL, LIQUID, LARYNGEAL, RHOTIC, CONSONANT)
)
# NOTE(review): at least one separator is required here, so a word made of a
# single syllable does not match WORD. Confirm that this is intended.
WORD = SYLLABLE + OneOrMore(SYLLABLE_SEPARATOR + SYLLABLE)

[]

In [15]:
# Compiled stdlib regexes, one per sound class, keyed by the class name.
# Each pattern is a single named capture group whose name is the lower-cased
# key (e.g. "VOWEL_LIKE" -> group "vowel_like").
#
# Alternation order matters: the regex engine takes the first alternative that
# matches, not the longest. Vowel runs are therefore tried before a lone vowel,
# and modified stops / affricates before plain STOPS, so that "ts" is not
# matched as just "t".
regexes = {
    label: re.compile(Capture(pattern, name=label.lower()).get_pattern())
    for label, pattern in {
        "VOWEL_LIKE": Either(DIPTHONGS, ANY_SINGLE_VOWEL),
        "SIBILANT": SIBILANTS,
        "GLIDE": GLIDES,
        "NASAL": NASALS,
        "LIQUID": LIQUIDS,
        "LARYNGEAL": LARYNGEALS,
        "RHOTIC": RHOTICS,
        "CONSONANT": Either(
            VELARIZED,
            PALATALIZED,
            ASPIRATED,
            LABIALIZED,
            AFFRICATES,
            STOPS,
            FRICATIVES,
        ),
    }.items()
}

In [16]:
# Sanity check: match the CONSONANT pattern at the start of a sample word.
# ASPIRATED (stop + "ʰ") is listed ahead of plain STOPS in that pattern, so the
# match covers the whole aspirated stop "pʰ" rather than only "p".
regexes["CONSONANT"].match("pʰaltser")

<re.Match object; span=(0, 2), match='pʰ'>