In [1]:
import pandas as pd
from io import StringIO
from syllable_analysis import *

ph₂
teːr


In [2]:
# Word list as one-column CSV text: the first non-blank line ("words") is the
# column header that the next cell reads with pd.read_csv, and every following
# line is one word. A "." inside a word is the syllable separator consumed by
# word_to_syllables.
# NOTE(review): "kerd" is listed twice, and "káput" has no "." so it is
# analysed as a single syllable - confirm both are intended.
words = """
words
mh₂.teːr
ph₂.téːr
bʰréh₂.teːr
swé.soːr
dʰugh₂.téːr
suh₁.nús
h₂.né.poːts
h₁.lewdʰ
dʰgʰe.móːn
pó.tis
wih₁.rós
gʷéːn
gʰós.tis
déms
h₂égh₂
túh₂
wéy
só
h₁óy.nos
dwoh₁
tré.yes
kʷe.twó.res
pén.kʷe
swéks
sép.tem
h₁ok.tóːw
h₁.néw.nh₂
dék.mh₃t
krep
káput
h₃.dón.th₂s
néh₂s
póːds
h₃.nóːgʰs
h₃.bʰrúh₂s
pé.th₂r
h₂óst
h₂ous
h₃ókʷs
h₁óh₃s
kerd
gʰésr
kerd
"""

In [3]:
# Parse the CSV text into a one-column frame, then keep the "words" column as
# a plain Python list.
# NOTE(review): this rebinds `words` from str to list, so re-running this cell
# without first re-running the cell that defines the string will fail.
df = pd.read_csv(StringIO(words))
words = df["words"].tolist()

In [4]:
def get_next_character(word):
    """Peel the first recognised token off the front of ``word``.

    The patterns in ``regexes`` are tried in the mapping's iteration order and
    the first one that matches at the start of ``word`` wins. ``regexes`` is
    not defined in this notebook; presumably it comes from the
    ``syllable_analysis`` star import.

    Returns:
        tuple: ``(character, name, rest)`` - the matched text, the key of the
        pattern that matched, and the unconsumed remainder of ``word``.

    Raises:
        ValueError: if no pattern matches the start of ``word``.
    """
    # Lazy generator: patterns after the first hit are never evaluated.
    attempts = ((name, regex.match(word)) for name, regex in regexes.items())
    for name, match in attempts:
        if match is None:
            continue
        return match.group(), name, word[match.end():]
    raise ValueError(f"Not Found: {word}")

def get_next_syllable(word):
    """Read tokens from ``word`` up to the next syllable separator.

    Tokens are taken one at a time with ``get_next_character``. Reading stops
    after a token named ``"SYLLABLE_SEPARATOR"`` (which is consumed but not
    included in the syllable) or when the input is exhausted.

    Returns:
        tuple: ``(syllable, pattern, rest)`` - the syllable text, its token
        names joined as ``"[NAME][NAME]..."``, and the text left after the
        separator (empty when the end of ``word`` was reached).
    """
    pieces = []
    labels = []
    remaining = word
    while len(remaining) > 0:
        character, name, remaining = get_next_character(remaining)
        if name == "SYLLABLE_SEPARATOR":
            break
        pieces.append(character)
        labels.append(f"[{name}]")
    return "".join(pieces), "".join(labels), remaining

def word_to_syllables(word):
    """Split ``word`` into a list of ``(syllable, pattern)`` pairs.

    ``get_next_syllable`` is called on the remaining text until nothing is
    left; an empty ``word`` gives an empty list.
    """
    def _walk(remaining):
        # Yield one pair per syllable; `remaining` shrinks on every pass.
        while len(remaining) > 0:
            text, pattern, remaining = get_next_syllable(remaining)
            yield text, pattern

    return list(_walk(word))

def group_syllables_by_type(words):
    """Collect the distinct syllables of ``words`` under their type pattern.

    Args:
        words: iterable of words accepted by ``word_to_syllables``.

    Returns:
        dict: maps each type pattern to the sorted list of unique syllables
        having that pattern. Keys are inserted in sorted order.
    """
    unique_syllables = {
        (syl, syl_type)
        for word in words
        for syl, syl_type in word_to_syllables(word)
    }
    grouped = {}
    # Sorting by (pattern, syllable) first means keys are inserted in sorted
    # order and each list is built already sorted. This needs no itertools
    # alias, which the notebook never imports explicitly.
    for syl, syl_type in sorted(unique_syllables, key=lambda pair: (pair[1], pair[0])):
        grouped.setdefault(syl_type, []).append(syl)
    return grouped


In [5]:
# Demo: split one "."-separated word into (syllable, type-pattern) pairs.
word_to_syllables("h₂né.poːts")

[('h₂né', '[LARYNGEAL][CONSONANT][VOWEL_LIKE]'),
 ('poːts', '[CONSONANT][VOWEL_LIKE][CONSONANT][CONSONANT]')]

In [6]:
# Print every type pattern on its own line, followed by a tab-indented,
# comma-separated list of the syllables that have that pattern.
for syl_type, syls in group_syllables_by_type(words).items():
    print(syl_type, "\t" + ", ".join(syls), sep="\n")

[CONSONANT][CONSONANT][VOWEL_LIKE]
	dʰgʰe, swé, tré, twó
[CONSONANT][CONSONANT][VOWEL_LIKE][CONSONANT]
	krep
[CONSONANT][CONSONANT][VOWEL_LIKE][CONSONANT][CONSONANT]
	swéks
[CONSONANT][CONSONANT][VOWEL_LIKE][LARYNGEAL]
	bʰréh₂, dwoh₁
[CONSONANT][CONSONANT][VOWEL_LIKE][LARYNGEAL][CONSONANT]
	bʰrúh₂s
[CONSONANT][LARYNGEAL]
	mh₂, nh₂, ph₂
[CONSONANT][LARYNGEAL][CONSONANT]
	mh₃t, th₂r, th₂s
[CONSONANT][VOWEL_LIKE]
	kʷe, né, pé, pó, só
[CONSONANT][VOWEL_LIKE][CONSONANT]
	dék, dón, gʰós, gʷéːn, móːn, nos, néw, nús, pén, res, rós, soːr, sép, tem, teːr, tis, téːr, tóːw, wéy, yes
[CONSONANT][VOWEL_LIKE][CONSONANT][CONSONANT]
	déms, gʰésr, kerd, lewdʰ, nóːgʰs, poːts, póːds
[CONSONANT][VOWEL_LIKE][CONSONANT][LARYNGEAL]
	dʰugh₂
[CONSONANT][VOWEL_LIKE][CONSONANT][VOWEL_LIKE][CONSONANT]
	káput
[CONSONANT][VOWEL_LIKE][LARYNGEAL]
	suh₁, túh₂, wih₁
[CONSONANT][VOWEL_LIKE][LARYNGEAL][CONSONANT]
	néh₂s
[LARYNGEAL]
	h₁, h₂, h₃
[LARYNGEAL][VOWEL_LIKE][CONSONANT]
	h₁ok, h₁óy
[LARYNGEAL][VOWEL_LIKE][CONSONAN

In [25]:
# Pattern for a whole word: the first SYLLABLE, anchored at the start and
# captured as "syllable", followed by a non-greedy Indefinite run of SYLLABLE,
# anchored at the end and captured as "rest".
# NOTE(review): Indefinite presumably means "zero or more" - confirm against
# the library that provides it.
WORD = (
    SYLLABLE.capture("syllable").match_at_start()
    + Indefinite(SYLLABLE, is_greedy=False).capture("rest").match_at_end()
)

In [33]:
def parse_word(word):
    """Split ``word`` into syllables by repeatedly matching ``WORD``.

    Each pass matches ``WORD`` against the text still to be parsed, appends
    the captured ``"syllable"`` and continues with the captured ``"rest"``.

    Args:
        word: the text to split; an empty string gives an empty list.

    Returns:
        list: the syllables in order of appearance.

    Raises:
        ValueError: if the remaining text is not matched by ``WORD`` or a
            match consumes nothing. Without this check the loop never ends.
    """
    syllables = []
    rest = word
    while len(rest) > 0:
        captures = WORD.get_named_captures(rest)
        if not captures:
            raise ValueError(f"Not Found: {rest}")
        unparsed = rest
        for capture in captures:
            rest = capture["rest"]
            syllables.append(capture["syllable"])
        if rest == unparsed:
            # The match made no progress; bail out instead of spinning forever.
            raise ValueError(f"Not Found: {rest}")
    return syllables

In [34]:
# Demo: parse a word written without "." separators.
parse_word("mh₂teːr")

['mh₂', 'teːr']