In [1]:
# Step 1: Read in the CMU dictionary and extract the phones and vowels and consonants
# Each entry becomes [word, phone, phone, ...]; anything after '#' on a line is dropped
# as a comment. The context manager closes the file handle once the list is built, and
# lines that are blank or comment-only are skipped so every entry has a word at index 0.
with open('cmudict.dict') as dict_file:
    cmudict = [
        entry
        for entry in (line.split('#')[0].split() for line in dict_file)
        if entry
    ]
len(cmudict)

135155

In [2]:
# Collect every distinct phone symbol used in any entry (index 0 of an entry is the word itself)
phones = {phone for entry in cmudict for phone in entry[1:]}
print(len(phones))

69


In [3]:
# A phone counts as a vowel when its first letter is A, E, I, O or U;
# every remaining phone is treated as a consonant.
vowels = [phone for phone in phones if phone[0] in ('A', 'E', 'I', 'O', 'U')]
consonants = phones.difference(vowels)
print(len(vowels), len(consonants))

45 24


In [4]:
# Display the phones that were classified as vowels
print(vowels)

['EY0', 'IY0', 'AY1', 'AW1', 'EH0', 'EH2', 'AE2', 'AO1', 'IH1', 'EY1', 'IH2', 'AA1', 'AA0', 'AW2', 'OW1', 'UW2', 'IY1', 'ER1', 'EY2', 'IH0', 'AE1', 'UH1', 'OW0', 'AE0', 'AY0', 'OY1', 'EH1', 'UH2', 'AH0', 'OY0', 'ER0', 'OW2', 'UH0', 'UW1', 'AW0', 'AH2', 'AO2', 'OY2', 'ER2', 'AY2', 'UW0', 'AH1', 'AA2', 'IY2', 'AO0']


In [5]:
# Display the phones that were classified as consonants
print(consonants)

{'DH', 'L', 'R', 'CH', 'ZH', 'JH', 'D', 'S', 'W', 'M', 'NG', 'Y', 'K', 'F', 'HH', 'P', 'TH', 'N', 'G', 'B', 'SH', 'Z', 'T', 'V'}


In [6]:
# Step 2: Sonority Sequencing Principle - Sonority peaks at syllable nuclei and troughs at syllable boundaries 
# Syllable boundaries are points of lowest sonority between vowels
# Sonority hierarchy - vowels > glides > liquids (R, L) > nasals > voiced fricatives > voiced affricate > voiced stop > voiceless fricative > voiceless affricate > voiceless stop

In [7]:
# Weights to reflect sonority hierarchy
# Consonant classes, listed from most to least sonorous, each with its weight.
# NOTE(review): HH sits in the weight-22 group with the voiced fricatives although it is
# usually described as voiceless - kept exactly as in the original weights; confirm intent.
consonant_classes = [
    (28, ('Y', 'W')),
    (26, ('R', 'L')),
    (24, ('N', 'M', 'NG')),
    (22, ('Z', 'ZH', 'V', 'DH', 'HH')),
    (20, ('JH',)),
    (18, ('B', 'D', 'G')),
    (16, ('S', 'SH', 'F', 'TH')),
    (14, ('CH',)),
    (12, ('P', 'T', 'K')),
]
consonant_weight = {
    symbol: weight
    for weight, members in consonant_classes
    for symbol in members
}

sonority = {}
for p in phones:
    # Vowels get the top weight; a phone found in no class falls back to 0
    sonority[p] = 30 if p in vowels else consonant_weight.get(p, 0)

print(sonority)

{'DH': 22, 'R': 26, 'CH': 14, 'EY0': 30, 'IY0': 30, 'ZH': 22, 'AY1': 30, 'JH': 20, 'AW1': 30, 'EH0': 30, 'EH2': 30, 'D': 18, 'AE2': 30, 'AO1': 30, 'IH1': 30, 'S': 16, 'EY1': 30, 'W': 28, 'IH2': 30, 'AA1': 30, 'AA0': 30, 'M': 24, 'NG': 24, 'AW2': 30, 'OW1': 30, 'UW2': 30, 'IY1': 30, 'ER1': 30, 'Y': 28, 'EY2': 30, 'IH0': 30, 'AE1': 30, 'UH1': 30, 'OW0': 30, 'K': 12, 'AE0': 30, 'F': 16, 'AY0': 30, 'OY1': 30, 'EH1': 30, 'UH2': 30, 'AH0': 30, 'HH': 22, 'OY0': 30, 'P': 12, 'ER0': 30, 'TH': 16, 'OW2': 30, 'UH0': 30, 'N': 24, 'G': 18, 'B': 18, 'UW1': 30, 'V': 22, 'SH': 16, 'AW0': 30, 'AH2': 30, 'AO2': 30, 'OY2': 30, 'Z': 22, 'ER2': 30, 'AY2': 30, 'UW0': 30, 'T': 12, 'AH1': 30, 'L': 26, 'AA2': 30, 'IY2': 30, 'AO0': 30}


In [8]:
# Sanity check: 0 is the fallback weight, so True here means some phone was left unclassified
any(weight == 0 for weight in sonority.values())

False

In [9]:
# Step 3: Implement the syllable parser 
# Logic - Left-to-right parsing
# if sonority of next phone is higher than sonority of current phone, introduce a syllable break before the current phone, provided the phones collected so far already form a valid syllable
# What's a valid syllable - must contain a nucleus (vowel)

In [10]:
# Function to check if sequence of phones form a syllable
def is_syll(temp):
    """Return True when at least one phone in ``temp`` is a vowel (a syllable nucleus)."""
    return any(phone in vowels for phone in temp)

In [13]:
# parser function based on Sonority Sequencing Principle
def syllable_parser(trans, sonority_map=None, vowel_set=None):
    """Split a phone sequence into syllables using the Sonority Sequencing Principle.

    The sequence is scanned left to right. A syllable break is placed before the
    current phone when the syllable being built already has a nucleus (vowel) and
    either sonority rises from the current phone to the next one, or the current
    phone is itself a vowel (two adjacent vowels are two separate nuclei).
    No break is placed after the last vowel of the word, so trailing consonants
    always stay attached to the final syllable instead of forming a vowelless one.

    Parameters
    ----------
    trans : sequence of str
        Phones of one word, in order.
    sonority_map : mapping of str to number, optional
        Sonority weight per phone. Defaults to the notebook-level ``sonority`` table.
    vowel_set : container of str, optional
        Phones that count as nuclei. Defaults to the notebook-level ``vowels``.

    Returns
    -------
    list of list of str
        The syllables in order. An empty input gives ``[]``; a sequence without
        any vowel is returned as a single chunk.

    Raises
    ------
    KeyError
        If a phone of a multi-phone sequence is missing from ``sonority_map``.
    """
    if sonority_map is None:
        sonority_map = sonority
    if vowel_set is None:
        vowel_set = vowels

    # Position of the last nucleus; breaks are only allowed at or before it so that
    # the syllable opened by a break is guaranteed to receive a vowel.
    last_nucleus = max(
        (idx for idx, phone in enumerate(trans) if phone in vowel_set),
        default=-1,
    )

    syll = []
    current = []
    has_nucleus = False  # does `current` already contain a vowel?
    for idx, phone in enumerate(trans):
        is_vowel = phone in vowel_set
        rising = idx < len(trans) - 1 and sonority_map[phone] < sonority_map[trans[idx + 1]]
        if has_nucleus and idx <= last_nucleus and (rising or is_vowel):
            syll.append(current)
            current = []
            has_nucleus = False
        current.append(phone)
        has_nucleus = has_nucleus or is_vowel
    if current:
        syll.append(current)

    return syll

In [14]:
# Development words used to eyeball the parser's output
words = ['sing','sprain','string','pretend','report','entertain','oscillate','extension','information','education']
# Keep the dictionary entries whose headword (index 0) is one of the development words
dev_set = []
for entry in cmudict:
    if entry[0] in words:
        dev_set.append(entry)
print(dev_set)

[['education', 'EH2', 'JH', 'AH0', 'K', 'EY1', 'SH', 'AH0', 'N'], ['entertain', 'EH2', 'N', 'T', 'ER0', 'T', 'EY1', 'N'], ['extension', 'IH0', 'K', 'S', 'T', 'EH1', 'N', 'SH', 'AH0', 'N'], ['information', 'IH2', 'N', 'F', 'ER0', 'M', 'EY1', 'SH', 'AH0', 'N'], ['oscillate', 'AA1', 'S', 'AH0', 'L', 'EY2', 'T'], ['pretend', 'P', 'R', 'IY0', 'T', 'EH1', 'N', 'D'], ['report', 'R', 'IY0', 'P', 'AO1', 'R', 'T'], ['sing', 'S', 'IH1', 'NG'], ['sprain', 'S', 'P', 'R', 'EY1', 'N'], ['string', 'S', 'T', 'R', 'IH1', 'NG']]


In [15]:
# Print the parser's syllabification for each development entry (headword first, phones after)
for headword, *transcription in dev_set:
    syllables = syllable_parser(transcription)
    print('Syllabification for %s : %s' % (headword, syllables))

Syllabification for education : [['EH2'], ['JH', 'AH0'], ['K', 'EY1'], ['SH', 'AH0', 'N']]
Syllabification for entertain : [['EH2', 'N'], ['T', 'ER0'], ['T', 'EY1', 'N']]
Syllabification for extension : [['IH0'], ['K', 'S', 'T', 'EH1', 'N'], ['SH', 'AH0', 'N']]
Syllabification for information : [['IH2', 'N'], ['F', 'ER0'], ['M', 'EY1'], ['SH', 'AH0', 'N']]
Syllabification for oscillate : [['AA1'], ['S', 'AH0'], ['L', 'EY2', 'T']]
Syllabification for pretend : [['P', 'R', 'IY0'], ['T', 'EH1', 'N', 'D']]
Syllabification for report : [['R', 'IY0'], ['P', 'AO1', 'R', 'T']]
Syllabification for sing : [['S', 'IH1', 'NG']]
Syllabification for sprain : [['S', 'P', 'R', 'EY1', 'N']]
Syllabification for string : [['S', 'T', 'R', 'IH1', 'NG']]
