In [1]:
import glob
import os
import string
import unicodedata

import pandas as pd

In [2]:
# Directory containing the *.jsonl input files. Edit this to match your machine.
JSONL_DIR = "/Users/raileymontalan/Documents/railey-aisg/pacute/output_folder"
# Position (within the sorted file list) of the file previewed below.
PREVIEW_FILE_INDEX = 25

# glob.glob() returns paths in a filesystem-dependent order, so sort them to make
# positional indexing (and every later loop over jsonl_files) reproducible.
jsonl_files = sorted(glob.glob(f"{JSONL_DIR}/*.jsonl"))
df1 = pd.read_json(jsonl_files[PREVIEW_FILE_INDEX], lines=True)
df1.tail()

Unnamed: 0,word,part_of_speech,etymology,word_sense
1166,nyuk,png,[ Mag ],niyóg.
1167,nyuknyúk,png,[ Tau ],kadalisayan o kapinuhan ng hábi ng tela.
1168,nyur,png,[ Mag ],niyóg.
1169,nyuy,png,[ Iva ],niyóg.
1170,Nyx,png,[ Gri ],babaeng personipikasyon ng gabí ; anak ni Chaos.


In [3]:
# https://github.com/itudidyay/Tagalog-Word-Syllabization-Python/blob/main/tglSyllabification.py

# Every character treated as a vowel: the plain vowels plus their grave, acute
# and circumflex forms, in both upper and lower case.
vowels = set("AEIOUaeiou") | set("ÀÁÂÈÉÊÌÍÎÒÓÔÙÚÛàáâèéêìíîòóôùúû")
# Two-consonant pairs that syllabify() keeps together when they follow an
# n/m inside a three-letter consonant cluster.
letter_pairs = {"bl", "br", "dr", "pl", "tr"}

def haveVowel(word):
    """Return True if at least one character of ``word`` is in ``vowels``."""
    return any(character in vowels for character in word)


def sliceValueInList(listSlice, valueSlice, indexSlice):
    """Return a copy of ``listSlice`` with one element split in two.

    The element at position ``valueSlice`` is cut at character offset
    ``indexSlice``; the left part stays at ``valueSlice`` and the right part
    is inserted immediately after it. The input list is not modified.
    """
    element = listSlice[valueSlice]
    left_part, right_part = element[:indexSlice], element[indexSlice:]

    pieces = listSlice[:]
    pieces.insert(valueSlice + 1, right_part)
    pieces[valueSlice] = left_part
    return pieces


def mergeValueInList(listMerge, fromMerge, toMerge):
    """Return a copy of ``listMerge`` with a run of elements concatenated.

    Elements ``fromMerge`` through ``toMerge`` (inclusive) are joined into a
    single string that takes their place. The input list is not modified.
    """
    span = slice(fromMerge, toMerge + 1)
    merged = listMerge[:]
    merged[span] = ["".join(listMerge[span])]
    return merged


def syllabify(wordToSyllabify):
    """Split a word into a list of syllable strings.

    The word is first broken into alternating vowel / consonant-group tokens,
    interior consonant clusters are split, and then each consonant token is
    attached to a neighbouring vowel (first to the vowel that follows it, then
    to the vowel that precedes it).

    Relies on the module-level ``vowels`` and ``letter_pairs`` sets and on the
    helpers ``haveVowel``, ``sliceValueInList`` and ``mergeValueInList``.

    Notes:
        * "ng" and "NG" are treated as a single letter; mixed-case "Ng"/"nG"
          are not.
        * Apostrophes are dropped, and hyphens act as hard boundaries that are
          removed from the returned list.
        * Consonant groups at the very start or end of the word, or directly
          after a hyphen, are never split.

    Example: ``syllabify("maglakad")`` returns ``['mag', 'la', 'kad']``.

    Args:
        wordToSyllabify: the word to split (a string).

    Returns:
        A list of strings, one per syllable, in order.
    """
    word = wordToSyllabify

    # Break the word down into consonant groups and vowels.
    # Ex: maglakad = ['m','a','gl','a','k','a','d']

    nextNg = False  # NOTE(review): assigned here but never read in this function
    # The loop iterates over the characters of the original string, while
    # str.replace() pads *every* occurrence each time. Repeated letters therefore
    # pick up extra spaces, which the split() below collapses.
    for letter in word:
        if letter in vowels:
            word = word.replace(letter, f" {letter} ")
        elif letter == "-":
            word = word.replace(letter, f" - ")
    word = word.replace("ng", "ŋ").replace("NG", "Ŋ")  # "ng" is temporarily replaced with ŋ so that it counts as one letter; mixed-case "Ng"/"nG" are not handled
    word = word.replace("'", "") # apostrophes are discarded
    word = word.split()

    # Split interior consonant clusters. `offset` counts how many tokens have been
    # inserted so far, so that `index` (taken from the snapshot being iterated)
    # still points at the same token in the growing `word` list.
    offset = 0

    for index, group in enumerate(word[:]):
        index += offset
        if index == 0 or index == len(word[:]) - 1 or word[index-1] == '-': # ignore the first and last token of the word, or a token whose predecessor is a hyphen
            continue
        elif len(group) == 2 and word: # if two letters, then split in half (`and word` is always truthy here)
            word = sliceValueInList(word[:], index, 1)
            offset += 1
        elif len(group) == 3:
            if (
                any((group[0].lower() == "n", group[0].lower() == "m"))
                and group[1:3].lower() in letter_pairs
            ):  # if three letters and 1st letter is n/m and 2nd-3rd letter is bl, br, dr, pl, or tr, split n/m from letter pairs
                word = sliceValueInList(word[:], index, 1)
                offset += 1
            else: # if three letters and none of above rules apply, split first two letters from last letter
                word = sliceValueInList(word[:], index, 2)
                offset += 1
        elif len(group) > 3: # if four or more letters, detach first two letters
            word = sliceValueInList(word[:], index, 2)
            offset += 1

    # Join vowels with the consonants that precede them.
    # `joinWord` is a frozen snapshot to iterate over; `offset` counts the merges
    # already applied to `word`, each of which shortened it by one token.

    joinWord = word[:]
    offset = 0
    for index, group in enumerate(joinWord):
        if (
            group[-1] in vowels
            and joinWord[index - 1] not in vowels
            and joinWord[index - 1] != "-"
            and index != 0
        ):
            word = mergeValueInList(word, index - 1 - offset, index - offset)
            offset += 1

    # Join vowels with the consonants that follow them (same snapshot/offset
    # bookkeeping as above). A following token is only absorbed if it contains
    # no vowel and is not a hyphen.

    joinWord = word[:]
    offset = 0
    for index, group in enumerate(joinWord):
        if index != len(joinWord) - 1:
            if (
                group[-1] in vowels
                and not haveVowel(joinWord[index + 1])
                and joinWord[index + 1] != "-"
            ):
                word = mergeValueInList(word, index - offset, index + 1 - offset)
                offset += 1
    for i in range(len(word)):
        word[i] = word[i].replace("ŋ", "ng").replace("Ŋ", "NG")  # ŋ returns to ng

    while "-" in word: # drop the standalone hyphen tokens from the result
        word.remove("-")

    return word

In [4]:
# Accent-mark classes, keyed by the label that
# classify_last_syllable_pronunciation() assigns to each of them.
mabilis = set("ÁÉÍÓÚáéíóú")  # acute accent
malumi = set("ÀÈÌÒÙàèìòù")  # grave accent
maragsa = set("ÂÊÎÔÛâêîôû")  # circumflex accent
# Any vowel carrying one of the three marks above.
accented_vowels = mabilis | malumi | maragsa


def is_filipino(etymology):
    """Return True if ``etymology`` contains any of "Tag", "ST" or "none".

    The check is a plain ``in`` test, so for a string it matches substrings
    and for a list it matches whole elements.
    """
    for tag in ("Tag", "ST", "none"):
        if tag in etymology:
            return True
    return False

def is_single_word(word):
    """Return True if ``word`` holds exactly one whitespace-separated token."""
    tokens = word.split()
    return len(tokens) == 1

def has_one_accented_syllable(word):
    """Return True if exactly one syllable of ``word`` has an accented vowel.

    The word is split with ``syllabify``; a syllable counts as accented when it
    shares at least one character with ``accented_vowels``.
    """
    accented = [
        syllable
        for syllable in syllabify(word)
        if not accented_vowels.isdisjoint(syllable)
    ]
    return len(accented) == 1

def not_circumfixed_with_dash(word):
    """Return True unless ``word`` begins or ends with a dash ("-")."""
    if word.startswith('-'):
        return False
    if word.endswith('-'):
        return False
    return True

def normalize_text(text):
    """Strip whitespace, ASCII punctuation (except "-") and accent marks.

    Non-string values are returned unchanged. For strings, surrounding
    whitespace is removed, every character of ``string.punctuation`` other than
    the dash is deleted, and finally the text is NFD-decomposed so that all
    combining marks (Unicode category ``Mn``) can be dropped.
    """
    if not isinstance(text, str):
        return text

    # Deletion table for all ASCII punctuation except the dash.
    drop_table = str.maketrans('', '', string.punctuation.replace('-', ''))
    cleaned = text.strip().translate(drop_table)

    decomposed = unicodedata.normalize('NFD', cleaned)
    return ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')

def find_accented_syllable(syllables):
    """Return ``(syllable, index)`` for the first syllable with an accented vowel.

    A syllable qualifies when it shares a character with ``accented_vowels``.
    If none qualifies, ``("", -1)`` is returned.
    """
    matches = (
        (syllable, position)
        for position, syllable in enumerate(syllables)
        if not accented_vowels.isdisjoint(syllable)
    )
    # Fall back to the ("", -1) sentinel when no syllable carries an accent.
    return next(matches, ("", -1))

def find_last_syllable(syllables):
    """Return ``(syllable, index)`` for the final entry of ``syllables``.

    For an empty sequence this returns ``("", -1)``, the same "nothing found"
    sentinel that ``find_accented_syllable`` uses, instead of raising
    ``IndexError``.
    """
    if len(syllables) == 0:
        return "", -1
    return syllables[-1], len(syllables) - 1

def classify_last_syllable_pronunciation(last_syllable):
    """Label a syllable by the accent mark it contains.

    The ``mabilis``, ``malumi`` and ``maragsa`` character sets are checked in
    that order and the first one sharing a character with ``last_syllable``
    gives the label. If none matches, the label is "malumay".
    """
    labelled_marks = (
        ("mabilis", mabilis),
        ("malumi", malumi),
        ("maragsa", maragsa),
    )
    for label, marks in labelled_marks:
        if not marks.isdisjoint(last_syllable):
            return label
    return "malumay"


In [5]:
# Filter every input file down to the rows of interest, annotate them with
# syllable information, and combine everything into one frame.
#
# The per-file frames are collected in a list and concatenated once at the end:
# concatenating onto a growing frame inside the loop re-copies all earlier rows
# on every iteration.
frames = []
for file in sorted(jsonl_files):  # sorted: glob order is filesystem-dependent
    df = pd.read_json(file, lines=True)

    # Row filters. All four are independent per-row predicates, so the cheap
    # string checks run first and syllabify() only sees the surviving rows.
    df_filtered = df[df['etymology'].apply(is_filipino)]
    df_filtered = df_filtered[df_filtered['word'].apply(is_single_word)]
    df_filtered = df_filtered[df_filtered['word'].apply(not_circumfixed_with_dash)]
    df_filtered = df_filtered[df_filtered['word'].apply(has_one_accented_syllable)].copy()
    if df_filtered.empty:
        continue

    # Syllabify once, then derive each (syllable, index) pair once per row.
    syllable_lists = df_filtered['word'].apply(syllabify)
    accented_pairs = syllable_lists.apply(find_accented_syllable)
    last_pairs = syllable_lists.apply(find_last_syllable)

    df_filtered['accented_syllable_list'] = syllable_lists
    df_filtered['accented_syllable'] = accented_pairs.apply(lambda pair: pair[0])
    df_filtered['accented_syllable_index'] = (
        accented_pairs.apply(lambda pair: pair[1]).astype(pd.Int64Dtype())
    )
    df_filtered['last_syllable'] = last_pairs.apply(lambda pair: pair[0])
    df_filtered['last_syllable_index'] = (
        last_pairs.apply(lambda pair: pair[1]).astype(pd.Int64Dtype())
    )
    df_filtered['last_syllable_pronunciation'] = (
        df_filtered['last_syllable'].apply(classify_last_syllable_pronunciation)
    )
    df_filtered['normalized_word'] = df_filtered['word'].apply(normalize_text)
    df_filtered['normalized_syllable_list'] = df_filtered['normalized_word'].apply(syllabify)
    frames.append(df_filtered)

# pd.concat() raises on an empty list, so fall back to an empty frame with the
# input columns when no file contributed any rows.
data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=df1.columns)

# Stable sort: rows sharing a normalized_word keep their (deterministic) file order.
processed_data = data.sort_values(by='normalized_word', kind='mergesort').reset_index(drop=True)


In [6]:
OUTPUT_PATH = "data/syllables.jsonl"

# to_json() does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
processed_data.to_json(OUTPUT_PATH, orient="records", lines=True, force_ascii=False)