## Collect Words from File

In [33]:
import pandas as pd

# Load the pickled object that the rest of the notebook uses as its dataframe.
# NOTE(review): unpickling can execute arbitrary code, so "data.pkl" should
# only ever come from a trusted source.
data = pd.read_pickle("data.pkl")

# The "Word" column: the entries that the following cells phonemize.
words = data["Word"]

## Create Function to Phonemize Words

In [None]:
from phonemizer import phonemize
import re

def word_to_phoneme_string(word):
    """Return the phonemization of ``word`` as space-separated symbols.

    ``word`` is passed to ``phonemize`` using the espeak backend with the
    "en-us" language, stress marks enabled, punctuation not preserved and
    separators stripped.

    All whitespace is then removed from the result and every remaining
    character becomes its own token, joined by single spaces. Because the
    split is per character, anything the backend emits as several
    characters (and each stress mark) ends up as separate tokens.

    Returns an empty string when the phonemizer output contains no
    non-whitespace characters.
    """
    transcription = phonemize(
        word,
        backend="espeak",
        language="en-us",
        with_stress=True,
        preserve_punctuation=False,
        strip=True,
    )
    # One token per non-whitespace character of the transcription.
    symbols = [char for char in transcription if not char.isspace()]
    return " ".join(symbols)

## Phonemize Words and Write Them to Files in Chunks, Then Combine the Files

In [None]:
import os

# Number of chunk files the word list is split into, and the directory that
# holds them. Both were previously repeated as literals throughout this cell;
# keeping them in one place means the split, the progress message and the
# combine step can never disagree.
NUM_CHUNKS = 10
CHUNK_DIR = "phoneme_chunks"

# Ceiling division: NUM_CHUNKS chunks of this size always cover every word.
words_per_file = -(-len(words) // NUM_CHUNKS)

os.makedirs(CHUNK_DIR, exist_ok=True)

for chunk_index in range(NUM_CHUNKS):
    start_idx = chunk_index * words_per_file
    end_idx = min(start_idx + words_per_file, len(words))
    # Trailing chunks may be empty when there are fewer words than chunk
    # slots; an empty chunk file is still written so the combine step below
    # finds every part.
    chunk_words = words[start_idx:end_idx]

    phoneme_strings = []
    total = len(chunk_words)

    for i, w in enumerate(chunk_words, start=1):
        phoneme_strings.append(word_to_phoneme_string(w))
        # "\r" rewrites the same console line to act as a progress counter.
        print(
            f"\rChunk {chunk_index+1}/{NUM_CHUNKS}: Phonemizing {i}/{total}",
            end="",
            flush=True,
        )

    # Finish the progress line before moving on to the next chunk.
    print()

    # One phoneme string per line, in the same order as the words.
    chunk_path = f"{CHUNK_DIR}/phonemes_part_{chunk_index+1}.txt"
    with open(chunk_path, "w", encoding="utf-8") as f:
        f.writelines(phoneme_str + "\n" for phoneme_str in phoneme_strings)

print("All chunks done! Combining files...")

# Concatenate the chunk files in index order so that line k of the combined
# file corresponds to word k.
with open(f"{CHUNK_DIR}/phonemes_all.txt", "w", encoding="utf-8") as outfile:
    for chunk_index in range(NUM_CHUNKS):
        chunk_path = f"{CHUNK_DIR}/phonemes_part_{chunk_index+1}.txt"
        with open(chunk_path, "r", encoding="utf-8") as infile:
            outfile.writelines(infile)

print("Combined all phoneme strings into phonemes_all.txt")

## Create Text Vectorization Layer

In [55]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import TextVectorization, Embedding, GlobalAveragePooling1D

# Read the phoneme strings back, one per line, in file order.
# Blank lines are kept on purpose: the file is expected to hold one line per
# word, and a word whose phonemization was empty is stored as a blank line.
# Filtering those out (as this cell used to) shifts every later entry and
# leaves fewer vectors than there are rows in the dataframe.
with open("phoneme_chunks/phonemes_all.txt", "r", encoding="utf-8") as file:
    phoneme_strings = [line.strip() for line in file]

# Turn each phoneme string into a fixed-length sequence of integer token ids.
# Sequences are padded or truncated to 23 tokens.
# NOTE(review): 23 is presumably the longest phoneme sequence in the data;
# longer sequences would be silently truncated. TODO confirm.
# NOTE(review): the layer's default standardization (lowercasing, punctuation
# stripping) is left on; confirm it does not merge or drop phoneme symbols.
phoneme_vectorizer = TextVectorization(
    output_mode="int",
    output_sequence_length=23
)

# Build the vocabulary from the phoneme strings themselves.
phoneme_vectorizer.adapt(phoneme_strings)

# Vectorize -> embed each token into 32 dimensions -> average over the
# sequence. mask_zero=True marks the padding id 0 as masked so it can be
# excluded from the average.
# NOTE(review): nothing in this cell compiles or fits the model, so the
# Embedding weights are whatever they were initialized to. Confirm that using
# untrained embeddings is intended.
phoneme_embedding = tf.keras.Sequential([
    phoneme_vectorizer,
    Embedding(
        input_dim=len(phoneme_vectorizer.get_vocabulary()),
        output_dim=32,
        mask_zero=True
    ),
    GlobalAveragePooling1D()
])

# One pooled vector per input string, as a NumPy array.
# NOTE(review): a blank phoneme string has no unmasked tokens, so its masked
# average may come out as NaN. Check for such rows before using the vectors.
phonemes = phoneme_embedding(tf.constant(phoneme_strings)).numpy()

## Update Shared Dataframe and Save to File

In [56]:
import pandas as pd

# Store one embedding per row: list() splits the 2-D array along its first
# axis so each cell of the new column holds a single 1-D vector. The row
# count must equal len(data), otherwise pandas raises a ValueError.
data["Phoneme"] = list(phonemes)  # now each row holds a numpy array of shape (32,)
# Overwrite the pickle that was loaded at the top of the notebook.
data.to_pickle("data.pkl")