## Set up Word to BERT Vector Embedder

In [2]:
import torch
from transformers import AutoTokenizer, AutoModel

# Single checkpoint name shared by the tokenizer and the encoder so the two
# cannot drift apart.
BERT_CHECKPOINT = "bert-large-uncased"

# Module-level objects read by get_bert_embedding().
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
model = AutoModel.from_pretrained(BERT_CHECKPOINT)

def get_bert_embedding(word):
    """Return a single BERT vector for ``word``.

    The text is tokenized with the module-level ``tokenizer``, passed through
    the module-level ``model``, and the final hidden states of every token the
    tokenizer emitted are averaged into one vector (presumably this includes
    the special [CLS]/[SEP] tokens -- TODO confirm against the tokenizer).

    Args:
        word: Text to embed; handed to the tokenizer unchanged.

    Returns:
        1-D NumPy array holding the mean of ``last_hidden_state`` over the
        token axis.
    """
    inputs = tokenizer(word, return_tensors="pt")
    # Inference only: disable autograd so no graph is built and kept alive for
    # every call. The numeric result is unchanged.
    with torch.no_grad():
        outputs = model(**inputs)
        # Drop the leading batch axis, then average over the token axis.
        token_states = outputs.last_hidden_state.squeeze(0)
        word_emb = token_states.mean(dim=0)
    return word_emb.numpy()

  if not hasattr(np, "object"):


## Set up Phonemizer

In [3]:
from phonemizer import phonemize
import re

def word_to_phoneme_string(word):
    """Turn text into a space-separated string of IPA characters.

    ``phonemize`` is called with the en-us espeak backend (stripped output,
    punctuation dropped, stress marks kept). Every non-whitespace character of
    its result is then emitted as its own space-separated token.

    Args:
        word: Text handed to ``phonemize``.

    Returns:
        The transcription with exactly one space between consecutive
        non-whitespace characters ("" when the transcription is blank).
    """
    espeak_options = {
        "language": "en-us",
        "backend": "espeak",
        "strip": True,
        "preserve_punctuation": False,
        "with_stress": True,
    }
    transcription = phonemize(word, **espeak_options)

    # One token per character: multi-character symbols and diacritics are
    # split apart as well, because the pattern matches single characters.
    symbols = re.findall(r"[^\s]", transcription)
    return " ".join(symbols)

In [4]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import TextVectorization, Embedding, GlobalAveragePooling1D

# Sizes of the phoneme branch, named so they can be found and tuned in one place.
# NOTE(review): presumably these must equal the values used when the downstream
# AoA model was trained -- confirm.
MAX_PHONEME_TOKENS = 23
PHONEME_EMBEDDING_DIM = 32

# One phoneme string per non-blank line of the corpus file.
with open("phoneme_chunks/phonemes_all.txt", "r", encoding="utf-8") as file:
    phoneme_strings = [line.strip() for line in file if line.strip()]

# Fail loudly instead of adapting the vectorizer on an empty corpus.
if not phoneme_strings:
    raise ValueError("phoneme_chunks/phonemes_all.txt contains no phoneme strings")

# Maps each phoneme string to a fixed-length sequence of integer token ids.
phoneme_vectorizer = TextVectorization(
    output_mode="int",
    output_sequence_length=MAX_PHONEME_TOKENS
)

# Build the vocabulary from the corpus.
phoneme_vectorizer.adapt(phoneme_strings)

# String -> token ids -> per-token embeddings -> one averaged vector per string.
# NOTE(review): the Embedding weights are neither trained nor loaded in this
# cell, so they come from the layer's initializer. Unless a seed is set
# elsewhere, they differ between kernel sessions -- confirm how they are meant
# to match the weights used when the AoA model was trained.
phoneme_embedding = tf.keras.Sequential([
    phoneme_vectorizer,
    Embedding(
        input_dim=len(phoneme_vectorizer.get_vocabulary()),
        output_dim=PHONEME_EMBEDDING_DIM,
        mask_zero=True
    ),
    GlobalAveragePooling1D()
])

# Averaged embedding for every phoneme string in the corpus.
phonemes = phoneme_embedding(tf.constant(phoneme_strings)).numpy()

## Create Prediction Function to Call the TensorFlow Model

In [None]:
import numpy as np
import tensorflow as tf

def predict_aoa_for_words(model, words):
    """Predict and print an AoA value for each word.

    For every word a BERT vector (``get_bert_embedding``) and a phoneme string
    (``word_to_phoneme_string``) are computed. Words that raise during feature
    extraction, or that produce an empty phoneme string, are reported and
    skipped. The remaining words are scored in one ``model.predict`` call and
    printed as a two-column table.

    Args:
        model: Object with a ``predict`` method that accepts
            ``[bert_features, phoneme_features]``. This parameter shadows the
            module-level BERT ``model`` only inside this function;
            ``get_bert_embedding`` still uses the module-level one.
        words: Iterable of words to score.

    Returns:
        None. Results are printed.
    """
    bert_vectors = []
    phoneme_strings = []
    valid_words = []

    for w in words:
        try:
            bert_vec = get_bert_embedding(w).astype(np.float32)
            phon_str = word_to_phoneme_string(w)
        except Exception as e:
            # Best effort: one bad word must not abort the whole batch.
            print(f"Skipping '{w}': {e}")
            continue

        if not phon_str.strip():
            # Previously dropped without a trace; say why the word is missing.
            print(f"Skipping '{w}': no phonemes produced")
            continue

        bert_vectors.append(bert_vec)
        phoneme_strings.append(phon_str)
        valid_words.append(w)

    if not valid_words:
        print("No valid words to predict.")
        return

    # One row per valid word.
    X_bert = np.vstack(bert_vectors)

    # Embed the string tensor, THEN convert the result to NumPy. The original
    # called .numpy() on the input instead of on the output.
    X_phon = phoneme_embedding(tf.constant(phoneme_strings)).numpy()

    preds = model.predict([X_bert, X_phon])

    print(f"{'Word':<15} | {'Predicted AoA'}")
    print("-" * 30)

    for word, pred in zip(valid_words, preds):
        # Each prediction row is expected to hold a single value.
        aoa = float(pred.item())
        print(f"{word:<15} | {aoa:.2f}")


## Create List of Words and Run Predictions Here 

In [6]:
# Words to run through the AoA predictor.
words_to_predict = [
    "sniffle",
    "achoo",
    "beautiful",
    "apple",
    "momma",
    "deliberate",
    "xenophobic",
    "bologna",
    "woof",
    "dog",
    "onomatopoeia",
    "pow",
    "bang",
    "achromatize",
    "sneeze",
    "expedition",
    "za",
]

# Load the saved Keras model, then print one prediction per word.
aoa_model = tf.keras.models.load_model("AoA_model_phase3.keras")
predict_aoa_for_words(aoa_model, words_to_predict)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 280ms/step
Word            | Predicted AoA
------------------------------
sniffle         | 7.56
achoo           | 11.34
beautiful       | 6.61
apple           | 8.75
momma           | 4.50
deliberate      | 9.57
xenophobic      | 13.21
bologna         | 9.77
woof            | 7.41
dog             | 3.81
onomatopoeia    | 14.22
pow             | 6.97
bang            | 5.40
achromatize     | 13.80
sneeze          | 5.55
expedition      | 9.17
za              | 8.24
