In [105]:
from convokit import Corpus, download
from convokit import text_processing
from spacy.lang.en import stop_words

In [106]:
corpus = Corpus(filename=download("movie-corpus"))
corpus.print_summary_stats()

Downloading movie-corpus to C:\Users\rjs20\.convokit\downloads\movie-corpus
Downloading movie-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip (40.9MB)... Done
Number of Speakers: 9035
Number of Utterances: 304713
Number of Conversations: 83097


In [107]:
from cleantext import clean
clean_str = lambda s: clean(
    s,
    fix_unicode=True,  # fix various unicode errors
    to_ascii=True,  # transliterate to closest ASCII representation
    lower=True,  # lowercase text
    no_line_breaks=True,  # fully strip line breaks as opposed to only normalizing them
    no_urls=True,  # replace all URLs with a special token
    no_emails=True,  # replace all email addresses with a special token
    no_phone_numbers=True,  # replace all phone numbers with a special token
    no_numbers=True,  # replace all numbers with a special token
    no_digits=False,  # replace all digits with a special token
    no_currency_symbols=True,  # replace all currency symbols with a special token
    no_punct=True,  # fully remove punctuation
    replace_with_url="",
    replace_with_email="",
    replace_with_phone_number="",
    replace_with_number="",
    replace_with_digit="0",
    replace_with_currency_symbol="",
    lang="en",
)


In [108]:
def text_preprocess(corpus, clean_str):

    # Preprocessing step
    tc = text_processing.textCleaner.TextCleaner(clean_str)
    corpus = tc.transform(corpus)
    sentences = []
    uids = corpus.get_utterance_ids()

    for uid in uids:
        parsed_data = corpus.get_utterance(uid).text
        sentences.append(parsed_data)

    return(sentences)
    
sentences = text_preprocess(corpus, clean_str)

100/304713 utterances processed
200/304713 utterances processed
300/304713 utterances processed
400/304713 utterances processed
500/304713 utterances processed
600/304713 utterances processed
700/304713 utterances processed
800/304713 utterances processed
900/304713 utterances processed
1000/304713 utterances processed
1100/304713 utterances processed
1200/304713 utterances processed
1300/304713 utterances processed
1400/304713 utterances processed
1500/304713 utterances processed
1600/304713 utterances processed
1700/304713 utterances processed
1800/304713 utterances processed
1900/304713 utterances processed
2000/304713 utterances processed
2100/304713 utterances processed
2200/304713 utterances processed
2300/304713 utterances processed
2400/304713 utterances processed
2500/304713 utterances processed
2600/304713 utterances processed
2700/304713 utterances processed
2800/304713 utterances processed
2900/304713 utterances processed
3000/304713 utterances processed
3100/304713 utteran

In [109]:
sentences

['they do not',
 'they do to',
 'i hope so',
 'she okay',
 'lets go',
 'wow',
 'okay youre gonna need to learn how to lie',
 'no',
 'im kidding you know how sometimes you just become this persona and you dont know how to quit',
 'like my fear of wearing pastels',
 'the real you',
 'what good stuff',
 'i figured youd get to the good stuff eventually',
 'thank god if i had to hear one more story about your coiffure',
 'me this endless blonde babble im like boring myself',
 'what crap',
 'do you listen to this crap',
 'no',
 'then guillermo says if you go any lighter youre gonna look like an extra on',
 'you always been this selfish',
 'but',
 'then thats all you had to say',
 'well no',
 'you never wanted to go out with me did you',
 'i was',
 'i looked for you back at the party but you always seemed to be occupied',
 'tons',
 'have fun tonight',
 'i believe we share an art instructor',
 'you know chastity',
 'looks like things worked out tonight huh',
 'hi',
 'who knows all ive ever hea

In [110]:
from speechbrain.inference.text import GraphemeToPhoneme
# Convert graphemes to phonemes
g2p = GraphemeToPhoneme.from_hparams("speechbrain/soundchoice-g2p")

transcriptions = []
for sentence in sentences:
    try:
        transcriptions.append(g2p(sentence)[:-1])
    except:
        pass

display(transcriptions)

[['DH', 'EY', ' ', 'D', 'UW', ' ', 'N', 'AA', 'T'],
 ['DH', 'EY', ' ', 'D', 'UW', ' ', 'T', 'UW'],
 ['AY', ' ', 'HH', 'OW', 'P', ' ', 'S', 'OW'],
 ['SH', 'IY', ' ', 'OW', 'K', 'EY'],
 ['L', 'EH', 'T', 'S', ' ', 'G', 'OW'],
 ['W', 'AW'],
 ['OW',
  'K',
  'EY',
  ' ',
  'Y',
  'UH',
  'R',
  ' ',
  'G',
  'AA',
  'N',
  'AH',
  ' ',
  'N',
  'IY',
  'D',
  ' ',
  'T',
  'UW',
  ' ',
  'L',
  'ER',
  'N',
  ' ',
  'HH',
  'AW',
  ' ',
  'T',
  'UW',
  ' ',
  'L',
  'AY'],
 ['N', 'OW'],
 ['IH',
  'M',
  ' ',
  'K',
  'IH',
  'D',
  'IH',
  'NG',
  ' ',
  'Y',
  'UW',
  ' ',
  'N',
  'OW',
  ' ',
  'HH',
  'AW',
  ' ',
  'S',
  'AH',
  'M',
  'T',
  'AY',
  'M',
  'Z',
  ' ',
  'Y',
  'UW',
  ' ',
  'JH',
  'AH',
  'S',
  'T',
  ' ',
  'B',
  'IH',
  'K',
  'AH',
  'M',
  ' ',
  'DH',
  'IH',
  'S',
  ' ',
  'P',
  'ER',
  'S',
  'OW',
  'N',
  'AH',
  ' ',
  'AH',
  'N',
  'D',
  ' ',
  'Y',
  'UW',
  ' ',
  'D',
  'AA',
  'N',
  'T',
  ' ',
  'N',
  'OW',
  ' ',
  'HH',
  'AW',
  ' ',
  '

In [111]:
import pickle
with open("transcriptions.pkl", "wb") as fp:
    pickle.dump(transcriptions, fp)

In [112]:
from speechbrain.inference.text import GraphemeToPhoneme
import spacy
from nltk.corpus import cmudict
import re
from spacy.lang.en import stop_words

'''
Methods for converting between CMU Arpabet and 
International Phonetic Alphabet (IPA) symbols
'''
class IPA:

    # init class and set translation table (tt)
    def __init__(self):

        # phoneme to ipa tt
        self.phoneme_ipa_dict = {
            'AA':'ɑ', 'AE':'æ', 'AH':'ʌ', 'AO':'ɔ', 'AW':'ə', 'AY':'ī', 'EH':'ɛ',
            'ER':'ɝ', 'EY':'ā', 'IH':'ɪ', 'IY':'i', 'OW':'ō', 'OY':'ʉ', 'UH':'ʊ',
            'UW':'u', 'B':'b', 'CH':'ʧ', 'D':'d', 'DH':'ð', 'F':'f', 'G':'ɡ',
            'HH':'h', 'JH':'ʤ', 'K':'k', 'L':'l', 'M':'m', 'N':'n', 'NG':'ŋ',
            'P':'p', 'R':'ɹ', 'S':'s', 'SH':'ʃ', 'T':'t', 'TH':'θ', 'V':'v',
            'W':'w', 'Y':'j', 'Z':'z', 'ZH':'ʒ', ' ':''
        }

        # ipa symbol to phoneme tt
        self.ipa_phoneme_dict = {v: k for k, v in self.phoneme_ipa_dict.items()}

        # english to phoneme tt
        self.cmudict_tuple = self.cmudict_data()

    # Convert CMUdict phonemes to IPA symbols
    def phoneme_to_ipa(self, text):
        ipa_text = []
        for sentence in text:
            separated_word = []
            for word in sentence:
                print(word)
                separated_word.append(self.phoneme_ipa_dict[word])

            ipa_text.append(separated_word)

        ipa_text = [''.join(i) for i in ipa_text]
        return ipa_text
    
    # Convert IPA symbols to CMUdict Phonemes
    def ipa_to_phoneme(self, word):
        phoneme_list = [self.ipa_phoneme_dict[char] for char in word]
        return phoneme_list
    
    # Create cmudict lookup tuple
    def cmudict_data(self):
        cmudict_tuple = cmudict.entries()
        cmudict_tuple = [([re.sub(r'[0-9]+', '', s) \
                           for s in t[1]], t[0]) \
                           for t in cmudict_tuple]

        return cmudict_tuple

    ''' 
    Convert a list of phonemes to a
    list of unique words with 
    matching pronunciations
    '''
    def phoneme_to_word(self, word):
        potential_words = []
        for i in self.cmudict_tuple:
            if i[0] == word and i[1] not in potential_words:
                potential_words.append(i[1])
        return(potential_words)
    
    # Get list of stop words in IPA symbols
    def get_common_words(self):
        word_list = stop_words.STOP_WORDS

        # Convert to CMUdict style phonemes
        phonemes = []
        for i in word_list:
            try:
                phonemes.append(g2p(i)) # type: ignore
            except:
                print(f"Failed to add '{i}' to word list.")

        # Convert to custom phonemic character set
        ipa_text = self.phoneme_to_ipa(phonemes)

        # Plaintext
        ipa_plaintext = [] 
        for word in ipa_text:
            if word[0] not in ipa_plaintext:
                ipa_plaintext.append(word[0])

        return(ipa_plaintext)

In [113]:
def reformat_list(l):
    result = []
    sublist = []

    for item in l:
        if item == ' ':
            if sublist:  # add current sublist to result if it's not empty
                result.append(sublist)
                sublist = []  # reset sublist
        else:
            sublist.append(item)

    # Add any remaining items in sublist
    if sublist:
        result.append(sublist)
    
    return result

In [None]:
ipa_converter = IPA()
ipa_text = []
for l in transcriptions:
    l = reformat_list(l)
    print(l)
    ipa_text_part = ipa_converter.phoneme_to_ipa(l)
    ipa_text.append(ipa_text_part)

[['DH', 'EY'], ['D', 'UW'], ['N', 'AA', 'T']]
DH
EY
D
UW
N
AA
T
[['DH', 'EY'], ['D', 'UW'], ['T', 'UW']]
DH
EY
D
UW
T
UW
[['AY'], ['HH', 'OW', 'P'], ['S', 'OW']]
AY
HH
OW
P
S
OW
[['SH', 'IY'], ['OW', 'K', 'EY']]
SH
IY
OW
K
EY
[['L', 'EH', 'T', 'S'], ['G', 'OW']]
L
EH
T
S
G
OW
[['W', 'AW']]
W
AW
[['OW', 'K', 'EY'], ['Y', 'UH', 'R'], ['G', 'AA', 'N', 'AH'], ['N', 'IY', 'D'], ['T', 'UW'], ['L', 'ER', 'N'], ['HH', 'AW'], ['T', 'UW'], ['L', 'AY']]
OW
K
EY
Y
UH
R
G
AA
N
AH
N
IY
D
T
UW
L
ER
N
HH
AW
T
UW
L
AY
[['N', 'OW']]
N
OW
[['IH', 'M'], ['K', 'IH', 'D', 'IH', 'NG'], ['Y', 'UW'], ['N', 'OW'], ['HH', 'AW'], ['S', 'AH', 'M', 'T', 'AY', 'M', 'Z'], ['Y', 'UW'], ['JH', 'AH', 'S', 'T'], ['B', 'IH', 'K', 'AH', 'M'], ['DH', 'IH', 'S'], ['P', 'ER', 'S', 'OW', 'N', 'AH'], ['AH', 'N', 'D'], ['Y', 'UW'], ['D', 'AA', 'N', 'T'], ['N', 'OW'], ['HH', 'AW'], ['T', 'UW'], ['K', 'W', 'IH', 'T']]
IH
M
K
IH
D
IH
NG
Y
UW
N
OW
HH
AW
S
AH
M
T
AY
M
Z
Y
UW
JH
AH
S
T
B
IH
K
AH
M
DH
IH
S
P
ER
S
OW
N
AH
AH
N
D
Y
UW
D


In [101]:
import keras
from keras.layers import TextVectorization
import pickle
max_seq_length = 20
embedding_dim = 100
lstm_units = 128

# Plaintext
ipa_plaintext = [] 
for sentence in ipa_text:
    ipa_plaintext.append(' '.join(sentence))

print(ipa_plaintext)

'''
Instance of TextVectorization to tokenize 
input text and output a vector of integers
'''
vectorize_layer = TextVectorization(standardize=None, # type: ignore
                                            split="whitespace",
                                            output_mode='int')

# Adapt vectorizer to the input text
vectorize_layer.adapt(ipa_plaintext)

# Get vocabulary
vocab = vectorize_layer.get_vocabulary()

# Save vocabulary
with open('vocab.pickle', 'wb') as handle:
    pickle.dump(vocab, 
                handle, 
                protocol=pickle.HIGHEST_PROTOCOL)

# Vector of integers
sequences = vectorize_layer(ipa_plaintext)

# Generate data: X is the sequence, y is the next word
X = sequences[:, :-1] # type: ignore
y = sequences[:, 1:] # type: ignore
vocab_size = len(vectorize_layer.get_vocabulary())     

# Define layers of model
model = keras.Sequential([
    keras.layers.Embedding(input_dim=vocab_size,
                                output_dim=embedding_dim),
    keras.layers.LSTM(lstm_units,return_sequences=True),
    keras.layers.Dense(vocab_size,activation='softmax')
])

# Prepare model for training
model.compile(loss='sparse_categorical_crossentropy', 
                optimizer='adam', 
                metrics=['accuracy'])

# Output details of compiled model
model.summary()

# Train the model
model.fit(X, y, 
                epochs=10)

# Save the model
model.save("current.keras")

['ðā du nɑt', 'ðā du tu', 'ī hōp sō', 'ʃi ōkā', 'lɛts ɡō', 'wə', 'ōkā jʊɹ ɡɑnʌ nid tu lɝn hə tu lī', 'nō', 'ɪm kɪdɪŋ ju nō hə sʌmtīmz ju ʤʌst bɪkʌm ðɪs pɝsōnʌ ʌnd ju dɑnt nō hə tu kwɪt', 'līk mī fɪɹ ʌv wɛɹɪŋ pāstʌlz', 'ðʌ ɹil ju', 'wʌt ɡʊd stʌf', 'ī fɪɡjɝd jud ɡɛt tu ðʌ ɡʊd stʌf ɪvɛnʧʌwʌli', 'θæŋk ɡɑd ɪf ī hæd tu hiɹ wʌn mɔɹ stɔɹi ʌbət jɔɹ kʉfjʊɹ', 'mi ðɪs ɛndlʌs blɑnd bæbʌl ɪm līk bɔɹɪŋ mīsɛlf', 'wʌt kɹæp', 'du ju lɪsʌn tu ðɪs kɹæp', 'nō', 'ðɛn ɡlɛɹmō sɛz ɪf ju ɡō ɛni lītɝ jɔɹ ɡɑnʌ lʊk līk æn ɛkstɹʌ ɑn', 'ju ɔlwāz bɪn ðɪs sɛlfɪʃ', 'bʌt', 'ðɛn ðæts ɔl ju hæd tu sā', 'wɛl nō', 'ju nɛvɝ wɑntɪd tu ɡō ət wɪð mi dɪd ju', 'ī wɑz', 'ī lʊkt fɔɹ ju bæk æt ðʌ pɑɹti bʌt ju ɔlwāz simd tu bi ɑkjʌpīd', 'tʌnz', 'hæv fʌn tʌnīt', 'ī bɪliv wi ʃɛɹ æn ɑɹt ɪnstɹʌktɝ', 'ju nō ʧæstʌti', 'lʊks līk θɪŋz wɝkt ət tʌnīt hu', 'hī', 'hu nōz ɔl īv ɛvɝ hɝd hɝ sā ɪz ðæt ʃɛd dɪp bɪfɔɹ dātɪŋ ʌ ɡī ðæt smōks', 'sō ðæts ðʌ kīnd ʌv ɡī ʃi līks pɹɪti wʌnz', 'lɛzbiʌn nō ī fənd ʌ pɪkʧɝ ʌv ʤɛɹd litō ɪn wʌn ʌv hɝ dɹɔɹz sō ɪm pɹɪt

Epoch 1/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step - accuracy: 0.6968 - loss: 6.0391
Epoch 2/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.8635 - loss: 1.5212
Epoch 3/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.8632 - loss: 1.1273
Epoch 4/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.8606 - loss: 1.1163
Epoch 5/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.8660 - loss: 1.0329
Epoch 6/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.8621 - loss: 1.0008
Epoch 7/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.8595 - loss: 0.9857
Epoch 8/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.8586 - loss: 0.9652
Epoch 9/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━

In [102]:
vocab = vectorize_layer.get_vocabulary()
vocab

['',
 '[UNK]',
 'ju',
 'ī',
 'tu',
 'ʌ',
 'ðʌ',
 'nō',
 'ʌnd',
 'ðæt',
 'mi',
 'jʊɹ',
 'ɪt',
 'ʌv',
 'wʌt',
 'wɪð',
 'ɡō',
 'līk',
 'ɪm',
 'ʤʌst',
 'nɑt',
 'mī',
 'ət',
 'du',
 'hæv',
 'fɔɹ',
 'bʌt',
 'hɝ',
 'ɪz',
 'ðɪs',
 'hi',
 'sō',
 'ɡɛt',
 'dɑnt',
 'ɪn',
 'ɪts',
 'ʌp',
 'ʃi',
 'ɑn',
 'wi',
 'θɪŋk',
 'wɑz',
 'bi',
 'hu',
 'ɪf',
 'ɡɑt',
 'ɑɹ',
 'ðɛn',
 'ðæts',
 'ðɛɹ',
 'æt',
 'wʌn',
 'wɝ',
 'wɑnt',
 'nɛvɝ',
 'hɪm',
 'wɛn',
 'wī',
 'sɪstɝ',
 'jɔɹ',
 'hə',
 'dāt',
 'ɡōɪŋ',
 'ɔl',
 'kænt',
 'bɪkɔz',
 'sʌmθɪŋ',
 'nid',
 'lɛt',
 'kʊd',
 'kæn',
 'hɛz',
 'hæd',
 'dɪd',
 'dædi',
 'θɔt',
 'ʌbət',
 'ɡɝl',
 'ōkā',
 'æn',
 'sā',
 'pɑɹti',
 'dɪdnt',
 'ɹīt',
 'ɡʊd',
 'ɔlwāz',
 'ðā',
 'jʌ',
 'ʤōi',
 'ʃʊɹ',
 'ɹɪli',
 'ɡī',
 'ō',
 'īv',
 'wɛɹ',
 'wɛl',
 'stɑɹt',
 'skul',
 'pipʌl',
 'nīt',
 'hiɹ',
 'ɡɛs',
 'ɛni',
 'ɔɹ',
 'wʌts',
 'tɛl',
 'tōld',
 'sʌm',
 'si',
 'nu',
 'lʊk',
 'ivɪn',
 'hā',
 'hæz',
 'dʌz',
 'duɪŋ',
 'bī',
 'ōnli',
 'wʊd',
 'tāk',
 'sʌmwʌn',
 's',
 'nə',
 'nɔɹmʌl',
 'mīnd',
 'kʌmplitl

In [103]:
import re
import tensorflow as tf
import numpy as np
# Given previously typed words, predict potential next word
def predict_next_words(text):
    
    # Predict probabilities for the next word
    probabilities = model.predict(text) # type: ignore
    
    return probabilities
def predict_user_input_with_prefix(input_text, top_k=5):
    """
    Predict the next phoneme or word based on partial user input, using a prefix match.

    Args:
        input_text (str): The text input from the user.
        top_k (int): Number of top predictions to return.
    
    Returns:
        List[str]: A list of the top-k predicted phonemes or words matching the input prefix.
    """
    # Tokenize and get the last entered word (prefix)
    words = input_text.split()

    # Check if the last character is a space, meaning a new word is expected
    if input_text and input_text[-1] == " ":
        prefix = ""  # Empty prefix means we predict any likely next word
    else:
        prefix = words[-1] if words else ""
    
    # Vectorize the input sequence (without the prefix if one exists)
    input_sequence = vectorize_layer([" ".join(words[:-1]) if prefix else input_text])

    # Predict the next word probabilities
    predictions = model.predict(input_sequence, verbose=0)

    # Get the vocabulary and filter by prefix
    vocab = vectorize_layer.get_vocabulary()
    top_k_indices = np.argsort(predictions[0][-1])[::-1]  # Sorted in descending order

    if prefix:
        # If there is a prefix, filter predictions by it
        prefix_pattern = re.compile(f"^{re.escape(prefix)}")
        matching_predictions = [vocab[idx] for idx in top_k_indices if prefix_pattern.match(vocab[idx])]
    else:
        # If no prefix, return the top-k predictions directly
        matching_predictions = [vocab[idx] for idx in top_k_indices]
    
    # Return up to top_k matching predictions
    return matching_predictions[:top_k]


# Example of using the new prediction function
user_input = "ʃʊɹ h"
predictions = predict_user_input_with_prefix(user_input, top_k=6)
print("Top predictions:", predictions)


Top predictions: ['hæv', 'hɝ', 'hi', 'hɪm', 'hæd', 'hu']


In [104]:
display(transcriptions)

[['DH', 'EY', ' ', 'D', 'UW', ' ', 'N', 'AA', 'T'],
 ['DH', 'EY', ' ', 'D', 'UW', ' ', 'T', 'UW'],
 ['AY', ' ', 'HH', 'OW', 'P', ' ', 'S', 'OW'],
 ['SH', 'IY', ' ', 'OW', 'K', 'EY'],
 ['L', 'EH', 'T', 'S', ' ', 'G', 'OW'],
 ['W', 'AW'],
 ['OW',
  'K',
  'EY',
  ' ',
  'Y',
  'UH',
  'R',
  ' ',
  'G',
  'AA',
  'N',
  'AH',
  ' ',
  'N',
  'IY',
  'D',
  ' ',
  'T',
  'UW',
  ' ',
  'L',
  'ER',
  'N',
  ' ',
  'HH',
  'AW',
  ' ',
  'T',
  'UW',
  ' ',
  'L',
  'AY'],
 ['N', 'OW'],
 ['IH',
  'M',
  ' ',
  'K',
  'IH',
  'D',
  'IH',
  'NG',
  ' ',
  'Y',
  'UW',
  ' ',
  'N',
  'OW',
  ' ',
  'HH',
  'AW',
  ' ',
  'S',
  'AH',
  'M',
  'T',
  'AY',
  'M',
  'Z',
  ' ',
  'Y',
  'UW',
  ' ',
  'JH',
  'AH',
  'S',
  'T',
  ' ',
  'B',
  'IH',
  'K',
  'AH',
  'M',
  ' ',
  'DH',
  'IH',
  'S',
  ' ',
  'P',
  'ER',
  'S',
  'OW',
  'N',
  'AH',
  ' ',
  'AH',
  'N',
  'D',
  ' ',
  'Y',
  'UW',
  ' ',
  'D',
  'AA',
  'N',
  'T',
  ' ',
  'N',
  'OW',
  ' ',
  'HH',
  'AW',
  ' ',
  '