# Installing required packages

In [27]:
!pip install eng-to-ipa
!pip install transformers
!pip install nltk
!pip install metaphone
!pip install librosa==0.8.1

Collecting librosa==0.8.1
  Downloading librosa-0.8.1-py3-none-any.whl (203 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m203.8/203.8 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Collecting resampy>=0.2.2 (from librosa==0.8.1)
  Downloading resampy-0.4.2-py3-none-any.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m70.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: resampy, librosa
  Attempting uninstall: librosa
    Found existing installation: librosa 0.10.0.post2
    Uninstalling librosa-0.10.0.post2:
      Successfully uninstalled librosa-0.10.0.post2
Successfully installed librosa-0.8.1 resampy-0.4.2


# Importing libraries

In [28]:
import time
import eng_to_ipa as ipa
import nltk
from metaphone import doublemetaphone

from IPython.display import Audio, display

import librosa

In [29]:
# Download the words dataset
# (the NLTK 'words' corpus; it is read later through nltk.corpus.words to
# build the candidate pool for phonetically similar words)
nltk.download('words')

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

## Helper Functions

In [30]:
def count_words(text):
    """Return how many whitespace-separated tokens ``text`` contains.

    Punctuation stays attached to its neighbouring token, so "Sunday's" and
    "animal." each count as one word. An empty or all-whitespace string
    yields 0.
    """
    # split() without arguments splits on runs of whitespace and drops empties
    return len(text.split())

## Load the Phoneme Recognizer from HuggingFace

In [31]:
from transformers import pipeline

# Load the model
# Builds a Hugging Face pipeline from the named checkpoint. Later cells call
# `pipe` with the path of a .wav file and read the predicted phoneme string
# from the 'text' key of the returned dict.
pipe = pipeline(model="vitouphy/wav2vec2-xls-r-300m-timit-phoneme")


## Experiment - 1

In [44]:
# Reference sentence for the multi-word experiment (adjacent literals are
# concatenated by Python into one string)
text = (
    "The lion holds the top spot as my favorite animal. "
    "During our last Sunday's safari, we had the incredible opportunity "
    "to witness a pride of lions gracefully roaming the landscape."
)

# Reference IPA transcription the model's prediction is compared against
ipa_transcription = ipa.convert(text)


In [47]:
# Decode the recording into a waveform and its sampling rate
test_1_file = "/content/drive/MyDrive/TextAnalyticsProject/Supplementary Materials/Test-1.wav"
audio, sample_rate = librosa.load(test_1_file)

# Embed an audio player in the notebook so the clip can be listened to
player = Audio(audio, rate=sample_rate)
display(player)

In [48]:
# Record the inference time.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() follows the system clock and can jump if
# that clock is adjusted mid-measurement.
start_time = time.perf_counter()

# Run the phoneme recognizer on the audio file; the audio is processed in
# 10 s chunks with a (4 s, 2 s) stride so long recordings can be handled
output = pipe(test_1_file , chunk_length_s=10, stride_length_s=(4, 2))

end_time = time.perf_counter()

# Elapsed seconds spent in the inference call
time_taken = end_time - start_time

In [49]:
# Show the input text, the reference transcription and the model prediction
# side by side; each label/value pair is printed exactly as before
for label, value in (
    ("The text: ", text),
    ("The actual IPA transcription: ", ipa_transcription),
    ("The predicted IPA transcription: ", output['text']),
):
    print(label, value)


The text:  The lion holds the top spot as my favorite animal. During our last Sunday's safari, we had the incredible opportunity to witness a pride of lions gracefully roaming the landscape.
The actual IPA transcription:  ðə laɪən hoʊldz ðə tɔp spɑt ɛz maɪ ˈfeɪvərɪt ˈænəməl. ˈdʊrɪŋ ɑr læst ˈsənˌdiz səˈfɑri, wi hæd ðə ˌɪnˈkrɛdəbəl ˌɑpərˈtunəti tɪ ˈwɪtnəs ə praɪd əv laɪənz ˈgreɪsfəli ˈroʊmɪŋ ðə ˈlænˌskeɪp.
The predicted IPA transcription:  ðəlaɪɪnhoʊl zðɪ tɑ ps pɑ t ɛz maɪfeɪv ɹɪɾænɪm l dɝɪŋɑɹ læs sən deɪsɪfɑɹi wihæ ðiɪn k ɹɛɾə b lɑ pɝ tunɪɾi tuwɪ nɪsə p ɹaɪɾɪv laɪɪn z g ɹeɪs f liɹoʊmɪŋðəlæn s keɪ p


**Analysis:**
- We can see that the phoneme characters are predicted correctly, but the word breaks and sentence breaks are not identified correctly. The model tends to club phonemes of different words together.

- This is because the presence of the "h#" symbol as a sentence boundary marker leads to most of the insertion- and deletion-type errors during phoneme recognition.

- Hence, we will use only one word for our application of pronunciation training.

In [50]:
# Report the total latency, then the per-word average for this transcript
word_count = count_words(text)
print(f"Time taken for inference for {word_count} words: {time_taken:.6f} seconds")
seconds_per_word = time_taken / word_count
print(f"Average time taken for one word {seconds_per_word:.6f} seconds")

Time taken for inference for 30 words: 11.537920 seconds
Average time taken for one word 0.384597 seconds


## Experiment - 2

In [51]:
# Single-word test case: the target word and its reference IPA transcription
# from eng_to_ipa, which later cells print and compare the prediction against.
text="Lion"
ipa_transcription = ipa.convert(text)

In [52]:
# Decode the single-word recording into a waveform and its sampling rate
test_2_file = "/content/drive/MyDrive/TextAnalyticsProject/Supplementary Materials/Lion.wav"
audio, sample_rate = librosa.load(test_2_file)

# Embed an audio player in the notebook so the clip can be listened to
player = Audio(audio, rate=sample_rate)
display(player)

In [53]:
# Record the inference time.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() follows the system clock and can jump if
# that clock is adjusted mid-measurement.
start_time = time.perf_counter()

# Run the phoneme recognizer on the single-word recording, with the same
# chunking settings as the first experiment so the timings are comparable
output = pipe(test_2_file, chunk_length_s=10, stride_length_s=(4, 2))

end_time = time.perf_counter()

# Elapsed seconds spent in the inference call
time_taken = end_time - start_time

In [54]:
# Show the target word, the reference transcription and the model prediction
# side by side; each label/value pair is printed exactly as before
for label, value in (
    ("The text: ", text),
    ("The actual IPA transcription: ", ipa_transcription),
    ("The predicted IPA transcription: ", output['text']),
):
    print(label, value)

The text:  Lion
The actual IPA transcription:  laɪən
The predicted IPA transcription:  laɪɛn


In [55]:
# Report the total latency, then the per-word average for the single word
word_count = count_words(text)
print(f"Time taken for inference for {word_count} words: {time_taken:.6f} seconds")
seconds_per_word = time_taken / word_count
print(f"Average time taken for one word {seconds_per_word:.6f} seconds")

Time taken for inference for 1 words: 1.010439 seconds
Average time taken for one word 1.010439 seconds


# Phonetically similar word generation

In [56]:
# Get the English word list from nltk
# (needs the 'words' corpus to have been downloaded); this list is the
# candidate pool scanned for phonetically similar words.
english_words = nltk.corpus.words.words()


In [57]:
# Compare the predicted phoneme string with the reference IPA transcription.
# This has to be a value comparison (==): `is` tests object identity, and two
# strings produced independently (model output vs. eng_to_ipa) are distinct
# objects even when their contents match, so the success branch could never run.
if output['text'] == ipa_transcription:
    print("You did a great job, you said it right!")

else:
    print("You got it slightly wrong, try speaking these words:")
    # Double Metaphone codes of the target word; index 0 is the primary code
    text_metaphone = doublemetaphone(text)

    # Initialize an empty list to store phonetically similar words
    similar_words = []

    # Iterate over all English words
    for word in english_words:
        # Keep the word if its primary metaphone code matches the target's
        # primary code (secondary codes are not compared)
        if text_metaphone[0] == doublemetaphone(word)[0]:
            # Add the current word to the list of similar sounding words
            similar_words.append(word)
            # Stop scanning as soon as five suggestions have been collected
            if len(similar_words) >= 5:
                break

    # Print the similar sounding words
    print(similar_words)

You got it slightly wrong, try speaking these words:
['lagna', 'lain', 'laine', 'lan', 'Lana']


**Analysis:**

- We can see that when we use multiple words/sentences, the inference time per word is ~0.38 seconds, but when we use only one word, the inference time per word is ~1 second. This suggests that each call carries a fixed overhead that is amortized over longer inputs.