Library

In [42]:
import speech_recognition as sr
import difflib

from g2p_en import G2p
from colorama import Fore

Source

In [2]:
sentence = "london the capital city of the united kingdom is a vibrant metropolis rich in history and culture Known as the square mile the city of london is the historic core where the romans first established londinium today it's a major business and financial center housing the Bank of England the royal exchange and the london stock exchange despite its modern skyscrapers like the gherkin and the walkie talkie london retains its historical charm with landmarks such as tower of london the city's boundaries have remained nearly unchanged since medieval times making it a unique blend of ancient and contemporary with a small resident population but a bustling daytime workforce the city is always alive with activity reflecting its status as one of the world's leading financial hubs."

In [3]:
audio_file = "E:\Perkuliahan\Semester 4\Pembelajaran Mesin\Tugas Membuat Makalah\Speech-to-text\Data Suara\Data 8.wav"

Function to get input from Speech-to-Text

In [4]:
def get_speech_to_text(file_path):
    recognizer = sr.Recognizer()
    with sr.AudioFile(file_path) as source:
        audio_data = recognizer.record(source)
        try:
            text = recognizer.recognize_google(audio_data, language="en-us", show_all=False)
            return text.lower()
        except sr.UnknownValueError as e:
            print("Speech recognition could not understand audio")
            return ""
        except sr.RequestError as e:
            print("Error from Google Speech Recognition service")
            return ""

Function to convert Text to ARPAbet

In [5]:
g2p = G2p()
def text_to_arpabet(text):
    phonemes = g2p(text)
    return phonemes

Function to Compare word based on ARPAbet

In [6]:
def compare_pronunciations(word1, word2):
    phonemes1 = text_to_arpabet(word1)
    phonemes2 = text_to_arpabet(word2)
    return phonemes1 == phonemes2

Function Error Detection and Correction

In [44]:
def Error_Detection_and_Correction(reference_sentence, user_sentence):
    reference_words = reference_sentence.split()
    user_words = user_sentence.split()

    difference = difflib.ndiff(reference_words, user_words)
    matcher = difflib.SequenceMatcher(None, reference_words, user_words)

    # Look for the difference between user words and reference words
    diff = []
    for word in difference:
        if word.startswith('- '):
            diff.append(f"Reference missing: {word[2:]}")
        elif word.startswith('+ '):
            diff.append(f"Extra in test: {word[2:]}")
        elif word.startswith('? '):
            pass  # This line shows markers for different characters, we can ignore it
    
    # Calculate Errors
    errors = 0
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag != 'equal':
            errors += max(i2 - i1, j2 - j1)

    # Correct the user sentence
    corrected_sentence = []
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == 'equal':
            corrected_sentence.extend(user_words[j1:j2])
        elif tag == 'replace':
            corrected_sentence.extend(reference_words[i1:i2])
        elif tag == 'insert':
            corrected_sentence.extend(reference_words[i1:i2])
        elif tag == 'delete':
            continue

    return diff, errors, corrected_sentence


In [16]:
def detect_pronunciation_error(provided_sentences, user_sentences):
    refernce_words = provided_sentences.split()
    user_words = user_sentences.split()
    result = []
    for reference, user in zip(refernce_words, user_words):
        correct = compare_pronunciations(reference, user)
        if not correct:
            result.append({
                'ref' : Fore.WHITE + reference,
                'user' : Fore.RED + user,
                'Note' : False
            })
        else:
            result.append({
                'ref' : Fore.WHITE + reference,
                'user' : Fore.GREEN + user,
                'Note' : True
            })
    return result

Error Detection

In [27]:
user_words = get_speech_to_text(audio_file)
results = detect_pronunciation_error(sentence, user_words)

for result in results:
    print(f"{result['ref']} <-> {result['user']}")

[37mlondon <-> [32mlondon
[37mthe <-> [32mthe
[37mcapital <-> [32mcapital
[37mcity <-> [32mcity
[37mof <-> [32mof
[37mthe <-> [32mthe
[37munited <-> [32munited
[37mkingdom <-> [32mkingdom
[37mis <-> [32mis
[37ma <-> [32ma
[37mvibrant <-> [31mfire
[37mmetropolis <-> [31mmetabolic
[37mrich <-> [31mrate
[37min <-> [32min
[37mhistory <-> [32mhistory
[37mand <-> [32mand
[37mculture <-> [32mculture
[37mKnown <-> [31mknow
[37mas <-> [32mas
[37mthe <-> [32mthe
[37msquare <-> [32msquare
[37mmile <-> [32mmile
[37mthe <-> [32mthe
[37mcity <-> [32mcity
[37mof <-> [32mof
[37mlondon <-> [32mlondon
[37mis <-> [32mis
[37mthe <-> [32mthe
[37mhistoric <-> [31mhistorical
[37mcore <-> [31mwhere
[37mwhere <-> [31mthe
[37mthe <-> [31mromans
[37mromans <-> [31mfirst
[37mfirst <-> [31mestablished
[37mestablished <-> [31mlondon
[37mlondinium <-> [31myou
[37mtoday <-> [32mtoday
[37mit's <-> [32mit's
[37ma <-> [32ma
[37mmajor <-> [32

Error Correction

In [40]:
Correct_words = []
Wrong_words = []
Error = 0
for result in results:
    Wrong_words.append(result['user'])
    if (result['Note'] == False):
        Correct_words.append(result['ref'])
        Error += 1
    else:
        Correct_words.append(result['user'])
print(Fore.WHITE + "Wrong Words : " + ' '.join(Wrong_words))
print(Fore.WHITE + "Correct Words : " + ' '.join(Correct_words) + '\n')
print(f"Error Precentage from detection : {(Error/len(Correct_words)*100)} %")

[37mWrong Words : [32mlondon [32mthe [32mcapital [32mcity [32mof [32mthe [32munited [32mkingdom [32mis [32ma [31mfire [31mmetabolic [31mrate [32min [32mhistory [32mand [32mculture [31mknow [32mas [32mthe [32msquare [32mmile [32mthe [32mcity [32mof [32mlondon [32mis [32mthe [31mhistorical [31mwhere [31mthe [31mromans [31mfirst [31mestablished [31mlondon [31myou [32mtoday [32mit's [32ma [32mmajor [32mbusiness [32mand [32mfinancial [32mcenter [31mhosting [32mthe [32mbank [32mof [32mengland [32mthe [32mroyal [32mexchange [32mand [31mlondon [31mstock [31mexchange [31mdespite [31mhis [31mmother [31mand [31msky [31mcharacters [31mlike [31mthe [31mhurricane [31mand [31mthe [31mwalking [31mdistance [31mfrom [31mtower [31mof [31mlondon [31mthe [31mcity [31mmissouri [31mweather
[37mCorrect Words : [32mlondon [32mthe [32mcapital [32mcity [32mof [32mthe [32munited [32mkingdom [32mis [32ma [37mvibrant [37mmetr

Error Detection and Correction

In [45]:
user_words = get_speech_to_text(audio_file)
Error_words, error, corrected_sentence = Error_Detection_and_Correction(sentence, user_words)

for word in Error_words:
    print(word)
print('\n')
print(f"Number of errors: {(error/len(sentence.split()))*100} %")
print("Corrected sentence:", corrected_sentence)

Reference missing: vibrant
Reference missing: metropolis
Reference missing: rich
Extra in test: fire
Extra in test: metabolic
Extra in test: rate
Reference missing: Known
Extra in test: know
Reference missing: historic
Extra in test: historical
Reference missing: core
Reference missing: londinium
Extra in test: london
Extra in test: you
Reference missing: housing
Extra in test: hosting
Reference missing: Bank
Extra in test: bank
Reference missing: England
Extra in test: england
Reference missing: the
Reference missing: its
Reference missing: modern
Reference missing: skyscrapers
Extra in test: his
Extra in test: mother
Extra in test: and
Extra in test: sky
Extra in test: characters
Reference missing: gherkin
Extra in test: hurricane
Reference missing: walkie
Extra in test: walking
Extra in test: distance
Extra in test: from
Reference missing: talkie
Reference missing: london
Reference missing: retains
Reference missing: its
Reference missing: historical
Reference missing: charm
Referen