In [None]:
import whisper
import gradio as gr
import random
import re

from integrated_gradient import load_models, attribution_score, get_smooth_framewise_error, get_feature, evaluate
from utils import get_sentences, get_pronunciation, get_target_words, get_padded_words

In [None]:
# Words a practice sentence can be built around when the user has not picked one.
vocab_db = [
    "model",
    "algorithm",
    "data",
    "feature",
    "training",
    "loss",
    "accuracy",
    "classification",
    "regression",
    "network",
]

# Placeholder row shown in the examples list before any recording is analysed.
init_examples = ["None"]

# Sentence the learner is currently asked to read, plus its word list.
target_sentence = "This was a curious coincidence."
target_words = get_target_words(target_sentence)

# Word currently selected for practice; "" means nothing has been selected yet.
practice_word = ""

# Speech recogniser used for word-level transcription.
whisper_model = whisper.load_model("tiny.en")

# Feature extractor / classifier pair used for pronunciation scoring.
upstream_model, downstream_model, device = load_models(isWav=True)

In [None]:
def _align_word_timings(padded_spoken, cleaned_spoken, words):
    """Pair each entry of the (possibly padded) transcript with its Whisper word dict.

    Walks ``padded_spoken`` in order while consuming ``words`` sequentially: an
    entry equal (case-insensitively) to the next unconsumed cleaned word gets
    that word's dict; any other entry is treated as a padding placeholder and
    gets ``None``.

    NOTE(review): assumes ``get_padded_words`` keeps the real words in their
    original order and only inserts placeholders -- confirm against utils.
    """
    timings = []
    cursor = 0
    for entry in padded_spoken:
        if cursor < len(words) and entry.lower() == cleaned_spoken[cursor].lower():
            timings.append(words[cursor])
            cursor += 1
        else:
            timings.append(None)
    return timings


def analyze_audio(audio_path):
    """Transcribe a recording, compare it with the target sentence and flag weak words.

    This is a generator so the UI can be updated in two steps:

    1. Right after transcription: tokens highlighting words that differ from
       the target sentence, together with an empty example list.
    2. After pronunciation scoring: either a congratulation message (when the
       classifier predicts class 1) or tokens that additionally mark words
       whose time span contains an attribution peak, together with the list
       of words worth practising.

    Args:
        audio_path: Path of the recorded audio file, or ``None`` when nothing
            was recorded.

    Yields:
        ``(tokens, gr.Dataset)`` tuples, where ``tokens`` is a list of
        ``(text, label)`` pairs with label in
        ``{'Target', 'Transcribed', 'Pronunciation', None}``.
    """
    if not audio_path:
        # Submit was pressed without a recording: nothing to analyse.
        yield [], gr.Dataset(samples=[])
        return

    result = whisper_model.transcribe(audio_path, word_timestamps=True)
    words = [w for seg in result["segments"] for w in seg["words"]]
    cleaned_words = [re.sub(r'[^\w]', '', w["word"]) for w in words]

    # Work on copies under new local names. Assigning to `target_words` here
    # would make it a function-local name (UnboundLocalError on first read),
    # and padding must never leak into the module-level target.
    expected_words = list(target_words)
    spoken_words = list(cleaned_words)
    print(f"Target: {expected_words}\nTranscribed: {spoken_words}")
    if len(spoken_words) != len(expected_words):
        print(
            "Mismatch between transcribed words length and target sentence\n"
            f"Target: {len(expected_words)}\n"
            f"Transcribed: {len(words)}"
        )
        expected_words, spoken_words = get_padded_words(expected_words, spoken_words)

    # Iterate over the padded pairs, not over `words`, so padded entries are
    # neither dropped nor indexed out of range.
    pairs = list(zip(expected_words, spoken_words))

    tokens = []
    missed_words = []
    for expected, spoken in pairs:
        if spoken.lower() != expected.lower():
            tokens.append((expected, 'Target'))
            tokens.append((spoken, 'Transcribed'))
            missed_words.append(expected.lower())
        else:
            tokens.append((spoken, None))
        tokens.append((" ", None))

    print("transcribe result:", result)
    # First update: show the transcription diff while scoring is still running.
    yield tokens, gr.Dataset(samples=[])

    feature = get_feature(audio_path, upstream_model, device, isWav=True)
    predicted, _ = evaluate(downstream_model, feature, device)

    if predicted == 1:
        feedback = 'Your pronunciation is very accurate, and you are recognized as a native speaker!'
        # A concrete list with a single token, not a generator over characters.
        yield [(feedback, None)], gr.Dataset(samples=[[s] for s in missed_words])
        return

    attributions, _ = attribution_score(feature, downstream_model)
    frame_errors, peaks = get_smooth_framewise_error(attributions)
    print(f"frame_errors len: {len(frame_errors)}")

    # NOTE(review): 0.02 presumably converts frame indices to seconds
    # (20 ms per frame) -- confirm against the upstream model.
    bad_s = peaks * 0.02
    print("bad_windows:", bad_s)

    timings = _align_word_timings(spoken_words, cleaned_words, words)

    tokens = []
    bad_words = []
    for (expected, spoken), timing in zip(pairs, timings):
        if spoken.lower() != expected.lower():
            tokens.append((expected, 'Target'))
            tokens.append((spoken, 'Transcribed'))
            bad_words.append(expected.lower())
        elif timing is not None and any(
            timing["start"] <= peak <= timing["end"] for peak in bad_s
        ):
            tokens.append((spoken, 'Pronunciation'))
            # Lower-cased so it matches how mismatched words are stored.
            bad_words.append(expected.lower())
        else:
            tokens.append((spoken, None))
        tokens.append((" ", None))

    print(f"{len(bad_words)} problematic words collected")

    yield tokens, gr.Dataset(samples=[[s] for s in bad_words])

In [62]:
def update_sentence():
    """Pick the next sentence to read aloud and return the prompt markdown.

    Uses the currently selected ``practice_word``; when none is selected a
    random word from ``vocab_db`` is used for this call only. The random
    fallback is deliberately not stored in ``practice_word``, so each click
    without a selection can draw a different word.

    Updates the module-level ``target_sentence`` and ``target_words``. When
    no sentence is returned for the word, the current target is kept.

    Returns:
        Markdown text asking the user to read ``target_sentence`` aloud.
    """
    global target_sentence, target_words
    word = practice_word if practice_word else random.choice(vocab_db)
    sentences = get_sentences(word)
    if sentences:
        target_sentence = sentences[0]
        target_words = get_target_words(target_sentence)
    else:
        # Avoid an IndexError on an empty result; keep the previous sentence.
        print(f"No practice sentence found for {word!r}; keeping the current one.")
    return f"**Please read aloud the following sentence:**\n\n## {target_sentence}"

In [15]:
def update_components(selected_string):
    """Select a word for practice and look up its pronunciation.

    Stores ``selected_string`` in the module-level ``practice_word`` and
    queries ``get_pronunciation`` for it.

    Args:
        selected_string: The word chosen in the UI.

    Returns:
        A ``(display_text, mp3_path)`` tuple, where ``display_text`` is
        ``"<word>: <phonetic>"``.
    """
    global practice_word
    practice_word = selected_string

    pronunciation = get_pronunciation(selected_string)
    phonetic, mp3_path = pronunciation
    print(phonetic, mp3_path)

    return f"{selected_string}: {phonetic}", mp3_path

In [None]:
# Reset the practice target to the default sentence whenever this cell runs.
target_sentence = "This was a curious coincidence."
target_words = get_target_words(target_sentence)

with gr.Blocks() as demo:
    # Prompt shown above both columns; replaced when "Practice" is clicked.
    markdown = gr.Markdown(
        f"**Please read aloud the following sentence:**\n\n## {target_sentence}"
    )

    with gr.Row():
        # Left column: record, submit, and view the highlighted result.
        with gr.Column(scale=1):
            audio = gr.Audio(
                sources=["microphone"],
                type="filepath",
                label="Record Audio",
            )
            submit_btn = gr.Button("Submit", size='sm')
            highlighted = gr.HighlightedText(
                label="Result",
                combine_adjacent=True,
                show_legend=True,
                color_map={
                    "Target": "green",
                    "Transcribed": "red",
                    "Pronunciation": "yellow",
                },
            )

        # Right column: dictionary lookup for a selected word.
        with gr.Column(scale=1):
            text_input = gr.Textbox(label="Cambridge Dictionary")
            audio_component = gr.Audio(label="Audio", type="filepath")
            examples = gr.Examples(
                examples=[[s] for s in init_examples],
                inputs=text_input,
                outputs=[text_input, audio_component],
                fn=update_components,
                run_on_click=True,
            )
            # Clears the recording; the click handler below also picks a new sentence.
            next_btn = gr.ClearButton(components=audio, value='Practice', size='sm')

    # Event wiring.
    submit_btn.click(
        fn=analyze_audio,
        inputs=audio,
        outputs=[highlighted, examples.dataset],
    )
    next_btn.click(fn=update_sentence, inputs=[], outputs=[markdown])

# NOTE(review): share=True requests a public share link -- confirm this is intended.
demo.launch(share=True)