<a href="https://colab.research.google.com/github/sheikmohdimran/Experiments_2021/blob/main/Hackathon_Final_Demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Dependencies

In [13]:
#@title Install and Import Dependencies

# this assumes that you have a relevant version of PyTorch installed
!pip install -qqq omegaconf ipymarkup pydub torchaudio fastcore gradio

In [14]:
#@title Imports
# Audio sampling rate shared by the notebook: passed to the voice-activity detector
# and to text-to-speech in transcribe(), and returned alongside the synthesized audio.
SAMPLING_RATE = 16000

import torch
# Limit PyTorch to a single thread for intra-op CPU parallelism.
# NOTE(review): presumably copied from the upstream Silero examples — confirm it is still wanted.
torch.set_num_threads(1)

from IPython.display import Audio
from pprint import pprint
import torch
import zipfile
import torchaudio
from glob import glob
import difflib
import re
from ipymarkup import show_span_box_markup
import torch
from fastcore.basics import patch_to
import soundfile as sf
import gradio as gr
import numpy as np



In [15]:
#@title Model & helper functions - Speech to Text

# Voice Activity Detector
# Loads the 'silero_vad' entry point of the snakers4/silero-vad repo through torch.hub.
# force_reload=True asks torch.hub for a fresh download instead of reusing the cached
# copy, so this cell needs network access every time it runs.
# NOTE(review): onnx=False presumably selects the PyTorch (non-ONNX) model — confirm.
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad',
                              force_reload=True,
                              onnx=False)

# Unpack the helper callables returned with the VAD model. transcribe() uses
# read_audio and get_speech_timestamps; the other three are not referenced in the
# cells visible in this notebook.
(get_speech_timestamps,
 save_audio,
 read_audio,
 VADIterator,
 collect_chunks) = utils

 
# Speech to Text
# Loads the English 'silero_stt' entry point of snakers4/silero-models on the CPU.
# The hub call returns the model, a decoder (applied to the model output in
# transcribe()) and a tuple of utility functions.
device = torch.device('cpu')  # gpu also works, but our models are fast enough for CPU
model_stt, decoder, utils_stt = torch.hub.load(repo_or_dir='snakers4/silero-models',
                                       model='silero_stt',
                                       language='en', # also available 'de', 'es'
                                       device=device)
# Only the fourth utility, prepare_model_input, is kept; the first three are discarded.
(_, _, _, prepare_model_input) = utils_stt  # see function signature for details

Downloading: "https://github.com/snakers4/silero-vad/archive/master.zip" to /root/.cache/torch/hub/master.zip
Using cache found in /root/.cache/torch/hub/snakers4_silero-models_master


# Pronunciation checker

In [16]:
#@title Helper functions
def tokenize(s):
    """Split ``s`` into tokens on runs of whitespace.

    Leading or trailing whitespace yields an empty-string token at that end,
    exactly as ``re.split`` reports it.

    Args:
        s: The text to split.

    Returns:
        A list of token strings.
    """
    # Raw string: '\s' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    return re.split(r'\s+', s)
def untokenize(ts):
    """Join the tokens in ``ts`` into one string, separated by single spaces."""
    return str.join(' ', ts)

# Compare ground truth to transcribed text        
def equalize(s1, s2):
    """Align two strings token by token.

    Both inputs are tokenized with ``tokenize``. Tokens that
    ``difflib.SequenceMatcher`` reports as matching are copied to both outputs.
    A token present in only one input is kept in that input's output and is
    represented in the other output by a run of ``_`` of the same length.

    Returns:
        An ``(aligned_s1, aligned_s2)`` tuple of strings built with ``untokenize``.
    """
    tokens_a = tokenize(s1)
    tokens_b = tokenize(s2)
    aligned_a, aligned_b = [], []
    # End of the previous matching block inside each token list.
    a_pos = b_pos = 0
    matcher = difflib.SequenceMatcher(a=tokens_a, b=tokens_b)
    for block in matcher.get_matching_blocks():
        # Tokens found only in s1: pad the s2 side, keep them on the s1 side.
        only_a = tokens_a[a_pos:block.a]
        aligned_b.extend('_' * len(tok) for tok in only_a)
        aligned_a.extend(only_a)
        # Tokens found only in s2: pad the s1 side, keep them on the s2 side.
        only_b = tokens_b[b_pos:block.b]
        aligned_a.extend('_' * len(tok) for tok in only_b)
        aligned_b.extend(only_b)
        # The matching run itself goes to both sides unchanged.
        aligned_a.extend(tokens_a[block.a:block.a + block.size])
        aligned_b.extend(tokens_b[block.b:block.b + block.size])
        a_pos = block.a + block.size
        b_pos = block.b + block.size
    return untokenize(aligned_a), untokenize(aligned_b)

# Identify misspoken words
def find_spans(prediction, ground_truth):
  """Identify the ground-truth words that the prediction got wrong.

  The two texts are aligned with ``equalize``. Every aligned position whose
  ground-truth token is a real word (not ``_`` padding) and differs from the
  prediction token is reported as a misspoken word.

  Args:
    prediction: The transcribed text.
    ground_truth: The reference text.

  Returns:
    A ``(wrong_list, spans)`` tuple. ``wrong_list`` holds the missed
    ground-truth words in reading order. ``spans`` holds one
    ``[start, end, '❌']`` entry per missed word, with character offsets
    into ``ground_truth``.
  """
  aligned_pred, aligned_truth = equalize(prediction, ground_truth)

  wrong_list = []
  spans = []
  # Offset in ground_truth just past the last word located so far. Searching
  # from here pins each span to the exact occurrence of the word, instead of
  # the first substring match anywhere in the text.
  cursor = 0
  for pred_tok, truth_tok in zip(aligned_pred.split(), aligned_truth.split()):
    if not truth_tok.strip("_"):
      # Pure '_' padding: stands for a word that exists only in the prediction.
      continue
    # Plain substring search, so punctuation in a word is never read as regex syntax.
    start = ground_truth.find(truth_tok, cursor)
    found = start != -1
    end = start + len(truth_tok)
    if found:
      cursor = end
    if pred_tok != truth_tok:
      wrong_list.append(truth_tok)
      if found:
        spans.append([start, end, '❌'])

  return wrong_list, spans

# Create list of tuples for gradio display
def create_highlights(ground_truth,wrong_list):
  """Build the (text, label) pairs shown by the highlighted-text output.

  ``ground_truth`` is split on single spaces. Each word becomes a pair labelled
  ``'DET'`` when it is in ``wrong_list`` and ``None`` otherwise, and every word
  is followed by a ``(" ", "")`` separator pair.
  """
  pairs = []
  for word in ground_truth.split(" "):
    label = 'DET' if word in wrong_list else None
    pairs.append((word, label))
    pairs.append((" ", ""))
  return pairs

# Patch function from library to save file
@patch_to(gr.processing_utils)
def audio_to_file(sample_rate, data, filename):
  """Write ``data`` to ``filename`` at ``sample_rate`` using ``soundfile``.

  Attached to ``gr.processing_utils`` under the name ``audio_to_file`` by the
  ``patch_to`` decorator.
  """
  sf.write(file=filename, data=data, samplerate=sample_rate)


# Pronunciation helper

In [17]:
#@title Model - Text to Speech
# Text-to-speech settings passed to the hub entry point below.
language = 'en'
speaker = 'lj_v2'
# NOTE(review): sample_rate is not referenced by the visible cells; transcribe()
# passes SAMPLING_RATE (same value) to apply_tts instead — confirm it can go.
sample_rate = 16000
# Rebinds the `device` name from the speech-to-text cell to the same CPU device.
device = torch.device('cpu')

# Loads the 'silero_tts' entry point of snakers4/silero-models for the chosen
# language and speaker; the call returns the model and an example text.
model_tts, example_text = torch.hub.load(repo_or_dir='snakers4/silero-models',
                                     model='silero_tts',
                                     language=language,
                                     speaker=speaker)
model_tts.to(device)  # gpu or cpu

Using cache found in /root/.cache/torch/hub/snakers4_silero-models_master


In [18]:
# Reference sentence the user is asked to read aloud: shown in the interface
# description and compared against the transcript inside transcribe().
ground_truth ='this is a sample test for speech to text models'

def transcribe(file):
    """Transcribe a recording and report the words that were misread.

    The audio is split into speech segments by the voice-activity detector,
    each segment is run through the speech-to-text model, and the joined
    transcript is compared with the module-level ``ground_truth``. The misread
    words are then synthesized with the text-to-speech model.

    Args:
        file: Path of the recorded audio file.

    Returns:
        A ``(highlights, (SAMPLING_RATE, audio))`` tuple. ``highlights`` is the
        list produced by ``create_highlights``; ``audio`` is a NumPy array with
        the synthesized misread words concatenated, or a short silent buffer
        when nothing was misread.
    """
    waveform = read_audio(file)
    speech_timestamps = get_speech_timestamps(waveform, model, sampling_rate=SAMPLING_RATE)

    segments = []
    for stamp in speech_timestamps:
        chunk = waveform[stamp['start']: stamp['end']].unsqueeze(0)
        model_input = prepare_model_input(chunk, device=device)
        decoded = decoder(model_stt(model_input).squeeze().cpu())
        segments.append(decoded)

    # Compare the whole utterance, not just the first segment: a pause while
    # reading splits the audio into several segments. With no detected speech
    # this is '' (every word is then reported as missed) rather than an
    # IndexError.
    transcript = ' '.join(segments)
    print(transcript)

    wrongs, _spans = find_spans(transcript, ground_truth)
    highlights = create_highlights(ground_truth, wrongs)
    print(wrongs)

    if not wrongs:
        # Perfect reading: nothing to synthesize, and np.hstack([]) would raise.
        # NOTE(review): float32 silence assumed to be acceptable to the audio
        # output — confirm against the TTS output dtype.
        silence = np.zeros(SAMPLING_RATE // 10, dtype=np.float32)
        return highlights, (SAMPLING_RATE, silence)

    audio = model_tts.apply_tts(texts=wrongs, sample_rate=SAMPLING_RATE)
    audio_np = [clip.numpy() for clip in audio]
    return highlights, (SAMPLING_RATE, np.hstack(audio_np))

In [19]:
# Gradio UI: one microphone recording in (handed to transcribe() as a file path),
# two outputs matching transcribe()'s return value — the highlighted reference
# sentence and the synthesized audio of the misread words.
iface = gr.Interface(
    fn=transcribe, 
    inputs=[
        gr.inputs.Audio(source="microphone", type='filepath')
    ],
    outputs=[
             # color_map only maps the empty label to an empty color; the 'DET'
             # label produced by create_highlights has no entry here.
             # NOTE(review): presumably 'DET' falls back to a default color — confirm.
             gr.outputs.HighlightedText(color_map={ "": "", }), #"text",
             "audio"

    ],
    layout="horizontal",
    theme="huggingface",
    title="NUHA - Your personal reading assistant",
    description="Please read this: "+ground_truth
)

# debug=True keeps this cell running so errors and logs stay visible (see the
# launch message in the cell output); interrupt the kernel to stop the server.
iface.launch(debug=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Your interface requires microphone or webcam permissions - this may cause issues in Colab. Use the External URL in case of issues.
Running on public URL: https://30628.gradio.app

This share link expires in 72 hours. For free permanent hosting, check out Spaces (https://huggingface.co/spaces)


this is a sample test force speech to text models
['for']
Keyboard interruption in main thread... closing server.


(<fastapi.applications.FastAPI at 0x7ff78b74c410>,
 'http://127.0.0.1:7860/',
 'https://30628.gradio.app')