<a href="https://colab.research.google.com/github/sheikmohdimran/Experiments_2021/blob/main/Speech_to_Text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Dependencies

In [None]:
#@title Install and Import Dependencies

# this assumes that you have a relevant version of PyTorch installed
!pip install -qqq omegaconf ipymarkup pydub torchaudio

In [40]:
#@title Imports
# Standard library
import zipfile
from glob import glob
from pprint import pprint

# Third-party
import torch
import torchaudio
from IPython.display import Audio

# Audio sampling rate in Hz, shared by the cells below.
SAMPLING_RATE = 16000

# Limit PyTorch to one thread for CPU intra-op parallelism.
torch.set_num_threads(1)

In [41]:
#@title Model & helper functions - Speech to Text

# Selects which build of the VAD model torch.hub loads. It was referenced below
# without ever being defined, which raised NameError on a fresh kernel.
# NOTE(review): setting this to True presumably requires `onnxruntime` — confirm.
USE_ONNX = False

# Voice Activity Detector
# force_reload=True re-downloads the hub repository on every run.
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad',
                              force_reload=True,
                              onnx=USE_ONNX)

(get_speech_timestamps,
 save_audio,
 read_audio,
 VADIterator,
 collect_chunks) = utils


# Speech to Text
device = torch.device('cpu')  # gpu also works, but our models are fast enough for CPU
model_stt, decoder, utils_stt = torch.hub.load(repo_or_dir='snakers4/silero-models',
                                       model='silero_stt',
                                       language='en', # also available 'de', 'es'
                                       device=device)
# Only the fourth helper (input preparation) is needed in this notebook.
(_, _, _, prepare_model_input) = utils_stt  # see function signature for details


# Function to record audio

def record(seconds=1,gt='sample'):
    """Record audio from the browser microphone in Google Colab and save it as WAV.

    Args:
        seconds: Recording duration in seconds.
        gt: Text printed as the sentence the user should read aloud.

    Returns:
        The name of the WAV file that was written ("_audio.wav").
    """
    # Everything this function needs is imported here, so it does not depend
    # on names (such as `ipd`) that are only imported by later cells.
    from base64 import b64decode
    from io import BytesIO

    from google.colab import output as colab_output
    from IPython.display import Javascript, display
    from pydub import AudioSegment

    # Browser-side recorder: captures microphone audio for `time` milliseconds
    # and resolves with the recording encoded as a data URL.
    RECORD = (
        "const sleep  = time => new Promise(resolve => setTimeout(resolve, time))\n"
        "const b2text = blob => new Promise(resolve => {\n"
        "  const reader = new FileReader()\n"
        "  reader.onloadend = e => resolve(e.srcElement.result)\n"
        "  reader.readAsDataURL(blob)\n"
        "})\n"
        "var record = time => new Promise(async resolve => {\n"
        "  stream = await navigator.mediaDevices.getUserMedia({ audio: true })\n"
        "  recorder = new MediaRecorder(stream)\n"
        "  chunks = []\n"
        "  recorder.ondataavailable = e => chunks.push(e.data)\n"
        "  recorder.start()\n"
        "  await sleep(time)\n"
        "  recorder.onstop = async ()=>{\n"
        "    blob = new Blob(chunks)\n"
        "    text = await b2text(blob)\n"
        "    resolve(text)\n"
        "  }\n"
        "  recorder.stop()\n"
        "})"
    )

    print(f"Recording started for {seconds} seconds.")
    print(f"Read this -- \"{gt}\"")
    display(Javascript(RECORD))
    # The JavaScript helper expects the duration in milliseconds.
    s = colab_output.eval_js("record(%d)" % (seconds * 1000))
    print("Recording ended.")
    # `s` is a data URL; the base64 payload follows the first comma.
    b = b64decode(s.split(",")[1])

    fileformat = "wav"
    filename = f"_audio.{fileformat}"
    # pydub detects the recorded container format and re-encodes it as WAV.
    AudioSegment.from_file(BytesIO(b)).export(filename, format=fileformat)
    return filename

Downloading: "https://github.com/snakers4/silero-vad/archive/master.zip" to /root/.cache/torch/hub/master.zip
Using cache found in /root/.cache/torch/hub/snakers4_silero-models_master


In [42]:
import sys
import IPython.display as ipd

# Record one utterance (Colab only) and transcribe each detected speech segment.
prediction = []
ground_truth = 'this is a sample test for speech to text models'
waveform = None  # stays None when no recording could be made

# Microphone capture goes through google.colab, so only attempt it there.
if "google.colab" in sys.modules:
    waveform = read_audio(record(5, ground_truth), sampling_rate=SAMPLING_RATE)
    speech_timestamps = get_speech_timestamps(waveform, model, sampling_rate=SAMPLING_RATE)
    for segment in speech_timestamps:
        # Named `model_input` so the builtin `input` is not shadowed.
        speech = waveform[segment['start']: segment['end']].unsqueeze(0)
        model_input = prepare_model_input(speech, device=device)
        output = model_stt(model_input).squeeze()
        prediction.append(decoder(output.cpu()))
else:
    print("Not running in Google Colab: skipping microphone recording.")

print(prediction)
# Only offer playback when something was recorded; previously this line
# raised NameError outside Colab because `waveform` was never bound.
if waveform is not None:
    ipd.display(Audio(waveform.numpy(), rate=SAMPLING_RATE))

Recording started for 5 seconds.
Read this -- "this is a sample test for speech to text models"


<IPython.core.display.Javascript object>

Recording ended.
['this is a sample text for speed to text models']


# Pronunciation checker

In [43]:
#@title Helper functions
import difflib
import re
from ipymarkup import show_span_box_markup

def tokenize(s):
    """Split ``s`` into whitespace-separated tokens.

    Leading and trailing whitespace is ignored, so no empty tokens are
    produced; an empty or all-whitespace string yields an empty list.
    """
    # str.split() with no argument splits on runs of whitespace, which avoids
    # the invalid '\s' escape of the previous non-raw regex literal.
    return s.split()
def untokenize(ts):
    """Join the tokens in ``ts`` into one string separated by single spaces."""
    separator = ' '
    return separator.join(ts)
        
def equalize(s1, s2):
    """Align two sentences word by word, padding the gaps with underscores.

    Both strings are tokenized and compared with difflib.SequenceMatcher.
    A word present in only one sentence is kept on its own side and mirrored
    on the other side by a run of underscores of the same length.

    Returns:
        A tuple ``(aligned_s1, aligned_s2)`` of space-joined token strings.
    """
    words_a = tokenize(s1)
    words_b = tokenize(s2)
    aligned_a, aligned_b = [], []

    # End positions (exclusive) of the previous matching block in each sequence.
    end_a = end_b = 0
    matcher = difflib.SequenceMatcher(a=words_a, b=words_b)
    for block in matcher.get_matching_blocks():
        only_in_a = words_a[end_a:block.a]
        only_in_b = words_b[end_b:block.b]

        # Words unique to the first sentence: filler goes on the second side.
        aligned_b.extend('_' * len(word) for word in only_in_a)
        aligned_a.extend(only_in_a)

        # Words unique to the second sentence: filler goes on the first side.
        aligned_a.extend('_' * len(word) for word in only_in_b)
        aligned_b.extend(only_in_b)

        # The matching run itself is copied to both sides.
        aligned_a.extend(words_a[block.a:block.a + block.size])
        aligned_b.extend(words_b[block.b:block.b + block.size])

        end_a = block.a + block.size
        end_b = block.b + block.size
    return untokenize(aligned_a), untokenize(aligned_b)

def find_spans(prediction, ground_truth):
  """Find the ground-truth words that the prediction did not reproduce.

  The two sentences are aligned with ``equalize``; every ground-truth word
  whose aligned counterpart in the prediction differs is reported.

  Args:
    prediction: Transcribed sentence.
    ground_truth: Reference sentence the user was asked to read.

  Returns:
    A tuple ``(wrong_list, spans)``: the missed ground-truth words in reading
    order, and for each one a ``[start, end, '❌']`` character span locating
    that exact occurrence inside ``ground_truth``.
  """
  aligned_pred, aligned_truth = equalize(prediction, ground_truth)
  pred_tokens = aligned_pred.split()
  truth_tokens = aligned_truth.split()

  # Character offsets of every ground-truth word, in reading order. Spans are
  # taken from these positions instead of re.search(word, ground_truth), which
  # treated the word as a regex and matched substrings (e.g. "is" in "this")
  # or an earlier occurrence of a repeated word.
  truth_words = list(re.finditer(r'\S+', ground_truth))
  next_word = 0

  wrong_list = []
  spans = []
  for pred_token, truth_token in zip(pred_tokens, truth_tokens):
    is_real_word = (next_word < len(truth_words)
                    and truth_token == truth_words[next_word].group())
    if not is_real_word:
      # Underscore filler standing in for an extra word in the prediction.
      continue
    word = truth_words[next_word]
    next_word += 1
    if pred_token != truth_token:
      wrong_list.append(word.group())
      spans.append([word.start(), word.end(), '❌'])

  return wrong_list, spans

In [44]:
# Use the transcription of every detected speech segment, not just the first;
# joining also yields '' instead of an IndexError when nothing was transcribed.
pred = ' '.join(prediction)
wrongs,final_spans=find_spans(pred, ground_truth)
show_span_box_markup(ground_truth, final_spans)

# Pronunciation helper

In [45]:
#@title Model - Text to Speech
import torch

# Synthesis settings read by the cells below.
device = torch.device('cpu')
sample_rate = 16000
speaker = 'lj_v2'
language = 'en'

# Load the text-to-speech model (and its bundled example text) from torch.hub.
model_tts, example_text = torch.hub.load(
    repo_or_dir='snakers4/silero-models',
    model='silero_tts',
    language=language,
    speaker=speaker,
)
model_tts.to(device)  # gpu or cpu



Using cache found in /root/.cache/torch/hub/snakers4_silero-models_master


In [46]:
from IPython.display import Audio, display

# Synthesize each missed word so the user can hear how it should sound.
if wrongs:
  audio = model_tts.apply_tts(texts=wrongs,sample_rate=sample_rate)
  for clip in audio:
    # Play back at the rate the audio was synthesized at (`sample_rate`), so
    # changing it cannot silently alter the playback speed.
    display(Audio(clip.numpy(), rate=sample_rate))
else:
  # Nothing to pronounce; skip synthesis rather than call the model with no text.
  audio = []
  print("No mispronounced words detected.")