In [1]:
!pip install wget
!apt-get install sox libsndfile1 ffmpeg
!pip install unidecode
!pip install matplotlib>=3.3.2
!pip install ffmpeg-python
## Install NeMo
BRANCH = 'r1.1.0'
!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]

## Grab the config we'll use in this example
!mkdir configs
!wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/asr/conf/config.yaml

Collecting wget
  Downloading wget-3.2.zip (10 kB)
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9673 sha256=721b7d71a3a44cc669954e0be05140fda21345d90179650066ef8865c0e9c7b0
  Stored in directory: /root/.cache/pip/wheels/a1/b6/7c/0e63e34eb06634181c63adacca38b79ff8f35c37e3c13e3c02
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2
Reading package lists... Done
Building dependency tree       
Reading state information... Done
libsndfile1 is already the newest version (1.0.28-4ubuntu0.18.04.1).
ffmpeg is already the newest version (7:3.4.8-0ubuntu0.2).
The following additional packages will be installed:
  libmagic-mgc libmagic1 libopencore-amrnb0 libopencore-amrwb0 libsox-fmt-alsa
  libsox-fmt-base libsox3
Suggested packages:
  file libsox-fmt-all
The following NEW packages will be installed:
  libmagic-mgc libmagic1 libopenco

In [1]:
import librosa
import IPython
import pandas as pd
import numpy as np
import os
import glob

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!unzip small.zip

In [7]:
if not os.path.exists("scripts/get_commonvoice_data.py"):
  !wget -P scripts/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/dataset_processing/get_commonvoice_data.py

--2021-08-04 09:17:34--  https://raw.githubusercontent.com/NVIDIA/NeMo/r1.1.0/scripts/dataset_processing/get_commonvoice_data.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6778 (6.6K) [text/plain]
Saving to: ‘scripts/get_commonvoice_data.py’


2021-08-04 09:17:34 (93.1 MB/s) - ‘scripts/get_commonvoice_data.py’ saved [6778/6778]



In [70]:
VERSION = "cv-corpus-6.1-2020-12-11"
LANGUAGE = "en"

In [None]:
import argparse
import csv
import json
import logging
import multiprocessing
import os
import subprocess
import sys
import tarfile
from multiprocessing.pool import ThreadPool
from pathlib import Path
from typing import List
import yaml

import sox
from sox import Transformer
from tqdm import tqdm

parser = argparse.ArgumentParser(description='Downloads and processes Mozilla Common Voice dataset.')
parser.add_argument("--data_root", default='small/', type=str, help="Directory to store the dataset.")
parser.add_argument('--manifest_dir', default='./', type=str,help='Output directory for manifests')
parser.add_argument("--num_workers", default=multiprocessing.cpu_count(), type=int, help="Workers to process dataset.")
parser.add_argument('--sample_rate', default=16000, type=int, help='Sample rate')
parser.add_argument('--n_channels', default=1, type=int, help='Number of channels for output wav files')
parser.add_argument(
    '--files_to_process',
    nargs='+',
    default=['test.tsv', 'dev.tsv', 'train.tsv'],
    type=str,
    help='list of *.csv file names to process',
    required=False
)
parser.add_argument(
    '--version',
    default='cv-corpus-5.1-2020-06-22',
    type=str,
    required=False,
    help='Version of the dataset (obtainable via https://commonvoice.mozilla.org/en/datasets',
)
parser.add_argument(
    '--language',
    default='en',
    type=str,
    required=False,
    help='Which language to download.(default english,'
    'check https://commonvoice.mozilla.org/en/datasets for more language codes',
)
sys.argv = ['-f']
args = parser.parse_args(['--version',VERSION,'--files_to_process','validated.tsv'])

COMMON_VOICE_URL = (
    f"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/"
    "{}/{}.tar.gz".format(args.version, args.language)
)



def create_manifest(data: List[tuple], output_name: str, manifest_path: str):
    output_file = Path(manifest_path) / output_name
    output_file.parent.mkdir(exist_ok=True, parents=True)

    with output_file.open(mode='w') as f:
        for wav_path, duration, text in tqdm(data, total=len(data)):
            if wav_path:
              f.write(
                  json.dumps({'audio_filepath': os.path.abspath(wav_path), "duration": duration, 'text': text}) + '\n'
              )


def process_files(csv_file, data_root, num_workers):
    """ Read *.csv file description, convert mp3 to wav, process text.
        Save results to data_root.
    Args:
        csv_file: str, path to *.csv file with data description, usually start from 'cv-'
        data_root: str, path to dir to save results; wav/ dir will be created
    """
    wav_dir = os.path.join(data_root, 'wav/')
    os.makedirs(wav_dir, exist_ok=True)
    audio_clips_path = os.path.dirname(csv_file) + '/clips/'

    def process(x):
        file_path, text = x
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        text = text.lower().strip()
        audio_path = os.path.join(audio_clips_path, file_path)
        output_wav_path = os.path.join(wav_dir, file_name + '.wav')
        tfm = Transformer()
        tfm.rate(samplerate=args.sample_rate)
        tfm.channels(n_channels=args.n_channels)
        # tfm.build(input_filepath=audio_path, output_filepath=output_wav_path)
        os.system(
        f'ffmpeg -i {audio_path} -acodec pcm_s16le -ac 1 -af aresample=resampler=soxr -ar {args.sample_rate} {output_wav_path} -y'
    )
        duration = sox.file_info.duration(output_wav_path)
        return output_wav_path, duration, text


    logging.info('Converting mp3 to wav for {}.'.format(csv_file))
    with open(csv_file) as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t')
        next(reader, None)  # skip the headers
        curated = os.listdir(f'{audio_clips_path}')
        data = [(row['path'], row['sentence']) for row in reader if row['path'] in curated] 
        with ThreadPool(num_workers) as pool:
            data = list(tqdm(pool.imap(process, data), total=len(data)))
    return data


def main():
    data_root = args.data_root
    os.makedirs(data_root, exist_ok=True)

    target_unpacked_dir = os.path.join(data_root, "CV_unpacked")

    if os.path.exists(target_unpacked_dir):
        logging.info('Find existing folder {}'.format(target_unpacked_dir))
    else:
        logging.info("Could not find Common Voice, Downloading corpus...")

        commands = [
            'wget',
            '--user-agent',
            '"Mozilla/5.0 (Windows NT 10.0; WOW64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"',
            '-P',
            data_root,
            f'{COMMON_VOICE_URL}',
        ]
        commands = " ".join(commands)
        subprocess.run(commands, shell=True, stderr=sys.stderr, stdout=sys.stdout, capture_output=False)
        filename = f"{args.language}.tar.gz"
        target_file = os.path.join(data_root, os.path.basename(filename))

        os.makedirs(target_unpacked_dir, exist_ok=True)
        logging.info("Unpacking corpus to {} ...".format(target_unpacked_dir))
        tar = tarfile.open(target_file)
        tar.extractall(target_unpacked_dir)
        tar.close()

    folder_path = os.path.join(target_unpacked_dir, args.version + f'/{args.language}/')

    for csv_file in args.files_to_process:
        data = process_files(
            csv_file=os.path.join(folder_path, csv_file),
            data_root=os.path.join(data_root, os.path.splitext(csv_file)[0]),
            num_workers=args.num_workers,
        )
        logging.info('Creating manifests...')
        create_manifest(
            data=data,
            output_name=f'commonvoice_{os.path.splitext(csv_file)[0]}_manifest.json',
            manifest_path=args.manifest_dir,
        )


if __name__ == "__main__":
    main()


In [None]:
from subprocess import call
r=call('ffmpeg -i "/content/small/CV_unpacked/cv-corpus-6.1-2020-12-11/en/clips/common_voice_en_1.mp3" -acodec pcm_u8 -ar 22050 "test.wav"',shell=True)

In [None]:
y, sr = librosa.load("test.wav")

In [None]:
import IPython.display as ipd
ipd.Audio('test.wav')

In [28]:
import json
import ast
import nemo
import nemo.collections.asr as nemo_asr

In [74]:
validated = pd.read_csv('/content/small/CV_unpacked/cv-corpus-6.1-2020-12-11/en/validated.tsv',sep='\t')
curated_wav = os.listdir('/content/small/validated/wav')

In [None]:
nemo_asr.models.EncDecCTCModelBPE.list_available_models()

In [None]:
quartznet = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name="QuartzNet15x5Base-En")

In [75]:
asr_model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(model_name="stt_en_conformer_ctc_small_ls")

[NeMo I 2021-08-04 13:19:05 cloud:56] Found existing object /root/.cache/torch/NeMo/NeMo_1.1.0/stt_en_conformer_ctc_small_ls/cf1b6bbcc08433257c12442c92b9996a/stt_en_conformer_ctc_small_ls.nemo.
[NeMo I 2021-08-04 13:19:05 cloud:62] Re-using file from: /root/.cache/torch/NeMo/NeMo_1.1.0/stt_en_conformer_ctc_small_ls/cf1b6bbcc08433257c12442c92b9996a/stt_en_conformer_ctc_small_ls.nemo
[NeMo I 2021-08-04 13:19:05 common:676] Instantiating model from pre-trained checkpoint
[NeMo I 2021-08-04 13:19:06 mixins:149] Tokenizer SentencePieceTokenizer initialized with 128 tokens


[NeMo W 2021-08-04 13:19:06 modelPT:139] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /data/Librispeech_SP_Tarred/tarred_audio_manifest.json
    sample_rate: 16000
    batch_size: 32
    shuffle: true
    num_workers: 8
    pin_memory: true
    use_start_end_token: false
    trim_silence: false
    max_duration: 16.7
    min_duration: 0.1
    shuffle_n: 2048
    is_tarred: true
    tarred_audio_filepaths: /data/Librispeech_SP_Tarred/audio__OP_0..511_CL_.tar
    
[NeMo W 2021-08-04 13:19:06 modelPT:146] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath:
    - /manifests/librispeech/librivox-dev-other.json
    - /manife

[NeMo I 2021-08-04 13:19:06 features:252] PADDING: 0
[NeMo I 2021-08-04 13:19:06 features:269] STFT using torch
[NeMo I 2021-08-04 13:19:07 modelPT:439] Model EncDecCTCModelBPE was successfully restored from /root/.cache/torch/NeMo/NeMo_1.1.0/stt_en_conformer_ctc_small_ls/cf1b6bbcc08433257c12442c92b9996a/stt_en_conformer_ctc_small_ls.nemo.


In [67]:
files = [f'/content/small/validated/wav/{a}' for a in curated_wav]
for fname, transcription in zip(files, quartznet.transcribe(paths2audio_files=files)):
  print(f"{transcription}, real: {validated.set_index('path').loc[fname.split('/')[-1].replace('wav','mp3')]['sentence']}")

HBox(children=(FloatProgress(value=0.0, description='Transcribing', max=8.0, style=ProgressStyle(description_w…


three armed tribesmen approached and asked what the boy and the alchemist were doing there, real: Three armed tribesmen approached, and asked what the boy and the alchemist were doing there.
he thought of all the married shepherds he had known, real: He thought of all the married shepherds he had known.
he lives at the al fayoum oasis his friend had said, real: "He lives at the Al-Fayoum oasis," his friend had said.
it didn't bring with it the smell of the desert nor the threat of moorish invasion, real: It didn't bring with it the smell of the desert, nor the threat of Moorish invasion.
sadly my dream of becoming a squirrel whisperer may never happen, real: Sadly, my dream of becoming a squirrel whisperer may never happen.
but the merchant understood what the boy had said, real: But the merchant understood what the boy had said.
some of those who saw its flight say it travelled with a hissing sound, real: Some of those who saw its flight say it travelled with a hissing sound.
first t

In [77]:
files = [f'/content/small/validated/wav/{a}' for a in curated_wav]
for fname, transcription in zip(files, asr_model.transcribe(paths2audio_files=files)):
  print(f"{transcription}, real: {validated.set_index('path').loc[fname.split('/')[-1].replace('wav','mp3')]['sentence']}")

HBox(children=(FloatProgress(value=0.0, description='Transcribing', max=8.0, style=ProgressStyle(description_w…


three armed tribes been approached and asked what the boy and the alchemists were doing there, real: Three armed tribesmen approached, and asked what the boy and the alchemist were doing there.
he thought of all the merry shepherds he had known, real: He thought of all the married shepherds he had known.
he lives at the alpoyum goyces his friend had said, real: "He lives at the Al-Fayoum oasis," his friend had said.
it didn't bring with it the smell of the desert nor the threat of maorsh invasion, real: It didn't bring with it the smell of the desert, nor the threat of Moorish invasion.
sadly my dream of becoming a squirrel whisperer may never happen, real: Sadly, my dream of becoming a squirrel whisperer may never happen.
but the merchant understood what the boy had said, real: But the merchant understood what the boy had said.
some of those who saw its flight said it travelled with a hissing sound, real: Some of those who saw its flight say it travelled with a hissing sound.
first t

In [71]:
from nemo.collections.asr.metrics.wer import word_error_rate

In [79]:
hypothesis = asr_model.transcribe(paths2audio_files=files)
actual =  [validated.set_index('path').loc[a.split('/')[-1].replace('wav','mp3')]['sentence'] for a in curated_wav]

HBox(children=(FloatProgress(value=0.0, description='Transcribing', max=8.0, style=ProgressStyle(description_w…




In [None]:
# clean up reference transcripts
ref = [a.replace(','," ").replace('"',' ').replace('.','') for a in actual]

In [86]:
wer = word_error_rate(hypotheses=hypothesis, references=ref)
print(wer)

0.16887417218543047


# record audio from colab

In [23]:
"""
To write this piece of code I took inspiration/code from a lot of places.
It was late night, so I'm not sure how much I created or just copied o.O
Here are some of the possible references:
https://blog.addpipe.com/recording-audio-in-the-browser-using-pure-html5-and-minimal-javascript/
https://stackoverflow.com/a/18650249
https://hacks.mozilla.org/2014/06/easy-audio-capture-with-the-mediarecorder-api/
https://air.ghost.io/recording-to-an-audio-file-using-html5-and-js/
https://stackoverflow.com/a/49019356
"""
from IPython.display import HTML, Audio
from google.colab.output import eval_js
from base64 import b64decode
import numpy as np
from scipy.io.wavfile import read as wav_read
import io
import ffmpeg

AUDIO_HTML = """
<script>
var my_div = document.createElement("DIV");
var my_p = document.createElement("P");
var my_btn = document.createElement("BUTTON");
var t = document.createTextNode("Press to start recording");

my_btn.appendChild(t);
//my_p.appendChild(my_btn);
my_div.appendChild(my_btn);
document.body.appendChild(my_div);

var base64data = 0;
var reader;
var recorder, gumStream;
var recordButton = my_btn;

var handleSuccess = function(stream) {
  gumStream = stream;
  var options = {
    //bitsPerSecond: 8000, //chrome seems to ignore, always 48k
    mimeType : 'audio/webm;codecs=opus'
    //mimeType : 'audio/webm;codecs=pcm'
  };            
  //recorder = new MediaRecorder(stream, options);
  recorder = new MediaRecorder(stream);
  recorder.ondataavailable = function(e) {            
    var url = URL.createObjectURL(e.data);
    var preview = document.createElement('audio');
    preview.controls = true;
    preview.src = url;
    document.body.appendChild(preview);

    reader = new FileReader();
    reader.readAsDataURL(e.data); 
    reader.onloadend = function() {
      base64data = reader.result;
      //console.log("Inside FileReader:" + base64data);
    }
  };
  recorder.start();
  };

recordButton.innerText = "Recording... press to stop";

navigator.mediaDevices.getUserMedia({audio: true}).then(handleSuccess);


function toggleRecording() {
  if (recorder && recorder.state == "recording") {
      recorder.stop();
      gumStream.getAudioTracks()[0].stop();
      recordButton.innerText = "Saving the recording... pls wait!"
  }
}

// https://stackoverflow.com/a/951057
function sleep(ms) {
  return new Promise(resolve => setTimeout(resolve, ms));
}

var data = new Promise(resolve=>{
//recordButton.addEventListener("click", toggleRecording);
recordButton.onclick = ()=>{
toggleRecording()

sleep(2000).then(() => {
  // wait 2000ms for the data to be available...
  // ideally this should use something like await...
  //console.log("Inside data:" + base64data)
  resolve(base64data.toString())

});

}
});
      
</script>
"""

def get_audio():
  display(HTML(AUDIO_HTML))
  data = eval_js("data")
  binary = b64decode(data.split(',')[1])
  
  process = (ffmpeg
    .input('pipe:0')
    .output('pipe:1', format='wav')
    .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True, quiet=True, overwrite_output=True)
  )
  output, err = process.communicate(input=binary)
  
  riff_chunk_size = len(output) - 8
  # Break up the chunk size into four bytes, held in b.
  q = riff_chunk_size
  b = []
  for i in range(4):
      q, r = divmod(q, 256)
      b.append(r)

  # Replace bytes 4:8 in proc.stdout with the actual size of the RIFF chunk.
  riff = output[:4] + bytes(b) + output[8:]

  sr, audio = wav_read(io.BytesIO(riff))

  return audio, sr

In [24]:
audio, sr = get_audio()
