In [None]:
"""
You can run this notebook either locally (if you have all the dependencies and a GPU) or on Google Colab.

Instructions for setting up Colab are as follows:
1. Open a new Python 3 notebook.
2. Import this notebook from GitHub (File -> Upload Notebook -> "GITHUB" tab -> copy/paste GitHub URL)
3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select "GPU" for hardware accelerator)
4. Run this cell to set up dependencies.
5. Restart the runtime (Runtime -> Restart Runtime) for any upgraded packages to take effect
"""
# If you're using Google Colab and not running locally, run this cell.

## Install dependencies
!pip install wget
!apt-get install sox libsndfile1 ffmpeg
!pip install unidecode
!pip install matplotlib>=3.3.2
!apt-get install libsox-fmt-all libsox-dev sox > /dev/null
!pip install torchaudio
!python -m pip install git+https://github.com/facebookresearch/WavAugment.git > /dev/null
!pip install wandb

## Install NeMo
BRANCH = 'main'
!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]

# install beam search decoder
!apt-get install -y swig
!git clone https://github.com/NVIDIA/NeMo -b "$BRANCH"
!cd NeMo && bash scripts/asr_language_modeling/ngram_lm/install_beamsearch_decoders.sh


"""
Remember to restart the runtime for the kernel to pick up any upgraded packages (e.g. matplotlib)!
Alternatively, you can uncomment the exit() below to crash and restart the kernel, in the case
that you want to use the "Run All Cells" (or similar) option.
"""
# exit()
from IPython.display import clear_output
clear_output()

In [None]:
import os
import re
import wget
import gzip
import shutil

import nemo
import nemo.collections.asr as nemo_asr
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import augment
import torchaudio
import torchaudio.datasets
from pathlib import Path
from google.colab import files

from datetime import datetime as dt
from tqdm import tqdm
import matplotlib.pyplot as plt

from asr.metrics import ASRMetricFunction, CTCLossFunction
from asr.visualisation import play_audio, print_err_html, print_stats, plot_waveform
from asr.general import set_parameter_requires_grad, load_checkpoint, save_checkpoint, tensors_to_device, tensor_to_string
from asr.models import BeamSearchDecoderWithLM
from asr.datasets import LibriSpeechBookDataset
from IPython.display import YouTubeVideo, clear_output
clear_output()

# Report the torch version and either the properties of CUDA device 0 or the string 'CPU'.
print('Setup complete. Using torch %s %s' % (torch.__version__, torch.cuda.get_device_properties(0) if torch.cuda.is_available() else 'CPU'))

# Base directory under which the "data" folder with the CSV files lives.
dataroot = os.path.join(".")

# Device the model and input tensors are moved to.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Name of the pretrained NeMo model to load.
MODEL_NAME='stt_en_jasper10x5dr'
# Local path of the decompressed 3-gram language model.
LM_3GRAM_PATH = '3-gram.arpa'
# Root directory passed to the dataset constructors.
ROOT = os.path.join(".")

# All CSV files below are written into <dataroot>/data. Create it up front:
# DataFrame.to_csv does not create missing parent directories.
os.makedirs(os.path.join(dataroot, "data"), exist_ok=True)

# The csv files containing the data
train_clean_100_path = os.path.join(dataroot, "data", "train-clean-100.csv")
train_clean_360_path = os.path.join(dataroot, "data", "train-clean-360.csv")
dev_clean_path = os.path.join(dataroot, "data", "dev-clean.csv")
test_clean_path = os.path.join(dataroot, "data", "test-clean.csv")
dev_other_path = os.path.join(dataroot, "data", "dev-other.csv")
test_other_path = os.path.join(dataroot, "data", "test-other.csv")

Setup complete. Using torch 1.9.0+cu102 CPU


In [None]:
def download_lm(lm_path):
    """Download a gzipped language model from OpenSLR resource 11 and unpack it.

    The archive ``https://www.openslr.org/resources/11/<file name of lm_path>.gz``
    is saved as ``<lm_path>.gz`` and its decompressed content is written to
    ``lm_path``. The archive is kept on disk after unpacking.

    Args:
        lm_path: Destination path of the decompressed model. Its file name
            selects the remote archive.

    Raises:
        Errors from the download or the decompression propagate to the caller.
        In that case ``lm_path`` is not created, so a caller that checks
        ``os.path.exists(lm_path)`` will retry on the next run.
    """
    import glob  # local import: only needed for the cleanup below

    url = "https://www.openslr.org/resources/11/{}.gz".format(os.path.basename(lm_path))
    archive_path = "{}.gz".format(lm_path)
    partial_path = "{}.part".format(lm_path)

    # Remove leftovers of earlier attempts (everything starting with lm_path);
    # wget.download would otherwise save under a different, de-duplicated name.
    for stale in glob.glob(glob.escape(lm_path) + "*"):
        if os.path.isfile(stale):
            os.remove(stale)

    wget.download(url, out=archive_path)

    # Decompress into a temporary file and rename only on success, so a failed
    # run never leaves an empty or truncated file at lm_path.
    with gzip.open(archive_path, "rb") as compressed, open(partial_path, "wb") as target:
        shutil.copyfileobj(compressed, target)
    os.replace(partial_path, lm_path)

# Load the pretrained CTC model named by MODEL_NAME and move it to DEVICE.
model = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name=MODEL_NAME, strict=False).to(DEVICE)

# Upper-cased copy of the model's decoder vocabulary.
VOCABULARY = list(map(lambda x: x.upper(), model.decoder.vocabulary))
# Vocabulary extended by a trailing '<pad>' token; BLANK is that token's index.
vocab = VOCABULARY + ['<pad>']
BLANK = len(vocab) - 1 

# Token -> index and index -> token lookups over the extended vocabulary.
DICTIONARY = dict(zip(vocab, range(len(vocab))))
LABELS = {v:k for k, v in DICTIONARY.items()}

# Fetch the language model only when it is not already on disk.
if not os.path.exists(LM_3GRAM_PATH):
    download_lm(LM_3GRAM_PATH)
beam_search_lm = BeamSearchDecoderWithLM(
    vocab=VOCABULARY,
    beam_width=16,
    alpha=1.5, beta=1.5,
    lm_path=LM_3GRAM_PATH,
    # os.cpu_count() may return None when the count cannot be determined;
    # max(None, 1) would raise TypeError, so fall back to a single CPU.
    num_cpus=os.cpu_count() or 1)
clear_output()

In [None]:
# Keyword arguments shared by the three "clean" LibriSpeech splits loaded in this cell.
_clean_split_options = dict(root=ROOT, folder_in_archive="LibriSpeech", download=True)

dev_dataset = LibriSpeechBookDataset(url="dev-clean", **_clean_split_options)
test_dataset = LibriSpeechBookDataset(url="test-clean", **_clean_split_options)
train_dataset = LibriSpeechBookDataset(url="train-clean-100", **_clean_split_options)

In [None]:
# Keyword arguments shared by the additional LibriSpeech splits loaded in this cell.
_extra_split_options = dict(root=ROOT, folder_in_archive="LibriSpeech", download=True)

dev_dataset_other = LibriSpeechBookDataset(url="dev-other", **_extra_split_options)
test_dataset_other = LibriSpeechBookDataset(url="test-other", **_extra_split_options)
train_dataset_2 = LibriSpeechBookDataset(url="train-clean-360", **_extra_split_options)

  0%|          | 0.00/300M [00:00<?, ?B/s]

  0%|          | 0.00/314M [00:00<?, ?B/s]

  0%|          | 0.00/21.5G [00:00<?, ?B/s]

In [None]:
def get_best_transcriptions(transcriptions):
    """Return, for every item, the second field of its first candidate.

    Args:
        transcriptions: Iterable whose items are indexable candidate lists;
            each candidate is itself indexable (presumably ``(score, text)``
            pairs ordered best-first - confirm against the decoder).

    Returns:
        A list with ``candidates[0][1]`` for each item, in input order.
    """
    return [candidates[0][1] for candidates in transcriptions]

In [None]:
def strip_nonas(text):
    """Return ``text`` with every carriage-return and line-feed character removed."""
    line_breaks = re.compile('[\r\n]')
    return line_breaks.sub('', text)

In [None]:
# Collect transcription, book title and duration of every train-clean-100 item
# and store them as CSV at train_clean_100_path.
loop = tqdm(train_dataset, position=0, leave=True)
# Rows are gathered in a list and turned into a DataFrame once: growing a frame
# with DataFrame.append per row is quadratic and was removed in pandas 2.0.
records = []

for batch_idx, (waveform, transcription, booktitle, duration) in enumerate(loop):
    records.append({"REAL TEXT": transcription, "BOOK TITLE": strip_nonas(booktitle), "DURATION": duration})

loop.close()
df = pd.DataFrame(records, columns=["REAL TEXT", "BOOK TITLE", "DURATION"])
df.to_csv(train_clean_100_path)

# Summary statistics: number of space-separated words per transcription, then duration.
print(df["REAL TEXT"].apply(lambda t: len(t.split(" "))).describe())
print(df["DURATION"].describe())

100%|██████████| 305/305 [03:09<00:00,  1.61it/s]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

count      305.000000
mean      3226.773770
std       2456.083601
min         78.000000
25%       1445.000000
50%       2792.000000
75%       4135.000000
max      19479.000000
Name: REAL TEXT, dtype: float64
count    305.000000
mean      19.675639
std       15.336142
min        0.540000
25%        9.170000
50%       17.020000
75%       25.110000
max      127.620000
Name: DURATION, dtype: float64


In [None]:
# Collect transcription, book title and duration of every train-clean-360 item
# and store them as CSV at train_clean_360_path.
loop = tqdm(train_dataset_2, position=0, leave=True)
# Rows are gathered in a list and turned into a DataFrame once: growing a frame
# with DataFrame.append per row is quadratic and was removed in pandas 2.0.
records = []

for batch_idx, (waveform, transcription, booktitle, duration) in enumerate(loop):
    records.append({"REAL TEXT": transcription, "BOOK TITLE": strip_nonas(booktitle), "DURATION": duration})

loop.close()
df = pd.DataFrame(records, columns=["REAL TEXT", "BOOK TITLE", "DURATION"])
df.to_csv(train_clean_360_path)

# Summary statistics: number of space-separated words per transcription, then duration.
print(df["REAL TEXT"].apply(lambda t: len(t.split(" "))).describe())
print(df["DURATION"].describe())

100%|██████████| 865/865 [12:45<00:00,  1.13it/s]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

count      865.000000
mean      4148.835838
std       3398.631068
min        134.000000
25%       1777.000000
50%       3456.000000
75%       4999.000000
max      42084.000000
Name: REAL TEXT, dtype: float64
count    865.000000
mean      25.177827
std       20.582275
min        1.060000
25%       10.840000
50%       21.960000
75%       30.160000
max      254.310000
Name: DURATION, dtype: float64


In [None]:
# Collect transcription, book title and duration of every dev-other item
# and store them as CSV at dev_other_path.
loop = tqdm(dev_dataset_other, position=0, leave=True)
# Rows are gathered in a list and turned into a DataFrame once: growing a frame
# with DataFrame.append per row is quadratic and was removed in pandas 2.0.
records = []

for batch_idx, (waveform, transcription, booktitle, duration) in enumerate(loop):
    records.append({"REAL TEXT": transcription, "BOOK TITLE": strip_nonas(booktitle), "DURATION": duration})

loop.close()
df = pd.DataFrame(records, columns=["REAL TEXT", "BOOK TITLE", "DURATION"])
df.to_csv(dev_other_path)

# Summary statistics: number of space-separated words per transcription, then duration.
print(df["REAL TEXT"].apply(lambda t: len(t.split(" "))).describe())
print(df["DURATION"].describe())

100%|██████████| 62/62 [00:12<00:00,  4.93it/s]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

count      62.000000
mean      821.741935
std       552.934969
min       104.000000
25%       368.500000
50%       697.500000
75%      1090.250000
max      3219.000000
Name: REAL TEXT, dtype: float64
count    62.000000
mean      4.956129
std       3.395335
min       0.700000
25%       2.645000
50%       3.875000
75%       6.225000
max      21.060000
Name: DURATION, dtype: float64


In [None]:
# Collect transcription, book title and duration of every test-other item
# and store them as CSV at test_other_path.
loop = tqdm(test_dataset_other, position=0, leave=True)
# Rows are gathered in a list and turned into a DataFrame once: growing a frame
# with DataFrame.append per row is quadratic and was removed in pandas 2.0.
records = []

for batch_idx, (waveform, transcription, booktitle, duration) in enumerate(loop):
    records.append({"REAL TEXT": transcription, "BOOK TITLE": strip_nonas(booktitle), "DURATION": duration})

loop.close()
df = pd.DataFrame(records, columns=["REAL TEXT", "BOOK TITLE", "DURATION"])
df.to_csv(test_other_path)

# Summary statistics: number of space-separated words per transcription, then duration.
print(df["REAL TEXT"].apply(lambda t: len(t.split(" "))).describe())
print(df["DURATION"].describe())

100%|██████████| 48/48 [00:12<00:00,  3.78it/s]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

count      48.000000
mean     1090.479167
std       685.934182
min        76.000000
25%       557.750000
50%      1143.500000
75%      1561.000000
max      3723.000000
Name: REAL TEXT, dtype: float64
count    48.000000
mean      6.676875
std       4.057966
min       0.670000
25%       3.305000
50%       6.615000
75%      10.032500
max      23.300000
Name: DURATION, dtype: float64


In [None]:
# Transcribe every dev-clean item with the model + LM beam search and store the
# hypothesis next to the reference text, book title and duration at dev_clean_path.
model.eval()
loop = tqdm(dev_dataset, position=0, leave=True)
# Rows are gathered in a list and turned into a DataFrame once: growing a frame
# with DataFrame.append per row is quadratic and was removed in pandas 2.0.
records = []

for batch_idx, (waveform, transcription, booktitle, duration) in enumerate(loop):
    # Take the first entry along dim 0 and re-add a leading dimension of size 1
    # (presumably: first channel, batch of one - TODO confirm against the dataset).
    waveform = waveform[0].to(DEVICE).unsqueeze(0)
    # Length passed to the model: size of the waveform's last dimension.
    valid_lengths = torch.tensor([waveform.shape[-1]], device=DEVICE)

    with torch.no_grad():
        # greedy_predictions is not used; the LM beam search output is stored instead.
        log_probs, encoded_len, greedy_predictions = model(input_signal=waveform, input_signal_length=valid_lengths)
        transcriptions = beam_search_lm(log_probs=log_probs, log_probs_length=encoded_len)

    best_transcriptions = get_best_transcriptions(transcriptions)
    records.append({"TEXT": best_transcriptions[0], "REAL TEXT": transcription, "BOOK TITLE": strip_nonas(booktitle), "DURATION": duration})

loop.close()
df = pd.DataFrame(records, columns=["TEXT", "REAL TEXT", "BOOK TITLE", "DURATION"])
df.to_csv(dev_clean_path)

  0%|          | 0/63 [00:00<?, ?it/s][NeMo W 2021-08-16 12:00:04 patch_utils:50] torch.stft() signature has been updated for PyTorch 1.7+
    Please update PyTorch to remain compatible with later versions of NeMo.
    To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
      return torch.floor_divide(self, other)
    
100%|██████████| 63/63 [07:24<00:00,  7.05s/it]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Transcribe every test-clean item with the model + LM beam search and store the
# hypothesis next to the reference text, book title and duration at test_clean_path.
model.eval()
loop = tqdm(test_dataset, position=0, leave=True)
# Rows are gathered in a list and turned into a DataFrame once: growing a frame
# with DataFrame.append per row is quadratic and was removed in pandas 2.0.
records = []

for batch_idx, (waveform, transcription, booktitle, duration) in enumerate(loop):
    # Take the first entry along dim 0 and re-add a leading dimension of size 1
    # (presumably: first channel, batch of one - TODO confirm against the dataset).
    waveform = waveform[0].to(DEVICE).unsqueeze(0)
    # Length passed to the model: size of the waveform's last dimension.
    valid_lengths = torch.tensor([waveform.shape[-1]], device=DEVICE)

    with torch.no_grad():
        # greedy_predictions is not used; the LM beam search output is stored instead.
        log_probs, encoded_len, greedy_predictions = model(input_signal=waveform, input_signal_length=valid_lengths)
        transcriptions = beam_search_lm(log_probs=log_probs, log_probs_length=encoded_len)

    best_transcriptions = get_best_transcriptions(transcriptions)
    records.append({"TEXT": best_transcriptions[0], "REAL TEXT": transcription, "BOOK TITLE": strip_nonas(booktitle), "DURATION": duration})

loop.close()
df = pd.DataFrame(records, columns=["TEXT", "REAL TEXT", "BOOK TITLE", "DURATION"])
df.to_csv(test_clean_path)

100%|██████████| 54/54 [07:41<00:00,  8.55s/it]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Transcribe every train-clean-100 item with the model + LM beam search and store
# the hypothesis next to the reference text, book title and duration.
# Note: this writes to train_clean_100_path, the same file the metadata-only
# export cell for train-clean-100 writes, and therefore replaces it.
model.eval()
loop = tqdm(train_dataset, position=0, leave=True)
# Rows are gathered in a list and turned into a DataFrame once: growing a frame
# with DataFrame.append per row is quadratic and was removed in pandas 2.0.
records = []

for batch_idx, (waveform, transcription, booktitle, duration) in enumerate(loop):
    # Take the first entry along dim 0 and re-add a leading dimension of size 1
    # (presumably: first channel, batch of one - TODO confirm against the dataset).
    waveform = waveform[0].to(DEVICE).unsqueeze(0)
    # Length passed to the model: size of the waveform's last dimension.
    valid_lengths = torch.tensor([waveform.shape[-1]], device=DEVICE)

    with torch.no_grad():
        # greedy_predictions is not used; the LM beam search output is stored instead.
        log_probs, encoded_len, greedy_predictions = model(input_signal=waveform, input_signal_length=valid_lengths)
        transcriptions = beam_search_lm(log_probs=log_probs, log_probs_length=encoded_len)

    best_transcriptions = get_best_transcriptions(transcriptions)
    # Book titles are passed through strip_nonas like in every other export cell,
    # so CR/LF characters never end up inside the CSV field.
    records.append({"TEXT": best_transcriptions[0], "REAL TEXT": transcription, "BOOK TITLE": strip_nonas(booktitle), "DURATION": duration})

loop.close()
df = pd.DataFrame(records, columns=["TEXT", "REAL TEXT", "BOOK TITLE", "DURATION"])
df.to_csv(train_clean_100_path)

In [None]:
# Save the first few items of the "dev-other" split as WAV files named after their book title.
book_dataset = LibriSpeechBookDataset(root=ROOT, url="dev-other", folder_in_archive="LibriSpeech", download=True)

# Iterate the dataset loaded above. The loop previously ran over train_dataset,
# which left book_dataset unused.
# NOTE(review): confirm dev-other (not train-clean-100) is the intended source.
loop = tqdm(book_dataset, position=0, leave=True)
outdir = "out"
os.makedirs(outdir, exist_ok=True)

for batch_idx, (waveform, _, booktitle, _) in enumerate(loop):
    # 16_000 is passed as the sample rate of the written file.
    torchaudio.save(f"{outdir}/{strip_nonas(booktitle)}.wav", waveform, 16_000)
    # Stop after the items with index 0..4 have been written.
    if batch_idx > 3:
        break

loop.close()