In [None]:
import logging

import nemo.collections.asr as nemo_asr
import pandas as pd
import torch
from nemo.collections.nlp.models import PunctuationCapitalizationModel
from pyannote.audio import Pipeline
from pydub import AudioSegment
from pydub.silence import detect_silence
from ctcdecode import CTCBeamDecoder

# Logging configuration for the notebook:
#   * "asr" logger        -> INFO and above (handle kept as `asr_logger`)
#   * "nemo_logger" logger -> ERROR and above only
asr_logger = logging.getLogger("asr")
asr_logger.setLevel(logging.INFO)
logging.getLogger("nemo_logger").setLevel(logging.ERROR)


## Model spot-testing

In [None]:
import tempfile
import nvsmi
from pathlib import Path
import time
import torch
import math

# Scope the benchmark to a fixed set of pretrained NeMo ASR checkpoints.
target_models = [
    'stt_en_quartznet15x5',
    'stt_en_citrinet_512',
    'stt_en_contextnet_512',
    'stt_en_conformer_ctc_medium',
    'stt_en_conformer_transducer_medium',
]
pretrained_models = [
    e
    for e in nemo_asr.models.ASRModel.list_available_models()
    if e.pretrained_model_name in target_models
]

# take a sample podcast
audio_segment = AudioSegment.from_wav('/home/blog-os-asr/output/temp_dir/rewilding-the-scottish-highlands.wav')

# progressively slice to gauge memory usage
probable_max_audio_seconds = 60 * 3
s2ms = 1000
seconds_increment = 30

# Cumulative slices that all start at 0 ms and grow by `seconds_increment`
# (0-30s, 0-60s, ...). The last end point is the first multiple of the increment
# at or beyond the (rounded-up) clip duration.
slice_intervals = [
    (0, end_seconds * s2ms)
    for end_seconds in range(
        seconds_increment,
        math.ceil(audio_segment.duration_seconds) + seconds_increment,
        seconds_increment,
    )
]
# slice_intervals = [(0, e*s2ms) for e in list(range(0,probable_max_audio_seconds,seconds_increment))][1:]

# One record per (model, slice length) that transcribed successfully.
memory_usage_records = []
for pretrained_model in pretrained_models:
    model_name = pretrained_model.pretrained_model_name
    print(f"Memory testing: {model_name}")

    # model classes defined alongside model names
    model = pretrained_model.class_.from_pretrained(model_name=model_name)
    # NOTE(review): assumes the first process reported by nvsmi is this kernel;
    # confirm on a GPU that is shared with other processes.
    model_memory_footprint = nvsmi.get_gpu_processes()[0].used_memory

    with tempfile.TemporaryDirectory() as temp_dir:
        # files as input; save in tmp dir (the same file is overwritten for every slice)
        fragment_path = Path(temp_dir) / 'memory_test_fragment.wav'
        for start_ms, end_ms in slice_intervals:
            try:
                # `audio_slice`, not `slice`: avoid shadowing the builtin for the rest of the session
                audio_slice = audio_segment[start_ms:end_ms]
                # export() hands back an open file handle; close it so handles do not pile up
                audio_slice.export(fragment_path, format='wav').close()

                # perf_counter is monotonic, so the elapsed time cannot be skewed by clock adjustments
                before = time.perf_counter()
                transcription = model.transcribe(paths2audio_files=[str(fragment_path)], batch_size=1)
                after = time.perf_counter()

                # collect some metrics
                memory_usage_records.append({
                    'model_name': model_name,
                    'input_size': audio_slice.duration_seconds,
                    'transcript': transcription,
                    'model_memory_footprint': model_memory_footprint,
                    'memory_usage': nvsmi.get_gpu_processes()[0].used_memory,
                    'time_elapsed': after - before,
                })
            except Exception as err:
                # Still best-effort (move on to the next model), but report what actually
                # failed instead of labelling every error as out-of-memory. Catching
                # Exception rather than using a bare except lets Ctrl-C interrupt the run.
                if isinstance(err, RuntimeError) and 'out of memory' in str(err).lower():
                    # out-of-memory > move onto next model
                    print('CUDA out of memory; skipping remaining slice intervals')
                else:
                    print(f'{type(err).__name__}: {err}; skipping remaining slice intervals')
                break

    # clear for next model
    del model
    torch.cuda.empty_cache()


In [None]:
# Tabulate the benchmark: one row per collected record, one column per record key.
memory_usage = pd.DataFrame(data=memory_usage_records)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_style("dark")

# Use an explicit figure/axes pair rather than pyplot's implicit "current axes",
# so the labels and tick rotation are applied to exactly this chart.
fig, ax = plt.subplots()
sns.barplot(x="input_size", y="time_elapsed", hue="model_name", data=memory_usage, ax=ax)

# `input_size` is the slice duration in seconds and `time_elapsed` is the
# transcription wall time in seconds, so label the axes with their units.
ax.set_xlabel("Input size (seconds of audio)")
ax.set_ylabel("Elapsed time (seconds)")
ax.set_title('Input Size x Elapsed Time')
# Ending on tick_params (returns None) keeps a stray `Text(...)` repr out of the cell output.
ax.tick_params(axis="x", labelrotation=45)

## WER evaluation
- Given the input-length (GPU memory) limits of the conformer/transducer models, bisect the audio for all models, transcribe each half, then join the halves into a whole transcript.

In [41]:
# NOTE(review): in the visible notebook `transcriptions` is only assigned in a later cell
# (the bisected-audio transcription loop), so this cell presumably fails on a fresh
# Restart & Run All — confirm and move it below that loop or remove it.
transcriptions

(["but we begin with jeremy leggett former professor at imperial college and oxford with his massive project in a world famous setting highlands reworlding is a new type of company it's a mass ownership company and it's going to do its level best to accelerate nature recovery in the highlands of scotland so as to contribute to repair of climate melt down and biodiversity claps that's some remit isn't it jeremy leggett could you place us where are we i know down below is loch ness but lochnes much to my surprise it's very very long so where are we we're halfway along loch ness about sixteen miles from a fairly big city inverness the capital of the highlands of scotland despite the proximity to that city we're in a really wild place there's one thousand two hundred acres on this estate that we're managing and not many footpaths and there's a lot of genuine wildland in our mixed habitats of native woodlands peatlands pasture heathland and coniferous plantations how long is loch ness becau

In [50]:
# Check whether the first element of the transcribe() result is itself a list
# (i.e. the result is a container of lists rather than a flat list of strings).
# isinstance is the idiomatic type test and also accepts list subclasses.
isinstance(transcriptions[0], list)

True

In [51]:
# One record per model: {'model': <name>, 'transcript': <both halves joined>}.
full_transcription_records = []

with tempfile.TemporaryDirectory() as temp_dir:
    # The two halves are identical for every model, so bisect and export the audio
    # once up front instead of re-encoding both files on every loop iteration.
    midpoint_ms = math.ceil(audio_segment.duration_seconds / 2) * 1000
    first_half_path = Path(temp_dir) / 'first.wav'
    second_half_path = Path(temp_dir) / 'second.wav'
    # export() hands back an open file handle; close it straight away.
    audio_segment[:midpoint_ms].export(first_half_path, format='wav').close()
    audio_segment[midpoint_ms:].export(second_half_path, format='wav').close()
    half_paths = [str(first_half_path), str(second_half_path)]

    for pretrained_model in pretrained_models:
        print(f"Processing for {pretrained_model.pretrained_model_name}")
        model = pretrained_model.class_.from_pretrained(model_name=pretrained_model.pretrained_model_name)
        transcriptions = model.transcribe(paths2audio_files=half_paths, batch_size=1)

        # transcribe() either returns a flat list of strings, or a container whose
        # first element is that list; normalise to the list before joining.
        best_transcripts = transcriptions if isinstance(transcriptions, list) else transcriptions[0]
        joined_transcriptions = ' '.join(best_transcripts)
        full_transcription_records.append({'model': pretrained_model.pretrained_model_name, 'transcript': joined_transcriptions})

        # free GPU memory before loading the next model
        del model
        torch.cuda.empty_cache()

Processing for stt_en_citrinet_512


Transcribing:   0%|          | 0/2 [00:00<?, ?it/s]

Processing for stt_en_conformer_ctc_medium


Transcribing:   0%|          | 0/2 [00:00<?, ?it/s]

Processing for stt_en_conformer_transducer_medium


Transcribing:   0%|          | 0/2 [00:00<?, ?it/s]

Processing for stt_en_contextnet_512


Transcribing:   0%|          | 0/2 [00:00<?, ?it/s]

Processing for stt_en_quartznet15x5


Transcribing:   0%|          | 0/2 [00:00<?, ?it/s]

In [56]:
from jiwer import wer

# Reference transcript that every model's output is scored against.
reference_path = Path('/home/blog-os-asr/output/radio_national_podcasts/transcripts/rewilding-the-scottish-highlands.txt')
ground_truth = reference_path.read_text()

# Score each model's joined transcript: wer(reference, hypothesis).
# NOTE(review): no text normalisation is applied to either side here; if the reference
# file contains punctuation/casing the model output lacks, WER will be inflated — confirm.
transcript_frame = pd.DataFrame(full_transcription_records)
wer_scores = transcript_frame.transcript.map(lambda hypothesis: wer(ground_truth, hypothesis))
full_transcriptions = transcript_frame.assign(wer=wer_scores)

# Display only the per-model scores (the transcripts themselves are long).
full_transcriptions[['model', 'wer']]

Unnamed: 0,model,wer
0,stt_en_citrinet_512,0.318701
1,stt_en_conformer_ctc_medium,0.334937
2,stt_en_conformer_transducer_medium,0.285628
3,stt_en_contextnet_512,0.293446
4,stt_en_quartznet15x5,0.326518


## Test script

In [None]:
from asr import transcribe_mono

# Run `transcribe_mono` (imported from `asr`) on the sample podcast and display the result.
podcast_audio_path = "../output/radio_national_podcasts/audio/rewilding-the-scottish-highlands.mp3"
transcription = transcribe_mono(podcast_audio_path)
transcription

In [None]:
# Accumulator for the punctuated, time-stamped word records.
# NOTE(review): in the visible code nothing is appended to it — the only append is in
# the commented-out branch below.
time_formatted_words_all = []
# Process the first 10 rows of `transcription` only.
# NOTE(review): `_format_word_timestamps` and `punct_model` are not defined in the visible
# part of the notebook — confirm they are imported/created before this cell runs.
for idx, record in transcription.head(10).iterrows():
    # Build the word entries for this row from its ASR outputs and start value; each entry
    # is indexed with "word" below, so they are presumably dicts — TODO confirm.
    time_formatted_words = _format_word_timestamps(record.asr_outputs, record.start)

    # 5.0 apply punctuation to each output
    # The words are joined into one space-separated string and passed as a single-item
    # batch; [0] takes the result for that one item.
    punctuated_sequence = punct_model.add_punctuation_capitalization(
        [" ".join(e["word"] for e in time_formatted_words)]
    )[0]

    # if len(punctuated_sequence.split(" ")) == len(time_formatted_words):
    #     # easy case, where punctuated output len matches input len; assign directly
    #     punctuated_sequence_joined = (
    #         pd.DataFrame(time_formatted_words)
    #         .assign(word=punctuated_sequence.split(" "))
    #         .assign(speakerTag=record.speaker)
    #         .to_dict(orient="records")
    #     )
    #     time_formatted_words_all.append(punctuated_sequence_joined)
    # else:
    #     # otherwise.. pad the difference? changes should be limited to immediately proceeding fullstops, commas, question marks
    #     # https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/punctuation_and_capitalization.html
    #     print("Punctuated outputs not the same length as input")