In [None]:
import logging

import nemo.collections.asr as nemo_asr
import pandas as pd
import torch
from nemo.collections.nlp.models import PunctuationCapitalizationModel
from pyannote.audio import Pipeline
from pydub import AudioSegment
from pydub.silence import detect_silence

from ctcdecode import CTCBeamDecoder

# Only let ERROR-and-above records through the "nemo_logger" logger.
logging.getLogger("nemo_logger").setLevel(logging.ERROR)
# Notebook-side logger, registered under the name "asr" at INFO level.
asr_logger = logging.getLogger("asr")
asr_logger.setLevel(logging.INFO)


## Model spot-testing

In [None]:
import math
import tempfile
import time
from pathlib import Path

import nvsmi
import torch

# scope the models
target_models = [
    "stt_en_quartznet15x5",
    "stt_en_citrinet_512",
    "stt_en_contextnet_512",
    "stt_en_conformer_ctc_medium",
    "stt_en_conformer_transducer_medium",
]
# keep only the catalogue entries whose names were targeted above
pretrained_models = [
    e
    for e in nemo_asr.models.ASRModel.list_available_models()
    if e.pretrained_model_name in target_models
]

# take a sample podcast
audio_segment = AudioSegment.from_wav(
    "/home/blog-os-asr/output/temp_dir/rewilding-the-scottish-highlands.wav"
)

# progressively slice to gauge memory usage: every interval starts at 0 and
# ends `seconds_increment` later than the previous one, until the last
# interval reaches (or passes) the end of the audio
s2ms = 1000  # seconds -> milliseconds multiplier
seconds_increment = 30  # single place to tune the step between slice lengths
total_seconds = math.ceil(audio_segment.duration_seconds)
slice_intervals = [
    (0, slice_end_seconds * s2ms)
    for slice_end_seconds in range(
        seconds_increment, total_seconds + seconds_increment, seconds_increment
    )
]

# One record per (model, slice length): transcript, GPU memory and wall time.
memory_usage_records = []
for pretrained_model in pretrained_models:
    model_name = pretrained_model.pretrained_model_name
    print(f"Memory testing: {model_name}")

    # model classes defined alongside model names
    model = pretrained_model.class_.from_pretrained(model_name=model_name)
    # baseline GPU memory right after loading, before any audio is transcribed
    # NOTE(review): assumes the first process nvsmi reports is this kernel — confirm
    model_memory_footprint = nvsmi.get_gpu_processes()[0].used_memory

    with tempfile.TemporaryDirectory() as temp_dir:
        # files as input; the same path in the tmp dir is overwritten per slice
        fragment_path = Path(temp_dir) / "memory_test_fragment.wav"
        for interval_start, interval_end in slice_intervals:
            audio_slice = audio_segment[interval_start:interval_end]
            # export() returns the output file handle; close it straight away
            # so handles do not pile up across iterations
            audio_slice.export(fragment_path, format="wav").close()

            try:
                before = time.perf_counter()
                transcription = model.transcribe(
                    paths2audio_files=[str(fragment_path)],
                    batch_size=1,
                )
                after = time.perf_counter()
            except RuntimeError as error:
                # Only an out-of-memory failure means "this model has hit its
                # limit"; anything else is a genuine error and must surface
                # instead of being reported as OOM.
                if "out of memory" not in str(error).lower():
                    raise
                # out-of-memory > move onto next model
                print("CUDA out of memory; skipping remaining slice intervals")
                break

            # collect some metrics
            memory_usage_records.append(
                {
                    "model_name": model_name,
                    "input_size": audio_slice.duration_seconds,
                    "transcript": transcription,
                    "model_memory_footprint": model_memory_footprint,
                    "memory_usage": nvsmi.get_gpu_processes()[0].used_memory,
                    "time_elapsed": after - before,
                }
            )

    # clear for next model
    del model
    torch.cuda.empty_cache()


In [None]:
# Tabulate the collected measurement dicts (one row per appended record).
memory_usage = pd.DataFrame(memory_usage_records)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Grouped bars: one cluster per input size, one bar per model.
sns.set_style("dark")
ax = sns.barplot(
    data=memory_usage,
    x="input_size",
    y="time_elapsed",
    hue="model_name",
)
# tilt the x tick labels so the input sizes do not overlap
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set_title("Input Size x Elapsed Time")


## WER evaluation
- Given the limitations of the conformer/transducer models, the audio is bisected for every model and the two halves are joined back into a whole transcript

In [None]:
# One {"model", "transcript"} record per pretrained model.
full_transcription_records = []

with tempfile.TemporaryDirectory() as temp_dir:
    # Bisect the audio once up front: the two halves are identical for every
    # model, so there is no need to re-export them inside the loop.
    midpoint_ms = math.ceil(audio_segment.duration_seconds / 2) * 1000
    half_paths = [Path(temp_dir) / "first.wav", Path(temp_dir) / "second.wav"]
    halves = (audio_segment[:midpoint_ms], audio_segment[midpoint_ms:])
    for half, half_path in zip(halves, half_paths):
        # export() returns the output file handle; close it explicitly
        half.export(half_path, format="wav").close()

    for pretrained_model in pretrained_models:
        print(f"Processing for {pretrained_model.pretrained_model_name}")
        model = pretrained_model.class_.from_pretrained(
            model_name=pretrained_model.pretrained_model_name
        )
        transcriptions = model.transcribe(
            paths2audio_files=[str(half_path) for half_path in half_paths],
            batch_size=1,
        )

        # A list result is joined directly; any other result is assumed to
        # carry the transcripts in its first element.
        # NOTE(review): presumably the non-list case is the transducer models — confirm
        joined_transcriptions = (
            " ".join(transcriptions)
            if isinstance(transcriptions, list)
            else " ".join(transcriptions[0])
        )
        full_transcription_records.append(
            {
                "model": pretrained_model.pretrained_model_name,
                "transcript": joined_transcriptions,
            }
        )

        # release GPU memory before the next model is loaded
        del model
        torch.cuda.empty_cache()


In [None]:
from jiwer import wer

# Reference transcript that every model's output is scored against.
ground_truth_path = Path(
    "/home/blog-os-asr/output/radio_national_podcasts/transcripts/rewilding-the-scottish-highlands.txt"
)
ground_truth = ground_truth_path.read_text()

# Score each joined transcript with jiwer's `wer` (reference first, hypothesis second).
full_transcriptions = pd.DataFrame(full_transcription_records)
full_transcriptions["wer"] = full_transcriptions.transcript.apply(
    lambda hypothesis: wer(ground_truth, hypothesis)
)

full_transcriptions[["model", "wer"]]


## Test script

In [None]:
from pyannote.audio import Pipeline

# Pretrained diarization pipeline, loaded once and reused by diarize_mono_audio.
DIA_MODEL_NAME = "pyannote/speaker-diarization@2022.07"
DIA_MODEL = Pipeline.from_pretrained(DIA_MODEL_NAME)
# Merged speaker segments whose (end - start) is below this value are dropped
# by diarize_mono_audio. Presumably expressed in seconds — TODO confirm.
PAUSE_THRESHOLD = 1
# Multiplier applied to segment start/end before slicing the audio
# (presumably seconds -> milliseconds).
MS = 1000


def diarize_mono_audio(in_file, audio_segment):
    """Run speaker diarization on ``in_file`` and return merged speaker segments.

    Consecutive turns attributed to the same speaker are collapsed into one
    segment, segments shorter than ``PAUSE_THRESHOLD`` are discarded, and the
    matching slice of ``audio_segment`` is attached to each remaining row.

    Args:
        in_file: Path of the audio file handed to ``DIA_MODEL``.
        audio_segment: The loaded audio that per-segment slices are cut from.

    Returns:
        A DataFrame with the columns ``speaker``, ``start``, ``end``,
        ``segment_marker_count``, ``segment_len`` and ``audio_segment``.
    """
    annotation = DIA_MODEL(str(in_file))

    turn_records = []
    for turn, _, speaker in annotation.itertracks(yield_label=True):
        turn_records.append(
            {"start": turn.start, "end": turn.end, "speaker": speaker}
        )
    turns = pd.DataFrame(turn_records)

    # A new run begins wherever the speaker differs from the previous row; the
    # running total of those change flags labels each run of the same speaker.
    speaker_changed = turns.speaker.shift(1) != turns.speaker
    turns = turns.assign(segment_marker=speaker_changed.cumsum())

    # Collapse each run: first start, last end, and how many turns it absorbed.
    merged = (
        turns.groupby("segment_marker")
        .agg(
            {
                "speaker": "first",
                "start": "first",
                "end": "last",
                "segment_marker": "count",
            }
        )
        .rename(columns={"segment_marker": "segment_marker_count"})
    )
    merged = merged.assign(segment_len=merged.end - merged.start)

    # TODO: finesse a merging strategy
    long_enough = merged.segment_len >= PAUSE_THRESHOLD
    kept = merged.loc[long_enough].reset_index(drop=True)

    child_segments = kept.apply(
        lambda row: _assign_child_segment(row, audio_segment), axis=1
    )
    return kept.assign(audio_segment=child_segments)


def _assign_child_segment(record, parent_audio_segment):
    """Cut the span described by ``record`` out of ``parent_audio_segment``.

    ``record.start`` and ``record.end`` are scaled by ``MS`` before being used
    as the slice bounds on the parent audio.
    """
    slice_start = record.start * MS
    slice_end = record.end * MS
    return parent_audio_segment[slice_start:slice_end]


# Load the sample WAV and diarize it.
# Note: this rebinds the notebook-level `audio_segment` name.
in_file = "/home/blog-os-asr/output/temp_dir/rewilding-the-scottish-highlands.wav"
audio_segment = AudioSegment.from_file(in_file)
# One row per merged speaker segment, each carrying its own audio slice.
diarized_segments = diarize_mono_audio(in_file, audio_segment)


In [None]:
from pydub import AudioSegment, silence, utils

# Logger registered under the name "asr", set to INFO; used for split warnings.
ASR_LOGGER = logging.getLogger("asr")
ASR_LOGGER.setLevel(logging.INFO)

# Upper bound on a single chunk: segments whose segment_len / duration_seconds
# exceeds this value are split further by segment_utterances.
SECOND_MAX_AUDIO = 240


def _pseudo_optimise_silence_split(audio_segment):
    """Split ``audio_segment`` on silences, retrying with other thresholds.

    The first attempt uses ``silence_thresh = dBFS - dbfs_min``. While the
    median duration of the resulting pieces is still at least
    ``SECOND_MAX_AUDIO``, the offset is raised by ``dbfs_delta`` and the split
    is retried. The offset never goes beyond ``dbfs_max``.

    Args:
        audio_segment: The audio to split.

    Returns:
        The list of pieces from the last ``split_on_silence`` attempt. The
        list may be empty, and individual pieces may still be longer than
        ``SECOND_MAX_AUDIO``.
    """
    # note, silence splitting has effect of reducing broader segment > small amounts of drift
    dbfs_min = 10
    dbfs_max = 40
    dbfs_delta = 10
    min_silence_len = 500  # ms
    dBFS = audio_segment.dBFS

    def _split(offset):
        # one split attempt at the given offset below the segment's own dBFS
        return silence.split_on_silence(
            audio_segment,
            min_silence_len=min_silence_len,
            silence_thresh=dBFS - offset,
        )

    def _still_too_long(pieces):
        # an empty result has no median; treat it as "nothing left to improve"
        if not pieces:
            return False
        durations = pd.Series([piece.duration_seconds for piece in pieces])
        return durations.median() >= SECOND_MAX_AUDIO

    # NOTE(review): every retry lowers silence_thresh; confirm that a lower
    # threshold really produces more (shorter) pieces with pydub — TODO confirm
    offset = dbfs_min
    audio_segments = _split(offset)
    # Check the *next* offset against dbfs_max so the bound is never exceeded.
    while _still_too_long(audio_segments) and offset + dbfs_delta <= dbfs_max:
        ASR_LOGGER.warning(
            f"Unable to split segment on silences with silence_thresh of {dBFS - offset}; re-attempting.."
        )
        offset += dbfs_delta
        audio_segments = _split(offset)

    return audio_segments


def segment_utterances(audio_segment_record):
    """Break an over-long diarized segment into chunks of bounded length.

    Records whose ``segment_len`` does not exceed ``SECOND_MAX_AUDIO`` are
    returned unchanged as a one-row frame. Longer records are split on
    silences; any piece that is still longer than ``SECOND_MAX_AUDIO`` is cut
    into fixed-size chunks. Start times are rebuilt by accumulating the chunk
    durations from the record's own start.

    Args:
        audio_segment_record: A row carrying at least ``segment_len``,
            ``audio_segment``, ``speaker`` and ``start``.

    Returns:
        A DataFrame with one row per resulting chunk. For split records the
        columns are ``audio_segment``, ``speaker``, ``segment_len``, ``start``
        and ``end``.
    """
    needs_split = audio_segment_record.segment_len > SECOND_MAX_AUDIO
    if not needs_split:
        return audio_segment_record.to_frame().T

    silence_splits = _pseudo_optimise_silence_split(
        audio_segment_record.audio_segment
    )
    if not silence_splits:
        # The silence split found nothing to keep. Fall back to the whole
        # segment so it is chunked by size below; otherwise the frame would
        # have no `segment_len` column and computing `end` would fail.
        silence_splits = [audio_segment_record.audio_segment]

    all_splits = []
    for split in silence_splits:
        if split.duration_seconds > SECOND_MAX_AUDIO:
            all_splits.extend(utils.make_chunks(split, SECOND_MAX_AUDIO * MS))
        else:
            all_splits.append(split)

    # Each chunk starts where the previous one ended, counted from the
    # record's start. Silence trimmed by the split is not accounted for, so
    # the times can drift slightly.
    start_times = []
    start_time = audio_segment_record.start
    for chunk in all_splits:
        start_times.append(start_time)
        start_time += chunk.duration_seconds

    segments = (
        pd.DataFrame(
            [
                {
                    "audio_segment": chunk,
                    "speaker": audio_segment_record.speaker,
                    "segment_len": chunk.duration_seconds,
                }
                for chunk in all_splits
            ]
        )
        .assign(start=start_times)
        .assign(end=lambda x: x.start + x.segment_len)
    )
    return segments


# Expand every diarized segment into its own frame of chunks, then stack the
# per-segment frames into one table with a fresh 0..n-1 index.
per_segment_frames = diarized_segments.apply(segment_utterances, axis=1)
chunked_diarized_segments = pd.concat(
    per_segment_frames.tolist(), ignore_index=True
)


In [None]:
from pathlib import Path

# Pretrained ASR model used to transcribe every chunk.
ASR_MODEL_NAME = "stt_en_quartznet15x5"
ASR_MODEL = nemo_asr.models.ASRModel.from_pretrained(model_name=ASR_MODEL_NAME)
BATCH_SIZE = 4
temp_dir = Path("../output/temp_dir")
# make sure the export target exists on a fresh checkout / fresh kernel
temp_dir.mkdir(parents=True, exist_ok=True)

paths2audio_files = []  # explicitly sequence, RE: sorted() issues
for idx, record in chunked_diarized_segments.iterrows():
    chunk_path = temp_dir / f"chunk_{idx}.wav"
    # export() returns the output file handle; close it so one handle per
    # chunk is not left open for the rest of the session
    record.audio_segment.export(chunk_path, format="wav").close()
    paths2audio_files.append(str(chunk_path))

# The paths are in row order, so the outputs can be attached positionally.
asr_outputs = ASR_MODEL.transcribe(
    paths2audio_files=paths2audio_files,
    batch_size=BATCH_SIZE,
    return_hypotheses=True,
)
chunked_diarized_segments = chunked_diarized_segments.assign(asr_outputs=asr_outputs)


In [None]:
from nemo.collections.nlp.models import PunctuationCapitalizationModel

# Pretrained punctuation/capitalization model, loaded once and reused by
# _punctuate_collapse_segment.
PUNCT_MODEL_NAME = "punctuation_en_bert"
PUNCT_MODEL = PunctuationCapitalizationModel.from_pretrained(PUNCT_MODEL_NAME)


def _punctuate_collapse_segment(record):
    """Collapse one group of chunk rows into a single punctuated exchange.

    Args:
        record: The rows of one group; must provide ``speaker``, ``start``,
            ``end`` and ``asr_outputs`` (objects exposing ``.text``).

    Returns:
        A dict with the first row's ``speaker``, the earliest ``start``, the
        latest ``end`` and the punctuated, capitalized ``transcript`` built
        from the space-joined chunk texts.
    """
    speaker = record.iloc[0].speaker
    earliest_start = record.start.min()
    latest_end = record.end.max()

    chunk_texts = [hypothesis.text for hypothesis in record.asr_outputs]
    raw_transcript = " ".join(chunk_texts)
    # the model takes a list of queries; a single query goes in, one comes out
    punctuated = PUNCT_MODEL.add_punctuation_capitalization([raw_transcript])

    return {
        "speaker": speaker,
        "start": earliest_start,
        "end": latest_end,
        "transcript": punctuated[0],
    }


# Flag rows where the speaker differs from the previous row; the running total
# of those flags labels each uninterrupted run by one speaker.
speaker_changed = (
    chunked_diarized_segments.speaker.shift(1) != chunked_diarized_segments.speaker
)
exchange_groups = chunked_diarized_segments.assign(
    segment_marker=speaker_changed.cumsum()
).groupby("segment_marker")

# One punctuated record per run, assembled into a frame.
exchange_records = exchange_groups.apply(_punctuate_collapse_segment).tolist()
punctuated_exchanges = pd.DataFrame(exchange_records)


In [None]:
# Raise the column-width display option, presumably so the long transcript
# text is not truncated in the rendered table.
pd.set_option("max_colwidth", 400)

# Bare last expression: rendered as the cell's output.
punctuated_exchanges


## Test module invocation

In [None]:
import sys

# The search path has to be extended *before* `asr` is imported, otherwise the
# import fails on a fresh kernel (presumably `asr` lives in the parent
# directory). The membership check keeps re-runs from appending duplicates.
if ".." not in sys.path:
    sys.path.append("..")

from asr import transcribe_mono_audio  # noqa: E402  isort:skip

# Run the packaged pipeline end to end on the source MP3.
transcription = transcribe_mono_audio(
    "../output/radio_national_podcasts/audio/rewilding-the-scottish-highlands.mp3"
)
transcription
