### IMPORTING RIVA NeMo LIBS

In [1]:
import nemo.collections.asr as nemo_asr
from nemo.core.classes import IterableDataset

import torch
import torchaudio
from torch.utils.data import DataLoader

import pandas as pd
from tqdm import tqdm

import math
import numpy as np

  warn(f"Failed to load image Python extension: {e}")
[NeMo W 2023-04-28 20:05:52 optimizers:54] Apex was not found. Using the lamb or fused_adam optimizer will error out.
[NeMo W 2023-04-28 20:05:53 experimental:27] Module <class 'nemo.collections.asr.modules.audio_modules.SpectrogramToMultichannelFeatures'> is experimental, not ready for production and is not fully supported. Use at your own risk.


### LOAD NeMO MODEL --- CONFORMER-CTC

In [2]:
model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained("nvidia/stt_en_conformer_ctc_large")

[NeMo I 2023-04-28 20:06:07 mixins:170] Tokenizer SentencePieceTokenizer initialized with 128 tokens


[NeMo W 2023-04-28 20:06:08 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /data/NeMo_ASR_SET/English/v2.0/train/tarred_audio_manifest.json
    sample_rate: 16000
    batch_size: 32
    shuffle: true
    num_workers: 8
    pin_memory: true
    use_start_end_token: false
    trim_silence: false
    max_duration: 20.0
    min_duration: 0.1
    shuffle_n: 2048
    is_tarred: true
    tarred_audio_filepaths: /data/NeMo_ASR_SET/English/v2.0/train/audio__OP_0..4095_CL_.tar
    
[NeMo W 2023-04-28 20:06:08 modelPT:168] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath:
    - /data/ASR/LibriSpeech/librispeech_withs

[NeMo I 2023-04-28 20:06:08 features:287] PADDING: 0
[NeMo I 2023-04-28 20:06:08 save_restore_connector:247] Model EncDecCTCModelBPE was successfully restored from C:\Users\saadn\.cache\huggingface\hub\models--nvidia--stt_en_conformer_ctc_large\snapshots\2c8326e4e43ae5b994612cfea3f3029818fb23c6\stt_en_conformer_ctc_large.nemo.


In [3]:
def getRivaTranscript(audio) -> str:
    # Convert audio tensor to a numpy array
    samples = audio.get_array_of_samples()
    audio = np.array(samples)
    
    # Set up buffer and chunk lengths, stride, and sample rate
    context_len_in_secs = 1
    chunk_len_in_secs = 15            
    stride = 4
    sample_rate = 16000
    chunk_len = sample_rate * chunk_len_in_secs
    buffer_len_in_secs = chunk_len_in_secs + 2 * context_len_in_secs
    buffer_len = sample_rate * buffer_len_in_secs
    sampbuffer = np.zeros([buffer_len], dtype=np.float32)
    
    # Read chunks of audio
    chunk_reader = AudioChunkIterator(audio, chunk_len_in_secs, sample_rate)
    buffer_list = []
    for chunk in chunk_reader:
        sampbuffer[:-chunk_len] = sampbuffer[chunk_len:]
        sampbuffer[-chunk_len:] = chunk
        buffer_list.append(np.array(sampbuffer))
    
    # Transcribe the audio chunks
    asr_decoder = ChunkBufferDecoder(model, stride=stride, chunk_len_in_secs=chunk_len_in_secs, buffer_len_in_secs=buffer_len_in_secs)
    transcription = asr_decoder.transcribe_buffers(buffer_list, plot=False)    
    
    # Return the transcription
    return transcription

## Support Functionality

In [4]:
import librosa
import matplotlib.pyplot as plt
from scipy.io.wavfile import read, write
from IPython.display import Audio
from numpy.fft import fft, ifft
from pydub import AudioSegment
%matplotlib inline

In [5]:
class AudioChunkIterator():
    def __init__(self, samples, frame_len, sample_rate):
        self._samples = samples
        self._chunk_len = frame_len*sample_rate
        self._start = 0
        self.output=True
   
    def __iter__(self):
        return self
    
    def __next__(self):
        if not self.output:
            raise StopIteration
        last = int(self._start + self._chunk_len)
        if last <= len(self._samples):
            chunk = self._samples[self._start: last]
            self._start = last
        else:
            chunk = np.zeros([int(self._chunk_len)], dtype='float32')
            samp_len = len(self._samples) - self._start
            chunk[0:samp_len] = self._samples[self._start:len(self._samples)]
            self.output = False
        return chunk

def speech_collate_fn(batch):
    """collate batch of audio sig, audio len
    Args:
        batch (FloatTensor, LongTensor):  A tuple of tuples of signal, signal lengths.
        This collate func assumes the signals are 1d torch tensors (i.e. mono audio).
    """
    _, audio_lengths = zip(*batch)

    max_audio_len = 0
    has_audio = audio_lengths[0] is not None
    if has_audio:
        max_audio_len = max(audio_lengths).item()
   
    audio_signal= []
    for sig, sig_len in batch:
        if has_audio:
            sig_len = sig_len.item()
            if sig_len < max_audio_len:
                pad = (0, max_audio_len - sig_len)
                sig = torch.nn.functional.pad(sig, pad)
            audio_signal.append(sig)
        
    if has_audio:
        audio_signal = torch.stack(audio_signal)
        audio_lengths = torch.stack(audio_lengths)
    else:
        audio_signal, audio_lengths = None, None

    return audio_signal, audio_lengths

# simple data layer to pass audio signal
class AudioBuffersDataLayer(IterableDataset):
    def __init__(self):
        super().__init__()
        
    def __iter__(self):
        return self
    
    def __next__(self):
        if self._buf_count == len(self.signal) :
            raise StopIteration
        self._buf_count +=1
        return torch.as_tensor(self.signal[self._buf_count-1], dtype=torch.float32), \
               torch.as_tensor(self.signal_shape[0], dtype=torch.int64)
        
    def set_signal(self, signals):
        self.signal = signals
        self.signal_shape = self.signal[0].shape
        self._buf_count = 0

    def __len__(self):
        return 1
    
class ChunkBufferDecoder:
    def __init__(self,asr_model, stride, chunk_len_in_secs=1, buffer_len_in_secs=3):
        self.asr_model = asr_model
        self.asr_model.eval()
        self.data_layer = AudioBuffersDataLayer()
        self.data_loader = DataLoader(self.data_layer, batch_size=1, collate_fn=speech_collate_fn)
        self.buffers = []
        self.all_preds = []
        self.chunk_len = chunk_len_in_secs
        self.buffer_len = buffer_len_in_secs
        assert(chunk_len_in_secs<=buffer_len_in_secs)
        
        feature_stride = asr_model._cfg.preprocessor['window_stride']
        self.model_stride_in_secs = feature_stride * stride
        self.n_tokens_per_chunk = math.ceil(self.chunk_len / self.model_stride_in_secs)
        self.blank_id = len(asr_model.decoder.vocabulary)
        self.plot=False
        
    @torch.no_grad()    
    def transcribe_buffers(self, buffers, merge=True, plot=False):
        self.plot = plot
        self.buffers = buffers
        self.data_layer.set_signal(buffers[:])
        self._get_batch_preds()      
        return self.decode_final(merge)
    
    def _get_batch_preds(self):
        device = self.asr_model.device
        for batch in iter(self.data_loader):

            audio_signal, audio_signal_len = batch

            audio_signal, audio_signal_len = audio_signal.to(device), audio_signal_len.to(device)
            log_probs, encoded_len, predictions = self.asr_model(input_signal=audio_signal, input_signal_length=audio_signal_len)
            preds = torch.unbind(predictions)
            for pred in preds:
                self.all_preds.append(pred.cpu().numpy())
    
    def decode_final(self, merge=True, extra=0):
        self.unmerged = []
        self.toks_unmerged = []
        # index for the first token corresponding to a chunk of audio would be len(decoded) - 1 - delay
        delay = math.ceil((self.chunk_len + (self.buffer_len - self.chunk_len) / 2) / self.model_stride_in_secs)

        decoded_frames = []
        all_toks = []
        for pred in self.all_preds:
            ids, toks = self._greedy_decoder(pred, self.asr_model.tokenizer)
            decoded_frames.append(ids)
            all_toks.append(toks)

        for decoded in decoded_frames:
            self.unmerged += decoded[len(decoded) - 1 - delay:len(decoded) - 1 - delay + self.n_tokens_per_chunk]
        if self.plot:
            for i, tok in enumerate(all_toks):
                plt.plot(self.buffers[i])
                plt.show()
                print("\nGreedy labels collected from this buffer")
                print(tok[len(tok) - 1 - delay:len(tok) - 1 - delay + self.n_tokens_per_chunk])                
                self.toks_unmerged += tok[len(tok) - 1 - delay:len(tok) - 1 - delay + self.n_tokens_per_chunk]
            print("\nTokens collected from succesive buffers before CTC merge")
            print(self.toks_unmerged)

        if not merge:
            return self.unmerged
        return self.greedy_merge(self.unmerged)
    
    def _greedy_decoder(self, preds, tokenizer):
        s = []
        ids = []
        for i in range(preds.shape[0]):
            if preds[i] == self.blank_id:
                s.append("_")
            else:
                pred = preds[i]
                s.append(tokenizer.ids_to_tokens([pred.item()])[0])
            ids.append(preds[i])
        return ids, s
         
    def greedy_merge(self, preds):
        decoded_prediction = []
        previous = self.blank_id
        for p in preds:
            if (p != previous or previous == self.blank_id) and p != self.blank_id:
                decoded_prediction.append(p.item())
            previous = p
        hypothesis = self.asr_model.tokenizer.ids_to_text(decoded_prediction)
        return hypothesis
    
def plot_audio(audio):
    samples = audio.get_array_of_samples()
    np.array(samples)
    plt.figure(figsize=[14,5])
    plt.plot(samples)
    plt.xlabel('Sample Index')
    plt.ylabel('Amplitude')
    plt.title('Waveform of Agent Audio')
    plt.show()

In [6]:
def merge_conversation(list1, list2, column=3):
    merged_list = []
    i = j = 0
    n1, n2 = len(list1), len(list2)
    while i < n1 and j < n2:
        if list1[i][column] <= list2[j][column]:
            merged_list.append(list1[i])
            i += 1
        else:
            merged_list.append(list2[j])
            j += 1
    merged_list.extend(list1[i:])
    merged_list.extend(list2[j:])
    return merged_list

In [7]:
from pydub import silence

def split_on_silence(audio, silence_detect_fn, label):
    silences = silence_detect_fn(audio)
    chunks = []
    start = 0
    for i, (start_silence, end_silence) in enumerate(silences):
        transcript = ""
        end = start_silence * 1000
        mid =  (start + end) / 2
        chunk = audio[start:end]
        if chunk.duration_seconds > 0.5:
            transcript = getRivaTranscript(chunk)       ## TRANSCRIPTION OF CHUNK THROIUGH RIVA   
        chunks.append((label, start/1000, end/1000, mid, chunk, transcript))
        start = end_silence * 1000
    end = audio.duration_seconds * 1000
    mid =  (start + end) / 2
    chunk = audio[start:end]
    if chunk.duration_seconds > 0.5:
        transcript = getRivaTranscript(chunk)       ## TRANSCRIPTION OF CHUNK THROIUGH RIVA   
    chunks.append((label, start/1000, end/1000, mid, chunk, transcript))
    return chunks

def extract_silences(audio, dBFS=None):
    if dBFS is None:
        dBFS = audio.dBFS
    sil = silence.detect_silence(audio, min_silence_len=1000, silence_thresh=dBFS-16)
    return [(start/1000, stop/1000) for start, stop in sil]

In [8]:
def dialog_transcription(agent_file, customer_file):
    # Read Audio files using pydub
    agent_audio = AudioSegment.from_file(agent_file)
    customer_audio = AudioSegment.from_file(customer_file)
        
    #Resampling to 16k
    agent_audio = agent_audio.set_frame_rate(16000)
    customer_audio = customer_audio.set_frame_rate(16000)
    
    #split audios based on silences then merge and order
    agent_chunks = split_on_silence(agent_audio, extract_silences, "Agent")
    customer_chunks = split_on_silence(customer_audio, extract_silences, "Customer")
    dialog =  merge_conversation(agent_chunks, customer_chunks)
    
    # Concatenate transcript of the chunks with their timestamps
    result = ""
    for label, start, end, mid, chunk, transcript in dialog:
        if chunk.duration_seconds > 0.5: 
            #print("[{:<4.2f}, {:<4.2f}] {:<8}: {}".format(start, end, label, transcript))
            chunk_str = "[{:<4.2f}, {:<4.2f}] {:<8} :{}\n".format(start, end, label, transcript)
            result += chunk_str
    
    return result

## TEST TRANSCRIPTS

In [9]:
transcript = dialog_transcription("MegaAudio/utr7lk3r1qfnjhcp41f7_1_TFS_600004.wav", "MegaAudio/utr7lk3r1qfnjhcp41f7_600002.wav")
print(transcript)

[0.00, 3.04] Agent    :ternoon this is sou is speaking how may i help you
[7.67, 13.28] Customer :helloso my name is mitin yanda and i'm calling because we have issues with our internet connect
[11.82, 14.42] Agent    :liir could you please provide me with a bortai
[19.08, 26.74] Customer :during the day or internet is fad but during the night the connection is very bad could cause that
[26.34, 29.73] Agent    :this is because during the night there is a lot of more traffic
[34.94, 36.51] Customer :what we should doy
[36.09, 40.18] Agent    :my subpicion is to send you a new router that will be more powerful
[44.70, 45.45] Customer :the suny
[44.57, 49.20] Agent    :i will inform o technical team to visit you and implement a new route
[51.96, 55.75] Agent    :could you please provide me with your address mr yano
[60.57, 62.41] Customer :brs tuding stn
[61.68, 65.45] Agent    :thank you wou next tuesday at three pm woul you
[69.88, 72.59] Customer :yes fine i will be at home at thattime

In [10]:
transcript = dialog_transcription("MegaAudio/d3nn9h1jsqje8dv9v0pq_1_TFS_400059.wav", "MegaAudio/d3nn9h1jsqje8dv9v0pq_101011.wav")
print(transcript)

[1.80, 6.66] Agent    :good morning thank you for calling dr nron salon this is matt speaking how may i assist you today
[7.78, 11.82] Customer :hy i was wondering if i could book an appointment for a hair cut
[13.55, 18.81] Agent    :of course i'ld be happy to help you with that can you please tell me the date and the time that worked for you
[20.23, 25.81] Customer :i was thinking about next thursday at two pm will that be available
[27.62, 29.24] Agent    :uh let me check our schedule for you
[30.74, 34.53] Agent    :yes we have an opening for an haircut at two pm next thursday
[35.62, 37.83] Customer :mat can you please look up for me
[39.78, 41.51] Agent    :certainly may i have your name plea
[42.35, 44.06] Customer :my name is jennifer
[45.67, 53.60] Agent    :perfect jennifer i have booked you a haircut at two p m next thursday may i have your phone number in case we need to reach you
[54.79, 60.84] Customer :sure my phone number is four one five five one three one one three ni

In [11]:
transcript = dialog_transcription("MegaAudio/1snipln0j6obdfsbeqhv_400059.wav", "MegaAudio/1snipln0j6obdfsbeqhv_1_TFS_101011.wav")
print(transcript)

[1.80, 5.72] Agent    :hello this is sakan from affinity insurance how are you doing today
[7.80, 9.24] Customer :hello i'm good thank you
[10.90, 20.05] Agent    :i'm calling because i notice that you may be in need of insurance coverage for your car can i ask you a few questions to determine the best coverage options for you
[22.26, 23.11] Customer :nine
[24.38, 28.98] Agent    :waight let me start by asking do you currently have insurance coverage for your car
[30.78, 32.90] Customer :yes i have insurance coverage for my c
[33.82, 39.84] Agent    :a that's great can i ask what type of coverage you have and who is your current provider
[41.98, 46.58] Customer :i have to covered with hundred percent payment premiums with aliiance insurance
[47.66, 56.53] Agent    :okay thank you let me take a moment to review your current coverage and see if there are any areas where we can improve your coverage or offer a better one
[58.77, 59.63] Customer :okay s
[62.70, 79.72] Agent    :great i hav

In [14]:
import torch

In [15]:
torch.cuda.get_device_name(0)