In [None]:
import librosa
import numpy as np
import time
from asr_streaming_object import ASRStreaming

# Initialize the ASRStreaming object
asr_streaming = ASRStreaming(model_name="stt_en_fastconformer_hybrid_large_streaming_multi", lookahead_size=80, decoder_type="rnnt")

# Load the audio file
y, sr = librosa.load('2086-149220-0033.wav', sr=16000)

# Define the chunk duration in seconds
sr = 16000
chunk_duration = 0.160  # seconds
chunk_samples = int(sr * chunk_duration) - 1  # samples per chunk
print(chunk_samples)

# Calculate the number of chunks
num_chunks = int(np.ceil(len(y) / chunk_samples))
print(num_chunks)

# Process each chunk
for i in range(num_chunks):
    start_sample = i * chunk_samples
    end_sample = min((i + 1) * chunk_samples, len(y))
    chunk = y[start_sample:end_sample]
    
    # If the chunk is shorter than chunk_samples, pad with zeroes
    if len(chunk) < chunk_samples:
        padding = chunk_samples - len(chunk)
        chunk = np.pad(chunk, (0, padding), mode='constant')

    chunk = (chunk * 32767).astype(np.int16)

    start_time = time.time()
    text = asr_streaming.transcribe_chunk(chunk)
    elapsed_time = time.time() - start_time

    print(f"--- {elapsed_time:.2f} seconds ---")
    print(f"--- RTF {chunk_duration / elapsed_time:.2f} ---")
    print(f"Chunk {i + 1}: {text}")

[NeMo I 2024-12-06 15:50:44 cloud:58] Found existing object /home/sander/.cache/torch/NeMo/NeMo_2.0.0/stt_en_fastconformer_hybrid_large_streaming_multi/a22a1091ef0b90f40b4e99859c44b15e/stt_en_fastconformer_hybrid_large_streaming_multi.nemo.
[NeMo I 2024-12-06 15:50:44 cloud:64] Re-using file from: /home/sander/.cache/torch/NeMo/NeMo_2.0.0/stt_en_fastconformer_hybrid_large_streaming_multi/a22a1091ef0b90f40b4e99859c44b15e/stt_en_fastconformer_hybrid_large_streaming_multi.nemo
[NeMo I 2024-12-06 15:50:44 common:826] Instantiating model from pre-trained checkpoint
[NeMo I 2024-12-06 15:50:45 mixins:173] Tokenizer SentencePieceTokenizer initialized with 1024 tokens


[NeMo W 2024-12-06 15:50:46 modelPT:176] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath:
    - - /raid/local//bucket1/tarred_audio_manifest.json
    - - /raid/local//bucket2/tarred_audio_manifest.json
    - - /raid/local//bucket3/tarred_audio_manifest.json
    - - /raid/local//bucket4/tarred_audio_manifest.json
    - - /raid/local//bucket5/tarred_audio_manifest.json
    - - /raid/local//bucket6/tarred_audio_manifest.json
    - - /raid/local//bucket7/tarred_audio_manifest.json
    - - /raid/local//bucket8/tarred_audio_manifest.json
    sample_rate: 16000
    batch_size: 1
    shuffle: true
    num_workers: 4
    pin_memory: true
    use_start_end_token: false
    trim_silence: false
    max_duration: 25
    min_duration: 0.1
    is_tarred: true
    tarred_audio_filepaths:
    - - /data2/nemo_asr/nemo_asr_set_3.0//bucket1/audi

[NeMo I 2024-12-06 15:50:46 features:305] PADDING: 0
[NeMo I 2024-12-06 15:50:46 rnnt_models:225] Using RNNT Loss : warprnnt_numba
    Loss warprnnt_numba_kwargs: {'fastemit_lambda': 0.005, 'clamp': -1.0}
[NeMo I 2024-12-06 15:50:46 rnnt_models:225] Using RNNT Loss : warprnnt_numba
    Loss warprnnt_numba_kwargs: {'fastemit_lambda': 0.005, 'clamp': -1.0}


[NeMo W 2024-12-06 15:50:46 rnnt_loop_labels_computer:270] No conditional node support for Cuda.
    Cuda graphs with while loops are disabled, decoding speed will be slower
    Reason: No `cuda-python` module. Please do `pip install cuda-python>=12.3`


[NeMo I 2024-12-06 15:50:46 rnnt_models:225] Using RNNT Loss : warprnnt_numba
    Loss warprnnt_numba_kwargs: {'fastemit_lambda': 0.005, 'clamp': -1.0}


[NeMo W 2024-12-06 15:50:46 rnnt_loop_labels_computer:270] No conditional node support for Cuda.
    Cuda graphs with while loops are disabled, decoding speed will be slower
    Reason: No `cuda-python` module. Please do `pip install cuda-python>=12.3`


[NeMo I 2024-12-06 15:50:47 save_restore_connector:275] Model EncDecHybridRNNTCTCBPEModel was successfully restored from /home/sander/.cache/torch/NeMo/NeMo_2.0.0/stt_en_fastconformer_hybrid_large_streaming_multi/a22a1091ef0b90f40b4e99859c44b15e/stt_en_fastconformer_hybrid_large_streaming_multi.nemo.
[NeMo I 2024-12-06 15:50:47 hybrid_rnnt_ctc_bpe_models:431] No `decoding_cfg` passed when changing decoding strategy, using internal config
[NeMo I 2024-12-06 15:50:47 rnnt_models:225] Using RNNT Loss : warprnnt_numba
    Loss warprnnt_numba_kwargs: {'fastemit_lambda': 0.005, 'clamp': -1.0}


[NeMo W 2024-12-06 15:50:47 rnnt_loop_labels_computer:270] No conditional node support for Cuda.
    Cuda graphs with while loops are disabled, decoding speed will be slower
    Reason: No `cuda-python` module. Please do `pip install cuda-python>=12.3`


[NeMo I 2024-12-06 15:50:47 hybrid_rnnt_ctc_bpe_models:469] Changed decoding strategy of the RNNT decoder to 
    model_type: rnnt
    strategy: greedy_batch
    compute_hypothesis_token_set: false
    preserve_alignments: null
    confidence_cfg:
      preserve_frame_confidence: false
      preserve_token_confidence: false
      preserve_word_confidence: false
      exclude_blank: true
      aggregation: min
      tdt_include_duration: false
      method_cfg:
        name: entropy
        entropy_type: tsallis
        alpha: 0.33
        entropy_norm: exp
        temperature: DEPRECATED
    fused_batch_size: null
    compute_timestamps: null
    compute_langs: false
    word_seperator: ' '
    rnnt_timestamp_type: all
    greedy:
      max_symbols_per_step: 10
      preserve_alignments: false
      preserve_frame_confidence: false
      tdt_include_duration_confidence: false
      confidence_method_cfg:
        name: entropy
        entropy_type: tsallis
        alpha: 0.33
        en

In [5]:
import pyaudio as pa
import numpy as np
import time
from asr_streaming_object import ASRStreaming

# INIT PARAMS
lookahead_size = 80
# specify encoder step length (which is 80 ms for FastConformer models)
ENCODER_STEP_LENGTH = 80 # ms
SAMPLE_RATE = 16000
chunk_duration = (lookahead_size + ENCODER_STEP_LENGTH) / 1000  # seconds
chunk_samples = int(SAMPLE_RATE * chunk_duration) - 1  # samples per chunk
# Initialize the ASRStreaming object
asr_streaming = ASRStreaming(model_name="stt_en_fastconformer_hybrid_large_streaming_multi", lookahead_size=lookahead_size, decoder_type="rnnt")

# MICROPHONE EXAMPLE
p = pa.PyAudio()
print('Available audio input devices:')
input_devices = []
for i in range(p.get_device_count()):
    dev = p.get_device_info_by_index(i)
    if dev.get('maxInputChannels'):
        input_devices.append(i)
        print(i, dev.get('name'))

if len(input_devices):
    dev_idx = -2
    while dev_idx not in input_devices:
        print('Please type input device ID:')
        dev_idx = int(input())

    def callback(in_data, frame_count, time_info, status):
        signal = np.frombuffer(in_data, dtype=np.int16)
        text = asr_streaming.transcribe_chunk(signal)
        print(text, end='\r')
        return (in_data, pa.paContinue)

    stream = p.open(format=pa.paInt16,
                    channels=1,
                    rate=SAMPLE_RATE,
                    input=True,
                    input_device_index=dev_idx,
                    stream_callback=callback,
                    frames_per_buffer=chunk_samples
                   )

    print('Listening...')

    stream.start_stream()
    
    # Interrupt kernel and then speak for a few more words to exit the pyaudio loop !
    try:
        while stream.is_active():
            time.sleep(0.1)
    finally:        
        stream.stop_stream()
        stream.close()
        p.terminate()

        print()
        print("PyAudio stopped")
    
else:
    print('ERROR: No audio input device found.')

[NeMo I 2024-12-06 15:54:57 cloud:58] Found existing object /home/sander/.cache/torch/NeMo/NeMo_2.0.0/stt_en_fastconformer_hybrid_large_streaming_multi/a22a1091ef0b90f40b4e99859c44b15e/stt_en_fastconformer_hybrid_large_streaming_multi.nemo.
[NeMo I 2024-12-06 15:54:57 cloud:64] Re-using file from: /home/sander/.cache/torch/NeMo/NeMo_2.0.0/stt_en_fastconformer_hybrid_large_streaming_multi/a22a1091ef0b90f40b4e99859c44b15e/stt_en_fastconformer_hybrid_large_streaming_multi.nemo
[NeMo I 2024-12-06 15:54:57 common:826] Instantiating model from pre-trained checkpoint
[NeMo I 2024-12-06 15:54:58 mixins:173] Tokenizer SentencePieceTokenizer initialized with 1024 tokens


[NeMo W 2024-12-06 15:54:59 modelPT:176] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath:
    - - /raid/local//bucket1/tarred_audio_manifest.json
    - - /raid/local//bucket2/tarred_audio_manifest.json
    - - /raid/local//bucket3/tarred_audio_manifest.json
    - - /raid/local//bucket4/tarred_audio_manifest.json
    - - /raid/local//bucket5/tarred_audio_manifest.json
    - - /raid/local//bucket6/tarred_audio_manifest.json
    - - /raid/local//bucket7/tarred_audio_manifest.json
    - - /raid/local//bucket8/tarred_audio_manifest.json
    sample_rate: 16000
    batch_size: 1
    shuffle: true
    num_workers: 4
    pin_memory: true
    use_start_end_token: false
    trim_silence: false
    max_duration: 25
    min_duration: 0.1
    is_tarred: true
    tarred_audio_filepaths:
    - - /data2/nemo_asr/nemo_asr_set_3.0//bucket1/audi

[NeMo I 2024-12-06 15:54:59 features:305] PADDING: 0


    


[NeMo I 2024-12-06 15:55:00 rnnt_models:225] Using RNNT Loss : warprnnt_numba
    Loss warprnnt_numba_kwargs: {'fastemit_lambda': 0.005, 'clamp': -1.0}
[NeMo I 2024-12-06 15:55:00 rnnt_models:225] Using RNNT Loss : warprnnt_numba
    Loss warprnnt_numba_kwargs: {'fastemit_lambda': 0.005, 'clamp': -1.0}


[NeMo W 2024-12-06 15:55:00 rnnt_loop_labels_computer:270] No conditional node support for Cuda.
    Cuda graphs with while loops are disabled, decoding speed will be slower
    Reason: No `cuda-python` module. Please do `pip install cuda-python>=12.3`


[NeMo I 2024-12-06 15:55:00 rnnt_models:225] Using RNNT Loss : warprnnt_numba
    Loss warprnnt_numba_kwargs: {'fastemit_lambda': 0.005, 'clamp': -1.0}


[NeMo W 2024-12-06 15:55:00 rnnt_loop_labels_computer:270] No conditional node support for Cuda.
    Cuda graphs with while loops are disabled, decoding speed will be slower
    Reason: No `cuda-python` module. Please do `pip install cuda-python>=12.3`
      return torch.load(model_weights, map_location='cpu')
    


[NeMo I 2024-12-06 15:55:00 save_restore_connector:275] Model EncDecHybridRNNTCTCBPEModel was successfully restored from /home/sander/.cache/torch/NeMo/NeMo_2.0.0/stt_en_fastconformer_hybrid_large_streaming_multi/a22a1091ef0b90f40b4e99859c44b15e/stt_en_fastconformer_hybrid_large_streaming_multi.nemo.
[NeMo I 2024-12-06 15:55:00 hybrid_rnnt_ctc_bpe_models:431] No `decoding_cfg` passed when changing decoding strategy, using internal config
[NeMo I 2024-12-06 15:55:00 rnnt_models:225] Using RNNT Loss : warprnnt_numba
    Loss warprnnt_numba_kwargs: {'fastemit_lambda': 0.005, 'clamp': -1.0}


[NeMo W 2024-12-06 15:55:00 rnnt_loop_labels_computer:270] No conditional node support for Cuda.
    Cuda graphs with while loops are disabled, decoding speed will be slower
    Reason: No `cuda-python` module. Please do `pip install cuda-python>=12.3`


[NeMo I 2024-12-06 15:55:00 hybrid_rnnt_ctc_bpe_models:469] Changed decoding strategy of the RNNT decoder to 
    model_type: rnnt
    strategy: greedy_batch
    compute_hypothesis_token_set: false
    preserve_alignments: null
    confidence_cfg:
      preserve_frame_confidence: false
      preserve_token_confidence: false
      preserve_word_confidence: false
      exclude_blank: true
      aggregation: min
      tdt_include_duration: false
      method_cfg:
        name: entropy
        entropy_type: tsallis
        alpha: 0.33
        entropy_norm: exp
        temperature: DEPRECATED
    fused_batch_size: null
    compute_timestamps: null
    compute_langs: false
    word_seperator: ' '
    rnnt_timestamp_type: all
    greedy:
      max_symbols_per_step: 10
      preserve_alignments: false
      preserve_frame_confidence: false
      tdt_include_duration_confidence: false
      confidence_method_cfg:
        name: entropy
        entropy_type: tsallis
        alpha: 0.33
        en

KeyboardInterrupt: 