In [59]:
# python3 -m pip install torch==2.1.0+cu121 torchvision==0.16.0+cu121 torchaudio==2.1.0 torchtext==0.16.0+cpu torchdata==0.7.0 --index-url https://download.pytorch.org/whl/cu121

# pacmd list sources
# name: <alsa_input.usb-SEEED_ReSpeaker_4_Mic_Array__UAC1.0_-00.multichannel-input>
#         driver: <module-alsa-card.c>
#         flags: HARDWARE DECIBEL_VOLUME LATENCY DYNAMIC_LATENCY
#         state: SUSPENDED
#         suspend cause: IDLE
#         priority: 9540
#         volume: front-left: 65536 / 100% / 0.00 dB,   front-right: 65536 / 100% / 0.00 dB,   rear-left: 65536 / 100% / 0.00 dB,   rear-right: 65536 / 100% / 0.00 dB,   front-center: 65536 / 100% / 0.00 dB,   lfe: 65536 / 100% / 0.00 dB
#                 balance 0.00
#         base volume: 65536 / 100% / 0.00 dB
#         volume steps: 65537
#         muted: no
#         current latency: 0.00 ms
#         max rewind: 0 KiB
#         sample spec: s16le 6ch 16000Hz
#         channel map: front-left,front-right,rear-left,rear-right,front-center,lfe
#                      Surround 5.1
#         used by: 0
#         linked by: 0
#         configured latency: 0.00 ms; range is 0.50 .. 2000.00 ms
#         card: 4 <alsa_card.usb-SEEED_ReSpeaker_4_Mic_Array__UAC1.0_-00>
#         module: 27
#         properties:
#                 alsa.resolution_bits = "16"
#                 device.api = "alsa"
#                 device.class = "sound"
#                 alsa.class = "generic"
#                 alsa.subclass = "generic-mix"
#                 alsa.name = "USB Audio"
#                 alsa.id = "USB Audio"
#                 alsa.subdevice = "0"
#                 alsa.subdevice_name = "subdevice #0"
#                 alsa.device = "0"
#                 alsa.card = "4"
#                 alsa.card_name = "ReSpeaker 4 Mic Array (UAC1.0)"
#                 alsa.long_card_name = "SEEED ReSpeaker 4 Mic Array (UAC1.0) at usb-0000:02:00.0-3, full speed"
#                 alsa.driver_name = "snd_usb_audio"
#                 device.bus_path = "pci-0000:02:00.0-usb-0:3:1.0"
#                 sysfs.path = "/devices/pci0000:00/0000:00:01.3/0000:02:00.0/usb1/1-3/1-3:1.0/sound/card4"
#                 udev.id = "usb-SEEED_ReSpeaker_4_Mic_Array__UAC1.0_-00"
#                 device.bus = "usb"
#                 device.vendor.id = "2886"
#                 device.vendor.name = "Seeed Technology Co., Ltd."
#                 device.product.id = "0018"
#                 device.product.name = "ReSpeaker 4 Mic Array (UAC1.0)"
#                 device.serial = "SEEED_ReSpeaker_4_Mic_Array__UAC1.0_"
#                 device.form_factor = "speaker"
#                 device.string = "hw:4"
#                 device.buffering.buffer_size = "384000"
#                 device.buffering.fragment_size = "192000"
#                 device.access_mode = "mmap+timer"
#                 device.profile.name = "multichannel-input"
#                 device.profile.description = "Multichannel"
#                 device.description = "ReSpeaker 4 Mic Array (UAC1.0) Multichannel"
#                 module-udev-detect.discovered = "1"
#                 device.icon_name = "audio-speakers-usb"



In [19]:
import numpy as np
import sentencepiece as spm
import torch
import torchaudio
import torchvision
from torchaudio.io import StreamReader
import torch.multiprocessing as mp

In [73]:
def stream(q, format, src, segment_length, sample_rate):
    
    print("Building StreamReader...")
    streamer = torchaudio.io.StreamReader(src=src, format=format, option=option)
    streamer.add_basic_audio_stream(frames_per_chunk=segment_length, sample_rate=sample_rate, format="s16p", num_channels=6)

    print(streamer.get_src_stream_info(0))
    print("Streaming...")
    print()
    for (chunk_a) in streamer.stream(timeout=-1, backoff=1.0):
        q.put([chunk_a])


class ContextCacher:
    """Cache the end of input data and prepend the next input data with it.

    Args:
        segment_length (int): The size of main segment.
            If the incoming segment is shorter, then the segment is padded.
        context_length (int): The size of the context, cached and appended.
    """

    def __init__(self, segment_length: int, context_length: int):
        self.segment_length = segment_length
        self.context_length = context_length
        self.context = torch.zeros([context_length])

    def __call__(self, chunk: torch.Tensor):
        if chunk.size(0) < self.segment_length:
            chunk = torch.nn.functional.pad(chunk, (0, self.segment_length - chunk.size(0)))
        chunk_with_context = torch.cat((self.context, chunk))
        self.context = chunk[-self.context_length :]
        return chunk_with_context

class Pipeline:
    """Build inference pipeline from RNNTBundle.

    Args:
        bundle (torchaudio.pipelines.RNNTBundle): Bundle object
        beam_width (int): Beam size of beam search decoder.
    """

    def __init__(self, bundle: torchaudio.pipelines.RNNTBundle, beam_width: int = 10):
        self.bundle = bundle
        self.feature_extractor = bundle.get_streaming_feature_extractor()
        self.decoder = bundle.get_decoder()
        self.token_processor = bundle.get_token_processor()

        self.beam_width = beam_width

        self.state = None
        self.hypotheses = None

    def infer(self, segment: torch.Tensor) -> str:
        """Perform streaming inference"""
        features, length = self.feature_extractor(segment)
        self.hypotheses, self.state = self.decoder.infer(
            features, length, self.beam_width, state=self.state, hypothesis=self.hypotheses
        )
        transcript = self.token_processor(self.hypotheses[0][0], lstrip=False)
        return transcript

In [75]:
def main():
    # Parameters
    device = "alsa"
    src = "hw:4"
    n_channels = 6
    
    # Model info
    bundle=torchaudio.pipelines.EMFORMER_RNNT_BASE_LIBRISPEECH
    sample_rate = bundle.sample_rate
    segment_length = bundle.segment_length * bundle.hop_length
    context_length = bundle.right_context_length * bundle.hop_length
    pipeline = Pipeline(bundle)
    
    
    # Cache stream
    cacher = ContextCacher(segment_length, context_length)
    
    
    # Inference
    
    ctx = mp.get_context("spawn")
    
    @torch.inference_mode()
    def infer():
        while True:
            chunk = q.get()      
            segment = cacher(chunk[:, 0])
            transcript = pipeline.infer(segment)
            print(transcript, end="\r", flush=True)
    
    q = ctx.Queue()
    p = ctx.Process(target=stream, args=(q, device, src, segment_length, sample_rate))
    p.start()
    infer()
    p.join()

In [None]:
main()

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/home/jd/anaconda3/envs/marmot/lib/python3.10/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/home/jd/anaconda3/envs/marmot/lib/python3.10/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'stream' on <module '__main__' (built-in)>


In [66]:
print(sample_rate)
print("hop length: %s " % bundle.hop_length)
print("segment length: %s " % bundle.segment_length)
print("context length: %s " %bundle.right_context_length)
print(segment_length)
print(context_length)

print(streamer)

16000
hop length: 160 
segment length: 16 
context length: 4 
2560
640
<torchaudio.io._stream_reader.StreamReader object at 0x77951405ba60>


In [None]:
# Get stream


# Visualize stream?


# form beam at location (az/el/dist)


# visualize separated beam


# extract noise/common spectrum from all channels