
Commit

Merge pull request #14 from sled-group/speech-to-text
Speech to text
yukw777 committed Oct 19, 2023
2 parents 34a90cb + c2e80d3 commit 86def61
Showing 11 changed files with 359 additions and 216 deletions.
65 changes: 31 additions & 34 deletions README.md
@@ -105,35 +105,30 @@ You can configure `PercepSyncHoloLensCapture` via a configuration file `PercepSy

`PercepSync` uses [ZeroMQ](https://zeromq.org/) to publish data from different input devices. Streams that can be synchronized are synchronized and published to a single topic. The serialization format is [MessagePack](https://msgpack.org/).

Currently, one topic for synchronized perception data is available; it replaces the previous separate `videoFrame` and `audio` topics:

- `perception`

```python
{
    "message": {
        "frame": {
            "pixelData": bytes,  # raw pixels in RGB 24-bit for a single frame
            "width": int,
            "height": int,
            "stride": int,
        },
        "audio": {
            "buffer": bytes,  # audio buffer in 16KHz, 1 channel, 16-bit PCM
        },
        "transcribedText": {
            "text": str,
        },
    },
    "originatingTime": int,
}
```
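
For illustration, here is a minimal subscriber sketch that decodes one `perception` message. The `tcp://localhost:12345` address is an assumption (substitute your configured `PercepStreamAddress`), and the reshaping assumes `pixelData` holds `height * stride` bytes:

```python
import msgpack
import numpy as np
import zmq

context = zmq.Context()
socket = context.socket(zmq.SUB)
socket.connect("tcp://localhost:12345")  # assumed address; use your PercepStreamAddress
socket.setsockopt_string(zmq.SUBSCRIBE, "perception")

topic, payload = socket.recv_multipart()
data = msgpack.unpackb(payload)

frame = data["message"]["frame"]
h, w, stride = frame["height"], frame["width"], frame["stride"]
# Each image row occupies `stride` bytes; drop any per-row padding beyond width * 3.
rows = np.frombuffer(frame["pixelData"], dtype=np.uint8).reshape(h, stride)
rgb = rows[:, : w * 3].reshape(h, w, 3)

pcm = data["message"]["audio"]["buffer"]           # 16KHz, 1 channel, 16-bit PCM bytes
text = data["message"]["transcribedText"]["text"]  # "" when nothing new was recognized
```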

**NOTE: Synchronizing a single video frame with an audio buffer does not quite make sense conceptually, since the two operate at different frequencies. What we could do instead is pair up a list of video frames with the audio buffer from the same timeframe. Let us know if you need this, and we'll implement it.**

## Text-to-speech Data Format

`PercepSync` uses [ZeroMQ](https://zeromq.org/) to accept text-to-speech requests from different clients. It uses the [Push-Pull pattern](https://learning-0mq-with-pyzmq.readthedocs.io/en/latest/pyzmq/patterns/pushpull.html). The serialization format is [MessagePack](https://msgpack.org/). Please see the [sample script](samples/simple_tts.py) for more details. See below for the request format:
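
As a rough sketch of the Push-Pull pattern only (the authoritative request format is the one documented below and in `samples/simple_tts.py`), a client could push a MessagePack-encoded request like this; the `tcp://localhost:12346` address matches the default `TtsAddress`, and the single `"text"` field is an assumption made purely for illustration:

```python
import msgpack
import zmq

context = zmq.Context()
socket = context.socket(zmq.PUSH)
socket.connect("tcp://localhost:12346")  # default TtsAddress from Config.cs

# The exact request schema is defined by PercepSync / samples/simple_tts.py;
# a single "text" field is assumed here for illustration.
socket.send(msgpack.packb({"text": "Hello from PercepSync!"}))
```
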
@@ -162,23 +157,25 @@ $ ./PercepSync local --camera-device-id /dev/video1

### Audio

By default, `PercepSync` uses `plughw:0,0` as both input and output devices, but if you want to use another audio device, you can pass it in using the `--audio-input-device-name` and `--audio-output-device-name` options. The first number refers to the "card" number, and the second number refers to the "device" number. You can find out all the output devices with `aplay -l`, and input devices with `arecord -l`.

```bash
# For output devices.
$ aplay -l
**** List of PLAYBACK Hardware Devices ****
card 0: Device [Device], device 3: HDMI 0 [HDMI 0]
  Subdevices: 1/1
  Subdevice #0: subdevice #0
...

# For input devices.
$ arecord -l
**** List of CAPTURE Hardware Devices ****
card 1: Device [Device], device 0: USB Audio [USB Audio]
  Subdevices: 0/1
  Subdevice #0: subdevice #0
...

$ ./PercepSync local --audio-output-device-name plughw:0,3 --audio-input-device-name plughw:1,0
```

## Development
14 changes: 7 additions & 7 deletions samples/simple_subscriber.py
@@ -12,13 +12,13 @@
while True:
    topic, message = socket.recv_multipart()
    data = msgpack.unpackb(message)
    print(data['originatingTime'])
    if topic == b'perception':
        frame_data = data['message']['frame']
        frame = Image.frombytes('RGB', (frame_data['width'], frame_data['height']), frame_data['pixelData'])
        cv2.imshow("Webcam", np.array(frame))
        sa.play_buffer(data['message']['audio']['buffer'], 1, 2, 16000)
        if data["message"]["transcribedText"]["text"] != '':
            print(f'Transcribed Text: {data["message"]["transcribedText"]["text"]}')
    else:
        raise Exception(f'Unknown topic: {topic}')

5 changes: 4 additions & 1 deletion src/HoloLensCaptureInterop/Serializers.cs
@@ -28,7 +28,10 @@ namespace HoloLensCaptureInterop
    /// </summary>
    public static class Serializers
    {
        /// <summary>
        /// The wave format assumed for all serialized audio buffers (16 kHz, 1 channel, 16-bit PCM).
        /// </summary>
        public static readonly WaveFormat AssumedWaveFormat =
            WaveFormat.Create16kHz1Channel16BitPcm();

        /// <summary>
2 changes: 2 additions & 0 deletions src/PercepSync/Config.cs
@@ -7,12 +7,14 @@ internal class Config
        public static int DefaultRdzvServerPort = 13331;
        public static bool DefaultEnableTts = false;
        public static string DefaultTtsAddress = "tcp://*:12346";
        public static double DefaultFps = 5;

        public string PercepStreamAddress { get; set; } = DefaultPercepStreamAddress;
        public bool EnablePreview { get; set; } = DefaultEnablePreview;
        public int RdzvServerPort { get; set; } = DefaultRdzvServerPort;
        public bool EnableTts { get; set; } = DefaultEnableTts;
        public string TtsAddress { get; set; } = DefaultTtsAddress;
        public double Fps { get; set; } = DefaultFps;
        public AzureSpeechConfig AzureSpeechConfig { get; set; } = new();
        public LocalConfig? LocalConfig { get; set; } = null;
        public HoloLensConfig? HoloLensConfig { get; set; } = null;
89 changes: 89 additions & 0 deletions src/PercepSync/ContinuousAzureSpeechRecognizer.cs
@@ -0,0 +1,89 @@
// Copied from https://github.com/microsoft/psi-samples/blob/760862bdc435288eca7dbd892979353e3d5318a7/Samples/LinuxSpeechSample/ContinuousSpeechRecognizer.cs
// with some modifications.
namespace Sled.PercepSync
{
    using System;
    using Microsoft.CognitiveServices.Speech;
    using Microsoft.CognitiveServices.Speech.Audio;
    using Microsoft.Psi;
    using Microsoft.Psi.Audio;
    using Microsoft.Psi.Components;

    /// <summary>
    /// Component that wraps the Azure Cognitive Services speech recognizer.
    /// </summary>
    public class ContinuousAzureSpeechRecognizer
        : ConsumerProducer<AudioBuffer, string>,
            ISourceComponent,
            IDisposable
    {
        private readonly PushAudioInputStream pushStream;
        private readonly AudioConfig audioInput;
        private readonly SpeechRecognizer recognizer;

        private string recognizedText = "";

        /// <summary>
        /// Initializes a new instance of the <see cref="ContinuousAzureSpeechRecognizer"/> class.
        /// </summary>
        /// <param name="pipeline">The pipeline in which to create the component.</param>
        /// <param name="subscriptionKey">The subscription key for the Azure speech resource.</param>
        /// <param name="region">The service region of the Azure speech resource.</param>
        public ContinuousAzureSpeechRecognizer(
            Pipeline pipeline,
            string subscriptionKey,
            string region
        )
            : base(pipeline)
        {
            var config = SpeechConfig.FromSubscription(subscriptionKey, region);
            pushStream = AudioInputStream.CreatePushStream();
            audioInput = AudioConfig.FromStreamInput(pushStream);
            recognizer = new SpeechRecognizer(config, audioInput);
        }

        /// <inheritdoc/>
        public void Start(Action<DateTime> notifyCompletionTime)
        {
            recognizer.Recognized += Recognizer_Recognized;
            recognizer.StartContinuousRecognitionAsync().Wait();
            notifyCompletionTime(DateTime.MaxValue);
        }

        /// <inheritdoc/>
        public void Stop(DateTime finalOriginatingTime, Action notifyCompleted)
        {
            recognizer.Recognized -= Recognizer_Recognized;
            pushStream.Close();
            recognizer.StopContinuousRecognitionAsync().Wait();
            notifyCompleted();
        }

        /// <inheritdoc/>
        public void Dispose()
        {
            recognizer.Dispose();
            audioInput.Dispose();
            pushStream.Dispose();
        }

        /// <inheritdoc/>
        protected override void Receive(AudioBuffer data, Envelope envelope)
        {
            // Feed the incoming audio to the recognizer and post the most recently
            // recognized text (an empty string when nothing new has been recognized),
            // then clear it so it is only posted once.
            pushStream.Write(data.Data);
            Out.Post(recognizedText, envelope.OriginatingTime + data.Duration);
            if (recognizedText != "")
            {
                recognizedText = "";
            }
        }

        /// <summary>
        /// Handler for the speech recognized event from the recognizer. Sets the recognized text to be posted.
        /// </summary>
        private void Recognizer_Recognized(object? sender, SpeechRecognitionEventArgs e)
        {
            recognizedText = e.Result.Text;
        }
    }
}
80 changes: 35 additions & 45 deletions src/PercepSync/LocalDevices.cs
@@ -6,59 +6,40 @@ namespace Sled.PercepSync
    using Microsoft.Psi.Imaging;
    using Microsoft.Psi.Audio;
    using Microsoft.Psi.Interop.Transport;
    using HoloLensCaptureInterop;

    public class LocalDevicesCapture : IDisposable
    {
        public static readonly string WebcamAddress = "inproc://local-devices-webcam";
        public static readonly string WebcamTopic = "webcam";
        public static readonly string AudioAddress = "inproc://local-devices-audio";
        public static readonly string AudioTopic = "audio";
        private readonly RendezvousClient rdzvClient;
        private readonly Pipeline pipeline;
        private readonly NetMQWriter<Shared<Image>> mqWebcamWriter;
        private readonly NetMQWriter<AudioBuffer> mqAudioWriter;

        public LocalDevicesCapture(
            string serverAddress,
            int serverPort,
            string cameraDeviceID,
            string audioDeviceName
        )
        {
            pipeline = Pipeline.Create();
            rdzvClient = new RendezvousClient(serverAddress, port: serverPort);

            // Create the webcam component
#if NET7_0
            var webcam = new MediaCapture(
                pipeline,
                640,
                480,
                cameraDeviceID,
                PixelFormatId.YUYV
            ).Select((image) => Shared.Create(image.Resource.Convert(PixelFormat.RGB_24bpp)));
#else
            var webcam = new MediaCapture(pipeline, 640, 480).Out;
#endif

            // Create the audio capture component
@@ -70,25 +51,30 @@ public LocalDevicesCapture(
                    DeviceName = audioDeviceName,
                    Format = WaveFormat.Create16kHz1Channel16BitPcm()
                }
            ).Out;
#else
            var audio = new AudioCapture(pipeline, WaveFormat.Create16kHz1Channel16BitPcm()).Out;
#endif

            // NOTE: We can't use RemoteExporter here b/c \psi uses named memory mapped files
            // to serialize complex types, e.g., RawPixelImage, but named memory mapped files
            // are not supported on *nix systems.
            // https://github.com/dotnet/runtime/issues/21863
            mqWebcamWriter = new NetMQWriter<Shared<Image>>(
                pipeline,
                WebcamTopic,
                WebcamAddress,
                Serializers.SharedImageFormat(),
                name: nameof(mqWebcamWriter)
            );
            webcam.PipeTo(mqWebcamWriter, deliveryPolicy: DeliveryPolicy.LatestMessage);
            mqAudioWriter = new NetMQWriter<AudioBuffer>(
                pipeline,
                AudioTopic,
                AudioAddress,
                Serializers.AudioBufferFormat(),
                name: nameof(mqAudioWriter)
            );
            audio.PipeTo(mqAudioWriter, deliveryPolicy: DeliveryPolicy.LatestMessage);
        }

        public void Start()
Expand All @@ -101,7 +87,11 @@ public void Start()
            rdzvClient.Rendezvous.TryAddProcess(
                new Rendezvous.Process(
                    nameof(LocalDevicesCapture),
                    new[]
                    {
                        mqWebcamWriter.ToRendezvousEndpoint(),
                        mqAudioWriter.ToRendezvousEndpoint()
                    }
                )
            );
            pipeline.RunAsync();
