
Commit

Merge pull request #14 from sled-group/speech-to-text
Speech to text
yukw777 committed Oct 19, 2023
2 parents 34a90cb + c2e80d3 commit 86def61
Showing 11 changed files with 359 additions and 216 deletions.
65 changes: 31 additions & 34 deletions README.md
@@ -105,35 +105,30 @@ You can configure `PercepSyncHoloLensCapture` via a configuration file `PercepSy

`PercepSync` uses [ZeroMQ](https://zeromq.org/) to publish data from different input devices. Streams that can be synchronized are synchronized and published to a single topic. The serialization format is [MessagePack](https://msgpack.org/).

Currently, one topic for synchronized perception data is available; it replaces the previous separate `videoFrame` and `audio` topics:

- `perception`

```python
{
    "message": {
        "frame": {
            "pixelData": bytes,  # raw pixels in RGB 24-bit for a single frame
            "width": int,
            "height": int,
            "stride": int,
        },
        "audio": {
            "buffer": bytes,  # audio buffer in 16KHz, 1 channel, 16-bit PCM
        },
        "transcribedText": {
            "text": str,
        },
    },
    "originatingTime": int,
}
```
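
For illustration, here is a minimal subscriber sketch that decodes one `perception` message. The `tcp://localhost:12345` address is an assumption (substitute your configured `PercepStreamAddress`), and the reshaping assumes `pixelData` holds `height * stride` bytes:

```python
import msgpack
import numpy as np
import zmq

context = zmq.Context()
socket = context.socket(zmq.SUB)
socket.connect("tcp://localhost:12345")  # assumed address; use your PercepStreamAddress
socket.setsockopt_string(zmq.SUBSCRIBE, "perception")

topic, payload = socket.recv_multipart()
data = msgpack.unpackb(payload)

frame = data["message"]["frame"]
h, w, stride = frame["height"], frame["width"], frame["stride"]
# Each image row occupies `stride` bytes; drop any per-row padding beyond width * 3.
rows = np.frombuffer(frame["pixelData"], dtype=np.uint8).reshape(h, stride)
rgb = rows[:, : w * 3].reshape(h, w, 3)

pcm = data["message"]["audio"]["buffer"]           # 16KHz, 1 channel, 16-bit PCM bytes
text = data["message"]["transcribedText"]["text"]  # "" when nothing new was recognized
```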

**NOTE: Synchronizing a single video frame with an audio buffer does not quite make sense conceptually, since the two operate at different frequencies. What we could do instead is pair up a list of video frames with the audio buffer from the same timeframe. Let us know if you need this, and we'll implement it.**

## Text-to-speech Data Format

`PercepSync` uses [ZeroMQ](https://zeromq.org/) to accept text-to-speech requests from different clients. It uses the [Push-Pull pattern](https://learning-0mq-with-pyzmq.readthedocs.io/en/latest/pyzmq/patterns/pushpull.html). The serialization format is [MessagePack](https://msgpack.org/). Please see the [sample script](samples/simple_tts.py) for more details. See below for the request format:
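
As a rough sketch of the Push-Pull pattern only (the authoritative request format is the one documented below and in `samples/simple_tts.py`), a client could push a MessagePack-encoded request like this; the `tcp://localhost:12346` address matches the default `TtsAddress`, and the single `"text"` field is an assumption made purely for illustration:

```python
import msgpack
import zmq

context = zmq.Context()
socket = context.socket(zmq.PUSH)
socket.connect("tcp://localhost:12346")  # default TtsAddress from Config.cs

# The exact request schema is defined by PercepSync / samples/simple_tts.py;
# a single "text" field is assumed here for illustration.
socket.send(msgpack.packb({"text": "Hello from PercepSync!"}))
```
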
@@ -162,23 +157,25 @@ $ ./PercepSync local --camera-device-id /dev/video1

### Audio

By default, `PercepSync` uses `plughw:0,0` as both input and output devices, but if you want to use another audio device, you can pass it in using the `--audio-input-device-name` and `--audio-output-device-name` options. The first number refers to the "card" number, and the second number refers to the "device" number. You can find out all the output devices with `aplay -l`, and input devices with `arecord -l`.

```bash
# For output devices.
$ aplay -l
**** List of PLAYBACK Hardware Devices ****
card 0: Device [Device], device 3: HDMI 0 [HDMI 0]
  Subdevices: 1/1
  Subdevice #0: subdevice #0
...

# For input devices.
$ arecord -l
**** List of CAPTURE Hardware Devices ****
card 1: Device [Device], device 0: USB Audio [USB Audio]
  Subdevices: 0/1
  Subdevice #0: subdevice #0
...

$ ./PercepSync local --audio-output-device-name plughw:0,3 --audio-input-device-name plughw:1,0
```

## Development
14 changes: 7 additions & 7 deletions samples/simple_subscriber.py
@@ -12,13 +12,13 @@
while True:
    topic, message = socket.recv_multipart()
    data = msgpack.unpackb(message)
    print(data['originatingTime'])
    if topic == b'perception':
        frame_data = data['message']['frame']
        frame = Image.frombytes('RGB', (frame_data['width'], frame_data['height']), frame_data['pixelData'])
        cv2.imshow("Webcam", np.array(frame))
        sa.play_buffer(data['message']['audio']['buffer'], 1, 2, 16000)
        if data["message"]["transcribedText"]["text"] != '':
            print(f'Transcribed Text: {data["message"]["transcribedText"]["text"]}')
    else:
        raise Exception(f'Unknown topic: {topic}')

5 changes: 4 additions & 1 deletion src/HoloLensCaptureInterop/Serializers.cs
@@ -28,7 +28,10 @@ namespace HoloLensCaptureInterop
    /// </summary>
    public static class Serializers
    {
        /// <summary>
        /// The wave format assumed for all serialized audio buffers (16 kHz, 1 channel, 16-bit PCM).
        /// </summary>
        public static readonly WaveFormat AssumedWaveFormat =
            WaveFormat.Create16kHz1Channel16BitPcm();

        /// <summary>
2 changes: 2 additions & 0 deletions src/PercepSync/Config.cs
@@ -7,12 +7,14 @@ internal class Config
        public static int DefaultRdzvServerPort = 13331;
        public static bool DefaultEnableTts = false;
        public static string DefaultTtsAddress = "tcp://*:12346";
        public static double DefaultFps = 5;

        public string PercepStreamAddress { get; set; } = DefaultPercepStreamAddress;
        public bool EnablePreview { get; set; } = DefaultEnablePreview;
        public int RdzvServerPort { get; set; } = DefaultRdzvServerPort;
        public bool EnableTts { get; set; } = DefaultEnableTts;
        public string TtsAddress { get; set; } = DefaultTtsAddress;
        public double Fps { get; set; } = DefaultFps;
        public AzureSpeechConfig AzureSpeechConfig { get; set; } = new();
        public LocalConfig? LocalConfig { get; set; } = null;
        public HoloLensConfig? HoloLensConfig { get; set; } = null;
89 changes: 89 additions & 0 deletions src/PercepSync/ContinuousAzureSpeechRecognizer.cs
@@ -0,0 +1,89 @@
// Copied from https://github.com/microsoft/psi-samples/blob/760862bdc435288eca7dbd892979353e3d5318a7/Samples/LinuxSpeechSample/ContinuousSpeechRecognizer.cs
// with some modifications.
namespace Sled.PercepSync
{
    using System;
    using Microsoft.CognitiveServices.Speech;
    using Microsoft.CognitiveServices.Speech.Audio;
    using Microsoft.Psi;
    using Microsoft.Psi.Audio;
    using Microsoft.Psi.Components;

    /// <summary>
    /// Component that wraps the Azure Cognitive Services speech recognizer.
    /// </summary>
    public class ContinuousAzureSpeechRecognizer
        : ConsumerProducer<AudioBuffer, string>,
            ISourceComponent,
            IDisposable
    {
        private readonly PushAudioInputStream pushStream;
        private readonly AudioConfig audioInput;
        private readonly SpeechRecognizer recognizer;

        private string recognizedText = "";

        /// <summary>
        /// Initializes a new instance of the <see cref="ContinuousAzureSpeechRecognizer"/> class.
        /// </summary>
        /// <param name="pipeline">The pipeline in which to create the component.</param>
        /// <param name="subscriptionKey">The subscription key for the Azure speech resource.</param>
        /// <param name="region">The service region of the Azure speech resource.</param>
        public ContinuousAzureSpeechRecognizer(
            Pipeline pipeline,
            string subscriptionKey,
            string region
        )
            : base(pipeline)
        {
            var config = SpeechConfig.FromSubscription(subscriptionKey, region);
            pushStream = AudioInputStream.CreatePushStream();
            audioInput = AudioConfig.FromStreamInput(pushStream);
            recognizer = new SpeechRecognizer(config, audioInput);
        }

        /// <inheritdoc/>
        public void Start(Action<DateTime> notifyCompletionTime)
        {
            recognizer.Recognized += Recognizer_Recognized;
            recognizer.StartContinuousRecognitionAsync().Wait();
            notifyCompletionTime(DateTime.MaxValue);
        }

        /// <inheritdoc/>
        public void Stop(DateTime finalOriginatingTime, Action notifyCompleted)
        {
            recognizer.Recognized -= Recognizer_Recognized;
            pushStream.Close();
            recognizer.StopContinuousRecognitionAsync().Wait();
            notifyCompleted();
        }

        /// <inheritdoc/>
        public void Dispose()
        {
            recognizer.Dispose();
            audioInput.Dispose();
            pushStream.Dispose();
        }

        /// <inheritdoc/>
        protected override void Receive(AudioBuffer data, Envelope envelope)
        {
            // Feed the incoming audio to the recognizer and post the most recently
            // recognized text (an empty string when nothing new has been recognized),
            // then clear it so it is only posted once.
            pushStream.Write(data.Data);
            Out.Post(recognizedText, envelope.OriginatingTime + data.Duration);
            if (recognizedText != "")
            {
                recognizedText = "";
            }
        }

        /// <summary>
        /// Handler for the speech recognized event from the recognizer. Sets the recognized text to be posted.
        /// </summary>
        private void Recognizer_Recognized(object? sender, SpeechRecognitionEventArgs e)
        {
            recognizedText = e.Result.Text;
        }
    }
}
80 changes: 35 additions & 45 deletions src/PercepSync/LocalDevices.cs
@@ -6,59 +6,40 @@ namespace Sled.PercepSync
    using Microsoft.Psi.Imaging;
    using Microsoft.Psi.Audio;
    using Microsoft.Psi.Interop.Transport;
    using HoloLensCaptureInterop;

    public class LocalDevicesCapture : IDisposable
    {
        public static readonly string WebcamAddress = "inproc://local-devices-webcam";
        public static readonly string WebcamTopic = "webcam";
        public static readonly string AudioAddress = "inproc://local-devices-audio";
        public static readonly string AudioTopic = "audio";
        private readonly RendezvousClient rdzvClient;
        private readonly Pipeline pipeline;
        private readonly NetMQWriter<Shared<Image>> mqWebcamWriter;
        private readonly NetMQWriter<AudioBuffer> mqAudioWriter;

        public LocalDevicesCapture(
            string serverAddress,
            int serverPort,
            string cameraDeviceID,
            string audioDeviceName
        )
        {
            pipeline = Pipeline.Create();
            rdzvClient = new RendezvousClient(serverAddress, port: serverPort);

            // Create the webcam component
#if NET7_0
            var webcam = new MediaCapture(
                pipeline,
                640,
                480,
                cameraDeviceID,
                PixelFormatId.YUYV
            ).Select((image) => Shared.Create(image.Resource.Convert(PixelFormat.RGB_24bpp)));
#else
            var webcam = new MediaCapture(pipeline, 640, 480).Out;
#endif

            // Create the audio capture component
@@ -70,25 +51,30 @@ public LocalDevicesCapture(
                    DeviceName = audioDeviceName,
                    Format = WaveFormat.Create16kHz1Channel16BitPcm()
                }
            ).Out;
#else
            var audio = new AudioCapture(pipeline, WaveFormat.Create16kHz1Channel16BitPcm()).Out;
#endif

            // NOTE: We can't use RemoteExporter here b/c \psi uses named memory mapped files
            // to serialize complex types, e.g., RawPixelImage, but named memory mapped files
            // are not supported on *nix systems.
            // https://github.com/dotnet/runtime/issues/21863
            mqWebcamWriter = new NetMQWriter<Shared<Image>>(
                pipeline,
                WebcamTopic,
                WebcamAddress,
                Serializers.SharedImageFormat(),
                name: nameof(mqWebcamWriter)
            );
            webcam.PipeTo(mqWebcamWriter, deliveryPolicy: DeliveryPolicy.LatestMessage);
            mqAudioWriter = new NetMQWriter<AudioBuffer>(
                pipeline,
                AudioTopic,
                AudioAddress,
                Serializers.AudioBufferFormat(),
                name: nameof(mqAudioWriter)
            );
            audio.PipeTo(mqAudioWriter, deliveryPolicy: DeliveryPolicy.LatestMessage);
        }

        public void Start()
Expand All @@ -101,7 +87,11 @@ public void Start()
            rdzvClient.Rendezvous.TryAddProcess(
                new Rendezvous.Process(
                    nameof(LocalDevicesCapture),
                    new[]
                    {
                        mqWebcamWriter.ToRendezvousEndpoint(),
                        mqAudioWriter.ToRendezvousEndpoint()
                    }
                )
            );
            pipeline.RunAsync();
