Merge pull request #16 from sled-group/tts-fix

Various TTS Fixes
sled-group · Oct 19, 2023 · dcd4f64 · dcd4f64
2 parents bdf5420 + a4378d2
commit dcd4f64
Show file tree

Hide file tree

Showing 6 changed files with 206 additions and 14 deletions.
diff --git a/src/PercepSync/PercepSync.cs b/src/PercepSync/PercepSync.cs
@@ -288,6 +288,11 @@ public static void Main(string[] args)
                     };
 
                     // Set up a rendezvous endpoint for text-to-speech
+                    var percepDurationInSeconds = 1 / config.Fps;
+                    var audioBufferFrameSizeInBytes = (int)
+                        Math.Ceiling(
+                            Serializers.AssumedWaveFormat.AvgBytesPerSec * percepDurationInSeconds
+                        );
                     AzureSpeechSynthesizer? speechSynthesizer = null;
                     if (config.EnableTts)
                     {
@@ -299,7 +304,8 @@ public static void Main(string[] args)
                             percepSyncPipeline,
                             config.AzureSpeechConfig.SubscriptionKey,
                             config.AzureSpeechConfig.Region,
-                            config.AzureSpeechConfig.SpeechSynthesisVoiceName
+                            config.AzureSpeechConfig.SpeechSynthesisVoiceName,
+                            audioBufferFrameSizeInBytes
                         );
                         ttsReceiver.PipeTo(speechSynthesizer);
                         if (config.LocalConfig is not null)
@@ -321,17 +327,12 @@ public static void Main(string[] args)
 
                     // Construct sensor streams
                     var sensorStreams = ConstructSensorStreams(process, speechSynthesizer);
-                    var frameDurationInSeconds = 1 / config.Fps;
                     var videoFrameStream = sensorStreams.VideoFrameStream.Sample(
-                        TimeSpan.FromSeconds(frameDurationInSeconds)
+                        TimeSpan.FromSeconds(percepDurationInSeconds)
                     );
 
                     var audioBufferStream = sensorStreams.AudioBufferStream.Reframe(
-                        (int)
-                            Math.Ceiling(
-                                Serializers.AssumedWaveFormat.AvgBytesPerSec
-                                    * frameDurationInSeconds
-                            )
+                        audioBufferFrameSizeInBytes
                     );
                     var speechRecognizer = new ContinuousAzureSpeechRecognizer(
                         percepSyncPipeline,
@@ -343,13 +344,13 @@ public static void Main(string[] args)
                         .Join(
                             audioBufferStream,
                             Reproducible.Nearest<AudioBuffer>(
-                                TimeSpan.FromSeconds(frameDurationInSeconds)
+                                TimeSpan.FromSeconds(percepDurationInSeconds)
                             )
                         )
                         .Join(
                             speechRecognizer,
                             Reproducible.Nearest<string>(
-                                TimeSpan.FromSeconds(frameDurationInSeconds / 2)
+                                TimeSpan.FromSeconds(percepDurationInSeconds / 2)
                             )
                         )
                         .Select(

diff --git a/src/PercepSync/Tts.cs b/src/PercepSync/Tts.cs
@@ -80,12 +80,14 @@ public class AzureSpeechSynthesizer : IConsumerProducer<TtsRequest, AudioBuffer>
         private readonly string region;
         private readonly string voiceName;
         private SpeechSynthesizer speechSynthesizer;
+        private Reframe reframer;
 
         public AzureSpeechSynthesizer(
             Pipeline pipeline,
             string subscriptionKey,
             string region,
-            string voiceName
+            string voiceName,
+            int audioBufferFrameSizeInBytes
         )
         {
             this.subscriptionKey = subscriptionKey;
@@ -106,7 +108,10 @@ string voiceName
                 throw new Exception($"Error while initializing SpeechSynthesizer: {e.Message}");
             }
             In = pipeline.CreateReceiver<TtsRequest>(this, Receive, nameof(In));
-            Out = pipeline.CreateEmitter<AudioBuffer>(this, nameof(Out));
+            audioOut = pipeline.CreateEmitter<AudioBuffer>(this, nameof(audioOut));
+            reframer = new Reframe(pipeline, audioBufferFrameSizeInBytes);
+            audioOut.PipeTo(reframer);
+            Out = reframer.Out;
         }
 
         private async void Receive(TtsRequest req, Envelope envelope)
@@ -141,11 +146,12 @@ private async void Receive(TtsRequest req, Envelope envelope)
                 memoryStream.ToArray(),
                 WaveFormat.Create16kHz1Channel16BitPcm()
             );
-            Out.Post(audioBuffer, envelope.OriginatingTime);
+            audioOut.Post(audioBuffer, envelope.OriginatingTime);
         }
 
         public Receiver<TtsRequest> In { get; }
         public Emitter<AudioBuffer> Out { get; private set; }
+        private Emitter<AudioBuffer> audioOut { get; set; }
 
         public void Dispose()
         {

diff --git a/src/PercepSyncHoloLensCapture/Config.cs b/src/PercepSyncHoloLensCapture/Config.cs
@@ -29,6 +29,11 @@ public class Config
         /// Gray-scale camera config
         /// </summary>
         public GrayConfig Gray { get; set; } = new GrayConfig();
+
+        /// <summary>
+        /// Tts config
+        /// </summary>
+        public TtsConfig Tts { get; set; } = new TtsConfig();
     }
 
     /// <summary>
@@ -187,4 +192,16 @@ public class GrayConfig
         /// </summary>
         public string EncodeMethod { get; set; } = "jpeg";
     }
+
+    /// <summary>
+    /// Configuration for text-to-speech playback.
+    /// </summary>
+    public class TtsConfig
+    {
+        /// <summary>
+        /// Maximum duration of buffered text-to-speech audio, in seconds (default 10). The longer it is, the more memory it takes.
+        /// The buffer is circular, so if text-to-speech audio goes over this limit, the earlier speech will be overwritten.
+        /// </summary>
+        public double MaxTtsDurationInSeconds { get; set; } = 10;
+    }
 }
diff --git a/src/PercepSyncHoloLensCapture/PercepSyncHoloLensCapture.cs b/src/PercepSyncHoloLensCapture/PercepSyncHoloLensCapture.cs
@@ -1158,7 +1158,8 @@ DeliveryPolicy deliveryPolicy
                                                         );
                                                         var spatialSound = new SpatialSound(
                                                             pipeline,
-                                                            default
+                                                            default,
+                                                            Config.Tts.MaxTtsDurationInSeconds
                                                         );
                                                         ttsAudio
                                                             .PipeTo(audioResampler)

diff --git a/src/PercepSyncHoloLensCapture/PercepSyncHoloLensCapture.csproj b/src/PercepSyncHoloLensCapture/PercepSyncHoloLensCapture.csproj
@@ -58,6 +58,7 @@
     <RestoreProjectStyle>PackageReference</RestoreProjectStyle>
   </PropertyGroup>
   <ItemGroup>
+    <Compile Include="SpatialSound.cs" />
     <Compile Include="Config.cs" />
     <Compile Include="PercepSyncHoloLensCapture.cs" />
     <Compile Include="Properties\AssemblyInfo.cs" />

diff --git a/src/PercepSyncHoloLensCapture/SpatialSound.cs b/src/PercepSyncHoloLensCapture/SpatialSound.cs
@@ -0,0 +1,166 @@
+// Copied with modifications from: https://github.com/microsoft/psi/blob/ef4b2a627ffefc5415b826930a883b309c0a37cd/Sources/MixedReality/Microsoft.Psi.MixedReality/StereoKit/SpatialSound.cs
+// and https://github.com/microsoft/psi/blob/ef4b2a627ffefc5415b826930a883b309c0a37cd/Sources/MixedReality/Microsoft.Psi.MixedReality/StereoKit/StereoKitTransforms.cs
+// We need to override the private property "sound", so the only way around it is to copy like this.
+namespace Sled.PercepSyncHoloLensCapture
+{
+    using System;
+    using System.IO;
+    using MathNet.Spatial.Euclidean;
+    using StereoKit;
+    using Microsoft.Psi.Audio;
+    using Microsoft.Psi;
+    using Microsoft.Psi.MixedReality.StereoKit;
+
+    /// <summary>
+    /// Static transforms for converting between StereoKit and world (\psi) coordinates.
+    /// </summary>
+    public static class StereoKitTransforms
+    {
+        /// <summary>
+        /// Gets the "world hierarchy" for rendering.
+        /// Push this matrix onto StereoKit's <see cref="Hierarchy"/> stack to render content coherently in the world.
+        /// </summary>
+        /// <remarks>
+        /// This matrix is pushed automatically by the <see cref="StereoKitRenderer"/> base class for new rendering components.
+        /// The value is null when the HoloLens loses localization.
+        /// </remarks>
+        public static Matrix? WorldHierarchy { get; internal set; } = Matrix.Identity;
+
+        /// <summary>
+        /// Gets or sets the transform from StereoKit to the world.
+        /// </summary>
+        internal static CoordinateSystem StereoKitToWorld { get; set; } = new CoordinateSystem();
+
+        /// <summary>
+        /// Gets or sets the transform from the world to StereoKit.
+        /// </summary>
+        internal static CoordinateSystem WorldToStereoKit { get; set; } = new CoordinateSystem();
+    }
+
+    /// <summary>
+    /// Component that plays incoming audio buffers through a StereoKit streaming sound placed at a world position.
+    /// </summary>
+    public class SpatialSound : StereoKitComponent, IConsumer<AudioBuffer>
+    {
+        private Sound sound;
+        private SoundInst soundInst;
+        private Point3D worldPosition;
+        private float volume;
+        private float bufferDurationInSeconds;
+        private bool playing = false; // set once the first buffer has been written and playback has started
+
+        /// <summary>
+        /// Initializes a new instance of the <see cref="SpatialSound"/> class.
+        /// </summary>
+        /// <param name="pipeline">The pipeline to add the component to.</param>
+        /// <param name="initialPosition">Initial world position of the spatial sound.</param>
+        /// <param name="bufferDurationInSeconds">Duration, in seconds, of the streaming sound buffer.</param>
+        /// <param name="initialVolume">Initial audio volume (0-1, default 1).</param>
+        /// <param name="name">An optional name for the component.</param>
+        public SpatialSound(
+            Pipeline pipeline,
+            Point3D initialPosition,
+            double bufferDurationInSeconds,
+            double initialVolume = 1,
+            string name = nameof(SpatialSound)
+        )
+            : base(pipeline, name)
+        {
+            worldPosition = initialPosition;
+            volume = (float)initialVolume;
+            this.bufferDurationInSeconds = (float)bufferDurationInSeconds;
+            In = pipeline.CreateReceiver<AudioBuffer>(this, UpdateAudio, nameof(In));
+            PositionInput = pipeline.CreateReceiver<Point3D>(
+                this,
+                p => worldPosition = p,
+                nameof(PositionInput)
+            );
+            VolumeInput = pipeline.CreateReceiver<double>(
+                this,
+                v => volume = (float)v,
+                nameof(VolumeInput)
+            );
+        }
+
+        /// <summary>
+        /// Gets the receiver for audio buffers to play.
+        /// </summary>
+        public Receiver<AudioBuffer> In { get; private set; }
+
+        /// <summary>
+        /// Gets the receiver that updates the world position of the sound.
+        /// </summary>
+        public Receiver<Point3D> PositionInput { get; private set; }
+
+        /// <summary>
+        /// Gets the receiver that updates the audio volume.
+        /// </summary>
+        public Receiver<double> VolumeInput { get; private set; }
+
+        /// <inheritdoc />
+        public override bool Initialize()
+        {
+            sound = Sound.CreateStream(bufferDurationInSeconds);
+            return true;
+        }
+
+        /// <inheritdoc/>
+        public override void Step()
+        {
+            if (playing)
+            {
+                soundInst.Volume = volume;
+
+                if (StereoKitTransforms.WorldToStereoKit is not null)
+                {
+                    soundInst.Position = ComputeSoundPosition();
+                }
+            }
+        }
+
+        private Vec3 ComputeSoundPosition()
+        {
+            if (StereoKitTransforms.WorldToStereoKit is null)
+            {
+                return Vec3.Zero;
+            }
+            else
+            {
+                return worldPosition.TransformBy(StereoKitTransforms.WorldToStereoKit).ToVec3();
+            }
+        }
+
+        private void UpdateAudio(AudioBuffer audio)
+        {
+            var format = audio.Format;
+            if (
+                format.Channels != 1
+                || format.SamplesPerSec != 48000
+                || (
+                    format.FormatTag != WaveFormatTag.WAVE_FORMAT_IEEE_FLOAT
+                    && format.FormatTag != WaveFormatTag.WAVE_FORMAT_EXTENSIBLE
+                )
+                || format.BitsPerSample != 32
+            )
+            {
+                throw new ArgumentException("Expected 1-channel, 48kHz, float32 audio format.");
+            }
+
+            using var stream = new MemoryStream(audio.Data, 0, audio.Length);
+            using var reader = new BinaryReader(stream);
+            var count = audio.Length / 4; // 4 bytes per 32-bit float sample
+            var samples = new float[count];
+            for (var i = 0; i < count; i++)
+            {
+                samples[i] = reader.ReadSingle();
+            }
+
+            sound.WriteSamples(samples);
+            if (!playing) // start playback only once, after the first buffer is written
+            {
+                soundInst = sound.Play(ComputeSoundPosition(), volume);
+                playing = true;
+            }
+        }
+    }
+}