Skip to content

Commit

Permalink
Merge pull request #16 from sled-group/tts-fix
Browse files Browse the repository at this point in the history
Various TTS Fixes
  • Loading branch information
yukw777 committed Oct 19, 2023
2 parents bdf5420 + a4378d2 commit dcd4f64
Show file tree
Hide file tree
Showing 6 changed files with 206 additions and 14 deletions.
21 changes: 11 additions & 10 deletions src/PercepSync/PercepSync.cs
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,11 @@ public static void Main(string[] args)
};
// Set up a rendezvous endpoint for text-to-speech
var percepDurationInSeconds = 1 / config.Fps;
var audioBufferFrameSizeInBytes = (int)
Math.Ceiling(
Serializers.AssumedWaveFormat.AvgBytesPerSec * percepDurationInSeconds
);
AzureSpeechSynthesizer? speechSynthesizer = null;
if (config.EnableTts)
{
Expand All @@ -299,7 +304,8 @@ public static void Main(string[] args)
percepSyncPipeline,
config.AzureSpeechConfig.SubscriptionKey,
config.AzureSpeechConfig.Region,
config.AzureSpeechConfig.SpeechSynthesisVoiceName
config.AzureSpeechConfig.SpeechSynthesisVoiceName,
audioBufferFrameSizeInBytes
);
ttsReceiver.PipeTo(speechSynthesizer);
if (config.LocalConfig is not null)
Expand All @@ -321,17 +327,12 @@ public static void Main(string[] args)
// Construct sensor streams
var sensorStreams = ConstructSensorStreams(process, speechSynthesizer);
var frameDurationInSeconds = 1 / config.Fps;
var videoFrameStream = sensorStreams.VideoFrameStream.Sample(
TimeSpan.FromSeconds(frameDurationInSeconds)
TimeSpan.FromSeconds(percepDurationInSeconds)
);
var audioBufferStream = sensorStreams.AudioBufferStream.Reframe(
(int)
Math.Ceiling(
Serializers.AssumedWaveFormat.AvgBytesPerSec
* frameDurationInSeconds
)
audioBufferFrameSizeInBytes
);
var speechRecognizer = new ContinuousAzureSpeechRecognizer(
percepSyncPipeline,
Expand All @@ -343,13 +344,13 @@ public static void Main(string[] args)
.Join(
audioBufferStream,
Reproducible.Nearest<AudioBuffer>(
TimeSpan.FromSeconds(frameDurationInSeconds)
TimeSpan.FromSeconds(percepDurationInSeconds)
)
)
.Join(
speechRecognizer,
Reproducible.Nearest<string>(
TimeSpan.FromSeconds(frameDurationInSeconds / 2)
TimeSpan.FromSeconds(percepDurationInSeconds / 2)
)
)
.Select(
Expand Down
12 changes: 9 additions & 3 deletions src/PercepSync/Tts.cs
Original file line number Diff line number Diff line change
Expand Up @@ -80,12 +80,14 @@ public class AzureSpeechSynthesizer : IConsumerProducer<TtsRequest, AudioBuffer>
private readonly string region;
private readonly string voiceName;
private SpeechSynthesizer speechSynthesizer;
private Reframe reframer;

public AzureSpeechSynthesizer(
Pipeline pipeline,
string subscriptionKey,
string region,
string voiceName
string voiceName,
int audioBufferFrameSizeInBytes
)
{
this.subscriptionKey = subscriptionKey;
Expand All @@ -106,7 +108,10 @@ string voiceName
throw new Exception($"Error while initializing SpeechSynthesizer: {e.Message}");
}
In = pipeline.CreateReceiver<TtsRequest>(this, Receive, nameof(In));
Out = pipeline.CreateEmitter<AudioBuffer>(this, nameof(Out));
audioOut = pipeline.CreateEmitter<AudioBuffer>(this, nameof(audioOut));
reframer = new Reframe(pipeline, audioBufferFrameSizeInBytes);
audioOut.PipeTo(reframer);
Out = reframer.Out;
}

private async void Receive(TtsRequest req, Envelope envelope)
Expand Down Expand Up @@ -141,11 +146,12 @@ private async void Receive(TtsRequest req, Envelope envelope)
memoryStream.ToArray(),
WaveFormat.Create16kHz1Channel16BitPcm()
);
Out.Post(audioBuffer, envelope.OriginatingTime);
audioOut.Post(audioBuffer, envelope.OriginatingTime);
}

public Receiver<TtsRequest> In { get; }
public Emitter<AudioBuffer> Out { get; private set; }
private Emitter<AudioBuffer> audioOut { get; set; }

public void Dispose()
{
Expand Down
17 changes: 17 additions & 0 deletions src/PercepSyncHoloLensCapture/Config.cs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@ public class Config
/// Gray-scale camera config
/// </summary>
public GrayConfig Gray { get; set; } = new GrayConfig();

/// <summary>
/// Tts config
/// </summary>
public TtsConfig Tts { get; set; } = new TtsConfig();
}

/// <summary>
Expand Down Expand Up @@ -187,4 +192,16 @@ public class GrayConfig
/// </summary>
public string EncodeMethod { get; set; } = "jpeg";
}

/// <summary>
/// Settings for text-to-speech playback.
/// </summary>
public class TtsConfig
{
    /// <summary>
    /// Maximum length, in seconds, of text-to-speech audio that can be held for playback.
    /// A larger value costs more memory. The buffer is circular: once the synthesized
    /// audio exceeds this limit, the earliest speech is overwritten.
    /// Defaults to 10 seconds.
    /// </summary>
    public double MaxTtsDurationInSeconds { get; set; } = 10.0;
}
}
3 changes: 2 additions & 1 deletion src/PercepSyncHoloLensCapture/PercepSyncHoloLensCapture.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1158,7 +1158,8 @@ DeliveryPolicy deliveryPolicy
);
var spatialSound = new SpatialSound(
pipeline,
default
default,
Config.Tts.MaxTtsDurationInSeconds
);
ttsAudio
.PipeTo(audioResampler)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
<RestoreProjectStyle>PackageReference</RestoreProjectStyle>
</PropertyGroup>
<ItemGroup>
<Compile Include="SpatialSound.cs" />
<Compile Include="Config.cs" />
<Compile Include="PercepSyncHoloLensCapture.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
Expand Down
166 changes: 166 additions & 0 deletions src/PercepSyncHoloLensCapture/SpatialSound.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
// Copied with modifications from: https://github.com/microsoft/psi/blob/ef4b2a627ffefc5415b826930a883b309c0a37cd/Sources/MixedReality/Microsoft.Psi.MixedReality/StereoKit/SpatialSound.cs
// and https://github.com/microsoft/psi/blob/ef4b2a627ffefc5415b826930a883b309c0a37cd/Sources/MixedReality/Microsoft.Psi.MixedReality/StereoKit/StereoKitTransforms.cs
// We need to change how the private field "sound" is created, and a private field cannot be overridden from a subclass, so the only way around it is to copy the code like this.
namespace Sled.PercepSyncHoloLensCapture
{
using System;
using System.IO;
using MathNet.Spatial.Euclidean;
using StereoKit;
using Microsoft.Psi.Audio;
using Microsoft.Psi;
using Microsoft.Psi.MixedReality.StereoKit;

/// <summary>
/// Holder for the static transforms used when moving data between \psi and StereoKit.
/// </summary>
/// <remarks>
/// NOTE(review): this is a project-local copy of the \psi class of the same name. No code
/// in this file assigns these properties, so unless something else in this assembly
/// updates them they keep their initial values — confirm.
/// </remarks>
public static class StereoKitTransforms
{
    /// <summary>
    /// Gets the "world hierarchy" matrix used for rendering.
    /// Pushing it onto StereoKit's <see cref="Hierarchy"/> stack renders content coherently in the world.
    /// </summary>
    /// <remarks>
    /// The <see cref="StereoKitRenderer"/> base class pushes this matrix automatically for new rendering components.
    /// A null value means the HoloLens has lost localization.
    /// </remarks>
    public static Matrix? WorldHierarchy { get; internal set; } = Matrix.Identity;

    /// <summary>
    /// Gets or sets the transform that maps StereoKit coordinates to world coordinates.
    /// </summary>
    internal static CoordinateSystem StereoKitToWorld { get; set; } = new();

    /// <summary>
    /// Gets or sets the transform that maps world coordinates to StereoKit coordinates.
    /// </summary>
    internal static CoordinateSystem WorldToStereoKit { get; set; } = new();
}

/// <summary>
/// Component that implements a spatial sound renderer.
/// </summary>
/// <remarks>
/// Incoming audio buffers are written into a StereoKit stream sound whose capacity is
/// fixed at construction time. Playback starts when the first buffer arrives, and the
/// sound's volume and position are refreshed on every <see cref="Step"/>.
/// NOTE(review): <c>playing</c> and <c>soundInst</c> are written by the audio receiver and
/// read by <see cref="Step"/>; if those run on different threads this may need
/// synchronization — confirm.
/// </remarks>
public class SpatialSound : StereoKitComponent, IConsumer<AudioBuffer>
{
    private readonly float bufferDurationInSeconds;
    private Sound sound;
    private SoundInst soundInst;
    private Point3D worldPosition;
    private float volume;
    private bool playing = false;

    /// <summary>
    /// Initializes a new instance of the <see cref="SpatialSound"/> class.
    /// </summary>
    /// <param name="pipeline">The pipeline to add the component to.</param>
    /// <param name="initialPosition">Initial position of spatial sound.</param>
    /// <param name="bufferDurationInSeconds">The duration of stream buffer.</param>
    /// <param name="initialVolume">Initial audio volume (0-1, default 1).</param>
    /// <param name="name">An optional name for the component.</param>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="bufferDurationInSeconds"/> is not a positive, finite number once converted to single precision.
    /// </exception>
    public SpatialSound(
        Pipeline pipeline,
        Point3D initialPosition,
        double bufferDurationInSeconds,
        double initialVolume = 1,
        string name = nameof(SpatialSound)
    )
        : base(pipeline, name)
    {
        // Validate the value that is actually handed to Sound.CreateStream (the float),
        // so zero, negative, NaN and infinite durations fail here with a clear message
        // instead of reaching the audio backend. "!(x > 0)" also rejects NaN.
        var durationAsFloat = (float)bufferDurationInSeconds;
        if (!(durationAsFloat > 0f) || float.IsInfinity(durationAsFloat))
        {
            throw new ArgumentOutOfRangeException(
                nameof(bufferDurationInSeconds),
                bufferDurationInSeconds,
                "The stream buffer duration must be a positive, finite number of seconds."
            );
        }

        worldPosition = initialPosition;
        volume = (float)initialVolume;
        this.bufferDurationInSeconds = durationAsFloat;
        In = pipeline.CreateReceiver<AudioBuffer>(this, UpdateAudio, nameof(In));
        PositionInput = pipeline.CreateReceiver<Point3D>(
            this,
            p => worldPosition = p,
            nameof(PositionInput)
        );
        VolumeInput = pipeline.CreateReceiver<double>(
            this,
            v => volume = (float)v,
            nameof(VolumeInput)
        );
    }

    /// <summary>
    /// Gets the receiver of audio.
    /// </summary>
    public Receiver<AudioBuffer> In { get; }

    /// <summary>
    /// Gets receiver for spatial pose.
    /// </summary>
    public Receiver<Point3D> PositionInput { get; }

    /// <summary>
    /// Gets receiver for audio volume.
    /// </summary>
    public Receiver<double> VolumeInput { get; }

    /// <inheritdoc />
    public override bool Initialize()
    {
        // The stream sound is created here rather than in the constructor;
        // its capacity is the duration supplied at construction time.
        sound = Sound.CreateStream(bufferDurationInSeconds);
        return true;
    }

    /// <inheritdoc/>
    public override void Step()
    {
        // Nothing to update until the first audio buffer has started playback.
        if (!playing)
        {
            return;
        }

        soundInst.Volume = volume;

        // Read the transform once so the null check and the use see the same value.
        // When it is null the previous position is left untouched.
        var worldToStereoKit = StereoKitTransforms.WorldToStereoKit;
        if (worldToStereoKit is not null)
        {
            soundInst.Position = worldPosition.TransformBy(worldToStereoKit).ToVec3();
        }
    }

    /// <summary>
    /// Converts the current world position into StereoKit coordinates.
    /// </summary>
    /// <returns>
    /// The transformed position, or <see cref="Vec3.Zero"/> when no world-to-StereoKit transform is available.
    /// </returns>
    private Vec3 ComputeSoundPosition()
    {
        // Single read of the static property: check and use operate on the same instance.
        var worldToStereoKit = StereoKitTransforms.WorldToStereoKit;
        return worldToStereoKit is null
            ? Vec3.Zero
            : worldPosition.TransformBy(worldToStereoKit).ToVec3();
    }

    /// <summary>
    /// Receiver callback: appends the buffer's samples to the stream sound and starts
    /// playback on the first buffer.
    /// </summary>
    /// <param name="audio">Audio buffer; must be 1-channel, 48kHz, 32-bit float.</param>
    /// <exception cref="ArgumentException">The buffer is not in the expected format.</exception>
    private void UpdateAudio(AudioBuffer audio)
    {
        var format = audio.Format;
        var isFloatTag =
            format.FormatTag == WaveFormatTag.WAVE_FORMAT_IEEE_FLOAT
            || format.FormatTag == WaveFormatTag.WAVE_FORMAT_EXTENSIBLE;
        if (
            format.Channels != 1
            || format.SamplesPerSec != 48000
            || !isFloatTag
            || format.BitsPerSample != 32
        )
        {
            throw new ArgumentException("Expected 1-channel, 48kHz, float32 audio format.");
        }

        // Reinterpret the first audio.Length bytes as 32-bit floats in one bulk copy.
        // Any trailing bytes that do not form a whole sample are ignored, as before.
        // The raw copy assumes a little-endian host, matching the little-endian decoding
        // the previous per-sample BinaryReader loop performed.
        var sampleCount = audio.Length / sizeof(float);
        var samples = new float[sampleCount];
        Buffer.BlockCopy(audio.Data, 0, samples, 0, sampleCount * sizeof(float));

        sound.WriteSamples(samples);
        if (!playing)
        {
            // Start playback only once; later buffers just extend the stream.
            soundInst = sound.Play(ComputeSoundPosition(), volume);
            playing = true;
        }
    }
}
}

0 comments on commit dcd4f64

Please sign in to comment.