diff --git a/README.md b/README.md index 816dccf..c88658e 100644 --- a/README.md +++ b/README.md @@ -22,66 +22,33 @@ than is in this brief introduction. ```java SpeechPipeline pipeline = new SpeechPipeline.Builder() - .setInputClass("io.spokestack.spokestack.android.MicrophoneInput") - .addStageClass("io.spokestack.spokestack.webrtc.AutomaticGainControl") - .addStageClass("io.spokestack.spokestack.webrtc.VoiceActivityDetector") - .addStageClass("io.spokestack.spokestack.webrtc.VoiceActivityTrigger") - .addStageClass("io.spokestack.spokestack.google.GoogleSpeechRecognizer") + .useProfile("io.spokestack.spokestack.profile.VADTriggerGoogleASR") .setProperty("google-credentials", "") .setProperty("locale", "en-US") .build(); ``` -This example creates an active speech recognition pipeline using the Google -Speech API that is triggered by VAD. The `google-credentials` parameter should +This example uses a pre-built profile to create a speech recognition pipeline triggered by VAD +that uses the Google Speech API for speech recognition. The `google-credentials` parameter should be the contents of a Google Cloud service account credentials file, in JSON format. For more information, see the [documentation](https://cloud.google.com/speech/docs/streaming-recognize). -See the [javadoc](https://www.javadoc.io/doc/io.spokestack/spokestack) for +See the [javadoc](https://www.javadoc.io/doc/io.spokestack/spokestack-android) for other component-specific configuration parameters. -### Microsoft Bing Speech API - -```java -SpeechPipeline pipeline = new SpeechPipeline.Builder() - .setInputClass("io.spokestack.spokestack.android.MicrophoneInput") - .addStageClass("io.spokestack.spokestack.webrtc.AutomaticGainControl") - .addStageClass("io.spokestack.spokestack.webrtc.VoiceActivityDetector") - .addStageClass("io.spokestack.spokestack.webrtc.VoiceActivityTrigger") - .addStageClass("io.spokestack.spokestack.microsoft.BingSpeechRecognizer") - .setProperty("sample-rate", 16000) - .setProperty("frame-width", 20) - .setProperty("buffer-width", 300) - .setProperty("vad-rise-delay", 100) - .setProperty("vad-fall-delay", 500) - .setProperty("bing-speech-api-key", "") - .setProperty("locale", "fr-CA") - .build(); -``` - -This example creates a VAD-triggered pipeline with custom rise/fall delays -using the Microsoft Bing Speech API. For more information on this API, check -out the [documentation](https://azure.microsoft.com/en-us/services/cognitive-services/speech/). - ### Wakeword Detection ```java SpeechPipeline pipeline = new SpeechPipeline.Builder() - .setInputClass("io.spokestack.spokestack.android.MicrophoneInput") - .addStageClass("io.spokestack.spokestack.webrtc.AutomaticGainControl") - .addStageClass("io.spokestack.spokestack.webrtc.VoiceActivityDetector") - .addStageClass("io.spokestack.spokestack.wakeword.WakewordTrigger") - .addStageClass("io.spokestack.spokestack.google.GoogleSpeechRecognizer") - .setProperty("vad-fall-delay", 200) - .setProperty("pre-emphasis", 0.97) + .useProfile("io.spokestack.spokestack.profile.TFWakewordGoogleASR") .setProperty("wake-filter-path", "") .setProperty("wake-encode-path", "") .setProperty("wake-detect-path", "") - .setProperty("wake-smooth-length", 50) + .setProperty("wake-threshold", 0.85) .setProperty("google-credentials", "") .setProperty("locale", "en-US") .build(); ``` -This example creates a wakeword-triggered pipeline with the google speech +This example creates a wakeword-triggered pipeline with the Google Speech recognizer. 
The wakeword trigger uses three trained [TensorFlow Lite](https://www.tensorflow.org/lite/) models: a *filter* model for spectrum preprocessing, an autoregressive encoder *encode* model, and a @@ -89,6 +56,10 @@ for spectrum preprocessing, an autoregressive encoder *encode* model, and a the wakeword detector and its configuration parameters, click [here](https://github.com/spokestack/spokestack-android/wiki/wakeword). +The "wake-threshold" property is set by the `TFWakewordGoogleASR` profile, but it is +overridden here to emphasize that properties set after a profile is applied (either directly +in the builder or by another profile) supersede those set by that profile. + To use the demo "Spokestack" wakeword, download the TensorFlow Lite models: [detect](https://d3dmqd7cy685il.cloudfront.net/model/wake/spokestack/detect.lite) | [encode](https://d3dmqd7cy685il.cloudfront.net/model/wake/spokestack/encode.lite) | [filter](https://d3dmqd7cy685il.cloudfront.net/model/wake/spokestack/filter.lite) ## Development @@ -143,7 +114,7 @@ For additional information about releasing see http://maven.apache.org/maven-rel ## License -Copyright 2019 Spokestack, Inc. +Copyright 2020 Spokestack, Inc. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/src/main/java/io/spokestack/spokestack/PipelineProfile.java b/src/main/java/io/spokestack/spokestack/PipelineProfile.java new file mode 100644 index 0000000..5c38687 --- /dev/null +++ b/src/main/java/io/spokestack/spokestack/PipelineProfile.java @@ -0,0 +1,26 @@ +package io.spokestack.spokestack; + +/** + * A pipeline profile encapsulates a series of configuration values tuned for + * a specific task to make building a {@link SpeechPipeline} more convenient. + * + *

+ * Profiles are not authoritative; they act just like calling a series of
+ * methods on a {@link SpeechPipeline.Builder}, and any configuration
+ * properties they set can be overridden by subsequent calls.
+ *
+ * Pipeline profiles must not require arguments in their constructors.

+ */ +public interface PipelineProfile { + + /** + * Apply this profile to the pipeline builder. + * + * @param builder The builder to which the profile should be applied. + * @return The modified pipeline builder. + */ + SpeechPipeline.Builder apply(SpeechPipeline.Builder builder); +} diff --git a/src/main/java/io/spokestack/spokestack/SpeechPipeline.java b/src/main/java/io/spokestack/spokestack/SpeechPipeline.java index 2025567..48587b1 100644 --- a/src/main/java/io/spokestack/spokestack/SpeechPipeline.java +++ b/src/main/java/io/spokestack/spokestack/SpeechPipeline.java @@ -351,6 +351,33 @@ public Builder setProperty(String key, Object value) { return this; } + /** + * applies configuration from a {@link PipelineProfile} to the current + * builder, returning the modified builder. subsequent calls to {@code + * useProfile} or {@code setProperty} can override configuration set by + * a profile. + * + * @param profileClass class name of the profile to apply. + * @return an updated builder + * @throws IllegalArgumentException if the specified profile does not + * exist + */ + public Builder useProfile(String profileClass) + throws IllegalArgumentException { + PipelineProfile profile; + try { + profile = (PipelineProfile) Class + .forName(profileClass) + .getConstructor() + .newInstance(); + } catch (Exception e) { + throw new IllegalArgumentException( + profileClass + " pipeline profile is invalid!"); + } + + return profile.apply(this); + } + /** * adds a pipeline event listener. * diff --git a/src/main/java/io/spokestack/spokestack/profile/PushToTalkAndroidASR.java b/src/main/java/io/spokestack/spokestack/profile/PushToTalkAndroidASR.java new file mode 100644 index 0000000..76cacb2 --- /dev/null +++ b/src/main/java/io/spokestack/spokestack/profile/PushToTalkAndroidASR.java @@ -0,0 +1,36 @@ +package io.spokestack.spokestack.profile; + +import io.spokestack.spokestack.PipelineProfile; +import io.spokestack.spokestack.SpeechPipeline; + +/** + * A speech pipeline profile that relies on manual pipeline activation, + * using Android's {@code SpeechRecognizer} API for ASR. + * + *

+ * Using Android's built-in ASR requires that an Android {@code Context} object
+ * be attached to the speech pipeline using it. This must be done separately
+ * from profile application, using
+ * {@link SpeechPipeline.Builder#setAndroidContext(android.content.Context)}.

+ * + * @see io.spokestack.spokestack.android.AndroidSpeechRecognizer + */ +public class PushToTalkAndroidASR implements PipelineProfile { + @Override + public SpeechPipeline.Builder apply(SpeechPipeline.Builder builder) { + return builder + .setInputClass( + "io.spokestack.spokestack.android.MicrophoneInput") + .addStageClass( + "io.spokestack.spokestack.webrtc.AcousticNoiseSuppressor") + .addStageClass( + "io.spokestack.spokestack.webrtc.AutomaticGainControl") + .setProperty("agc-compression-gain-db", 15) + .addStageClass( + "io.spokestack.spokestack.webrtc.VoiceActivityDetector") + .addStageClass("io.spokestack.spokestack.ActivationTimeout") + .addStageClass( + "io.spokestack.spokestack.android.AndroidSpeechRecognizer"); + } +} diff --git a/src/main/java/io/spokestack/spokestack/profile/PushToTalkGoogleASR.java b/src/main/java/io/spokestack/spokestack/profile/PushToTalkGoogleASR.java new file mode 100644 index 0000000..ccd39bf --- /dev/null +++ b/src/main/java/io/spokestack/spokestack/profile/PushToTalkGoogleASR.java @@ -0,0 +1,44 @@ +package io.spokestack.spokestack.profile; + +import io.spokestack.spokestack.PipelineProfile; +import io.spokestack.spokestack.SpeechPipeline; + +/** + * A speech pipeline profile that relies on manual pipeline activation, + * using Google Speech for ASR. + * + *

+ * Google Speech requires extra configuration, which must be added to the
+ * pipeline build process separately from this profile:
+ *
+ *   • google-credentials (string): json-stringified google service
+ *     account credentials, used to authenticate with the speech API
+ *   • locale (string): language code for speech recognition
+ * + * @see io.spokestack.spokestack.google.GoogleSpeechRecognizer + */ +public class PushToTalkGoogleASR implements PipelineProfile { + @Override + public SpeechPipeline.Builder apply(SpeechPipeline.Builder builder) { + return builder + .setInputClass( + "io.spokestack.spokestack.android.MicrophoneInput") + .addStageClass( + "io.spokestack.spokestack.webrtc.AcousticNoiseSuppressor") + .addStageClass( + "io.spokestack.spokestack.webrtc.AutomaticGainControl") + .setProperty("agc-compression-gain-db", 15) + .addStageClass( + "io.spokestack.spokestack.webrtc.VoiceActivityDetector") + .addStageClass("io.spokestack.spokestack.ActivationTimeout") + .addStageClass( + "io.spokestack.spokestack.google.GoogleSpeechRecognizer"); + } +} diff --git a/src/main/java/io/spokestack/spokestack/profile/TFWakewordAndroidASR.java b/src/main/java/io/spokestack/spokestack/profile/TFWakewordAndroidASR.java new file mode 100644 index 0000000..e27f30c --- /dev/null +++ b/src/main/java/io/spokestack/spokestack/profile/TFWakewordAndroidASR.java @@ -0,0 +1,73 @@ +package io.spokestack.spokestack.profile; + +import io.spokestack.spokestack.PipelineProfile; +import io.spokestack.spokestack.SpeechPipeline; + +/** + * A speech pipeline profile that uses TensorFlow Lite for wakeword detection + * and Android's {@code SpeechRecognizer} API for ASR. Properties related to + * signal processing are tuned for the "Spokestack" wakeword. + * + *

+ * Wakeword detection requires configuration to locate the models used for
+ * classification; these properties must be set elsewhere:
+ *
+ *   • wake-filter-path (string, required): file system path to the
+ *     "filter" Tensorflow-Lite model, which is used to calculate a mel
+ *     spectrogram frame from the linear STFT; its inputs should be shaped
+ *     [fft-width], and its outputs [mel-width]
+ *   • wake-encode-path (string, required): file system path to the
+ *     "encode" Tensorflow-Lite model, which is used to perform each
+ *     autoregressive step over the mel frames; its inputs should be shaped
+ *     [mel-length, mel-width], and its outputs [encode-width], with an
+ *     additional state input/output shaped [state-width]
+ *   • wake-detect-path (string, required): file system path to the
+ *     "detect" Tensorflow-Lite model; its inputs should be shaped
+ *     [encode-length, encode-width], and its outputs [1]
+ *
+ * Using Android's built-in ASR requires that an Android {@code Context} object
+ * be attached to the speech pipeline using it. This must be done separately
+ * from profile application, using
+ * {@link SpeechPipeline.Builder#setAndroidContext(android.content.Context)}.

+ * + * @see io.spokestack.spokestack.android.AndroidSpeechRecognizer + * @see io.spokestack.spokestack.wakeword.WakewordTrigger + */ +public class TFWakewordAndroidASR implements PipelineProfile { + @Override + public SpeechPipeline.Builder apply(SpeechPipeline.Builder builder) { + return builder + .setInputClass( + "io.spokestack.spokestack.android.MicrophoneInput") + .addStageClass( + "io.spokestack.spokestack.webrtc.AcousticNoiseSuppressor") + .setProperty("ans-policy", "aggressive") + .addStageClass( + "io.spokestack.spokestack.webrtc.AutomaticGainControl") + .setProperty("agc-target-level-dbfs", 3) + .setProperty("agc-compression-gain-db", 15) + .addStageClass( + "io.spokestack.spokestack.webrtc.VoiceActivityDetector") + .setProperty("vad-mode", "very-aggressive") + .setProperty("vad-fall-delay", 800) + .addStageClass( + "io.spokestack.spokestack.wakeword.WakewordTrigger") + .setProperty("wake-threshold", 0.9) + .setProperty("pre-emphasis", 0.97) + .addStageClass("io.spokestack.spokestack.ActivationTimeout") + .setProperty("active-min", 2000) + .addStageClass( + "io.spokestack.spokestack.android.AndroidSpeechRecognizer"); + } +} diff --git a/src/main/java/io/spokestack/spokestack/profile/TFWakewordGoogleASR.java b/src/main/java/io/spokestack/spokestack/profile/TFWakewordGoogleASR.java new file mode 100644 index 0000000..30ebfca --- /dev/null +++ b/src/main/java/io/spokestack/spokestack/profile/TFWakewordGoogleASR.java @@ -0,0 +1,80 @@ +package io.spokestack.spokestack.profile; + +import io.spokestack.spokestack.PipelineProfile; +import io.spokestack.spokestack.SpeechPipeline; + +/** + * A speech pipeline profile that uses TensorFlow Lite for wakeword detection + * and Google Speech for ASR. Properties related to signal processing are tuned + * for the "Spokestack" wakeword. + * + *

+ * Wakeword detection requires configuration to locate the models used for
+ * classification; these properties must be set separately from this profile:
+ *
+ *   • wake-filter-path (string, required): file system path to the
+ *     "filter" Tensorflow-Lite model, which is used to calculate a mel
+ *     spectrogram frame from the linear STFT; its inputs should be shaped
+ *     [fft-width], and its outputs [mel-width]
+ *   • wake-encode-path (string, required): file system path to the
+ *     "encode" Tensorflow-Lite model, which is used to perform each
+ *     autoregressive step over the mel frames; its inputs should be shaped
+ *     [mel-length, mel-width], and its outputs [encode-width], with an
+ *     additional state input/output shaped [state-width]
+ *   • wake-detect-path (string, required): file system path to the
+ *     "detect" Tensorflow-Lite model; its inputs should be shaped
+ *     [encode-length, encode-width], and its outputs [1]
+ *
+ * Google Speech also requires configuration:
+ *
+ *   • google-credentials (string): json-stringified google service
+ *     account credentials, used to authenticate with the speech API
+ *   • locale (string): language code for speech recognition
+ * + * @see io.spokestack.spokestack.wakeword.WakewordTrigger + * @see io.spokestack.spokestack.google.GoogleSpeechRecognizer + */ +public class TFWakewordGoogleASR implements PipelineProfile { + @Override + public SpeechPipeline.Builder apply(SpeechPipeline.Builder builder) { + return builder + .setInputClass( + "io.spokestack.spokestack.android.MicrophoneInput") + .addStageClass( + "io.spokestack.spokestack.webrtc.AcousticNoiseSuppressor") + .setProperty("ans-policy", "aggressive") + .addStageClass( + "io.spokestack.spokestack.webrtc.AutomaticGainControl") + .setProperty("agc-target-level-dbfs", 3) + .setProperty("agc-compression-gain-db", 15) + .addStageClass( + "io.spokestack.spokestack.webrtc.VoiceActivityDetector") + .setProperty("vad-mode", "very-aggressive") + .setProperty("vad-fall-delay", 800) + .addStageClass( + "io.spokestack.spokestack.wakeword.WakewordTrigger") + .setProperty("wake-threshold", 0.9) + .setProperty("pre-emphasis", 0.97) + .addStageClass("io.spokestack.spokestack.ActivationTimeout") + .setProperty("active-min", 2000) + .addStageClass( + "io.spokestack.spokestack.google.GoogleSpeechRecognizer"); + } +} diff --git a/src/main/java/io/spokestack/spokestack/profile/VADTriggerAndroidASR.java b/src/main/java/io/spokestack/spokestack/profile/VADTriggerAndroidASR.java new file mode 100644 index 0000000..42975f0 --- /dev/null +++ b/src/main/java/io/spokestack/spokestack/profile/VADTriggerAndroidASR.java @@ -0,0 +1,38 @@ +package io.spokestack.spokestack.profile; + +import io.spokestack.spokestack.PipelineProfile; +import io.spokestack.spokestack.SpeechPipeline; + +/** + * A speech pipeline profile that uses voice activity detection to activate + * Android's {@code SpeechRecognizer} API for ASR. + * + *

+ * Using Android's built-in ASR requires that an Android {@code Context} object
+ * be attached to the speech pipeline using it. This must be done separately
+ * from profile application, using
+ * {@link SpeechPipeline.Builder#setAndroidContext(android.content.Context)}.

+ * + * @see io.spokestack.spokestack.android.AndroidSpeechRecognizer + */ +public class VADTriggerAndroidASR implements PipelineProfile { + @Override + public SpeechPipeline.Builder apply(SpeechPipeline.Builder builder) { + return builder + .setInputClass( + "io.spokestack.spokestack.android.MicrophoneInput") + .addStageClass( + "io.spokestack.spokestack.webrtc.AcousticNoiseSuppressor") + .addStageClass( + "io.spokestack.spokestack.webrtc.AutomaticGainControl") + .setProperty("agc-compression-gain-db", 15) + .addStageClass( + "io.spokestack.spokestack.webrtc.VoiceActivityDetector") + .addStageClass( + "io.spokestack.spokestack.webrtc.VoiceActivityTrigger") + .addStageClass("io.spokestack.spokestack.ActivationTimeout") + .addStageClass( + "io.spokestack.spokestack.android.AndroidSpeechRecognizer"); + } +} diff --git a/src/main/java/io/spokestack/spokestack/profile/VADTriggerGoogleASR.java b/src/main/java/io/spokestack/spokestack/profile/VADTriggerGoogleASR.java new file mode 100644 index 0000000..a4c9922 --- /dev/null +++ b/src/main/java/io/spokestack/spokestack/profile/VADTriggerGoogleASR.java @@ -0,0 +1,44 @@ +package io.spokestack.spokestack.profile; + +import io.spokestack.spokestack.PipelineProfile; +import io.spokestack.spokestack.SpeechPipeline; + +/** + * A speech pipeline profile that uses voice activity detection to activate + * Google Speech ASR. + * + *

+ * Google Speech requires extra configuration, which must be added to the
+ * pipeline build process separately from this profile:
+ *
+ *   • google-credentials (string): json-stringified google service
+ *     account credentials, used to authenticate with the speech API
+ *   • locale (string): language code for speech recognition
+ */ +public class VADTriggerGoogleASR implements PipelineProfile { + @Override + public SpeechPipeline.Builder apply(SpeechPipeline.Builder builder) { + return builder + .setInputClass( + "io.spokestack.spokestack.android.MicrophoneInput") + .addStageClass( + "io.spokestack.spokestack.webrtc.AcousticNoiseSuppressor") + .addStageClass( + "io.spokestack.spokestack.webrtc.AutomaticGainControl") + .setProperty("agc-compression-gain-db", 15) + .addStageClass( + "io.spokestack.spokestack.webrtc.VoiceActivityDetector") + .addStageClass( + "io.spokestack.spokestack.webrtc.VoiceActivityTrigger") + .addStageClass("io.spokestack.spokestack.ActivationTimeout") + .addStageClass( + "io.spokestack.spokestack.google.GoogleSpeechRecognizer"); + } +} diff --git a/src/main/java/io/spokestack/spokestack/profile/package-info.java b/src/main/java/io/spokestack/spokestack/profile/package-info.java new file mode 100644 index 0000000..87dde1f --- /dev/null +++ b/src/main/java/io/spokestack/spokestack/profile/package-info.java @@ -0,0 +1,5 @@ +/** + * This package contains pre-built profiles used to configure the speech + * pipeline. + */ +package io.spokestack.spokestack.profile; diff --git a/src/test/java/io/spokestack/spokestack/SpeechPipelineTest.java b/src/test/java/io/spokestack/spokestack/SpeechPipelineTest.java index 94bbb0e..1285126 100644 --- a/src/test/java/io/spokestack/spokestack/SpeechPipelineTest.java +++ b/src/test/java/io/spokestack/spokestack/SpeechPipelineTest.java @@ -1,5 +1,6 @@ package io.spokestack.spokestack; +import java.lang.reflect.InvocationTargetException; import java.util.*; import java.util.concurrent.Semaphore; import java.nio.ByteBuffer; @@ -10,6 +11,15 @@ import static org.junit.jupiter.api.Assertions.*; public class SpeechPipelineTest implements OnSpeechEventListener { + private static final List> PROFILES = Arrays.asList( + io.spokestack.spokestack.profile.PushToTalkAndroidASR.class, + io.spokestack.spokestack.profile.PushToTalkGoogleASR.class, + io.spokestack.spokestack.profile.TFWakewordAndroidASR.class, + io.spokestack.spokestack.profile.TFWakewordGoogleASR.class, + io.spokestack.spokestack.profile.VADTriggerAndroidASR.class, + io.spokestack.spokestack.profile.VADTriggerGoogleASR.class + ); + private List events = new ArrayList<>(); @Before @@ -73,6 +83,29 @@ public void testBuilder() throws Exception { } } + @Test + public void testProfiles() { + assertThrows(IllegalArgumentException.class, () -> + new SpeechPipeline.Builder() + .useProfile("io.spokestack.InvalidProfile") + ); + + // no pre-set profiles should throw errors on use + // (use instantiates the associated profile class) + for (Class profileClass : PROFILES) { + new SpeechPipeline.Builder() + .useProfile(profileClass.getCanonicalName()); + } + + // The implicated class requires a config property + SpeechPipeline pipeline = new SpeechPipeline.Builder() + .useProfile( + "io.spokestack.spokestack.SpeechPipelineTest$TestProfile") + .build(); + + assertThrows(InvocationTargetException.class, pipeline::start); + } + @Test public void testStartStop() throws Exception { final SpeechPipeline pipeline = new SpeechPipeline.Builder() @@ -241,4 +274,33 @@ public void process(SpeechContext context, ByteBuffer frame) throw new Exception("fail"); } } + + public static class ConfigRequiredStage implements SpeechProcessor { + public ConfigRequiredStage(SpeechConfig config) { + config.getString("required-property"); + } + + public void close() throws Exception { + throw new Exception("fail"); + } + + public void 
process(SpeechContext context, ByteBuffer frame) + throws Exception { + throw new Exception("fail"); + } + } + + private static class TestProfile implements PipelineProfile { + + public TestProfile() {} + + @Override + public SpeechPipeline.Builder apply(SpeechPipeline.Builder builder) { + return builder + .setInputClass( + "io.spokestack.spokestack.SpeechPipelineTest$Input") + .addStageClass( + "io.spokestack.spokestack.SpeechPipelineTest$ConfigRequiredStage"); + } + } }
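
As a usage sketch (not part of this change set): the snippet below shows how an application could define its own profile on top of the new `PipelineProfile` interface and `SpeechPipeline.Builder#useProfile` introduced in this diff. The package and class name `com.example.MyAppProfile` and the chosen property override are hypothetical; only `PipelineProfile`, `useProfile`, `setProperty`, and the pre-built `VADTriggerGoogleASR` profile come from the change itself.

```java
package com.example;  // hypothetical package, not part of this diff

import io.spokestack.spokestack.PipelineProfile;
import io.spokestack.spokestack.SpeechPipeline;

/**
 * A hypothetical application-specific profile. It starts from the pre-built
 * VADTriggerGoogleASR profile and layers one property on top; because the
 * property is set after the inner profile is applied, it supersedes any value
 * that profile may have set. A no-argument constructor is required so that
 * SpeechPipeline.Builder#useProfile can instantiate the class by name.
 */
public class MyAppProfile implements PipelineProfile {
    @Override
    public SpeechPipeline.Builder apply(SpeechPipeline.Builder builder) {
        return builder
            // profiles may compose other profiles via the builder
            .useProfile("io.spokestack.spokestack.profile.VADTriggerGoogleASR")
            // overrides anything the inner profile set for this key
            .setProperty("locale", "en-US");
    }
}
```

An application would then build its pipeline with `new SpeechPipeline.Builder().useProfile("com.example.MyAppProfile")`, adding any remaining required properties (such as `google-credentials`) before calling `build()`, exactly as the pre-built profiles are used in the README examples above.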