feat: wakeword-only profile and empty ASR

This adds a no-op ASR and a new pipeline profile for a wakeword-only use case. Upon successful wakeword recognition, the pipeline remains active for a single frame and is then deactivated.
spokestack · Jul 22, 2021 · 8afb3c6 · 8afb3c6
1 parent f9f7c13
commit 8afb3c6
Show file tree

Hide file tree

Showing 4 changed files with 162 additions and 0 deletions.
diff --git a/src/main/java/io/spokestack/spokestack/asr/EmptyRecognizer.java b/src/main/java/io/spokestack/spokestack/asr/EmptyRecognizer.java
@@ -0,0 +1,67 @@
+package io.spokestack.spokestack.asr;
+
+import io.spokestack.spokestack.SpeechConfig;
+import io.spokestack.spokestack.SpeechContext;
+import io.spokestack.spokestack.SpeechProcessor;
+
+import java.nio.ByteBuffer;
+
+/**
+ * Empty speech recognizer
+ *
+ * <p>
+ * This recognizer is designed for use in profiles that want to skip ASR
+ * entirely, dispatching only activate and deactivate events from a wakeword
+ * recognizer.
+ * </p>
+ *
+ * <p>
+ * Once the wakeword is recognized, this stage allows the pipeline to remain
+ * active for a single frame then deactivates it. The only state it keeps is
+ * a flag recording whether the context was active at the end of the
+ * previous frame; that flag is cleared by {@link #reset()}.
+ * </p>
+ */
+public class EmptyRecognizer implements SpeechProcessor {
+
+    // true when the context was still active at the end of the previous
+    // call to process(), i.e. the single "active" frame has been spent
+    private boolean active = false;
+
+    /**
+     * initializes a new recognizer instance.
+     *
+     * @param speechConfig Spokestack speech configuration; not read by
+     *                     this stage
+     */
+    public EmptyRecognizer(SpeechConfig speechConfig) {
+        // no configuration necessary
+    }
+
+    /**
+     * deactivates the context one frame after it was first seen active.
+     *
+     * @param context the current speech context; its active flag is read
+     *                and may be cleared
+     * @param frame   the current audio frame; not read by this stage
+     * @throws Exception declared by the interface; not thrown directly here
+     */
+    @Override
+    public void process(SpeechContext context, ByteBuffer frame)
+          throws Exception {
+        // all we want to do is return control to the wakeword component, so
+        // simply deactivate the context. this allows multiple wakeword
+        // utterances to be recognized in quick succession.
+        // we want to leave the context active for one frame, though, so the
+        // wakeword trigger has a chance to recognize the activity and reset
+        // itself when we deactivate on the following frame; otherwise, we'll
+        // get repeated activations as the wakeword trigger fires for multiple
+        // frames in a row.
+        if (this.active) {
+            context.setActive(false);
+        }
+        // remember the state for the next frame: true only on the frame
+        // where the context was active and we had not yet deactivated it
+        this.active = context.isActive();
+    }
+
+    /**
+     * clears the one-frame activation flag, so state recorded before the
+     * reset cannot make the next processed frame deactivate the context
+     * immediately.
+     *
+     * @throws Exception declared by the interface; not thrown directly here
+     */
+    @Override
+    public void reset() throws Exception {
+        this.active = false;
+    }
+
+    /**
+     * does nothing; this stage holds no resources.
+     *
+     * @throws Exception declared by the interface; not thrown directly here
+     */
+    @Override
+    public void close() throws Exception {
+    }
+
+    /**
+     * determines whether the recognizer is currently active. used for testing.
+     *
+     * @return the internal flag set at the end of the last processed frame
+     */
+    boolean isActive() {
+        return this.active;
+    }
+}
diff --git a/src/main/java/io/spokestack/spokestack/profile/TFWakewordEmptyASR.java b/src/main/java/io/spokestack/spokestack/profile/TFWakewordEmptyASR.java
@@ -0,0 +1,62 @@
+package io.spokestack.spokestack.profile;
+
+import io.spokestack.spokestack.PipelineProfile;
+import io.spokestack.spokestack.SpeechPipeline;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A speech pipeline profile that pairs TensorFlow Lite wakeword detection
+ * with a no-op ASR stage.
+ *
+ * <p>
+ * The profile sets the input class, the ordered stage list, and the
+ * {@code ans-policy}, {@code vad-mode}, {@code vad-fall-delay},
+ * {@code wake-threshold}, and {@code pre-emphasis} properties. Wakeword
+ * detection also needs to know where its models live; the following
+ * properties are not set here and must be supplied elsewhere:
+ * </p>
+ *
+ * <ul>
+ *   <li>
+ *      <b>wake-filter-path</b> (string, required): file system path to the
+ *      "filter" Tensorflow-Lite model, which is used to calculate a mel
+ *      spectrogram frame from the linear STFT; its inputs should be shaped
+ *      [fft-width], and its outputs [mel-width]
+ *   </li>
+ *   <li>
+ *      <b>wake-encode-path</b> (string, required): file system path to the
+ *      "encode" Tensorflow-Lite model, which is used to perform each
+ *      autoregressive step over the mel frames; its inputs should be shaped
+ *      [mel-length, mel-width], and its outputs [encode-width], with an
+ *      additional state input/output shaped [state-width]
+ *   </li>
+ *   <li>
+ *      <b>wake-detect-path</b> (string, required): file system path to the
+ *      "detect" Tensorflow-Lite model; its inputs should be shaped
+ *      [encode-length, encode-width], and its outputs [1]
+ *   </li>
+ * </ul>
+ *
+ * @see io.spokestack.spokestack.asr.EmptyRecognizer
+ * @see io.spokestack.spokestack.wakeword.WakewordTrigger
+ */
+public class TFWakewordEmptyASR implements PipelineProfile {
+
+    // pipeline stages, listed in the order they are handed to the builder
+    private static final String[] STAGE_CLASSES = {
+          "io.spokestack.spokestack.webrtc.AutomaticGainControl",
+          "io.spokestack.spokestack.webrtc.AcousticNoiseSuppressor",
+          "io.spokestack.spokestack.webrtc.VoiceActivityDetector",
+          "io.spokestack.spokestack.wakeword.WakewordTrigger",
+          "io.spokestack.spokestack.asr.EmptyRecognizer",
+    };
+
+    /**
+     * Applies this profile's input class, properties, and stages.
+     *
+     * @param builder the pipeline builder to configure
+     * @return the builder returned by the final configuration call
+     */
+    @Override
+    public SpeechPipeline.Builder apply(SpeechPipeline.Builder builder) {
+        // hand the builder a fresh mutable list on every call
+        List<String> stageNames = new ArrayList<>();
+        for (String stageName : STAGE_CLASSES) {
+            stageNames.add(stageName);
+        }
+
+        return builder
+              .setInputClass(
+                    "io.spokestack.spokestack.android.PreASRMicrophoneInput")
+              .setProperty("ans-policy", "aggressive")
+              .setProperty("vad-mode", "very-aggressive")
+              .setProperty("vad-fall-delay", 800)
+              .setProperty("wake-threshold", 0.9)
+              .setProperty("pre-emphasis", 0.97)
+              .setStageClasses(stageNames);
+    }
+}
diff --git a/src/test/java/io/spokestack/spokestack/SpeechPipelineTest.java b/src/test/java/io/spokestack/spokestack/SpeechPipelineTest.java
@@ -24,6 +24,7 @@ public class SpeechPipelineTest implements OnSpeechEventListener {
           io.spokestack.spokestack.profile.PushToTalkSpokestackASR.class,
           io.spokestack.spokestack.profile.TFWakewordAndroidASR.class,
           io.spokestack.spokestack.profile.TFWakewordAzureASR.class,
+          io.spokestack.spokestack.profile.TFWakewordEmptyASR.class,
           io.spokestack.spokestack.profile.TFWakewordGoogleASR.class,
           io.spokestack.spokestack.profile.TFWakewordKeywordASR.class,
           io.spokestack.spokestack.profile.TFWakewordSpokestackASR.class,

diff --git a/src/test/java/io/spokestack/spokestack/asr/EmptyRecognizerTest.java b/src/test/java/io/spokestack/spokestack/asr/EmptyRecognizerTest.java
@@ -0,0 +1,32 @@
+package io.spokestack.spokestack.asr;
+
+import io.spokestack.spokestack.SpeechConfig;
+import io.spokestack.spokestack.SpeechContext;
+import org.junit.Test;
+
+import static org.junit.Assert.*;
+
+/**
+ * Exercises the one-frame activation behavior of {@link EmptyRecognizer}.
+ */
+public class EmptyRecognizerTest {
+
+    @Test
+    public void testProcess() throws Exception {
+        SpeechConfig speechConfig = new SpeechConfig();
+        EmptyRecognizer stage = new EmptyRecognizer(speechConfig);
+
+        // a freshly constructed stage starts out inactive
+        assertFalse(stage.isActive());
+
+        // processing with an inactive context leaves the stage untouched
+        SpeechContext speechContext = new SpeechContext(speechConfig);
+        stage.process(speechContext, null);
+        assertFalse(stage.isActive());
+
+        // frame 1 after activation: the stage records the activity but
+        // leaves the context active
+        speechContext.setActive(true);
+        stage.process(speechContext, null);
+        assertTrue(stage.isActive());
+        assertTrue(speechContext.isActive());
+
+        // frame 2 after activation: the context is deactivated and the
+        // stage's own flag follows it
+        stage.process(speechContext, null);
+        assertFalse(speechContext.isActive());
+        assertFalse(stage.isActive());
+    }
+}