This repository has been archived by the owner on May 6, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: wakeword-only profile and empty ASR
This adds a no-op ASR and new pipeline profile for a wakeword-only use case. Upon successful wakeword recognition, the pipeline remains active for a single frame and is then deactivated.
- Loading branch information
1 parent
f9f7c13
commit 8afb3c6
Showing
4 changed files
with
162 additions
and
0 deletions.
There are no files selected for viewing
67 changes: 67 additions & 0 deletions
67
src/main/java/io/spokestack/spokestack/asr/EmptyRecognizer.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
package io.spokestack.spokestack.asr; | ||
|
||
import io.spokestack.spokestack.SpeechConfig; | ||
import io.spokestack.spokestack.SpeechContext; | ||
import io.spokestack.spokestack.SpeechProcessor; | ||
|
||
import java.nio.ByteBuffer; | ||
|
||
/** | ||
* Empty speech recognizer | ||
* | ||
* <p> | ||
* This recognizer is designed for use in profiles that want to skip ASR | ||
* entirely, dispatching only activate and deactivate events from a wakeword | ||
* recognizer. | ||
* </p> | ||
* | ||
* <p> | ||
* Once the wakeword is recognized, this stage allows the pipeline to remain | ||
* active for a single frame then deactivates it. | ||
* </p> | ||
*/ | ||
public class EmptyRecognizer implements SpeechProcessor { | ||
|
||
private boolean active = false; | ||
|
||
/** | ||
* initializes a new recognizer instance. | ||
* | ||
* @param speechConfig Spokestack speech configuration | ||
*/ | ||
public EmptyRecognizer(SpeechConfig speechConfig) { | ||
// no configuration necessary | ||
} | ||
|
||
@Override | ||
public void process(SpeechContext context, ByteBuffer frame) | ||
throws Exception { | ||
// all we want to do is return control to the wakeword component, so | ||
// simply deactivate the context. this allows multiple wakeword | ||
// utterances to be recognized in quick succession. | ||
// we want to leave the context active for one frame, though, so the | ||
// wakeword trigger has a chance to recognize the activity and reset | ||
// itself when we deactivate on the following frame; otherwise, we'll | ||
// get repeated activations as the wakeword trigger fires for multiple | ||
// frames in a row. | ||
if (this.active) { | ||
context.setActive(false); | ||
} | ||
this.active = context.isActive(); | ||
} | ||
|
||
@Override | ||
public void reset() throws Exception { | ||
} | ||
|
||
@Override | ||
public void close() throws Exception { | ||
} | ||
|
||
/** | ||
* determines whether the recognizer is currently active. used for testing. | ||
*/ | ||
boolean isActive() { | ||
return this.active; | ||
} | ||
} |
62 changes: 62 additions & 0 deletions
62
src/main/java/io/spokestack/spokestack/profile/TFWakewordEmptyASR.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
package io.spokestack.spokestack.profile; | ||
|
||
import io.spokestack.spokestack.PipelineProfile; | ||
import io.spokestack.spokestack.SpeechPipeline; | ||
|
||
import java.util.ArrayList; | ||
import java.util.List; | ||
|
||
/** | ||
* A speech pipeline profile that uses TensorFlow Lite for wakeword detection | ||
* and no ASR. | ||
* | ||
* <p> | ||
* Wakeword detection requires configuration to locate the models used for | ||
* classification; these properties must be set elsewhere: | ||
* </p> | ||
* | ||
* <ul> | ||
* <li> | ||
* <b>wake-filter-path</b> (string, required): file system path to the | ||
* "filter" Tensorflow-Lite model, which is used to calculate a mel | ||
* spectrogram frame from the linear STFT; its inputs should be shaped | ||
* [fft-width], and its outputs [mel-width] | ||
* </li> | ||
* <li> | ||
* <b>wake-encode-path</b> (string, required): file system path to the | ||
* "encode" Tensorflow-Lite model, which is used to perform each | ||
* autoregressive step over the mel frames; its inputs should be shaped | ||
* [mel-length, mel-width], and its outputs [encode-width], with an | ||
* additional state input/output shaped [state-width] | ||
* </li> | ||
* <li> | ||
* <b>wake-detect-path</b> (string, required): file system path to the | ||
* "detect" Tensorflow-Lite model; its inputs should be shaped | ||
* [encode-length, encode-width], and its outputs [1] | ||
* </li> | ||
* </ul> | ||
* | ||
* @see io.spokestack.spokestack.asr.EmptyRecognizer | ||
* @see io.spokestack.spokestack.wakeword.WakewordTrigger | ||
*/ | ||
public class TFWakewordEmptyASR implements PipelineProfile { | ||
@Override | ||
public SpeechPipeline.Builder apply(SpeechPipeline.Builder builder) { | ||
List<String> stages = new ArrayList<>(); | ||
stages.add("io.spokestack.spokestack.webrtc.AutomaticGainControl"); | ||
stages.add("io.spokestack.spokestack.webrtc.AcousticNoiseSuppressor"); | ||
stages.add("io.spokestack.spokestack.webrtc.VoiceActivityDetector"); | ||
stages.add("io.spokestack.spokestack.wakeword.WakewordTrigger"); | ||
stages.add("io.spokestack.spokestack.asr.EmptyRecognizer"); | ||
|
||
return builder | ||
.setInputClass( | ||
"io.spokestack.spokestack.android.PreASRMicrophoneInput") | ||
.setProperty("ans-policy", "aggressive") | ||
.setProperty("vad-mode", "very-aggressive") | ||
.setProperty("vad-fall-delay", 800) | ||
.setProperty("wake-threshold", 0.9) | ||
.setProperty("pre-emphasis", 0.97) | ||
.setStageClasses(stages); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
32 changes: 32 additions & 0 deletions
32
src/test/java/io/spokestack/spokestack/asr/EmptyRecognizerTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
package io.spokestack.spokestack.asr; | ||
|
||
import io.spokestack.spokestack.SpeechConfig; | ||
import io.spokestack.spokestack.SpeechContext; | ||
import org.junit.Test; | ||
|
||
import static org.junit.Assert.*; | ||
|
||
public class EmptyRecognizerTest { | ||
|
||
@Test | ||
public void testProcess() throws Exception { | ||
SpeechConfig config = new SpeechConfig(); | ||
EmptyRecognizer recognizer = new EmptyRecognizer(config); | ||
assertFalse(recognizer.isActive()); | ||
SpeechContext context = new SpeechContext(config); | ||
// context is inactive, so the stage does nothing | ||
recognizer.process(context, null); | ||
assertFalse(recognizer.isActive()); | ||
// the first process call after activation sets the internal flag | ||
// but doesn't deactivate the context | ||
context.setActive(true); | ||
recognizer.process(context, null); | ||
assertTrue(recognizer.isActive()); | ||
assertTrue(context.isActive()); | ||
// another process call deactivates both the context and | ||
// the internal flag | ||
recognizer.process(context, null); | ||
assertFalse(context.isActive()); | ||
assertFalse(recognizer.isActive()); | ||
} | ||
} |