
Commit 1446c54

Merge pull request #30 from spokestack/jz-platform-asr

Support ASR via Android's built-in SpeechRecognizer

space-pope committed Jan 13, 2020
2 parents: c75da2e + 24113a3

Showing 11 changed files with 538 additions and 16 deletions.
2 changes: 1 addition & 1 deletion src/main/java/io/spokestack/spokestack/SpeechConfig.java
@@ -109,7 +109,7 @@ public int getInteger(String key) {
}

/**
* fetches an string value, coercing if needed.
* fetches a double value, coercing if needed.
* @param key key to look up
* @param defaultValue value to return if not found
* @return the double configuration value if found, defaultValue otherwise
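The corrected accessor reads a double, coercing the stored value if needed and falling back to the supplied default when the key is missing. A minimal usage sketch; the put() setter and the key name used here are assumptions for illustration, not part of this diff:

SpeechConfig config = new SpeechConfig();
// assumed setter; value stored as a String for the sake of the example
config.put("vad-fall-delay", "500");
// getDouble coerces the stored String to a double; the second argument
// is returned when the key is absent
double fallDelay = config.getDouble("vad-fall-delay", 500.0);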
20 changes: 20 additions & 0 deletions src/main/java/io/spokestack/spokestack/SpeechContext.java
@@ -1,5 +1,8 @@
package io.spokestack.spokestack;

import android.content.Context;
import androidx.annotation.Nullable;

import java.util.Deque;
import java.util.List;
import java.util.ArrayList;
@@ -68,6 +71,7 @@ public int value() {

private final List<OnSpeechEventListener> listeners = new ArrayList<>();
private final int traceLevel;
private Context appContext;
private Deque<ByteBuffer> buffer;
private boolean speech;
private boolean active;
@@ -86,6 +90,22 @@ public SpeechContext(SpeechConfig config) {
TraceLevel.NONE.value());
}

/**
* @return the Android context if set
*/
@Nullable
public Context getAndroidContext() {
return appContext;
}

/**
* sets the Android context.
* @param androidContext The Android context
*/
public void setAndroidContext(@Nullable Context androidContext) {
this.appContext = androidContext;
}

/** @return speech frame buffer */
public Deque<ByteBuffer> getBuffer() {
return this.buffer;
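These accessors let components retrieve an Android Context at runtime and let applications replace it, for example when the pipeline outlives a single Activity. A brief sketch; it assumes an existing pipeline object that exposes its SpeechContext via getContext(), which is not shown in this diff:

// re-point the SpeechContext at a long-lived context so components such as
// AndroidSpeechRecognizer keep working after the original Activity is gone
// (assumes this code runs inside an Activity)
pipeline.getContext().setAndroidContext(getApplicationContext());

// consumers should null-check, since the context may never have been set
Context androidContext = pipeline.getContext().getAndroidContext();
if (androidContext == null) {
    // skip Context-dependent setup or report a configuration error
}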
18 changes: 18 additions & 0 deletions src/main/java/io/spokestack/spokestack/SpeechPipeline.java
@@ -1,5 +1,7 @@
package io.spokestack.spokestack;

import android.content.Context;

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
@@ -83,6 +85,7 @@ private SpeechPipeline(Builder builder) {
this.stageClasses = builder.stageClasses;
this.config = builder.config;
this.context = new SpeechContext(this.config);
this.context.setAndroidContext(builder.appContext);
this.stages = new ArrayList<>();

for (OnSpeechEventListener l : builder.listeners) {
@@ -255,6 +258,7 @@ public static final class Builder {
private String inputClass;
private List<String> stageClasses = new ArrayList<>();
private SpeechConfig config = new SpeechConfig();
private Context appContext;
private List<OnSpeechEventListener> listeners = new ArrayList<>();

/**
@@ -310,6 +314,20 @@ public Builder setConfig(SpeechConfig value) {
return this;
}

/**
* Sets the android context for the pipeline. Some components may
* require an application context instead of an activity context;
* see individual component documentation for details.
*
* @param androidContext the android context for the pipeline.
* @return this
* @see io.spokestack.spokestack.android.AndroidSpeechRecognizer
*/
public Builder setAndroidContext(Context androidContext) {
this.appContext = androidContext;
return this;
}

/**
* sets a pipeline configuration value.
*
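Combined with the recognizer added below, the new builder method supports a pipeline configuration like the following sketch. The input and VAD stage classes named here, and the appContext variable, are assumptions about a typical setup rather than code from this commit:

SpeechPipeline pipeline = new SpeechPipeline.Builder()
        .setInputClass("io.spokestack.spokestack.android.MicrophoneInput")
        .addStageClass("io.spokestack.spokestack.webrtc.VoiceActivityDetector")
        .addStageClass(
            "io.spokestack.spokestack.android.AndroidSpeechRecognizer")
        // prefer the application context so the pipeline can outlive
        // any single Activity
        .setAndroidContext(appContext)
        .build();
// pipeline.start() would then begin processing audio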
199 changes: 199 additions & 0 deletions src/main/java/io/spokestack/spokestack/android/AndroidSpeechRecognizer.java
@@ -0,0 +1,199 @@
package io.spokestack.spokestack.android;

import android.content.Context;
import android.content.Intent;
import android.os.Bundle;
import android.speech.RecognitionListener;
import android.speech.RecognizerIntent;
import android.speech.SpeechRecognizer;
import io.spokestack.spokestack.SpeechConfig;
import io.spokestack.spokestack.SpeechContext;
import io.spokestack.spokestack.SpeechProcessor;
import io.spokestack.spokestack.util.TaskHandler;

import java.nio.ByteBuffer;
import java.util.ArrayList;


/**
* Speech recognition using built-in Android APIs.
*
* <p>
* This component uses the built-in Android {@code SpeechRecognizer} to process
* user speech.
* </p>
*
* <p>
* As part of normal operation, {@code SpeechRecognizer} plays system sounds
* both when it starts and stops actively listening to the user, just like the
* built-in Google Assistant. This behavior is not optional; it can be
 * suppressed by having the {@code AudioManager} mute the music stream, but
* muting and restoring the volume of that stream at exactly the right times is
* error-prone, so such behavior has been omitted from this component.
* </p>
*
* <p>
* Note that this component requires an Android {@code Context} to be attached
* to the pipeline that has created it. If the pipeline is meant to persist
* across different {@code Activity}s, the {@code Context} used must either be
* the <em>application</em> context, or it must be re-set on the pipeline's
* {@code SpeechContext} object when the Activity context changes.
* </p>
*
* <p>
* Implementation of {@code SpeechRecognizer} is left up to devices, and even
* though the API exists, an actual recognizer may not be present on all
* devices. If using this component, it's a good idea to call {@code
* SpeechRecognizer.isRecognitionAvailable()} before adding it to the pipeline
* to determine whether it will be viable on the current device.
* </p>
*
* <p>
* In addition, testing has shown that some older devices may return {@code
* true} for the preceding call but have outdated implementations that
* consistently throw errors. For this reason, it's a good idea to have an
* {@link io.spokestack.spokestack.OnSpeechEventListener} set up to detect
* {@link SpeechRecognizerError}s and have an appropriate fallback strategy in
* place.
* </p>
*/
public final class AndroidSpeechRecognizer implements SpeechProcessor {
private boolean streaming;
private SpeechRecognizer speechRecognizer;
private TaskHandler taskHandler;

/**
* Initializes a new recognizer.
*
* @param speechConfig Spokestack pipeline configuration
*/
@SuppressWarnings("unused")
public AndroidSpeechRecognizer(SpeechConfig speechConfig) {
this.streaming = false;
this.taskHandler = new TaskHandler(true);
}

/**
* Create an instance of the recognizer with an injected {@link
* TaskHandler}. Used for testing.
*
* @param speechConfig Spokestack pipeline configuration
* @param handler The task handler used to interact with the speech
* recognizer.
*/
AndroidSpeechRecognizer(SpeechConfig speechConfig,
TaskHandler handler) {
this(speechConfig);
this.taskHandler = handler;
}

@Override
public void process(SpeechContext context, ByteBuffer frame) {
if (this.speechRecognizer == null) {
createRecognizer(context);
}

if (context.isActive()) {
if (!this.streaming) {
begin();
this.streaming = true;
}
} else {
this.streaming = false;
}
}

private void createRecognizer(SpeechContext context) {
this.taskHandler.run(() -> {
Context androidContext = context.getAndroidContext();
this.speechRecognizer =
SpeechRecognizer.createSpeechRecognizer(androidContext);
this.speechRecognizer.setRecognitionListener(
new SpokestackListener(context));
});
}

private void begin() {
this.taskHandler.run(() -> {
Intent recognitionIntent = createRecognitionIntent();
this.speechRecognizer.startListening(recognitionIntent);
});
}

private Intent createRecognitionIntent() {
Intent intent = new Intent(RecognizerIntent.ACTION_RECOGNIZE_SPEECH);
intent.putExtra(RecognizerIntent.EXTRA_LANGUAGE_MODEL,
RecognizerIntent.LANGUAGE_MODEL_FREE_FORM);
// added in API level 23
intent.putExtra("android.speech.extra.PREFER_OFFLINE", true);
return intent;
}

@Override
public void close() {
this.taskHandler.run(() -> this.speechRecognizer.destroy());
}

/**
* An internal listener used to dispatch events from the Android speech
* recognizer to the Spokestack {@link SpeechContext}.
*/
private static class SpokestackListener implements RecognitionListener {
private final SpeechContext context;

SpokestackListener(SpeechContext speechContext) {
this.context = speechContext;
}

@Override
public void onError(int error) {
this.context.setError(new SpeechRecognizerError(error));
this.context.dispatch(SpeechContext.Event.ERROR);
}

@Override
public void onResults(Bundle results) {
String transcript = extractTranscript(results);
float confidence = extractConfidence(results);
this.context.setTranscript(transcript);
this.context.setConfidence(confidence);
this.context.dispatch(SpeechContext.Event.RECOGNIZE);
}

private String extractTranscript(Bundle results) {
ArrayList<String> nBest = results.getStringArrayList(
SpeechRecognizer.RESULTS_RECOGNITION);
return nBest.get(0);
}

private float extractConfidence(Bundle results) {
float[] confidences = results.getFloatArray(
SpeechRecognizer.CONFIDENCE_SCORES);
return confidences.length > 0 ? confidences[0] : 0.0f;
}

// other methods required by RecognitionListener but useless for our
// current purposes

@Override
public void onReadyForSpeech(Bundle params) { }

@Override
public void onBeginningOfSpeech() { }

@Override
public void onRmsChanged(float rmsdB) { }

@Override
public void onBufferReceived(byte[] buffer) { }

@Override
public void onEndOfSpeech() { }

@Override
public void onPartialResults(Bundle partialResults) { }

@Override
public void onEvent(int eventType, Bundle params) { }
}
}
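Following the guidance in the class Javadoc, an app can gate this stage on recognizer availability and watch for recognizer errors at runtime. A sketch under stated assumptions: the builder and appContext variables, the addStageClass/addOnSpeechEventListener builder methods, and SpeechContext.getError() are taken from the surrounding API as understood here, not from this commit:

// only add the platform ASR stage when the device reports a recognizer
if (SpeechRecognizer.isRecognitionAvailable(appContext)) {
    builder.addStageClass(
        "io.spokestack.spokestack.android.AndroidSpeechRecognizer");
}

// watch for failures from outdated implementations and fall back
builder.addOnSpeechEventListener((event, speechContext) -> {
    if (event == SpeechContext.Event.ERROR
            && speechContext.getError() instanceof SpeechRecognizerError) {
        // e.g. rebuild the pipeline with a cloud recognizer stage
    }
});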
50 changes: 50 additions & 0 deletions src/main/java/io/spokestack/spokestack/android/SpeechRecognizerError.java
@@ -0,0 +1,50 @@
package io.spokestack.spokestack.android;

/**
* A simple exception class that wraps error codes from {@code
* android.speech.SpeechRecognizer}.
*/
public class SpeechRecognizerError extends Exception {

/**
* Create a new SpeechRecognizerError from an error code provided by the
* Android system.
*
* @param errorCode The Android system error code.
*/
public SpeechRecognizerError(int errorCode) {
super("SpeechRecognizer error code " + errorCode + ": "
+ SpeechRecognizerError.errorDescription(errorCode));
}

private static String errorDescription(int errorCode) {
if (errorCode < Description.VALUES.length) {
return Description.VALUES[errorCode].toString();
} else {
return Description.UNKNOWN_ERROR.toString();
}
}

/**
* An enumeration of the SpeechRecognizer error descriptions aligned with
* their integer constant values.
*/
@SuppressWarnings("checkstyle:javadocvariable")
public enum Description {
UNKNOWN_ERROR,
NETWORK_TIMEOUT,
NETWORK_ERROR,
AUDIO_RECORDING_ERROR,
SERVER_ERROR,
CLIENT_ERROR,
SPEECH_TIMEOUT,
NO_RECOGNITION_MATCH,
RECOGNIZER_BUSY,
INSUFFICIENT_PERMISSIONS;

/**
* A cache of the error descriptions to reduce overhead accessing them.
*/
public static final Description[] VALUES = values();
}
}
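The Description ordinals are padded with UNKNOWN_ERROR at index 0 so that they line up with the integer constants defined by android.speech.SpeechRecognizer (ERROR_NETWORK_TIMEOUT = 1 through ERROR_INSUFFICIENT_PERMISSIONS = 9). A quick illustration of the resulting messages:

// SpeechRecognizer.ERROR_SPEECH_TIMEOUT is 6, matching Description ordinal 6
new SpeechRecognizerError(6).getMessage();
// -> "SpeechRecognizer error code 6: SPEECH_TIMEOUT"

// codes outside the known range fall back to UNKNOWN_ERROR
new SpeechRecognizerError(42).getMessage();
// -> "SpeechRecognizer error code 42: UNKNOWN_ERROR"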
@@ -204,7 +204,7 @@ public void onError(Throwable e) {
public void onCompleted() {
this.context.setTranscript(this.transcript);
this.context.setConfidence(this.confidence);
if (this.transcript != "")
if (!this.transcript.equals(""))
this.context.dispatch(SpeechContext.Event.RECOGNIZE);
else
this.context.dispatch(SpeechContext.Event.TIMEOUT);
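The change above replaces a reference comparison with a value comparison: in Java, != on Strings checks object identity rather than character content, so a transcript built up at runtime would rarely compare equal to the literal "". A standalone illustration, not code from this commit:

String transcript = new String("");          // same content, different object
boolean byReference = (transcript != "");    // true: distinct references
boolean byValue = !transcript.equals("");    // false: contents are equal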
@@ -30,15 +30,15 @@ public String getUrl() {
/**
* Wrapper class used for deserializing synthesis responses.
*/
private class ResponseData {
private static class ResponseData {
private ResponseMethod synthesizeText;
private ResponseMethod synthesizeSsml;
}

/**
* Wrapper class used for deserializing synthesis responses.
*/
private class ResponseMethod {
private static class ResponseMethod {
private String url;
}
}