diff --git a/README.md b/README.md index c88658e..f57d191 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ Spokestack provides an extensible speech recognition pipeline for the Android platform. It includes a variety of built-in speech processors for Voice Activity Detection (VAD) and Automatic Speech Recognition (ASR) via popular -speech recognition services, such as the Google Speech API and Bing Speech +speech recognition services such as the Google Speech API and Azure Speech API. See the [documentation](https://spokestack.io/docs) for a lot more information diff --git a/pom.xml b/pom.xml index 6c67c8b..dfd94ee 100644 --- a/pom.xml +++ b/pom.xml @@ -47,6 +47,10 @@ jcenter https://jcenter.bintray.com/ + + microsoft + https://csspeechstorage.blob.core.windows.net/maven/ + @@ -106,7 +110,16 @@ provided - + + + com.microsoft.cognitiveservices.speech + client-sdk + 1.9.0 + aar + provided + + + com.squareup.okhttp3 okhttp @@ -287,7 +300,7 @@ INSTRUCTION COVEREDRATIO - 0.85 + 0.8 diff --git a/src/main/java/io/spokestack/spokestack/microsoft/AzureSpeechRecognizer.java b/src/main/java/io/spokestack/spokestack/microsoft/AzureSpeechRecognizer.java new file mode 100644 index 0000000..fe3c2e1 --- /dev/null +++ b/src/main/java/io/spokestack/spokestack/microsoft/AzureSpeechRecognizer.java @@ -0,0 +1,255 @@ +package io.spokestack.spokestack.microsoft; + +import com.microsoft.cognitiveservices.speech.CancellationReason; +import com.microsoft.cognitiveservices.speech.ProfanityOption; +import com.microsoft.cognitiveservices.speech.ResultReason; +import com.microsoft.cognitiveservices.speech.SpeechRecognitionCanceledEventArgs; +import com.microsoft.cognitiveservices.speech.SpeechRecognitionEventArgs; +import com.microsoft.cognitiveservices.speech.SpeechRecognizer; +import com.microsoft.cognitiveservices.speech.audio.AudioConfig; +import com.microsoft.cognitiveservices.speech.audio.AudioInputStream; +import com.microsoft.cognitiveservices.speech.audio.PushAudioInputStream; 
+import com.microsoft.cognitiveservices.speech.util.EventHandler; +import io.spokestack.spokestack.SpeechConfig; +import io.spokestack.spokestack.SpeechContext; +import io.spokestack.spokestack.SpeechProcessor; + +import java.nio.ByteBuffer; +import java.nio.ByteOrder; + +/** + * microsoft azure speech service recognizer + * + *

+ * This component implements the speech processor interface using the Azure + * Speech Service for speech recognition. + *

+ * + *

+ * When the speech context is triggered, the recognizer begins streaming + * buffered frames to the API for recognition. Once the speech context becomes + * inactive, the recognizer raises a RECOGNIZE event along with the audio + * transcript. Unfortunately, the Azure Speech SDK currently doesn't return + * confidence values alongside transcripts, so confidence is always set to 1.0. + *

+ * + *

+ * Use of the Azure Speech Service implies acceptance of Microsoft's license + * terms, which can be found + * + * here. + *

+ * + *

+ * This pipeline component requires the following configuration properties: + *

+ * + */ +public class AzureSpeechRecognizer implements SpeechProcessor { + private final com.microsoft.cognitiveservices.speech.SpeechConfig msConfig; + + private SpeechRecognizer recognizer; + private PushAudioInputStream audioStream; + private AudioConfig audioConfig; + private boolean active; + + // Azure speech requires little-endian (wav-format) data, so we buffer + // audio frames internally to avoid mutating data coming from the speech + // context + private ByteBuffer buffer; + + /** + * initializes a new recognizer instance. + * + * @param speechConfig Spokestack speech configuration + */ + public AzureSpeechRecognizer(SpeechConfig speechConfig) { + String apiKey = speechConfig.getString("azure-api-key"); + String region = speechConfig.getString("azure-region"); + int sampleRate = speechConfig.getInteger("sample-rate"); + + if (sampleRate != 16000) { + throw new IllegalArgumentException( + "Azure only supports a 16kHz sample rate; found: " + + sampleRate); + } + + this.buffer = ByteBuffer.allocateDirect(4096) + .order(ByteOrder.LITTLE_ENDIAN); + this.msConfig = createMsConfig(apiKey, region); + } + + com.microsoft.cognitiveservices.speech.SpeechConfig createMsConfig( + String apiKey, String region) { + com.microsoft.cognitiveservices.speech.SpeechConfig config = + com.microsoft.cognitiveservices.speech.SpeechConfig + .fromSubscription(apiKey, region); + config.setProfanity(ProfanityOption.Raw); + return config; + } + + /** + * releases the resources associated with the recognizer. + */ + public void close() { + if (this.audioStream != null) { + this.audioStream.close(); + this.audioStream = null; + } + if (this.recognizer != null) { + this.recognizer.close(); + this.recognizer = null; + } + } + + /** + * processes a frame of audio. + * + * @param speechContext the current speech context + * @param frame the audio frame to detect + * + * @throws Exception if there is an error performing active recognition. 
+ */ + public void process(SpeechContext speechContext, ByteBuffer frame) + throws Exception { + if (speechContext.isActive() && !this.active) { + begin(speechContext); + } else if (!speechContext.isActive() && this.active) { + commit(); + } else if (speechContext.isActive()) { + bufferFrame(frame); + } + } + + void begin(SpeechContext speechContext) { + this.audioStream = AudioInputStream.createPushStream(); + this.audioConfig = AudioConfig.fromStreamInput(this.audioStream); + this.recognizer = createRecognizer(speechContext); + recognizer.startContinuousRecognitionAsync(); + this.active = true; + + // send any existing frames into the stream + for (ByteBuffer frame : speechContext.getBuffer()) { + bufferFrame(frame); + } + } + + SpeechRecognizer createRecognizer(SpeechContext context) { + // factored into a separate method for testing + SpeechRecognizer rec = new SpeechRecognizer(msConfig, audioConfig); + listen(rec, context); + return rec; + } + + private void listen(SpeechRecognizer rec, SpeechContext context) { + RecognitionListener recognitionListener = + new RecognitionListener(context); + rec.recognized.addEventListener(recognitionListener); + + CancellationListener cancellationListener = + new CancellationListener(context); + rec.canceled.addEventListener(cancellationListener); + } + + void bufferFrame(ByteBuffer frame) { + if (frame != null) { + if (this.buffer.remaining() < frame.capacity()) { + flush(); + } + + frame.rewind(); + this.buffer.put(frame); + } + } + + void commit() throws Exception { + // send the end of audio + flush(); + this.audioStream.close(); + this.recognizer.stopContinuousRecognitionAsync().get(); + this.recognizer.close(); + this.audioConfig.close(); + this.active = false; + } + + private void flush() { + if (this.buffer.hasArray()) { + this.buffer.flip(); + this.audioStream.write(this.buffer.array()); + this.buffer.clear(); + } + } + + /** + * Listener for Speech SDK recognition events. 
+ */ + static class RecognitionListener + implements EventHandler { + private SpeechContext speechContext; + + RecognitionListener(SpeechContext context) { + this.speechContext = context; + } + + @Override + public void onEvent( + Object sender, + SpeechRecognitionEventArgs recognitionArgs) { + if (recognitionArgs.getResult().getReason() + == ResultReason.RecognizedSpeech) { + String transcript = recognitionArgs.getResult().getText(); + this.speechContext.setTranscript(transcript); + this.speechContext.setConfidence(1.0); + this.speechContext.dispatch(SpeechContext.Event.RECOGNIZE); + } + } + } + + /** + * Listener for Speech SDK cancellation events. + */ + static class CancellationListener + implements EventHandler { + + private SpeechContext speechContext; + + CancellationListener(SpeechContext context) { + this.speechContext = context; + } + + @Override + public void onEvent( + Object sender, + SpeechRecognitionCanceledEventArgs cancellationArgs) { + if (cancellationArgs.getReason() + == CancellationReason.Error) { + + String message = String.format( + "%s (error code %s)", + cancellationArgs.getErrorDetails(), + cancellationArgs.getErrorCode().name()); + + this.speechContext.setError(new Exception(message)); + this.speechContext.dispatch(SpeechContext.Event.ERROR); + } + } + } +} diff --git a/src/main/java/io/spokestack/spokestack/microsoft/BingSpeechClient.java b/src/main/java/io/spokestack/spokestack/microsoft/BingSpeechClient.java deleted file mode 100644 index caaf978..0000000 --- a/src/main/java/io/spokestack/spokestack/microsoft/BingSpeechClient.java +++ /dev/null @@ -1,375 +0,0 @@ -package io.spokestack.spokestack.microsoft; - -import java.util.Date; -import java.util.Map; -import java.util.TimeZone; -import java.util.UUID; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.text.SimpleDateFormat; - -import com.google.gson.Gson; - -import okio.ByteString; -import okhttp3.OkHttpClient; -import okhttp3.Request; -import 
okhttp3.RequestBody; -import okhttp3.Response; -import okhttp3.WebSocket; -import okhttp3.WebSocketListener; - -/** - * microsoft bing speech api client. - * - *

- * This class uses the Microsoft speech recognition service for automatic - * speech recognition. It implements the client side of the websocket protocol - * defined here. Transcripts and errors - * are delivered to the client asynchronously via the {@link Listener} - * interface. The following sample demonstrates how to configure and use the - * client. - *

- * - *
- * {@code
- *  BingSpeechClient client = new BingSpeechClient.Builder()
- *      .setApiKey("")
- *      .setLocale("en-US")
- *      .setSampleRate(16000)
- *      .setListener(this)
- *      .build();
- *
- *  client.connect();
- *
- *  client.beginAudio();
- *  for (ByteBuffer frame: frames)
- *      client.sendAudio(frame);
- *  client.endAudio();
- *
- *  client.disconnect();
- * }
- * 
- */ -public class BingSpeechClient implements AutoCloseable { - private static final int MESSAGE_BUFFER_SIZE = 8192; - private static final String TOKEN_URL = - "https://api.cognitive.microsoft.com/sts/v1.0/issueToken"; - private static final String TOKEN_HEADER_APIKEY = - "Ocp-Apim-Subscription-Key"; - private static final String SOCKET_URL = - "wss://speech.platform.bing.com" - + "/speech/recognition/conversation/cognitiveservices/v1?language=%s"; - - private final int sampleRate; - private final String locale; - private final Listener listener; - private final OkHttpClient client; - private final String token; - private final ByteBuffer buffer; - private WebSocket socket; - private String requestId; - - BingSpeechClient(Builder builder) throws Exception { - this.sampleRate = builder.sampleRate; - this.locale = builder.locale; - this.listener = builder.listener; - this.buffer = ByteBuffer.allocateDirect(MESSAGE_BUFFER_SIZE); - this.client = builder.getHttpClient(); - - Request tokenRequest = new Request.Builder() - .url(TOKEN_URL) - .addHeader(TOKEN_HEADER_APIKEY, builder.apiKey) - .post(RequestBody.create(null, new byte[0])) - .build(); - Response tokenResponse = this.client - .newCall(tokenRequest) - .execute(); - if (tokenResponse.code() != 200) - throw new Exception("authentication failed"); - this.token = tokenResponse.body().string(); - } - - /** - * releases the resources associated with the speech client. - */ - public void close() { - disconnect(); - } - - /** - * @return true if the websocket is currently connect, false otherwise - */ - public boolean isConnected() { - return this.socket != null; - } - - /** - * @return the request identifier of the current turn - */ - public String getRequestId() { - return this.requestId; - } - - /** - * establishes a websocket connection for speech recognition. 
- */ - public void connect() { - if (this.socket != null) - throw new IllegalStateException(); - - String connectionId = UUID.randomUUID().toString().replace("-", ""); - Request request = new Request.Builder() - .url(String.format(SOCKET_URL, this.locale)) - .addHeader("Authorization", String.format("Bearer %s", this.token)) - .addHeader("X-ConnectionId", connectionId) - .build(); - this.socket = this.client.newWebSocket(request, new SocketListener()); - } - - /** - * disconnects the websocket. - */ - public void disconnect() { - if (this.socket != null) { - try { - this.socket.close(1000, "goodbye"); - } finally { - this.socket = null; - this.requestId = null; - this.buffer.clear(); - } - } - } - - /** - * begins audio transmission for a new conversation turn. - * @throws Exception on error - */ - public void beginAudio() throws Exception { - if (this.socket == null) - throw new IllegalStateException(); - - this.requestId = UUID.randomUUID().toString().replace("-", ""); - sendConfig(); - sendHeader(); - } - - private void sendConfig() throws Exception { - String header = - "Path:speech.config\r\n" - + String.format("X-RequestId:%s\r\n", this.requestId) - + String.format("X-Timestamp:%s\r\n", now()) - + "Content-Type:application/json; charset=utf-8\r\n"; - String body = "{}"; - String message = String.format("%s\r\n%s", header, body); - this.socket.send(message); - } - - private void sendHeader() throws Exception { - ByteBuffer message = ByteBuffer.allocateDirect(44) - .order(ByteOrder.LITTLE_ENDIAN) // wave endian - .put("RIFF".getBytes("US-ASCII")) // riff chunk id - .putInt(0) // file size - .put("WAVE".getBytes("US-ASCII")) // riff format - .put("fmt ".getBytes("US-ASCII")) // begin format chunk - .putInt(16) // format chunk size - .putShort((short) 1) // format (1=PCM) - .putShort((short) 1) // channel count - .putInt(this.sampleRate) // sample rate - .putInt(this.sampleRate * 2) // byte rate - .putShort((short) 2) // block alignment - .putShort((short) 16) // 
bits per sample - .put("data".getBytes("US-ASCII")) // begin data chunk - .putInt(0); // data chunk size - sendAudio(message); - flush(); - } - - /** - * transmits an audio frame over the websocket. - * @param frame the audio frame buffer to send - * @throws Exception on error - */ - public void sendAudio(ByteBuffer frame) throws Exception { - if (this.socket == null) - throw new IllegalStateException(); - - if (frame == null || this.buffer.remaining() < frame.capacity()) - flush(); - - if (this.buffer.position() == 0) { - byte[] header = ( - "Path:audio\r\n" - + String.format("X-RequestId:%s\r\n", this.requestId) - + String.format("X-Timestamp:%s\r\n", now()) - + "Content-Type:audio/x-wav\r\n" - ).getBytes("US-ASCII"); - - this.buffer - .order(ByteOrder.BIG_ENDIAN) // protocol endian - .putShort((short) header.length) // message header length - .put(header); // message header - } - - if (frame != null) { - frame.rewind(); - this.buffer.put(frame); // message body - } - } - - /** - * sends an empty audio frame indicating the end of the turn. - * @throws Exception on error - */ - public void endAudio() throws Exception { - if (this.socket == null) - throw new IllegalStateException(); - - sendAudio(null); - flush(); - } - - private void flush() { - if (this.buffer.position() > 0) { - this.buffer.flip(); - this.socket.send(ByteString.of(this.buffer)); - this.buffer.clear(); - } - } - - private String now() { - SimpleDateFormat formatter = - new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'000Z'"); - formatter.setTimeZone(TimeZone.getTimeZone("UTC")); - return formatter.format(new Date()); - } - - /** okhttp socket callback. 
*/ - private class SocketListener extends WebSocketListener { - private final Gson gson = new Gson(); - private final Listener listener = BingSpeechClient.this.listener; - - @Override - public void onMessage(WebSocket s, String message) { - String requestHeader = String.format( - "X-RequestId:%s", - BingSpeechClient.this.requestId - ); - if (message.contains(requestHeader)) - if (message.contains("Path:speech.phrase")) - onSpeechPhrase(message); - } - - @Override - public void onClosed(WebSocket s, int code, String reason) { - if (code != 1000) - this.listener.onError( - new Exception(String.format("%d: %s", code, reason)) - ); - } - - @Override - public void onFailure(WebSocket s, Throwable e, Response r) { - this.listener.onError(e); - } - - private void onSpeechPhrase(String message) { - String body = message.split("\r\n\r\n", 2)[1]; - Map result = gson.fromJson(body, Map.class); - String status = (String) result.get("RecognitionStatus"); - if (status.equals("Success")) - this.listener.onSpeech((String) result.get("DisplayText")); - else if (!status.equals("EndOfDictation")) - this.listener.onSpeech(""); - } - } - - /** - * bing speech client builder. - */ - public static class Builder { - private String apiKey; - private String locale; - private int sampleRate; - private Listener listener; - - /** - * sets the microsoft azure api key for authentication. - * @param value api key to configure - * @return this - */ - public Builder setApiKey(String value) { - this.apiKey = value; - return this; - } - - /** - * sets the country/language code string. - * @param value locale code to configure - * @return this - */ - public Builder setLocale(String value) { - this.locale = value; - return this; - } - - /** - * sets the audio sample rate. - * @param value sample rate to configure, in Hz - * @return this - */ - public Builder setSampleRate(int value) { - this.sampleRate = value; - return this; - } - - /** - * attaches the listener callback. 
- * @param value event listener to configure - * @return this - */ - public Builder setListener(Listener value) { - this.listener = value; - return this; - } - - /** - * initializes a new speech client instance. - * @return the constructed speech client - * @throws Exception on error - */ - public BingSpeechClient build() throws Exception { - if (this.apiKey == null) - throw new IllegalArgumentException("apiKey"); - if (this.locale == null) - throw new IllegalArgumentException("locale"); - if (this.sampleRate == 0) - throw new IllegalArgumentException("sampleRate"); - if (this.listener == null) - throw new IllegalArgumentException("listener"); - return new BingSpeechClient(this); - } - - OkHttpClient getHttpClient() { - return new OkHttpClient.Builder().build(); - } - } - - /** - * speech client listener callback interface. - */ - public interface Listener { - /** - * called when a speech transcription is received. - * @param transcript the speech transcript, or "" if no speech - * was detected - */ - void onSpeech(String transcript); - - /** - * called when a speech detection error occurred. - * @param e the speech error - */ - void onError(Throwable e); - } -} diff --git a/src/main/java/io/spokestack/spokestack/microsoft/BingSpeechRecognizer.java b/src/main/java/io/spokestack/spokestack/microsoft/BingSpeechRecognizer.java deleted file mode 100644 index 073823c..0000000 --- a/src/main/java/io/spokestack/spokestack/microsoft/BingSpeechRecognizer.java +++ /dev/null @@ -1,156 +0,0 @@ -package io.spokestack.spokestack.microsoft; - -import java.nio.ByteBuffer; - -import io.spokestack.spokestack.SpeechConfig; -import io.spokestack.spokestack.SpeechProcessor; -import io.spokestack.spokestack.SpeechContext; - -/** - * microsoft bing speech recognizer - * - *

- * This component implements the speech processor interface using the - * Bing Speech API for speech recognition. It uses the websocket protocol - * defined here to stream audio samples - * and receive recognition results asynchronously. - *

- * - *

- * When the speech context is triggered, the recognizer begins streaming - * buffered frames to the API for recognition. Once the speech context - * becomes inactive, the recognizer raises a RECOGNIZE event along with the - * audio transcript. Unfortunately, the Bing Speech API currently doesn't - * return confidence values alongside transcripts, so confidence is always - * set to 1.0. - *

- * - *

- * This pipeline component requires the following configuration properties: - *

- *
    - *
  • - * sample-rate (integer): audio sampling rate, in Hz - *
  • - *
  • - * frame-width (integer): speech frame width, in ms - *
  • - *
  • - * locale (string): language code for speech recognition - *
  • - *
  • - * bing-speech-api-key (string): API key for the Bing Speech - * service - *
  • - *
- */ -public final class BingSpeechRecognizer implements SpeechProcessor { - private static final int IDLE_TIMEOUT = 5000; - - private final BingSpeechClient client; - private final int maxIdleCount; - private SpeechContext context; - private int idleCount; - private boolean active; - - /** - * initializes a new recognizer instance. - * @param speechConfig Spokestack speech configuration - * @throws Exception on error - */ - public BingSpeechRecognizer(SpeechConfig speechConfig) throws Exception { - this(speechConfig, new BingSpeechClient.Builder()); - } - - /** - * initializes a new recognizer instance, useful for testing. - * @param speechConfig Spokestack speech configuration - * @param builder speech client builder - * @throws Exception on error - */ - public BingSpeechRecognizer( - SpeechConfig speechConfig, - BingSpeechClient.Builder builder) throws Exception { - String apiKey = speechConfig.getString("bing-speech-api-key"); - String locale = speechConfig.getString("locale"); - int sampleRate = speechConfig.getInteger("sample-rate"); - int frameWidth = speechConfig.getInteger("frame-width"); - - this.maxIdleCount = IDLE_TIMEOUT / frameWidth; - - this.client = builder - .setApiKey(apiKey) - .setLocale(locale) - .setSampleRate(sampleRate) - .setListener(new Listener()) - .build(); - } - - /** - * releases the resources associated with the recognizer. - */ - public void close() { - this.client.close(); - } - - /** - * processes a frame of audio. 
- * @param speechContext the current speech context - * @param frame the audio frame to detect - * @throws Exception on error - */ - public void process(SpeechContext speechContext, ByteBuffer frame) - throws Exception { - this.context = speechContext; - if (speechContext.isActive() && !this.active) - begin(speechContext); - else if (!speechContext.isActive() && this.active) - commit(); - else if (speechContext.isActive()) - send(frame); - else if (++this.idleCount > this.maxIdleCount) - this.client.disconnect(); - } - - private void begin(SpeechContext speechContext) throws Exception { - // send the audio header - if (!this.client.isConnected()) - this.client.connect(); - this.client.beginAudio(); - this.active = true; - this.idleCount = 0; - - // send any buffered frames to the api - for (ByteBuffer frame: context.getBuffer()) - send(frame); - } - - private void send(ByteBuffer frame) throws Exception { - this.client.sendAudio(frame); - } - - private void commit() throws Exception { - // send the end of audio - this.active = false; - this.client.endAudio(); - } - - /** - * speech recognizer listener. 
- */ - private class Listener implements BingSpeechClient.Listener { - public void onSpeech(String transcript) { - context.setTranscript(transcript); - context.setConfidence(1.0); - context.dispatch(SpeechContext.Event.RECOGNIZE); - } - - public void onError(Throwable e) { - client.disconnect(); - active = false; - - context.setError(e); - context.dispatch(SpeechContext.Event.ERROR); - } - } -} diff --git a/src/main/java/io/spokestack/spokestack/profile/PushToTalkAzureASR.java b/src/main/java/io/spokestack/spokestack/profile/PushToTalkAzureASR.java new file mode 100644 index 0000000..fb3ee41 --- /dev/null +++ b/src/main/java/io/spokestack/spokestack/profile/PushToTalkAzureASR.java @@ -0,0 +1,43 @@ +package io.spokestack.spokestack.profile; + +import io.spokestack.spokestack.PipelineProfile; +import io.spokestack.spokestack.SpeechPipeline; + +/** + * A speech pipeline profile that relies on manual pipeline activation, using + * Azure Speech Service for ASR. + * + *

+ * Azure Speech Service requires extra configuration, which must be added to the + * pipeline build process separately from this profile: + *

+ * + *
    + *
  • + * azure-api-key (string): Azure API key + *
  • + *
  • + * azure-region (string): service region for Azure key + *
  • + *
+ * + * @see io.spokestack.spokestack.microsoft.AzureSpeechRecognizer + */ +public class PushToTalkAzureASR implements PipelineProfile { + @Override + public SpeechPipeline.Builder apply(SpeechPipeline.Builder builder) { + return builder + .setInputClass( + "io.spokestack.spokestack.android.MicrophoneInput") + .addStageClass( + "io.spokestack.spokestack.webrtc.AcousticNoiseSuppressor") + .addStageClass( + "io.spokestack.spokestack.webrtc.AutomaticGainControl") + .setProperty("agc-compression-gain-db", 15) + .addStageClass( + "io.spokestack.spokestack.webrtc.VoiceActivityDetector") + .addStageClass("io.spokestack.spokestack.ActivationTimeout") + .addStageClass( + "io.spokestack.spokestack.microsoft.AzureSpeechRecognizer"); + } +} diff --git a/src/main/java/io/spokestack/spokestack/profile/TFWakewordAzureASR.java b/src/main/java/io/spokestack/spokestack/profile/TFWakewordAzureASR.java new file mode 100644 index 0000000..238ebdb --- /dev/null +++ b/src/main/java/io/spokestack/spokestack/profile/TFWakewordAzureASR.java @@ -0,0 +1,79 @@ +package io.spokestack.spokestack.profile; + +import io.spokestack.spokestack.PipelineProfile; +import io.spokestack.spokestack.SpeechPipeline; + +/** + * A speech pipeline profile that uses TensorFlow Lite for wakeword detection + * and Azure Speech Service for ASR. Properties related to + * signal processing are tuned for the "Spokestack" wakeword. + * + *

+ * Wakeword detection requires configuration to locate the models used for + * classification; these properties must be set elsewhere: + *

+ * + *
    + *
  • + * wake-filter-path (string, required): file system path to the + * "filter" Tensorflow-Lite model, which is used to calculate a mel + * spectrogram frame from the linear STFT; its inputs should be shaped + * [fft-width], and its outputs [mel-width] + *
  • + *
  • + * wake-encode-path (string, required): file system path to the + * "encode" Tensorflow-Lite model, which is used to perform each + * autoregressive step over the mel frames; its inputs should be shaped + * [mel-length, mel-width], and its outputs [encode-width], with an + * additional state input/output shaped [state-width] + *
  • + *
  • + * wake-detect-path (string, required): file system path to the + * "detect" Tensorflow-Lite model; its inputs should be shaped + * [encode-length, encode-width], and its outputs [1] +
  • + *
+ * + *

+ * Azure Speech Service requires the following properties: + *

+ * + *
    + *
  • + * azure-api-key (string): Azure API key + *
  • + *
  • + * azure-region (string): service region for Azure key + *
  • + *
+ * + * @see io.spokestack.spokestack.microsoft.AzureSpeechRecognizer + * @see io.spokestack.spokestack.wakeword.WakewordTrigger + */ +public class TFWakewordAzureASR implements PipelineProfile { + @Override + public SpeechPipeline.Builder apply(SpeechPipeline.Builder builder) { + return builder + .setInputClass( + "io.spokestack.spokestack.android.MicrophoneInput") + .addStageClass( + "io.spokestack.spokestack.webrtc.AcousticNoiseSuppressor") + .setProperty("ans-policy", "aggressive") + .addStageClass( + "io.spokestack.spokestack.webrtc.AutomaticGainControl") + .setProperty("agc-target-level-dbfs", 3) + .setProperty("agc-compression-gain-db", 15) + .addStageClass( + "io.spokestack.spokestack.webrtc.VoiceActivityDetector") + .setProperty("vad-mode", "very-aggressive") + .setProperty("vad-fall-delay", 800) + .addStageClass( + "io.spokestack.spokestack.wakeword.WakewordTrigger") + .setProperty("wake-threshold", 0.9) + .setProperty("pre-emphasis", 0.97) + .addStageClass("io.spokestack.spokestack.ActivationTimeout") + .setProperty("active-min", 2000) + .addStageClass( + "io.spokestack.spokestack.microsoft.AzureSpeechRecognizer"); + } +} diff --git a/src/main/java/io/spokestack/spokestack/profile/VADTriggerAzureASR.java b/src/main/java/io/spokestack/spokestack/profile/VADTriggerAzureASR.java new file mode 100644 index 0000000..052cd19 --- /dev/null +++ b/src/main/java/io/spokestack/spokestack/profile/VADTriggerAzureASR.java @@ -0,0 +1,45 @@ +package io.spokestack.spokestack.profile; + +import io.spokestack.spokestack.PipelineProfile; +import io.spokestack.spokestack.SpeechPipeline; + +/** + * A speech pipeline profile that uses voice activity detection to activate + * ASR via Azure Speech Service. + * + *

+ * Azure Speech Service requires extra configuration, which must be added to the + * pipeline build process separately from this profile: + *

+ * + *
    + *
  • + * azure-api-key (string): Azure API key + *
  • + *
  • + * azure-region (string): service region for Azure key + *
  • + *
+ * + * @see io.spokestack.spokestack.microsoft.AzureSpeechRecognizer + */ +public class VADTriggerAzureASR implements PipelineProfile { + @Override + public SpeechPipeline.Builder apply(SpeechPipeline.Builder builder) { + return builder + .setInputClass( + "io.spokestack.spokestack.android.MicrophoneInput") + .addStageClass( + "io.spokestack.spokestack.webrtc.AcousticNoiseSuppressor") + .addStageClass( + "io.spokestack.spokestack.webrtc.AutomaticGainControl") + .setProperty("agc-compression-gain-db", 15) + .addStageClass( + "io.spokestack.spokestack.webrtc.VoiceActivityDetector") + .addStageClass( + "io.spokestack.spokestack.webrtc.VoiceActivityTrigger") + .addStageClass("io.spokestack.spokestack.ActivationTimeout") + .addStageClass( + "io.spokestack.spokestack.microsoft.AzureSpeechRecognizer"); + } +} diff --git a/src/test/java/io/spokestack/spokestack/SpeechPipelineTest.java b/src/test/java/io/spokestack/spokestack/SpeechPipelineTest.java index 1285126..c9caf7c 100644 --- a/src/test/java/io/spokestack/spokestack/SpeechPipelineTest.java +++ b/src/test/java/io/spokestack/spokestack/SpeechPipelineTest.java @@ -13,10 +13,13 @@ public class SpeechPipelineTest implements OnSpeechEventListener { private static final List> PROFILES = Arrays.asList( io.spokestack.spokestack.profile.PushToTalkAndroidASR.class, + io.spokestack.spokestack.profile.PushToTalkAzureASR.class, io.spokestack.spokestack.profile.PushToTalkGoogleASR.class, io.spokestack.spokestack.profile.TFWakewordAndroidASR.class, + io.spokestack.spokestack.profile.TFWakewordAzureASR.class, io.spokestack.spokestack.profile.TFWakewordGoogleASR.class, io.spokestack.spokestack.profile.VADTriggerAndroidASR.class, + io.spokestack.spokestack.profile.VADTriggerAzureASR.class, io.spokestack.spokestack.profile.VADTriggerGoogleASR.class ); diff --git a/src/test/java/io/spokestack/spokestack/microsoft/AzureSpeechRecognizerTest.java b/src/test/java/io/spokestack/spokestack/microsoft/AzureSpeechRecognizerTest.java new file 
mode 100644 index 0000000..6e05be4 --- /dev/null +++ b/src/test/java/io/spokestack/spokestack/microsoft/AzureSpeechRecognizerTest.java @@ -0,0 +1,193 @@ +package io.spokestack.spokestack.microsoft; + +import com.microsoft.cognitiveservices.speech.CancellationErrorCode; +import com.microsoft.cognitiveservices.speech.CancellationReason; +import com.microsoft.cognitiveservices.speech.ResultReason; +import com.microsoft.cognitiveservices.speech.SpeechRecognitionCanceledEventArgs; +import com.microsoft.cognitiveservices.speech.SpeechRecognitionEventArgs; +import com.microsoft.cognitiveservices.speech.SpeechRecognitionResult; +import com.microsoft.cognitiveservices.speech.SpeechRecognizer; +import com.microsoft.cognitiveservices.speech.audio.AudioConfig; +import com.microsoft.cognitiveservices.speech.audio.AudioInputStream; +import com.microsoft.cognitiveservices.speech.audio.PushAudioInputStream; +import io.spokestack.spokestack.OnSpeechEventListener; +import io.spokestack.spokestack.SpeechConfig; +import io.spokestack.spokestack.SpeechContext; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.powermock.api.mockito.PowerMockito; +import org.powermock.core.classloader.annotations.PrepareForTest; +import org.powermock.core.classloader.annotations.SuppressStaticInitializationFor; +import org.powermock.modules.junit4.PowerMockRunner; + +import java.nio.ByteBuffer; +import java.util.LinkedList; +import java.util.concurrent.Future; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.*; + +@RunWith(PowerMockRunner.class) +@PrepareForTest({ + AudioConfig.class, + PushAudioInputStream.class, + SpeechRecognitionCanceledEventArgs.class, + SpeechRecognizer.class +}) +@SuppressStaticInitializationFor({ + "com.microsoft.cognitiveservices.speech.SpeechConfig", + 
"com.microsoft.cognitiveservices.speech.internal.ProfanityOption", + "com.microsoft.cognitiveservices.speech.internal.CancellationErrorCode", + "com.microsoft.cognitiveservices.speech.internal.CancellationReason" +}) +public class AzureSpeechRecognizerTest implements OnSpeechEventListener { + + private SpeechRecognizer mockRecognizer; + + private SpeechConfig speechConfig; + private SpeechContext.Event event; + private SpeechRecognitionEventArgs recognitionEvent; + private SpeechRecognitionCanceledEventArgs canceledEvent; + + @Before + @SuppressWarnings("rawtypes, ResultOfMethodCallIgnored") + public void setup() { + // MS AudioInputStream + PowerMockito.mockStatic(AudioInputStream.class); + when(AudioInputStream.createPushStream()) + .thenReturn(mock(PushAudioInputStream.class)); + + // MS AudioConfig + PowerMockito.mockStatic(AudioConfig.class); + when(AudioConfig.fromStreamInput((AudioInputStream) any())) + .thenReturn(mock(AudioConfig.class)); + + // MS SpeechConfig + PowerMockito.mockStatic( + com.microsoft.cognitiveservices.speech.SpeechConfig.class); + when( + com.microsoft.cognitiveservices.speech.SpeechConfig + .fromSubscription(anyString(), anyString())) + .thenReturn(PowerMockito.mock( + com.microsoft.cognitiveservices.speech.SpeechConfig.class)); + mockRecognizer = PowerMockito.mock(SpeechRecognizer.class); + + // we have to call `get` on the return of this method to get the + // recognition to return a result, so this mock is a bit more complex + Future fakeResult = mock(Future.class); + doReturn(fakeResult).when(mockRecognizer) + .stopContinuousRecognitionAsync(); + speechConfig = createConfig(); + + // speech recognition and cancellation events + recognitionEvent = PowerMockito.mock(SpeechRecognitionEventArgs.class); + SpeechRecognitionResult result = mock(SpeechRecognitionResult.class); + doReturn("test").when(result).getText(); + doReturn(ResultReason.RecognizedSpeech).when(result).getReason(); + 
when(recognitionEvent.getResult()).thenReturn(result); + + canceledEvent = PowerMockito.mock(SpeechRecognitionCanceledEventArgs.class); + doReturn(CancellationReason.Error).when(canceledEvent).getReason(); + doReturn("unknown error").when(canceledEvent).getErrorDetails(); + when(canceledEvent.getErrorCode()).thenReturn(CancellationErrorCode.ServiceError); + } + + @Test + public void testConfig() { + // invalid config + // note that we're not testing valid configs explicitly because the + // constructor deals with MS objects -- that code is covered by the + // spied recognizer used in the other tests + SpeechConfig config = createConfig(); + config.put("sample-rate", 48000); + assertThrows(IllegalArgumentException.class, + () -> new AzureSpeechRecognizer(config)); + } + + @Test + public void testRecognize() throws Exception { + AzureSpeechRecognizer azureRecognizer = + spy(new AzureSpeechRecognizer(speechConfig)); + doReturn(mockRecognizer).when(azureRecognizer).createRecognizer(any()); + SpeechContext context = createContext(speechConfig); + + // inactive + azureRecognizer.process(context, context.getBuffer().getLast()); + verify(azureRecognizer, never()).begin(any()); + + // active/buffered frames + context.setActive(true); + azureRecognizer.process(context, context.getBuffer().getLast()); + verify(azureRecognizer).begin(any()); + verify(azureRecognizer, times(context.getBuffer().size())) + .bufferFrame(context.getBuffer().getLast()); + + // subsequent frame + reset(azureRecognizer); + azureRecognizer.process(context, context.getBuffer().getLast()); + verify(azureRecognizer).bufferFrame(context.getBuffer().getLast()); + + // complete + context.setActive(false); + azureRecognizer.process(context, context.getBuffer().getLast()); + verify(azureRecognizer).commit(); + + // shutdown + azureRecognizer.close(); + // once for commit(), once for close() + verify(mockRecognizer, times(2)).close(); + } + + @Test + public void testListeners() { + SpeechConfig config = 
createConfig(); + SpeechContext context = createContext(config); + + // recognition + new AzureSpeechRecognizer.RecognitionListener(context) + .onEvent(mockRecognizer, recognitionEvent); + assertEquals("test", context.getTranscript()); + assertEquals(1.0, context.getConfidence()); + assertEquals(SpeechContext.Event.RECOGNIZE, this.event); + + // cancellation + context = createContext(config); + new AzureSpeechRecognizer.CancellationListener(context) + .onEvent(mockRecognizer, canceledEvent); + String code = CancellationErrorCode.ServiceError.name(); + assertEquals("unknown error (error code " + code + ")", + context.getError().getMessage()); + assertEquals("", context.getTranscript()); + assertEquals(0, context.getConfidence()); + assertEquals(SpeechContext.Event.ERROR, this.event); + } + + private SpeechConfig createConfig() { + SpeechConfig config = new SpeechConfig(); + config.put("azure-api-key", "secret"); + config.put("azure-region", "mars"); + config.put("sample-rate", 16000); + config.put("frame-width", 20); + config.put("locale", "en-US"); + return config; + } + + private SpeechContext createContext(SpeechConfig config) { + SpeechContext context = new SpeechContext(config); + context.addOnSpeechEventListener(this); + + context.attachBuffer(new LinkedList<>()); + for (int i = 0; i < 3; i++) { + context.getBuffer().addLast(ByteBuffer.allocateDirect(320)); + } + + return context; + } + + public void onEvent(SpeechContext.Event event, SpeechContext context) { + this.event = event; + } +} diff --git a/src/test/java/io/spokestack/spokestack/microsoft/BingSpeechClientTest.java b/src/test/java/io/spokestack/spokestack/microsoft/BingSpeechClientTest.java deleted file mode 100644 index 334ef12..0000000 --- a/src/test/java/io/spokestack/spokestack/microsoft/BingSpeechClientTest.java +++ /dev/null @@ -1,314 +0,0 @@ -package io.spokestack.spokestack.microsoft; - -import java.util.*; -import java.nio.ByteBuffer; -import java.nio.charset.Charset; - -import 
org.junit.Test; -import org.junit.Before; -import org.junit.jupiter.api.function.Executable; -import static org.junit.jupiter.api.Assertions.*; - -import org.mockito.ArgumentCaptor; -import static org.mockito.Mockito.*; - -public class BingSpeechClientTest implements BingSpeechClient.Listener { - private BingSpeechClient.Builder builder; - private okhttp3.OkHttpClient http; - private okhttp3.WebSocket socket; - private String transcript; - private Throwable error; - - @Before - public void before() throws Exception { - this.builder = spy(BingSpeechClient.Builder.class); - this.http = mock(okhttp3.OkHttpClient.class); - this.socket = spy(okhttp3.WebSocket.class); - - // mock authentication response - okhttp3.Request authRequest = new okhttp3.Request.Builder() - .url("http://example.com/") - .build(); - - okhttp3.ResponseBody authBody = mock(okhttp3.ResponseBody.class); - okio.BufferedSource authSource = mock(okio.BufferedSource.class); - when(authSource.readString(any(Charset.class))) - .thenReturn("token"); - when(authBody.source()) - .thenReturn(authSource); - okhttp3.Response authResponse = new okhttp3.Response.Builder() - .request(authRequest) - .protocol(okhttp3.Protocol.HTTP_1_1) - .code(200) - .message("OK") - .body(authBody) - .build(); - - okhttp3.Call authCall = mock(okhttp3.Call.class); - when(authCall.execute()) - .thenReturn(authResponse); - when(http.newCall(any(okhttp3.Request.class))) - .thenReturn(authCall); - - // mock websocket connection - when(http.newWebSocket( - any(okhttp3.Request.class), - any(okhttp3.WebSocketListener.class) - )).thenReturn(this.socket); - - // mock http factory - when(builder.getHttpClient()) - .thenReturn(http); - - this.transcript = null; - this.error = null; - } - - @Test - public void testBuilder() throws Exception { - // invalid api key - assertThrows(IllegalArgumentException.class, new Executable() { - public void execute() throws Exception { - new BingSpeechClient.Builder() - .setLocale("en-US") - 
.setSampleRate(8000) - .setListener(BingSpeechClientTest.this) - .build(); - } - }); - - // invalid locale - assertThrows(IllegalArgumentException.class, new Executable() { - public void execute() throws Exception { - new BingSpeechClient.Builder() - .setApiKey("secret") - .setSampleRate(8000) - .setListener(BingSpeechClientTest.this) - .build(); - } - }); - - // invalid sample rate - assertThrows(IllegalArgumentException.class, new Executable() { - public void execute() throws Exception { - new BingSpeechClient.Builder() - .setApiKey("secret") - .setLocale("en-US") - .setListener(BingSpeechClientTest.this) - .build(); - } - }); - - // invalid listener - assertThrows(IllegalArgumentException.class, new Executable() { - public void execute() throws Exception { - new BingSpeechClient.Builder() - .setApiKey("secret") - .setLocale("en-US") - .setSampleRate(8000) - .build(); - } - }); - - // unauthorized - assertThrows(Exception.class, new Executable() { - public void execute() throws Exception { - new BingSpeechClient.Builder() - .setApiKey("invalid") - .setLocale("en-US") - .setSampleRate(8000) - .setListener(BingSpeechClientTest.this) - .build(); - } - }); - - // valid configuration - this.builder - .setApiKey("secret") - .setLocale("en-US") - .setSampleRate(8000) - .setListener(BingSpeechClientTest.this) - .build() - .close(); - - assertEquals(null, this.error); - } - - @Test - public void testSocketConnect() throws Exception { - final BingSpeechClient client = this.builder - .setApiKey("secret") - .setLocale("en-US") - .setSampleRate(8000) - .setListener(BingSpeechClientTest.this) - .build(); - - // default connection - assertFalse(client.isConnected()); - - // valid connection - client.connect(); - assertTrue(client.isConnected()); - - // failed reconnection - assertThrows(IllegalStateException.class, new Executable() { - public void execute() throws Exception { client.connect(); } - }); - - // valid disconnection - client.disconnect(); - 
assertFalse(client.isConnected()); - - // safe redisconnection - client.disconnect(); - assertFalse(client.isConnected()); - - client.close(); - assertEquals(null, this.error); - } - - @Test - public void testSendAudio() throws Exception { - final ByteBuffer samples = ByteBuffer.allocateDirect(160); - final BingSpeechClient client = this.builder - .setApiKey("secret") - .setLocale("en-US") - .setSampleRate(8000) - .setListener(BingSpeechClientTest.this) - .build(); - - // invalid audio - assertThrows(IllegalStateException.class, new Executable() { - public void execute() throws Exception { - client.beginAudio(); - } - }); - assertThrows(IllegalStateException.class, new Executable() { - public void execute() throws Exception { - client.sendAudio(samples); - } - }); - assertThrows(IllegalStateException.class, new Executable() { - public void execute() throws Exception { - client.endAudio(); - } - }); - - client.connect(); - - // empty audio - client.beginAudio(); - client.endAudio(); - - // valid audio - client.beginAudio(); - for (int i = 0; i < 200; i++) - client.sendAudio(samples); - client.endAudio(); - - // default event responses - assertEquals(null, this.transcript); - assertEquals(null, this.error); - - client.close(); - assertEquals(null, this.error); - } - - @Test - public void testResponseEvents() throws Exception { - BingSpeechClient client = this.builder - .setApiKey("secret") - .setLocale("en-US") - .setSampleRate(8000) - .setListener(BingSpeechClientTest.this) - .build(); - String message; - - client.connect(); - client.beginAudio(); - - ArgumentCaptor captor = - ArgumentCaptor.forClass(okhttp3.WebSocketListener.class); - verify(this.http) - .newWebSocket(any(okhttp3.Request.class), captor.capture()); - okhttp3.WebSocketListener listener = captor.getValue(); - - // socket close error - listener.onClosed(this.socket, 1001, "failed"); - assertEquals(null, this.transcript); - assertEquals("1001: failed", this.error.getMessage()); - this.error = null; - - 
// general error - listener.onFailure(this.socket, new Exception("failed"), null); - assertEquals(null, this.transcript); - assertEquals("failed", this.error.getMessage()); - this.error = null; - - // valid close - listener.onClosed(this.socket, 1000, "goodbye"); - assertEquals(null, this.transcript); - assertEquals(null, this.error); - - // mismatched request id - message = - "X-RequestId:42\r\n" + - "Path:speech.phrase\r\n" + - "\r\n" + - "{\"RecognitionStatus\": \"Success\", \"DisplayText\": \"test\"}"; - listener.onMessage(this.socket, message); - assertEquals(null, this.transcript); - assertEquals(null, this.error); - - // mismatched path - message = - String.format("X-RequestId:%s\r\n", client.getRequestId()) + - "Path:speech.hypothesis\r\n" + - "\r\n" + - "{\"RecognitionStatus\": \"Success\", \"DisplayText\": \"test\"}"; - listener.onMessage(this.socket, message); - assertEquals(null, this.transcript); - assertEquals(null, this.error); - - // mismatched status - message = - String.format("X-RequestId:%s\r\n", client.getRequestId()) + - "Path:speech.phrase\r\n" + - "\r\n" + - "{\"RecognitionStatus\": \"EndOfDictation\", " + - "\"DisplayText\": \"test\"}"; - listener.onMessage(this.socket, message); - assertEquals(null, this.transcript); - assertEquals(null, this.error); - - // no recognition - message = - String.format("X-RequestId:%s\r\n", client.getRequestId()) + - "Path:speech.phrase\r\n" + - "\r\n" + - "{\"RecognitionStatus\": \"BabbleTimeout\", " + - "\"DisplayText\": \"test\"}"; - listener.onMessage(this.socket, message); - assertEquals("", this.transcript); - assertEquals(null, this.error); - - // valid recognition - message = - String.format("X-RequestId:%s\r\n", client.getRequestId()) + - "Path:speech.phrase\r\n" + - "\r\n" + - "{\"RecognitionStatus\": \"Success\", " + - "\"DisplayText\": \"test\"}"; - listener.onMessage(this.socket, message); - assertEquals("test", this.transcript); - assertEquals(null, this.error); - } - - public void 
onSpeech(String transcript) { - this.transcript = transcript; - } - - public void onError(Throwable e) { - this.error = e; - } -} diff --git a/src/test/java/io/spokestack/spokestack/microsoft/BingSpeechRecognizerTest.java b/src/test/java/io/spokestack/spokestack/microsoft/BingSpeechRecognizerTest.java deleted file mode 100644 index 28d23d9..0000000 --- a/src/test/java/io/spokestack/spokestack/microsoft/BingSpeechRecognizerTest.java +++ /dev/null @@ -1,135 +0,0 @@ -package io.spokestack.spokestack.microsoft; - -import java.util.*; -import java.nio.ByteBuffer; - -import org.junit.Test; -import static org.junit.jupiter.api.Assertions.*; - -import org.mockito.ArgumentCaptor; -import static org.mockito.Mockito.*; - -import io.spokestack.spokestack.OnSpeechEventListener; -import io.spokestack.spokestack.SpeechConfig; -import io.spokestack.spokestack.SpeechContext; - -public class BingSpeechRecognizerTest implements OnSpeechEventListener { - private SpeechContext.Event event; - - @Test - public void testRecognize() throws Exception { - BingSpeechClient.Builder builder = spy(BingSpeechClient.Builder.class); - BingSpeechClient client = mock(BingSpeechClient.class); - doReturn(client).when(builder).build(); - - SpeechConfig config = createConfig(); - SpeechContext context = createContext(config); - BingSpeechRecognizer recognizer = - new BingSpeechRecognizer(config, builder); - - // capture the listener - ArgumentCaptor captor = - ArgumentCaptor.forClass(BingSpeechClient.Listener.class); - verify(builder) - .setListener(captor.capture()); - BingSpeechClient.Listener listener = captor.getValue(); - - // inactive - recognizer.process(context, context.getBuffer().getLast()); - verify(client, never()) - .beginAudio(); - - // active/buffered frames - context.setActive(true); - recognizer.process(context, context.getBuffer().getLast()); - verify(client) - .beginAudio(); - verify(client, times(context.getBuffer().size())) - .sendAudio(context.getBuffer().getLast()); - - // 
subsequent frame - reset(client); - recognizer.process(context, context.getBuffer().getLast()); - verify(client) - .sendAudio(context.getBuffer().getLast()); - - // complete - context.setActive(false); - recognizer.process(context, context.getBuffer().getLast()); - verify(client) - .endAudio(); - - // idle timeout - for (int i = 0; i < 500; i++) - recognizer.process(context, context.getBuffer().getLast()); - verify(client, atLeast(1)) - .disconnect(); - - - // responses - listener.onSpeech("test"); - assertEquals("test", context.getTranscript()); - assertEquals(1.0, context.getConfidence()); - assertEquals(SpeechContext.Event.RECOGNIZE, this.event); - - // shutdown - recognizer.close(); - verify(client).close(); - } - - @Test - public void testError() throws Exception { - BingSpeechClient.Builder builder = spy(BingSpeechClient.Builder.class); - BingSpeechClient client = mock(BingSpeechClient.class); - doReturn(client).when(builder).build(); - - SpeechConfig config = createConfig(); - SpeechContext context = createContext(config); - BingSpeechRecognizer recognizer = - new BingSpeechRecognizer(config, builder); - - // capture the listener - ArgumentCaptor captor = - ArgumentCaptor.forClass(BingSpeechClient.Listener.class); - verify(builder) - .setListener(captor.capture()); - BingSpeechClient.Listener listener = captor.getValue(); - - // trigger active - context.setActive(true); - when(client.isConnected()) - .thenReturn(true); - recognizer.process(context, context.getBuffer().getLast()); - - // inject fault - listener.onError(new Exception("test error")); - assertEquals("test error", context.getError().getMessage()); - assertEquals("", context.getTranscript()); - assertEquals(0, context.getConfidence()); - assertEquals(SpeechContext.Event.ERROR, this.event); - } - - private SpeechConfig createConfig() { - SpeechConfig config = new SpeechConfig(); - config.put("bing-speech-api-key", "secret"); - config.put("sample-rate", 16000); - config.put("frame-width", 20); - 
config.put("locale", "en-US"); - return config; - } - - private SpeechContext createContext(SpeechConfig config) { - SpeechContext context = new SpeechContext(config); - context.addOnSpeechEventListener(this); - - context.attachBuffer(new LinkedList()); - for (int i = 0; i < 3; i++) - context.getBuffer().addLast(ByteBuffer.allocateDirect(320)); - - return context; - } - - public void onEvent(SpeechContext.Event event, SpeechContext context) { - this.event = event; - } -}