This repository has been archived by the owner on May 6, 2022. It is now read-only.

Feature: Azure Speech Service ASR
This replaces the now-defunct Bing Speech API component
with a component that uses the Azure Speech Service SDK for ASR.
Profiles that use the new component are included.
space-pope committed Jan 23, 2020
1 parent 893bd17 commit b046693
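
Since the bundled profiles aren't among the files shown below, here's a minimal sketch of wiring the new recognizer into a pipeline by hand. The SpeechPipeline.Builder API and the input/VAD stage class names are assumed from the Spokestack documentation rather than this commit; the property names come from the new component's Javadoc.

import io.spokestack.spokestack.SpeechPipeline;

// a minimal sketch, not part of this commit: the builder methods and the
// MicrophoneInput/VoiceActivityDetector/VoiceActivityTrigger class names
// are assumed from the Spokestack docs
public final class AzureAsrExample {
    public static void main(String[] args) throws Exception {
        SpeechPipeline pipeline = new SpeechPipeline.Builder()
            .setInputClass("io.spokestack.spokestack.android.MicrophoneInput")
            .addStageClass("io.spokestack.spokestack.webrtc.VoiceActivityDetector")
            .addStageClass("io.spokestack.spokestack.webrtc.VoiceActivityTrigger")
            .addStageClass("io.spokestack.spokestack.microsoft.AzureSpeechRecognizer")
            .setProperty("sample-rate", 16000) // Azure requires 16kHz
            .setProperty("frame-width", 20)    // ms of audio per frame
            .setProperty("locale", "en-US")
            .setProperty("azure-api-key", "<YOUR-API-KEY>")
            .setProperty("azure-region", "westus")
            .build();
        pipeline.start();
    }
}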
Showing 12 changed files with 634 additions and 983 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -3,7 +3,7 @@
Spokestack provides an extensible speech recognition pipeline for the Android
platform. It includes a variety of built-in speech processors for Voice
Activity Detection (VAD) and Automatic Speech Recognition (ASR) via popular
-speech recognition services, such as the Google Speech API and Bing Speech
+speech recognition services such as the Google Speech API and Azure Speech
API.

See the [documentation](https://spokestack.io/docs) for a lot more information
17 changes: 15 additions & 2 deletions pom.xml
@@ -47,6 +47,10 @@
      <id>jcenter</id>
      <url>https://jcenter.bintray.com/</url>
    </repository>
+   <repository>
+     <id>microsoft</id>
+     <url>https://csspeechstorage.blob.core.windows.net/maven/</url>
+   </repository>
  </repositories>

  <distributionManagement>
@@ -106,7 +110,16 @@
      <scope>provided</scope>
    </dependency>

-   <!-- microsoft speech api / spokestack TTS -->
+   <!-- azure speech service -->
+   <dependency>
+     <groupId>com.microsoft.cognitiveservices.speech</groupId>
+     <artifactId>client-sdk</artifactId>
+     <version>1.9.0</version>
+     <type>aar</type>
+     <scope>provided</scope>
+   </dependency>
+
+   <!-- spokestack TTS -->
    <dependency>
      <groupId>com.squareup.okhttp3</groupId>
      <artifactId>okhttp</artifactId>
@@ -287,7 +300,7 @@
            <limit>
              <counter>INSTRUCTION</counter>
              <value>COVEREDRATIO</value>
-             <minimum>0.85</minimum>
+             <minimum>0.8</minimum>
            </limit>
          </limits>
        </rule>
255 changes: 255 additions & 0 deletions src/main/java/io/spokestack/spokestack/microsoft/AzureSpeechRecognizer.java
@@ -0,0 +1,255 @@
package io.spokestack.spokestack.microsoft;

import com.microsoft.cognitiveservices.speech.CancellationReason;
import com.microsoft.cognitiveservices.speech.ProfanityOption;
import com.microsoft.cognitiveservices.speech.ResultReason;
import com.microsoft.cognitiveservices.speech.SpeechRecognitionCanceledEventArgs;
import com.microsoft.cognitiveservices.speech.SpeechRecognitionEventArgs;
import com.microsoft.cognitiveservices.speech.SpeechRecognizer;
import com.microsoft.cognitiveservices.speech.audio.AudioConfig;
import com.microsoft.cognitiveservices.speech.audio.AudioInputStream;
import com.microsoft.cognitiveservices.speech.audio.PushAudioInputStream;
import com.microsoft.cognitiveservices.speech.util.EventHandler;
import io.spokestack.spokestack.SpeechConfig;
import io.spokestack.spokestack.SpeechContext;
import io.spokestack.spokestack.SpeechProcessor;

import java.nio.ByteBuffer;
import java.nio.ByteOrder;

/**
 * microsoft azure speech service recognizer
 *
 * <p>
 * This component implements the speech processor interface using the Azure
 * Speech Service for speech recognition.
 * </p>
 *
 * <p>
 * When the speech context is triggered, the recognizer begins streaming
 * buffered frames to the API for recognition. Once the speech context becomes
 * inactive, the recognizer raises a RECOGNIZE event along with the audio
 * transcript. Unfortunately, the Azure Speech SDK currently doesn't return
 * confidence values alongside transcripts, so confidence is always set to 1.0.
 * </p>
 *
 * <p>
 * Use of the Azure Speech Service implies acceptance of Microsoft's license
 * terms, which can be found
 * <a href=
 * "https://csspeechstorage.blob.core.windows.net/drop/license201809.html">
 * here</a>.
 * </p>
 *
 * <p>
 * This pipeline component requires the following configuration properties:
 * </p>
 * <ul>
 *   <li>
 *     <b>sample-rate</b> (integer): audio sampling rate, in Hz
 *   </li>
 *   <li>
 *     <b>frame-width</b> (integer): speech frame width, in ms
 *   </li>
 *   <li>
 *     <b>locale</b> (string): language code for speech recognition
 *   </li>
 *   <li>
 *     <b>azure-api-key</b> (string): API key for the Azure Speech service
 *   </li>
 *   <li>
 *     <b>azure-region</b> (string): Azure Speech service region
 *   </li>
 * </ul>
 */
public class AzureSpeechRecognizer implements SpeechProcessor {
    private final com.microsoft.cognitiveservices.speech.SpeechConfig msConfig;

    private SpeechRecognizer recognizer;
    private PushAudioInputStream audioStream;
    private AudioConfig audioConfig;
    private boolean active;

    // Azure speech requires little-endian (wav-format) data, so we buffer
    // audio frames internally to avoid mutating data coming from the speech
    // context
    private ByteBuffer buffer;

    /**
     * initializes a new recognizer instance.
     *
     * @param speechConfig Spokestack speech configuration
     */
    public AzureSpeechRecognizer(SpeechConfig speechConfig) {
        String apiKey = speechConfig.getString("azure-api-key");
        String region = speechConfig.getString("azure-region");
        int sampleRate = speechConfig.getInteger("sample-rate");

        if (sampleRate != 16000) {
            throw new IllegalArgumentException(
                    "Azure only supports a 16kHz sample rate; found: "
                    + sampleRate);
        }

        this.buffer = ByteBuffer.allocateDirect(4096)
                .order(ByteOrder.LITTLE_ENDIAN);
        this.msConfig = createMsConfig(apiKey, region);
    }

    com.microsoft.cognitiveservices.speech.SpeechConfig createMsConfig(
            String apiKey, String region) {
        com.microsoft.cognitiveservices.speech.SpeechConfig config =
                com.microsoft.cognitiveservices.speech.SpeechConfig
                        .fromSubscription(apiKey, region);
        config.setProfanity(ProfanityOption.Raw);
        return config;
    }

    /**
     * releases the resources associated with the recognizer.
     */
    public void close() {
        if (this.audioStream != null) {
            this.audioStream.close();
            this.audioStream = null;
        }
        if (this.recognizer != null) {
            this.recognizer.close();
            this.recognizer = null;
        }
    }

    /**
     * processes a frame of audio.
     *
     * @param speechContext the current speech context
     * @param frame the frame of audio to process
     *
     * @throws Exception if there is an error performing active recognition.
     */
    public void process(SpeechContext speechContext, ByteBuffer frame)
            throws Exception {
        if (speechContext.isActive() && !this.active) {
            // speech has just started; open a new recognition stream
            begin(speechContext);
        } else if (!speechContext.isActive() && this.active) {
            // speech has just ended; finalize the recognition
            commit();
        } else if (speechContext.isActive()) {
            // speech is ongoing; keep streaming audio
            bufferFrame(frame);
        }
    }

    void begin(SpeechContext speechContext) {
        this.audioStream = AudioInputStream.createPushStream();
        this.audioConfig = AudioConfig.fromStreamInput(this.audioStream);
        this.recognizer = createRecognizer(speechContext);
        recognizer.startContinuousRecognitionAsync();
        this.active = true;

        // send any existing frames into the stream
        for (ByteBuffer frame : speechContext.getBuffer()) {
            bufferFrame(frame);
        }
    }

    SpeechRecognizer createRecognizer(SpeechContext context) {
        // factored into a separate method for testing
        SpeechRecognizer rec = new SpeechRecognizer(msConfig, audioConfig);
        listen(rec, context);
        return rec;
    }

    private void listen(SpeechRecognizer rec, SpeechContext context) {
        RecognitionListener recognitionListener =
                new RecognitionListener(context);
        rec.recognized.addEventListener(recognitionListener);

        CancellationListener cancellationListener =
                new CancellationListener(context);
        rec.canceled.addEventListener(cancellationListener);
    }

    void bufferFrame(ByteBuffer frame) {
        if (frame != null) {
            // if the next frame won't fit, stream the buffered audio first
            if (this.buffer.remaining() < frame.capacity()) {
                flush();
            }

            frame.rewind();
            this.buffer.put(frame);
        }
    }

    void commit() throws Exception {
        // send the end of audio
        flush();
        this.audioStream.close();
        this.recognizer.stopContinuousRecognitionAsync().get();
        this.recognizer.close();
        this.audioConfig.close();
        this.active = false;
    }

    private void flush() {
        // copy out and send only the filled portion of the buffer; the
        // buffer is direct, so it has no backing array to hand to the SDK
        if (this.buffer.position() > 0) {
            this.buffer.flip();
            byte[] audio = new byte[this.buffer.remaining()];
            this.buffer.get(audio);
            this.audioStream.write(audio);
            this.buffer.clear();
        }
    }

    /**
     * Listener for Speech SDK recognition events.
     */
    static class RecognitionListener
            implements EventHandler<SpeechRecognitionEventArgs> {
        private SpeechContext speechContext;

        RecognitionListener(SpeechContext context) {
            this.speechContext = context;
        }

        @Override
        public void onEvent(
                Object sender,
                SpeechRecognitionEventArgs recognitionArgs) {
            if (recognitionArgs.getResult().getReason()
                    == ResultReason.RecognizedSpeech) {
                String transcript = recognitionArgs.getResult().getText();
                this.speechContext.setTranscript(transcript);
                this.speechContext.setConfidence(1.0);
                this.speechContext.dispatch(SpeechContext.Event.RECOGNIZE);
            }
        }
    }

    /**
     * Listener for Speech SDK cancellation events.
     */
    static class CancellationListener
            implements EventHandler<SpeechRecognitionCanceledEventArgs> {

        private SpeechContext speechContext;

        CancellationListener(SpeechContext context) {
            this.speechContext = context;
        }

        @Override
        public void onEvent(
                Object sender,
                SpeechRecognitionCanceledEventArgs cancellationArgs) {
            if (cancellationArgs.getReason()
                    == CancellationReason.Error) {

                String message = String.format(
                        "%s (error code %s)",
                        cancellationArgs.getErrorDetails(),
                        cancellationArgs.getErrorCode().name());

                this.speechContext.setError(new Exception(message));
                this.speechContext.dispatch(SpeechContext.Event.ERROR);
            }
        }
    }
}
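
And for completeness, a sketch of consuming the events the recognizer dispatches. OnSpeechEventListener is Spokestack's own listener interface, and getTranscript()/getError() are the SpeechContext accessors implied by the setters above; all are assumed from the Spokestack API rather than this diff.

import io.spokestack.spokestack.OnSpeechEventListener;
import io.spokestack.spokestack.SpeechContext;

// a minimal sketch: log transcripts and errors raised by the recognizer.
// attach it when building the pipeline (e.g., via the builder's
// addOnSpeechEventListener method, assumed from the Spokestack docs)
public class TranscriptListener implements OnSpeechEventListener {
    @Override
    public void onEvent(SpeechContext.Event event, SpeechContext context) {
        switch (event) {
            case RECOGNIZE:
                // this component always reports a confidence of 1.0
                System.out.println("transcript: " + context.getTranscript());
                break;
            case ERROR:
                System.err.println("ASR error: " + context.getError());
                break;
            default:
                break;
        }
    }
}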
