This repository has been archived by the owner on May 6, 2022. It is now read-only.
/
types.ts
408 lines (394 loc) · 11.4 KB
/
types.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
/**
 * The result of a `require()` or `import` of a bundled asset
 * (e.g. `require('./nlu.tflite')`), which resolves to a numeric
 * asset reference at build time. See the model fields in
 * `NLUConfig` and `WakewordConfig` for usage examples.
 */
export type RequireSource = number
/**
 * Pipeline profiles set up the speech pipeline based on your needs
 */
export enum PipelineProfile {
/**
 * Set up wakeword and use local Apple/Android ASR.
 * Note that wakeword.filter, wakeword.encode, and wakeword.detect
 * are required if any wakeword profile is used.
 */
TFLITE_WAKEWORD_NATIVE_ASR = 0,
/**
 * Apple/Android Automatic Speech Recognition is on
 * when Voice Activity Detection (VAD) triggers it.
 */
VAD_NATIVE_ASR = 1,
/**
 * Apple/Android Automatic Speech Recognition is on
 * when the speech pipeline is active.
 * This is likely the more common profile
 * when not using wakeword.
 */
PTT_NATIVE_ASR = 2,
/**
 * Set up wakeword and use remote Spokestack ASR.
 * Note that wakeword.filter, wakeword.encode, and wakeword.detect
 * are required if any wakeword profile is used.
 */
TFLITE_WAKEWORD_SPOKESTACK_ASR = 3,
/**
 * Spokestack Automatic Speech Recognition is on
 * when Voice Activity Detection (VAD) triggers it.
 */
VAD_SPOKESTACK_ASR = 4,
/**
 * Spokestack Automatic Speech Recognition is on
 * when the speech pipeline is active.
 * This is likely the more common profile
 * when not using wakeword, but Spokestack ASR is preferred.
 */
PTT_SPOKESTACK_ASR = 5
}
/**
 * How much logging to show
 * A lower number means more logs.
 */
export enum TraceLevel {
/** Most verbose: includes debug messages */
DEBUG = 10,
/** Performance messages and below */
PERF = 20,
/** Informational messages only */
INFO = 30,
/** Disable logging entirely */
NONE = 100
}
/**
 * Three formats are supported when using Spokestack TTS.
 * Raw text, SSML, and Speech Markdown.
 * See https://www.speechmarkdown.org/ if unfamiliar with Speech Markdown.
 * IPA is expected when using SSML or Speech Markdown.
 */
export enum TTSFormat {
/** Plain, unannotated text */
TEXT = 0,
/** Speech Synthesis Markup Language */
SSML = 1,
/** Speech Markdown (see https://www.speechmarkdown.org/) */
SPEECHMARKDOWN = 2
}
/** Event payload for a speech recognition (ASR) result */
export interface SpokestackRecognizeEvent {
/** The recognized text */
transcript: string
}
/** Event payload emitted when an error occurs */
export interface SpokestackErrorEvent {
/** A description of the error */
error: string
}
/**
 * Event payload for a trace/log message from the underlying
 * native libraries (see `SpokestackConfig.traceLevel`)
 */
export interface SpokestackTraceEvent {
/** The trace message text */
message: string
}
/** Event payload reporting a change in audio playback state */
export interface SpokestackPlayEvent {
/** Whether audio is currently playing */
playing: boolean
}
/**
 * Union of all Spokestack event payloads.
 * Narrow by checking for a property, e.g. `'transcript' in event`.
 */
export type SpokestackEvent =
| SpokestackRecognizeEvent
| SpokestackErrorEvent
| SpokestackTraceEvent
| SpokestackPlayEvent
/** A single slot extracted from an utterance by the NLU model */
export interface SpokestackNLUSlot {
/** The slot's type, as defined in the NLU model metadata */
type: string
/** The parsed value of the slot */
value: string
/** The original string value of the slot as it appeared in the utterance */
rawValue: string
}
/** The result of running NLU classification on an utterance */
export interface SpokestackNLUResult {
/** The intent based on the match provided by the NLU model */
intent: string
/** A percentage of the confidence of the match, given by the TensorFlow model */
confidence: number
/** Data associated with the intent, provided by the NLU model, keyed by slot name */
slots: {
[key: string]: SpokestackNLUSlot
}
}
/** Configuration for the speech pipeline (audio capture, VAD, wakeword, ASR) */
export interface PipelineConfig {
/**
 * Profiles are collections of common configurations for Pipeline stages.
 * If Wakeword config files are specified, the default will be
 * `TFLITE_WAKEWORD_NATIVE_ASR`.
 * Otherwise, the default is `PTT_NATIVE_ASR`.
 */
profile?: PipelineProfile
/**
 * Audio sampling rate, in Hz
 */
sampleRate?: number
/**
 * @advanced
 *
 * Speech frame width, in ms
 */
frameWidth?: number
/**
 * @advanced
 *
 * Buffer width, used with frameWidth to determine the buffer size
 */
bufferWidth?: number
/**
 * Voice activity detector mode
 */
vadMode?: 'quality' | 'low-bitrate' | 'aggressive' | 'very-aggressive'
/**
 * @advanced
 *
 * Falling-edge detection run length, in ms; this value determines
 * how many negative samples must be received to flip the detector to negative
 */
vadFallDelay?: number
/**
 * @advanced
 *
 * Android-only
 *
 * Rising-edge detection run length, in ms; this value determines
 * how many positive samples must be received to flip the detector to positive
 */
vadRiseDelay?: number
/**
 * @advanced
 *
 * Android-only for AcousticNoiseSuppressor
 *
 * Noise policy
 */
ansPolicy?: 'mild' | 'medium' | 'aggressive' | 'very-aggressive'
/**
 * @advanced
 *
 * Android-only for AcousticGainControl
 *
 * Dynamic range compression rate, in dB
 *
 * NOTE(review): the original descriptions for agcCompressionGainDb and
 * agcTargetLevelDbfs appeared swapped relative to their names; verify
 * against the spokestack-android AcousticGainControl documentation.
 */
agcCompressionGainDb?: number
/**
 * @advanced
 *
 * Android-only for AcousticGainControl
 *
 * Target peak audio level, in -dBFS;
 * to maintain a peak of -9dB, configure a value of 9
 */
agcTargetLevelDbfs?: number
}
/** Configuration for on-device Natural Language Understanding (NLU) */
export interface NLUConfig {
/**
 * The NLU Tensorflow-Lite model. If specified, metadata and vocab are also required.
 *
 * This field accepts 2 types of values.
 * 1. A string representing a remote URL from which to download and cache the file (presumably from a CDN).
 * 2. A source object retrieved by a `require` or `import` (e.g. `model: require('./nlu.tflite')`)
 */
model: string | RequireSource
/**
 * The JSON file for NLU metadata. If specified, model and vocab are also required.
 *
 * This field accepts 2 types of values.
 * 1. A string representing a remote URL from which to download and cache the file (presumably from a CDN).
 * 2. A source object retrieved by a `require` or `import` (e.g. `metadata: require('./metadata.json')`)
 */
metadata: string | RequireSource
/**
 * A txt file containing the NLU vocabulary. If specified, model and metadata are also required.
 *
 * This field accepts 2 types of values.
 * 1. A string representing a remote URL from which to download and cache the file (presumably from a CDN).
 * 2. A source object retrieved by a `require` or `import` (e.g. `vocab: require('./vocab.txt')`)
 */
vocab: string | RequireSource
/**
 * @advanced
 *
 * Android-only
 *
 * Padded length of the model's input sequences.
 * Defaults to 128 and should only be changed if this parameter
 * is explicitly set to a different value at training time.
 */
inputLength?: number
}
/** Configuration for TensorFlow-Lite wakeword detection */
export interface WakewordConfig {
/**
 * The "filter" Tensorflow-Lite model. If specified, detect and encode are also required.
 *
 * This field accepts 2 types of values.
 * 1. A string representing a remote URL from which to download and cache the file (presumably from a CDN).
 * 2. A source object retrieved by a `require` or `import` (e.g. `filter: require('./filter.tflite')`)
 *
 * The filter model is used to calculate a mel spectrogram frame from the linear STFT;
 * its inputs should be shaped [fft-width], and its outputs [mel-width]
 */
filter: string | RequireSource
/**
 * The "detect" Tensorflow-Lite model. If specified, filter and encode are also required.
 *
 * This field accepts 2 types of values.
 * 1. A string representing a remote URL from which to download and cache the file (presumably from a CDN).
 * 2. A source object retrieved by a `require` or `import` (e.g. `detect: require('./detect.tflite')`)
 *
 * The detect model makes the final wakeword classification;
 * its inputs should be shaped [encode-length, encode-width],
 * and its outputs a posterior probability in the range [0, 1]
 * (compared against `threshold` to trigger the pipeline)
 *
 * NOTE(review): the original comments for `detect` and `encode` appeared
 * swapped (the autoregressive/state description belongs to `encode`);
 * verify against the Spokestack wakeword model documentation.
 */
detect: string | RequireSource
/**
 * The "encode" Tensorflow-Lite model. If specified, filter and detect are also required.
 *
 * This field accepts 2 types of values.
 * 1. A string representing a remote URL from which to download and cache the file (presumably from a CDN).
 * 2. A source object retrieved by a `require` or `import` (e.g. `encode: require('./encode.tflite')`)
 *
 * The encode model is used to perform each autoregressive step over the mel frames;
 * its inputs should be shaped [mel-length, mel-width], and its outputs [encode-width],
 * with an additional state input/output shaped [state-width]
 */
encode: string | RequireSource
/**
 * The minimum length of an activation, in milliseconds,
 * used to ignore a VAD deactivation after the wakeword
 */
activeMin?: number
/**
 * The maximum length of an activation, in milliseconds,
 * used to time out the activation
 */
activeMax?: number
/**
 * iOS-only
 *
 * A comma-separated list of wakeword keywords
 * Only necessary when not passing the filter, detect, and encode paths.
 */
wakewords?: string
/**
 * iOS-only
 *
 * Length of time to allow an Apple ASR request to run, in milliseconds.
 * Apple has an undocumented limit of 60000ms per request.
 */
requestTimeout?: number
/**
 * @advanced
 *
 * The threshold of the classifier's posterior output,
 * above which the trigger activates the pipeline, in the range [0, 1]
 */
threshold?: number
/**
 * @advanced
 *
 * The length of the sliding window of encoder output
 * used as an input to the classifier, in milliseconds
 */
encodeLength?: number
/**
 * @advanced
 *
 * The size of the encoder output, in vector units
 */
encodeWidth?: number
/**
 * @advanced
 *
 * The size of the encoder state, in vector units (defaults to wake-encode-width)
 */
stateWidth?: number
/**
 * @advanced
 *
 * The desired linear Root Mean Squared (RMS) signal energy,
 * which is used for signal normalization and should be tuned
 * to the RMS target used during training
 */
rmsTarget?: number
/**
 * @advanced
 *
 * The Exponentially-Weighted Moving Average (EWMA) update
 * rate for the current RMS signal energy (0 for no RMS normalization)
 */
rmsAlpha?: number
/**
 * @advanced
 *
 * The size of the signal window used to calculate the STFT,
 * in number of samples - should be a power of 2 for maximum efficiency
 */
fftWindowSize?: number
/**
 * @advanced
 *
 * Android-only
 *
 * The name of the windowing function to apply to each audio frame
 * before calculating the STFT; currently the "hann" window is supported
 */
fftWindowType?: string
/**
 * @advanced
 *
 * The length of time to skip each time the
 * overlapping STFT is calculated, in milliseconds
 */
fftHopLength?: number
/**
 * @advanced
 *
 * The pre-emphasis filter weight to apply to
 * the normalized audio signal (0 for no pre-emphasis)
 */
preEmphasis?: number
/**
 * @advanced
 *
 * The length of the mel spectrogram window used as
 * an input to the encoder, in milliseconds
 *
 * NOTE(review): the original comment here duplicated fftHopLength's
 * description; this one is taken from the Spokestack wakeword
 * documentation — verify.
 */
melFrameLength?: number
/**
 * @advanced
 *
 * The size of each mel spectrogram frame,
 * in number of filterbank components
 */
melFrameWidth?: number
}
/**
 * Top-level configuration object passed to `Spokestack.initialize()`.
 *
 * Spokestack-iOS reference: https://spokestack.github.io/spokestack-ios/index.html
 * spokestack-android reference: https://javadoc.io/doc/io.spokestack/spokestack-android/latest/index.html
 */
export interface SpokestackConfig {
/**
 * This option is only used when remote URLs are passed to fields such as `wakeword.filter`.
 *
 * Set this to true to allow downloading models over cellular.
 * Note that `Spokestack.initialize()` will still reject the promise if
 * models need to be downloaded but there is no network at all.
 *
 * Ideally, the app will include network handling itself and
 * inform the user about file downloads.
 *
 * Default: false
 */
allowCellularDownloads?: boolean
/**
 * Wakeword and NLU model files are cached internally.
 * Set this to true whenever a model is changed
 * during development to refresh the internal model cache.
 *
 * This affects models passed with `require()` as well
 * as models downloaded from remote URLs.
 *
 * Default: false
 */
refreshModels?: boolean
/**
 * This controls the log level for the underlying native
 * iOS and Android libraries.
 * See the TraceLevel enum for values.
 */
traceLevel?: TraceLevel
/**
 * Most of these options are advanced aside from "profile"
 */
pipeline?: PipelineConfig
/** Only needed if using Spokestack.classify */
nlu?: NLUConfig
/**
 * Only required for wakeword
 * Most options are advanced aside from
 * filter, encode, and detect for specifying config files.
 */
wakeword?: WakewordConfig
}