/
text-to-speech.R
341 lines (294 loc) · 10.2 KB
/
text-to-speech.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
#' Perform text to speech
#'
#' Synthesizes speech synchronously: receive results after all text input has been processed.
#'
#' @param input The text to turn into speech
#' @param output Where to save the speech audio file
#' @param languageCode The language of the voice as a \code{BCP-47} language code
#' @param name Name of the voice, see list via \link{gl_talk_languages} for supported voices. Set to \code{NULL} to make the service choose a voice based on \code{languageCode} and \code{gender}.
#' @param gender The gender of the voice, if available
#' @param audioEncoding Format of the requested audio stream
#' @param speakingRate Speaking rate/speed between \code{0.25} and \code{4.0}
#' @param pitch Speaking pitch between \code{-20.0} and \code{20.0} in semitones.
#' @param volumeGainDb Volume gain in dB
#' @param sampleRateHertz Sample rate for returned audio
#' @param inputType Choose between \code{text} (the default) or SSML markup. The \code{input} text must be SSML markup if you choose \code{ssml}
#' @param effectsProfileIds Optional. An identifier which selects 'audio effects' profiles that are applied on (post synthesized) text to speech. Effects are applied on top of each other in the order they are given
#' @param forceLanguageCode If \code{name} is provided, this will ensure that the passed \code{languageCode} is used instead of being inferred from name. This is necessary for models that require the exact code (en-us, en-gb, ...), not just the two letters shorthand (en, es, ...)
#'
#' @details
#'
#' Requires the Cloud Text-To-Speech API to be activated for your Google Cloud project.
#'
#' Supported voices are here \url{https://cloud.google.com/text-to-speech/docs/voices} and can be imported into R via \link{gl_talk_languages}
#'
#' To play the audio in code via a browser see \link{gl_talk_player}
#'
#' To use Speech Synthesis Markup Language (SSML) select \code{inputType=ssml} - more details on using this to insert pauses, sounds and breaks in your audio can be found here: \url{https://cloud.google.com/text-to-speech/docs/ssml}
#'
#' To use audio profiles, supply a character vector of the available audio profiles listed here: \url{https://cloud.google.com/text-to-speech/docs/audio-profiles} - the audio profiles are applied in the order given. For instance \code{effectsProfileIds="wearable-class-device"} will optimise output for smart watches, \code{effectsProfileIds=c("wearable-class-device","telephony-class-application")} will apply sound filters optimised for smart watches, then telephonic devices.
#'
#' @seealso \url{https://cloud.google.com/text-to-speech/docs/}
#'
#' @return The file output name you supplied as \code{output}
#' @examples
#'
#' \dontrun{
#' library(magrittr)
#' gl_talk("The rain in spain falls mainly in the plain",
#'         output = "output.wav")
#'
#' gl_talk("Testing my new audio player") %>% gl_talk_player()
#'
#' # using SSML
#' gl_talk('<speak>The <say-as interpret-as=\"characters\">SSML</say-as>
#'   standard <break time=\"1s\"/>is defined by the
#'   <sub alias=\"World Wide Web Consortium\">W3C</sub>.</speak>',
#'         inputType = "ssml")
#'
#' # using effects profiles
#' gl_talk("This sounds great on headphones",
#'         effectsProfileIds = "headphone-class-device")
#'
#' }
#'
#' @export
#' @importFrom googleAuthR gar_api_generator
#' @import assertthat
gl_talk <- function(input,
                    output = "output.wav",
                    languageCode = "en",
                    gender = c("SSML_VOICE_GENDER_UNSPECIFIED", "MALE","FEMALE","NEUTRAL"),
                    name = NULL,
                    audioEncoding = c("LINEAR16","MP3","OGG_OPUS"),
                    speakingRate = 1,
                    pitch = 0,
                    volumeGainDb = 0,
                    sampleRateHertz = NULL,
                    inputType = c("text","ssml"),
                    effectsProfileIds = NULL,
                    forceLanguageCode = FALSE){

  inputType     <- match.arg(inputType)
  gender        <- match.arg(gender)
  audioEncoding <- match.arg(audioEncoding)

  # validate inputs against the API's documented limits
  # (5000 character maximum per synthesis request)
  assert_that(
    is.string(input),
    nchar(input) <= 5000L,
    is.string(output),
    is.string(languageCode),
    speakingRate >= 0.25,
    speakingRate <= 4.0,
    pitch >= -20.0,
    pitch <= 20.0,
    is.logical(forceLanguageCode)
  )

  if(!is.null(name)){
    assert_that(is.string(name))
    # Voice names embed their language (e.g. "en-GB-Standard-A"); unless the
    # caller insists on their own languageCode, derive it from the name.
    if(!forceLanguageCode){
      languageCode <- substr(name, 1, 2)
    }
    # drop gender when an explicit voice name is given - the name already
    # determines the voice, so gender is not sent
    gender <- NULL
  }

  if(!is.null(sampleRateHertz)){
    assert_that(is.scalar(sampleRateHertz))
  }

  ## warn if the file extension of output doesn't match audioEncoding
  file_ext <- switch(audioEncoding,
                     LINEAR16 = "wav",
                     MP3 = "mp3",
                     OGG_OPUS = "ogg|opus|mka|mkv|webm")
  # wrap the alternatives in a group so both anchors apply to every
  # alternative: "\\.(ogg|opus)$" rather than the buggy "\\.ogg|opus$",
  # which would match e.g. "opus" anywhere in the filename
  if(!grepl(paste0("\\.(", file_ext, ")$"), output)){
    warning("Output file extension (",
            output,
            ") does not match audio encoding (",
            audioEncoding,")")
  }

  if(inputType == "ssml"){
    # allow optional leading whitespace and attributes on the <speak> root
    # element (e.g. '<speak version="1.1">'), which are valid SSML
    assert_that(grepl("^\\s*<speak", input))
    the_input <- list(
      ssml = input
    )
  } else {
    the_input <- list(
      text = input
    )
  }

  if(!is.null(effectsProfileIds)){
    assert_that(is.character(effectsProfileIds))
    # the API expects a JSON array; an unboxed list serialises correctly
    effectsProfileIds <- as.list(effectsProfileIds)
  }

  body <- list(
    input = the_input,
    voice = list(
      languageCode = languageCode,
      name = name,
      ssmlGender = gender
    ),
    audioConfig = list(
      audioEncoding = audioEncoding,
      speakingRate = speakingRate,
      pitch = pitch,
      volumeGainDb = volumeGainDb,
      sampleRateHertz = sampleRateHertz,
      effectsProfileId = effectsProfileIds
    )
  )
  # strip NULL entries so optional fields are omitted from the request JSON
  body <- rmNullObs(body)

  my_message("Calling text-to-speech API: ", input, level = 3)

  call_api <- gar_api_generator("https://texttospeech.googleapis.com/v1beta1/text:synthesize",
                                "POST",
                                data_parse_function = audio_decode)

  o <- call_api(the_body = body)

  ## write the decoded raw audio bytes to the output file
  writeBin(o, con = output)

  output
}
# Decode the API response: the synthesized audio arrives base64-encoded in
# the 'audioContent' field; return it as a raw vector ready for writeBin().
audio_decode <- function(x){
  encoded <- x[["audioContent"]]
  jsonlite::base64_dec(encoded)
}
#' Get a list of voices available for text to speech
#'
#' Returns a list of voices supported for synthesis.
#'
#' @param languageCode A \code{BCP-47} language tag. If specified, will only return voices that can be used to synthesize this languageCode
#'
#' @import assertthat
#' @importFrom googleAuthR gar_api_generator
#' @export
gl_talk_languages <- function(languageCode = NULL){

  # only send the languageCode filter when the caller supplied one
  pars <- NULL
  if(!is.null(languageCode)){
    assert_that(is.string(languageCode))
    pars <- list(languageCode = languageCode)
  }

  call_api <- gar_api_generator("https://texttospeech.googleapis.com/v1beta1/voices",
                                "GET",
                                pars_args = pars,
                                data_parse_function = parse_talk_language)

  call_api()
}
# Turn the raw voices API response into a tidy tibble, one row per voice.
parse_talk_language <- function(x){

  # an empty response means no voices matched the query
  if(length(x) == 0){
    my_message("No languages found")
    return(tibble())
  }

  voices <- x$voices

  # each voice may list several language codes; keep only the first
  first_codes <- map_chr(voices$languageCodes, 1)

  tibble(
    languageCodes          = first_codes,
    name                   = voices$name,
    ssmlGender             = voices$ssmlGender,
    naturalSampleRateHertz = voices$naturalSampleRateHertz
  )
}
#' Play audio in a browser
#'
#' This uses HTML5 audio tags to play audio in your browser
#'
#' @param audio The file location of the audio file. Must be supported by HTML5
#' @param html The html file location that will be created host the audio
#'
#' @details
#'
#' A platform neutral way to play audio is not easy, so this uses your browser to play it instead.
#'
#' @examples
#'
#' \dontrun{
#'
#' gl_talk("Testing my new audio player") %>% gl_talk_player()
#'
#' }
#'
#' @export
#' @importFrom utils browseURL
gl_talk_player <- function(audio = "output.wav",
                           html = "player.html"){
  # Write a minimal HTML5 page that autoplays the audio file, then open it
  # in the system default browser.
  # NOTE(review): the src attribute is interpolated unescaped - a path
  # containing '"' would break the markup; fine for typical file names.
writeLines(sprintf('<html><body>
<audio controls autoplay>
<source src="%s">
</audio>
</body></html>',
audio),
html)
  # side effect: launches the browser; its return value is discarded
utils::browseURL(html)
}
#' Speak in Shiny module (ui)
#'
#' @param id The Shiny id
#'
#' @details
#'
#' Shiny Module for use with \link{gl_talk_shiny}.
#'
#' @export
gl_talk_shinyUI <- function(id){
  # Namespace the "talk" output slot so it pairs with the server module;
  # NS(id, "talk") is equivalent to NS(id)("talk")
  shiny::htmlOutput(shiny::NS(id, "talk"))
}
#' Speak in Shiny module (server)
#'
#' Call via \code{shiny::callModule(gl_talk_shiny, "your_id")}
#'
#' @param input shiny input
#' @param output shiny output
#' @param session shiny session
#' @param transcript The (reactive) text to talk
#' @inheritDotParams gl_talk
#' @param autoplay passed to the HTML audio player - default \code{TRUE} plays on load
#' @param controls passed to the HTML audio player - default \code{TRUE} shows controls
#' @param loop passed to the HTML audio player - default \code{FALSE} does not loop
#' @param keep_wav keep the generated wav files if TRUE.
#' @export
#' @import assertthat
gl_talk_shiny <- function(input, output, session,
                          transcript, ...,
                          autoplay = TRUE, controls = TRUE, loop = FALSE,
                          keep_wav = FALSE){

  assert_that(
    is.flag(autoplay),
    is.flag(controls),
    is.flag(loop),     # was missing from validation; now checked like the others
    is.flag(keep_wav)
  )

  # shiny::tags$audio renders NA as a bare attribute (present, no value)
  # and NULL as the attribute being absent entirely
  if(autoplay){
    autoplay <- NA
  } else {
    autoplay <- NULL
  }

  if(controls){
    controls <- NA
  } else {
    controls <- NULL
  }

  if(loop){
    loop <- NA
  } else {
    loop <- NULL
  }

  # make a www folder to host the audio file
  talk_file <- shiny::reactive({

    # to ensure this fires each new transcript
    shiny::req(transcript())

    # make www folder if it doesn't exist
    if(!dir.exists("www")){
      dir.create("www")
    }

    # clean up any existing wav files
    if(!keep_wav){
      unlink(list.files("www", pattern = ".wav$", full.names = TRUE))
    }

    # a fresh random filename per transcript prevents browser caching
    basename(tempfile(fileext = ".wav"))
  })

  output$talk <- shiny::renderUI({
    shiny::req(talk_file())

    # synthesize the speech into www/ as a side effect; the audio tag
    # below references the resulting file
    gl_talk(transcript(),
            output = file.path("www", talk_file()),
            ...)

    ## the audio file sits in folder www, but the audio file must be referenced without www
    shiny::tags$audio(autoplay = autoplay,
                      controls = controls,
                      loop = loop,
                      shiny::tags$source(src = talk_file()))
  })

}