diff --git a/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiAudioSpeechProperties.java b/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiAudioSpeechProperties.java index e1b6b171875..dfd0ea02342 100644 --- a/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiAudioSpeechProperties.java +++ b/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiAudioSpeechProperties.java @@ -38,7 +38,7 @@ public class OpenAiAudioSpeechProperties extends OpenAiParentProperties { public static final String DEFAULT_SPEECH_MODEL = OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.getValue(); - private static final Float SPEED = 1.0f; + private static final Double SPEED = 1.0; private static final String VOICE = OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue(); diff --git a/models/spring-ai-elevenlabs/src/main/java/org/springframework/ai/elevenlabs/ElevenLabsTextToSpeechModel.java b/models/spring-ai-elevenlabs/src/main/java/org/springframework/ai/elevenlabs/ElevenLabsTextToSpeechModel.java index 68ed07568a8..662d0efcef4 100644 --- a/models/spring-ai-elevenlabs/src/main/java/org/springframework/ai/elevenlabs/ElevenLabsTextToSpeechModel.java +++ b/models/spring-ai-elevenlabs/src/main/java/org/springframework/ai/elevenlabs/ElevenLabsTextToSpeechModel.java @@ -23,7 +23,6 @@ import reactor.core.publisher.Flux; import org.springframework.ai.audio.tts.Speech; -import org.springframework.ai.audio.tts.StreamingTextToSpeechModel; import org.springframework.ai.audio.tts.TextToSpeechModel; import org.springframework.ai.audio.tts.TextToSpeechPrompt; import org.springframework.ai.audio.tts.TextToSpeechResponse; @@ -35,12 +34,11 @@ import org.springframework.util.MultiValueMap; 
/** - * Implementation of the {@link TextToSpeechModel} and {@link StreamingTextToSpeechModel} - * interfaces + * Implementation of the {@link TextToSpeechModel} interface for ElevenLabs TTS API. * * @author Alexandros Pappas */ -public class ElevenLabsTextToSpeechModel implements TextToSpeechModel, StreamingTextToSpeechModel { +public class ElevenLabsTextToSpeechModel implements TextToSpeechModel { private final Logger logger = LoggerFactory.getLogger(getClass()); diff --git a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioSpeechModel.java b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioSpeechModel.java index 759eac07e09..3f5e250e0de 100644 --- a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioSpeechModel.java +++ b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioSpeechModel.java @@ -16,18 +16,20 @@ package org.springframework.ai.openai; +import java.util.List; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; import reactor.core.publisher.Flux; +import org.springframework.ai.audio.tts.Speech; +import org.springframework.ai.audio.tts.TextToSpeechModel; +import org.springframework.ai.audio.tts.TextToSpeechOptions; +import org.springframework.ai.audio.tts.TextToSpeechPrompt; +import org.springframework.ai.audio.tts.TextToSpeechResponse; import org.springframework.ai.chat.metadata.RateLimit; import org.springframework.ai.openai.api.OpenAiAudioApi; import org.springframework.ai.openai.api.OpenAiAudioApi.SpeechRequest.AudioResponseFormat; -import org.springframework.ai.openai.audio.speech.Speech; -import org.springframework.ai.openai.audio.speech.SpeechModel; -import org.springframework.ai.openai.audio.speech.SpeechPrompt; -import org.springframework.ai.openai.audio.speech.SpeechResponse; -import org.springframework.ai.openai.audio.speech.StreamingSpeechModel; import 
org.springframework.ai.openai.metadata.audio.OpenAiAudioSpeechResponseMetadata; import org.springframework.ai.openai.metadata.support.OpenAiResponseHeaderExtractor; import org.springframework.ai.retry.RetryUtils; @@ -46,13 +48,13 @@ * @see OpenAiAudioApi * @since 1.0.0-M1 */ -public class OpenAiAudioSpeechModel implements SpeechModel, StreamingSpeechModel { +public class OpenAiAudioSpeechModel implements TextToSpeechModel { /** * The speed of the default voice synthesis. * @see OpenAiAudioSpeechOptions */ - private static final Float SPEED = 1.0f; + private static final Double SPEED = 1.0; private final Logger logger = LoggerFactory.getLogger(getClass()); @@ -118,14 +120,14 @@ public OpenAiAudioSpeechModel(OpenAiAudioApi audioApi, OpenAiAudioSpeechOptions @Override public byte[] call(String text) { - SpeechPrompt speechRequest = new SpeechPrompt(text); - return call(speechRequest).getResult().getOutput(); + TextToSpeechPrompt prompt = new TextToSpeechPrompt(text); + return call(prompt).getResult().getOutput(); } @Override - public SpeechResponse call(SpeechPrompt speechPrompt) { + public TextToSpeechResponse call(TextToSpeechPrompt prompt) { - OpenAiAudioApi.SpeechRequest speechRequest = createRequest(speechPrompt); + OpenAiAudioApi.SpeechRequest speechRequest = createRequest(prompt); ResponseEntity speechEntity = this.retryTemplate .execute(ctx -> this.audioApi.createSpeech(speechRequest)); @@ -134,48 +136,42 @@ public SpeechResponse call(SpeechPrompt speechPrompt) { if (speech == null) { logger.warn("No speech response returned for speechRequest: {}", speechRequest); - return new SpeechResponse(new Speech(new byte[0])); + return new TextToSpeechResponse(List.of(new Speech(new byte[0]))); } RateLimit rateLimits = OpenAiResponseHeaderExtractor.extractAiResponseHeaders(speechEntity); - return new SpeechResponse(new Speech(speech), new OpenAiAudioSpeechResponseMetadata(rateLimits)); + return new TextToSpeechResponse(List.of(new Speech(speech)), new 
OpenAiAudioSpeechResponseMetadata(rateLimits)); } /** * Streams the audio response for the given speech prompt. - * @param speechPrompt The speech prompt containing the text and options for speech + * @param prompt The speech prompt containing the text and options for speech * synthesis. - * @return A Flux of SpeechResponse objects containing the streamed audio and + * @return A Flux of TextToSpeechResponse objects containing the streamed audio and * metadata. */ @Override - public Flux stream(SpeechPrompt speechPrompt) { + public Flux stream(TextToSpeechPrompt prompt) { - OpenAiAudioApi.SpeechRequest speechRequest = createRequest(speechPrompt); + OpenAiAudioApi.SpeechRequest speechRequest = createRequest(prompt); Flux> speechEntity = this.retryTemplate .execute(ctx -> this.audioApi.stream(speechRequest)); - return speechEntity.map(entity -> new SpeechResponse(new Speech(entity.getBody()), + return speechEntity.map(entity -> new TextToSpeechResponse(List.of(new Speech(entity.getBody())), new OpenAiAudioSpeechResponseMetadata(OpenAiResponseHeaderExtractor.extractAiResponseHeaders(entity)))); } - private OpenAiAudioApi.SpeechRequest createRequest(SpeechPrompt request) { - OpenAiAudioSpeechOptions options = this.defaultOptions; - - if (request.getOptions() != null) { - if (request.getOptions() instanceof OpenAiAudioSpeechOptions runtimeOptions) { - options = this.merge(runtimeOptions, options); - } - else { - throw new IllegalArgumentException("Prompt options are not of type SpeechOptions: " - + request.getOptions().getClass().getSimpleName()); - } - } + private OpenAiAudioApi.SpeechRequest createRequest(TextToSpeechPrompt prompt) { + OpenAiAudioSpeechOptions runtimeOptions = (prompt + .getOptions() instanceof OpenAiAudioSpeechOptions openAiAudioSpeechOptions) ? openAiAudioSpeechOptions + : null; + OpenAiAudioSpeechOptions options = (runtimeOptions != null) ? 
this.merge(runtimeOptions, this.defaultOptions) + : this.defaultOptions; String input = StringUtils.hasText(options.getInput()) ? options.getInput() - : request.getInstructions().getText(); + : prompt.getInstructions().getText(); OpenAiAudioApi.SpeechRequest.Builder requestBuilder = OpenAiAudioApi.SpeechRequest.builder() .model(options.getModel()) @@ -187,6 +183,11 @@ private OpenAiAudioApi.SpeechRequest createRequest(SpeechPrompt request) { return requestBuilder.build(); } + @Override + public TextToSpeechOptions getDefaultOptions() { + return this.defaultOptions; + } + private OpenAiAudioSpeechOptions merge(OpenAiAudioSpeechOptions source, OpenAiAudioSpeechOptions target) { OpenAiAudioSpeechOptions.Builder mergedBuilder = OpenAiAudioSpeechOptions.builder(); diff --git a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioSpeechOptions.java b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioSpeechOptions.java index 7b293a104b7..a7c0d771a54 100644 --- a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioSpeechOptions.java +++ b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioSpeechOptions.java @@ -19,7 +19,7 @@ import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.annotation.JsonProperty; -import org.springframework.ai.model.ModelOptions; +import org.springframework.ai.audio.tts.TextToSpeechOptions; import org.springframework.ai.openai.api.OpenAiAudioApi.SpeechRequest.AudioResponseFormat; import org.springframework.ai.openai.api.OpenAiAudioApi.SpeechRequest.Voice; @@ -33,7 +33,7 @@ * @since 1.0.0-M1 */ @JsonInclude(JsonInclude.Include.NON_NULL) -public class OpenAiAudioSpeechOptions implements ModelOptions { +public class OpenAiAudioSpeechOptions implements TextToSpeechOptions { /** * ID of the model to use for generating the audio. 
For OpenAI's TTS API, use one of @@ -67,7 +67,7 @@ public class OpenAiAudioSpeechOptions implements ModelOptions { * 4.0 (fastest). Defaults to 1 (normal) */ @JsonProperty("speed") - private Float speed; + private Double speed; public static Builder builder() { return new Builder(); @@ -109,14 +109,34 @@ public void setResponseFormat(AudioResponseFormat responseFormat) { this.responseFormat = responseFormat; } - public Float getSpeed() { + @Override + public Double getSpeed() { return this.speed; } - public void setSpeed(Float speed) { + public void setSpeed(Double speed) { this.speed = speed; } + // TextToSpeechOptions interface methods + + @Override + public String getFormat() { + return (this.responseFormat != null) ? this.responseFormat.name().toLowerCase() : null; + } + + @Override + @SuppressWarnings("unchecked") + public OpenAiAudioSpeechOptions copy() { + return OpenAiAudioSpeechOptions.builder() + .model(this.model) + .input(this.input) + .voice(this.voice) + .responseFormat(this.responseFormat) + .speed(this.speed) + .build(); + } + @Override public int hashCode() { final int prime = 31; @@ -217,7 +237,7 @@ public Builder responseFormat(AudioResponseFormat responseFormat) { return this; } - public Builder speed(Float speed) { + public Builder speed(Double speed) { this.options.speed = speed; return this; } diff --git a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/api/OpenAiAudioApi.java b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/api/OpenAiAudioApi.java index cd89852d244..a79c08227ac 100644 --- a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/api/OpenAiAudioApi.java +++ b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/api/OpenAiAudioApi.java @@ -396,7 +396,7 @@ public record SpeechRequest( @JsonProperty("input") String input, @JsonProperty("voice") String voice, @JsonProperty("response_format") AudioResponseFormat responseFormat, - @JsonProperty("speed") Float 
speed) { + @JsonProperty("speed") Double speed) { // @formatter:on public static Builder builder() { @@ -491,7 +491,7 @@ public static final class Builder { private AudioResponseFormat responseFormat = AudioResponseFormat.MP3; - private Float speed; + private Double speed; public Builder model(String model) { this.model = model; @@ -518,7 +518,7 @@ public Builder responseFormat(AudioResponseFormat responseFormat) { return this; } - public Builder speed(Float speed) { + public Builder speed(Double speed) { this.speed = speed; return this; } diff --git a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/audio/speech/Speech.java b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/audio/speech/Speech.java deleted file mode 100644 index 66e8dd53c23..00000000000 --- a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/audio/speech/Speech.java +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright 2023-2024 the original author or authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.springframework.ai.openai.audio.speech; - -import java.util.Arrays; -import java.util.Objects; - -import org.springframework.ai.model.ModelResult; -import org.springframework.ai.openai.metadata.audio.OpenAiAudioSpeechMetadata; -import org.springframework.lang.Nullable; - -/** - * The Speech class represents the result of speech synthesis from an AI model. 
It - * implements the ModelResult interface with the output type of byte array. - * - * @author Ahmed Yousri - * @since 1.0.0-M1 - * @deprecated Use {@link org.springframework.ai.audio.tts.Speech} from the core package - * instead. This class will be removed in a future release. - */ -@Deprecated -public class Speech implements ModelResult { - - private final byte[] audio; - - private OpenAiAudioSpeechMetadata speechMetadata; - - public Speech(byte[] audio) { - this.audio = audio; - } - - @Override - public byte[] getOutput() { - return this.audio; - } - - @Override - public OpenAiAudioSpeechMetadata getMetadata() { - return this.speechMetadata != null ? this.speechMetadata : OpenAiAudioSpeechMetadata.NULL; - } - - public Speech withSpeechMetadata(@Nullable OpenAiAudioSpeechMetadata speechMetadata) { - this.speechMetadata = speechMetadata; - return this; - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (!(o instanceof Speech that)) { - return false; - } - return Arrays.equals(this.audio, that.audio) && Objects.equals(this.speechMetadata, that.speechMetadata); - } - - @Override - public int hashCode() { - return Objects.hash(Arrays.hashCode(this.audio), this.speechMetadata); - } - - @Override - public String toString() { - return "Speech{" + "text=" + this.audio + ", speechMetadata=" + this.speechMetadata + '}'; - } - -} diff --git a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/audio/speech/SpeechMessage.java b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/audio/speech/SpeechMessage.java deleted file mode 100644 index 8de55fe4f11..00000000000 --- a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/audio/speech/SpeechMessage.java +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright 2023-2024 the original author or authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.springframework.ai.openai.audio.speech; - -import java.util.Objects; - -/** - * The {@link SpeechMessage} class represents a single text message to be converted to - * speech by the OpenAI TTS API. - * - * @author Ahmed Yousri - * @since 1.0.0-M1 - * @deprecated Use {@link org.springframework.ai.audio.tts.TextToSpeechMessage} from the - * core package instead. This class will be removed in a future release. - */ -@Deprecated -public class SpeechMessage { - - private String text; - - /** - * Constructs a new {@link SpeechMessage} object with the given text. - * @param text the text to be converted to speech - */ - public SpeechMessage(String text) { - this.text = text; - } - - /** - * Returns the text of this speech message. - * @return the text of this speech message - */ - public String getText() { - return this.text; - } - - /** - * Sets the text of this speech message. 
- * @param text the new text for this speech message - */ - public void setText(String text) { - this.text = text; - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (!(o instanceof SpeechMessage that)) { - return false; - } - return Objects.equals(this.text, that.text); - } - - @Override - public int hashCode() { - return Objects.hash(this.text); - } - -} diff --git a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/audio/speech/SpeechModel.java b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/audio/speech/SpeechModel.java deleted file mode 100644 index 98161933814..00000000000 --- a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/audio/speech/SpeechModel.java +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright 2023-2024 the original author or authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.springframework.ai.openai.audio.speech; - -import org.springframework.ai.model.Model; - -/** - * The {@link SpeechModel} interface provides a way to interact with the OpenAI - * Text-to-Speech (TTS) API. It allows you to convert text input into lifelike spoken - * audio. - * - * @author Ahmed Yousri - * @since 1.0.0-M1 - * @deprecated Use {@link org.springframework.ai.audio.tts.TextToSpeechModel} from the - * core package instead. This interface will be removed in a future release. 
- */ -@Deprecated -@FunctionalInterface -public interface SpeechModel extends Model { - - /** - * Generates spoken audio from the provided text message. - * @param message the text message to be converted to audio - * @return the resulting audio bytes - */ - default byte[] call(String message) { - SpeechPrompt prompt = new SpeechPrompt(message); - return call(prompt).getResult().getOutput(); - } - - /** - * Sends a speech request to the OpenAI TTS API and returns the resulting speech - * response. - * @param request the speech prompt containing the input text and other parameters - * @return the speech response containing the generated audio - */ - SpeechResponse call(SpeechPrompt request); - -} diff --git a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/audio/speech/SpeechPrompt.java b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/audio/speech/SpeechPrompt.java deleted file mode 100644 index bfce1e311ee..00000000000 --- a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/audio/speech/SpeechPrompt.java +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright 2023-2024 the original author or authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.springframework.ai.openai.audio.speech; - -import java.util.Objects; - -import org.springframework.ai.model.ModelOptions; -import org.springframework.ai.model.ModelRequest; -import org.springframework.ai.openai.OpenAiAudioSpeechOptions; - -/** - * The {@link SpeechPrompt} class represents a request to the OpenAI Text-to-Speech (TTS) - * API. It contains a list of {@link SpeechMessage} objects, each representing a piece of - * text to be converted to speech. - * - * @author Ahmed Yousri - * @since 1.0.0-M1 - * @deprecated Use {@link org.springframework.ai.audio.tts.TextToSpeechPrompt} from the - * core package instead. This class will be removed in a future release. - */ -@Deprecated -public class SpeechPrompt implements ModelRequest { - - private final SpeechMessage message; - - private OpenAiAudioSpeechOptions speechOptions; - - public SpeechPrompt(String instructions) { - this(new SpeechMessage(instructions), OpenAiAudioSpeechOptions.builder().build()); - } - - public SpeechPrompt(String instructions, OpenAiAudioSpeechOptions speechOptions) { - this(new SpeechMessage(instructions), speechOptions); - } - - public SpeechPrompt(SpeechMessage speechMessage) { - this(speechMessage, OpenAiAudioSpeechOptions.builder().build()); - } - - public SpeechPrompt(SpeechMessage speechMessage, OpenAiAudioSpeechOptions speechOptions) { - this.message = speechMessage; - this.speechOptions = speechOptions; - } - - @Override - public SpeechMessage getInstructions() { - return this.message; - } - - @Override - public ModelOptions getOptions() { - return this.speechOptions; - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (!(o instanceof SpeechPrompt that)) { - return false; - } - return Objects.equals(this.speechOptions, that.speechOptions) && Objects.equals(this.message, that.message); - } - - @Override - public int hashCode() { - return Objects.hash(this.speechOptions, this.message); - } - -} diff --git 
a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/audio/speech/SpeechResponse.java b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/audio/speech/SpeechResponse.java deleted file mode 100644 index 9662764aec5..00000000000 --- a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/audio/speech/SpeechResponse.java +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright 2023-2024 the original author or authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.springframework.ai.openai.audio.speech; - -import java.util.Collections; -import java.util.List; -import java.util.Objects; - -import org.springframework.ai.model.ModelResponse; -import org.springframework.ai.openai.metadata.audio.OpenAiAudioSpeechResponseMetadata; - -/** - * Creates a new instance of SpeechResponse with the given speech result. - * - * @author Ahmed Yousri - * @since 1.0.0-M1 - * @deprecated Use {@link org.springframework.ai.audio.tts.TextToSpeechResponse} from the - * core package instead. This class will be removed in a future release. - */ -@Deprecated -public class SpeechResponse implements ModelResponse { - - private final Speech speech; - - private final OpenAiAudioSpeechResponseMetadata speechResponseMetadata; - - /** - * Creates a new instance of SpeechResponse with the given speech result. 
- * @param speech the speech result to be set in the SpeechResponse - * @see Speech - */ - public SpeechResponse(Speech speech) { - this(speech, OpenAiAudioSpeechResponseMetadata.NULL); - } - - /** - * Creates a new instance of SpeechResponse with the given speech result and speech - * response metadata. - * @param speech the speech result to be set in the SpeechResponse - * @param speechResponseMetadata the speech response metadata to be set in the - * SpeechResponse - * @see Speech - * @see OpenAiAudioSpeechResponseMetadata - */ - public SpeechResponse(Speech speech, OpenAiAudioSpeechResponseMetadata speechResponseMetadata) { - this.speech = speech; - this.speechResponseMetadata = speechResponseMetadata; - } - - @Override - public Speech getResult() { - return this.speech; - } - - @Override - public List getResults() { - return Collections.singletonList(this.speech); - } - - @Override - public OpenAiAudioSpeechResponseMetadata getMetadata() { - return this.speechResponseMetadata; - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (!(o instanceof SpeechResponse that)) { - return false; - } - return Objects.equals(this.speech, that.speech) - && Objects.equals(this.speechResponseMetadata, that.speechResponseMetadata); - } - - @Override - public int hashCode() { - return Objects.hash(this.speech, this.speechResponseMetadata); - } - -} diff --git a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/audio/speech/StreamingSpeechModel.java b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/audio/speech/StreamingSpeechModel.java deleted file mode 100644 index fa8daadf159..00000000000 --- a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/audio/speech/StreamingSpeechModel.java +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright 2023-2024 the original author or authors. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.springframework.ai.openai.audio.speech; - -import reactor.core.publisher.Flux; - -import org.springframework.ai.model.StreamingModel; - -/** - * The {@link StreamingSpeechModel} interface provides a way to interact with the OpenAI - * Text-to-Speech (TTS) API using a streaming approach, allowing you to receive the - * generated audio in a real-time fashion. - * - * @author Ahmed Yousri - * @since 1.0.0-M1 - * @deprecated Use {@link org.springframework.ai.audio.tts.StreamingTextToSpeechModel} - * from the core package instead. This interface will be removed in a future release. - */ -@Deprecated -@FunctionalInterface -public interface StreamingSpeechModel extends StreamingModel { - - /** - * Generates a stream of audio bytes from the provided text message. - * @param message the text message to be converted to audio - * @return a Flux of audio bytes representing the generated speech - */ - default Flux stream(String message) { - SpeechPrompt prompt = new SpeechPrompt(message); - return stream(prompt).map(SpeechResponse::getResult).map(Speech::getOutput); - } - - /** - * Sends a speech request to the OpenAI TTS API and returns a stream of the resulting - * speech responses. 
- * @param prompt the speech prompt containing the input text and other parameters - * @return a Flux of speech responses, each containing a portion of the generated - * audio - */ - @Override - Flux stream(SpeechPrompt prompt); - -} diff --git a/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelIT.java b/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelIT.java index 67310709492..dcc8c13c5c0 100644 --- a/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelIT.java +++ b/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelIT.java @@ -24,6 +24,8 @@ import org.junit.jupiter.params.provider.ValueSource; import reactor.core.publisher.Flux; +import org.springframework.ai.audio.tts.TextToSpeechPrompt; +import org.springframework.ai.audio.tts.TextToSpeechResponse; import org.springframework.ai.openai.OpenAiAudioSpeechOptions; import org.springframework.ai.openai.OpenAiTestConfiguration; import org.springframework.ai.openai.api.OpenAiAudioApi; @@ -41,7 +43,7 @@ @EnabledIfEnvironmentVariable(named = "OPENAI_API_KEY", matches = ".+") class OpenAiSpeechModelIT extends AbstractIT { - private static final Float SPEED = 1.0f; + private static final Double SPEED = 1.0; @Test void shouldSuccessfullyStreamAudioBytesForEmptyMessage() { @@ -59,16 +61,16 @@ void shouldProduceAudioBytesDirectlyFromMessage() { } @Test - void shouldGenerateNonEmptyMp3AudioFromSpeechPrompt() { + void shouldGenerateNonEmptyMp3AudioFromTextToSpeechPrompt() { OpenAiAudioSpeechOptions speechOptions = OpenAiAudioSpeechOptions.builder() .voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue()) .speed(SPEED) .responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3) .model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.value) .build(); - SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build 
something people love!", - speechOptions); - SpeechResponse response = this.speechModel.call(speechPrompt); + TextToSpeechPrompt speechPrompt = new TextToSpeechPrompt( + "Today is a wonderful day to build something people love!", speechOptions); + TextToSpeechResponse response = this.speechModel.call(speechPrompt); byte[] audioBytes = response.getResult().getOutput(); assertThat(response.getResults()).hasSize(1); assertThat(response.getResults().get(0).getOutput()).isNotEmpty(); @@ -77,16 +79,16 @@ void shouldGenerateNonEmptyMp3AudioFromSpeechPrompt() { } @Test - void shouldGenerateNonEmptyWavAudioFromSpeechPrompt() { + void shouldGenerateNonEmptyWavAudioFromTextToSpeechPrompt() { OpenAiAudioSpeechOptions speechOptions = OpenAiAudioSpeechOptions.builder() .voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY) .speed(SPEED) .responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.WAV) .model(OpenAiAudioApi.TtsModel.TTS_1.value) .build(); - SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!", - speechOptions); - SpeechResponse response = this.speechModel.call(speechPrompt); + TextToSpeechPrompt speechPrompt = new TextToSpeechPrompt( + "Today is a wonderful day to build something people love!", speechOptions); + TextToSpeechResponse response = this.speechModel.call(speechPrompt); byte[] audioBytes = response.getResult().getOutput(); assertThat(response.getResults()).hasSize(1); assertThat(response.getResults().get(0).getOutput()).isNotEmpty(); @@ -102,10 +104,10 @@ void speechRateLimitTest() { .responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3) .model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.value) .build(); - SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!", - speechOptions); - SpeechResponse response = this.speechModel.call(speechPrompt); - OpenAiAudioSpeechResponseMetadata metadata = response.getMetadata(); + TextToSpeechPrompt speechPrompt 
= new TextToSpeechPrompt( + "Today is a wonderful day to build something people love!", speechOptions); + TextToSpeechResponse response = this.speechModel.call(speechPrompt); + OpenAiAudioSpeechResponseMetadata metadata = (OpenAiAudioSpeechResponseMetadata) response.getMetadata(); assertThat(metadata).isNotNull(); assertThat(metadata.getRateLimit()).isNotNull(); assertThat(metadata.getRateLimit().getRequestsLimit()).isPositive(); @@ -114,7 +116,7 @@ void speechRateLimitTest() { } @Test - void shouldStreamNonEmptyResponsesForValidSpeechPrompts() { + void shouldStreamNonEmptyResponsesForValidTextToSpeechPrompts() { OpenAiAudioSpeechOptions speechOptions = OpenAiAudioSpeechOptions.builder() .voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue()) @@ -123,11 +125,11 @@ void shouldStreamNonEmptyResponsesForValidSpeechPrompts() { .model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.value) .build(); - SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!", - speechOptions); - Flux responseFlux = this.speechModel.stream(speechPrompt); + TextToSpeechPrompt speechPrompt = new TextToSpeechPrompt( + "Today is a wonderful day to build something people love!", speechOptions); + Flux responseFlux = this.speechModel.stream(speechPrompt); assertThat(responseFlux).isNotNull(); - List responses = responseFlux.collectList().block(); + List responses = responseFlux.collectList().block(); assertThat(responses).isNotNull(); responses.forEach(response -> // System.out.println("Audio data chunk size: " + @@ -144,9 +146,9 @@ void speechVoicesTest(String voice) { .responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3) .model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.value) .build(); - SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!", - speechOptions); - SpeechResponse response = this.speechModel.call(speechPrompt); + TextToSpeechPrompt speechPrompt = new TextToSpeechPrompt( + 
"Today is a wonderful day to build something people love!", speechOptions); + TextToSpeechResponse response = this.speechModel.call(speechPrompt); byte[] audioBytes = response.getResult().getOutput(); assertThat(response.getResults()).hasSize(1); assertThat(response.getResults().get(0).getOutput()).isNotEmpty(); diff --git a/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelWithSpeechResponseMetadataTests.java b/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelWithSpeechResponseMetadataTests.java index 51df242073d..8cd5836dc3d 100644 --- a/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelWithSpeechResponseMetadataTests.java +++ b/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelWithSpeechResponseMetadataTests.java @@ -22,6 +22,8 @@ import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Test; +import org.springframework.ai.audio.tts.TextToSpeechPrompt; +import org.springframework.ai.audio.tts.TextToSpeechResponse; import org.springframework.ai.model.SimpleApiKey; import org.springframework.ai.openai.OpenAiAudioSpeechModel; import org.springframework.ai.openai.OpenAiAudioSpeechOptions; @@ -51,7 +53,7 @@ @RestClientTest(OpenAiSpeechModelWithSpeechResponseMetadataTests.Config.class) public class OpenAiSpeechModelWithSpeechResponseMetadataTests { - private static final Float SPEED = 1.0f; + private static final Double SPEED = 1.0; private static String TEST_API_KEY = "sk-1234567890"; @@ -78,14 +80,15 @@ void aiResponseContainsImageResponseMetadata() { .model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.value) .build(); - SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!", - speechOptions); - SpeechResponse response = this.openAiSpeechClient.call(speechPrompt); + TextToSpeechPrompt speechPrompt = new 
TextToSpeechPrompt( + "Today is a wonderful day to build something people love!", speechOptions); + TextToSpeechResponse response = this.openAiSpeechClient.call(speechPrompt); byte[] audioBytes = response.getResult().getOutput(); assertThat(audioBytes).hasSizeGreaterThan(0); - OpenAiAudioSpeechResponseMetadata speechResponseMetadata = response.getMetadata(); + OpenAiAudioSpeechResponseMetadata speechResponseMetadata = (OpenAiAudioSpeechResponseMetadata) response + .getMetadata(); assertThat(speechResponseMetadata).isNotNull(); var requestLimit = speechResponseMetadata.getRateLimit(); Long requestsLimit = requestLimit.getRequestsLimit(); diff --git a/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/audio/speech.adoc b/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/audio/speech.adoc index 52de29ff2a2..8ce08918cb3 100644 --- a/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/audio/speech.adoc +++ b/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/audio/speech.adoc @@ -1,9 +1,121 @@ [[Speech]] = Text-To-Speech (TTS) API -Spring AI provides support for the following Text-To-Speech (TTS) providers: +Spring AI provides a unified API for Text-To-Speech (TTS) through the `TextToSpeechModel` and `StreamingTextToSpeechModel` interfaces. This allows you to write portable code that works across different TTS providers. + +== Supported Providers - xref:api/audio/speech/openai-speech.adoc[OpenAI's Speech API] - xref:api/audio/speech/elevenlabs-speech.adoc[Eleven Labs Text-To-Speech API] -Future enhancements may introduce additional providers, at which point a common `TextToSpeechModel` and `StreamingTextToSpeechModel` interface will be extracted. 
\ No newline at end of file +== Common Interface + +All TTS providers implement the following shared interfaces: + +=== TextToSpeechModel + +The `TextToSpeechModel` interface provides methods for converting text to speech: + +[source,java] +---- +public interface TextToSpeechModel extends Model<TextToSpeechPrompt, TextToSpeechResponse>, StreamingTextToSpeechModel { + + /** + * Converts text to speech with default options. + */ + default byte[] call(String text) { + // Default implementation + } + + /** + * Converts text to speech with custom options. + */ + TextToSpeechResponse call(TextToSpeechPrompt prompt); + + /** + * Returns the default options for this model. + */ + default TextToSpeechOptions getDefaultOptions() { + // Default implementation + } +} +---- + +=== StreamingTextToSpeechModel + +The `StreamingTextToSpeechModel` interface provides methods for streaming audio in real-time: + +[source,java] +---- +@FunctionalInterface +public interface StreamingTextToSpeechModel extends StreamingModel<TextToSpeechPrompt, TextToSpeechResponse> { + + /** + * Streams text-to-speech responses with metadata. + */ + Flux<TextToSpeechResponse> stream(TextToSpeechPrompt prompt); + + /** + * Streams audio bytes for the given text.
+ */ + default Flux<byte[]> stream(String text) { + // Default implementation + } +} +---- + +=== TextToSpeechPrompt + +The `TextToSpeechPrompt` class encapsulates the input text and options: + +[source,java] +---- +TextToSpeechPrompt prompt = new TextToSpeechPrompt( + "Hello, this is a text-to-speech example.", + options +); +---- + +=== TextToSpeechResponse + +The `TextToSpeechResponse` class contains the generated audio and metadata: + +[source,java] +---- +TextToSpeechResponse response = model.call(prompt); +byte[] audioBytes = response.getResult().getOutput(); +TextToSpeechResponseMetadata metadata = response.getMetadata(); +---- + +== Writing Portable Code + +The shared interfaces allow you to write code that works with any TTS provider: + +[source,java] +---- +@Service +public class NarrationService { + + private final TextToSpeechModel textToSpeechModel; + + public NarrationService(TextToSpeechModel textToSpeechModel) { + this.textToSpeechModel = textToSpeechModel; + } + + public byte[] narrate(String text) { + return textToSpeechModel.call(text); + } + + public byte[] narrateWithOptions(String text, TextToSpeechOptions options) { + TextToSpeechPrompt prompt = new TextToSpeechPrompt(text, options); + TextToSpeechResponse response = textToSpeechModel.call(prompt); + return response.getResult().getOutput(); + } +} +---- + +This service works seamlessly with OpenAI, ElevenLabs, or any other TTS provider, with the actual implementation determined by your Spring Boot configuration. + +== Provider-Specific Features + +While the shared interfaces provide portability, each provider also offers specific features through provider-specific options classes (e.g., `OpenAiAudioSpeechOptions`, `ElevenLabsSpeechOptions`). These classes implement the `TextToSpeechOptions` interface while adding provider-specific capabilities.
\ No newline at end of file diff --git a/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/audio/speech/openai-speech.adoc b/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/audio/speech/openai-speech.adoc index 0c2758fd556..66f61b6e029 100644 --- a/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/audio/speech/openai-speech.adoc +++ b/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/audio/speech/openai-speech.adoc @@ -102,6 +102,8 @@ TIP: All properties prefixed with `spring.ai.openai.image.options` can be overri The `OpenAiAudioSpeechOptions` class provides the options to use when making a text-to-speech request. On start-up, the options specified by `spring.ai.openai.audio.speech` are used but you can override these at runtime. +The `OpenAiAudioSpeechOptions` class implements the `TextToSpeechOptions` interface, providing both portable and OpenAI-specific configuration options. + For example: [source,java] @@ -110,11 +112,11 @@ OpenAiAudioSpeechOptions speechOptions = OpenAiAudioSpeechOptions.builder() .model("gpt-4o-mini-tts") .voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY) .responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3) - .speed(1.0f) + .speed(1.0) .build(); -SpeechPrompt speechPrompt = new SpeechPrompt("Hello, this is a text-to-speech example.", speechOptions); -SpeechResponse response = openAiAudioSpeechModel.call(speechPrompt); +TextToSpeechPrompt speechPrompt = new TextToSpeechPrompt("Hello, this is a text-to-speech example.", speechOptions); +TextToSpeechResponse response = openAiAudioSpeechModel.call(speechPrompt); ---- == Manual Configuration @@ -152,15 +154,15 @@ var openAiAudioSpeechModel = new OpenAiAudioSpeechModel(openAiAudioApi); var speechOptions = OpenAiAudioSpeechOptions.builder() .responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3) - .speed(1.0f) + .speed(1.0) .model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.value) .build(); -var speechPrompt = new SpeechPrompt("Hello, this is a 
text-to-speech example.", speechOptions); -SpeechResponse response = openAiAudioSpeechModel.call(speechPrompt); +var speechPrompt = new TextToSpeechPrompt("Hello, this is a text-to-speech example.", speechOptions); +TextToSpeechResponse response = openAiAudioSpeechModel.call(speechPrompt); // Accessing metadata (rate limit info) -OpenAiAudioSpeechResponseMetadata metadata = response.getMetadata(); +OpenAiAudioSpeechResponseMetadata metadata = (OpenAiAudioSpeechResponseMetadata) response.getMetadata(); byte[] responseAsBytes = response.getResult().getOutput(); ---- @@ -169,6 +171,8 @@ byte[] responseAsBytes = response.getResult().getOutput(); The Speech API provides support for real-time audio streaming using chunk transfer encoding. This means that the audio is able to be played before the full file has been generated and made accessible. +The `OpenAiAudioSpeechModel` implements the `StreamingTextToSpeechModel` interface, providing both standard and streaming capabilities. + [source,java] ---- var openAiAudioApi = new OpenAiAudioApi() @@ -179,16 +183,79 @@ var openAiAudioSpeechModel = new OpenAiAudioSpeechModel(openAiAudioApi); OpenAiAudioSpeechOptions speechOptions = OpenAiAudioSpeechOptions.builder() .voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY) - .speed(1.0f) + .speed(1.0) .responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3) .model(OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.value) .build(); -SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!", speechOptions); +TextToSpeechPrompt speechPrompt = new TextToSpeechPrompt("Today is a wonderful day to build something people love!", speechOptions); + +Flux<TextToSpeechResponse> responseStream = openAiAudioSpeechModel.stream(speechPrompt); + +// You can also stream raw audio bytes directly +Flux<byte[]> audioByteStream = openAiAudioSpeechModel.stream("Hello, world!"); +---- + +== Migration Guide + +If you're upgrading from the deprecated `SpeechModel` and `SpeechPrompt` classes, 
here's what has changed: + +=== Updated Interfaces + +[cols="1,1"] +|==== +| Deprecated (Removed) | New Interface + +| `SpeechModel` +| `TextToSpeechModel` + +| `StreamingSpeechModel` +| `StreamingTextToSpeechModel` + +| `SpeechPrompt` +| `TextToSpeechPrompt` + +| `SpeechResponse` +| `TextToSpeechResponse` -Flux responseStream = openAiAudioSpeechModel.stream(speechPrompt); +| `Speech` (in `org.springframework.ai.openai.audio.speech`) +| `Speech` (in `org.springframework.ai.audio.tts`) +|==== + +=== Code Changes + +Before (deprecated): +[source,java] +---- +import org.springframework.ai.openai.audio.speech.*; + +SpeechModel model = new OpenAiAudioSpeechModel(audioApi); +SpeechPrompt prompt = new SpeechPrompt("Hello", options); +SpeechResponse response = model.call(prompt); +byte[] audio = response.getResult().getOutput(); +---- + +After (using shared interfaces): +[source,java] +---- +import org.springframework.ai.audio.tts.*; +import org.springframework.ai.openai.OpenAiAudioSpeechModel; + +TextToSpeechModel model = new OpenAiAudioSpeechModel(audioApi); +TextToSpeechPrompt prompt = new TextToSpeechPrompt("Hello", options); +TextToSpeechResponse response = model.call(prompt); +byte[] audio = response.getResult().getOutput(); ---- +The `OpenAiAudioSpeechOptions` class remains unchanged and continues to work with the new interfaces. 
+ +=== Benefits of the Migration + +- **Portability**: Write code once, switch between OpenAI, ElevenLabs, or other TTS providers easily +- **Consistency**: Same patterns as ChatModel and other Spring AI abstractions +- **Type Safety**: Improved type hierarchy with proper interface implementations +- **Future-Proof**: New TTS providers will automatically work with your existing code + == Example Code * The link:https://github.com/spring-projects/spring-ai/blob/main/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelIT.java[OpenAiSpeechModelIT.java] test provides some general examples of how to use the library. diff --git a/spring-ai-model/src/main/java/org/springframework/ai/audio/tts/StreamingTextToSpeechModel.java b/spring-ai-model/src/main/java/org/springframework/ai/audio/tts/StreamingTextToSpeechModel.java index f342b0fb0aa..9d0d558f578 100644 --- a/spring-ai-model/src/main/java/org/springframework/ai/audio/tts/StreamingTextToSpeechModel.java +++ b/spring-ai-model/src/main/java/org/springframework/ai/audio/tts/StreamingTextToSpeechModel.java @@ -25,6 +25,7 @@ * * @author Alexandros Pappas */ +@FunctionalInterface public interface StreamingTextToSpeechModel extends StreamingModel { default Flux stream(String text) { diff --git a/spring-ai-model/src/main/java/org/springframework/ai/audio/tts/TextToSpeechModel.java b/spring-ai-model/src/main/java/org/springframework/ai/audio/tts/TextToSpeechModel.java index 1f417992acd..3aa14f43b4d 100644 --- a/spring-ai-model/src/main/java/org/springframework/ai/audio/tts/TextToSpeechModel.java +++ b/spring-ai-model/src/main/java/org/springframework/ai/audio/tts/TextToSpeechModel.java @@ -24,12 +24,16 @@ * * @author Alexandros Pappas */ -public interface TextToSpeechModel extends Model { +public interface TextToSpeechModel extends Model, StreamingTextToSpeechModel { default byte[] call(String text) { TextToSpeechPrompt prompt = new TextToSpeechPrompt(text); ModelResult result = 
call(prompt).getResult(); - return (result != null) ? result.getOutput() : new byte[0]; + if (result == null) { + return new byte[0]; + } + byte[] output = result.getOutput(); + return (output != null) ? output : new byte[0]; } @Override diff --git a/spring-ai-model/src/test/java/org/springframework/ai/audio/tts/TextToSpeechModelTests.java b/spring-ai-model/src/test/java/org/springframework/ai/audio/tts/TextToSpeechModelTests.java new file mode 100644 index 00000000000..8371b88384a --- /dev/null +++ b/spring-ai-model/src/test/java/org/springframework/ai/audio/tts/TextToSpeechModelTests.java @@ -0,0 +1,202 @@ +/* + * Copyright 2025-2025 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.springframework.ai.audio.tts; + +import org.junit.jupiter.api.Test; +import org.mockito.Mockito; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.AssertionsForClassTypes.assertThatThrownBy; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.anyString; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.ArgumentMatchers.isA; +import static org.mockito.BDDMockito.given; +import static org.mockito.Mockito.doCallRealMethod; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.verifyNoMoreInteractions; + +/** + * Unit Tests for {@link TextToSpeechModel}. 
+ * + * @author Mark Pollack + * @since 1.1.0 + */ +class TextToSpeechModelTests { + + @Test + void callWithStringCallsCallWithPromptAndReturnsAudioCorrectly() { + String inputText = "Hello, world!"; + byte[] expectedAudio = new byte[] { 1, 2, 3, 4, 5 }; + + TextToSpeechModel mockModel = Mockito.mock(TextToSpeechModel.class); + + Speech mockSpeech = Mockito.mock(Speech.class); + given(mockSpeech.getOutput()).willReturn(expectedAudio); + + TextToSpeechResponse response = Mockito.mock(TextToSpeechResponse.class); + given(response.getResult()).willReturn(mockSpeech); + + doCallRealMethod().when(mockModel).call(anyString()); + + given(mockModel.call(any(TextToSpeechPrompt.class))).willAnswer(invocationOnMock -> { + TextToSpeechPrompt prompt = invocationOnMock.getArgument(0); + + assertThat(prompt).isNotNull(); + assertThat(prompt.getInstructions().getText()).isEqualTo(inputText); + + return response; + }); + + byte[] actualAudio = mockModel.call(inputText); + + assertThat(actualAudio).isEqualTo(expectedAudio); + + verify(mockModel, times(1)).call(eq(inputText)); + verify(mockModel, times(1)).call(isA(TextToSpeechPrompt.class)); + verify(response, times(1)).getResult(); + verify(mockSpeech, times(1)).getOutput(); + verifyNoMoreInteractions(mockModel, mockSpeech, response); + } + + @Test + void callWithEmptyStringReturnsEmptyAudio() { + String inputText = ""; + byte[] expectedAudio = new byte[0]; + + TextToSpeechModel mockModel = Mockito.mock(TextToSpeechModel.class); + + Speech mockSpeech = Mockito.mock(Speech.class); + given(mockSpeech.getOutput()).willReturn(expectedAudio); + + TextToSpeechResponse response = Mockito.mock(TextToSpeechResponse.class); + given(response.getResult()).willReturn(mockSpeech); + + doCallRealMethod().when(mockModel).call(anyString()); + given(mockModel.call(any(TextToSpeechPrompt.class))).willReturn(response); + + byte[] result = mockModel.call(inputText); + + assertThat(result).isEqualTo(expectedAudio); + verify(mockModel, 
times(1)).call(eq(inputText)); + verify(mockModel, times(1)).call(isA(TextToSpeechPrompt.class)); + } + + @Test + void callWhenPromptCallThrowsExceptionPropagatesCorrectly() { + String inputText = "Test message"; + RuntimeException expectedException = new RuntimeException("API call failed"); + + TextToSpeechModel mockModel = Mockito.mock(TextToSpeechModel.class); + + doCallRealMethod().when(mockModel).call(anyString()); + given(mockModel.call(any(TextToSpeechPrompt.class))).willThrow(expectedException); + + assertThatThrownBy(() -> mockModel.call(inputText)).isEqualTo(expectedException); + + verify(mockModel, times(1)).call(eq(inputText)); + verify(mockModel, times(1)).call(isA(TextToSpeechPrompt.class)); + } + + @Test + void callWhenResponseIsNullHandlesGracefully() { + String inputText = "Test message"; + + TextToSpeechModel mockModel = Mockito.mock(TextToSpeechModel.class); + + doCallRealMethod().when(mockModel).call(anyString()); + given(mockModel.call(any(TextToSpeechPrompt.class))).willReturn(null); + + assertThatThrownBy(() -> mockModel.call(inputText)).isInstanceOf(NullPointerException.class); + + verify(mockModel, times(1)).call(eq(inputText)); + verify(mockModel, times(1)).call(isA(TextToSpeechPrompt.class)); + } + + @Test + void callWhenSpeechIsNullReturnsEmptyArray() { + String inputText = "Test message"; + + TextToSpeechModel mockModel = Mockito.mock(TextToSpeechModel.class); + + TextToSpeechResponse response = Mockito.mock(TextToSpeechResponse.class); + given(response.getResult()).willReturn(null); + + doCallRealMethod().when(mockModel).call(anyString()); + given(mockModel.call(any(TextToSpeechPrompt.class))).willReturn(response); + + byte[] result = mockModel.call(inputText); + + assertThat(result).isEmpty(); + verify(mockModel, times(1)).call(eq(inputText)); + verify(response, times(1)).getResult(); + } + + @Test + void callWhenAudioOutputIsNullReturnsEmptyArray() { + String inputText = "Test message"; + + TextToSpeechModel mockModel = 
Mockito.mock(TextToSpeechModel.class); + + Speech mockSpeech = Mockito.mock(Speech.class); + given(mockSpeech.getOutput()).willReturn(null); + + TextToSpeechResponse response = Mockito.mock(TextToSpeechResponse.class); + given(response.getResult()).willReturn(mockSpeech); + + doCallRealMethod().when(mockModel).call(anyString()); + given(mockModel.call(any(TextToSpeechPrompt.class))).willReturn(response); + + byte[] result = mockModel.call(inputText); + + assertThat(result).isEmpty(); + verify(mockModel, times(1)).call(eq(inputText)); + verify(mockSpeech, times(1)).getOutput(); + } + + @Test + void callMultipleTimesWithSameModelMaintainsState() { + TextToSpeechModel mockModel = Mockito.mock(TextToSpeechModel.class); + + doCallRealMethod().when(mockModel).call(anyString()); + + // First call + setupMockResponse(mockModel, new byte[] { 1, 2, 3 }); + byte[] result1 = mockModel.call("Message 1"); + assertThat(result1).isEqualTo(new byte[] { 1, 2, 3 }); + + // Second call + setupMockResponse(mockModel, new byte[] { 4, 5, 6 }); + byte[] result2 = mockModel.call("Message 2"); + assertThat(result2).isEqualTo(new byte[] { 4, 5, 6 }); + + verify(mockModel, times(2)).call(anyString()); + verify(mockModel, times(2)).call(any(TextToSpeechPrompt.class)); + } + + private void setupMockResponse(TextToSpeechModel mockModel, byte[] audioOutput) { + Speech mockSpeech = Mockito.mock(Speech.class); + given(mockSpeech.getOutput()).willReturn(audioOutput); + + TextToSpeechResponse response = Mockito.mock(TextToSpeechResponse.class); + given(response.getResult()).willReturn(mockSpeech); + + given(mockModel.call(any(TextToSpeechPrompt.class))).willReturn(response); + } + +}