# Neural Speech Generation for a Shakespeare Play

In [2]:
# Reload imported modules automatically before each cell executes, so edits to
# local .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not just the last one.
# This is why bare calls such as load_dotenv() echo their return value below.
InteractiveShell.ast_node_interactivity = "all"

## Set Up Environment

In [5]:
from dotenv import load_dotenv

# Load key/value pairs from a .env file into the process environment (python-dotenv).
# Later cells read OPENAI_API_BASE, OPENAI_API_KEY, SPEECH_KEY and SPEECH_REGION with
# os.getenv; presumably those are supplied by this .env file — confirm locally.
load_dotenv()

True

## Load Data

In [3]:
# Path to the play's plain-text script, relative to the notebook's working directory.
fname = "../data/The-Comedy-of-Errors.txt"

# Decode explicitly as UTF-8 so the text is read identically on every platform;
# without `encoding`, open() falls back to the locale's default encoding.
# Assumes the data file is UTF-8 (or plain ASCII) — TODO confirm.
with open(fname, encoding="utf-8") as f:
    play = f.read()

# Uncomment to preview the loaded text:
# print(play[:500])

## Set Up Azure OpenAI

In [4]:
import os
import openai

# Configure the openai client for an Azure OpenAI resource.
# Fixed settings first, then the endpoint and key read from the environment
# (both are None when the corresponding variable is not set).
openai.api_type = "azure"
openai.api_version = "2023-03-15-preview"
openai.api_base = os.environ.get("OPENAI_API_BASE")
openai.api_key = os.environ.get("OPENAI_API_KEY")

True

## Deploy a Model

In [5]:
# Model to look for, and the capability flag it must expose.
desired_model = 'gpt-4-32k'
# Match on chat_completion: gpt-4 is only released as a chat model in Azure OpenAI.
desired_capability = 'chat_completion'

# Scan the existing deployments for a usable deployment of desired_model.
deployment_id = None
result = openai.Deployment.list()

for deployment in result.data:
    # Skip deployments that have not finished provisioning successfully.
    if deployment["status"] != "succeeded":
        continue

    model = openai.Model.retrieve(deployment["model"])
    print(model)
    # Check that this is the desired model and that it reports desired_capability.
    # .get() is used so a model whose capabilities omit the key is treated as
    # "not capable" instead of raising KeyError.
    capabilities = model.get("capabilities") or {}
    if model["id"] == desired_model and capabilities.get(desired_capability):
        deployment_id = deployment["id"]

# If no suitable deployment exists, create one.
if not deployment_id:
    print(f'No succeeded deployment of "{desired_model}" with {desired_capability} capability found.')

    # Deploy the model
    print(f'Creating a new deployment with model: {desired_model}')
    result = openai.Deployment.create(model=desired_model, scale_settings={"scale_type":"standard"})
    deployment_id = result["id"]
    # NOTE(review): the status of the new deployment is not checked here; it may
    # still be provisioning when this message is printed.
    print(f'Successfully created {desired_model} that supports text {desired_capability} with id: {deployment_id}.')
else:
    print(f'Found a succeeded deployment of "{desired_model}" that supports text {desired_capability} with id: {deployment_id}.')

{
  "capabilities": {
    "chat_completion": true,
    "completion": false,
    "embeddings": false,
    "fine_tune": false,
    "inference": true,
    "scale_types": [
      "standard"
    ]
  },
  "created_at": 1679356800,
  "deprecation": {
    "inference": 1742515200
  },
  "id": "gpt-4-32k",
  "lifecycle_status": "preview",
  "object": "model",
  "status": "succeeded",
  "updated_at": 1679356800
}
{
  "capabilities": {
    "chat_completion": true,
    "completion": false,
    "embeddings": false,
    "fine_tune": false,
    "inference": true,
    "scale_types": [
      "standard"
    ]
  },
  "created_at": 1679356800,
  "deprecation": {
    "inference": 1742515200
  },
  "id": "gpt-4",
  "lifecycle_status": "preview",
  "object": "model",
  "status": "succeeded",
  "updated_at": 1679356800
}
{
  "capabilities": {
    "chat_completion": false,
    "completion": true,
    "embeddings": false,
    "fine_tune": false,
    "inference": true,
    "scale_types": [
      "standard"
    ]


## Identify Casts, Scenes, Acts, Dialogues, Synopsis with OpenAI

In [None]:
# Prompt postfix: the instruction text that asks for the synopsis, acts, scenes, casts
# and dialogues as JSON. How it is combined with the play text is not shown in this cell
# (presumably the document is placed before it — confirm where the prompt is assembled).
# The "\n" escapes inside the triple-quoted string add extra newlines on top of the
# literal line breaks, so "###" and the instruction are each preceded by a blank line.
prompt_postfix = """ <document>
  \n###
  \nExtract synopsis, act, scene, casts and associated dialogues, into json format. 
"""
print(prompt_postfix)

## Generate Neural Speech  <<--- Do this first

In [17]:
import os
import azure.cognitiveservices.speech as speechsdk

# Build a synthesizer from the Speech resource key and region held in the environment.
# No audio_config is passed, so the SDK's default audio output is used
# (the default speaker, per the Speech SDK documentation).
speech_config = speechsdk.SpeechConfig(subscription=os.getenv('SPEECH_KEY'), region=os.getenv('SPEECH_REGION'))
speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config)

# Text to synthesise
text = "Hey, how are you?"

# Synthesise speech and block until the request finishes.
result = speech_synthesizer.speak_text_async(text).get()

# Report the outcome instead of silently ignoring a cancelled or failed request
# (e.g. a wrong key or region).
if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
    print("Speech synthesized for text [{}]".format(text))
elif result.reason == speechsdk.ResultReason.Canceled:
    cancellation_details = result.cancellation_details
    print("Speech synthesis canceled: {}".format(cancellation_details.reason))
    if cancellation_details.reason == speechsdk.CancellationReason.Error:
        print("Error details: {}".format(cancellation_details.error_details))

In [26]:
def speech_synthesis_to_mp3_file(
    text="Proceed, Solinus, to procure my fall, And by the doom of death end woes and all.",
    file_name="outputaudio.mp3",
    voice="Microsoft Server Speech Text to Speech Voice (en-US, JennyNeural)",
):
    """Synthesize ``text`` with the Azure Speech service and save it as an MP3 file.

    The subscription key and region are read from the ``SPEECH_KEY`` and
    ``SPEECH_REGION`` environment variables. The outcome is reported with
    ``print``: a success message, or the cancellation reason (plus error
    details when the cancellation was caused by an error).

    Args:
        text: The text to speak. Defaults to a line from the play.
        file_name: Path of the MP3 file to write.
        voice: Name of the synthesis voice to use.
            TODO: update the default to the voice wanted for each cast member.

    Returns:
        None.
    """
    # Creates an instance of a speech config with specified subscription key and service region.
    speech_config = speechsdk.SpeechConfig(subscription=os.getenv('SPEECH_KEY'), region=os.getenv('SPEECH_REGION'))

    # Sets the synthesis output format.
    # The full list of supported format can be found here:
    # https://docs.microsoft.com/azure/cognitive-services/speech-service/rest-text-to-speech#audio-outputs
    speech_config.set_speech_synthesis_output_format(speechsdk.SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3)

    # Send the audio to a file rather than to the speaker.
    file_config = speechsdk.audio.AudioOutputConfig(filename=file_name)

    # The voice must be set on the config before the synthesizer is created from it.
    speech_config.speech_synthesis_voice_name = voice

    speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=file_config)

    # Block until the synthesis request has finished.
    result = speech_synthesizer.speak_text_async(text).get()

    # Check result
    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print("Speech synthesized for text [{}], and the audio was saved to [{}]".format(text, file_name))
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print("Speech synthesis canceled: {}".format(cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(cancellation_details.error_details))

In [27]:
# Run the synthesis defined above; on success the audio is written to outputaudio.mp3.
speech_synthesis_to_mp3_file()

Speech synthesized for text [Proceed, Solinus, to procure my fall, And by the doom of death end woes and all.], and the audio was saved to [outputaudio.mp3]
