# 3. Generate Audio (HD Voice)

Reference: [High-definition (HD) voices in Azure AI Speech](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/high-definition-voices)

In [1]:
import os
from dotenv import load_dotenv

# Show where the kernel is running from, since the .env lookup is relative to it
print(f"Current working directory: {os.getcwd()}")

# The .env file is expected in the parent directory (where the app.py is located)
env_path = os.path.join(os.path.dirname(os.getcwd()), '.env')
print(f"Looking for .env at: {env_path}")

if not os.path.exists(env_path):
    print("✗ .env file not found")
    # Second attempt: the same location expressed as a relative path
    alt_env_path = '../.env'
    if not os.path.exists(alt_env_path):
        print("✗ .env file not found in alternative location either")
    else:
        load_dotenv(alt_env_path)
        print(f"✓ .env file loaded from {alt_env_path}")
else:
    load_dotenv(env_path)
    print("✓ .env file loaded successfully")

Current working directory: /Users/rh/workspace/ai_podcast/azure-notebooklm/notebook
Looking for .env at: /Users/rh/workspace/ai_podcast/azure-notebooklm/.env
✓ .env file loaded successfully


In [2]:
import json

# Read the raw JSON text first, then parse it into the list of conversation turns
with open("1706.json", encoding="utf-8") as json_source:
    jsfile = json_source.read()

conversation = json.loads(jsfile)
# A bare last expression makes the notebook display the parsed turns
conversation

[{'speaker': 'Host (Alice)',
  'text': "Welcome to the podcast, Dr. Vaswani! I'm thrilled to have you here today."},
 {'speaker': 'Guest',
  'text': "Thank you, Alice! It's a pleasure to join you and discuss the Transformer model."},
 {'speaker': 'Host (Alice)',
  'text': "Let's dive right in. The Transformer—why is it considered a breakthrough in AI?"},
 {'speaker': 'Guest',
  'text': 'Well, it revolutionized sequence modeling by using attention mechanisms instead of recurrence.'},
 {'speaker': 'Host (Alice)',
  'text': 'No recurrence or convolutions? That sounds like a bold departure from tradition!'},
 {'speaker': 'Guest',
  'text': 'Exactly. This approach allows for parallelization, making training faster and more efficient.'},
 {'speaker': 'Host (Alice)',
  'text': 'Could you explain how attention works in this context? Break it down for us.'},
 {'speaker': 'Guest',
  'text': 'Sure! Attention identifies relationships between inputs and outputs, regardless of their distance.'},
 {'

## HD Options

The default value of the [temperature parameter](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/high-definition-voices#how-to-use-azure-ai-speech-hd-voices) is 1.0.

In [3]:
def generate_ssml(host_voice, guest_voice, temperature=1.0, turns=None):
    """Build an SSML document that voices a two-speaker conversation.

    Every turn becomes one <voice> element on its own line. Turns whose
    'speaker' is exactly 'Host (Alice)' use ``host_voice``; every other
    speaker uses ``guest_voice``.

    Args:
        host_voice: Voice name used for the host's turns.
        guest_voice: Voice name used for all other turns.
        temperature: Value written into each voice element's
            ``parameters='temperature=...'`` attribute.
        turns: Iterable of dicts with 'speaker' and 'text' keys. When None,
            the notebook-level ``conversation`` list is used.

    Returns:
        The SSML document as a single string.
    """
    # Function-scope import so this cell does not depend on another cell's imports.
    from xml.sax.saxutils import escape

    if turns is None:
        turns = conversation

    # Attribute values are single-quoted below, so quotes need escaping as well.
    quote_entities = {"'": "&apos;", '"': "&quot;"}
    host_attr = escape(str(host_voice), quote_entities)
    guest_attr = escape(str(guest_voice), quote_entities)

    lines = [
        "<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xmlns:mstts='https://www.w3.org/2001/mstts' xml:lang='en-US'>"
    ]
    for turn in turns:
        voice = host_attr if turn['speaker'] == 'Host (Alice)' else guest_attr
        # Escape &, < and > so free-form dialogue cannot break the XML.
        text = escape(str(turn['text']))
        lines.append(
            f"<voice name='{voice}' parameters='temperature={temperature}'>{text}</voice>"
        )
    lines.append("</speak>")

    return "\n".join(lines)

In [4]:
import azure.cognitiveservices.speech as speechsdk
import os

# Pull the Speech credentials from the environment
speech_key = os.getenv('SPEECH_KEY')
service_region = os.getenv('SPEECH_REGION')

# Report only whether the key is set, never its value
print(f"Speech Key: {'[SET]' if speech_key else '[NOT SET]'}")
print(f"Speech Region: {service_region}")

if speech_key and service_region:
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    # Set the SDK's log-filename property to "logs"
    speech_config.set_property(speechsdk.PropertyId.Speech_LogFilename, "logs")
    print("Speech SDK configured successfully")
else:
    # Note: speech_config is not created in this branch
    print("ERROR: Missing required environment variables SPEECH_KEY and/or SPEECH_REGION")
    print("Please check your .env file")

Speech Key: [SET]
Speech Region: westeurope
Speech SDK configured successfully


In [5]:
import random
import string

def generate_random_filename(length=8):
    """Return a string of ``length`` randomly chosen lowercase ASCII letters."""
    picked = []
    for _ in range(length):
        # One random.choice call per character
        picked.append(random.choice(string.ascii_lowercase))
    return ''.join(picked)

def generate_podcast(ssml, filename=None):
    """Synthesize ``ssml`` into an audio file and print the outcome.

    Args:
        ssml: SSML string handed to the speech synthesizer.
        filename: Output path for the audio. When None, a name of the form
            "./<random lowercase letters>.wav" is generated.

    Returns:
        None. Success or cancellation details are reported with print().
    """
    temporary_file = (
        filename if filename is not None
        else "./" + generate_random_filename() + ".wav"
    )

    # Direct the synthesized audio into the chosen file
    audio_output = speechsdk.audio.AudioOutputConfig(filename=temporary_file)

    # Build the synthesizer from the notebook-level speech_config
    speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_output)

    # .get() waits for the asynchronous synthesis call to finish
    result = speech_synthesizer.speak_ssml_async(ssml).get()
    outcome = result.reason

    if outcome == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print("Speech synthesis was successful. Audio was written to '{}'".format(temporary_file))
        return

    if outcome == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print("Speech synthesis canceled: {}".format(cancellation_details.reason))
        is_error = cancellation_details.reason == speechsdk.CancellationReason.Error
        if is_error and cancellation_details.error_details:
            print("Error details: {}".format(cancellation_details.error_details))
        # Printed for every cancellation, not only for errors
        print("Did you update the subscription info?")

In [6]:
# Alternative HD voice pair (optimized for podcasts):
#   host_voice  = 'en-us-Ava3:DragonHDLatestNeural'
#   guest_voice = 'en-us-Andrew3:DragonHDLatestNeural'

# Active HD voice pair (optimized for conversational content) - works best for a 2-person podcast
host_voice, guest_voice = (
    'en-us-Emma2:DragonHDLatestNeural',
    'en-us-Andrew2:DragonHDLatestNeural',
)


In [7]:
# Temperature passed to every <voice> element
temp = 0.9

# Build the SSML for the chosen voices, then synthesize it to 1706.wav
ssml = generate_ssml(host_voice=host_voice, guest_voice=guest_voice, temperature=temp)
generate_podcast(ssml, filename="1706.wav")

Speech synthesis was successful. Audio was written to '1706.wav'
Info: on_underlying_io_bytes_received: Close frame received
Info: on_underlying_io_bytes_received: closing underlying io.
Info: on_underlying_io_close_complete: uws_state: 6.
