In [1]:
import logging
import os

# Set the working directory to the notebook's parent directory, or to
# TARGET_DIRECTORY when that environment variable is set.
# The directory the kernel started in is remembered on first execution so that
# re-running this cell does not climb one more level each time.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()

try:
    # Always start from the same place, then step up to the parent directory.
    os.chdir(_NOTEBOOK_START_DIR)
    os.chdir("..")
    target_directory = os.getenv(
        "TARGET_DIRECTORY", os.getcwd()
    )  # Use environment variable if available
    # isdir (not exists): a path that points at a regular file cannot be entered.
    if os.path.isdir(target_directory):
        os.chdir(target_directory)
        print(f"Changed directory to: {os.getcwd()}")
        logging.info("Successfully changed directory to: %s", os.getcwd())
    else:
        logging.error("Directory does not exist: %s", target_directory)
except Exception as e:
    # Best effort: log the failure (with traceback) and let the notebook continue.
    logging.exception("An error occurred while changing directory: %s", e)

Changed directory to: c:\Users\pablosal\Desktop\gbb-ai-audio-agent


In [16]:
import os
import sys
from pathlib import Path
import azure.cognitiveservices.speech as speechsdk
from azure.cognitiveservices.speech.transcription import ConversationTranscriber
from dotenv import load_dotenv

# Credentials come from the environment (load_dotenv is called first).
load_dotenv()
KEY = os.getenv("AZURE_SPEECH_KEY")
REGION = os.getenv("AZURE_SPEECH_REGION", "eastus")

# The ground-truth folder holds both the source audio and the transcript output.
OUT_DIR = Path("podcast_voice_tests/ground_truth")
AUDIO = OUT_DIR / "Podcast_sample_CLT_SD.wav"
TRANSCRIPT_FILE = OUT_DIR / "Podcast_sample_CLT_SD_transcript.txt"

# Fail fast on the first missing prerequisite: the key is checked before the audio.
_missing = None
if not KEY:
    _missing = "Missing AZURE_SPEECH_KEY"
elif not AUDIO.exists():
    _missing = f"Missing audio at {AUDIO}"

if _missing is not None:
    print(_missing)
    sys.exit(1)

OUT_DIR.mkdir(parents=True, exist_ok=True)


def make_speech_config():
    """Create the Speech SDK configuration used for transcription.

    Builds a SpeechConfig from the module-level KEY and REGION and sets the
    SpeechServiceConnection_LanguageIdMode property to "Continuous".

    Returns:
        The configured speechsdk.SpeechConfig instance.
    """
    config = speechsdk.SpeechConfig(subscription=KEY, region=REGION)
    # Enable continuous language detection
    language_id_mode = speechsdk.PropertyId.SpeechServiceConnection_LanguageIdMode
    config.set_property(property_id=language_id_mode, value="Continuous")
    return config


def transcribe_conversation(audio_path: Path, out_file: Path):
    """Transcribe an audio file with speaker labels and save it as text.

    Every final result is printed and written to ``out_file`` as
    ``[Speaker <id>] <text>``; a missing speaker id is written as ``?``.
    The call blocks until the transcriber reports that the session stopped
    or was canceled.

    Args:
        audio_path: Audio file to transcribe.
        out_file: Text file to (over)write with the transcript, UTF-8 encoded.
    """
    import threading  # local import: the cell's import block does not provide it

    speech_cfg = make_speech_config()
    audio_cfg = speechsdk.AudioConfig(filename=str(audio_path))
    transcriber = ConversationTranscriber(speech_cfg, audio_cfg)

    # Signalled from the SDK callbacks. Waiting on an Event lets this thread
    # sleep instead of burning a CPU core in a `while not done: pass` loop.
    finished = threading.Event()

    with open(out_file, "w", encoding="utf-8") as f:

        def stop_cb(evt):
            # Connected to both session_stopped and canceled.
            print("[CONVERSATION] Session stopped.")
            finished.set()

        def on_transcribed(evt):
            speaker = evt.result.speaker_id or "?"
            text = evt.result.text
            line = f"[Speaker {speaker}] {text}"
            print(line)
            f.write(line + "\n")

        transcriber.transcribed.connect(on_transcribed)
        transcriber.session_stopped.connect(stop_cb)
        transcriber.canceled.connect(stop_cb)

        print("[CONVERSATION] Starting transcription…")
        transcriber.start_transcribing_async()
        try:
            # Wait in short slices so a kernel interrupt is still honored.
            while not finished.wait(timeout=0.5):
                pass
        finally:
            # ConversationTranscriber is stopped with stop_transcribing_async();
            # it has no stop_continuous_recognition(). Wait for the stop to
            # complete before the transcript file is closed.
            transcriber.stop_transcribing_async().get()

In [None]:
# Transcribe the ground-truth audio (AUDIO) and write the speaker-labelled
# transcript to TRANSCRIPT_FILE. The call blocks until the session ends.
transcribe_conversation(AUDIO, TRANSCRIPT_FILE)

## Exploring TTS Models (Azure)

In [None]:
import os

# Credentials for the TTS experiments below. The key is read from the
# environment so it never has to be pasted into the notebook; both values fall
# back to the previous hard-coded defaults when the variables are not set.
AZURE_KEY = os.getenv("AZURE_SPEECH_KEY", "")
AZURE_REGION = os.getenv("AZURE_SPEECH_REGION", "northcentralus")

In [None]:
import os
import azure.cognitiveservices.speech as speechsdk
from pathlib import Path
from dotenv import load_dotenv

# Load variables from a .env file (if any) into the environment
load_dotenv()

# Output directory for test WAVs
OUTPUT_DIR = Path("podcast_voice_tests")
OUTPUT_DIR.mkdir(exist_ok=True)

# A short “podcast” excerpt for testing
# (the trailing backslashes suppress the newline after the opening quotes and
# before the closing quotes)
PODCAST_TEXT = """\
Hello and welcome to “Tech Talks Weekly.” 
I’m your host, Alice, bringing you the latest in AI. 
Next up, Bob will dive into our featured topic: high-definition neural voices!\
"""

# Define voices to test: a mix of Azure AI Speech HD and OpenAI NeuralHD
VOICE_LIST_HD = [
    # Azure AI Speech HD voices
    "en-US-Ava:DragonHDLatestNeural",
    "en-US-Andrew3:DragonHDLatestNeural",
    "es-ES-Ximena:DragonHDLatestNeural",
]

VOICE_LIST_OPENAI = [
    # OpenAI NeuralHD voices
    "en-US-AlloyMultilingualNeuralHD",
    "en-US-NovaMultilingualNeuralHD",
]


def synthesize_to_file(ssml: str, voice_name: str, out_path: Path):
    """
    Render the given SSML to a WAV file at ``out_path``.

    ``voice_name`` is set as the synthesis voice on the speech config and is
    used in the error message. When the result reason is not
    SynthesizingAudioCompleted, an ``[ERROR]`` line is printed; the function
    does not raise for that case.
    """
    config = speechsdk.SpeechConfig(subscription=AZURE_KEY, region=AZURE_REGION)
    config.speech_synthesis_voice_name = voice_name

    # Use high-quality WAV
    wav_format = speechsdk.SpeechSynthesisOutputFormat.Riff48Khz16BitMonoPcm
    config.set_speech_synthesis_output_format(wav_format)

    file_output = speechsdk.audio.AudioOutputConfig(filename=str(out_path))
    tts = speechsdk.SpeechSynthesizer(config, file_output)

    outcome = tts.speak_ssml_async(ssml).get()
    if outcome.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        return

    # Prefer the cancellation details; fall back to the bare result reason.
    details = outcome.cancellation_details
    explanation = details.error_details if details else outcome.reason
    print(f"[ERROR] {voice_name} failed: {explanation}")


def build_plain_ssml(text: str, voice_name: str, locale: str = "en-US") -> str:
    """
    Wrap plain text in a single <voice> tag for SSML synthesis.

    The text is XML-escaped (``&``, ``<``, ``>``) so arbitrary plain text cannot
    produce a malformed document. The voice name and locale are escaped for
    use inside double-quoted attributes.

    Args:
        text: Plain text to speak (not SSML markup).
        voice_name: Value of the ``name`` attribute on the <voice> element.
        locale: Value of ``xml:lang`` on the <speak> element.

    Returns:
        The SSML document as a string.
    """
    from xml.sax.saxutils import escape  # local import keeps the cell self-contained

    attribute_entities = {'"': "&quot;"}
    safe_text = escape(text)
    safe_voice = escape(voice_name, attribute_entities)
    safe_locale = escape(locale, attribute_entities)
    return f"""
<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" 
       xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="{safe_locale}">
  <voice name="{safe_voice}">{safe_text}</voice>
</speak>
"""


# --- SINGLE-VOICE TEST ------------------------------------------------------------
# Synthesize PODCAST_TEXT once per voice in VOICE_LIST_OPENAI, one WAV per voice.

for voice in VOICE_LIST_OPENAI:
    ssml = build_plain_ssml(PODCAST_TEXT, voice)
    # ':' in a voice name is replaced with '-' when building the file name
    filename = OUTPUT_DIR / f"podcast_{voice.replace(':','-')}.wav"
    print(f"⏳ Synthesizing with {voice} → {filename.name}")
    synthesize_to_file(ssml, voice, filename)

⏳ Synthesizing with en-US-AlloyMultilingualNeuralHD → podcast_en-US-AlloyMultilingualNeuralHD.wav
⏳ Synthesizing with en-US-NovaMultilingualNeuralHD → podcast_en-US-NovaMultilingualNeuralHD.wav


## Azure AI Speech

In [36]:
import os
import sys
from pathlib import Path
import azure.cognitiveservices.speech as speechsdk
from dotenv import load_dotenv

"""
This script reads a ground-truth transcript with speaker labels,
builds an SSML document assigning distinct Azure HD voices to each speaker,
and synthesizes the result into a single WAV file.
"""

# Load Azure credentials from the environment or a .env file.
# NOTE: this rebinds AZURE_KEY / AZURE_REGION that an earlier cell also assigns.
load_dotenv()
AZURE_KEY = os.getenv("AZURE_SPEECH_KEY")
AZURE_REGION = os.getenv("AZURE_SPEECH_REGION", "eastus")

# Stop the cell early when no key is configured
if not AZURE_KEY:
    print("Error: Please set AZURE_SPEECH_KEY in your environment or .env")
    sys.exit(1)

# Paths & files
OUTPUT_DIR = Path("podcast_voice_tests")
GROUND_DIR = OUTPUT_DIR / "ground_truth"
TRANSCRIPT = GROUND_DIR / "Podcast_sample_CLT_SD_transcript.txt"
OUTPUT_WAV = OUTPUT_DIR / "podcast_dynamic_HD.wav"

# Ensure dirs exist (OUTPUT_DIR first, since GROUND_DIR is nested inside it)
for d in (OUTPUT_DIR, GROUND_DIR):
    d.mkdir(exist_ok=True)

# The transcript is an input to this cell; stop when it is missing
if not TRANSCRIPT.exists():
    print(f"Error: Transcript file not found at {TRANSCRIPT}")
    sys.exit(1)

# Map each speaker to a voice persona.
# build_ssml() looks speakers up here and falls back to the first entry.
VOICE_MAP = {
    "Speaker Guest-1": "en-US-Andrew3:DragonHDLatestNeural",  # Male
    "Speaker Guest-2": "en-US-Ava:DragonHDLatestNeural",  # Female
}


def load_segments(path: Path):
    """Read a speaker-labelled transcript and return its (speaker, text) pairs.

    Only lines shaped like ``[label] text`` are kept. Blank lines, lines that
    do not start with ``[`` and lines without a ``]`` are ignored. The label
    loses its leading ``[`` characters and surrounding whitespace; the text is
    everything after the first ``]``, stripped.

    Args:
        path: UTF-8 encoded transcript file.

    Returns:
        A list of (speaker, text) tuples in file order.
    """
    parsed = []
    raw_lines = path.read_text(encoding="utf-8").splitlines()
    for raw in raw_lines:
        entry = raw.strip()
        is_labelled = entry.startswith("[") and "]" in entry
        if not is_labelled:
            continue
        # Split on the first "]" only, so later brackets stay in the text.
        label, _, remainder = entry.partition("]")
        parsed.append((label.lstrip("[").strip(), remainder.strip()))
    return parsed


def build_ssml(segments, voice_map=None):
    """Build SSML with express-as styles and prosody for natural flow.

    Each segment becomes one <voice> element wrapping an
    <mstts:express-as style="chat"> and a <prosody> element. The speaking rate
    alternates between "+3%" (even-indexed segments) and "-2%" (odd-indexed).
    Segment text is XML-escaped so characters such as ``&`` or ``<`` in a
    transcript cannot produce a malformed document.

    Args:
        segments: Iterable of (speaker, text) pairs.
        voice_map: Optional mapping of speaker label to voice name. Defaults
            to the module-level VOICE_MAP. Speakers missing from the mapping
            use its first voice.

    Returns:
        The SSML document as a single newline-joined string.
    """
    from xml.sax.saxutils import escape  # local import keeps the cell self-contained

    if voice_map is None:
        voice_map = VOICE_MAP

    ssml = [
        '<speak version="1.0"',
        '       xmlns="http://www.w3.org/2001/10/synthesis"',
        '       xmlns:mstts="https://www.w3.org/2001/mstts"',
        '       xml:lang="en-US">',
    ]
    for i, (speaker, text) in enumerate(segments):
        # Unknown speakers fall back to the first voice in the mapping.
        voice = voice_map.get(speaker, next(iter(voice_map.values())))
        rate = "+3%" if i % 2 == 0 else "-2%"
        safe_voice = escape(voice, {'"': "&quot;"})
        safe_text = escape(text)
        ssml.append(
            f"""
  <voice name="{safe_voice}">
    <mstts:express-as style="chat">
      <prosody rate="{rate}">
        {safe_text}
      </prosody>
    </mstts:express-as>
  </voice>""".strip()
        )
    ssml.append("</speak>")
    return "\n".join(ssml)


def synthesize_ssml(ssml: str, out_path: Path):
    """Render an SSML document to a WAV file with the Azure Speech SDK.

    Args:
        ssml: The SSML document to speak.
        out_path: Destination file passed to the SDK's audio output config.

    Raises:
        RuntimeError: If the result reason is not SynthesizingAudioCompleted.
    """
    speech_config = speechsdk.SpeechConfig(subscription=AZURE_KEY, region=AZURE_REGION)
    wav_format = speechsdk.SpeechSynthesisOutputFormat.Riff48Khz16BitMonoPcm
    speech_config.set_speech_synthesis_output_format(wav_format)

    file_output = speechsdk.audio.AudioOutputConfig(filename=str(out_path))
    tts = speechsdk.SpeechSynthesizer(speech_config, file_output)

    print(f"Synthesizing to {out_path}...")
    outcome = tts.speak_ssml_async(ssml).get()
    if outcome.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print("Synthesis complete.")
        return

    # Prefer the cancellation details; fall back to the bare result reason.
    details = outcome.cancellation_details
    failure = details.error_details if details else outcome.reason
    raise RuntimeError(f"TTS failed: {failure}")


# Parse the transcript; stop when it yields no usable segments.
segments = load_segments(TRANSCRIPT)
if not segments:
    print("Error: No valid segments found in transcript.")
    sys.exit(1)

# Build one SSML document for the whole conversation and render it to OUTPUT_WAV.
ssml_doc = build_ssml(segments)
synthesize_ssml(ssml_doc, OUTPUT_WAV)

Synthesizing to podcast_voice_tests\podcast_dynamic_HD.wav...
Synthesis complete.


In [None]:
import os
import sys
from pathlib import Path
import azure.cognitiveservices.speech as speechsdk
from dotenv import load_dotenv

"""
This script reads a ground-truth transcript with speaker labels,
builds an SSML document assigning distinct Azure HD voices to each speaker,
and synthesizes the result into a single WAV file.
"""

# Credentials: reuse AZURE_KEY / AZURE_REGION when an earlier cell already
# defined non-empty values; otherwise load them from the environment or .env.
# This lets the cell run on a fresh kernel instead of failing with a NameError.
load_dotenv()
AZURE_KEY = globals().get("AZURE_KEY") or os.getenv("AZURE_SPEECH_KEY")
AZURE_REGION = globals().get("AZURE_REGION") or os.getenv(
    "AZURE_SPEECH_REGION", "eastus"
)

if not AZURE_KEY:
    print("Error: Please set AZURE_SPEECH_KEY in your environment or .env")
    sys.exit(1)

# Paths & files
OUTPUT_DIR = Path("podcast_voice_tests")
GROUND_DIR = OUTPUT_DIR / "ground_truth"
TRANSCRIPT = GROUND_DIR / "Podcast_sample_CLT_SD_transcript.txt"
OUTPUT_WAV = OUTPUT_DIR / "podcast_dynamic_AOAI.wav"

# Ensure dirs exist (OUTPUT_DIR first, since GROUND_DIR is nested inside it)
for d in (OUTPUT_DIR, GROUND_DIR):
    d.mkdir(exist_ok=True)

# The transcript is an input to this cell; stop when it is missing
if not TRANSCRIPT.exists():
    print(f"Error: Transcript file not found at {TRANSCRIPT}")
    sys.exit(1)


# Map each speaker to a voice persona.
# build_ssml() looks speakers up here and falls back to the first entry.
VOICE_MAP = {
    "Speaker Guest-1": "en-US-AlloyMultilingualNeuralHD",  # Male
    "Speaker Guest-2": "en-US-NovaMultilingualNeuralHD",  # Female
}


def load_segments(path: Path):
    """Parse a ``[speaker] text`` transcript into (speaker, text) tuples.

    Lines are stripped first. A line is kept only when it starts with ``[``
    and contains a ``]``. The speaker is the part before the first ``]``
    without leading ``[`` characters; the text is the remainder. Both are
    stripped of surrounding whitespace.

    Args:
        path: UTF-8 encoded transcript file.

    Returns:
        A list of (speaker, text) tuples in file order.
    """

    def _to_segment(entry):
        # Split on the first "]" only, so later brackets stay in the text.
        head, tail = entry.split("]", 1)
        return head.lstrip("[").strip(), tail.strip()

    stripped = (raw.strip() for raw in path.read_text(encoding="utf-8").splitlines())
    return [
        _to_segment(entry)
        for entry in stripped
        if entry.startswith("[") and "]" in entry
    ]


def build_ssml(segments, voice_map=None):
    """Build SSML with express-as styles and prosody for natural flow.

    Each segment becomes one <voice> element wrapping an
    <mstts:express-as style="chat"> and a <prosody> element. The speaking rate
    alternates between "+3%" (even-indexed segments) and "-2%" (odd-indexed).
    Segment text is XML-escaped so characters such as ``&`` or ``<`` in a
    transcript cannot produce a malformed document.

    Args:
        segments: Iterable of (speaker, text) pairs.
        voice_map: Optional mapping of speaker label to voice name. Defaults
            to the module-level VOICE_MAP. Speakers missing from the mapping
            use its first voice.

    Returns:
        The SSML document as a single newline-joined string.
    """
    from xml.sax.saxutils import escape  # local import keeps the cell self-contained

    if voice_map is None:
        voice_map = VOICE_MAP

    ssml = [
        '<speak version="1.0"',
        '       xmlns="http://www.w3.org/2001/10/synthesis"',
        '       xmlns:mstts="https://www.w3.org/2001/mstts"',
        '       xml:lang="en-US">',
    ]
    for i, (speaker, text) in enumerate(segments):
        # Unknown speakers fall back to the first voice in the mapping.
        voice = voice_map.get(speaker, next(iter(voice_map.values())))
        rate = "+3%" if i % 2 == 0 else "-2%"
        safe_voice = escape(voice, {'"': "&quot;"})
        safe_text = escape(text)
        ssml.append(
            f"""
  <voice name="{safe_voice}">
    <mstts:express-as style="chat">
      <prosody rate="{rate}">
        {safe_text}
      </prosody>
    </mstts:express-as>
  </voice>""".strip()
        )
    ssml.append("</speak>")
    return "\n".join(ssml)


def synthesize_ssml(ssml: str, out_path: Path):
    """Send SSML to the Azure Speech SDK and write the audio to ``out_path``.

    Progress is reported on stdout before and after synthesis.

    Args:
        ssml: The SSML document to speak.
        out_path: Destination file passed to the SDK's audio output config.

    Raises:
        RuntimeError: If the result reason is not SynthesizingAudioCompleted.
    """
    tts_config = speechsdk.SpeechConfig(subscription=AZURE_KEY, region=AZURE_REGION)
    tts_config.set_speech_synthesis_output_format(
        speechsdk.SpeechSynthesisOutputFormat.Riff48Khz16BitMonoPcm
    )
    destination = speechsdk.audio.AudioOutputConfig(filename=str(out_path))
    engine = speechsdk.SpeechSynthesizer(tts_config, destination)

    print(f"Synthesizing to {out_path}...")
    response = engine.speak_ssml_async(ssml).get()
    succeeded = response.reason == speechsdk.ResultReason.SynthesizingAudioCompleted
    if not succeeded:
        # Prefer the cancellation details; fall back to the bare result reason.
        cancellation = response.cancellation_details
        cause = cancellation.error_details if cancellation else response.reason
        raise RuntimeError(f"TTS failed: {cause}")
    print("Synthesis complete.")


# Parse the transcript; stop when it yields no usable segments.
segments = load_segments(TRANSCRIPT)
if not segments:
    print("Error: No valid segments found in transcript.")
    sys.exit(1)

# Build one SSML document for the whole conversation and render it to OUTPUT_WAV.
ssml_doc = build_ssml(segments)
synthesize_ssml(ssml_doc, OUTPUT_WAV)

Synthesizing to podcast_voice_tests\podcast_dynamic_AOAI.wav...
Synthesis complete.
