# Azure Speech Podcast Maker

This sample generates an audio file of a podcast-style conversation between two hosts. Azure OpenAI writes the script as SSML and Azure Speech synthesizes it into audio.

#### Prerequisites
- Azure Subscription
- Azure OpenAI resource with a chat model deployment
- Azure Speech Service in one of these regions: East US, West Europe, or Southeast Asia
- Azure Bing Search Service
- A Python environment to run this notebook

#### Steps
- Update .env file with environment details (endpoints, models, keys)
- Create a Python virtual environment (optional)
- Install required libraries
- Run every cell in this notebook until you reach the "Stop Here" section, where you can customize the podcast content as needed

In [None]:
#Import required libraries

import os
import datetime
import requests
import shutil

from openai import AzureOpenAI
import azure.cognitiveservices.speech as speechsdk

import PyPDF2

from dotenv import load_dotenv
load_dotenv()

In [22]:
# Read the service settings from the process environment.
# Any variable that is not set resolves to None.

# Azure OpenAI: endpoint, key, model name and API version
AOAI_ENDPOINT = os.getenv("AOAI_ENDPOINT")
AOAI_KEY = os.getenv("AOAI_KEY")
AOAI_MODEL_NAME = os.getenv("AOAI_MODEL_NAME")
AOAI_API_VERSION = os.getenv("AOAI_API_VERSION")

# Azure Speech: key and region
SPEECH_KEY = os.getenv("SPEECH_KEY")
SPEECH_REGION = os.getenv("SPEECH_REGION")

# Bing Search: subscription key
BING_API_KEY = os.getenv("BING_API_KEY")

The next series of code cells defines the functions needed by the rest of the notebook.

In [23]:
# Auxiliary to print with time

def printwithtime(*args):
    """Print ``args`` prefixed with the current local date and time.

    The prefix has the form ``YYYY-MM-DD HH:MM:SS.ffffff``; the fractional
    part is the six-digit microsecond field produced by ``%f``.
    """
    # Imported locally so the helper works on its own.
    from datetime import datetime as _datetime

    stamp = _datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
    print(stamp, *args)

In [24]:
# download file from url

def download(url, filename):
    """Fetch ``url`` into ``filename`` and return the content type.

    ``url`` may be an HTTP(S) URL, which is downloaded with ``requests``, or
    a path to a local file, which is copied with ``shutil.copy``.

    Args:
        url: Source location, either ``http://`` / ``https://`` or a local path.
        filename: Destination path that receives the bytes.

    Returns:
        For HTTP(S) sources, the response's ``Content-Type`` header (``""``
        if the server sent none). For local files, the MIME type guessed
        from the source file extension, or ``"application/octet-stream"``
        when it cannot be guessed.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        OSError: If the local copy or the destination write fails.
    """
    # Local import: only the local-file branch needs it.
    import mimetypes

    #TODO: additional url verifications

    # A tuple prefix test matches either scheme. The previous
    # `not http or not https` test was true for every plain http:// URL,
    # which sent remote URLs down the local-copy path.
    if url.lower().startswith(("http://", "https://")):
        response = requests.get(url, timeout=60)
        # Fail loudly instead of saving an error page as the target file.
        response.raise_for_status()
        with open(filename, 'wb') as file:
            file.write(response.content)
        return response.headers.get('content-type', '')

    # Local source: there is no HTTP response, so derive the type from the name.
    shutil.copy(url, filename)
    guessed_type, _ = mimetypes.guess_type(url)
    return guessed_type or "application/octet-stream"

In [25]:
# convert pdf to text

def pdf2text(pdf_file):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_file: Path of the PDF file to read.

    Returns:
        The concatenated page texts, each page preceded by a newline.
    """
    # The context manager closes the file handle even if extraction fails;
    # pages are read inside it because the reader pulls from the open stream.
    with open(pdf_file, 'rb') as stream:
        pdf_reader = PyPDF2.PdfReader(stream)
        # `or ""` guards against a page whose extraction yields nothing.
        text = "".join(
            "\n" + (page.extract_text() or "") for page in pdf_reader.pages
        )

    printwithtime(f"Text extracted from pdf: {len(text)} characters")
    return text

In [26]:
# TODO: add a DALL-E 3 function to generate a cover image


In [27]:
# Add Bing Search function to get latest news related to the topic

def GetNewsFromBing(text):
    """Summarize recent Bing news related to ``text``.

    The function makes three calls per attempt:
      1. Azure OpenAI extracts search-friendly key entities from ``text``.
      2. The Bing News Search API is queried with those entities (5 results).
      3. Azure OpenAI summarizes the Bing response in relation to the topics.

    Any exception in those steps is printed and the whole sequence is
    retried, up to three attempts in total.

    Args:
        text: Source text from which the news topics are derived.

    Returns:
        The news summary produced by the model, or ``None`` when all three
        attempts failed.
    """
    client = AzureOpenAI(
        api_key=AOAI_KEY,
        api_version=AOAI_API_VERSION,
        azure_endpoint=AOAI_ENDPOINT,
    )

    prompt_topics = "Extract the key entities from the text in a way that can be used to search for news articles."
    prompt_summary = """
Create a summary of recent news related to a list of topics provided. 
The summary should be in a single sentence of less than 300 words, prioritizing the news articles that seem more relevant to the topics provided.
The sentence should be written in a way that is easy to understand how the news articles are related to the topics.
Most relevant information is in the description, name & provider of the news articles. but other data included in the response may be useful too.
"""

    max_attempts = 3
    for _ in range(max_attempts):
        try:
            completion = client.chat.completions.create(
                model=AOAI_MODEL_NAME,
                messages=[
                    {"role": "system", "content": prompt_topics},
                    {"role": "user", "content": text}
                ],
                temperature=0.4,
                max_tokens=4096,
                top_p=1,
                frequency_penalty=0,
                presence_penalty=0,
                stop=None)

            newstopics = completion.choices[0].message.content

            # Pass the query through `params` so requests URL-encodes it; the
            # model output can contain spaces, newlines, '&' or '#', which
            # would corrupt a hand-built query string.
            response = requests.get(
                "https://api.bing.microsoft.com/v7.0/news/search",
                params={"q": newstopics, "count": 5},
                headers={"Ocp-Apim-Subscription-Key": BING_API_KEY},
                timeout=30,
            )
            # Turn HTTP errors into exceptions so the retry loop handles them
            # instead of summarizing an error payload.
            response.raise_for_status()
            news = response.json()

            bingtext = f"""
            Topics:
            {newstopics}

            Bing Response:
            {news}
            """

            completion = client.chat.completions.create(
                model=AOAI_MODEL_NAME,
                messages=[
                    {"role": "system", "content": prompt_summary},
                    {"role": "user", "content": bingtext}
                ],
                temperature=0.9,
                max_tokens=4096,
                top_p=1,
                frequency_penalty=0,
                presence_penalty=0,
                stop=None)

            return completion.choices[0].message.content
        except Exception as e:
            # Best effort: report the failure and try the sequence again.
            print(e)

    # Every attempt failed; callers must be ready for a missing summary.
    return None
    



In [28]:
def CreatePodcastSsml(text, prompt, lang, outssml):
    """Generate the podcast script with Azure OpenAI and save it to a file.

    The chat completion is attempted up to three times; each failure is
    printed before the next attempt.

    Args:
        text: User message sent to the model (the podcast source content).
        prompt: System message sent to the model.
        lang: Not used by this function; kept for interface compatibility.
        outssml: Path of the file that receives the generated script.

    Returns:
        The script text returned by the model.

    Raises:
        RuntimeError: If no script was obtained, either because every
            attempt raised or because the model returned empty content.
    """
    client = AzureOpenAI(
        api_key=AOAI_KEY,
        api_version=AOAI_API_VERSION,
        azure_endpoint=AOAI_ENDPOINT,
    )

    podcasttext = None
    last_error = None
    for _ in range(3):
        try:
            completion = client.chat.completions.create(
                model=AOAI_MODEL_NAME,
                messages=[
                    {"role": "system", "content": prompt},
                    {"role": "user", "content": text}
                ],
                temperature=0.4,
                max_tokens=4096,
                top_p=1,
                frequency_penalty=0,
                presence_penalty=0,
                stop=None)

            podcasttext = completion.choices[0].message.content
            break
        except Exception as e:
            print(e)
            last_error = e

    # Without this guard an exhausted retry loop left `podcasttext` unbound
    # and the failure surfaced as an unrelated UnboundLocalError below.
    if not podcasttext:
        raise RuntimeError(
            "Azure OpenAI returned no podcast script (3 attempts max)"
        ) from last_error

    # create ssml file; explicit UTF-8 so non-ASCII scripts are written the
    # same way on every platform
    with open(outssml, 'w', encoding='utf-8') as file:
        file.write(podcasttext)

    return podcasttext

In [29]:
# Generate audio with Azure TTS HD voices

def GenerateAudio(ssml, outaudio):
    """Synthesize ``ssml`` with Azure Speech and write the audio to ``outaudio``.

    The outcome is reported through ``printwithtime``: a success message, or
    the cancellation reason followed by the error details when the
    cancellation was caused by an error. Nothing is returned.
    """
    # Build the synthesizer from the service credentials, with its output
    # pointed at the target audio file.
    synthesizer = speechsdk.SpeechSynthesizer(
        speech_config=speechsdk.SpeechConfig(subscription=SPEECH_KEY, region=SPEECH_REGION),
        audio_config=speechsdk.audio.AudioOutputConfig(filename=outaudio),
    )

    # Wait for the asynchronous synthesis to finish.
    outcome = synthesizer.speak_ssml_async(ssml).get()

    if outcome.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        printwithtime("Speech synthesis was successful. Audio was written to '{}'".format(outaudio))
        return

    if outcome.reason != speechsdk.ResultReason.Canceled:
        # Any other outcome is not reported.
        return

    details = outcome.cancellation_details
    printwithtime("Speech synthesis canceled: {}".format(details.reason))
    if details.reason == speechsdk.CancellationReason.Error and details.error_details:
        printwithtime("Error details: {}".format(details.error_details))


In [30]:
def GeneratePodcastFromText(text, prompt, addnews = True, lang = "en-US", outaudio = None):
    """Run the full pipeline: optional news lookup, SSML script, audio file.

    Args:
        text: Main topic / source content for the podcast.
        prompt: System prompt passed to ``CreatePodcastSsml``.
        addnews: When true, a news summary from ``GetNewsFromBing`` is
            appended to ``text`` before the script is generated.
        lang: Locale passed through to ``CreatePodcastSsml``.
        outaudio: Path of the audio file to write. Defaults to
            ``./podcasts/podcast_<timestamp>.wav``. The SSML script is saved
            next to it with the extension replaced by ``.ssml``.
    """
    if outaudio is None:
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        outaudio  = f"./podcasts/podcast_{timestamp}.wav"

    # Swap only the real extension. str.replace(".wav", ...) left the path
    # unchanged for other extensions (so the audio overwrote the SSML) and
    # could rewrite a ".wav" occurring in a directory name.
    outssml = os.path.splitext(outaudio)[0] + ".ssml"

    # Ensure the directory exists before writing to the file. A bare filename
    # has an empty dirname, which os.makedirs rejects, so skip it then.
    directory = os.path.dirname(outaudio)
    if directory:
        os.makedirs(directory, exist_ok=True)

    if addnews:
        # get news from Bing
        printwithtime("Getting news from Bing")
        news = GetNewsFromBing(text)
        if news:
            text += "\n Related news: \n" + news
        else:
            # The lookup yields nothing when all its retries fail; carry on
            # with the main topic instead of failing on the concatenation.
            printwithtime("No news summary available, continuing without related news")

    text = "Main topic: \n" + text

    # create podcast ssml
    printwithtime("Creating podcast ssml")
    ssml = CreatePodcastSsml(text, prompt, lang, outssml)

    # generate podcast
    printwithtime("Generating podcast with Azure TTS")
    GenerateAudio(ssml, outaudio)

## Stop Here
#### This is where you set up the podcast content details

In [31]:
# Locale of the generated conversation; interpolated into the prompt below
# and passed to GeneratePodcastFromText.
lang = "en-US"

# Show name and host names, both interpolated into the prompt below.
showname = "AI Morning Bytes"
hosts = "Brad & Rachel"
# Passed to GeneratePodcastFromText as its addnews argument.
addnews = True

# Main topic / source text for the podcast; passed to GeneratePodcastFromText
# as its text argument.
content = f"""
Most recent innovations in AI from Microsoft
"""

# There is no need to modify the prompt below unless a change in style, format, etc. is desired

# System prompt for the script generation. It ends with the SSML skeleton the
# model is asked to follow; the two voice names in it are fixed.
prompt = f"""
    Create a conversational, engaging podcast script named '{showname}' between two hosts {hosts}
    The podcast should be based on the input text and include additional related content to make it engaging
    The conversation should include examples, analogies, rethorical questions, and other engaging elements
    Include interjections such as yeah, uh-huh, right, etc. to make it sound more natural
    Include pauses, laughter, and other conversational elements as well as informal language like haha, wow, etc.
    Do not use acronyms or jargon even if included in the input text and rather spell it out in a conversational manner
    Maintain a conversational tone throughout the podcast
    Text may include related news, use these news as supplemental content, but focus mostly on the main topic
    Regardless of the language in the input text, the output conversation should be based on locale {lang}

    Output into SSML format like below, please don't change voice name even if hosts have other names

    <speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xmlns:mstts='https://www.w3.org/2001/mstts' xml:lang='en-US'>
    <voice name='en-us-Andrew:DragonHDLatestNeural'><lang xml:lang='{lang}'>text</lang></voice> 
    <voice name='en-us-ava:DragonHDLatestNeural'><lang xml:lang='{lang}'>text</lang></voice>
    </speak>
"""

In [None]:
# This is the step that triggers everything else and generates the audio file,
# using the settings defined in the previous cell.

GeneratePodcastFromText(content, prompt, addnews, lang)