# PDF to Podcast

## Prerequisites

1. Create a virtual environment:

    ```bash
    python -m venv venv
    ```

2. Install the required packages:

In [None]:
pip install -qU autogen pypdf langchain langchain-text-splitters langchain-core langchain-community lancedb langchain-openai python-dotenv azure-cognitiveservices-speech

In [None]:
# Read the .env file: load_dotenv() loads the variables it defines into the
# process environment, where the cells below read them through os.environ.

from dotenv import load_dotenv

load_dotenv()

In [16]:
import os

def get_file(file_name: str) -> str:
    """Return the path of ``file_name`` inside the ``outputs`` folder.

    The folder is created on demand, relative to the current working
    directory.

    Args:
        file_name (str): Name of the file inside the output folder.

    Returns:
        str: ``outputs`` joined with ``file_name``.
    """
    output_folder = 'outputs'
    # exist_ok=True replaces the separate os.path.exists() test and so avoids
    # the window between "check" and "create".
    os.makedirs(output_folder, exist_ok=True)
    return os.path.join(output_folder, file_name)

## PDF Information

In [17]:
# Set the pdf information: the title (used below for the file name and the
# document metadata) and the URL the PDF is downloaded from

pdf_title = 'LoRA: Low-Rank Adaptation of Large Language Models'
pdf_url = 'https://arxiv.org/pdf/2106.09685'

In [None]:
# Define the file name from the title: only ':' and '-' are removed, every
# other character (including spaces) is kept as it is

pdf_filename = pdf_title.replace(':', '').replace('-', '') + '.pdf'
print(pdf_filename)


## Load PDF as langchain document

In [19]:
# Download the PDF file into the outputs folder

import requests

# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() prevents an HTTP error page from being saved as the PDF.
response = requests.get(pdf_url, timeout=60)
response.raise_for_status()

with open(get_file(pdf_filename), 'wb') as file:
    file.write(response.content)



In [20]:
# Create the documents from the PDF file that was saved in the outputs folder;
# the result of loader.load() is kept in `documents`

from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader(get_file(pdf_filename))
documents = loader.load()

In [22]:
# Attach the same source information to the metadata of every document

shared_metadata = {
    'title': pdf_title,
    'source': pdf_url,
    'description': '',
    'thumbnail_url': '',
    'type': 'pdf',
}

for page in documents:
    page.metadata.update(shared_metadata)

## Create embeddings

In [23]:
# Split the document in chunks of maximum 1000 characters with 200 characters overlap using langchain.
# The resulting chunks are kept in `splits` and indexed in the vector store below.

from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
  chunk_size=1000,
  chunk_overlap=200
)
splits = text_splitter.split_documents(documents)

In [24]:
# Define the embeddings model.
# Every setting is read with os.environ[...], so a variable that is not set
# raises KeyError here.

from langchain_openai import AzureOpenAIEmbeddings

azure_openai_embeddings = AzureOpenAIEmbeddings(
  api_key=os.environ['OPENAI_API_KEY'],
  azure_endpoint=os.environ['OPENAI_AZURE_ENDPOINT'],
  api_version=os.environ['OPENAI_API_VERSION'],
  azure_deployment=os.environ['OPENAI_AZURE_DEPLOYMENT_EMBEDDINGS']
)

In [25]:
# Create the vector store from the chunks and expose it as a retriever

import lancedb
from langchain_community.vectorstores import LanceDB

# NOTE(review): `db` is not passed to LanceDB.from_documents below and is not
# used anywhere else in the visible notebook - confirm whether it was meant to
# be handed to the vector store or can be removed.
db = lancedb.connect("/tmp/lancedb")

vectorstore = LanceDB.from_documents(
  documents=splits,
  embedding=azure_openai_embeddings
)

retriever = vectorstore.as_retriever()

In [26]:
# Clean up: delete the downloaded PDF file.
# get_file() rebuilds the same path that was used when the file was written.

os.remove(get_file(pdf_filename))

## Create the langchain chain to do RAG

In [27]:
# Create the prompt for the chain with embeddings and LLM

from langchain_core.prompts import ChatPromptTemplate

# The instructions and the {context} placeholder are separated by a blank line.
# The separator has to be the escape sequence "\n\n"; the text "/n/n" would be
# sent to the model as four literal characters.
# NOTE(review): this prompt limits answers to three sentences and is also the
# system prompt of the chain that generates the podcast segments - confirm
# that limit is wanted there.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [28]:
# Define the LLM model.
# Every connection setting is read with os.environ[...], so a variable that is
# not set raises KeyError here.

from langchain_openai import AzureChatOpenAI

llm = AzureChatOpenAI(
  api_key=os.environ['OPENAI_API_KEY'],
  azure_endpoint=os.environ['OPENAI_AZURE_ENDPOINT'],
  api_version=os.environ['OPENAI_API_VERSION'],
  azure_deployment=os.environ['OPENAI_AZURE_DEPLOYMENT'],
  temperature=0,
  top_p=1
)

In [29]:
# Define the rag chain: a question-answer chain built from the LLM and the
# prompt, combined with the retriever. The chain is invoked with a dict that
# carries the question under "input", and the generated text is read from the
# "answer" key of the response.

from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

## Create the outline of the podcast

In [30]:
# Ask the RAG chain for a podcast outline and keep the generated text.
# NOTE(review): the topic "LoRA" is hard-coded in the question instead of being
# derived from pdf_title.
podcast_outline_response = rag_chain.invoke({"input": "Create an outline for a podcast on LoRA."})
podcast_outline = podcast_outline_response['answer']

In [31]:
# Write the podcast outline to the outputs folder

podcast_outline_file_name = pdf_filename.replace('.pdf', '_script.txt')

# The outline is model-generated text and may contain non-ASCII characters, so
# the encoding is pinned instead of relying on the platform default.
with open(get_file(podcast_outline_file_name), "w", encoding="utf-8") as f:
    f.write(podcast_outline)

## Create the podcast script

In [32]:
def read_and_print_file(file_path):
    """Return the full text content of ``file_path``.

    Despite its name the function prints nothing; the name is kept so that
    existing calls keep working.

    Args:
        file_path: Path of the text file to read. The file is decoded as UTF-8.

    Returns:
        str: The content of the file.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Pin the encoding so the result does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

# Example usage
# NOTE(review): this rebinds `podcast_outline` to the content of a fixed file
# whose name refers to a different title than `pdf_title`, replacing the
# outline generated by the RAG chain - confirm this is intended.
file_path = '../../data/How to fine-tune a model using LoRA (step by step)_podcast_outline.txt'  # Replace with the path to your file
podcast_outline = read_and_print_file(file_path)


In [None]:
# Split the outline into segments on the "####" marker. The element at index 0
# is the text that precedes the first marker.
segments_outline = podcast_outline.split("####")
print(segments_outline)

In [34]:
from IPython.display import Image, display

import autogen
from autogen.coding import LocalCommandLineCodeExecutor

# LLM endpoint configurations for the autogen agents, obtained from the JSON
# source named "OAI_CONFIG_LIST"
config_list = autogen.config_list_from_json(
    "OAI_CONFIG_LIST"
)

In [35]:
import json
import os


def _build_llm_config():
    """Return a fresh llm_config dict so that no two agents share one mutable object."""
    return {
        "cache_seed": 41,  # seed for caching and reproducibility
        "config_list": config_list,  # a list of OpenAI API configurations
        "temperature": 0,  # temperature for sampling
    }


def _is_termination_msg(message):
    """Return True when the content of a chat message ends with 'TERMINATE'."""
    # The "content" key may be present with a None value, in which case
    # message.get("content", "") returns None and .rstrip() would fail.
    content = message.get("content") or ""
    return content.rstrip().endswith("TERMINATE")


# create an AssistantAgent named "writer" that rewrites a segment after a bad review
writer = autogen.AssistantAgent(
    name="writer",
    system_message="""You are a writer of a podcast. If you get a bad review from the reviewer on a segment of a podcast, you need to rewrite that podcast segment. 
    Output the rewritten segment as a JSON with the following fields:
        - text: an array of objects with the speaker, the intonation and the text to be spoken
        Return only the json as plain text.
    Return 'TERMINATE' when the task is done.""",
    llm_config=_build_llm_config(),  # configuration for autogen's enhanced inference API which is compatible with OpenAI API
)

# create an AssistantAgent named "reviewer" that asks for duplicate questions and answers to be removed
reviewer = autogen.AssistantAgent(
    name="reviewer",
    system_message="""You are a reviewer of a podcast.
    If you see questions and answers that are duplicate one, please ask to remove them. 
    Return 'TERMINATE' when the task is done.""",
    llm_config=_build_llm_config(),  # configuration for autogen's enhanced inference API which is compatible with OpenAI API
)

# create a UserProxyAgent instance named "user_proxy"; it never asks for human input
user_proxy = autogen.UserProxyAgent(
    name="user_proxy",
    human_input_mode="NEVER",
    max_consecutive_auto_reply=10,
    is_termination_msg=_is_termination_msg,
    # code_execution_config={
    #     # the executor to run the generated code
    #     "executor": LocalCommandLineCodeExecutor(work_dir="coding"),
    # },
)

# Group chat between the three agents, limited to 12 rounds
groupchat = autogen.GroupChat(agents=[user_proxy, reviewer, writer], messages=[], max_round=12)
manager = autogen.GroupChatManager(groupchat=groupchat, llm_config=_build_llm_config())

In [None]:
podcast_conversation = ""
full_script = ""
podcast_previous_script_text = ""

# segments_outline[0] is the text that precedes the first "####" marker, so the
# podcast segments proper start at index 1. Blank entries are skipped because
# they carry no outline to write a segment from.
podcast_segments = [part for part in segments_outline[1:] if part.strip()]
last_segment_index = len(podcast_segments) - 1

for segment, segment_outline in enumerate(podcast_segments):

    # Create a prompt with the outline to get a full podcast text.
    # The position in the list decides between introduction, middle and closing.
    # The prompts are f-strings and therefore already complete: they must not be
    # passed through str.format() again, which would fail as soon as the outline
    # contains a brace.
    if segment == 0:
        podcast_prompt = f"""Create the first segment of a podcast text which is the introduction based on the following part of an outline:

        {segment_outline}

        This text will be used to generate the audio of the podcast. 
        There are 2 participants in the podcast: the host and the guest. 
        The host will introduce the podcast and the guest. 
        Both the host and the speaker should stick to the {segment_outline} as topic of the podcast.
        The host should follow the {segment_outline} for asking questions to the guest.
        The name of the host is Bill and his role is to be the listener's podcast assistant. 
        The name of the guest is Melinda and her role is to be the expert in the podcast topic. 
        The name of the podcast is "Advanced AI Podcast".
        

        When you thanks someone, write "Thank you" and the name of the person without a comma. For example, "Thank you Bill".

        Output as a JSON with the following fields:
        - text: an array of objects with the speaker, the intonation and the text to be spoken
        Return only the json as plain text.
        """
    elif segment < last_segment_index:
        podcast_prompt = f"""Create a segment which is in the middle of a podcast text based on the following part of an outline:

        {segment_outline}

        This text will be used to generate the audio of the podcast. 
        There are 2 participants in the podcast: the host and the guest. 
        The host will ask questions about the {segment_outline} to the guest and the guest will answer them. 
        Both the host and the speaker should stick to the {segment_outline} as topic of the podcast.
        The host should follow the {segment_outline} for asking questions to the guest.
        The name of the host is Bill and his role is to be the listener's podcast assistant. 
        The name of the guest is Melinda and her role is to be the expert in the podcast topic.
        This is in the middle of the podcast, so don't welcome the listeners again and don't specify what this segment is about!

        Output as a JSON with the following fields:
        - text: an array of objects with the speaker, the intonation and the text to be spoken
        Return only the json as plain text.
        """
    else:
        podcast_prompt = f"""Create a segment which is the closing of a podcast text based on the following part of an outline:

        {segment_outline}

        This text will be used to generate the audio of the podcast. 
        There are 2 participants in the podcast: the host and the guest.  
        The host will thank the guest and close the podcast.
        The host should follow the {segment_outline} for asking questions to the guest.
        Both the host and the speaker should stick to the {segment_outline} as topic of the podcast.
        The name of the host is Bill and his role is to be the listener's podcast assistant. 
        The name of the guest is Melinda and her role is to be the expert in the podcast topic. 

        When you thanks someone, write "Thank you" and the name of the person without a comma. For example, "Thank you Bill".

        Output as a JSON with the following fields:
        - text: an array of objects with the speaker, the intonation and the text to be spoken
        Return only the json as plain text.
        """

    podcast_script_response = rag_chain.invoke({"input": podcast_prompt})
    podcast_script_text = podcast_script_response['answer']

    # From the second segment on, the previous and the new segment are reviewed
    # together so that the transition between them is evaluated as well.
    if segment > 0:
        review_message = f"""Combine {podcast_previous_script_text} and {podcast_script_text} and evaluate if the conversation is 
        a coherent one, has a good narative, if it has a same conversation style, and a seamless transition between the two parts. 
        """
    else:
        review_message = f"""Evaluate if the start of the conversation {podcast_script_text} is 
        a coherent one, has a good narative. 
        """

    chat_res = user_proxy.initiate_chat(
        recipient=manager,
        message=review_message,
        summary_method="reflection_with_llm",
    )

    podcast_previous_script_text = podcast_script_text
    podcast_conversation = podcast_conversation + podcast_script_text

    # The third message of the chat history is appended to the script
    # (presumably the writer's reviewed version of the segment - TODO confirm).
    # A chat that ended earlier has no such message; the segment is then left
    # out, and the reason is reported instead of being silently ignored.
    try:
        full_script = full_script + chat_res.chat_history[2]['content']
    except (IndexError, KeyError, TypeError) as error:
        print(f"Segment {segment}: no reviewed text in the chat history ({error!r}); segment skipped.")
        # print(podcast_script_text)
    

In [None]:
# type(chat_res.chat_history[2]['content'])
# Stitch the per-segment texts accumulated in `full_script` into one JSON
# document by string substitution, then parse it:
#   - every 'TERMINATE' keyword is removed,
#   - the '"text": [' openers and all ']' characters are removed, and the
#     '}{' / '},{' / "\n},\n" joints between objects are rewritten,
#   - the first and the last character of the result are cut off,
#   - the result is wrapped in a header with a fixed title and a new "text" array.
# NOTE(review): this depends on the exact whitespace and layout of the model
# output, and removing every ']' also removes that character from the spoken
# text; json.loads raises json.JSONDecodeError when the layout differs.
# NOTE(review): the title is hard-coded and is not taken from `pdf_title`.
json_dict = json.loads("""{  "title": "How to Fine-Tune a Model Using LoRA (Step by Step)",
  "text": [""" + full_script.replace('TERMINATE','').replace('}{', '},{').replace(""""text": [""",'').replace('},{', '},').replace(']','').replace("\n},\n",",").replace("{ \n \n  {", "[ \n {")[1:-1] + "] }")
print(json_dict)


In [38]:
# # Create a prompt with the outline to get a full podcast text

# podcast_prompt = f"""Create a podcast complete text based on the following outline:

# {podcast_outline}

# This text will be used to generate the audio of the podcast. There are 2 participants in the podcast: the host and the guest. The host will introduce the podcast and the guest. The guest will explain the outline of the podcast. The host will ask questions to the guest and the guest will answer them. The host will thank the guest and close the podcast.
# The name of the host is Bill and his role is to be the listener's podcast assistant. The name of the guest is Melinda and her role is to be the expert in the podcast topic. The name of the podcast is "Advanced AI Podcast".

# When you thanks someone, write "Thank you" and the name of the person without a comma. For example, "Thank you Bill".

# Output as a JSON with the following fields:
# - title: Title of the podcast
# - text: an array of objects with the speaker, the intonation and the text to be spoken
# Return only the json as plain text.
# """

# formatted_podcast_prompt = podcast_prompt.format(podcast_outline)

In [None]:
# # Generate the podcast script

# podcast_script_response = rag_chain.invoke({"input": formatted_podcast_prompt})
# podcast_script_text = podcast_script_response['answer']

In [39]:
# Build the podcast script as a JSON string from `full_script`: the
# 'TERMINATE' keywords, the '"text": [' openers and all ']' characters are
# removed, the joints between objects are rewritten, the first and the last
# character are cut off, and the result is wrapped in a header with a fixed
# title and a new "text" array.
# NOTE(review): this is the same expression that is passed to json.loads for
# `json_dict`; the two copies have to be kept in sync by hand.
podcast_string = """{  "title": "How to Fine-Tune a Model Using LoRA (Step by Step)",
  "text": [""" + full_script.replace('TERMINATE','').replace('}{', '},{').replace(""""text": [""",'').replace('},{', '},').replace(']','').replace("\n},\n",",").replace("{ \n \n  {", "[ \n {")[1:-1] + "] }"

In [81]:
# # Save the podcast script

# podcast_script_file_name = pdf_filename.replace('.pdf', '_script.json')

# with open(get_file(podcast_script_file_name), "w") as f:
#     f.write(podcast_script_text)

## Generate the podcast audio

In [None]:
# Rebind podcast_script_text to the parsed script held in json_dict and show it
podcast_script_text = json_dict
print(podcast_script_text)

In [91]:
import os
from openai import AzureOpenAI

def add_ssml_and_style(line, line_style):
    """Ask the Azure OpenAI chat deployment to rewrite ``line`` with SSML style markup.

    Args:
        line: Text to be spoken.
        line_style: Intonation that the SSML markup should express.

    Returns:
        The message content of the first completion choice.

    Raises:
        ValueError: If one of the required environment variables is missing
            or empty.
    """
    # Read the settings with .get() and validate them before anything else, so
    # that a missing variable is reported by the explicit check below instead
    # of surfacing as a KeyError or as a failure inside the client.
    required_names = (
        'OPENAI_API_KEY',
        'OPENAI_AZURE_ENDPOINT',
        'OPENAI_API_VERSION',
        'OPENAI_AZURE_DEPLOYMENT',
    )
    settings = {name: os.environ.get(name) for name in required_names}
    missing = [name for name, value in settings.items() if not value]
    if missing:
        raise ValueError(
            "Environment variables for Azure OpenAI Key and Endpoint are not set. "
            "Missing: " + ", ".join(missing)
        )

    deployment_name = settings['OPENAI_AZURE_DEPLOYMENT']

    # Initialize AzureOpenAI client
    azure_openai_client = AzureOpenAI(
        api_key=settings['OPENAI_API_KEY'],
        azure_endpoint=settings['OPENAI_AZURE_ENDPOINT'],
        api_version=settings['OPENAI_API_VERSION']
    )

    prompt_template = """Given following text and its entonation, rewrite the intonations of this text with SSML
    Text: {text}
    Intonation:
    {intonation}
    You can use the intonation to add the style to the text as in this example:
    '''<mstts:express-as style="Excited" styledegree="1">Hello everyone!</mstts:express-as>'''
    The styledegree can go from 0.01 to 2
    Note that you do not need to add the "<speak> and <voice> tags. 
    Do not change the pitch.
    Keep the rate always to medium
    ONLY return the imrpoved modified text!!
    """
    prompt = prompt_template.format(text=line, intonation=line_style)
    system_p = "You are an expert in SSML. You will be given a text and an intonation and you will have to return the same text improved with SSML"
    completion = azure_openai_client.chat.completions.create(
        model=deployment_name,
        temperature=0,
        top_p=1,
        messages=[
            {"role": "system", "content": system_p},
            {"role": "user", "content": prompt},
        ])
    return completion.choices[0].message.content
    


In [116]:
import json
from xml.sax.saxutils import escape

# This is an example of SSML (Speech Synthesis Markup Language) format.
# <speak version="1.0" xmlns="https://www.w3.org/2001/10/synthesis" xml:lang="en-US">
#   <voice name="en-US-AvaMultilingualNeural">
#     When you're on the freeway, it's a good idea to use a GPS.
#   </voice>
# </speak>
# Parse the JSON response and create a SSML with en-US-AndrewMultilingualNeural for Bill Voice
# and en-US-AvaMultilingualNeural for Melinda Voice
# NOTE(review): the SSML namespace is usually written with "http://", not
# "https://" - confirm against the speech service documentation.
podcast_script_json = json.loads(podcast_string)
# podcast_script_json = podcast_script_text

speaker_voices = {
    'Bill': 'en-US-AndrewMultilingualNeural',
    'Melinda': 'en-US-AvaMultilingualNeural',
}

ssml_parts = ["<speak version='1.0' xmlns='https://www.w3.org/2001/10/synthesis' xml:lang='en-US'>"]
for line in podcast_script_json['text']:
    voice_name = speaker_voices.get(line['speaker'])
    if voice_name is None:
        # Lines of any other speaker are not voiced; report them so that a
        # misspelled speaker name does not go unnoticed.
        print(f"No voice configured for speaker {line['speaker']!r}; line skipped.")
        continue
    # The spoken text is plain text placed inside an XML element, so '&', '<'
    # and '>' have to be escaped to keep the document well-formed.
    spoken_text = escape(str(line['text']))
    ssml_parts.append(f"<voice name='{voice_name}'>{spoken_text}</voice>")
ssml_parts.append("</speak>")
ssml_text = "".join(ssml_parts)

# # use the default speaker as audio output.
# speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config)

# result = speech_synthesizer.speak_ssml_async(ssml_text).get()
# stream = speechsdk.AudioDataStream(result)
# podcast_filename = pdf_filename.replace('.pdf', '_podcast.wav')
# stream.save_to_wav_file(get_file(podcast_filename))



In [None]:
import os

# Function to split SSML content into chunks
def split_ssml(ssml_content, max_length=10000):
    """Cut ``ssml_content`` into pieces of at most ``max_length`` characters.

    Every cut is made right after the last ``</voice>`` closing tag that fits
    completely into the first ``max_length`` characters of the remaining text.

    Args:
        ssml_content: The text to split.
        max_length: Maximum number of characters per piece.

    Returns:
        list: The pieces in their original order; the last one holds the rest.

    Raises:
        ValueError: If no ``</voice>`` tag fits into the next ``max_length``
            characters.
    """
    closing_tag = '</voice>'
    pieces = []
    remaining = ssml_content
    while len(remaining) > max_length:
        tag_start = remaining.rfind(closing_tag, 0, max_length)
        if tag_start == -1:
            raise ValueError("Cannot split SSML content properly.")
        cut = tag_start + len(closing_tag)
        pieces.append(remaining[:cut])
        remaining = remaining[cut:]
    pieces.append(remaining)
    return pieces

# Function to wrap content in SSML tags
def wrap_in_ssml(i, content, total=None):
    """Turn chunk ``i`` of a split SSML document into a complete ``<speak>`` document.

    The document that is split already starts with the opening ``<speak ...>``
    tag and ends with ``</speak>``. The first chunk therefore only lacks the
    closing tag, the last chunk only lacks the opening tag, and the chunks in
    between lack both. A document that was not split at all (one chunk) is
    already complete and is returned unchanged.

    Args:
        i: Zero-based index of the chunk.
        content: Text of the chunk.
        total: Number of chunks. Defaults to the module-level
            ``number_of_chuncks``.

    Returns:
        str: The chunk with the missing ``<speak>`` tags added.

    Raises:
        ValueError: If ``i`` is not a valid index for ``total`` chunks.
    """
    speak_open = "<speak version='1.0' xmlns='https://www.w3.org/2001/10/synthesis' xml:lang='en-US'>"
    speak_close = "</speak>"

    if total is None:
        total = number_of_chuncks
    if not 0 <= i < total:
        raise ValueError(f"Chunk index {i} is out of range for {total} chunks.")

    # The first chunk carries the original opening tag and the last chunk the
    # original closing tag; adding them again would produce invalid XML.
    prefix = "" if i == 0 else speak_open
    suffix = "" if i == total - 1 else speak_close
    return f"{prefix}{content}{suffix}"

# Break the full SSML document into pieces of bounded size
chunks = split_ssml(ssml_text)

# Number of pieces; wrap_in_ssml() and the cells below read this global
number_of_chuncks = len(chunks)

# Store every piece, wrapped as its own SSML document, in output_part_<index>.ssml
for i in range(number_of_chuncks):
    wrapped_chunk = wrap_in_ssml(i, chunks[i])
    with open(f'output_part_{i}.ssml', 'w') as file:
        file.write(wrapped_chunk)

print("SSML content split and saved to multiple files.")


In [None]:
import os

# Audio files that were synthesized successfully, in playback order
infiles = []

# Fetch environment variables
speech_key = os.environ.get('AZURE_SPEECH_SERVICE_KEY')
service_region = os.environ.get('AZURE_SPEECH_SERVICE_REGION')

# Validate before normalizing: calling .replace() on a missing (None) region
# would raise AttributeError before the explicit check could report the problem.
if not speech_key or not service_region:
    raise ValueError("Environment variables for Azure Speech Key and Region are not set.")

# The region is used as the first label of the endpoint host name, so spaces
# are removed and the name is lower-cased.
service_region_azure = service_region.replace(" ", "").lower()
if not service_region_azure:
    raise ValueError("Environment variables for Azure Speech Key and Region are not set.")


# Function to perform speech synthesis using REST API
def synthesize_speech_rest(ssml_content, output_filename):
    """Send ``ssml_content`` to the text-to-speech REST endpoint and save the audio.

    Args:
        ssml_content: SSML document to synthesize.
        output_filename: Path of the audio file to write.

    Returns:
        bool: True when the audio was written, False when the service answered
        with a status other than 200 (the status and body are printed).
    """
    url = f"https://{service_region_azure}.tts.speech.microsoft.com/cognitiveservices/v1"
    headers = {
        'Ocp-Apim-Subscription-Key': speech_key,
        'Content-Type': 'application/ssml+xml',
        'X-Microsoft-OutputFormat': 'riff-16khz-16bit-mono-pcm',
        'User-Agent': 'AutoPodCaster/1.0'
    }
    # Send explicit UTF-8 bytes: how a str body is encoded is decided by the
    # HTTP stack, and non-ASCII characters are common in generated text.
    response = requests.post(url, headers=headers, data=ssml_content.encode('utf-8'), timeout=300)

    if response.status_code != 200:
        print(f"Speech Error: {response.status_code}")
        print(response.text)
        return False

    with open(output_filename, 'wb') as audio_file:
        audio_file.write(response.content)
    print(f"Speech synthesized to '{output_filename}'")
    return True


# NOTE: the Azure Speech SDK (speechsdk.SpeechSynthesizer.speak_ssml_async) was
# tried as an alternative to the REST call and gave issues with the region name.
for i in range(number_of_chuncks):
    # f-string so that {i} is replaced by the chunk index. The path is relative,
    # like the one the SSML chunks are written to.
    file_path = f'output_part_{i}.ssml'

    # Read SSML content
    with open(file_path, 'r') as file:
        ssml_content = file.read()

    # Set the output format to WAV
    podcast_filename = pdf_filename.replace(' ', '_').replace('.pdf', f'_podcast_{i+1}.wav')

    # Perform speech synthesis using REST API; only files that were really
    # written are queued for merging.
    if synthesize_speech_rest(ssml_content, podcast_filename):
        infiles.append(podcast_filename)


In [None]:
# Show the name of the last audio file and the list of files collected for merging
print(podcast_filename)
print(infiles)

In [None]:
import wave

# Merge multiple wav files into one.
# The name of the merged file is derived from the name of the last part by
# replacing its "_<number of chunks>.wav" ending.
outfile = podcast_filename.replace('_' + str(number_of_chuncks) + '.wav', '_merged.wav')

# (parameters, frames) of every file that could be read
data = []

for infile in infiles:
    try:
        with wave.open(infile, 'rb') as w:
            data.append((w.getparams(), w.readframes(w.getnframes())))
    except EOFError:
        print(f"Error reading {infile}: Unexpected end of file. Skipping this file.")
    except wave.Error as e:
        print(f"Error reading {infile}: {e}. Skipping this file.")
    except OSError as e:
        # A missing or unreadable file is skipped like any other broken part
        # instead of aborting the whole merge.
        print(f"Error reading {infile}: {e}. Skipping this file.")

if data:
    with wave.open(outfile, 'wb') as output:
        # The parameters of the first readable file are used for the merged file.
        # NOTE(review): the parts are assumed to share one audio format; this is
        # not checked.
        output.setparams(data[0][0])
        for _, frames in data:
            output.writeframes(frames)
    print(f"Merged podcast saved as {outfile}")
else:
    print("No valid WAV files to merge.")

# Example usage