**Goal** : Convert YouTube video podcasts into concise audio summaries

**Plan** : 

- Task 1 : Retrieve Transcripts: Download transcripts from YouTube video podcasts
- Task 2 : Summarise Transcripts: Generate concise summaries from downloaded transcripts
- Task 3 : Convert Summaries to Audio: Use text-to-speech to create audio summaries 

In [1]:
# imports

import enum
import instructor
import os
import re
from abc import ABC, abstractmethod
from dotenv import load_dotenv
from openai import OpenAI
from pathlib import Path
from pprint import pprint as pp
from pydantic import BaseModel, Field, StringConstraints, conlist, field_validator
from typing import Any, ClassVar, Dict, Iterable, List, Optional, Type, Union
from typing_extensions import Annotated, Literal
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import JSONFormatter, TextFormatter

In [2]:
# Load environment variables (including the OpenAI API key) from a local .env file.
# NOTE(review): the path is an absolute, machine-specific Windows path.

dotenv_path = Path(r"C:\Storage\python_projects\ashvin\.env")
load_dotenv(dotenv_path=dotenv_path)
# NOTE(review): OPENAI_API_KEY is read here but never passed to OpenAI() below;
# the clients presumably pick the key up from the environment themselves — confirm.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# main constants

GPT_MODEL = "gpt-4o" # model name passed to every chat completion made by wrapper()

# Instantiate clients. This must happen after load_dotenv() so the environment is populated.
# `client` is the instructor-wrapped client used for chat completions;
# `audio_client` is a plain OpenAI client used by the text-to-speech tool.
client = instructor.from_openai(OpenAI(), mode=instructor.Mode.TOOLS)
audio_client = OpenAI()

In [3]:
# wrapper

def _as_user_messages(content):
    """Turn a string, a list of strings, or None into a list of user-role chat messages."""
    if content is None:
        return []
    items = content if isinstance(content, list) else [content]
    return [{"role": "user", "content": item} for item in items]


def wrapper(
    system_prompt: str | None = None,
    user_prompt: Union[str, list] | None = None,
    response_model: Type[BaseModel] | None = None,
    max_retries: int = 3,
    additional_messages: Union[str, List[str]] | None = None
):
    """
    Wrapper function to generate an LLM completion.

    Messages are sent in this order: the system prompt (if any), then every
    entry of ``additional_messages``, then every entry of ``user_prompt``.
    All non-system entries are sent with the "user" role.

    Parameters:
        system_prompt (str | None): Optional system instruction.
        user_prompt (str | list | None): One prompt or a list of prompts.
        response_model (Type[BaseModel] | None): Pydantic model *class* the
            completion should be parsed into; None requests plain text.
        max_retries (int): Retry budget forwarded to the client.
        additional_messages (str | list[str] | None): Extra user messages
            placed before ``user_prompt``.

    Returns:
        The parsed ``response_model`` instance when a model is given;
        otherwise the stripped text of the first choice, or an empty string
        when the completion carries no text content.
    """
    messages = []

    if system_prompt is not None:
        messages.append({"role": "system", "content": system_prompt})

    # Additional messages deliberately precede the main user prompt.
    messages.extend(_as_user_messages(additional_messages))
    messages.extend(_as_user_messages(user_prompt))

    completion = client.chat.completions.create(
        model=GPT_MODEL,
        response_model=response_model,
        max_retries=max_retries,
        messages=messages
    )

    if response_model is not None:
        return completion

    # `content` can be None (e.g. no text was produced); avoid calling
    # .strip() on None and hand back an empty string instead.
    content = completion.choices[0].message.content
    return content.strip() if content is not None else ""

In [4]:
# transcript tool

class Transcript(BaseModel):
    """
    This tool extracts the YouTube video ID from a given URL, retrieves the transcript, and 
    formats it as a JSON string.
    """
    # NOTE: pydantic exposes the class docstring above as the schema description,
    # and Router.select embeds that schema in its prompt, so its wording is
    # effectively runtime text — edit it deliberately.

    def run(self, url: str) -> Optional[str]:
        """
        Extract the YouTube video ID from a given URL, retrieve the transcript,
        and format it as a JSON string.

        Accepted URL shapes (with or without scheme / subdomain):
            youtube.com/watch?v=ID, youtube.com/watch?...&v=ID,
            youtube.com/embed/ID, youtube.com/shorts/ID, youtube.com/live/ID,
            youtube-nocookie.com/embed/ID, youtu.be/ID

        Parameters:
            url (str): The YouTube URL from which to extract the video ID.

        Returns:
            Optional[str]: The JSON formatted transcript if the video ID is valid and the
                           transcript is available, otherwise None.
        """
        # A missing or non-string URL (e.g. a failed upstream step) is treated
        # like an unparseable one instead of raising TypeError in re.search.
        if not isinstance(url, str) or not url.strip():
            print("No valid YouTube video ID found in the provided URL.")
            return None

        # The 11-character video ID may follow a `v=` query parameter at any
        # position in the query string, or a path prefix used by short/embed links.
        pattern = (
            r'(?:youtube(?:-nocookie)?\.com/(?:watch\?(?:[^#\s]*?&)?v=|embed/|shorts/|live/)'
            r'|youtu\.be/)([a-zA-Z0-9_-]{11})'
        )
        match = re.search(pattern, url)
        if not match:
            print("No valid YouTube video ID found in the provided URL.")
            return None

        video_id = match.group(1)

        try:
            # Retrieve the transcript
            transcript = YouTubeTranscriptApi.get_transcript(video_id)

            # Format the transcript as JSON
            return JSONFormatter().format_transcript(transcript)
        except Exception as e:
            # Best-effort tool: report the problem and signal failure with None.
            print(f"Error retrieving or formatting transcript: {e}")
            return None

In [110]:
# summary tool
class Summary(BaseModel):
    """
    This tool summarises a given input text
    """

    # The None default is load-bearing: Router.run instantiates tools with no
    # arguments (`tool_class()`), so this field cannot be made required.
    # NOTE(review): this description asks for "under 50 words" while the system
    # prompt in run() asks for "At least 500 words" — the two instructions
    # conflict; decide which length is intended.
    summary: str = Field(None, description="A clear, concise summary of the text in under 50 words.")

    def run(self, text: str) -> Optional[str]:
        """
        Summarize the input text.

        Parameters:
            text (str): The input text to summarize.

        Returns:
            Optional[str]: The summary text, or None when no input text was
                           provided (no LLM call is made in that case).
        """
        # Without input the request would contain only the system prompt and
        # the model would have nothing real to summarise.
        if not isinstance(text, str) or not text.strip():
            print("No text provided to summarise.")
            return None

        system_prompt: str = """
        You are an expert in AI and science communication, able to make technical content detailed, interesting and accessible 
        - You write in the spirit of Richard Feynman, making complex concepts easy to understand without sacrificing quality. 
        - Create a long, detailed and comprehensive summary of the provided text.
        - At least 500 words in length.
        - Provide detail on all central ideas; also include peripheral items that are interesting
        - Rely strictly on the provided text, without including external information.
        - Format the summary in markdown with bullet points, signposting and any other tools for accessible reading.
        - Ensure text is connected and clear with each sentence flowing seamlessly into the next.
        - Have a tone of voice that is simple, clear, accessible and direct.
        """

        # Alternative instruction kept for reference (not sent to the model):
        # - Write a summary in the style of Raymond Carver but preserve all detail: simple, clear and direct.

        completion = wrapper(
            system_prompt=system_prompt,
            user_prompt=text,
            response_model=Summary,
            max_retries=3
        )

        # Only the text is returned, not the Summary instance.
        return completion.summary

In [6]:
# text to speech tool

class TextToSpeech(BaseModel):
    """
    This tool converts input text into speech using the OpenAI text-to-speech API and saves it as an MP3 file.
    """

    def run(self, text: str, model: str = "tts-1", voice: str = "alloy", speed: float = 1.0, response_format: str = "mp3", filename: str = "speech") -> Optional[str]:
        """
        Convert input text into speech and save it as an audio file in the
        current working directory.

        Parameters:
            text (str): The text to convert into speech.
            model (str): The TTS model to use. Defaults to "tts-1".
            voice (str): The voice to use for the speech. Defaults to "alloy".
            speed (float): The speed of the speech. Defaults to 1.0.
            response_format (str): The format of the output audio file; also used as
                                   the file extension. Defaults to "mp3".
            filename (str): The name of the output file (without extension). Defaults to "speech".

        Returns:
            Optional[str]: The path to the saved audio file if successful, otherwise None.
        """
        # Skip the API call when there is nothing to speak (e.g. an upstream
        # step returned None).
        if not isinstance(text, str) or not text.strip():
            print("Error generating or saving speech: no input text provided.")
            return None

        try:
            # Define the path to save the audio file
            speech_file_path = Path.cwd() / f"{filename}.{response_format}"

            # Stream the audio straight to disk. Calling stream_to_file on the
            # plain (non-streaming) response is deprecated in the OpenAI SDK.
            with audio_client.audio.speech.with_streaming_response.create(
                model=model,
                voice=voice,
                input=text,
                speed=speed,
                response_format=response_format
            ) as response:
                response.stream_to_file(speech_file_path)
        except Exception as e:
            # Best-effort tool: report the problem and signal failure with None.
            print(f"Error generating or saving speech: {e}")
            return None

        # Print the file path for easy access
        print(f"Saved speech file at: {speech_file_path}")
        return str(speech_file_path)

In [7]:
class Fallback(BaseModel):
    """
    A fallback tool to be selected when the other tools are not appropriate.

    This class serves as a placeholder or default option in cases where no other 
    specific tool is suitable for the given task. It can be used to provide a 
    default response or action.
    """

    def run(self, input_data: Any = None) -> None:
        """
        Do nothing except report that no suitable tool is available.

        Gives Fallback the same `run(input)` entry point as the other tools, so
        code that calls `.run(...)` on whichever tool was selected does not fail
        with AttributeError when the fallback is chosen.

        Parameters:
            input_data (Any): Ignored; accepted only for interface parity.

        Returns:
            None
        """
        print("No suitable tool is available for this request.")
        return None


In [111]:
# tools

class Tools(BaseModel):
    """
    This class represents available tools for the user's context.
    
    Attributes:
        tools: (Union[Transcript, Summary, TextToSpeech, Fallback]): Available tool classes
    """
    # Lookup table from a tool's class name to the class itself. Declared as a
    # ClassVar so it is shared class-level data rather than a model field.
    tool_class_mapping: ClassVar[Dict[str, Type[BaseModel]]] = {
        tool_cls.__name__: tool_cls
        for tool_cls in (Transcript, Summary, TextToSpeech, Fallback)
    }

    # The single model field: any one of the available tool models.
    tools: Union[Transcript, Summary, TextToSpeech, Fallback]

# tool title

class ToolTitle(BaseModel):
    """
    This class represents the title of the most relevant tool selected for the user's context.
    
    Attributes:
        tool_title (str): The title of the single most relevant tool selected for the user's context.
                          This attribute provides a clear and concise identifier for the selected tool,
                          which is determined based on the user's prompt or context.
    """

    # Restricted to the known tool class names so that a misspelt or invented
    # title fails validation instead of being accepted as an arbitrary string.
    # Keep this list in sync with the tool classes defined in this file.
    tool_title: Literal["Transcript", "Summary", "TextToSpeech", "Fallback"] = Field(
        ...,
        description="The title of the single most relevant tool selected for the user's context"
    )


In [112]:
# router 

# bug note 1 : maybe using Literal for the string enforces better checking for tool_title?

class Router(BaseModel):
    """
    Router tool for selecting the appropriate tool based on user prompt.
    """

    def select(self, user_prompt: str) -> ToolTitle:
        """
        Select the appropriate tool based on the user prompt.

        The JSON schema of `Tools` is placed in the system prompt and the LLM is
        asked to return the title of the single most relevant tool.

        Parameters:
            user_prompt (str): The user prompt to guide tool selection.

        Returns:
            ToolTitle: The title of the single tool selected for the user prompt.
        """
        tools = Tools.model_json_schema()
        system_prompt: str = f"You are an intelligent tool selector. Select and return the single right tool for the user from this list : {tools}"

        return wrapper(
            system_prompt=system_prompt,
            user_prompt=f"Select and return the single relevant tool title for : {user_prompt}",
            response_model=ToolTitle,
            max_retries=3
        )

    def run(self, user_prompt: str, input_data: Any = None) -> Any:
        """
        Run the appropriate tool based on the user prompt and input data.

        Parameters:
            user_prompt (str): The user prompt to guide tool selection.
            input_data (Any): The input data to pass to the tool's run method.

        Returns:
            Any: The result of the tool's run method, or None when the selected
                 tool cannot be run.
        """
        selected_tool = self.select(user_prompt)

        # Unknown titles fall back to the Fallback tool.
        tool_class = Tools.tool_class_mapping.get(selected_tool.tool_title, Fallback)
        tool_instance = tool_class()

        # Not every tool is guaranteed to define `run`; calling it blindly would
        # raise AttributeError, so report the situation and return None instead.
        tool_run = getattr(tool_instance, "run", None)
        if not callable(tool_run):
            print(f"Tool '{tool_class.__name__}' cannot be run; no action taken.")
            return None

        return tool_run(input_data)


In [113]:
# Natural-language task descriptions passed to Router as routing prompts.
# task_4 is a deliberate non-matching request; the recorded output of
# router.select(task_4) is the Fallback tool.
task_1 = "Task 1 : Retrieve Transcripts: Download transcripts from YouTube video podcasts"
task_2 = "Task 2 : Summarise Transcripts: Generate concise summaries from downloaded transcripts"
task_3 = "Task 3 : Convert Summaries to Audio: Use text-to-speech to create audio summaries"
task_4 = "Task 4 : Add numbers"

In [85]:
# Check tool selection only (no tool is executed) for a task with no matching tool.
router = Router()
response = router.select(task_4)
# Bare expression so the notebook displays the returned ToolTitle.
response

ToolTitle(tool_title='Fallback')

In [121]:
# End-to-end run: each router.run call first asks the LLM to pick a tool for the
# task description, then runs that tool on the previous step's output.
# NOTE(review): a step that fails returns None, which is passed straight into
# the next step without any check.
url = "https://www.youtube.com/watch?v=ltuHJ0wzr2o"
router = Router()
response_1 = router.run(task_1, url)
response_2 = router.run(task_2, response_1)
pp(response_2)
response_3 = router.run(task_3, response_2)

('An engaging and wide-ranging talk that explores the impact and future of '
 'algorithms, focusing on AI and digital minions in our lives and their roles '
 'in the economy.')
Saved speech file at: c:\Storage\python_projects\ashvin\sandbox\pydantic\speech.mp3


In [109]:
# Re-run only the summary step, reusing whatever `response_1` currently holds
# in the notebook session.
router = Router()
response_2 = router.run(task_2, response_1)
pp(response_2)

('The text outlines the process of building an AI application from scratch, '
 'highlighting the ease and speed with which it can be done. It includes steps '
 'for transforming raw text into a polished front end, emphasizing iterative '
 'development, effective use of models, and structured data handling.')


In [122]:
# Bare expression so the notebook displays the value returned by the transcript step.
response_1


'[{"text": "thanks so much to all of you who are in", "start": 0.24, "duration": 3.96}, {"text": "the room my name is mar Kitz everyone", "start": 1.64, "duration": 4.28}, {"text": "seems to be talking about artificial", "start": 4.2, "duration": 4.08}, {"text": "intelligence these days right half of", "start": 5.92, "duration": 4.56}, {"text": "the sessions that start by are all about", "start": 8.28, "duration": 6.12}, {"text": "AI but it\'s not just AI uh AI is one", "start": 10.48, "duration": 5.799}, {"text": "technology that\'s changing the way we", "start": 14.4, "duration": 3.879}, {"text": "leave work and think but there\'s a", "start": 16.279, "duration": 4.601}, {"text": "broader category that I call algorithms", "start": 18.279, "duration": 4.881}, {"text": "here so AI is an algorithm or you know", "start": 20.88, "duration": 5.159}, {"text": "AI systems are running or following uh", "start": 23.16, "duration": 4.6}, {"text": "uh algorithms but there are other", "start": 26