**Goal**: Convert YouTube video podcasts into concise audio summaries.

**Plan** : 

- Task 1 : Retrieve Transcripts: Download transcripts from YouTube video podcasts
- Task 2 : Summarise Transcripts: Generate concise summaries from downloaded transcripts
- Task 3 : Convert Summaries to Audio: Use text-to-speech to create audio summaries 

In [118]:
# imports

import enum
import instructor
import os
import re
from abc import ABC, abstractmethod
from dotenv import load_dotenv
from openai import OpenAI
from pathlib import Path
from pprint import pprint as pp
from pydantic import BaseModel, Field, StringConstraints, conlist, field_validator
from typing import Any, ClassVar, Iterable, List, Optional, Type, Union
from typing_extensions import Annotated, Literal
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import JSONFormatter, TextFormatter

In [4]:
# Load environment variables (including the OpenAI API key) from a local .env file.
# NOTE(review): the path below is machine-specific; adjust it when running elsewhere.

dotenv_path = Path(r"C:\Storage\python_projects\ashvin\.env")
load_dotenv(dotenv_path=dotenv_path)
# Keep the key in a module constant. It is not passed to OpenAI() below;
# presumably the SDK reads it from the environment itself — TODO confirm.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Main constants

GPT_MODEL = "gpt-4o" # chat model name passed to every completion request in wrapper()

# Instantiate the clients:
# - client: OpenAI client wrapped by instructor in TOOLS mode, used by wrapper()
# - audio_client: plain OpenAI client, used by the text-to-speech tool
client = instructor.from_openai(OpenAI(), mode=instructor.Mode.TOOLS)
audio_client = OpenAI()

In [5]:
# wrapper

def _user_messages(content: Union[str, list, None]) -> List[dict]:
    """Return `content` as a list of user-role chat messages (empty for None)."""
    if content is None:
        return []
    items = content if isinstance(content, list) else [content]
    return [{"role": "user", "content": item} for item in items]


def wrapper(
    system_prompt: str | None = None,
    user_prompt: Union[str, list] | None = None,
    response_model: Type[BaseModel] | None = None,
    max_retries: int = 3,
    additional_messages: Union[str, List[str]] | None = None
):
    """Generate an LLM chat completion, optionally parsed into a Pydantic model.

    Messages are assembled in this order: the system prompt (if any), then
    every entry of `additional_messages`, then every entry of `user_prompt`.
    Both of the latter may be a single value or a list; each entry becomes
    its own user-role message.

    Parameters:
        system_prompt: Optional system message placed first.
        user_prompt: One message or a list of messages sent with the user role.
        response_model: Pydantic model *class* the completion should be parsed
            into, or None to get plain text back.
        max_retries: Forwarded to the client's `create` call.
        additional_messages: Extra user-role message(s) inserted before
            `user_prompt`.

    Returns:
        Whatever the client returns when `response_model` is given; otherwise
        the stripped text of the first choice, or "" when that choice carries
        no text content.
    """
    messages: List[dict] = []

    if system_prompt is not None:
        messages.append({"role": "system", "content": system_prompt})

    # Additional context goes in before the main user prompt
    messages.extend(_user_messages(additional_messages))
    messages.extend(_user_messages(user_prompt))

    completion = client.chat.completions.create(
        model=GPT_MODEL,
        response_model=response_model,
        max_retries=max_retries,
        messages=messages
    )

    if response_model is not None:
        return completion

    # Without a response model the text is read from the first choice; its
    # content may be None, which previously crashed on .strip().
    content = completion.choices[0].message.content
    return content.strip() if content is not None else ""

In [6]:
# transcript tool

class Transcript(BaseModel):
    """
    This tool extracts the YouTube video ID from a given URL, retrieves the transcript, and 
    formats it as a JSON string.
    """
    # NOTE: the class docstring above is emitted as this tool's description in
    # the Select JSON schema, so it is kept short and user-facing.

    def run(self, url: str) -> Optional[str]:
        """
        Extract the YouTube video ID from a URL, fetch the transcript and
        return it as a JSON string.

        Recognised URL shapes (scheme and sub-domain are optional):
            youtube.com/watch?v=<id>        (``v`` may appear after other query parameters)
            youtube.com/embed/<id>, youtube.com/shorts/<id>, youtube.com/live/<id>
            youtu.be/<id>

        Parameters:
            url (str): The YouTube URL from which to extract the video ID.

        Returns:
            Optional[str]: The JSON formatted transcript, or None when the
                           input is not a string, no video ID can be found,
                           or retrieving/formatting the transcript fails.
        """
        # Guard against non-string input (e.g. Router.run's default of None),
        # which would otherwise raise TypeError inside re.search.
        if not isinstance(url, str):
            print("No valid YouTube video ID found in the provided URL.")
            return None

        # Video IDs are 11 characters from [A-Za-z0-9_-]. The watch form lets
        # other query parameters precede `v=`.
        pattern = (
            r'(?:youtube\.com/(?:watch\?(?:[^\s#]*?&)?v=|embed/|shorts/|live/)'
            r'|youtu\.be/)'
            r'([a-zA-Z0-9_-]{11})'
        )
        match = re.search(pattern, url)
        if not match:
            print("No valid YouTube video ID found in the provided URL.")
            return None

        video_id = match.group(1)

        try:
            # Retrieve the transcript
            transcript = YouTubeTranscriptApi.get_transcript(video_id)

            # Serialise the transcript entries to a JSON string
            return JSONFormatter().format_transcript(transcript)
        except Exception as e:
            # Best-effort tool: report the problem and signal failure with None
            print(f"Error retrieving or formatting transcript: {e}")
            return None

In [12]:
# summary tool
class Summary(BaseModel):
    """
    This tool summarises a given input text
    """

    # Holds the generated summary text; defaults to None so the class can be
    # instantiated without arguments.
    summary: str = Field(default=None, description="A clear, concise summary of the text in under 50 words.")

    def run(self, text: str) -> 'Summary':
        """
        Ask the LLM for a summary of `text`.

        Parameters:
            text (str): The text to be summarised.

        Returns:
            Summary: The result of `wrapper` called with `Summary` as the
                     response model.
        """
        # System-level instructions sent ahead of the text to summarise
        instructions = """
        You are an expert podcast summarizer, condensing information into digestible summaries with appropriate signposting.
        Provide a concise, clear, and understandable summary of the given text. 
        Include upfront a one sentence TL;DR
        """
        return wrapper(
            max_retries=3,
            response_model=Summary,
            user_prompt=text,
            system_prompt=instructions,
        )

In [8]:
# text to speech tool

class TextToSpeech(BaseModel):
    """
    This tool converts input text into speech using the OpenAI text-to-speech API and saves it as an MP3 file.
    """
    # NOTE: the class docstring above is emitted as this tool's description in
    # the Select JSON schema, so its wording is left untouched.

    def run(self, text: str, model: str = "tts-1", voice: str = "alloy", speed: float = 1.0, response_format: str = "mp3", filename: str = "speech") -> Optional[str]:
        """
        Convert input text into speech and save it as an audio file in the
        current working directory.

        Parameters:
            text (str): The text to convert into speech.
            model (str): The TTS model to use. Defaults to "tts-1".
            voice (str): The voice to use for the speech. Defaults to "alloy".
            speed (float): The speed of the speech. Defaults to 1.0.
            response_format (str): The audio format, also used as the file extension. Defaults to "mp3".
            filename (str): The name of the output file (without extension). Defaults to "speech".

        Returns:
            Optional[str]: The path to the saved audio file if successful, otherwise None.
        """
        try:
            # Output goes next to wherever the notebook/process is running
            speech_file_path = Path.cwd() / f"{filename}.{response_format}"

            # Use the streaming response wrapper: calling stream_to_file on the
            # plain (non-streaming) response is deprecated in the OpenAI SDK.
            with audio_client.audio.speech.with_streaming_response.create(
                model=model,
                voice=voice,
                input=text,
                speed=speed,
                response_format=response_format
            ) as response:
                response.stream_to_file(speech_file_path)
        except Exception as e:
            # Best-effort tool: report the problem and signal failure with None
            print(f"Error generating or saving speech: {e}")
            return None
        else:
            # Print the file path for easy access
            print(f"Saved speech file at: {speech_file_path}")
            return str(speech_file_path)

In [98]:
## Fallback class

class Fallback(BaseModel):
    # Catch-all tool for prompts that none of the other tools fit.
    # `message` doubles as the human-readable explanation returned by run().
    message: str = "A fallback tool to be selected when the other tools are not appropriate"

    def run(self, input_data: Any = None) -> str:
        """
        Return the fallback message.

        Gives Fallback the same `run(input)` entry point as the other tools,
        so a router that calls `tool.run(input_data)` on it does not fail
        with AttributeError.

        Parameters:
            input_data (Any): Accepted for interface parity with the other
                              tools; it is not used.

        Returns:
            str: The value of `message`.
        """
        return self.message

In [114]:
# selector

class Select(BaseModel):
    """
    This class represents the selection of the most relevant tool for the user's context.
    
    Attributes:
        tool_choice (Union[Transcript, Summary, TextToSpeech, Fallback]): A union of available tool classes from which only the single most relevant one is selected.
        tool_title (str): The title of the single most relevant tool. Must be one of the tool classes.
    """
    # NOTE(review): Transcript and TextToSpeech declare no fields, so the
    # union below cannot tell them apart from the payload alone.
    # tool_title is the field to rely on when identifying the chosen tool.
    tool_choice: Union[Transcript, Summary, TextToSpeech, Fallback]
    # Restricted to the known tool class names so that an unknown title fails
    # validation instead of passing through as an arbitrary string.
    tool_title: Literal["Transcript", "Summary", "TextToSpeech", "Fallback"] = Field(..., description="The title of the single most relevant tool selected for the user's context")


In [119]:
# router 

# bug note 1 : maybe typing tool_title as a Literal would enforce stricter checking of its value?
# bug note 2 : tool_choice persistently comes back as Transcript only, whatever the prompt
# option : discard instructor and do the tool choice ourselves in the router? [Seems like a good option tbh and reduces dependency]
# option : add a tools class holding a list of tools, then add a tool-choice class that selects one from that list

class Router(BaseModel):
    """
    Router tool for selecting the appropriate tool based on user prompt.
    """

    # Maps the tool title returned by the selector to the class to instantiate
    tool_class_mapping: ClassVar[dict[str, Type[BaseModel]]] = {
        "Transcript": Transcript,
        "Summary": Summary,
        "TextToSpeech": TextToSpeech,
        "Fallback": Fallback,
    }

    def select(self, user_prompt: str) -> Select:
        """
        Ask the LLM which tool fits the user prompt.

        Parameters:
            user_prompt (str): Description of the task to route.

        Returns:
            Select: The result of `wrapper` called with `Select` as the
                    response model.
        """
        return wrapper(
            system_prompt="You are an intelligent tool selector. Select and return the single right tool for the user.",
            user_prompt=f"Select and return the single relevant tool and tool title for : {user_prompt}",
            response_model=Select,
            max_retries=3,
        )

    def run(self, user_prompt: str, input_data: Any = None) -> Any:
        """
        Pick a tool for the user prompt and run it on the input data.

        Parameters:
            user_prompt (str): Description of the task to route.
            input_data (Any): Value handed to the chosen tool's run method.

        Returns:
            Any: Whatever the chosen tool's run method returns.
        """
        # Only the title from the selection is used to resolve the tool
        chosen_title = self.select(user_prompt).tool_title

        # Unknown titles fall back to the Fallback tool
        tool_cls = self.tool_class_mapping.get(chosen_title, Fallback)

        # Tools are created without arguments and receive the data through run()
        return tool_cls().run(input_data)

In [104]:
# Inspect the JSON schema that pydantic generates for the Select model
response =Select.model_json_schema()
response


{'$defs': {'Fallback': {'properties': {'message': {'default': 'A fallback tool to be selected when the other tools are not appropriate',
     'title': 'Message',
     'type': 'string'}},
   'title': 'Fallback',
   'type': 'object'},
  'Summary': {'description': 'This tool summarises a given input text',
   'properties': {'summary': {'default': None,
     'description': 'A clear, concise summary of the text in under 50 words.',
     'title': 'Summary',
     'type': 'string'}},
   'title': 'Summary',
   'type': 'object'},
  'TextToSpeech': {'description': 'This tool converts input text into speech using the OpenAI text-to-speech API and saves it as an MP3 file.',
   'properties': {},
   'title': 'TextToSpeech',
   'type': 'object'},
  'Transcript': {'description': 'This tool extracts the YouTube video ID from a given URL, retrieves the transcript, and \nformats it as a JSON string.',
   'properties': {},
   'title': 'Transcript',
   'type': 'object'}},
 'description': "This class represe

In [102]:
# Sample prompts for exercising the router. Tasks 1-3 mirror the plan at the
# top of the notebook; task 4 matches none of Transcript / Summary /
# TextToSpeech (presumably meant to exercise Fallback — confirm).
task_1 = "Task 1 : Retrieve Transcripts: Download transcripts from YouTube video podcasts"
task_2 = "Task 2 : Summarise Transcripts: Generate concise summaries from downloaded transcripts"
task_3 = "Task 3 : Convert Summaries to Audio: Use text-to-speech to create audio summaries"
task_4 = "Task 4 : Add numbers"

In [116]:
# Ask the router which tool fits task 3 and display the resulting Select object
router = Router()
response = router.select(task_3)
response

Select(tool_choice=Transcript(), tool_title='TextToSpeech')

In [None]:
# Show the title of the tool chosen in the previous selection
response.tool_title

In [121]:
# Step 1: route task 1, passing a YouTube watch URL as the tool input
router = Router()
response_1 = router.run(task_1, "https://www.youtube.com/watch?v=krixaEhLnlA")

In [122]:
# Step 2: route task 2, passing the step-1 result as the tool input
response_2 = router.run(task_2, response_1)

In [123]:
# Display the summary text produced in step 2
response_2.summary

"Elon Musk raised $6 billion for his AI startup XAI and predicted AGI would surpass humans next year, sparking debate with Meta's Chief AI scientist Yann LeCun. The podcast also discusses the financial and operational struggles of Stability AI, the shortcomings of Google's AI, privacy concerns with Meta's data policies, and skepticism around new AI products like the Humane pin and Rabbit R1. Finally, it covers OpenAI's continued efforts with GPT-5 and questions about the true motives behind AI safety discussions."

In [124]:
# Step 3: route task 3, passing the summary text as the tool input
response_3 = router.run(task_3, response_2.summary)

Saved speech file at: c:\Storage\python_projects\ashvin\sandbox\pydantic\speech.mp3
