**Goal** : Convert youtube video podcasts into concise audio summaries

**Plan** : 

- Task 1 : Retrieve Transcripts: Download transcripts from YouTube video podcasts
- Task 2 : Summarise Transcripts: Generate concise summaries from downloaded transcripts
- Task 3 : Convert Summaries to Audio: Use text-to-speech to create audio summaries 

In [2]:
# imports

import enum
import instructor
import os
import re
from abc import ABC, abstractmethod
from dotenv import load_dotenv
from openai import OpenAI
from pathlib import Path
from pprint import pprint as pp
from pydantic import BaseModel, Field, StringConstraints, conlist, field_validator
from typing import Any, Iterable, List, Optional, Union
from typing_extensions import Annotated, Literal
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import JSONFormatter, TextFormatter

In [3]:
# load API key

dotenv_path = Path(r"C:\Storage\python_projects\ashvin\.env")
load_dotenv(dotenv_path=dotenv_path)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# main constants

GPT_MODEL = "gpt-4o" # points to latest GPT model

#instantiate client
client = instructor.from_openai(OpenAI(), mode=instructor.Mode.TOOLS)
audio_client = OpenAI()

In [4]:
# wrapper

def wrapper(
    system_prompt: str | None = None, 
    user_prompt: Union[str, list] | None = None, 
    response_model: BaseModel | None = None, 
    max_retries: int = 3, 
    additional_messages: Union[str, List[str]] | None = None
):
    """Wrapper function to generate LLM completion"""
    messages = []

    # Add system prompt if provided
    if system_prompt is not None:
        messages.append({"role": "system", "content": system_prompt})

    # Add additional messages before user_prompt
    if additional_messages is not None:
        if isinstance(additional_messages, list):
            for message in additional_messages:
                messages.append({"role": "user", "content": message})
        else:
            messages.append({"role": "user", "content": additional_messages})

    # Add user context if provided
    if user_prompt is not None:
        if isinstance(user_prompt, list):
            for context in user_prompt:
                messages.append({"role": "user", "content": context})
        else:
            messages.append({"role": "user", "content": user_prompt})

    # Generate the completion
    completion = client.chat.completions.create(
        model=GPT_MODEL,
        response_model=response_model,
        max_retries=max_retries,
        messages=messages
    )
    
    # Check if response_model is None and return appropriate result
    if response_model is None:
        return completion.choices[0].message.content.strip()
    else:
        return completion

In [5]:
# base tool

class Tool(BaseModel):
    """
    A tool model that uses Pydantic's data validation features.

    Attributes:
        name (str): The name of the tool.
        description (str): A brief description of what the tool does.
    """

    name: str
    description: str

    def run(self, input_data: Any) -> Any:
        raise NotImplementedError("Each tool must implement its own run method.")

In [6]:
# transcript tool

class Transcript(Tool):
    """
    A tool for extracting and formatting YouTube video transcripts.

    This class inherits from Tool and overrides the run method to extract
    the YouTube video ID from a given URL, retrieve the transcript, and 
    format it as a JSON string.
    """

    name: str = "YouTube Transcript Extractor"
    description: str = "Extracts the YouTube video ID from a URL, retrieves the transcript, and formats it as JSON."

    def run(self, url: str) -> Optional[str]:
        """
        Extract the YouTube video ID from a given URL, retrieve the transcript,
        and format it as a JSON string.

        Parameters:
            url (str): The YouTube URL from which to extract the video ID.

        Returns:
            Optional[str]: The JSON formatted transcript if the video ID is valid and the
                           transcript is available, otherwise None.
        """
        # Regular expression to find the video ID in a YouTube URL
        pattern = r'(?:https?://)?(?:www\.)?youtube\.com/watch\?v=([a-zA-Z0-9_-]{11})'
        match = re.search(pattern, url)
        if not match:
            print("No valid YouTube video ID found in the provided URL.")
            return None

        video_id = match.group(1)

        try:
            # Retrieve the transcript
            transcript = YouTubeTranscriptApi.get_transcript(video_id)

            # Format the transcript as JSON
            formatter = JSONFormatter()
            json_formatted_transcript = formatter.format_transcript(transcript)

            return json_formatted_transcript
        except Exception as e:
            print(f"Error retrieving or formatting transcript: {e}")
            return None

In [7]:
# summary tool
class Summary(Tool):
    """
    A tool for summarizing a given text.

    This class inherits from Tool and provides functionality to
    summarize input text.
    """

    # Tool properties
    name: str = "Text Summarizer"
    description: str = "Summarizes the input text into a concise version."
    system_prompt: str = """
    You are an expert podcast summarizer, condensing information into digestible summaries with appropriate signposting.
    Provide a concise, clear, and understandable summary of the given text. 
    Include upfront a one sentence TL;DR
    """

    # Output property
    summary: str = Field(None, description="A clear, concise summary of the text in under 50 words")
    
    def run(self, text: str) -> 'Summary':
        """
        Summarize the input text.

        Parameters:
            text (str): The input text to summarize.

        Returns:
            Summary: An instance of the Summary class with the summarized text.
        """
        completion = wrapper(
            system_prompt=self.system_prompt,
            user_prompt=text,
            response_model=Summary,  
            max_retries=3
        )

        return completion

In [8]:
# text to speech tool

class TextToSpeech(Tool):
    """
    A tool for converting text to speech using the OpenAI text-to-speech API.

    This class inherits from Tool and overrides the run method to convert
    input text into speech and save it as an MP3 file.
    """

    name: str = "Text to Speech Converter"
    description: str = "Converts text into speech and saves it as an MP3 file."

    def run(self, text: str, model: str = "tts-1", voice: str = "alloy", speed: float = 1.0, response_format: str = "mp3", filename: str = "speech") -> Optional[str]:
        """
        Convert input text into speech and save it as an MP3 file.

        Parameters:
            text (str): The text to convert into speech.
            model (str): The TTS model to use. Defaults to "tts-1".
            voice (str): The voice to use for the speech. Defaults to "alloy".
            speed (float): The speed of the speech. Defaults to 1.0.
            response_format (str): The format of the output audio file. Defaults to "mp3".
            filename (str): The name of the output file (without extension). Defaults to "speech".

        Returns:
            Optional[str]: The path to the saved MP3 file if successful, otherwise None.
        """
        try:
            # Generate the speech
            response = audio_client.audio.speech.create(
                model=model,
                voice=voice,
                input=text,
                speed=speed,
                response_format=response_format
            )

            # Define the path to save the audio file
            speech_file_path = Path(os.getcwd()) / f"{filename}.{response_format}"

            # Save the audio content to the file
            response.stream_to_file(speech_file_path)

            # Print the file path for easy access
            print(f"Saved speech file at: {speech_file_path}")

            return str(speech_file_path)
        except Exception as e:
            print(f"Error generating or saving speech: {e}")
            return None

In [None]:
transcript_tool = Transcript()
summary_tool = Summary()
text_to_speech_tool = TextToSpeech()

url = "https://www.youtube.com/watch?v=FEuYmGTbaYI"
transcript = transcript_tool.run(url=url)
summary = summary_tool.run(transcript)
text_to_speech = text_to_speech_tool.run(summary.summary)

In [10]:
# router

# class SelectTool(Tool):
#     """
#     Tool for selecting a single tool from a group of tools that is most relevant to a user provided need.

#     Attributes:
#         name (str): The name of the tool.
#         description (str): A brief description of what the tool does.
#         tools (Union[Transcript, Summary, TextToSpeech]): Group of tools available to select from.
#         selected_tool_class (Literal["Transcript", "Summary", "TextToSpeech"]): The class name of the selected tool.
#     """
#     name: str = "Select Tool"
#     description: str = "Select one tool from a group based on a user provided context."
#     # tools: Union[Transcript, Summary, TextToSpeech] = Field(None, description="Group of tools available to select from")
#     # selected_tool_class: Literal["Transcript", "Summary", "TextToSpeech"] = Field(None, description="The class name of the selected tool")
#     selected_tool: Union[Transcript, Summary, TextToSpeech]

class Selector(Tool):
    tools: Union[Transcript, Summary, TextToSpeech] = Field(None, description="select the most appropriate tool from this list")

class Router(Tool):
    """
    Router tool for selecting the appropriate tool based on user prompt.

    Attributes:
        name (str): The name of the tool.
        description (str): A brief description of what the tool does.
        system_prompt (str): The system prompt used by the tool.
    """
    name: str = "Router Tool"
    description: str = "Selects and returns the appropriate tool based on user prompt."
    system_prompt: str = "You are an intelligent router. Select and return the single right tool for the user."

    def select(self, user_prompt: str) -> Selector:
        """
        Select the appropriate tool based on the user prompt.

        Parameters:
            user_prompt (str): The user prompt to guide tool selection.

        Returns:
            Selector: The single selected tool as appropriate to the user prompt
        """
        completion = wrapper(
            system_prompt=self.system_prompt,
            user_prompt=f"Select and return the single relevant tool for {user_prompt}",
            response_model=Selector,
            max_retries=3
        )
        return completion

    def run(self):
        pass

In [11]:
task_1 = "Task 1 : Retrieve Transcripts: Download transcripts from YouTube video podcasts"
task_2 = "Task 2 : Summarise Transcripts: Generate concise summaries from downloaded transcripts"
task_3 = "Task 3 : Convert Summaries to Audio: Use text-to-speech to create audio summaries"

In [12]:
router = Router()
selected_tool = router.select(task_2)
# Print the result
# print(f"Type of selected_tool: {type(selected_tool)}")
# print(f"Selected Tool Name: {selected_tool.name}")
# print(f"Selected Tool Description: {selected_tool.description}")
# print(f"Selected Tool Class: {selected_tool.selected_tool_class}")

In [13]:
print(selected_tool)

name='Task 2: Summarise Transcripts' description='Generate concise summaries from downloaded transcripts' tools=Transcript(name='Text Summarizer', description='Summarizes the input text into a concise version.')


In [14]:
schema = Selector.model_json_schema()
pp(schema)

{'$defs': {'Summary': {'description': 'A tool for summarizing a given text.\n'
                                      '\n'
                                      'This class inherits from Tool and '
                                      'provides functionality to\n'
                                      'summarize input text.',
                       'properties': {'description': {'default': 'Summarizes '
                                                                 'the input '
                                                                 'text into a '
                                                                 'concise '
                                                                 'version.',
                                                      'title': 'Description',
                                                      'type': 'string'},
                                      'name': {'default': 'Text Summarizer',
                                               'title'