**Goal** : Convert YouTube videos into concise audio summaries

**Plan** : 

- Task 1 : Retrieve Video Metadata: Extract key metadata from a YouTube video
- Task 2 : Retrieve Transcript: Download transcript from YouTube video
- Task 3 : Summarise Transcript: Generate concise summary from downloaded transcript
- Task 4 : Merge Metadata: Integrate key metadata into the summary before converting to audio
- Task 5 : Convert Summary to Audio: Use text-to-speech to create audio summary


## Feature Wishlist

- Track [response token usage](https://cookbook.openai.com/examples/how_to_stream_completions#4-how-to-get-token-usage-data-for-streamed-chat-completion-response)
- Support audio longer than 4 minutes
- Lecture notes as an output format
- Use `Literal` for the toolbox (tool titles)?

In [16]:
# imports

import enum
import instructor
import os
import re
from abc import ABC, abstractmethod
from datetime import datetime
from dotenv import load_dotenv
from googleapiclient.discovery import build
from openai import OpenAI
from pathlib import Path
from pprint import pprint as pp
from pydantic import BaseModel, Field, StringConstraints, conlist, field_validator
import tiktoken
import time
from typing import Any, ClassVar, Dict, Iterable, List, Optional, Type, Union
from typing_extensions import Annotated, Literal
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import JSONFormatter, TextFormatter

In [2]:
# Load API keys from a local .env file into the process environment.
# NOTE(review): the path is machine-specific (absolute Windows path).

dotenv_path = Path(r"C:\Storage\python_projects\ashvin\.env")
load_dotenv(dotenv_path=dotenv_path)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")

# Main constants

GPT_MODEL = "gpt-4o" # chat model used by wrapper() for every completion

# Instantiate clients. Both OpenAI() calls are made without an explicit key;
# presumably the SDK picks up OPENAI_API_KEY from the environment populated
# above, so load_dotenv must run first — confirm.
# `client` is the instructor-patched client (structured outputs via tool calls);
# `audio_client` is a plain OpenAI client used by the text-to-speech tool.
client = instructor.from_openai(OpenAI(), mode=instructor.Mode.TOOLS)
audio_client = OpenAI()

In [3]:
# wrapper

def wrapper(
    system_prompt: str | None = None,
    user_prompt: Union[str, list] | None = None,
    response_model: Type[BaseModel] | None = None,
    max_retries: int = 3,
    additional_messages: Union[str, List[str]] | None = None,
):
    """
    Wrapper function to generate LLM completion.

    Messages are sent in this order: the system prompt (if any), every
    additional message, then every user prompt. Additional messages and user
    prompts are all sent with the "user" role.

    Parameters:
        system_prompt (str | None): Optional system instruction.
        user_prompt (str | list | None): One user message or a list of them.
        response_model (Type[BaseModel] | None): Pydantic model *class* the
            completion should be parsed into. When None, the plain text of
            the first choice is returned instead.
        max_retries (int): Retry budget forwarded to the client.
        additional_messages (str | List[str] | None): Extra user messages
            placed before `user_prompt`.

    Returns:
        The parsed `response_model` instance when a model is given, otherwise
        the stripped text of the first choice ("" if the model returned no
        text content).
    """

    def _as_user_messages(content):
        """Normalise a string / list of strings / None into user-role messages."""
        if content is None:
            return []
        items = content if isinstance(content, list) else [content]
        return [{"role": "user", "content": item} for item in items]

    messages = []
    if system_prompt is not None:
        messages.append({"role": "system", "content": system_prompt})

    # Additional messages intentionally precede the main user prompt.
    messages.extend(_as_user_messages(additional_messages))
    messages.extend(_as_user_messages(user_prompt))

    completion = client.chat.completions.create(
        model=GPT_MODEL,
        response_model=response_model,
        max_retries=max_retries,
        messages=messages,
    )

    if response_model is None:
        # `content` can be None (e.g. no text in the reply); avoid calling
        # .strip() on None and hand back an empty string instead.
        content = completion.choices[0].message.content
        return content.strip() if content is not None else ""

    return completion

In [4]:
# Youtube details tool

class YoutubeDetails(BaseModel):
    """
    This tool extracts the YouTube video ID from a given URL, retrieves key metadata,
    and formats it as a dictionary.
    
    The metadata extracted includes:
        - Title: The title of the video.
        - Description: The description of the video.
        - Published At: The date and time when the video was published.
        - Channel Title: The title of the channel that uploaded the video.
        - Views: The number of views the video has received.
        - Likes: The number of likes the video has received.
        - Dislikes: The number of dislikes the video has received.
        - Comments: The number of comments on the video.
        - Duration: The duration of the video in ISO 8601 format.
        - Tags: A list of tags associated with the video.
    """

    # NOTE: the class docstring above is left untouched on purpose — pydantic
    # publishes it as the schema description that the router shows to the LLM.

    def run(self, url: str) -> Optional[Dict[str, Any]]:
        """
        Extract the YouTube video ID from a given URL, retrieve key metadata,
        and format it as a dictionary.

        Accepted URL shapes: `youtube.com/watch?v=ID` (with `v` anywhere in the
        query string), `youtube.com/shorts/ID`, `youtube.com/live/ID`,
        `youtube.com/embed/ID` and `youtu.be/ID`.

        Parameters:
            url (str): The YouTube URL from which to extract the video ID.

        Returns:
            Optional[Dict[str, Any]]: The video metadata if the video ID is valid
                                      and the metadata is available, otherwise None.
                                      All values are strings except "Tags", which
                                      is a list of strings.
        """
        # Regular expression to find the 11-character video ID in a YouTube URL.
        # The lazy optional group lets `v=` follow other query parameters while
        # still preferring a `v=` that comes first.
        pattern = (
            r'(?:https?://)?(?:www\.)?'
            r'(?:youtube\.com/(?:watch\?(?:[^#\s]*?&)??v=|shorts/|live/|embed/)'
            r'|youtu\.be/)'
            r'([a-zA-Z0-9_-]{11})'
        )
        match = re.search(pattern, url)
        if not match:
            print("No valid YouTube video ID found in the provided URL.")
            return None

        video_id = match.group(1)

        if not YOUTUBE_API_KEY:
            print("API key not found in environment variables.")
            return None

        try:
            youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)
            request = youtube.videos().list(part='snippet,contentDetails,statistics', id=video_id)
            response = request.execute()

            items = response.get('items')
            if not items:
                print("No video found with the provided video ID.")
                return None

            video_details = items[0]
            snippet = video_details['snippet']
            statistics = video_details['statistics']

            return {
                "Title": snippet['title'],
                "Description": snippet['description'],
                "Published At": snippet['publishedAt'],
                "Channel Title": snippet['channelTitle'],
                "Views": statistics.get('viewCount', 'N/A'),
                "Likes": statistics.get('likeCount', 'N/A'),
                # NOTE(review): dislike counts are no longer public in the Data
                # API, so this is normally 'N/A'; key kept for compatibility.
                "Dislikes": statistics.get('dislikeCount', 'N/A'),
                "Comments": statistics.get('commentCount', 'N/A'),
                "Duration": video_details['contentDetails']['duration'],
                "Tags": snippet.get('tags', []),
            }
        except Exception as e:
            # Tool boundary: report the failure and signal it with None so the
            # pipeline can continue / short-circuit instead of crashing.
            print(f"Error retrieving video metadata: {e}")
            return None

In [6]:
# transcript tool

class Transcript(BaseModel):
    """
    This tool extracts the YouTube video ID from a given URL, retrieves the transcript, and 
    formats it as a JSON string.
    """

    # NOTE: the class docstring above is left untouched on purpose — pydantic
    # publishes it as the schema description that the router shows to the LLM.

    def run(self, url: str) -> Optional[str]:
        """
        Extract the YouTube video ID from a given URL, retrieve the transcript,
        and format it as a JSON string.

        Accepts the same URL shapes as the metadata tool: `youtube.com/watch?v=ID`
        (with `v` anywhere in the query string), `youtube.com/shorts/ID`,
        `youtube.com/live/ID`, `youtube.com/embed/ID` and `youtu.be/ID`.

        Parameters:
            url (str): The YouTube URL from which to extract the video ID.

        Returns:
            Optional[str]: The JSON formatted transcript if the video ID is valid and the
                           transcript is available, otherwise None.
        """
        # Regular expression to find the 11-character video ID in a YouTube URL.
        # Previously only `youtube.com/watch?v=` was recognised, so short
        # `youtu.be/` links accepted by YoutubeDetails were rejected here.
        pattern = (
            r'(?:https?://)?(?:www\.)?'
            r'(?:youtube\.com/(?:watch\?(?:[^#\s]*?&)??v=|shorts/|live/|embed/)'
            r'|youtu\.be/)'
            r'([a-zA-Z0-9_-]{11})'
        )
        match = re.search(pattern, url)
        if not match:
            print("No valid YouTube video ID found in the provided URL.")
            return None

        video_id = match.group(1)

        try:
            # Retrieve the transcript
            transcript = YouTubeTranscriptApi.get_transcript(video_id)

            # Format the transcript as JSON
            return JSONFormatter().format_transcript(transcript)
        except Exception as e:
            # Tool boundary: report the failure and signal it with None.
            print(f"Error retrieving or formatting transcript: {e}")
            return None

    def count(self, url: Optional[str] = None, transcript: Optional[str] = None) -> Optional[int]:
        """
        Count the number of tokens in the transcript of the YouTube video.

        The tokenizer is the one tiktoken maps to GPT_MODEL; if the installed
        tiktoken does not know that model, "cl100k_base" is used instead.

        Parameters:
            url (Optional[str]): The YouTube URL from which to extract the video ID.
            transcript (Optional[str]): The pre-fetched transcript to count tokens for.

        Returns:
            Optional[int]: The number of tokens in the transcript if available, otherwise None.
        """
        if transcript is None:
            if url is None:
                print("Either url or transcript must be provided.")
                return None
            transcript = self.run(url)
            if transcript is None:
                return None

        # model list here : https://github.com/openai/tiktoken/blob/main/tiktoken/model.py
        # maybe later separate count into its own class under Utils or something
        try:
            encoding = tiktoken.encoding_for_model(GPT_MODEL)
        except KeyError:
            # Older tiktoken releases cannot map newer model names.
            encoding = tiktoken.get_encoding("cl100k_base")

        return len(encoding.encode(transcript))

In [7]:
# summary tool
class Summary(BaseModel):
    """
    This tool summarises a given input text
    """

    # The None default lets the router build this tool with no arguments
    # (`tool_class()`); the field is only populated on LLM responses.
    # NOTE(review): this description asks for "under 500 words" while the
    # system prompt in run() asks for "At least 500 words" — the two
    # instructions contradict each other; decide which one is intended.
    summary: str = Field(None, description="A clear, concise summary of the text in under 500 words.")

    def run(self, text: str) -> Optional[str]:
        """
        Summarize the input text.

        Parameters:
            text (str): The input text to summarize.

        Returns:
            Optional[str]: The summary text produced by the LLM, or None when
                           no input text was provided (for example because the
                           transcript step failed).
        """
        # Without this guard a missing transcript would still trigger an LLM
        # call with only the system prompt, yielding a made-up summary.
        if not text:
            print("No text provided to summarise.")
            return None

        system_prompt: str = """
        You are an expert in AI and science communication, able to make technical content detailed, interesting and accessible 
        - You write in the spirit of Richard Feynman, making complex concepts easy to understand without sacrificing quality. 
        - Create a long, detailed and comprehensive summary of the provided text.
        - At least 500 words in length.
        - Provide detail on all central ideas; also include peripheral items that are interesting
        - Rely strictly on the provided text, without including external information.
        - Format the summary in markdown with bullet points, signposting and any other tools for accessible reading.
        - Ensure text is connected and clear with each sentence flowing seamlessly into the next.
        - Have a tone of voice that is simple, clear, accessible and direct.
        """

        # Alternative prompt line kept for reference:
        # - Write a summary in the style of Raymond Carver but preserve all detail: simple, clear and direct.

        completion = wrapper(
            system_prompt=system_prompt,
            user_prompt=text,
            response_model=Summary,
            max_retries=3,
        )

        # Callers (e.g. the merge step) expect the plain summary string, not
        # the Summary instance.
        return completion.summary

In [8]:
# merge tool

class MergeMetadata(BaseModel):
    """
    This tool merges video metadata with a summary, creating a combined summary
    that includes a preamble with key information about the video in a concise format.
    """

    # NOTE: the class docstring above is left untouched on purpose — pydantic
    # publishes it as the schema description that the router shows to the LLM.

    def run(self, metadata: Dict[str, Any], summary: str) -> Optional[str]:
        """
        Merge video metadata with the summary, adding a preamble with key information.

        Parameters:
            metadata (Dict[str, Any]): The metadata of the YouTube video. The keys
                                       read are 'Title', 'Published At',
                                       'Channel Title' and 'Duration'.
            summary (str): The summary of the video's transcript.

        Returns:
            Optional[str]: The merged summary with metadata preamble if both summary and
                           metadata are valid, otherwise None.
        """
        if not metadata or not summary:
            print("Both metadata and summary must be provided.")
            return None

        try:
            title = metadata.get('Title', 'N/A')
            published_at = metadata.get('Published At', 'N/A')
            channel_title = metadata.get('Channel Title', 'N/A')
            duration = metadata.get('Duration', 'N/A')

            # Convert duration from ISO 8601 format to a more readable format
            duration_minutes = self._convert_duration_to_minutes(duration)
            # Convert the published_at date to a human-readable format
            readable_published_at = self._convert_date_to_human_readable(published_at)

            preamble = (
                f"This video is titled '{title}', and was published on {readable_published_at} "
                f"by {channel_title} channel. It is {duration_minutes} minutes long."
            )

            return f"{preamble}\n\nSummary:\n{summary}"
        except Exception as e:
            # Tool boundary: report the failure and signal it with None.
            print(f"Error merging metadata with summary: {e}")
            return None

    def _convert_duration_to_minutes(self, duration: str) -> str:
        """
        Convert ISO 8601 duration format to total minutes.

        Handles optional week and day components (e.g. 'P1DT2H3M4S') in
        addition to hours, minutes and seconds.

        Parameters:
            duration (str): The ISO 8601 duration string.

        Returns:
            str: The duration in total minutes with two decimals, or 'N/A'
                 when the value is not a recognisable duration.
        """
        if not isinstance(duration, str):
            return 'N/A'

        match = re.fullmatch(
            r'P(?:(\d+)W)?(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?',
            duration.strip(),
        )
        if not match:
            return 'N/A'

        # Absent components come back as None and count as zero.
        weeks, days, hours, minutes, seconds = (int(part) if part else 0 for part in match.groups())

        total_minutes = (weeks * 7 + days) * 24 * 60 + hours * 60 + minutes + seconds / 60
        return f"{total_minutes:.2f}"

    def _convert_date_to_human_readable(self, date_str: str) -> str:
        """
        Convert ISO 8601 date format to a more human-readable format.

        Accepts UTC timestamps with or without fractional seconds
        ('2024-05-01T12:00:00Z' or '2024-05-01T12:00:00.000Z').

        Parameters:
            date_str (str): The ISO 8601 date string.

        Returns:
            str: The date in a human-readable format ('Month DD, YYYY'), or the
                 input unchanged when it cannot be parsed.
        """
        first_error = None
        for fmt in ("%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S.%fZ"):
            try:
                return datetime.strptime(date_str, fmt).strftime("%B %d, %Y")
            except ValueError as e:
                if first_error is None:
                    first_error = e

        print(f"Error converting date: {first_error}")
        return date_str


In [9]:
# text to speech tool

class TextToSpeech(BaseModel):
    """
    This tool converts input text into speech using the OpenAI text-to-speech API and saves it as an MP3 file.
    """

    def run(
        self,
        text: str,
        model: str = "tts-1",
        voice: str = "alloy",
        speed: float = 1.0,
        response_format: str = "mp3",
        filename: str = "speech",
    ) -> Optional[str]:
        """
        Synthesise `text` as speech and write the audio into the current working directory.

        Parameters:
            text (str): Text to be spoken.
            model (str): TTS model name. Defaults to "tts-1".
            voice (str): Voice preset. Defaults to "alloy".
            speed (float): Playback speed factor. Defaults to 1.0.
            response_format (str): Audio format; also used as the file extension. Defaults to "mp3".
            filename (str): Output file name without extension. Defaults to "speech".

        Returns:
            Optional[str]: Path of the written audio file, or None if generation or saving failed.
        """
        try:
            # Request the audio from the speech endpoint
            audio = audio_client.audio.speech.create(
                input=text,
                model=model,
                voice=voice,
                speed=speed,
                response_format=response_format,
            )

            # Target file lives next to wherever the notebook is running
            output_path = Path.cwd() / f"{filename}.{response_format}"
            audio.stream_to_file(output_path)

            print(f"Saved speech file at: {output_path}")
            return str(output_path)
        except Exception as e:
            print(f"Error generating or saving speech: {e}")
            return None

In [10]:
class Fallback(BaseModel):
    """
    A fallback tool to be selected when the other tools are not appropriate.

    This class serves as a placeholder or default option in cases where no other 
    specific tool is suitable for the given task. It can be used to provide a 
    default response or action.
    """

    def run(self, *args: Any, **kwargs: Any) -> None:
        """
        Default action when no specific tool applies.

        The router calls `run` on whichever tool it selects, so the fallback
        must expose it too; without it, routing to the fallback raised
        AttributeError. Any arguments are accepted and ignored.

        Returns:
            None: The fallback produces no result.
        """
        print("No suitable tool is available for this request.")
        return None


In [11]:
# tools

class Tools(BaseModel):
    """
    This class represents available tools for the user's context.
    
    Attributes:
        tools: (Union[YoutubeDetails, Transcript, Summary, MergeMetadata, TextToSpeech, Fallback]): Available tool classes
    """
    tools: Union[YoutubeDetails, Transcript, Summary, MergeMetadata, TextToSpeech, Fallback]

    # Lookup from a tool's class name (the title the LLM is asked to return)
    # to the class itself; keys are exactly each class's __name__.
    tool_class_mapping: ClassVar[Dict[str, Type[BaseModel]]] = {
        tool_cls.__name__: tool_cls
        for tool_cls in (
            YoutubeDetails,
            Transcript,
            Summary,
            MergeMetadata,
            TextToSpeech,
            Fallback,
        )
    }

# tool title

class ToolTitle(BaseModel):
    """
    This class represents the title of the most relevant tool selected for the user's context.
    
    Attributes:
        tool_title (str): The title of the single most relevant tool selected for the user's context.
                          This attribute provides a clear and concise identifier for the selected tool,
                          which is determined based on the user's prompt or context.
    """

    # Required field: the router looks this value up in Tools.tool_class_mapping.
    tool_title: str = Field(
        default=...,
        description="The title of the single most relevant tool selected for the user's context",
    )


In [12]:
# router 

# bug note 1 : maybe using Literal for the string enforces better checking for tool_title?

class Router(BaseModel):
    """
    Router tool for selecting the appropriate tool based on user prompt.
    """

    def select(self, user_prompt: str) -> ToolTitle:
        """
        Select the appropriate tool based on the user prompt.

        The JSON schema of `Tools` is embedded in the system prompt so the LLM
        can choose among the available tool definitions.

        Parameters:
            user_prompt (str): The user prompt to guide tool selection.

        Returns:
            ToolTitle: The title of the single tool judged most relevant to the
                       user prompt.
        """
        tools = Tools.model_json_schema()
        system_prompt: str = f"You are an intelligent tool selector. Select and return the single right tool for the user from this list : {tools}"

        completion = wrapper(
            system_prompt=system_prompt,
            user_prompt=f"Select and return the single relevant tool title for : {user_prompt}",
            response_model=ToolTitle,
            max_retries=3
        )
        return completion

    def run(self, user_prompt: str, input_data: Any = None) -> Any:
        """
        Run the appropriate tool based on the user prompt and input data.

        Parameters:
            user_prompt (str): The user prompt to guide tool selection.
            input_data (Any): The input data to pass to the tool's run method.
                              A list or tuple is unpacked into positional
                              arguments; anything else is passed as a single
                              argument.

        Returns:
            Any: The result of the tool's run method, or None when the selected
                 tool cannot be run.
        """
        # Ask the LLM which tool fits, then resolve the title to a class.
        # Titles that are not in the mapping fall back to the Fallback tool.
        selected_tool = self.select(user_prompt)
        tool_class = Tools.tool_class_mapping.get(selected_tool.tool_title, Fallback)
        tool_instance = tool_class()

        # Not every tool class is guaranteed to define `run` (the fallback is
        # a bare placeholder); report that instead of raising AttributeError.
        tool_run = getattr(tool_instance, "run", None)
        if not callable(tool_run):
            print(f"Tool '{tool_class.__name__}' cannot be run; no result produced.")
            return None

        # Handle multiple input data
        if isinstance(input_data, (list, tuple)):
            return tool_run(*input_data)  # Unpacking the input data
        return tool_run(input_data)


In [13]:
# Task prompts handed to the router, one per pipeline step.
task_1 = "Task 1 : Retrieve Video Metadata: Extract key metadata from YouTube videos"
task_2 = "Task 2 : Retrieve Transcripts: Download transcripts from YouTube video podcasts"
task_3 = "Task 3 : Summarise Transcripts: Generate concise summaries from downloaded transcripts"
task_4 = "Task 4 : Merge Metadata: Integrate key metadata into the summary before converting to audio"
task_5 = "Task 5 : Convert Summaries to Audio: Use text-to-speech to create audio summaries"


# Fallback task for unit testing: deliberately unrelated to every tool.
# Its label previously read "Task 5", duplicating task_5's number.
task_6 = "Task 6 : Do a matrix multiplication"

In [14]:
# unit test evals : tool selection
# Only exercises Router.select (one LLM call); no tool is executed.
router = Router()
response = router.select(task_4)
# Bare expression so the notebook displays the returned ToolTitle.
response

ToolTitle(tool_title='MergeMetadata')

In [25]:
# End-to-end run: metadata -> transcript -> summary -> merged summary -> audio.
# Every router.run call first asks the LLM to pick a tool, then runs that tool.
# The tools return None on failure, so a None here propagates to later steps.
url = "https://www.youtube.com/watch?v=MXPYbjjyHXc"
router = Router()
response_1 = router.run(task_1, url)
time.sleep(2)  # pause between steps — presumably to stay under API rate limits; confirm
response_2 = router.run(task_2, url)
time.sleep(2)
response_3 = router.run(task_3, response_2)
time.sleep(2)
# A list is unpacked by Router.run into positional arguments (metadata, summary).
response_4 = router.run(task_4, input_data=[response_1, response_3])
time.sleep(2)
response_5 = router.run(task_5, response_4)

Saved speech file at: c:\Storage\python_projects\ashvin\sandbox\pydantic\speech.mp3


In [26]:
# Pretty-print the result of the summarisation step.
pp(response_3)

('In the video, Lori, the VP of Developer Relations at Llama Index, introduces '
 'and discusses the concept of context-augmented knowledge assistance, '
 'focusing on the limitations of Retrieval-Augmented Generation (RAG) and '
 'advocating for the use of more sophisticated agents in knowledge retrieval '
 'tasks. She begins by explaining what Llama Index is—a framework in Python '
 'and TypeScript for building LLM-enabled applications over data—and mentions '
 'their monetization strategy through the Llama Cloud, which offers a hosted, '
 'scalable data retrieval and querying system currently in beta. Additionally, '
 'she highlights Llama Parse, a cloud service for document parsing essential '
 'for knowledge retrieval, and mentions that it is free for up to 1,000 pages '
 'a day with a pay-as-you-go model afterward. Lori then transitions to '
 'discussing why naive RAG pipelines, while effective for many tasks, have '
 'limitations, especially in summarization, comparison, and mul

In [None]:
# Inspect the schema that Router.select embeds in its system prompt.
Tools.model_json_schema()