## Flow

In [1]:
# goal
# Free-text statement of what this notebook is meant to produce.
# It is not referenced by any code visible in this file.
goal = "Generate a list of scifi and fantasy books to read"

# tasks
# The two steps of the flow, in order; they correspond to the Transcript and
# Books tools defined further down. Also descriptive only.
task_1 = "Get a transcript from a youtube video"
task_2 = "Extract books from that video"

## Source

In [None]:
# urls



## Setup

In [44]:
# imports

import enum
import instructor
import json
import os
import re
import uuid
from abc import ABC, abstractmethod
from bs4 import BeautifulSoup
from datetime import datetime
from dotenv import load_dotenv
from exa_py import Exa
from googleapiclient.discovery import build
from IPython.display import display
from openai import OpenAI
import pandas as pd
from pathlib import Path
from pprint import pprint as pp
from pydantic import BaseModel, Field, StringConstraints, UUID4, conlist, constr, field_validator
import requests
import tiktoken
import time
from typing import Any, Callable, ClassVar, Dict, Iterable, List, Optional, Type, Union
from typing_extensions import Annotated, Literal
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import JSONFormatter, TextFormatter

In [29]:
# load API keys
# Reads a .env file from a fixed, machine-specific Windows path and loads its
# entries into the process environment.

dotenv_path = Path(r"C:\Storage\python_projects\ashvin\.env")
load_dotenv(dotenv_path=dotenv_path)
# os.getenv returns None when a variable is missing, so any of these may be None.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# YOUTUBE_API_KEY and EXA_API_KEY are read here but not used by the code visible in this file.
YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")
EXA_API_KEY = os.getenv("EXA_API_KEY")

# main constants

GPT_MODEL = "gpt-4o" # model id passed to every chat completion made by wrapper()
GPT_35_MODEL = "gpt-3.5-turbo"  # not referenced by the code visible in this file
URL = None  # placeholder; the real video URL is assigned in the Run section

# instantiate clients
# NOTE(review): OpenAI() is constructed without arguments, so OPENAI_API_KEY above is
# never passed explicitly — presumably the SDK reads it from the environment; confirm.
# `client` is the instructor-patched client used by wrapper().
client = instructor.from_openai(OpenAI(), mode=instructor.Mode.TOOLS)
# Plain (unpatched) OpenAI client; not used by the code visible in this file.
audio_client = OpenAI()

## Utilities

In [40]:
# cost decorator

class CostDetails(BaseModel):
    """
    Dollar cost of a single LLM call, split into prompt (input) and completion (output) parts.

    Attributes:
        input_cost (float): Cost of the prompt tokens, in dollars.
        output_cost (float): Cost of the completion tokens, in dollars.
        total_cost (float): Sum stored by the caller for the whole call, in dollars.
    """
    input_cost: float
    output_cost: float
    total_cost: float

    def formatted_input_cost(self) -> str:
        """Return the input cost as a dollar string with six decimal places."""
        return "${:.6f}".format(self.input_cost)

    def formatted_output_cost(self) -> str:
        """Return the output cost as a dollar string with six decimal places."""
        return "${:.6f}".format(self.output_cost)

    def formatted_total_cost(self) -> str:
        """Return the total cost as a dollar string with six decimal places."""
        return "${:.6f}".format(self.total_cost)

def cost(function: Callable) -> Callable:
    """
    Decorator that attaches a dollar-cost breakdown to the result of an LLM call.

    The decorated function must return an object exposing
    ``token_counts.prompt_tokens`` and ``token_counts.completion_tokens`` and
    allowing a ``cost_details`` attribute to be assigned. After the call, the
    decorator computes the input, output and total cost from a hardcoded pricing
    table, stores them on ``result.cost_details`` as a ``CostDetails`` instance,
    prints a one-line summary and returns the same result object.

    The wrapped function's metadata (``__name__``, ``__doc__``, ``__wrapped__``)
    is preserved via ``functools.wraps``.

    Args:
        function (Callable): The function to be decorated, expected to return an
                             object with token counts included.

    Returns:
        Callable: The wrapping function, which returns the original result
                  enriched with ``cost_details``.
    """
    # Imported locally so the decorator does not depend on the file's import cell.
    import functools

    # Price per single token in USD, keyed by model name and token direction.
    pricing = {
        'gpt-4o': {
            'input': 5.00 / 1000000,  # $5.00 per 1M tokens
            'output': 15.00 / 1000000  # $15.00 per 1M tokens
        }
    }

    @functools.wraps(function)
    def decorated_function(*args, **kwargs) -> Any:
        # Call the original function and capture its output
        result = function(*args, **kwargs)

        # Extract token counts using dot notation
        prompt_tokens = result.token_counts.prompt_tokens
        completion_tokens = result.token_counts.completion_tokens

        # The model is fixed to 'gpt-4o' for now; it is the only entry in the
        # pricing table. It could be derived from args/kwargs if more are added.
        model = 'gpt-4o'

        # Calculate costs based on the price table for the specific model
        input_cost = prompt_tokens * pricing[model]['input']
        output_cost = completion_tokens * pricing[model]['output']
        total_cost = input_cost + output_cost

        # Assign cost details using the CostDetails model
        result.cost_details = CostDetails(
            input_cost=input_cost,
            output_cost=output_cost,
            total_cost=total_cost
        )

        # Print formatted cost details for transparency
        print(f"Cost Details: Input: {result.cost_details.formatted_input_cost()}, Output: {result.cost_details.formatted_output_cost()}, Total: {result.cost_details.formatted_total_cost()}")
        return result

    return decorated_function

In [41]:
# wrapper

@cost
def wrapper(
    system_prompt: str | None = None,
    user_prompt: Union[str, List[str]] | None = None,
    response_model: Type[BaseModel] | None = None,
    max_retries: int = 3,
    additional_messages: Union[str, List[str]] | None = None
) -> 'WrapperOutput':

    """
    Generates an LLM completion from the given prompts and collects token usage.

    The message list is built in this order: the system prompt (if any), then the
    additional messages, then the user prompt(s). Additional messages and user
    prompts are all sent with the "user" role. When ``response_model`` is None a
    plain text completion is requested; otherwise the completion is parsed into
    ``response_model`` and the raw completion is used only for its usage figures.
    The ``@cost`` decorator then fills in ``cost_details`` on the returned object.

    Args:
        system_prompt (str, optional): System-level initial prompt or instruction.
        user_prompt (Union[str, List[str]], optional): User content as a single string or list of strings.
        response_model (Type[BaseModel], optional): Pydantic model class used to structure the response.
        max_retries (int): Maximum number of retries for the LLM request.
        additional_messages (Union[str, List[str]], optional): Extra user messages placed before the user prompt.

    Returns:
        WrapperOutput: A Pydantic model holding ``response`` (stripped text, or an
        instance of ``response_model``), ``token_counts`` and ``cost_details``.

    Classes Defined Inside:
        TokenCounts: A Pydantic model detailing the counts of different types of tokens.
        WrapperOutput: A Pydantic model encapsulating the response and TokenCounts model.
    """

    class TokenCounts(BaseModel):
        completion_tokens: int
        prompt_tokens: int
        total_tokens: int

    class WrapperOutput(BaseModel):
        response: Union[str, BaseModel]
        token_counts: TokenCounts
        # Filled in by the @cost decorator, which assigns a CostDetails instance.
        cost_details: Optional[CostDetails] = None

    def as_user_messages(content: Union[str, List[str]]) -> List[Dict[str, str]]:
        # Accept either one string or a list of strings; one message per string.
        items = content if isinstance(content, list) else [content]
        return [{"role": "user", "content": item} for item in items]

    messages = []

    # Construct the messages list based on provided inputs
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})

    if additional_messages:
        messages.extend(as_user_messages(additional_messages))

    if user_prompt:
        messages.extend(as_user_messages(user_prompt))

    # Generate the completion and extract token counts based on the presence of a response model
    if response_model is None:
        # Standard completion process without a structured model
        completion = client.chat.completions.create(
            model=GPT_MODEL,
            response_model=None,
            max_retries=max_retries,
            messages=messages
        )
        usage = completion.usage
        # The message content may be None (e.g. a reply with no text); fall back
        # to an empty string instead of failing on .strip().
        content = completion.choices[0].message.content
        response_content = content.strip() if content is not None else ""
    else:
        # Model-based completion that structures the response as per the specified BaseModel
        structured_response, raw_completion = client.chat.completions.create_with_completion(
            model=GPT_MODEL,
            response_model=response_model,
            max_retries=max_retries,
            messages=messages
        )
        usage = raw_completion.usage
        response_content = structured_response

    token_counts = TokenCounts(
        completion_tokens=usage.completion_tokens,
        prompt_tokens=usage.prompt_tokens,
        total_tokens=usage.total_tokens
    )

    return WrapperOutput(response=response_content, token_counts=token_counts)


In [8]:
# predict tokens

def count_tokens(text: str, print_length: bool = True, token_type: str = 'input') -> int:
    """
    Count the tokens in a text string and print the estimated cost of those tokens.

    The text is tokenized with tiktoken's "cl100k_base" encoding. The cost line is
    always printed; the token-count line is printed only when ``print_length`` is True.

    Parameters:
        text (str): The text string to tokenize and count.
        print_length (bool): If True, prints the number of tokens. Default is True.
        token_type (str): Specifies whether to use 'input' or 'output' token pricing. Default is 'input'.

    Returns:
        int: The number of tokens in the text.

    Raises:
        KeyError: If ``token_type`` is neither 'input' nor 'output'.
    """
    # Price per single token in USD.
    pricing = {
        'input': 5 / 1_000_000,  # $5 per 1 million tokens
        'output': 15 / 1_000_000  # $15 per 1 million tokens
    }
    # Look the price up first so an invalid token_type fails before any work is done.
    price_per_token = pricing[token_type]

    # NOTE(review): the prices above match the gpt-4o rates used by the cost
    # decorator, but gpt-4o may use a different encoding than cl100k_base —
    # confirm with tiktoken.encoding_for_model before relying on exact counts.
    encoding = tiktoken.get_encoding("cl100k_base")
    token_count = len(encoding.encode(text))

    # Print the token length if required
    if print_length:
        print(f"Token count: {token_count}")

    # Calculate and print cost
    cost = price_per_token * token_count
    print(f"Cost for {token_type} tokens: ${cost:.6f}")

    return token_count

## Tools

In [9]:
# transcript from youtube video tool

class Transcript(BaseModel):
    """
    This tool extracts the YouTube video ID from a given URL, retrieves the transcript, and
    formats it as a JSON string.
    """

    def run(self, url: str) -> Optional[str]:
        """
        Extract the YouTube video ID from a given URL, retrieve the transcript,
        and format it as a JSON string.

        Recognised URL shapes (scheme and ``www.``/``m.`` prefix optional):
        ``youtube.com/watch?v=ID`` (``v`` may follow other query parameters),
        ``youtube.com/embed/ID``, ``youtube.com/shorts/ID``, ``youtube.com/live/ID``
        and ``youtu.be/ID``.

        Parameters:
            url (str): The YouTube URL from which to extract the video ID.

        Returns:
            Optional[str]: The JSON formatted transcript if the video ID is valid and the
                           transcript is available, otherwise None. Failures are reported
                           with a printed message rather than an exception.
        """
        # Regular expression to find the 11-character video ID in a YouTube URL.
        # The query-prefix group is lazy so that a plain "watch?v=ID" still
        # resolves to the first v= parameter.
        pattern = (
            r'(?:https?://)?(?:www\.|m\.)?'
            r'(?:youtube\.com/(?:watch\?(?:[^\s#]*?&)??v=|embed/|shorts/|live/)'
            r'|youtu\.be/)'
            r'([a-zA-Z0-9_-]{11})'
        )
        # A non-string url (e.g. an unset None placeholder) is treated as "no match"
        # instead of raising TypeError inside re.search.
        match = re.search(pattern, url) if isinstance(url, str) else None
        if not match:
            print("No valid YouTube video ID found in the provided URL.")
            return None

        video_id = match.group(1)

        try:
            # Retrieve the transcript, preferring British English, then any English
            transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['en-GB', 'en'])

            # Format the transcript as JSON
            formatter = JSONFormatter()
            json_formatted_transcript = formatter.format_transcript(transcript)

            return json_formatted_transcript
        except Exception as e:
            # Deliberate best-effort boundary: any retrieval/formatting failure
            # is reported and turned into a None result.
            print(f"Error retrieving or formatting transcript: {e}")
            return None

In [21]:
# books

# Schema for one extracted book; used as the element type of Books.books.
# All four fields are required (each Field uses `...` as its default).
# NOTE(review): the docstring below lists title, author and rating but not
# `summary`. It is left unchanged here because the class docstring presumably
# ends up in the schema sent to the LLM — confirm before editing it.
class Book(BaseModel):
    """
    Represents a book with its title, author, and rating.

    Attributes:
        title (str): The title of the book.
        author (str): The author of the book.
        rating (Literal["High", "Medium", "Low"]): The rating of the book.
    """
    title: str = Field(..., description="The title of the book")
    author: str = Field(..., description="The author of the book")
    # Restricted to exactly these three string values.
    rating: Literal["High", "Medium", "Low"] = Field(..., description="The rating of the book")
    summary: str = Field(..., description="A one sentence summary of the book review")

class Books(BaseModel):
    """
    Represents a collection of books extracted from a transcript.

    Attributes:
        books (List[Book]): A list of Book objects.
    """
    # Defaults to an empty list, so Books() can be created just to call run().
    books: List[Book] = Field(default_factory=list, description="A list of books")

    def run(self, text: str) -> Any:
        """
        Extract books from the input transcript text.

        Sends the text to the LLM through the module-level ``wrapper`` function,
        with this class as the response model. The method does not read or
        modify ``self``.

        Args:
            text (str): The input transcript text to extract books from.

        Returns:
            The object returned by ``wrapper`` — not a bare Books instance. The
            extracted Books model is available on its ``response`` attribute,
            next to ``token_counts`` and ``cost_details``.
        """
        # Despite the variable name, this is the wrapper's output object.
        books = wrapper(
            system_prompt="""
            Extract books mentioned in the provided transcript.
            For each book, identify the title, author, and assign a rating (High, Medium, or Low) 
            based on the context or sentiment expressed in the transcript.
            If the rating is not clear from the context, default to "Medium".
            Return the extracted information as a list of Book objects.
            """,
            user_prompt=text,
            response_model=Books,
            max_retries=3
        )
        return books

In [45]:
# books to dataframe


def books_to_dataframe(books_response):
    """
    Convert an extracted book collection into a display-ready DataFrame.

    Args:
        books_response: Object with a ``books`` attribute holding an iterable of
            items that expose ``title``, ``author``, ``rating`` and ``summary``.

    Returns:
        pd.DataFrame: One row per book with the columns Title, Author, Rating and
        Summary (in that order) and a 1-based index named 'No.'. An empty book
        list yields an empty DataFrame that still has those four columns.
    """
    columns = ['Title', 'Author', 'Rating', 'Summary']

    # One dictionary per book, keyed by the output column names
    rows = [
        {
            "Title": book.title,
            "Author": book.author,
            "Rating": book.rating,
            "Summary": book.summary
        }
        for book in books_response.books
    ]

    # Passing `columns` fixes the column order and keeps the columns present
    # even when there are no rows (a column-less frame would otherwise make any
    # later column selection fail with KeyError).
    df = pd.DataFrame(rows, columns=columns)

    # Number the rows from 1 and label the index 'No.'
    df.index = pd.RangeIndex(start=1, stop=len(df) + 1, name='No.')

    return df

## Run

In [82]:
# constants
# Video whose transcript is processed below; replaces the `URL = None`
# placeholder assigned in the setup cell.

URL = "https://www.youtube.com/watch?v=HTNWbO-P4BM"

In [83]:
# Fetch the transcript as a JSON string. run() returns None when the URL is not
# recognised or the transcript cannot be retrieved.
transcript_tool = Transcript()
transcript = transcript_tool.run(URL)
# Prints the token count and the estimated input cost; the return value is discarded.
_ = count_tokens(transcript)

Token count: 16240
Cost for input tokens: $0.081200


In [85]:
# Extract the books from the transcript with the LLM. `books` is the wrapper's
# output object; the extracted Books model is on `books.response`.
books_tool = Books()
books = books_tool.run(transcript)

Cost Details: Input: $0.081745, Output: $0.008205, Total: $0.089950


In [86]:
# Build the table from the extracted Books model held in the wrapper output.
df = books_to_dataframe(books.response)

# Temporarily lift pandas' display limits so every row and the full text of
# each summary are shown; the options revert when the block exits.
with pd.option_context('display.max_rows', None, 
                       'display.max_columns', None,
                       'display.width', None,
                       'display.max_colwidth', None):
    display(df)


Unnamed: 0_level_0,Title,Author,Rating,Summary
No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,The Will of the Many,James Islington,High,"A dark, political, Roman-inspired Academia progression fantasy where the main character's growth depends on how he plays his cards."
2,Of Darkness and Light,John Gwynne,High,"An epic fantasy that mixes elements of progression fantasy with a narrative about characters growing older, wiser, and stronger."
3,The Art of Prophecy,Wesley Chu,Medium,"A fun, Asian-inspired progression fantasy where a prophecy is revealed to be false and the spoiled main character must learn humility."
4,Dungeon Crawler Carl,Matt Dinniman,High,"An exciting and humor-filled dungeon crawl where millions of beings perish, featuring strong character dynamics."
5,Children of Blood and Bone,Tomi Adeyemi,Low,"An African-inspired YA fantasy, initially captivating but ultimately found lacking and overly dramatic."
6,Unsouled,Will Wight,High,"The first book in the Cradle series, a beloved journey from weakness to unimaginable power with complex character growth."
7,Sufficiently Advanced Magic,Andrew Rowe,Medium,"An enjoyable tower-clearing fantasy with a unique magic system, featuring a character from a prominent family with a different magical attribute."
8,The Iron Prince,Bryce O'Connor and Luke Chmilenko,High,A thrilling martial arts and robotic-enhancement progression fantasy with high potential for character growth.
9,Awaken Online,Travis Bagwell,High,"A dark, video game-inspired fantasy with real-world and in-game consequences featuring a revenge-driven necromancer protagonist."
10,Kraken Rider Z,Michael-Scott Earle,High,"An animal companion, school-setting progression fantasy with a mix of fantastical creatures and strong character bonds."


## longlist

1. The Dagger and the Coin - Daniel Abraham
2. Blood over Bright Haven - M. L. Wang
3. Mother of Learning: ARC 4 - Domagoj Kurmaić
4. Yumi and the Nightmare Painter - Brandon Sanderson
5. Eleventh Cycle - Kian N. Ardalan
6. The Silver Blood Promise - James Logan
7. The Vanished Birds - Simon Jimenez
