In [1]:
# goal
# One-sentence statement of what the notebook is meant to produce.
goal = "Convert a Youtube video transcript or website article into structured notes and diagrams for a local material-mkdocs site." # abbreviated to get sections



# tasks
# Ordered steps towards the goal. task_2 and task_3 are commented out, so the
# active numbering jumps from task_1 to task_4.
task_1 = "Download the complete transcript from the YouTube video."
# task_2 = "Collect video metadata (title, author, date) from YouTube."
# task_3 = "Identify key topics, keywords and questions from the transcript."
task_4 = "Draft an outline with clear sections and subsections based on the concatenated content from the previous tasks."
task_5 = "Populate each section and subsection in the outline with detailed content."
task_6 = "Create relevant mermaid diagrams and integrate them into the content."
task_7 = "Assemble each section into individual Markdown files, including relevant diagrams and metadata."
task_8 = "Compile all Markdown files into the material-mkdocs site structure."
task_9 = "Launch the local mkdocs site to verify the structure and content."

In [2]:
# urls
# Candidate sources (YouTube videos and web articles) used by the cells below.

url_1 = "https://www.youtube.com/watch?v=hvAPnpSfSGo" # LangGraph
url_2 = "https://www.answer.ai/posts/2024-06-11-os-ai.html" # Non Youtube AI regulation by Jeremy Howard
url_3 = "https://www.sequoiacap.com/podcast/training-data-harrison-chase/" # harrison chase on agents
url_4 = "https://www.youtube.com/watch?v=6XZLoW0-mPY" # harrison chase on agents youtube
url_5 = "https://applied-llms.org/" # what we learned from a year building with LLMs
url_6 = "https://lithub.com/joan-didion-why-i-write/" # Joan Didion
url_7 = "https://www.answer.ai/posts/2023-12-12-launch.html" # answer ai launch
url_8 = "https://www.youtube.com/watch?v=c0gcsprsFig&t=2839s" # 3 hour video discussion of what we learned from a year building with LLMs
# NOTE(review): url_8 is bound a second time on the next line, so the 3-hour
# video link above is overwritten and unreachable. `URL = url_8` in the
# configuration cell therefore resolves to the link below. If both links are
# needed, give this one its own name (e.g. url_9) and update URL accordingly.
url_8 = "https://www.youtube.com/watch?v=AwaNygeANFg" # accelerating AI outerbounds

## Things I want to try

- [X] cost decorator
- [ ] try different models
- [X] exa
- [ ] try Surya
- [ ] try an audio file and transcribe it
- [ ] a loop where it takes longer than output character limit
- [X] style guide based on Jeremy Howard's or Joan Didion's writing
- [ ] debug mermaid diagrams in launch
- [ ] debug sequencing of filesplit in launch
- [ ] debug title, description and formatting in launch
- [ ] figure out a way to use router to launch it
- [X] make an Exa tool or just an LLM call to get the stripped-down text?


In [3]:
# imports

import enum
import instructor
import json
import os
import re
import uuid
from abc import ABC, abstractmethod
from bs4 import BeautifulSoup
from datetime import datetime
from dotenv import load_dotenv
from exa_py import Exa
from googleapiclient.discovery import build
from openai import OpenAI
from pathlib import Path
from pprint import pprint as pp
from pydantic import BaseModel, Field, StringConstraints, UUID4, conlist, constr, field_validator
import requests
import tiktoken
import time
from typing import Any, Callable, ClassVar, Dict, Iterable, List, Optional, Type, Union
from typing_extensions import Annotated, Literal
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import JSONFormatter, TextFormatter

In [14]:
# load API key
# NOTE(review): the .env location is an absolute, machine-specific Windows
# path; the notebook will not find its keys on another machine unless this
# path is adjusted.

dotenv_path = Path(r"C:\Storage\python_projects\ashvin\.env")
load_dotenv(dotenv_path=dotenv_path)
# os.getenv returns None when a variable is missing, so each of these may be None.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")
EXA_API_KEY = os.getenv("EXA_API_KEY")

# main constants

GPT_MODEL = "gpt-4o" # points to latest GPT model
# Depends on the urls cell having been executed first; url_8 holds whatever
# value was assigned to it last in that cell.
URL = url_8

#instantiate client
# OpenAI() is constructed without an explicit key — presumably it reads
# OPENAI_API_KEY from the environment populated by load_dotenv; verify.
client = instructor.from_openai(OpenAI(), mode=instructor.Mode.TOOLS)
# Second, un-patched client; not referenced by any cell in this part of the notebook.
audio_client = OpenAI()

## Utilities

In [5]:
# cost decorator

class CostDetails(BaseModel):
    # Dollar cost of a single LLM call, broken down by token direction.
    # The three formatted_* helpers render a value as "$" followed by the
    # amount with six decimal places.
    input_cost: float
    output_cost: float
    total_cost: float

    def formatted_input_cost(self):
        # Cost attributed to prompt tokens, e.g. "$0.005000".
        return "${:.6f}".format(self.input_cost)

    def formatted_output_cost(self):
        # Cost attributed to completion tokens.
        return "${:.6f}".format(self.output_cost)

    def formatted_total_cost(self):
        # Combined cost of the call.
        return "${:.6f}".format(self.total_cost)

def cost(function: Callable) -> Callable:
    """
    Decorator that attaches the dollar cost of token usage to a function's result.

    The decorated function must return an object exposing
    ``token_counts.prompt_tokens`` and ``token_counts.completion_tokens``.
    After the call, a ``CostDetails`` instance is stored on the result as
    ``result.cost_details`` and a one-line cost summary is printed.

    Costs come from a pricing table hardcoded below; the model is currently
    fixed to 'gpt-4o' regardless of which model the wrapped function used.

    Args:
        function (Callable): The function to decorate.

    Returns:
        Callable: A wrapper with the same call signature that returns the
                  original result, enriched with ``cost_details``. The
                  wrapper keeps the original function's name and docstring.
    """
    # Local import: functools is not among the notebook's top-level imports.
    import functools

    # Price per single token, derived from the per-million-token list price.
    pricing = {
        'gpt-4o': {
            'input': 5.00 / 1000000,  # $5.00 per 1M tokens
            'output': 15.00 / 1000000  # $15.00 per 1M tokens
        }
    }

    # functools.wraps preserves __name__, __doc__ and __wrapped__, so that
    # help(), tracebacks and introspection show the decorated function
    # instead of the generic 'decorated_function'.
    @functools.wraps(function)
    def decorated_function(*args, **kwargs) -> Any:
        # Call the original function and capture its output
        result = function(*args, **kwargs)

        # Token usage reported by the wrapped call
        prompt_tokens = result.token_counts.prompt_tokens
        completion_tokens = result.token_counts.completion_tokens

        # Determine the model used; default to 'gpt-4o' for now
        model = 'gpt-4o'  # This could be dynamically determined based on args/kwargs if needed

        # Calculate costs based on the price table for the specific model
        input_cost = prompt_tokens * pricing[model]['input']
        output_cost = completion_tokens * pricing[model]['output']
        total_cost = input_cost + output_cost

        # Attach the cost breakdown to the result object
        result.cost_details = CostDetails(
            input_cost=input_cost,
            output_cost=output_cost,
            total_cost=total_cost
        )

        # Print formatted cost details for transparency
        print(f"Cost Details: Input: {result.cost_details.formatted_input_cost()}, Output: {result.cost_details.formatted_output_cost()}, Total: {result.cost_details.formatted_total_cost()}")
        return result

    return decorated_function

In [6]:
# wrapper

@cost
def wrapper(
    system_prompt: str | None = None,
    user_prompt: Union[str, List[str]] | None = None,
    response_model: Type[BaseModel] | None = None,
    max_retries: int = 3,
    additional_messages: Union[str, List[str]] | None = None
) -> 'WrapperOutput':
    """
    Send one chat-completion request and return the response with token usage.

    The message list is assembled in this order: the system prompt (if any),
    every entry of ``additional_messages``, then every entry of
    ``user_prompt``. All non-system entries are sent with the "user" role.

    When ``response_model`` is None a plain text completion is requested and
    the stripped message text is returned. Otherwise the client is asked to
    return an instance of ``response_model`` together with the raw completion,
    from which token usage is read.

    Because the function is decorated with ``@cost``, callers receive the
    returned object with ``cost_details`` already populated.

    Args:
        system_prompt (str, optional): System-level initial prompt or instruction.
        user_prompt (Union[str, List[str]], optional): User content as a single string or list of strings.
        response_model (Type[BaseModel], optional): Pydantic model class used to structure the response.
        max_retries (int): Maximum number of retries for the LLM request.
        additional_messages (Union[str, List[str]], optional): Extra user messages placed before ``user_prompt``.

    Returns:
        WrapperOutput: A Pydantic model with ``response`` (text or model
        instance), ``token_counts`` and ``cost_details``.

    Classes Defined Inside:
        TokenCounts: Counts of completion, prompt and total tokens.
        WrapperOutput: The response together with TokenCounts and the cost breakdown.
    """

    class TokenCounts(BaseModel):
        completion_tokens: int
        prompt_tokens: int
        total_tokens: int

    class WrapperOutput(BaseModel):
        response: Union[str, BaseModel]
        token_counts: TokenCounts
        # Filled in by the @cost decorator, which stores a CostDetails
        # instance here (the field was previously mis-declared as a dict).
        cost_details: Optional[CostDetails] = None

    def usage_to_counts(usage) -> TokenCounts:
        # Copy the three usage counters from the API response object.
        return TokenCounts(
            completion_tokens=usage.completion_tokens,
            prompt_tokens=usage.prompt_tokens,
            total_tokens=usage.total_tokens
        )

    messages = []

    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})

    # additional_messages precede user_prompt; each may be one string or a list.
    for user_content in (additional_messages, user_prompt):
        if not user_content:
            continue
        parts = user_content if isinstance(user_content, list) else [user_content]
        messages.extend({"role": "user", "content": part} for part in parts)

    if response_model is None:
        # Plain text completion
        completion = client.chat.completions.create(
            model=GPT_MODEL,
            response_model=None,
            max_retries=max_retries,
            messages=messages
        )
        # The message content can be None; treat that as empty text instead
        # of failing on .strip().
        response_content = (completion.choices[0].message.content or "").strip()
        token_counts = usage_to_counts(completion.usage)
    else:
        # Structured completion: the parsed model instance plus the raw completion
        structured_response, raw_completion = client.chat.completions.create_with_completion(
            model=GPT_MODEL,
            response_model=response_model,
            max_retries=max_retries,
            messages=messages
        )
        response_content = structured_response
        token_counts = usage_to_counts(raw_completion.usage)

    return WrapperOutput(response=response_content, token_counts=token_counts)


In [7]:
# predict tokens

def count_tokens(
    text: str,
    print_length: bool = True,
    token_type: str = 'input',
    encoding_name: str = "cl100k_base",
) -> int:
    """
    Count the tokens in a text, print an estimated cost, and return the count.

    The text is encoded with the tiktoken encoding named by ``encoding_name``.
    The estimated cost is always printed; the token count itself is printed
    only when ``print_length`` is True.

    Parameters:
        text (str): The text string to tokenize and count.
        print_length (bool): If True, prints the number of tokens. Default is True.
        token_type (str): Which price to apply, 'input' or 'output'. Default is 'input'.
        encoding_name (str): Name of the tiktoken encoding. Default is "cl100k_base".
            NOTE(review): confirm this is the encoding used by GPT_MODEL; a
            different encoding would give a different count.

    Returns:
        int: The number of tokens in the text.

    Raises:
        KeyError: If ``token_type`` is not 'input' or 'output'.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    token_count = len(encoding.encode(text))

    # Print the token length if required
    if print_length:
        print(f"Token count: {token_count}")

    # Price per single token
    pricing = {
        'input': 5 / 1_000_000,  # $5 per 1 million tokens
        'output': 15 / 1_000_000  # $15 per 1 million tokens
    }

    # Calculate and print cost
    cost = pricing[token_type] * token_count
    print(f"Cost for {token_type} tokens: ${cost:.6f}")

    # The function previously returned None despite its annotation and docstring.
    return token_count

## Tools

In [8]:
# Youtube video metadata tool
# requires youtube API key

class YoutubeDetails(BaseModel):
    """
    This tool extracts the YouTube video ID from a given URL, retrieves key metadata,
    and formats it as a dictionary.
    
    The metadata extracted includes:
        - Title: The title of the video.
        - Description: The description of the video.
        - Published At: The date and time when the video was published.
        - Channel Title: The title of the channel that uploaded the video.
        - Views: The number of views the video has received.
        - Likes: The number of likes the video has received.
        - Dislikes: The number of dislikes the video has received.
        - Comments: The number of comments on the video.
        - Duration: The duration of the video in ISO 8601 format.
        - Tags: A list of tags associated with the video.
    """

    def run(self, url: str) -> Optional[Dict[str, Any]]:
        """
        Look up metadata for the video referenced by a YouTube URL.

        Both ``youtube.com/watch?v=<id>`` and ``youtu.be/<id>`` links are
        recognised. All failures are reported with a printed message and a
        ``None`` return rather than an exception.

        Parameters:
            url (str): The YouTube URL from which to extract the video ID.

        Returns:
            Optional[Dict[str, Any]]: The metadata dictionary, or None when the
                URL has no video ID, the API key is missing, no video is found,
                or the request fails. Values are strings except "Tags", which
                is a list; statistics missing from the response are reported
                as 'N/A'.
        """
        # An 11-character video ID following either supported URL form
        pattern = r'(?:https?://)?(?:www\.)?(?:youtube\.com/watch\?v=|youtu\.be/)([a-zA-Z0-9_-]{11})'
        match = re.search(pattern, url)
        if match is None:
            print("No valid YouTube video ID found in the provided URL.")
            return None

        video_id = match.group(1)

        try:
            if not YOUTUBE_API_KEY:
                print("API key not found in environment variables.")
                return None

            youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)
            response = youtube.videos().list(
                part='snippet,contentDetails,statistics', id=video_id
            ).execute()

            # .get covers a response without an 'items' key as well as an empty list.
            items = response.get('items')
            if not items:
                print("No video found with the provided video ID.")
                return None

            video_details = items[0]
            snippet = video_details['snippet']
            statistics = video_details['statistics']

            return {
                "Title": snippet['title'],
                "Description": snippet['description'],
                "Published At": snippet['publishedAt'],
                "Channel Title": snippet['channelTitle'],
                "Views": statistics.get('viewCount', 'N/A'),
                "Likes": statistics.get('likeCount', 'N/A'),
                # NOTE(review): the API may no longer return dislikeCount for
                # public requests, in which case this is always 'N/A' — verify.
                "Dislikes": statistics.get('dislikeCount', 'N/A'),
                "Comments": statistics.get('commentCount', 'N/A'),
                "Duration": video_details['contentDetails']['duration'],
                "Tags": snippet.get('tags', [])
            }
        except Exception as e:
            # Deliberate best-effort: report the problem and return None.
            print(f"Error retrieving video metadata: {e}")
            return None

In [9]:
# transcript from youtube video tool

class Transcript(BaseModel):
    """
    This tool extracts the YouTube video ID from a given URL, retrieves the transcript, and 
    formats it as a JSON string.
    """

    def run(self, url: str) -> Optional[str]:
        """
        Fetch the English transcript of the video referenced by a YouTube URL.

        Both ``youtube.com/watch?v=<id>`` and ``youtu.be/<id>`` links are
        recognised, matching the URL forms accepted by ``YoutubeDetails``.
        The transcript is requested with language preference 'en-GB', then 'en'.

        Parameters:
            url (str): The YouTube URL from which to extract the video ID.

        Returns:
            Optional[str]: The JSON formatted transcript, or None when the URL
                           contains no video ID or the transcript cannot be
                           retrieved or formatted (a message is printed).
        """
        # Same pattern as YoutubeDetails so both tools accept the same links;
        # the earlier pattern here rejected youtu.be short links.
        pattern = r'(?:https?://)?(?:www\.)?(?:youtube\.com/watch\?v=|youtu\.be/)([a-zA-Z0-9_-]{11})'
        match = re.search(pattern, url)
        if match is None:
            print("No valid YouTube video ID found in the provided URL.")
            return None

        video_id = match.group(1)

        try:
            # Retrieve the transcript
            transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['en-GB', 'en'])

            # Format the transcript as JSON
            return JSONFormatter().format_transcript(transcript)
        except Exception as e:
            # Deliberate best-effort: report the problem and return None.
            print(f"Error retrieving or formatting transcript: {e}")
            return None

In [10]:
# extract metadata from text

class MetaData(BaseModel):
    """
    Extract various metadata properties from a text - PrimarySource - and generate a MetaData instance by processing the text.
    Each property is described by a specific prompt, and this class can generate and return a metadata report
    based on the input text.
    """

    # Each field uses `examples=[...]` (a list of example values), the form
    # supported by pydantic v2's Field; the earlier `example=` keyword is an
    # unrecognised extra argument there and triggers a deprecation warning.

    documentation_names: conlist(constr(max_length=100), min_length=1, max_length=10) = Field(
        default=None,
        description="Generate up to 10 possible names for documentation we want to build, for the data in PrimarySource.",
        examples=[["Documentation1", "Guide", "Manual"]]
    )

    intended_audience: conlist(constr(max_length=100), min_length=1, max_length=10) = Field(
        default=None,
        description="Generate up to 10 words describing the intended audience for creating documentation from the data in PrimarySource.",
        examples=[["Beginners", "Advanced Users", "Developers", "Managers"]]
    )

    description: constr(max_length=500) = Field(
        default=None,
        description="Provide a three-sentence description of the information in PrimarySource.",
        examples=["This is an article about data science best practices."]
    )

    keywords: conlist(constr(max_length=50), min_length=1, max_length=10) = Field(
        default=None,
        description="Generate up to 10 possible keywords referring to industries, technologies, people or other themes for the data in PrimarySource.",
        examples=[["Data Science", "Machine Learning"]]
    )

    clarification_questions: conlist(constr(max_length=200), min_length=1, max_length=10) = Field(
        default=None,
        description="Generate a list of insightful questions that target ambiguities or complex areas within PrimarySource, prompting deeper exploration or explanation.",
        examples=[[
            "Can you explain the significance of the statistical methods used in this analysis?",
            "What are the implications of these findings for the broader field of study?",
            "How does this information compare with prior established studies?",
            "What methodologies were used in the data collection process?",
            "Could you detail the reasoning behind the conclusions drawn in this document?",
            "What are the potential biases in this study, and how were they addressed?",
            "Can you clarify the terms used in the discussion of the results?",
            "Are there any assumptions in this study that need more detailed justification?",
            "What are the limitations of this study, and how do they affect the results?",
            "How can the findings be applied in practical scenarios?"
        ]]
    )

    def run(self, text: str) -> Any:
        """
        Ask the LLM to extract metadata from the input text.

        Parameters:
            text (str): The input text from which to extract metadata.

        Returns:
            The object returned by ``wrapper``: the extracted MetaData instance
            is available as its ``.response`` attribute, alongside
            ``.token_counts`` and ``.cost_details``. (It is not a bare
            MetaData instance.)
        """

        metadata = wrapper(
            system_prompt="Extract metadata from the provided text.",
            user_prompt=text,
            response_model=MetaData,
            max_retries=3
        )

        return metadata


In [11]:
# Outline as a knowledge graph tool

class OutlineNode(BaseModel):
    """
    Represents a node in the outline. Each node can represent a section or a subsection.

    Attributes:
        id (int): Unique identifier for the node.
        title (str): Title of the node.
        description (str): A single sentence description of the node.
        key_points (List[str]): A list of very short strings outlining key points to cover in this node.
        parent_id (Union[int, None]): Identifier of the parent node, if any.
    """
    # NOTE(review): the docstring and the Field descriptions are presumably
    # surfaced to the LLM through the model's JSON schema when this class is
    # used (via Outline) as a response_model — treat them as prompt text.
    # id, title and description default to None even though they are annotated
    # as non-optional, so an instance can be created without arguments.
    id: int = Field(None, description="Unique identifier for the node.")
    title: str = Field(None, description="Title of the node.")
    description: str = Field(None, description="A single sentence description of the node.")
    # default_factory gives every node its own empty list.
    key_points: List[str] = Field(default_factory=list, description="A list of very short strings outlining key points to cover in this node.")
    # None presumably marks a top-level section — verify against the code that consumes the outline.
    parent_id: Union[int, None] = Field(default=None, description="Identifier of the parent node, if any.")


class OutlineEdge(BaseModel):
    """
    Represents an edge in the outline, connecting two nodes.

    Attributes:
        source (int): Identifier of the source node.
        target (int): Identifier of the target node.
        label (str): Label describing the relationship between the nodes. Be expressive and precise in label selection.
    """
    # All three fields default to None although annotated as non-optional,
    # so an edge can be created without arguments.
    source: int = Field(None, description="Identifier of the source node.")
    target: int = Field(None, description="Identifier of the target node.")
    # Description now matches the docstring wording (a space was missing after "nodes.").
    label: str = Field(None, description="Label describing the relationship between the nodes. Be expressive and precise in label selection.")


class Outline(BaseModel):
    """
    Represents the overall structure of a document outline, containing multiple nodes and edges.

    The outline can have sections, subsections, and subsubsections. A section can have many subsections,
    and each subsection can have many subsubsections. Ideally the first section doesn't have any subsections.
    The levels of nestedness are limited to a maximum of three levels:
        - Level 1: Section - corresponds to ## in markdown
        - Level 2: Subsection - corresponds to ### in markdown
        - Level 3: Subsubsection - corresponds to #### in markdown

    Attributes:
        title (str): Title of the document for which the outline is generated.
        nodes (List[OutlineNode]): List of nodes in the document outline.
        edges (List[OutlineEdge]): List of edges connecting the nodes in the document outline.
    """
    # title defaults to None and nodes/edges to empty lists, so the class can
    # be instantiated without arguments purely to call run().
    title: str = Field(None, description="Title of the document for which the outline is generated.")
    nodes: List[OutlineNode] = Field(default_factory=list, description="List of nodes in the document outline.")
    edges: List[OutlineEdge] = Field(default_factory=list, description="List of edges connecting the nodes in the document outline.")

    def run(self, text: str) -> Any:
        """
        Ask the LLM to generate an outline for the input text.

        The instance's own fields are not read; the call is delegated to
        ``wrapper`` with ``Outline`` as the response model.

        Parameters:
            text (str): The input text to generate the outline from.

        Returns:
            The object returned by ``wrapper``: the generated Outline instance
            is available as its ``.response`` attribute, alongside
            ``.token_counts`` and ``.cost_details``. (It is not a bare
            Outline instance.)
        """
        outline = wrapper(
            system_prompt="""
            Generate a detailed, structured outline of the document based on the provided text.
            The outline should have appropriate sections, subsections and subsubsections.
            Ideally the first section doesn't have any subsections.
            """,
            user_prompt=text,
            response_model=Outline,
            max_retries=3
        )
        return outline

In [12]:
# web text extraction tool

class WebTextExtractor(BaseModel):
    """
    This tool extracts the text content from a given URL using the requests module.
    
    The extracted text can include the raw HTML content or just the text content of the web page.
    """

    def run(self, url: str, return_html: bool = False, timeout: float = 30.0) -> Optional[str]:
        """
        Download a web page and return its HTML or its visible text.

        Parameters:
            url (str): The URL from which to extract the text content.
            return_html (bool): If True, return the full HTML content. If False, return plain text. Default is False.
            timeout (float): Seconds to wait for the server before giving up. Default is 30.
                Without a timeout a stalled server would block the notebook indefinitely.

        Returns:
            Optional[str]: The page HTML, or the whitespace-stripped text
                           extracted from it. None if the request fails,
                           times out, or returns a 4xx/5xx status (a message
                           is printed).
        """
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)
        except requests.RequestException as e:
            print(f"Error fetching the web page: {e}")
            return None

        if return_html:
            return response.text

        # Drop all markup and keep only the text nodes.
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.get_text().strip()



In [13]:
# writing style guide

class WritingStyleGuide(BaseModel):
    """
    A comprehensive writing style guide encompassing grammar, sentences, paragraphs, tone of voice, precision, and overall quality. This guide helps in capturing a unique tone of voice and approach.
    """

    # List-size limits use `max_length`, the pydantic v2 name for the
    # constraint; the earlier `max_items` keyword is deprecated there and
    # maps to the same limit.

    target_audience: List[constr(min_length=1, max_length=100)] = Field(
        None,
        description="Specify up to 5 target audiences, and why they are the target. This guides the tone and complexity.",
        max_length=5,
        examples=["General Public: Need straightforward explanations", "Technical Experts: Require detailed and precise information"]
    )

    tone: constr(min_length=1, max_length=100) = Field(
        None,
        description="The tone of your writing, such as formal, informal, conversational, or authoritative. Adapt the tone based on the target audience.",
        examples=["Conversational for general public, formal for technical experts"]
    )

    grammar_rules: List[constr(min_length=1, max_length=100)] = Field(
        None,
        description="Key grammar rules to follow. Avoid common pitfalls like passive voice and excessive adverbs.",
        max_length=10,
        examples=["Use active voice", "Avoid passive constructions", "Use the Oxford comma", "Limit the use of adverbs"]
    )

    sentence_structure: constr(min_length=1, max_length=200) = Field(
        None,
        description="Guidelines for sentence structure. Use short, direct sentences but vary length to create rhythm.",
        examples=["Use short, direct sentences. Keep most sentences between 10-15 words. Vary sentence length for rhythm."]
    )

    paragraph_structure: constr(min_length=1, max_length=200) = Field(
        None,
        description="Guidelines for paragraph structure. Ensure logical flow and coherence within paragraphs.",
        examples=["Begin with a topic sentence. Follow with supporting sentences. End with a concluding sentence. Ensure each paragraph has a single clear idea."]
    )

    preferred_words: List[constr(min_length=1, max_length=50)] = Field(
        None,
        description="Extract or infer up to 10 concrete words or phrases to use often. Avoid jargon and tired tropes.",
        max_length=10,
        examples=["Clear", "Direct", "Simple", "Precise"]
    )

    avoid_words: List[constr(min_length=1, max_length=50)] = Field(
        None,
        description="Extract or infer up to 10 words or phrases to avoid. Explain why these should be avoided.",
        max_length=10,
        examples=["Leverage: overused business jargon", "Utilize: 'use' is simpler", "Cutting-edge: vague and cliché"]
    )

    formatting: constr(min_length=1, max_length=500) = Field(
        None,
        description="Formatting guidelines to ensure consistency and readability. Include headings, bullet points, and emphasis.",
        examples=["Use H2 for section titles, bullet points for lists, and italics for emphasis. Ensure consistent formatting throughout the document."]
    )

    citation_style: constr(min_length=1, max_length=100) = Field(
        None,
        description="The preferred citation style, such as APA, MLA. Provide examples of correct citation formats.",
        examples=["APA: (Author, Year). MLA: (Author Page)."]
    )

    common_phrases: List[constr(min_length=1, max_length=100)] = Field(
        None,
        description="Up to 10 common phrases to ensure consistency. Use these to link ideas smoothly.",
        max_length=10,
        examples=["In short", "To sum up", "Moreover", "Furthermore"]
    )

    precision: constr(min_length=1, max_length=200) = Field(
        None,
        description="Guidelines for ensuring precision in writing. Provide tips on how to achieve it.",
        examples=["Use specific, concrete language. Avoid vague terms. Double-check facts and figures. Provide examples when possible."]
    )

    clarity: constr(min_length=1, max_length=200) = Field(
        None,
        description="Guidelines for maintaining clarity. Provide examples of clear vs. unclear sentences.",
        examples=["Ensure each sentence conveys a single idea clearly. Unclear: 'The report, which was long and detailed, was not read by the committee.' Clear: 'The committee did not read the long, detailed report.'"]
    )

    readability: constr(min_length=1, max_length=200) = Field(
        None,
        description="Guidelines for ensuring readability. Avoid jargon and overly complex words.",
        examples=["Use simple words and phrases. Avoid technical jargon unless necessary. Break up long paragraphs. Use transitional phrases between ideas."]
    )

    emotional_connection: Optional[constr(max_length=200)] = Field(
        None,
        description="Ensure the writing connects with readers on an emotional and intellectual level.",
        examples=["Use personal anecdotes and reflections to engage readers emotionally. Address the reader directly when appropriate."]
    )

    sensory_details: Optional[constr(max_length=200)] = Field(
        None,
        description="Incorporate sensory details to make the writing vivid and engaging.",
        examples=["Describe scenes with details that appeal to the senses, such as sights, sounds, and smells. Use metaphors and similes to create vivid imagery."]
    )

    honesty_integrity: constr(min_length=1, max_length=200) = Field(
        None,
        description="Emphasize the importance of honesty and integrity in writing.",
        examples=["Always present facts accurately and acknowledge sources. Be transparent about any biases. Correct errors promptly and openly."]
    )

    def create(self) -> str:
        """
        Render this style guide as a plain-text report.

        List-valued fields that are None are rendered as empty; the optional
        emotional-connection and sensory-details sections are appended only
        when they are set.

        Returns:
            str: A newline-joined summary of the writing style guide.
        """
        report = [
            "Writing Style Guide Summary:",
            "\nTarget Audience:",
            *[f"- {audience}" for audience in self.target_audience or []],
            f"\nTone: {self.tone}",
            "\nGrammar Rules:",
            *[f"- {rule}" for rule in self.grammar_rules or []],
            f"\nSentence Structure: {self.sentence_structure}",
            f"\nParagraph Structure: {self.paragraph_structure}",
            f"\nPreferred Words: {', '.join(self.preferred_words or [])}",
            "\nWords to Avoid:",
            *[f"- {word}" for word in self.avoid_words or []],
            f"\nFormatting: {self.formatting}",
            f"\nCitation Style: {self.citation_style}",
            f"\nCommon Phrases: {', '.join(self.common_phrases or [])}",
            f"\nPrecision: {self.precision}",
            f"\nClarity: {self.clarity}",
            f"\nReadability: {self.readability}",
            f"\nHonesty and Integrity: {self.honesty_integrity}"
        ]

        if self.emotional_connection:
            report.append(f"\nEmotional Connection: {self.emotional_connection}")

        if self.sensory_details:
            report.append(f"\nSensory Details: {self.sensory_details}")

        return "\n".join(report)

    def run(self, text: str | None = None) -> 'WritingStyleGuide':
        """
        Generate a writing style guide based on the provided text using an LLM.

        Parameters:
            text (str, optional): The input text to derive the style guide from.

        Returns:
            WritingStyleGuide: The populated instance produced by the LLM (the
            ``.response`` of the wrapper output). Call ``create()`` on it for
            a text report.
        """
        instance = wrapper(
            system_prompt="""
            Extract or infer a comprehensive writing style guide with all relevant properties based on the provided text. 
            This guide should help in capturing a unique tone of voice and approach based on the provided text.
            """,
            user_prompt=text,
            response_model=WritingStyleGuide,
            max_retries=3
        )
        return instance.response




In [21]:
# section tools

# One subsection of the notes outline; instances are nested in Section.subsections.
# Documented with comments only (no class docstring) so the model definition that
# is handed to the LLM wrapper stays exactly as it was.
class Subsection(BaseModel):
    id: int  # integer identifier of the subsection
    title: str  # subsection title
    description: str  # short description of what the subsection covers
    key_points: List[str]  # key points listed for the subsection
    content: Optional[str] = None  # Detailed content in Markdown or plain text; None until it is filled in

# One top-level section of the notes outline, optionally holding subsections.
# NOTE(review): in the sample output further down, subsection ids continue the
# section numbering (1, 2, 3, then 4, 6) — presumably ids are shared across both
# levels; confirm before relying on uniqueness.
class Section(BaseModel):
    id: int  # integer identifier of the section
    title: str  # section title
    description: str  # short description of what the section covers
    key_points: List[str]  # key points listed for the section
    subsections: List[Subsection] = []  # nested subsections; empty when the section has none
    content: Optional[str] = None  # Detailed content in Markdown or plain text; None until it is filled in

# Container for the whole outline; passed as `response_model` when the sections
# are extracted from the outline knowledge graph later in the notebook.
class Sections(BaseModel):
    sections: List[Section]  # the outline's sections


## Mini Run for Writing Style Guide

In [None]:
# Get text from two web pages (incl. html) and concatenate it into one sample
# that the style-guide cells below use as their input.
# NOTE(review): url_1 is a YouTube watch URL in the urls cell — confirm that
# WebTextExtractor is meant to run on it rather than on an article URL.

web_text_extraction_tool = WebTextExtractor()
web_text_1 = web_text_extraction_tool.run(url_1)
web_text_2 = web_text_extraction_tool.run(url_7)
web_text_3 = web_text_1 + web_text_2

In [None]:
# The bare WritingStyleGuide instance is only used to reach run(); the populated
# guide is the value run() returns.
style_guide_tool = WritingStyleGuide()
style_guide = style_guide_tool.run(web_text_3)

In [None]:
# Show the style guide as the text report built by create().
print(style_guide.create())

In [None]:
# Draft an essay in the style of the sample pages.
# Note: the raw combined page text (web_text_3) is interpolated as the style
# reference here, not the extracted style_guide.
essay = wrapper(
    system_prompt = f"""Write me an essay about learning to build in and with AI after 40 without a strong tech background. 
    Constantly feeling like an imposter but trying to inch forward.
    Follow the style of : {web_text_3}.
    """
)

In [None]:
# Show the generated essay text.
print(essay.response)

In [None]:
# Revise an existing short story using the extracted style guide report.
# NOTE(review): `short_story` is not defined in the visible part of this
# notebook — confirm an earlier cell creates it, otherwise this cell fails on a
# fresh kernel.
updated_story = wrapper(
    system_prompt = f"""
    Revise the provided short story about nanomachines using this style guide to improve the prose and content : {style_guide.create()}.
    Always favour simple words and jargon free prose. Meaning should shine through simplicity and precision of prose and content choices. 
    Remember it is a story not a report. The writing should flow. No bullet points.
    """,
    user_prompt = short_story.response
)

In [None]:
# Show the revised story text.
print(updated_story.response)

## Run

In [16]:
# Get the transcript text for the YouTube video at url_1 and report its size.

transcript_tool = Transcript()
transcript = transcript_tool.run(url_1)
# The token count and input cost appear in the cell output; the return value is
# bound to `_`, presumably so it is not echoed as the cell result.
_ = count_tokens(transcript)

Token count: 19107
Cost for input tokens: $0.095535


In [18]:
# Build the outline from the transcript (only the transcript is passed in this
# cell; no web text is involved).

outline_tool = Outline()
outline = outline_tool.run(transcript)

Cost Details: Input: $0.097795, Output: $0.009825, Total: $0.107620


In [22]:
# Turn the outline (serialised to JSON) into a Sections object. The prompt asks
# for structure only, so every `content` field is expected to stay None.
sections = wrapper(
    system_prompt = "Given this knowledge graph, extract the detailed outline for sections and subsections. Do not fill in the content.",
    user_prompt = outline.response.model_dump_json(),
    response_model = Sections
)

Cost Details: Input: $0.004245, Output: $0.006960, Total: $0.011205


In [24]:
# Display the parsed Sections object.
sections.response

Sections(sections=[Section(id=1, title='Introduction', description='Introduction to LangGraph and its purpose', key_points=['Overview', 'Purpose of LangGraph'], subsections=[], content=None), Section(id=2, title='LangGraph Overview', description='Detailed introduction to LangGraph and its functionalities', key_points=['Definition of LangGraph', 'Similarities with NetworkX', 'Syntax and basic structure'], subsections=[], content=None), Section(id=3, title='Multi-Agent Workflows', description='Different types of multi-agent workflows that can be created using LangGraph', key_points=['Definition of agent-like workflows', 'Definition of cycles', 'Comparison with state machines', 'Nodes and edges in LangGraph'], subsections=[Subsection(id=4, title='Multi-Agent Collaboration', description='Casting multi-agent collaboration using LangGraph', key_points=['Shared state among agents', 'Example of researcher agent and chart generator'], content=None), Subsection(id=6, title='Agent Supervision', d

In [None]:
# Display the outline object the sections were extracted from.
outline.response

In [None]:
# Prompt for the first drafting pass: primary source + metadata + outline in,
# detailed markdown notes out (only the Task 2 output is requested).
# NOTE(review): `web_text`, `metadata` and `outline_webtext` are not defined in
# the visible part of this notebook (the Run section above creates `transcript`
# and `outline`) — confirm these names exist on a fresh kernel.
task = f"""
This is the primary source : {web_text}
---

This is extracted metadata about the primary source : {metadata.response.model_dump_json()}

---
This is the outline of notes about the primary source expressed as a knowledge graph of nodes and edges : {outline_webtext.response.model_dump_json()}

---
Task 1 :

Given this knowledge graph, extract the outline in markdown with appropriate formatting for sections and subsections

---

Task 2 : 

Follow the structure established in the outline, extract and then infer the content from the primary source, the extracted metadata and your knowledge.
For each section or subsection fill in the content in extensive and precise detail.
Write in markdown, with appropriate formatting (bold, italics, headings, bullet points, etc).
Write it as an expert in the themes.

---
ONLY RETURN THE OUTPUT FROM TASK 2. DO NOT ENCLOSE THE OUTPUT IN CODE BLOCKS.
"""

In [None]:
# First drafting pass: the whole instruction is sent as the user prompt, with no
# system prompt and no response model.
content = wrapper(
    user_prompt = task,
)

In [None]:
# Show the first-draft notes.
print(content.response)

In [None]:
# Prompt for the second pass: extend the first draft with more detail, callouts
# and mermaid diagrams while keeping its markdown structure.
# NOTE(review): `web_text` is not defined in the visible part of this notebook —
# confirm it exists on a fresh kernel.
new_task = f"""
This is the primary source : {web_text}

---
This is the initial draft of detailed and structured notes extracted from the transcript: {content.response}

---
Task:

Preserve the markdown structure of the initial draft.
Extend, update and revise the content of each section and subsection to:
    - provide more detail to each section and subsection by incorporating relevant information from the primary source
    - selectively incorporate callouts where appropriate. Use direct quotes in callouts where appropriate.
    - The added detail is of the same writing clarity and quality as previously. 
Extend the content by selectively incorporating one or more mermaid diagrams:
    - Only insert mermaid diagrams where appropriate so that the reader can better understand the note content
    - Extract or infer details for the diagrams from primary source or the revised draft
    - Insert the diagram in code blocks at the appropriate section, subsection or subsubsection of the revised draft
    - The diagrams should be accessible and easy to grasp. They should not be overly complex or complicated.
Return a revised set of notes in markdown with the appropriately inserted mermaid diagram code

---
ONLY RETURN THE OUTPUT FROM THE TASK. DO NOT ENCLOSE THE OUTPUT IN CODE BLOCKS.
"""

In [None]:
# Second pass: produce the revised notes that are later split into site pages.
final = wrapper(
    user_prompt = new_task
)

In [None]:
# Show the revised notes (markdown with mermaid code blocks).
print(final.response)

In [None]:
# Install required packages (you may need to run this in a separate cell)
!pip install mkdocs mkdocs-material

In [None]:
# mkdocs code

import os
import yaml
from pathlib import Path
import re
import subprocess
import sys

def _slugify_heading(title):
    """Turn heading text into a lowercase, hyphen-separated, filesystem-safe stem."""
    # Drop everything except word characters, whitespace and hyphens so that
    # characters such as '/', ':' or '*' never end up in a filename.
    stem = re.sub(r'[^\w\s-]', '', title.lower())
    stem = re.sub(r'[\s-]+', '-', stem).strip('-')
    return stem or 'section'


def split_markdown(content):
    """
    Split the markdown content into separate files based on headers.

    Every level-1 or level-2 heading starts a new file named after the heading;
    deeper headings stay inside the current file. Text before the first heading
    goes to index.md, which is always the first entry (possibly empty). Each
    section's own heading is written back as a level-1 heading so every page
    starts with a title. Lines inside fenced code blocks are never treated as
    headings, so a `# comment` in a code sample does not start a new file.
    Headings that map to an already used filename get a numeric suffix.

    Args:
    content (str): The full markdown content.

    Returns:
    list of tuples: Each tuple contains (filename, content) for each section.
    """
    print("Splitting markdown...")
    heading_re = re.compile(r'^(#{1,2})[ \t]+(\S.*?)\s*$')
    fence_re = re.compile(r'^[ \t]*(`{3,}|~{3,})(.*)$')

    files = []
    used_names = {'index.md'}
    current_file = 'index.md'
    current_lines = []
    open_fence = None  # marker of the fenced block we are inside, if any

    for line in content.split('\n'):
        fence = fence_re.match(line)
        if fence:
            marker, rest = fence.groups()
            if open_fence is None:
                open_fence = marker
            elif (marker[0] == open_fence[0]
                  and len(marker) >= len(open_fence)
                  and not rest.strip()):
                # A closing fence uses the same character, is at least as long
                # as the opening one and carries no info string.
                open_fence = None
            current_lines.append(line)
            continue

        heading = heading_re.match(line) if open_fence is None else None
        if heading is None:
            current_lines.append(line)
            continue

        # A new section begins: flush what was collected for the previous file.
        files.append((current_file, '\n'.join(current_lines).strip()))

        title = heading.group(2)
        stem = _slugify_heading(title)
        candidate = f"{stem}.md"
        suffix = 2
        while candidate in used_names:
            # Repeated titles must not overwrite an earlier page.
            candidate = f"{stem}-{suffix}.md"
            suffix += 1
        used_names.add(candidate)

        current_file = candidate
        current_lines = [f"# {title}"]

    files.append((current_file, '\n'.join(current_lines).strip()))

    print(f"Split into {len(files)} files.")
    return files

def create_nav_structure(files):
    """
    Build the `nav` list for mkdocs.yml from the generated files.

    index.md is always labelled "Home". Every other file is labelled with its
    name minus the last three characters (the ".md" suffix), split on hyphens,
    with each part capitalised.

    Args:
    files (list): List of tuples containing (filename, content) for each section.

    Returns:
    list: One single-entry dict per file, mapping the nav label to the filename.
    """
    def _entry(name):
        # The landing page gets a fixed label instead of a derived one.
        if name == 'index.md':
            return {"Home": "index.md"}
        stem = name[:-3]
        label = ' '.join(map(str.capitalize, stem.split('-')))
        return {label: name}

    return [_entry(name) for name, _ in files]

def create_mkdocs_config(site_name, site_description, nav):
    """
    Assemble the settings dictionary that is later written to mkdocs.yml.

    Only the site name, description and navigation vary; theme, palette,
    markdown extensions, author, copyright and social link are fixed values.

    Args:
    site_name (str): The name of the site.
    site_description (str): A brief description of the site.
    nav (list): The navigation structure for the site.

    Returns:
    dict: MkDocs configuration dictionary.
    """
    def _palette(scheme, toggle_icon, toggle_label):
        # Both colour schemes share the same colours and differ only in the toggle.
        return {
            'scheme': scheme,
            'primary': 'teal',
            'accent': 'amber',
            'toggle': {'icon': toggle_icon, 'name': toggle_label},
        }

    # NOTE(review): the favicon path points at an asset that the site generator
    # in this notebook does not create — confirm it exists under docs/.
    theme = {
        'name': 'material',
        'icon': {'logo': 'material/view-grid-plus'},
        'favicon': 'assets/view-grid-plus-outline-dark.png',
        'palette': [
            _palette('default', 'material/lightbulb', 'Switch to dark mode'),
            _palette('slate', 'material/lightbulb-outline', 'Switch to light mode'),
        ],
        'features': ['content.code.copy', 'content.code.annotate'],
    }

    # NOTE(review): 'format' holds the text of a YAML tag as a plain Python
    # string; a YAML dumper will quote it — confirm whoever writes mkdocs.yml
    # emits it as an unquoted tag.
    mermaid_fence = {
        'name': 'mermaid',
        'class': 'mermaid',
        'format': '!!python/name:pymdownx.superfences.fence_code_format',
    }
    markdown_extensions = [
        'admonition',
        'md_in_html',
        'pymdownx.details',
        {'pymdownx.superfences': {'custom_fences': [mermaid_fence]}},
        'attr_list',
    ]

    social_links = [
        {'icon': 'material/alpha-k-circle-outline', 'link': 'https://ashvin.au'},
    ]

    return {
        'site_name': site_name,
        'site_url': 'https://www.example.com/',
        'site_description': site_description,
        'site_author': 'Ashvin',
        'theme': theme,
        'markdown_extensions': markdown_extensions,
        'copyright': 'Copyright © 2023-2024 Ashvin Parameswaran',
        'extra': {'social': social_links},
        'nav': nav,
    }

def generate_site(markdown_content, site_name, site_description):
    """
    Generate the MkDocs site from the given markdown content.

    Writes one markdown file per section into ./docs (created if missing) and a
    mkdocs.yml next to it. Files already present in ./docs are left in place.

    Args:
    markdown_content (str): The full markdown content for the site.
    site_name (str): The name of the site.
    site_description (str): A brief description of the site.

    Raises:
    Exception: Any error during generation is printed and then re-raised.
    """
    print("Starting site generation...")
    try:
        # Create docs directory
        docs_dir = Path('docs')
        docs_dir.mkdir(exist_ok=True)

        files = split_markdown(markdown_content)
        print(f"Creating {len(files)} files...")
        for filename, content in files:
            (docs_dir / filename).write_text(content, encoding='utf-8')

        print("Creating mkdocs.yml...")
        nav = create_nav_structure(files)
        config = create_mkdocs_config(site_name, site_description, nav)
        config_yaml = yaml.dump(config, allow_unicode=True)
        # The config carries the superfences formatter as the string
        # '!!python/name:…'. yaml.dump quotes any string starting with '!', which
        # would hand MkDocs a plain string where a callable is required and
        # break mermaid fences. Strip the quotes so it is emitted as a YAML tag.
        config_yaml = re.sub(r"""(['"])(!!python/name:[\w.]+)\1""", r"\2", config_yaml)
        with open('mkdocs.yml', 'w', encoding='utf-8') as f:
            f.write(config_yaml)
        print("Site generation complete.")
    except Exception as e:
        print(f"Error during site generation: {e}")
        raise

def launch_mkdocs():
    """
    Launch the MkDocs development server as a background process.

    Server output is redirected to mkdocs-serve.log in the working directory
    instead of to pipes: once the startup check is over nothing would read
    those pipes, and a long-running server could block when their buffer
    fills up.

    Returns:
    subprocess.Popen or None: The running server process (call .terminate() on
    it to stop the server), or None when the command is missing, fails to
    launch, or exits within the 5 second startup window.
    """
    print("Launching MkDocs server...")
    log_path = Path('mkdocs-serve.log')
    try:
        with open(log_path, 'w', encoding='utf-8') as log_file:
            # The child keeps its own handle to the log file after this block
            # closes ours.
            process = subprocess.Popen(['mkdocs', 'serve'],
                                       stdout=log_file,
                                       stderr=subprocess.STDOUT)

        # Wait for a short time to see if the server starts successfully
        try:
            process.wait(timeout=5)
        except subprocess.TimeoutExpired:
            print("MkDocs server is running. Access it at http://127.0.0.1:8000/")
            print(f"Server output is written to {log_path}.")
            print("To stop the server, call .terminate() on the returned process "
                  "or interrupt the kernel.")
            return process

        # The process ended during startup: show what it reported.
        print("MkDocs output:")
        print(log_path.read_text(encoding='utf-8', errors='replace'))

    except FileNotFoundError:
        print("Error: 'mkdocs' command not found. Make sure MkDocs is installed and in your PATH.")
    except Exception as e:
        print(f"Error launching MkDocs: {e}")
        print("You can try running 'mkdocs serve' in the terminal.")
    return None



In [None]:
# Main execution: build the site from the revised notes and serve it locally.
# NOTE(review): the site name/description below are fixed strings — confirm they
# match the source that `final` was generated from.
markdown_content = final.response
site_name = "Applied LLMs"
site_description = "Tactical, Operational and Strategic Tips"


# generate_site already prints and re-raises its own errors; this handler prints
# the message once more and swallows the exception, so the cell finishes without
# a traceback and the server is not launched after a failed build.
try:
    # Generate the site
    generate_site(markdown_content, site_name, site_description)

    # Launch MkDocs server
    launch_mkdocs()
except Exception as e:
    print(f"An error occurred: {e}")

In [None]:
# Show the revised notes again for comparison with the generated site.
print(final.response)