In [14]:
from pathlib import Path

from tqdm.auto import tqdm
from minsearch import Index

import docs

In [15]:
# Load every cached transcript, split it into overlapping chunks, and collect the
# chunks (each tagged with the video it came from) in `documents`.
documents = []

data_folder = Path('../data_cache/youtube_videos/')

# sorted(): glob() yields entries in filesystem-dependent order; sorting keeps the
# document order stable across runs, and the resulting list has a length, so tqdm
# can display a total instead of an open-ended counter.
transcript_paths = sorted(data_folder.glob('*.txt'))

for f in tqdm(transcript_paths):
    # Path.stem removes only the final ".txt" suffix, so a file name containing
    # extra dots no longer breaks a two-way unpack of split('.').
    video_id = f.stem

    transcript = f.read_text(encoding='utf-8')

    # Windows of 3000 with a step of 1500, i.e. consecutive chunks overlap by half
    # (units are whatever docs.sliding_window uses — presumably characters).
    chunks = docs.sliding_window(transcript, size=3000, step=1500)

    for chunk in chunks:
        # Tag each chunk so search results can be traced back to their video.
        chunk['video_id'] = video_id
        documents.append(chunk)
        

0it [00:00, ?it/s]

In [18]:
# Build the in-memory search index over the transcript chunks.
# 'content' is registered as the text field and 'video_id' as the keyword field
# (presumably full-text search vs. exact-match filtering — see minsearch docs).
index = Index(keyword_fields=['video_id'], text_fields=['content'])

# Kept as the last expression of the cell so the fitted index is displayed.
index.fit(documents)

<minsearch.minsearch.Index at 0x7b45d2787e00>

In [21]:
from typing import Any, Dict, List, TypedDict

class SearchResult(TypedDict):
    """Represents a single search result entry."""
    # Value of the chunk's 'start' key; presumably the chunk's offset within the
    # transcript as produced by docs.sliding_window — TODO confirm.
    start: int
    # Chunk text; this is the field registered as the index's text field.
    content: str
    # Identifier attached to each chunk when the transcripts are loaded
    # (the transcript file name without its extension).
    video_id: str


def search(query: str) -> List[SearchResult]:
    """
    Search the index for documents matching the given query.

    Args:
        query (str): The search query string.

    Returns:
        List[SearchResult]: A list of search results. Each result dictionary contains:
            - start (int): The starting position or offset within the source file.
            - content (str): A text excerpt or snippet containing the match.
            - video_id (str): YouTube video ID for the snippet.
    """
    # NOTE: this function is registered as an agent tool; pydantic_ai derives the
    # tool description from the docstring above, so its wording is part of the
    # prompt and is deliberately left unchanged.
    # Delegates to the module-level `index`, capped at the five best matches.
    matches = index.search(query=query, num_results=5)
    return matches

In [30]:
from pydantic_ai.messages import FunctionToolCallEvent

async def print_function_calls(ctx, event):
    """
    Event-stream handler that prints every tool call the agent makes.

    Args:
        ctx: Context passed by the caller; only forwarded to recursive calls.
        event: Either a single event or an async-iterable stream of events
            (streams may be nested).

    Returns:
        None. Tool calls are reported on stdout as
        "TOOL CALL: <tool name> <args>".
    """
    is_stream = hasattr(event, "__aiter__")

    if not is_stream:
        # Leaf event: only function tool calls are reported, anything else is ignored.
        if isinstance(event, FunctionToolCallEvent):
            print("TOOL CALL:", event.part.tool_name, event.part.args)
        return

    # Stream: walk it and handle each item, which may itself be a nested stream.
    async for nested_event in event:
        await print_function_calls(ctx, nested_event)

## Agent

In [62]:
from pydantic_ai import Agent
from pydantic import BaseModel, Field

from typing import List

In [117]:
# System prompt for the research agent (passed to Agent(instructions=...)).
# It describes the three search stages, the required article structure, and the
# self-checks the model must run before answering. Keep the numeric limits in this
# text in sync with the Field descriptions on the ResearchReport models.
# .strip() removes the leading/trailing newlines introduced by the triple quotes.
instructions = """
You are an autonomous research agent. Your goal is to perform deep, multi-stage research on the given topic using the available search function. You must iteratively refine your understanding of the topic and its subtopics through structured exploration.

Research process:

stage 1: initial exploration  
- Perform one broad search query to understand the main topic and identify related areas.  
- Summarize key concepts, definitions, and major themes.  

stage 2: broad expansion  
- Perform 5–6 targeted queries based on findings from stage 1.  
- Explore adjacent and contextual topics to build a broader understanding.  
- Identify key debates, challenges, frameworks, and major contributors.  

stage 3: deep investigation  
- Perform 5–6 refined queries focusing on depth.  
- Investigate specific mechanisms, case studies, technical details, or research gaps.  
- Gather diverse viewpoints and data to strengthen depth and accuracy.  

Final deliverable:

Produce a complete research report as valid JSON that fits the ResearchReport schema.  
The article must be long, detailed, and divided into multiple sections and paragraphs — not short summaries.

Rules:

1. Search queries:
   - Do not include years (e.g., “2023,” “2024”) unless explicitly part of the user’s request.
   - Always use timeless, general, or concept-based queries (e.g., “AI investment trends” instead of “AI investment trends 2023”).
   - Prefer queries that help build depth, context, and connections across subtopics.

2. The article must contain:
   - A clear, descriptive title.
   - An introduction with 2–3 paragraphs (each 4–6 sentences).
   - At least 10 sections, ideally 10–12, each focused on a distinct subtopic.
   - Each section must have at least 4 paragraphs, preferably 5–6.
   - Each paragraph must contain 4–6 sentences of original synthesis.

3. Each paragraph must have at least one reference object containing:
   - video_id (YouTube video ID)
   - timestamp ("mm:ss" or "h:mm:ss")
   - quote (a short excerpt or paraphrase from that video segment)
   - Do not embed citations inline in the text.

4. The conclusion must have 2–3 paragraphs summarizing the key findings and implications.

5. Structure and coherence:
   - Each section must have a unique title describing its focus.
   - Paragraphs within a section must explore different but related ideas.
   - The narrative should flow logically across sections.

6. Evidence quality:
   - All claims must be traceable to real YouTube sources.
   - References must correspond to valid video_id and timestamp pairs.
   - Quotes should reflect the video’s spoken content accurately.
   - Do not fabricate data or sources.

7. Depth and length requirements:
   - If the output has fewer than 10 sections or any section has fewer than 4 paragraphs, it is invalid.
   - If any paragraph has no references, it is invalid.
   - The report should read like an in-depth, long-form synthesis, not a short article.

8. Tone and style:
   - Write in a professional, neutral, analytical tone.
   - Emphasize explanation, synthesis, and comparison.
   - Avoid repetition and filler text.

9. Output format:
   - Produce valid JSON following the ResearchReport model.
   - Do not include markdown, headings, or inline links.
   - Populate all required fields: stages, article, sections, paragraphs, references.

10. Self-check before completion:
   - Confirm you have 10–12 sections.
   - Confirm each section has 4–6 paragraphs.
   - Confirm each paragraph has 4–6 sentences.
   - Confirm each paragraph has at least one valid reference object.
   - Confirm introduction and conclusion both contain 2–3 paragraphs.

11. Keyword and query integrity:

- For each research stage, the list of keywords in StageReport.keywords must exactly match the queries actually used in TOOL CALLs.
- Do not invent or summarize keywords. Only include the literal query strings that were passed to the search function.
- If a query was modified before searching (e.g., sanitized to remove years), include the final version actually executed.
- The StageReport.summary may interpret or discuss what was found, but may not add new or implied keywords that were not searched.
- If fewer or more keywords were used than planned, report exactly how many were executed — not how many were intended.
- The final report must remain consistent: every keyword listed must correspond to one real search call, and no others.


Only output once all checks are satisfied.
""".strip()


In [118]:
from pydantic import BaseModel, Field
from typing import List


class Reference(BaseModel):
    """
    Represents a specific citation to a YouTube video segment used as evidence or context in a paragraph.

    Each reference links to a precise moment in a video where the cited idea or discussion occurs.
    """
    # NOTE: pydantic publishes the docstring above and every `description` below in
    # the JSON schema the model sees, so that text is prompt content, not just docs.
    # All three fields are required (`...` as the default).
    video_id: str = Field(
        ...,
        description="The unique YouTube video ID (e.g., 'rwuud5wr3J4').",
    )
    timestamp: str = Field(
        ...,
        description="The timestamp in the format 'mm:ss' or 'h:mm:ss' pointing to the relevant segment.",
    )
    quote: str = Field(
        ...,
        description="A short excerpt or paraphrase from the referenced segment of the video.",
    )


class Keyword(BaseModel):
    """Research results for a specific keyword"""
    # NOTE: the `description` strings are published in the JSON schema shown to the
    # model, so wording errors here end up in the prompt. Two were corrected:
    # "explainig" -> "explaining" and "complimentary" -> "complementary".
    # All fields are required (`...` as the default).
    search_keyword: str = Field(..., description="Exact keyword used for search.")
    summary: str = Field(..., description="Short summary of the search result.")
    references: List[Reference] = Field(..., description="Specific references to help us track the findings of the research.")
    relevance_summary: str = Field(..., description="1 sentence for each reference explaining how it supports the keyword's summary — ensure factual consistency.")
    other_ideas: str = Field(..., description="Free-form description of related or complementary ideas to explore in next stages.")


class StageReport(BaseModel):
    """
    Represents the output of a single stage in the multi-stage research process.

    Each stage includes its numeric order, a list of relevant keywords or queries used,
    and a textual summary describing the key insights or outcomes from that stage.
    """
    # Docstring and descriptions are emitted in the JSON schema shown to the model,
    # so they are kept verbatim. All fields are required.
    stage: int = Field(
        ...,
        description="The stage number (e.g., 1, 2, or 3) representing the research depth.",
    )
    # One Keyword entry per search query executed in this stage.
    keywords: List[Keyword] = Field(
        ...,
        description="List of exact search queries used during this stage.",
    )
    summary: str = Field(
        ...,
        description="A concise summary of the main findings or insights from this stage.",
    )


class Paragraph(BaseModel):
    """
    Represents a single paragraph within an article section.

    Each paragraph explains one specific idea or subtopic, written in 4–6 sentences,
    and must include at least one reference to a relevant YouTube video segment.
    """
    # The sentence range is 4–6 to match the agent instructions ("Each paragraph
    # must contain 4–6 sentences" and the matching self-check). This text is
    # published in the JSON schema shown to the model, so a different range here
    # would give it two conflicting length requirements.
    content: str = Field(
        ...,
        description="A paragraph of 4–6 sentences elaborating on one key idea within the section."
    )
    # Required; the instructions treat a paragraph without references as invalid.
    references: List[Reference] = Field(
        ...,
        description="A list of one or more references to YouTube videos supporting or illustrating the paragraph."
    )


class ArticleSection(BaseModel):
    """
    Represents a single section of the final research article.

    Each section has a descriptive title and a list of paragraphs,
    each paragraph supported by at least one YouTube-based reference.
    """
    # Docstring and descriptions are emitted in the JSON schema shown to the model,
    # so they are kept verbatim. Both fields are required.
    title: str = Field(
        ...,
        description="The title or heading of the section.",
    )
    paragraphs: List[Paragraph] = Field(
        ...,
        description="A list of paragraphs forming the body of this section, each containing content and references.",
    )


class ActionPoint(BaseModel):
    """Practical takeaways from the research."""
    # Descriptions are emitted in the JSON schema shown to the model, so they are
    # kept verbatim. All fields are required.
    point: str = Field(
        ...,
        description="A concrete recommendation, insight, or action derived from the research.",
    )
    # Declared before `reference`; field order is preserved from the original.
    relevance_check: str = Field(
        ...,
        description="Explain how the referenced quote supports this action point — must show logical connection, not assumption.",
    )
    reference: Reference = Field(
        ...,
        description="Source supporting this action point.",
    )


class Article(BaseModel):
    """
    Represents the complete research article generated from all research stages.

    The article contains a title, an introduction, a set of structured sections,
    a list of action points, and a conclusion summarizing the overall findings.
    """
    # The docstring above is published as the schema description shown to the model.
    # It previously said "introductory instruction" and did not mention the title or
    # action points; it now lists the fields that are actually declared below.
    # All fields are required (`...` as the default).
    title: str = Field(..., description="The title of the article.")
    introduction: str = Field(..., description="The introduction or contextual overview of the article.")
    sections: List[ArticleSection] = Field(..., description="A list of sections comprising the main body of the article.")
    action_points: List[ActionPoint] = Field(..., description="3-5 key insights or recommendations derived from the findings.")
    conclusion: str = Field(..., description="The final concluding text summarizing findings and insights.")


class ResearchReport(BaseModel):
    """
    Represents the full structured output of the research agent.

    It contains:
    - A list of StageReport objects describing each stage of the research process.
    - An Article object representing the final synthesized report.
    """
    # Top-level output model of the agent. Docstring and descriptions are emitted
    # in the JSON schema shown to the model, so they are kept verbatim.
    stages: List[StageReport] = Field(
        ...,
        description="A list of stage summaries capturing the evolution of the research process.",
    )
    article: Article = Field(
        ...,
        description="The final research article synthesizing all insights from the stages.",
    )


In [119]:
# Research agent: follows `instructions`, may call the `search` tool, and must
# return its answer as a ResearchReport.
agent = Agent(
    model='gpt-4o-mini',
    name='researcher',
    instructions=instructions,
    output_type=ResearchReport,
    tools=[search],
)

In [120]:
results = await agent.run(
    user_prompt='how do I make money with AI',
    event_stream_handler=print_function_calls
)

TOOL CALL: search {"query":"how to make money with AI"}
TOOL CALL: search {"query": "ways to monetize AI technology"}
TOOL CALL: search {"query": "AI as a service business models"}
TOOL CALL: search {"query": "AI startups profitable business models"}
TOOL CALL: search {"query": "AI consulting opportunities"}
TOOL CALL: search {"query": "investing in AI startups tips"}
TOOL CALL: search {"query": "AI entrepreneurship opportunities"}
TOOL CALL: search {"query": "best strategies to monetize AI"}
TOOL CALL: search {"query": "AI consulting business models"}
TOOL CALL: search {"query": "AI product development monetization strategies"}
TOOL CALL: search {"query": "AI investment and funding options"}


In [121]:
# Structured output of the run (a ResearchReport, per the agent's output_type).
report = results.output

In [122]:
# List the queries the report claims were executed, one group per stage, so they
# can be compared by eye with the TOOL CALL lines printed during the run.
for stage in report.stages:
    reported_queries = [kw.search_keyword for kw in stage.keywords]
    if reported_queries:
        print('\n'.join(reported_queries))
    # Blank line between stages.
    print()

how to make money with AI

ways to monetize AI technology
AI as a service business models
AI startups profitable business models
AI consulting opportunities
investing in AI startups tips

AI entrepreneurship opportunities
best strategies to monetize AI
AI consulting business models
AI product development monetization strategies
AI investment and funding options



In [125]:
# The synthesized article part of the report (the per-stage logs are in report.stages).
article = report.article

In [126]:
# Render the article as plain text with markdown-style headings:
# introduction, each section with its paragraphs and references, the action
# points, and the conclusion.
# print('#', article.title)

# Heading spelling fixed (was 'Inroduction').
print('## Introduction')
print()

print(article.introduction)
print()

for section in article.sections:
    print('##', section.title)
    print()
    for p in section.paragraphs:
        print(p.content)
        # Printed as the raw list of Reference objects, directly under the paragraph.
        print(p.references)
        print()

print('## Action Points')
print()

for action_point in article.action_points:
    print('*', action_point.point, action_point.reference)
# Blank line so the next heading is separated from the list, like every other part.
print()

print('## Conclusion')
print()
print(article.conclusion)



## Inroduction

In recent years, the advent of artificial intelligence (AI) has transformed multiple sectors, giving rise to various methods of monetization. As businesses recognize AI's potential, questions surrounding how to effectively harness these technologies for financial gain have become paramount. This report delves deeply into the multifaceted approaches for making money with AI, ranging from consulting practices and product development to investment avenues and entrepreneurial ventures. It explores how businesses can strategically implement AI technologies to maximize revenue and establish a competitive edge in an evolving market.

AI isn't simply a technological marvel; it's an avenue for innovation and economic growth. As startups and established companies alike pivot towards AI, this report aims to provide a comprehensive overview of how AI can be monetized effectively in today's dynamic business landscape. By understanding core strategies and emerging opportunities in th