In [24]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [25]:
from openai import OpenAI
import os
from dotenv import load_dotenv

# Load environment variables if not using dirdotenv automatically
load_dotenv()

# The client is built with no explicit arguments, so any credentials must come
# from the environment prepared above (presumably OPENAI_API_KEY — confirm
# against your .env). `openai_client` is the shared handle used by the LLM
# helper functions defined in later cells.
openai_client = OpenAI()
print("OpenAI client initialized")

OpenAI client initialized


In [26]:
from download_books import download_books

In [27]:
# Run the download function
# Per the recorded output below, it fetches the book list (books.csv) from
# GitHub and skips any PDF that already exists locally, so re-running is cheap.
download_books()

Fetching books list from https://raw.githubusercontent.com/alexeygrigorev/ai-engineering-buildcamp-code/main/01-foundation/homework/books.csv...
File thinkpython2.pdf already exists. Skipping download.
File thinkdsp.pdf already exists. Skipping download.
File thinkcomplexity2.pdf already exists. Skipping download.
File thinkjava2.pdf already exists. Skipping download.
File PhysicalModelingInMatlab4.pdf already exists. Skipping download.
File thinkos.pdf already exists. Skipping download.
File Think-C.pdf already exists. Skipping download.


In [28]:
# Q1 answer: 16268 lines

# Import extraction logic from external module
from convert_books import convert_books_to_markdown

# Run the conversion
# Per the recorded output below, books whose .md file already exists are
# skipped, and the call also reports the line count of thinkpython2.md
# (the Q1 answer).
convert_books_to_markdown()

Skipping thinkcomplexity2.pdf, thinkcomplexity2.md already exists.
Skipping thinkpython2.pdf, thinkpython2.md already exists.
Skipping PhysicalModelingInMatlab4.pdf, PhysicalModelingInMatlab4.md already exists.
Skipping thinkjava2.pdf, thinkjava2.md already exists.
Skipping thinkdsp.pdf, thinkdsp.md already exists.
Skipping thinkos.pdf, thinkos.md already exists.
Skipping Think-C.pdf, Think-C.md already exists.

Line count for Think Python 2e (books_text/thinkpython2.md): 16268


In [29]:
from chunk_books import get_chunks
from pathlib import Path

# Get all markdown files
# sorted() fixes the processing order, so the per-book report and the order of
# chunks in `all_chunks` are stable between runs.
book_files = sorted(Path("books_text").glob("*.md"))

# Collect all chunks from all books
all_chunks = []

for book_file in book_files:
    try:
        # Only the bare file name is passed (not the full path); presumably
        # get_chunks resolves it against the books_text directory itself —
        # confirm in chunk_books.
        chunks = get_chunks(book_file.name)
        n_chunks = len(chunks)
        print(f"Number of chunks for {book_file.stem}: {n_chunks}")
        all_chunks.extend(chunks)
    except Exception as e:
        # Best-effort: a failing book is reported and skipped, so the total
        # below can be lower than expected without the cell raising.
        print(f"Error processing {book_file.name}: {e}")

print(f"\nTotal chunks collected: {len(all_chunks)}")

Number of chunks for PhysicalModelingInMatlab4: 106
Number of chunks for Think-C: 109
Number of chunks for thinkcomplexity2: 130
Number of chunks for thinkdsp: 86
Number of chunks for thinkjava2: 216
Number of chunks for thinkos: 62
Number of chunks for thinkpython2: 214

Total chunks collected: 923


In [30]:
# Q2 answer: 214 chunks for Think Python

def prepare_documents(chunks):
    """
    Flatten raw chunks into records that can be indexed.

    Args:
        chunks: iterable of dicts, each with a 'content' entry (a list of
            strings) and a 'source' entry.

    Returns:
        A new list with one dict per chunk, in the same order. Each dict has
        'content' (the chunk's strings joined with newlines) and 'source'
        (copied unchanged). Any other keys on the chunk are dropped.
    """
    return [
        {
            "content": "\n".join(item["content"]),
            "source": item["source"],
        }
        for item in chunks
    ]

# Prepare all documents
# prepare_documents emits exactly one document per chunk, so this count should
# equal the total number of chunks collected in the previous cell.
documents = prepare_documents(all_chunks)
print(f"Prepared {len(documents)} documents for indexing")

Prepared 923 documents for indexing


In [31]:
from minsearch import Index

# Create and fit the index
# 'content' is registered as a text field and 'source' as a keyword field;
# these match the two keys produced by prepare_documents.
index = Index(
    text_fields=["content"],
    keyword_fields=["source"]
)

index.fit(documents)

# Note: the number printed is len(documents) — the size of the input list —
# not a value read back from the fitted index.
print(f"\nQ3 answer: Indexed {len(documents)} documents (chunks)")


Q3 answer: Indexed 923 documents (chunks)


In [32]:
# Q4: Search for 'python function definition'
results = index.search("python function definition", num_results=5)

# results[0] assumes the search returned at least one hit; an empty result
# list would raise IndexError here.
print("Top search result:")
print(f"Source: {results[0]['source']}")
print(f"\nContent preview (first 200 chars):\n{results[0]['content'][:200]}...")

print("\n" + "="*50)
print("All 5 results:")
# Previews are raw character slices, so embedded newlines in the chunk text
# carry through and a single preview can span several output lines.
for i, result in enumerate(results, 1):
    print(f"\n{i}. Source: {result['source']}")
    print(f"   Content preview: {result['content'][:100]}...")

Top search result:
Source: /Users/realmistic/Documents/ai-buildcamp/01-foundation-hw/books_text/thinkpython2.md

Content preview (first 200 chars):
when you are comfortable with Python, I’ll make suggestions for installing Python on your
computer.
There are a number of web pages you can use to run Python. If you already have a fa-
vorite, go ahea...

All 5 results:

1. Source: /Users/realmistic/Documents/ai-buildcamp/01-foundation-hw/books_text/thinkpython2.md
   Content preview: when you are comfortable with Python, I’ll make suggestions for installing Python on your
computer.
...

2. Source: /Users/realmistic/Documents/ai-buildcamp/01-foundation-hw/books_text/thinkpython2.md
   Content preview: .
.
.
.
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 206
. . . . . . . . . . . ...

3. Source: /Users/realmistic/Documents/ai-buildcamp/01-foundation-hw/books_text/thinkpython2.md
   Content preview: cording to Larry Greenﬁeld, “One of Linus’s earlier projects was a progra

In [33]:
# Q5: Full RAG Pipeline
import json

# System message used for every LLM call in this notebook. It is used exactly
# as written, so the leading and trailing newlines of the triple-quoted
# literal are part of the message.
instructions = """
You're a course assistant, your task is to answer the QUESTION from the
course students using the provided CONTEXT
"""

# User-message template with {question} and {context} placeholders, filled in
# by build_prompt. Unlike `instructions`, surrounding whitespace is stripped.
prompt_template = """
<QUESTION>
{question}
</QUESTION>

<CONTEXT>
{context}
</CONTEXT>
""".strip()

def build_prompt(question, search_results):
    """
    Render the user prompt for one question and its retrieved context.

    Args:
        question: text placed in the {question} slot of prompt_template.
        search_results: JSON-serializable search hits; dumped with a
            two-space indent and placed in the {context} slot.

    Returns:
        The filled-in template with surrounding whitespace stripped.
    """
    # NOTE(review): json.dumps escapes non-ASCII characters by default, so
    # typographic quotes and ligatures in the book text reach the model as
    # backslash-u escape sequences. Left unchanged to keep the prompt (and the
    # measured token counts) exactly as they are.
    serialized_hits = json.dumps(search_results, indent=2)
    filled = prompt_template.format(question=question, context=serialized_hits)
    return filled.strip()

def search(question, num_results=5):
    """
    Query the notebook-level minsearch `index`.

    Args:
        question: free-text query string.
        num_results: maximum number of hits to request. Defaults to 5, which
            is the value that was previously hard-coded, so existing
            single-argument calls behave exactly as before.

    Returns:
        Whatever `index.search` returns for the query (the list of matching
        documents).
    """
    return index.search(question, num_results=num_results)

def llm(user_prompt, instructions, model='gpt-4o-mini'):
    """
    Run a single system + user exchange through the chat completions API.

    Args:
        user_prompt: content of the user message.
        instructions: content of the system message.
        model: model name passed to the API.

    Returns:
        A 4-tuple (answer, input_tokens, output_tokens, total_tokens): the
        first choice's message content followed by the prompt, completion and
        total token counts reported in the response's usage data.
    """
    completion = openai_client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": instructions},
            {"role": "user", "content": user_prompt},
        ],
    )

    # Read the usage figures first, then the reply text.
    usage = completion.usage
    token_counts = (
        usage.prompt_tokens,
        usage.completion_tokens,
        usage.total_tokens,
    )
    reply_text = completion.choices[0].message.content

    return (reply_text, *token_counts)

def rag(query):
    """
    Answer `query` with the plain-text RAG pipeline: search, build prompt, call llm.

    Returns:
        The 4-tuple (answer, input_tokens, output_tokens, total_tokens)
        produced by `llm` for the prompt built from the search hits.
    """
    prompt = build_prompt(query, search(query))
    reply, n_input, n_output, n_total = llm(prompt, instructions)
    return reply, n_input, n_output, n_total

# Run RAG for "python function definition"
# This performs a live API call through rag(); the answer text can differ
# between runs.
query = "python function definition"
answer, input_tokens, output_tokens, total_tokens = rag(query)

print("RAG Response:")
print("="*80)
print(answer)
print("="*80)
print(f"\nToken Usage:")
print(f"  Input tokens: {input_tokens}")
print(f"  Output tokens: {output_tokens}")
print(f"  Total tokens: {total_tokens}")
print(f"\nQ5 answer: {input_tokens} input tokens")

# Store for comparison in Q6
# The Q6 cell reads this global, so this cell must run before it.
q5_input_tokens = input_tokens

RAG Response:
In Python, a function is defined using the `def` keyword, followed by the function name and parentheses containing any parameters. The body of the function, which contains the code to be executed, is indented below the definition line. Here's the basic structure of a function definition:

```python
def function_name(parameters):
    # Body of the function
    # Perform operations
    return value  # Optional
```

Here's an example of a simple function that adds two numbers:

```python
def add_numbers(a, b):
    sum = a + b
    return sum
```

You can call this function using its name and passing the required arguments:

```python
result = add_numbers(5, 3)  # result will be 8
```

Remember that functions can return a value using the `return` statement, and they can also accept parameters, which are values you provide when you call the function.

Token Usage:
  Input tokens: 6991
  Output tokens: 189
  Total tokens: 7180

Q5 answer: 6991 input tokens


In [34]:
# Q6: Structured Output with Pydantic
from pydantic import BaseModel, Field
from typing import Literal

# Schema for the structured RAG answer. It is passed as `response_format` in
# llm_structured, which returns the parsed instance.
# NOTE(review): documented with `#` comments on purpose — a class docstring
# would presumably be copied into the JSON schema sent with the request and
# could shift the input-token count measured for Q6; confirm before adding one.
# NOTE(review): `confidence` is described as 0.0 to 1.0, but nothing in this
# model enforces that range.
class RAGResponse(BaseModel):
    answer: str = Field(description="The main answer to the user's question in markdown")
    found_answer: bool = Field(description="True if relevant information was found in the documentation")
    confidence: float = Field(description="Confidence score from 0.0 to 1.0")
    confidence_explanation: str = Field(description="Explanation about the confidence level")
    answer_type: Literal["how-to", "explanation", "troubleshooting", "comparison", "reference"] = Field(description="The category of the answer")
    followup_questions: list[str] = Field(description="Suggested follow-up questions")

def llm_structured(user_prompt, instructions, model='gpt-4o-mini'):
    """
    Run a system + user exchange and have the reply parsed into RAGResponse.

    Args:
        user_prompt: content of the user message.
        instructions: content of the system message.
        model: model name passed to the API.

    Returns:
        A 4-tuple (parsed_response, input_tokens, output_tokens,
        total_tokens): the first choice's `parsed` value followed by the
        prompt, completion and total token counts from the response's usage
        data.
    """
    conversation = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": user_prompt},
    ]
    completion = openai_client.beta.chat.completions.parse(
        response_format=RAGResponse,
        messages=conversation,
        model=model,
    )

    # Token accounting is read before the parsed payload.
    usage = completion.usage
    n_input = usage.prompt_tokens
    n_output = usage.completion_tokens
    n_total = usage.total_tokens

    first_choice = completion.choices[0]
    return first_choice.message.parsed, n_input, n_output, n_total

def rag_structured(query):
    """
    Answer `query` with the structured RAG pipeline: search, build prompt, call llm_structured.

    Returns:
        The 4-tuple (response, input_tokens, output_tokens, total_tokens)
        produced by `llm_structured` for the prompt built from the search hits.
    """
    hits = search(query)
    parsed, n_input, n_output, n_total = llm_structured(
        build_prompt(query, hits),
        instructions,
    )
    return parsed, n_input, n_output, n_total

# Run structured RAG for "python function definition"
# Same query as Q5, so the only change to the request is the structured
# response format. This is a second live API call.
query = "python function definition"
structured_response, structured_input_tokens, structured_output_tokens, structured_total_tokens = rag_structured(query)

print("Structured RAG Response:")
print("="*80)
print(f"Answer: {structured_response.answer}")
print(f"\nFound Answer: {structured_response.found_answer}")
print(f"Confidence: {structured_response.confidence}")
print(f"Confidence Explanation: {structured_response.confidence_explanation}")
print(f"Answer Type: {structured_response.answer_type}")
print(f"\nFollow-up Questions:")
for i, q in enumerate(structured_response.followup_questions, 1):
    print(f"  {i}. {q}")
print("="*80)

print(f"\nToken Usage (Structured):")
print(f"  Input tokens: {structured_input_tokens}")
print(f"  Output tokens: {structured_output_tokens}")
print(f"  Total tokens: {structured_total_tokens}")

# Compare with Q5
# Relies on `q5_input_tokens` set by the Q5 cell; running this cell on a fresh
# kernel without Q5 raises NameError. The difference is structured minus plain.
token_difference = structured_input_tokens - q5_input_tokens
print(f"\nComparison with Q5:")
print(f"  Q5 input tokens: {q5_input_tokens}")
print(f"  Q6 input tokens: {structured_input_tokens}")
print(f"  Difference: {token_difference} tokens")
print(f"\nQ6 answer: {token_difference} more input tokens")

Structured RAG Response:
Answer: In Python, a function definition is a special statement that defines a new function, specifying its name, parameters, and the block of code that makes up its body. Here’s a simple example of a function definition:

```python
def greet(name):
    print(f'Hello, {name}!')
```

### Breakdown of the Example:
- `def`: This keyword indicates the start of a function definition.
- `greet`: This is the name of the function. You can call this function by using its name later in your code.
- `(name)`: This is the parameter list. In this case, `name` is a parameter that the function accepts when called. 
- The indented lines under the `def` statement form the body of the function. This code runs whenever the function is called.

### Calling the Function:
You can call the function like this:
```python
greet('Alice')  # Output: Hello, Alice!
```

Functions can also return values, accept multiple parameters, and include default values. The function definition is essen