In [21]:
!curl -O https://raw.githubusercontent.com/alexeygrigorev/ai-engineering-buildcamp-code/main/01-foundation/homework/books.csv


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   879  100   879    0     0   8377      0 --:--:-- --:--:-- --:--:--  8451


# Download every PDF listed in books.csv

In [11]:
import pandas as pd
import requests
import os
import re


In [25]:

def sanitize_filename(filename):
    """Strip characters that are generally not allowed in filenames.

    Every other character (spaces, ``+``, dots, ...) is returned untouched.
    """
    forbidden_chars = re.compile(r'[\\/*?:"<>|]')
    return forbidden_chars.sub("", filename)

csv_file = 'books.csv'
output_dir = 'books'
# Seconds to wait on the server before giving up on a download; without a
# timeout a stalled connection would hang this cell indefinitely.
request_timeout = 30

# Create the directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
    print(f"Created directory: {output_dir}")

# Load the CSV. Nothing below can work without it, so report the problem and
# re-raise instead of carrying on with an undefined (or stale) `df`.
try:
    df = pd.read_csv(csv_file)
except Exception as e:
    print(f"Error reading CSV: {e}")
    raise

# Iterate through the rows and download each PDF.
# The columns are zipped directly so no row-number variable is bound: a loop
# variable called `index` would clobber the search index that later cells
# store under that name if this cell were ever re-run.
for title, pdf_url in zip(df['title'], df['pdf_url']):
    # Clean title for filename
    clean_title = sanitize_filename(title)
    filename = f"{clean_title}.pdf"
    filepath = os.path.join(output_dir, filename)

    print(f"Downloading '{title}' from {pdf_url}...")

    try:
        # The context manager releases the streamed connection even when the
        # status check or the file write fails part-way through.
        with requests.get(pdf_url, stream=True, timeout=request_timeout) as response:
            response.raise_for_status()  # Check for HTTP errors

            with open(filepath, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        print(f"Successfully saved to {filepath}")
    except Exception as e:
        # Best effort: report the failure and move on to the next book.
        print(f"Failed to download {title}: {e}")

Downloading 'Think Python 2e' from http://greenteapress.com/thinkpython2/thinkpython2.pdf...
Successfully saved to books/Think Python 2e.pdf
Downloading 'Think DSP' from http://greenteapress.com/thinkdsp/thinkdsp.pdf...
Successfully saved to books/Think DSP.pdf
Downloading 'Think Complexity 2e' from http://greenteapress.com/complexity2/thinkcomplexity2.pdf...
Successfully saved to books/Think Complexity 2e.pdf
Downloading 'Think Java 2e' from http://greenteapress.com/thinkjava7/thinkjava2.pdf...
Successfully saved to books/Think Java 2e.pdf
Downloading 'Physical Modeling in MATLAB' from https://github.com/AllenDowney/PhysicalModelingInMatlab/raw/master/PhysicalModelingInMatlab4.pdf...
Successfully saved to books/Physical Modeling in MATLAB.pdf
Downloading 'Think OS' from http://greenteapress.com/thinkos/thinkos.pdf...
Successfully saved to books/Think OS.pdf
Downloading 'Think C++' from https://raw.githubusercontent.com/tscheffl/ThinkC/refs/heads/master/PDF/Think-C.pdf...
Successfully 

# Convert PDFs to markdown files

In [29]:
!uv add 'markitdown[pdf]'

[2mResolved [1m134 packages[0m [2min 4ms[0m[0m
[2mAudited [1m131 packages[0m [2min 2ms[0m[0m


In [35]:
from markitdown import MarkItDown
# Converter instance. The conversion cell below imports MarkItDown again and
# creates its own `md`, so this cell is redundant on a top-to-bottom run.
md = MarkItDown()


In [12]:
import os
from markitdown import MarkItDown

# Input folder (downloaded PDFs) and output folder (extracted markdown)
pdf_dir = "books"
output_dir = "books_text"

# Make sure the destination exists before anything is written
os.makedirs(output_dir, exist_ok=True)

# One converter instance is reused for every file
md = MarkItDown()

# Visit every directory entry, skipping anything that is not a PDF
for filename in os.listdir(pdf_dir):
    if not filename.lower().endswith(".pdf"):
        continue

    # Extract the text content of the PDF
    markdown_text = md.convert(os.path.join(pdf_dir, filename)).text_content

    # Same base name, ".md" extension, written into the output folder
    output_filename = os.path.splitext(filename)[0] + ".md"
    with open(os.path.join(output_dir, output_filename), "w", encoding="utf-8") as out_file:
        out_file.write(markdown_text)

    print(f"Converted: {filename} → {output_filename}")

print("All PDFs converted successfully.")
        
        

Converted: Think OS.pdf → Think OS.md
Converted: Think Java 2e.pdf → Think Java 2e.md
Converted: Physical Modeling in MATLAB.pdf → Physical Modeling in MATLAB.md
Converted: Think Python 2e.pdf → Think Python 2e.md
Converted: Think DSP.pdf → Think DSP.md
Converted: Think Complexity 2e.pdf → Think Complexity 2e.md
Converted: Think C++.pdf → Think C++.md
All PDFs converted successfully.


# How many lines are in the extracted content from the "Think Python" book?

In [13]:
!wc -l "books_text/Think Python 2e.md"

   16268 books_text/Think Python 2e.md


In [14]:
books_dir = "books_text"

documents = []

for filename in os.listdir(books_dir):
    # Only markdown files are of interest; skip everything else
    if not filename.lower().endswith(".md"):
        continue

    with open(os.path.join(books_dir, filename), "r", encoding="utf-8") as f:
        raw_text = f.read()

    # Keep only the lines that contain something other than whitespace
    non_blank_lines = [text_line for text_line in raw_text.splitlines() if text_line.strip()]

    # One record per book: the file name plus its list of non-blank lines
    documents.append({"source": filename, "content": non_blank_lines})

print(f"Loaded {len(documents)} documents.")

Loaded 7 documents.


In [15]:
from gitsource import chunk_documents

# Sliding-window settings handed to chunk_documents.
# NOTE(review): the unit is presumably "lines" (each document's content is a
# list of lines) and step < size presumably yields overlapping chunks —
# confirm against the gitsource documentation.
chunking_options = {"size": 100, "step": 50}

chunks = chunk_documents(documents, **chunking_options)

print(f"Created {len(chunks)} chunks.")

Created 1009 chunks.


In [16]:
chunks[800]

{'start': 150,
 'content': ['| 3 Virtual | memory | 15  |',
  '| --------- | ------ | --- |',
  '3.1 A bit of information theory . . . . . . . . . . . . . . . . . . . 15',
  '3.2 Memory and storage . . . . . . . . . . . . . . . . . . . . . . 16',
  '3.3 Address spaces . . . . . . . . . . . . . . . . . . . . . . . . . 16',
  'x Contents',
  '3.4 Memory segments . . . . . . . . . . . . . . . . . . . . . . . . 17',
  '3.5 Static local variables . . . . . . . . . . . . . . . . . . . . . . 20',
  '3.6 Address translation . . . . . . . . . . . . . . . . . . . . . . . 20',
  '4 Files and file systems 23',
  '4.1 Disk performance . . . . . . . . . . . . . . . . . . . . . . . . 25',
  '4.2 Disk metadata . . . . . . . . . . . . . . . . . . . . . . . . . . 27',
  '4.3 Block allocation . . . . . . . . . . . . . . . . . . . . . . . . . 28',
  '4.4 Everything is a file? . . . . . . . . . . . . . . . . . . . . . . . 28',
  '5 More bits and bytes 31',
  '5.1 Representing integers . . . . . . . . . . .

# Count the chunks whose source is the 'Think Python' book

In [17]:
# Tally the chunks whose source is the 'Think Python' markdown file
count = sum(1 for chunk in chunks if chunk['source'] == 'Think Python 2e.md')

In [18]:
count

214

# Indexing using minsearch

In [20]:
from minsearch import Index

def prepare_documents(chunks):
    """Turn chunks into flat records ready for indexing.

    Each chunk's ``content`` (a list of strings) is joined with newlines into
    a single string. Only ``source`` and the joined ``content`` are kept in
    the returned records; any other chunk keys are dropped.
    """
    return [
        {"source": chunk["source"], "content": "\n".join(chunk["content"])}
        for chunk in chunks
    ]




In [21]:
# Flatten each chunk's list of lines into one string so it can be indexed as text.
# NOTE(review): this rebinds `documents`, which earlier held the per-book
# records with list-of-lines content.
documents = prepare_documents(chunks)

# `content` is indexed as text; `source` is registered as a keyword field
index = Index(text_fields=['content'], keyword_fields=['source'])

index.fit(documents)

<minsearch.minsearch.Index at 0x32e8cecf0>

In [22]:
len(documents)

1009

In [23]:
results = index.search("python function definition", num_results=5)

In [24]:
results

[{'source': 'Think Python 2e.md',
  'content': 'when you are comfortable with Python, I’ll make suggestions for installing Python on your\ncomputer.\nThere are a number of web pages you can use to run Python. If you already have a fa-\nvorite, go ahead and use it. Otherwise I recommend PythonAnywhere. I provide detailed\ninstructions for getting started at http://tinyurl.com/thinkpython2e.\nThere are two versions of Python, called Python 2 and Python 3. They are very similar, so\nif you learn one, it is easy to switch to the other. In fact, there are only a few differences you\nwill encounter as a beginner. This book is written for Python 3, but I include some notes\nabout Python 2.\nThe Python interpreter is a program that reads and executes Python code. Depending\non your environment, you might start the interpreter by clicking on an icon, or by typing\npython on a command line. When it starts, you should see output like this:\nPython 3.4.0 (default, Jun 19 2015, 14:20:21)\n[GCC 4.8.

# Comparison between input tokens for normal RAG and structured output RAG

In [54]:
from openai import OpenAI
from dotenv import load_dotenv
from minsearch import Index
# Load settings from a local .env file before the client is created
# (presumably this is where the OpenAI API key comes from — confirm).
load_dotenv()
# Client used by llm() below for every model request
openai_client = OpenAI()

import json

# System message: rag() hands this to llm(), which sends it with role "system"
instructions = """
You're a course assistant, your task is to answer the QUESTION from the
course students using the provided CONTEXT
"""

# User-message template; build_prompt() fills in {question} and {context}.
# The trailing .strip() removes the leading/trailing newlines of the literal.
prompt_template = """
<QUESTION>
{question}
</QUESTION>

<CONTEXT>
{context}
</CONTEXT>
""".strip()

def build_prompt(question, search_results):
    """Render the user prompt for ``question`` with the search results as context.

    The results are serialised as indented JSON and substituted, together with
    the question, into the module-level ``prompt_template``.
    An equivalent definition appears again in the structured-output cell
    further down.
    """
    serialized_results = json.dumps(search_results, indent=2)
    filled_template = prompt_template.format(
        question=question,
        context=serialized_results,
    )
    return filled_template.strip()

def search(question, num_results=5):
    """Query the module-level ``index`` for ``question``.

    Args:
        question: Free-text query forwarded to ``index.search``.
        num_results: Value forwarded as ``num_results`` to ``index.search``.
            Defaults to 5, the value that was previously hard-coded, so
            existing one-argument calls behave exactly as before.

    Returns:
        Whatever ``index.search`` returns for the query.
    """
    return index.search(question, num_results=num_results)

def llm(user_prompt, instructions, model='gpt-4o-mini'):
    """Send a system + user message pair to the model and return the response.

    ``instructions`` becomes the system message and ``user_prompt`` the user
    message. The response object is returned as-is, not just its text.

    NOTE(review): ``responses.parse`` is called without ``text_format``, so no
    structured output type is requested here — confirm this is intended.
    """
    system_message = {"role": "system", "content": instructions}
    user_message = {"role": "user", "content": user_prompt}

    return openai_client.responses.parse(
        model=model,
        input=[system_message, user_message],
    )

def rag(query):
    """Answer ``query``: retrieve context, build the prompt, call the model.

    Returns whatever ``llm`` returns for the assembled prompt and the
    module-level ``instructions``.
    """
    prompt = build_prompt(query, search(query))
    return llm(prompt, instructions)

In [56]:
response = rag('python function definition')

In [57]:
answer = response.output_parsed

In [58]:
print(response.model_dump_json(indent=2))

{
  "id": "resp_00797e30ffd032a70069a48a1e7ce081968ac030107e8c4c37",
  "created_at": 1772390942.0,
  "error": null,
  "incomplete_details": null,
  "instructions": null,
  "metadata": {},
  "model": "gpt-4o-mini-2024-07-18",
  "object": "response",
  "output": [
    {
      "id": "msg_00797e30ffd032a70069a48a1ff52881969fa1de30d1b64bd9",
      "content": [
        {
          "annotations": [],
          "text": "To define a function in Python, you use the `def` keyword followed by the function name and parentheses containing any parameters. The body of the function is indented and contains the statements that will be executed when the function is called. Here's a simple example based on the context provided:\n\n```python\ndef print_hello():\n    print(\"Hello, World!\")\n```\n\nIn this example:\n- The function is named `print_hello`.\n- It takes no parameters.\n- When called, it executes the statement `print(\"Hello, World!\")`, displaying the text on the screen.\n\nYou can call this f

In [40]:
from openai import OpenAI
from dotenv import load_dotenv
from minsearch import Index
# Same setup as the earlier plain-RAG cell, repeated here.
# Load settings from a local .env file before the client is created
# (presumably this is where the OpenAI API key comes from — confirm).
load_dotenv()
# Client used by llm_structured() below for every model request
openai_client = OpenAI()

import json

# System message: rag_structured() hands this to llm_structured(), which sends
# it with role "system"
instructions = """
You're a course assistant, your task is to answer the QUESTION from the
course students using the provided CONTEXT
"""

# User-message template; build_prompt() fills in {question} and {context}.
# The trailing .strip() removes the leading/trailing newlines of the literal.
prompt_template = """
<QUESTION>
{question}
</QUESTION>

<CONTEXT>
{context}
</CONTEXT>
""".strip()

def build_prompt(question, search_results):
    """Render the user prompt for ``question`` with the search results as context.

    The results are serialised as indented JSON and substituted, together with
    the question, into the module-level ``prompt_template``.
    Equivalent to the definition in the earlier plain-RAG cell; this later one
    shadows it once the cell has run.
    """
    serialized_results = json.dumps(search_results, indent=2)
    filled_template = prompt_template.format(
        question=question,
        context=serialized_results,
    )
    return filled_template.strip()

def search(question, num_results=5):
    """Query the module-level ``index`` for ``question``.

    Args:
        question: Free-text query forwarded to ``index.search``.
        num_results: Value forwarded as ``num_results`` to ``index.search``.
            Defaults to 5, the value that was previously hard-coded, so
            existing one-argument calls behave exactly as before.

    Returns:
        Whatever ``index.search`` returns for the query.
    """
    return index.search(question, num_results=num_results)

def llm_structured(user_prompt, instructions, output_type, model='gpt-4o-mini'):
    """Send a system + user message pair and request output shaped as ``output_type``.

    ``instructions`` becomes the system message, ``user_prompt`` the user
    message, and ``output_type`` is forwarded as ``text_format``. The response
    object is returned as-is.
    """
    conversation = [
        dict(role="system", content=instructions),
        dict(role="user", content=user_prompt),
    ]

    parsed_response = openai_client.responses.parse(
        model=model,
        input=conversation,
        text_format=output_type,
    )
    return parsed_response

def rag_structured(query, output_type):
    """Answer ``query`` with a response shaped as ``output_type``.

    Retrieves context, builds the prompt, and returns whatever
    ``llm_structured`` returns for it and the module-level ``instructions``.
    """
    prompt = build_prompt(query, search(query))
    return llm_structured(prompt, instructions, output_type)


In [31]:
from pydantic import BaseModel, Field
from typing import Literal

# Output schema for the structured RAG variant: passed as `output_type` to
# rag_structured(), which forwards it to the model call as `text_format`.
# NOTE(review): documented with `#` comments rather than a class docstring on
# purpose — a docstring would presumably end up in the generated schema and
# change what is sent to the model; confirm before adding one.
class RAGResponse(BaseModel):
    # Field descriptions below are part of the schema, not just documentation.
    answer: str = Field(description="The main answer to the user's question in markdown")
    found_answer: bool = Field(description="True if relevant information was found in the documentation")
    # The 0.0-1.0 range is stated only in the description; no constraint is declared here.
    confidence: float = Field(description="Confidence score from 0.0 to 1.0")
    confidence_explanation: str = Field(description="Explanation about the confidence level")
    # Restricted to the five listed categories via Literal.
    answer_type: Literal["how-to", "explanation", "troubleshooting", "comparison", "reference"] = Field(description="The category of the answer")
    followup_questions: list[str] = Field(description="Suggested follow-up questions")


In [41]:
response = rag_structured('python function definition', RAGResponse)

In [43]:
answer = response.output_parsed

In [49]:
print(response.model_dump_json(indent=2))

{
  "id": "resp_018b89cf95056db20069a488417b5c8195a598ef46bef8b08f",
  "created_at": 1772390465.0,
  "error": null,
  "incomplete_details": null,
  "instructions": null,
  "metadata": {},
  "model": "gpt-4o-mini-2024-07-18",
  "object": "response",
  "output": [
    {
      "id": "msg_018b89cf95056db20069a48846f270819586b3b284edbe4209",
      "content": [
        {
          "annotations": [],
          "text": "{\"answer\":\"In Python, a function is a named sequence of statements that performs a specific task. Functions are defined using the `def` keyword followed by the function name and parentheses containing any parameters. Here is a basic example of a function definition in Python:\\n\\n```python\\ndef my_function(parameter1, parameter2):\\n    # Function body: statements that execute when the function is called\\n    result = parameter1 + parameter2\\n    return result\\n```\\n\\n### Components of a Function Definition:\\n1. **Defining the Function**: You start with the keyword `