<a href="https://colab.research.google.com/github/smozley/austinAIallianceintensive/blob/main/2_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Basic RAG Demo

## 1. Extraction

In [None]:
# Install PyMuPDF, the package that provides the `fitz` module imported in the next cell.
!pip install PyMuPDF

In [None]:
import fitz  # PyMuPDF

# Location of the PDF to index
pdf_path = "/content/sample_data/tesla_manual.pdf"

# Open the PDF in a context manager so the file handle is released as soon as
# extraction finishes, even if reading a page raises. `doc` is closed after
# this block; reopen the file if the document object is needed again.
with fitz.open(pdf_path) as doc:
    # One text string per page, in page order
    pages = [page.get_text() for page in doc]

# Join the pages into a single newline-separated string for the later cells
full_text = "\n".join(pages)

print(f"Extracted {len(pages)} pages of text.")

In [None]:
# Show the first 1,000 characters of the extracted text
print(full_text[:1000])

## 2. Chunking

In [None]:
# Install LangChain, which supplies the text splitter imported in the next cell.
!pip install langchain

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Work on a 10,000-character slice of the extracted text rather than all of it
sample = full_text[10000:20000]

# Split with LangChain's RecursiveCharacterTextSplitter: chunk size 1024 and
# overlap 20, both measured in characters because length_function is len
custom_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=20,
    length_function=len,
)

# Create the chunks
texts = custom_text_splitter.create_documents([sample])

# Convert to a list of plain text chunks for further processing
chunks = [document.page_content for document in texts]

print(f"Total Chunks Created: {len(chunks)}\n\n")

# Print the first two chunks. Slicing (instead of indexing fixed positions such
# as 10 and 11) keeps the "Chunk 1" / "Chunk 2" labels in step with what is
# shown and cannot raise IndexError when the sample yields only a few chunks.
previews = [
    f'### Chunk {number}:\n\n{text}\n\n====='
    for number, text in enumerate(chunks[:2], start=1)
]
print("\n\n".join(previews))

## 3. Embedding + TF-IDF (as a BM25 Approximation)

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer

# Sentence-embedding model; the retrieval cell reuses it to encode queries
model = SentenceTransformer("all-MiniLM-L6-v2")

# Dense side: embed every chunk, returned as a NumPy array
embeddings = model.encode(chunks, convert_to_numpy=True)

# Sparse side (TF-IDF as BM25 approximation): fit the vocabulary on the
# chunks, then vectorise those same chunks
tfidf = TfidfVectorizer()
tfidf.fit(chunks)
bm25_matrix = tfidf.transform(chunks)

# Keep text, dense vector and sparse row side by side, one DataFrame row per
# chunk (this will take a bit)
index = pd.DataFrame(
    dict(
        chunk=chunks,
        embedding=list(embeddings),
        tfidf=list(bm25_matrix),
    )
)

# Display the last two rows
index.iloc[-2:]

## 4. Retrieval

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import vstack

def hybrid_search(query, top_k=5, alpha=0.5):
    """Rank the indexed chunks against ``query`` using a dense/sparse blend.

    Relies on the module-level ``model``, ``tfidf`` and ``index`` objects.

    Args:
        query: The search text.
        top_k: How many of the highest-scoring rows to return.
        alpha: Weight of the embedding score; the TF-IDF score is weighted
            by ``1 - alpha``.

    Returns:
        A copy of the ``index`` rows with the highest combined scores,
        ordered from best to worst.
    """
    # Encode the query both ways: dense embedding and TF-IDF vector
    query_vec = model.encode([query], convert_to_numpy=True)
    query_terms = tfidf.transform([query])

    # Rebuild the two chunk matrices from the per-row values kept in the index
    chunk_vecs = np.stack(index["embedding"].tolist())
    chunk_terms = vstack(index["tfidf"].tolist())

    # Cosine similarity of the single query row against every chunk
    dense_scores = cosine_similarity(query_vec, chunk_vecs)[0]
    sparse_scores = cosine_similarity(query_terms, chunk_terms)[0]

    # Weighted blend of the two score vectors
    blended = alpha * dense_scores + (1 - alpha) * sparse_scores

    # argsort is ascending, so reverse before taking the first top_k positions
    ranking = np.argsort(blended)[::-1]
    return index.iloc[ranking[:top_k]].copy()

# Try the retriever on a sample question and show just the text of the matches
hits = hybrid_search(query="How do I charge the Tesla vehicle?", top_k=5)
hits.loc[:, ["chunk"]]

## 5. Generation

### Setup

In [None]:
# Install the `anthropic` package imported in the next cell.
!pip install anthropic

In [None]:
import anthropic
from google.colab import userdata
from pydantic import BaseModel
from enum import StrEnum

# Anthropic API client; the key is read through google.colab's `userdata`
# under the name ANTHROPIC_API_KEY rather than being written into the notebook.
client = anthropic.Anthropic(api_key=userdata.get('ANTHROPIC_API_KEY'))

class Model(StrEnum):
  SM = 'claude-3-5-haiku-20241022'
  MD = 'claude-sonnet-4-20250514'
  LG = 'claude-opus-4-20250514'

# Request payload for stream_claude: which Claude model to call and the prompt
# to send. Documented with comments rather than a docstring so the pydantic
# model itself is left untouched.
class Question(BaseModel):
  model: Model  # member of the Model enum selecting the Claude model
  prompt: str  # text sent as the single user message

from typing import Generator

def stream_claude(question: Question) -> Generator[str, None, None]:
    """Stream Claude's reply to ``question`` as it is generated.

    Sends ``question.prompt`` as a single user message to the model named by
    ``question.model`` (with ``max_tokens=1024``) and yields each text
    fragment as it arrives.

    Args:
        question: The prompt and target model. Only ``Model.MD`` and
            ``Model.LG`` are accepted.

    Yields:
        Successive pieces of the response text.

    Raises:
        AssertionError: If ``question.model`` is not ``Model.MD`` or
            ``Model.LG``. Because this is a generator, the check runs when
            iteration starts, not when the function is called.
    """
    # Raised explicitly instead of via `assert`, which is stripped when Python
    # runs with -O; AssertionError is kept so the failure type is unchanged.
    if question.model not in (Model.MD, Model.LG):
        raise AssertionError(
            f"stream_claude supports only {Model.MD.value!r} and "
            f"{Model.LG.value!r}, got {question.model!r}"
        )

    with client.messages.stream(
        model=question.model.value,
        max_tokens=1024,
        messages=[{"role": "user", "content": question.prompt}],
    ) as stream:
        yield from stream.text_stream

### Ask Questions

In [None]:
# The user's question, and the five best-matching chunks retrieved for it
query = "How do I open the front doors manually?"
hits = hybrid_search(query=query, top_k=5)

In [None]:
# Stitch the retrieved passages into one newline-separated context string
context = "\n".join(hits["chunk"].tolist())

# Fill the prompt template: instruction, retrieved context, then the question
prompt = (
    "Use the context below to answer the question.\n"
    "\n"
    "Context:\n"
    f"{context}\n"
    "\n"
    f"Question: {query}\n"
    "Answer:"
)

# Wrap the prompt in a Question and print the text that will be sent
question = Question(prompt=prompt, model=Model.MD)
print(question.prompt)

In [None]:
# Stream the answer: print each piece as soon as it arrives, with no newline
# in between so the pieces read as continuous text
for piece in stream_claude(question):
    print(piece, end="", flush=True)