# 02. Advanced RAG with PDF
In this notebook, we will:
1. Download a sample PDF.
2. Ingest and split the text.
3. Store embeddings in Qdrant (Persistent).
4. Ask questions about the PDF using RAG.

In [7]:
import os
import requests
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_qdrant import QdrantVectorStore
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Create the local folder used for downloaded files; a no-op when it is already there
os.makedirs(name='data', exist_ok=True)

## 0. Cleanup Collection
Deletes any existing `pdf_rag` collection so that each run starts fresh.

In [None]:
from qdrant_client import QdrantClient

# Remove the 'pdf_rag' collection left by a previous run so indexing starts clean.
# This step is best-effort: a failure must not stop the notebook, but it is reported
# instead of being silently discarded so connection/config problems are visible.
try:
    client = QdrantClient(url=os.environ.get('QDRANT_URL'))
    client.delete_collection('pdf_rag')
    print('Collection cleared!')
except Exception as e:
    print(f'Could not clear collection (continuing anyway): {e!r}')

## 0.1 Baseline (No RAG)
Let's see what the model knows about DataStage without any retrieved context.
*(This represents the "Before" state)*

In [None]:
# Baseline Query
# The chat model is created in this cell because it runs before the RAG section;
# without this, a fresh "Restart Kernel -> Run All" fails with NameError on `llm`.
llm = ChatOllama(
    base_url=os.environ.get('OLLAMA_BASE_URL'),
    model='llama3'
)

# Ask the model directly, with no retrieved context, to capture the "before" answer.
query = 'what is the best partitioning strategy for a lookup stage in datastage?'
print(f'Question: {query}')
print(llm.invoke(query).content)

In [8]:
# 1. Download a Datastage Redbook PDF
pdf_url = 'http://redbooks.ibm.com/redbooks/pdfs/sg247576.pdf'
pdf_path = 'data/sample.pdf'

# The file on disk acts as a cache: the download only happens when it is missing.
if not os.path.exists(pdf_path):
    print('Downloading PDF...')
    # A timeout keeps the cell from hanging forever on an unresponsive server.
    response = requests.get(pdf_url, timeout=60)
    # Fail loudly on 4xx/5xx instead of saving an error page as 'sample.pdf',
    # which the existence check above would then treat as a valid cached file.
    response.raise_for_status()
    # Write to a temporary name and rename afterwards, so an interrupted write
    # never leaves a truncated file at pdf_path.
    tmp_path = pdf_path + '.part'
    with open(tmp_path, 'wb') as f:
        f.write(response.content)
    os.replace(tmp_path, pdf_path)
    print('PDF Downloaded')
else:
    print('PDF already exists')

PDF already exists


In [9]:
# 2. Load and Split PDF
# Read the downloaded PDF into a list of documents.
loader = PyPDFLoader(pdf_path)
pages = loader.load()
page_count = len(pages)
print(f'Loaded {page_count} pages')

# Cut the documents into chunks of up to 1000 characters with a 200-character
# overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(pages)
split_count = len(splits)
print(f'Created {split_count} splits')

Loaded 658 pages
Created 1050 splits


In [10]:
# 3. Index in Qdrant (Persistent)
# Embed every chunk and store it in the 'pdf_rag' collection of the Qdrant
# instance addressed by the QDRANT_URL environment variable.
embeddings = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')
url = os.environ.get('QDRANT_URL')

# Connection and collection settings, gathered in one place for readability.
qdrant_options = {
    'url': url,
    'prefer_grpc': False,
    'collection_name': 'pdf_rag',
    'force_recreate': True,  # Clean start for this tutorial
}
qdrant = QdrantVectorStore.from_documents(splits, embeddings, **qdrant_options)
print('PDF Content Indexed!')

PDF Content Indexed!


In [6]:
# 4. Perform RAG
llm = ChatOllama(
    base_url=os.environ.get('OLLAMA_BASE_URL'),
    model='llama3'
)

# Retrieve the 3 chunks most similar to the question.
retriever = qdrant.as_retriever(search_kwargs={'k': 3})
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)


def format_docs(docs):
    """Join the text of the retrieved documents into a single context string.

    Without this step the prompt receives the repr of a list of Document
    objects (metadata included) rather than the plain chunk text.
    """
    return '\n\n'.join(doc.page_content for doc in docs)


# The question is sent to the retriever (whose hits are flattened to text) and
# is also passed through unchanged; both values fill the prompt template.
chain = (
    {'context': retriever | format_docs, 'question': RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

query = 'what is the best partitioning strategy for a lookup stage in datastage?'
print(f'Question: {query}')
print('Answer:')
# Stream the answer so tokens are shown as they are generated.
for chunk in chain.stream(query):
    print(chunk, end='', flush=True)

Question: what is the best partitioning strategy for a lookup stage in datastage?
Answer:
Based on the provided context, I can answer your question.

The text suggests that for stages that require groups of related records, such as a Lookup stage, you should use Hash partitioning. This is mentioned under the "Methodology" section:

"...Specify Hash partitioning for stages that require groups of related records..."

In this case, only specify the key column(s) necessary for correct grouping, as long as there are sufficient unique values.

Please note that this answer is based solely on the provided context and may not be applicable to all scenarios.