# simple_rag.ipynb
``````markdown
# Simple Retrieval-Augmented Generation (RAG) with LangChain

This notebook demonstrates a basic RAG pipeline using LangChain.
We will:
1. Install necessary libraries.
2. Download a sample PDF document.
3. Load the PDF.
4. Split the document into manageable chunks.
5. Generate embeddings for the chunks using a Hugging Face model.
6. Store the chunks and their embeddings in a FAISS vector store.
7. Set up a Hugging Face LLM for generation.
8. Create a `RetrievalQA` chain to perform RAG.
9. Ask a question and get an answer based on the document.

In [None]:
# @title 1. Install Dependencies
!pip install -q langchain langchain_community langchain_huggingface pypdf faiss-cpu sentence-transformers torch accelerate bitsandbytes
print("Dependencies installed.")

Dependencies installed.


In [None]:
# @title 2. Setup and Download PDF
import os
import requests

# Create a directory for PDFs if it doesn't exist
pdf_dir = "pdfs"
os.makedirs(pdf_dir, exist_ok=True)

# URL of the PDF
pdf_url = "https://cs229.stanford.edu/main_notes.pdf"
pdf_filename = os.path.join(pdf_dir, "cs229_main_notes.pdf")

# Download the PDF if it doesn't already exist
if not os.path.exists(pdf_filename):
    print(f"Downloading {pdf_filename}...")
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(pdf_url, timeout=60)
    # Fail loudly on 4xx/5xx instead of saving an HTML error page as a ".pdf";
    # such a file would otherwise be reused by the "already exists" branch forever.
    response.raise_for_status()
    # Write under a temporary name and rename only once the write has finished,
    # so an interrupted run never leaves a truncated file at pdf_filename.
    partial_filename = pdf_filename + ".part"
    with open(partial_filename, "wb") as f:
        f.write(response.content)
    os.replace(partial_filename, pdf_filename)
    print("Download complete.")
else:
    print(f"{pdf_filename} already exists.")

pdfs/cs229_main_notes.pdf already exists.


In [None]:
# @title 3. Load the PDF Document
from langchain_community.document_loaders import PyPDFLoader

# Number of leading pages to keep for this demo (speeds up processing).
# Set to None to process the whole document (will take longer).
max_pages = 100

loader = PyPDFLoader(pdf_filename)
documents = loader.load()
print(f"Loaded {len(documents)} pages from the PDF.")

# Slicing with None as the upper bound keeps every page, so the same line
# covers both the truncated demo and the full-document run.
documents = documents[:max_pages]
print(f"Using {len(documents)} pages for this demo.")

Loaded 227 pages from the PDF.
Using 100 pages for this demo.


In [None]:
# @title 4.  Split Documents
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking parameters: target size of each chunk and the overlap shared by
# consecutive chunks, gathered in one place so they are easy to tune.
splitter_settings = {"chunk_size": 1000, "chunk_overlap": 150}

text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)
doc_chunks = text_splitter.split_documents(documents)

print(f"Split the document into {len(doc_chunks)} chunks.")

Split the document into 223 chunks.


In [None]:
# @title 5. Generate Embeddings
from langchain_huggingface import HuggingFaceEmbeddings

# A small, efficient sentence-embedding model is used by default.
# "BAAI/bge-small-en-v1.5" is an alternative that may give better results
# at the cost of a larger model.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"

embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model_name,
)

print("Embedding model loaded.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Embedding model loaded.


In [None]:
# @title 6. Create FAISS Vector Store (Local)
from langchain_community.vectorstores import FAISS

# Embed every chunk with the embedding model and index the resulting vectors
# in a local FAISS store that the retriever will search.
vector_store = FAISS.from_documents(
    documents=doc_chunks,
    embedding=embeddings,
)
print("Vector store created.")

Vector store created.


In [None]:
# @title 7. Configure LLM (Local Hugging Face Pipeline)
from langchain_huggingface import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
import torch

# Define the model name
# NOTE(review): access to this checkpoint may require accepting its terms on the
# Hugging Face Hub and providing an HF token — confirm before running.
llm_model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)

# Mistral is a decoder-only (causal) language model, so it has to be loaded with
# AutoModelForCausalLM and driven through the "text-generation" task. The seq2seq
# auto class and the "text2text-generation" task only fit encoder-decoder models.
#
# The weights are loaded in 4-bit via bitsandbytes so that a 7B-parameter model
# fits on a single GPU; this path requires a CUDA device. device_map="auto" lets
# accelerate place the layers, so no explicit device is passed to the pipeline.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)
model = AutoModelForCausalLM.from_pretrained(
    llm_model_name,
    quantization_config=quantization_config,
    device_map="auto",
)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,      # Budget for NEW tokens only; max_length would also count the (long) RAG prompt
    do_sample=True,          # Required for temperature / top_p to have any effect
    temperature=0.7,         # Controls randomness: lower is more deterministic
    top_p=0.95,              # Nucleus sampling: considers the smallest set of tokens whose cumulative probability exceeds top_p
    repetition_penalty=1.2,  # Penalizes repeated tokens
    return_full_text=False,  # Return only the completion, not the prompt echoed back
)

llm = HuggingFacePipeline(pipeline=pipe)

print("LLM loaded and pipeline created.")

Device set to use cuda:0


LLM loaded and pipeline created.


In [None]:
# @title 8. Create RetrievalQA Chain

from langchain.chains import RetrievalQA

# Retriever over the vector store: each query fetches the top 3 relevant chunks.
retriever = vector_store.as_retriever(search_kwargs=dict(k=3))

# The "stuff" chain type places all retrieved text directly into the prompt.
# Source documents are returned alongside the answer so they can be inspected.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True,
)

print("RetrievalQA chain created.")

RetrievalQA chain created.


In [None]:
# @title 9. Ask a Question
query = "What is machine learning?"
print(f"Query: {query}")

# The chain takes the question under the "query" key and returns a dict with the
# generated answer under "result". Because the chain was built with
# return_source_documents=True, the retrieved chunks are also available under
# "source_documents".
result = qa_chain.invoke({"query": query})
print("\nAnswer:")
print(result["result"])

# Show the chunks the answer was based on, so it can be checked against the source.
preview_chars = 200
print(f"\nSource Documents (first {preview_chars} chars of each):")
for i, doc in enumerate(result["source_documents"], start=1):
    print(f"Doc {i}: {doc.page_content[:preview_chars]}...")


Token indices sequence length is longer than the specified maximum sequence length for this model (571 > 512). Running this sequence through the model will result in indexing errors


Query: What is  machine learning? 





Answer:
predicting the output
