# Retrieval-Augmented Generation (RAG) Notebook
📒 Notebook created with ❤️ by [@prasadmahamulkar](https://x.com/prsdm17).

In this notebook, you will learn how to implement RAG (basic to advanced) using LangChain 🦜 and LlamaIndex 🦙.



# Basic RAG using LangChain

In [None]:
! pip install sentence_transformers
! pip install pypdf
! pip install faiss-gpu
! pip install langchain
! pip install langchain-openai

Collecting sentence_transformers
  Downloading sentence_transformers-2.3.1-py3-none-any.whl (132 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/132.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.8/132.8 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentence_transformers
Successfully installed sentence_transformers-2.3.1
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2
Collecting langchain
  Downloading langchain-0.1.5-py3-none-any.whl (806 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m806.7/806.7 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
Collecting jsonpatch<2.0,>=1.33 

In [None]:
# load pdf
# The loader is pointed at the QLoRA paper under /content; `documents` holds
# whatever loader.load() returns and is the input to the splitter cell below.
from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader("/content/qlora_paper.pdf")
documents = loader.load()

In [None]:
# split document content
# The splitter is built with no arguments, so the library's default chunk
# settings apply. `text` holds the resulting chunks; it is what gets embedded
# when the FAISS vectorstore is created.
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter()
text = text_splitter.split_documents(documents)

In [None]:
# load embedding model
# Imported from langchain_community (the same package the PDF loader comes
# from) instead of the deprecated `langchain.embeddings` re-export.
from langchain_community.embeddings import HuggingFaceEmbeddings

# encode_kwargs is forwarded to the encoder; normalize_embeddings=True requests
# normalized vectors.
embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    encode_kwargs={"normalize_embeddings": True},
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/93.0k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Optional sanity check (uncomment to run): embed a sample query and look at
# the first ten components of the returned vector.
# check = embeddings.embed_query("what is Qlora?")
# check[0:10]

In [None]:
# create vectorstore using FAISS
# Imported from langchain_community instead of the deprecated
# `langchain.vectorstores` re-export. The chunks in `text` are embedded with
# `embeddings` and indexed.
from langchain_community.vectorstores import FAISS
vectorstore = FAISS.from_documents(text, embeddings)

In [None]:
# saving the vectorstore
# Writes the index to the local path "vectorstore.db"; none of the cells shown
# in this notebook read it back.
vectorstore.save_local("vectorstore.db")

In [None]:
# Optional check (uncomment to run): query the in-memory vectorstore directly
# and inspect the chunks it returns for a sample question.
# sub_docs = vectorstore.similarity_search("what is Qlora?") # k=5
# sub_docs

In [None]:
# create retriever
# Wraps the vectorstore with its default retriever settings; this object is
# what the retrieval chain is built from.
retriever = vectorstore.as_retriever()

In [None]:
# set api openai key
# Prompt for the key instead of storing it in the notebook. A key that is
# already present in the environment is kept; an empty value or the literal
# "YOUR_OPENAI_API_KEY" placeholder is treated as "not set".
import os
from getpass import getpass

if os.environ.get("OPENAI_API_KEY", "") in ("", "YOUR_OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
# load llm
# ChatOpenAI comes from the langchain-openai package installed in the setup
# cell, replacing the deprecated `langchain.chat_models` import.
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(model_name="gpt-3.5-turbo")

In [None]:
# create document chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# The prompt has two placeholders: {context} (filled in by the chain) and
# {input}, the question passed to chain.invoke().
# The literal opens with exactly three quotes: a fourth quote would put a
# stray '"' character at the very start of the prompt sent to the model.
template = """
You are an assistant for question-answering tasks.
Use the provided context only to answer the following question:

<context>
{context}
</context>

Question: {input}
"""
prompt = ChatPromptTemplate.from_template(template)
doc_chain = create_stuff_documents_chain(llm, prompt)

In [None]:
# create retrieval chain
# Combines the retriever with the document chain (prompt + LLM) into the
# single chain that is invoked with a question below.
from langchain.chains import create_retrieval_chain
chain = create_retrieval_chain(retriever, doc_chain)

In [None]:
# Ask a sample question; it is passed under the "input" key, matching the
# {input} placeholder in the prompt template.
response = chain.invoke({"input": "what is Qlora?"})

In [None]:
# Display only the "answer" entry of the chain's response.
response['answer']

'QLoRA is an efficient finetuning approach that allows for the finetuning of quantized language models without any performance degradation. It reduces memory usage enough to finetune a 65B parameter model on a single 48GB GPU while preserving full 16-bit finetuning task performance.'

# Advanced RAG using LangChain

In [None]:
! pip install chromadb

Collecting chromadb
  Downloading chromadb-0.4.22-py3-none-any.whl (509 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m509.0/509.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting chroma-hnswlib==0.7.3 (from chromadb)
  Downloading chroma_hnswlib-0.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.109.2-py3-none-any.whl (92 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.1/92.1 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting uvicorn[standard]>=0.18.3 (from chromadb)
  Downloading uvicorn-0.27.0.post1-py3-none-any.whl (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.7/60.7 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.4

In [None]:
# load pdf
# Same QLoRA paper as in the basic section; `documents` is later handed to
# the parent-document retriever via add_documents().
from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader("/content/qlora_paper.pdf")
documents = loader.load()

In [None]:
# split pages content
from langchain.storage import InMemoryStore
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Parent documents: the big chunks (chunk_size=2000).
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)

# Child documents: the small chunks (chunk_size=400).
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

# Storage layer that holds the parent chunks.
store = InMemoryStore()

In [None]:
# load embedding model
# Imported from langchain_community (the same package the PDF loader comes
# from) instead of the deprecated `langchain.embeddings` re-export.
from langchain_community.embeddings import HuggingFaceEmbeddings

# encode_kwargs is forwarded to the encoder; normalize_embeddings=True requests
# normalized vectors.
embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    encode_kwargs={"normalize_embeddings": True},
)

In [None]:
# create vectorstore using Chromadb
# Imported from langchain_community instead of the deprecated
# `langchain.vectorstores` re-export. The collection starts empty; it is
# populated when documents are added through the retriever.
from langchain_community.vectorstores import Chroma
vectorstore = Chroma(collection_name="split_parents", embedding_function=embeddings)

In [None]:
# create retriever
# Wires together the Chroma vectorstore, the in-memory docstore and the two
# splitters defined earlier (chunk_size=400 children, chunk_size=2000 parents).
from langchain.retrievers import ParentDocumentRetriever
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)

In [None]:
# add documents to vectorstore
# Hands the loaded PDF documents to the retriever, which was configured with
# both the vectorstore and the docstore.
# NOTE(review): presumably re-running this cell adds the same documents again — confirm.
retriever.add_documents(documents)

In [None]:
# set api openai key
# Prompt for the key instead of storing it in the notebook. A key that is
# already present in the environment is kept; an empty value or the literal
# "YOUR_OPENAI_API_KEY" placeholder is treated as "not set".
import os
from getpass import getpass

if os.environ.get("OPENAI_API_KEY", "") in ("", "YOUR_OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
# load llm
# ChatOpenAI comes from the langchain-openai package installed by the first
# section's pip cell, replacing the deprecated `langchain.chat_models` import.
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(model_name="gpt-3.5-turbo")

In [None]:
# create document chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# The prompt has two placeholders: {context} (filled in by the chain) and
# {input}, the question passed to chain.invoke().
# The literal opens with exactly three quotes: a fourth quote would put a
# stray '"' character at the very start of the prompt sent to the model.
template = """
You are an assistant for question-answering tasks.
Use the provided context only to answer the following question:

<context>
{context}
</context>

Question: {input}
"""
prompt = ChatPromptTemplate.from_template(template)
doc_chain = create_stuff_documents_chain(llm, prompt)

In [None]:
# create retrieval chain
# Combines the parent-document retriever with the document chain (prompt +
# LLM) into the single chain that is invoked with a question below.
from langchain.chains import create_retrieval_chain
chain = create_retrieval_chain(retriever, doc_chain)

In [None]:
# Ask the same sample question as in the basic section; it is passed under the
# "input" key, matching the {input} placeholder in the prompt template.
response = chain.invoke({"input": "what is Qlora?"})

In [None]:
# Display only the "answer" entry of the chain's response.
response['answer']

'QLORA is a method that achieves high-fidelity 4-bit finetuning by using two techniques: 4-bit NormalFloat (NF4) quantization and Double Quantization. It also introduces Paged Optimizers to prevent memory spikes during gradient checkpointing. QLORA has one low-precision storage data type (usually 4-bit) and one computation data type (usually BFloat16). It dequantizes the low-precision tensor to BFloat16 and performs matrix multiplication in 16-bit.'

# Basic RAG using LlamaIndex

In [None]:
! pip install -U llama_hub llama_index pypdf

Collecting llama_hub
  Downloading llama_hub-0.0.78-py3-none-any.whl (104.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.2/104.2 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting llama_index
  Downloading llama_index-0.9.45-py3-none-any.whl (15.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m80.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pypdf
  Downloading pypdf-4.0.1-py3-none-any.whl (283 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.0/284.0 kB[0m [31m36.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting html2text (from llama_hub)
  Downloading html2text-2020.1.16-py3-none-any.whl (32 kB)
Collecting pyaml<24.0.0,>=23.9.7 (from llama_hub)
  Downloading pyaml-23.12.0-py3-none-any.whl (23 kB)
Collecting retrying (from llama_hub)
  Downloading retrying-1.3.4-py3-none-any.whl (11 kB)
Collecting dataclasses-json (from llama_index)
  Downloading dataclasses_json-0.6.4-py3-none-

In [None]:
# load pdf
# input_files limits the reader to the single QLoRA paper PDF; `documents`
# holds whatever load_data() returns for it.
from llama_index import SimpleDirectoryReader
documents = SimpleDirectoryReader(input_files=["/content/qlora_paper.pdf"]).load_data()

In [None]:
# combine documents into one
from llama_index import Document

# Join the content of every loaded document with a blank line in between, then
# wrap the combined string in a single-element list of Document.
doc_text = "\n\n".join(d.get_content() for d in documents)
text = [Document(text=doc_text)]

In [None]:
# set up text chunk
# The parser is created with library defaults only; no chunk size or overlap
# is overridden here.
from llama_index.node_parser import SimpleNodeParser
node_parser = SimpleNodeParser.from_defaults() # Default chunk size is 1024

In [None]:
# create chunks from text
# `text` is the single combined Document built earlier; the parser turns it
# into the base nodes that get indexed.
base_nodes = node_parser.get_nodes_from_documents(text)

In [None]:
# reset node ids
from llama_index.schema import IndexNode

# Give every chunk a predictable, position-based id: "node-0", "node-1", ...
for position, chunk in enumerate(base_nodes):
    chunk.id_ = f"node-{position}"

In [None]:
# load embedding model
# NOTE(review): the "local:" prefix presumably selects a locally run Hugging
# Face model (the saved output shows model files being downloaded) — confirm.
from llama_index.embeddings import resolve_embed_model
embed_model = resolve_embed_model("local:BAAI/bge-small-en")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [None]:
# set api openai key
# Prompt for the key instead of storing it in the notebook. A key that is
# already present in the environment is kept; an empty value or the literal
# "YOUR_OPENAI_API_KEY" placeholder is treated as "not set".
import os
from getpass import getpass

if os.environ.get("OPENAI_API_KEY", "") in ("", "YOUR_OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
# load llm
# LlamaIndex's OpenAI wrapper, configured for gpt-3.5-turbo.
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment — confirm.
from llama_index.llms import OpenAI
llm = OpenAI(model="gpt-3.5-turbo")

In [None]:
# set service context
# Bundles the chat model and the embedding model into one object that is then
# passed to both the index and the query engine.
from llama_index import ServiceContext
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)

In [None]:
# store in vectorstore index
# Builds the index directly from the base nodes, using the models supplied by
# the service context.
from llama_index import VectorStoreIndex
index = VectorStoreIndex(base_nodes, service_context=service_context)

In [None]:
# create retriever
# Default retriever settings are used; the trailing note shows the optional
# similarity_top_k argument.
retriever = index.as_retriever() #similarity_top_k=2

In [None]:
# set up query engine
# Pairs the retriever with the service context (LLM + embedding model) so a
# question can be answered in one query() call.
from llama_index.query_engine import RetrieverQueryEngine
query_engine = RetrieverQueryEngine.from_args(retriever, service_context=service_context)

In [None]:
# query
# Run the sample question through the query engine and print the response's
# string form (print() applies str() itself).
response = query_engine.query("What is Qlora?")
print(response)

QLORA is an efficient finetuning approach that allows for the reduction of memory usage while preserving the performance of a pretrained language model. It achieves this by backpropagating gradients through a frozen, 4-bit quantized pretrained language model into Low Rank Adapters (LoRA). QLORA introduces several innovations to save memory, including the use of a new data type called 4-bit NormalFloat (NF4), Double Quantization to reduce memory footprint, and Paged Optimizers to manage memory spikes. QLORA has been used to finetune more than 1,000 models and has shown state-of-the-art results in chatbot performance.


# Advanced RAG using LlamaIndex

In [None]:
# load pdf
# input_files limits the reader to the single QLoRA paper PDF; `documents`
# holds whatever load_data() returns for it.
from llama_index import SimpleDirectoryReader
documents = SimpleDirectoryReader(input_files=["/content/qlora_paper.pdf"]).load_data()

In [None]:
# combine documents into one
from llama_index import Document

# Join the content of every loaded document with a blank line in between, then
# wrap the combined string in a single-element list of Document.
doc_text = "\n\n".join(d.get_content() for d in documents)
text = [Document(text=doc_text)]

In [None]:
# set up parent chunk
# The parent parser uses library defaults only; the smaller child chunk sizes
# are configured separately in the child-chunk cell.
from llama_index.node_parser import SimpleNodeParser
node_parser = SimpleNodeParser.from_defaults()

In [None]:
# create parent chunks from the combined document
# IndexNode is not used in this cell itself; it is needed by the child-chunk
# cell further down.
from llama_index.schema import IndexNode
base_nodes = node_parser.get_nodes_from_documents(text)

In [None]:
# reset node ids (optional)
from llama_index.schema import IndexNode

# Give every parent chunk a predictable, position-based id: "node-0",
# "node-1", ...
for position, chunk in enumerate(base_nodes):
    chunk.id_ = f"node-{position}"

In [None]:
# set up child chunk
# One parser per child chunk size, each created with chunk_overlap=20.
sub_chunk_sizes = [128, 256, 512]
sub_node_parsers = [
    SimpleNodeParser.from_defaults(chunk_size=size, chunk_overlap=20)
    for size in sub_chunk_sizes
]

all_nodes = []
for base_node in base_nodes:
    parent_id = base_node.node_id

    # Split the parent with every child parser; each child chunk becomes an
    # IndexNode that carries the parent's node id.
    for parser in sub_node_parsers:
        for child in parser.get_nodes_from_documents([base_node]):
            all_nodes.append(IndexNode.from_text_node(child, parent_id))

    # Also add the parent chunk itself, wrapped as an IndexNode, to the list.
    all_nodes.append(IndexNode.from_text_node(base_node, parent_id))

In [None]:
# Lookup table from node id to node; it is passed to the RecursiveRetriever
# as node_dict.
all_nodes_dict = {n.node_id: n for n in all_nodes}

In [None]:
# set api openai key
# Prompt for the key instead of storing it in the notebook. A key that is
# already present in the environment is kept; an empty value or the literal
# "YOUR_OPENAI_API_KEY" placeholder is treated as "not set".
import os
from getpass import getpass

if os.environ.get("OPENAI_API_KEY", "") in ("", "YOUR_OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
# load llm
# LlamaIndex's OpenAI wrapper, configured for gpt-3.5-turbo.
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment — confirm.
from llama_index.llms import OpenAI
llm = OpenAI(model="gpt-3.5-turbo")

In [None]:
# load embedding model
# NOTE(review): the "local:" prefix presumably selects a locally run Hugging
# Face model rather than a hosted embedding API — confirm.
from llama_index.embeddings import resolve_embed_model
embed_model = resolve_embed_model("local:BAAI/bge-small-en")

In [None]:
# set service context
# Bundles the chat model and the embedding model into one object that is then
# passed to both the index and the query engine.
from llama_index import ServiceContext
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)

In [None]:
# create and store embedding in vectorstore
# The index is built over all_nodes, i.e. every child chunk plus the wrapped
# parent chunks, using the models supplied by the service context.
from llama_index import VectorStoreIndex
index = VectorStoreIndex(all_nodes, service_context=service_context)

In [None]:
# Plain retriever over the combined index; it is registered under the
# "vector" key of the recursive retriever's retriever_dict.
vector_retriever_chunk = index.as_retriever()

In [None]:
# create retriever
# The first argument, "vector", matches the only key in retriever_dict
# (presumably the id of the retriever to start from — confirm). node_dict maps
# node ids back to nodes.
# NOTE(review): verbose=True presumably produces the "Retrieving with query
# id ..." trace seen in the query cell's output — confirm.
from llama_index.retrievers import RecursiveRetriever
retriever_chunk = RecursiveRetriever(
    "vector",
    retriever_dict={"vector": vector_retriever_chunk},
    node_dict=all_nodes_dict,
    verbose=True,
)

In [None]:
# create retriever query
# Pairs the recursive retriever with the service context (LLM + embedding
# model) so a question can be answered in one query() call.
from llama_index.query_engine import RetrieverQueryEngine
query_engine_chunk = RetrieverQueryEngine.from_args(retriever_chunk, service_context=service_context)

In [None]:
# Optional check (uncomment to run): retrieve the nodes for a sample question
# and display each one, with source_length=2000.
# from llama_index.response.notebook_utils import display_source_node
# nodes = retriever_chunk.retrieve( "what is Qlora?")
# for node in nodes:
#     display_source_node(node, source_length=2000)

In [None]:
# query
# Run the sample question through the recursive-retriever query engine and
# print the response's string form (print() applies str() itself).
response = query_engine_chunk.query("What is Qlora?")
print(response)

[1;3;34mRetrieving with query id None: What is Qlora?
[0m[1;3;38;5;200mRetrieved node with id, entering: node-0
[0m[1;3;34mRetrieving with query id node-0: What is Qlora?
[0m[1;3;38;5;200mRetrieved node with id, entering: node-18
[0m[1;3;34mRetrieving with query id node-18: What is Qlora?
[0mQLORA is an efficient finetuning approach that allows for the finetuning of quantized language models without any performance degradation. It reduces memory usage and enables the finetuning of large models on a single GPU. QLORA uses a novel technique to quantize a pretrained model to 4-bit and incorporates Low-rank Adapters (LoRA) for backpropagating gradients through the quantized weights. It introduces several innovations to save memory without sacrificing performance, such as a new data type called 4-bit NormalFloat (NF4), Double Quantization to reduce memory footprint, and Paged Optimizers to manage memory spikes. QLORA has been used to finetune more than 1,000 models and has achieve