# RAG

In [3]:
# Load the source PDF that the RAG pipeline will retrieve from.
from langchain_community.document_loaders import PyPDFLoader

# Location of the policy PDF handed to the loader in the next cell.
# NOTE(review): this is a machine-specific path and relies on "~" being
# expanded by whatever consumes it — confirm before running elsewhere.
pdf_file = "~/Desktop/langchain_concepts/hdfc_policy_doc.pdf"

In [4]:
pdf_loader = PyPDFLoader(pdf_file)
# will load each page as a document, with lazy loading
document_pages = []
async for page in pdf_loader.alazy_load():
    document_pages.append(page)

In [5]:
# Sanity check: number of page documents that were loaded from the PDF.
len(document_pages)

48

In [6]:
# Chat-model wrapper so the retrieval pipeline can also be used conversationally
from langchain_huggingface import ChatHuggingFace, HuggingFacePipeline

In [10]:
# Build a local Hugging Face text-generation pipeline around Zephyr-7B.
# NOTE(review): the recorded run of this cell ended with an MPS
# out-of-memory error, so the model may not fit on a similar machine.
llm_model = HuggingFacePipeline.from_model_id(
    task="text-generation",
    model_id="HuggingFaceH4/zephyr-7b-beta",
    # Generation settings forwarded to the underlying pipeline.
    pipeline_kwargs={
        "max_new_tokens": 512,
        "do_sample": False,
        "repetition_penalty": 1.03,
    },
)

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Device set to use mps:0


RuntimeError: MPS backend out of memory (MPS allocated: 9.04 GB, other allocations: 384.00 KB, max allowed: 9.07 GB). Tried to allocate 64.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
# Wrap the LLM in the ChatHuggingFace chat-model interface.
# NOTE(review): this cell has no execution count, so it appears not to have
# been run; presumably it expects `llm_model` to be the HuggingFacePipeline
# created in the previous cell — confirm before relying on `chat_model`.
chat_model = ChatHuggingFace(llm=llm_model)

In [8]:
# Alternative to loading the model into local memory: use a hosted LLM
# through an API (Groq) instead.
import os
from api_keys import GROQ_API_KEY
from langchain.chat_models import init_chat_model

# Publish the key (kept in the local `api_keys` module) as an environment
# variable; presumably this is where the Groq integration reads it — confirm.
os.environ["GROQ_API_KEY"] = GROQ_API_KEY
# Rebinds `llm_model`, the same name the local HuggingFace pipeline cell assigns.
llm_model = init_chat_model("llama3-8b-8192", model_provider="groq")

In [9]:
# Text splitter that breaks the loaded documents into chunks.
# A recursive splitter is used here; a semantic splitter is an alternative
# when the document's content is better divided by meaning.
from langchain_text_splitters import RecursiveCharacterTextSplitter

CHUNK_SIZE = 300  # chunk size handed to the splitter
OVERLAP = 30  # overlap between chunks

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=OVERLAP,
)

In [10]:
# Embedding model used to index the chunks.
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

embed_model = HuggingFaceEmbeddings(model_name="thenlper/gte-base")

# Take the vector size from the model itself instead of hard-coding it, so the
# FAISS index dimension cannot drift from the embedding model if the model is
# swapped. For "thenlper/gte-base" this evaluates to 768, the value that was
# previously hard-coded here.
EMBED_DIMS = len(embed_model.embed_query("dimension probe"))

In [11]:
# FAISS-backed vector store for the embedded chunks.
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain.storage import InMemoryStore

# Flat L2 index sized to the embedding dimension.
faiss_index = faiss.IndexFlatL2(EMBED_DIMS)

# The store starts out with a fresh in-memory docstore and an empty
# index -> docstore-id mapping.
vector_store = FAISS(
    index=faiss_index,
    embedding_function=embed_model,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

To provide context to the LLM during retrieval, we have two options:

1. Chunk context -- If we store only the split (chunked) documents in the vector store, the vector store retriever returns just the matching chunks, and the LLM receives those chunks as its context. This works when the chunk size is large enough and we only need to generate an answer to the user's query.

2. Document context -- However, if we need more context, and also need to give the user the related document from which the answer was generated, we must keep a mapping from each retrieved chunk back to its original document. This can be done using `ParentDocumentRetriever` in LangChain.