## **LangChain -  FAISS**

In [47]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv

In [16]:
# 1) load PDF
pdf_path = 'Hayat-Qulub-Alama-Majlisi.pdf'
loader = PyPDFLoader(pdf_path)
docs = loader.load()

In [17]:
print(loader)

<langchain_community.document_loaders.pdf.PyPDFLoader object at 0x00000258CF9536F0>


In [None]:
# 2) Split into chunks 
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap = 200)
chunks = splitter.split_documents(docs)

In [19]:
# 3-A) Create Embeddings - Sentenace Transformer Embeddings
embeddings = HuggingFaceBgeEmbeddings(model_name='all-MiniLM-L12-v2', model_kwargs={'device':'cuda'})
embeddings

  embeddings = HuggingFaceBgeEmbeddings(model_name='all-MiniLM-L12-v2', model_kwargs={'device':'cuda'})


HuggingFaceBgeEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='all-MiniLM-L12-v2', cache_folder=None, model_kwargs={'device': 'cuda'}, encode_kwargs={}, query_instruction='Represent this question for searching relevant passages: ', embed_instruction='', show_progress=False)

In [20]:
# 4) Store Vector Embeddings
vectordb = FAISS.from_documents(chunks, embeddings)
vectordb.save_local("faiss_all_minilm")

In [None]:
# 5) Retrieve 
retrieve = vectordb.as_retriever(search_type = 'similarity', search_kwargs={"k":10})
retrieve

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceBgeEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x0000025908492900>, search_kwargs={'k': 5})

In [None]:
from dotenv import load_dotenv
# ---------- Load env var ----------
load_dotenv()

API_KEYS = os.getenv("GEMINI_API_KEYS", "").split(",")
API_KEYS = [k.strip() for k in API_KEYS if k.strip()]
current_key_index = 0
ACCESS_KEY = os.getenv("APP_ACCESS_KEY")

if not API_KEYS:
    print("❌ No Gemini API keys found. Check .env file.")
    raise Exception("Missing GEMINI_API_KEYS")

In [None]:
# System Prompt
system_prompt = """
You are an Islamic history assistant. 
Always answer in a respectful and storytelling way. 
If the answer is not in the documents, say "I don’t know based on my knowledge."
Question: {question}
Context: {context}
Answer:
"""

prompt = PromptTemplate(
    input_variables=["question", "context"],
    template=system_prompt
)

In [None]:
# 6) LLM and QA
llm = ChatGoogleGenerativeAI(
    model='gemini-2.5-flash',
    temperature=0,
    api_key=os.getenv('GEMINI_KEY_KEY', '')
)

qa = RetrievalQA.from_chain_type(
    llm = llm,
    retriever = retrieve,
    return_source_documents=False,
    chain_type_kwargs = {"prompt":prompt}
)

In [None]:
# 7) Ask
query = "Why Adam was named Adam? Give reference to it as well"
res = qa.run(query)
print(res)

In [53]:
q1 = "Why Adam was named Adam? Explain in details"
res = qa.run(q1)
print(res)

In the beautiful tapestry of creation, the naming of our father Adam (peace be upon him) holds a profound significance, rooted in the very essence of his being.

According to authentic narrations from revered Imams, Muhammad al-Baqir and Ja‘far as-Sadiq (peace be upon them), Adam was named 'Adam' because he was **'Adeemul Arz'**, which means he was created from the very face of the earth, from its dust. Another perspective suggests that 'Adeemul Arz' refers specifically to the fourth layer of the earth.

This understanding is further illuminated by a report where ‘Abdullah bin Salaam inquired of the noble Messenger of Allah (peace and blessings be upon him) about the reason for Adam's name. The Prophet (peace and blessings be upon him) confirmed that it was indeed because Adam was fashioned from the dust of the Earth.

When ‘Abdullah then asked if Adam was created from dust of a single location or a mixture, the Prophet (peace and blessings be upon him) revealed a beautiful detail: "Th

In [1]:
import torch
print(torch.__version__)
print(torch.cuda.is_available())

2.8.0+cu126
True


## **Pinecone Storage**

In [None]:
import os
from pinecone import Pinecone as PineconeBaseClient 
from langchain_pinecone import PineconeVectorStore
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from dotenv import load_dotenv
import time
import os

In [6]:
load_dotenv()
pc = PineconeBaseClient(api_key=os.getenv("PINECONE_API_KEY"))
INDEX_NAME = "hayatal-qulub"
index = pc.Index(INDEX_NAME)


### **Vectorinze Single PDF**

In [None]:
# ---------- begin of pipelines ----------
# load pdf
pdf_path = 'Hayat-al-Qulub-Vol-1.pdf'
loader = PyMuPDFLoader(pdf_path)
docs = loader.load()

In [4]:
# split text into chunks
splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200)
chunks = splitter.split_documents(docs)

In [5]:
# create embeddings
embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-large-en-v1.5", model_kwargs={"device": "cuda"}
)

In [7]:
# Store in vector database - FAISS
vector_db = FAISS.from_documents(chunks, embeddings)
vector_db.save_local("Hayat-Qulub-Alama-Majlisi-faiss-index")

### **Vectorize multiple PDFs**

In [7]:
#  Store all 3 PDFs in vector database - Pinecone
pdfs = [
    'hayatal-qulub-pdfs/Hayat-al-Qulub-Vol-1.pdf',
    'hayatal-qulub-pdfs/Hayat-al-Qulub-Vol-2.pdf',
    'hayatal-qulub-pdfs/Hayat-al-Qulub-Vol-3.pdf'
]

all_chunks = []
for pdf in pdfs:
    loader = PyMuPDFLoader(pdf)
    docs = loader.load()

    # split text into chunks
    splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200)
    chunks = splitter.split_documents(docs)
    print(f"Loaded {len(chunks)} chunks from {pdf}")

    all_chunks.extend(chunks)

    # create embeddings
embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-large-en-v1.5", model_kwargs={"device": "cuda"}
)

    # Store in vector database - Pinecone
vector_db = PineconeVectorStore.from_documents(all_chunks, embeddings, index_name=INDEX_NAME)

Loaded 2547 chunks from hayatal-qulub-pdfs/Hayat-al-Qulub-Vol-1.pdf
Loaded 3806 chunks from hayatal-qulub-pdfs/Hayat-al-Qulub-Vol-2.pdf
Loaded 1205 chunks from hayatal-qulub-pdfs/Hayat-al-Qulub-Vol-3.pdf


  embeddings = HuggingFaceEmbeddings(


In [7]:
# Store in vector database - Pinecone

pinecone_store = PineconeVectorStore.from_documents(
    documents=chunks,
    embedding=embeddings,
    index_name=INDEX_NAME
)

In [None]:
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

# Create retriever from Pinecone
retrieve = pinecone_store.as_retriever(
    search_type='similarity', 
    search_kwargs={"k": 10}
)

# System Prompt
system_prompt = """
You are an Islamic history assistant. 
Always answer in a respectful and storytelling way. 
If the answer is not in the documents, say "I don't know based on my knowledge."
Question: {question}
Context: {context}
Answer:
"""
prompt = PromptTemplate(
    input_variables=["question", "context"],
    template=system_prompt
)

# LLM and QA
llm = ChatGoogleGenerativeAI(
    model='gemini-2.0-flash-exp',
    temperature=0,
    api_key=os.getenv('GEMINI_API_KEY', 'AIzaSyAWITxUEUxWavkHSMIqD7DNQpXZRArr_ig')
)

qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retrieve,
    return_source_documents=False,
    chain_type_kwargs={"prompt": prompt}
)

# Ask a question
query = "Why Adam was named Adam? Give reference to it as well"
res = qa.run(query)
print(res)

  res = qa.run(query)


The story goes that Adam, the first man, was named "Adam" because he was created from the face of the earth, from dust. In fact, Imam Muhammad al-Baqir and Ja'far as-Sadiq (peace be upon them) have said that Adam was named ‘Adam’ because he was ‘Adeemul Arz’.

There is also a narration where ‘Abdullah bin Salaam asked the Messenger of Allah (peace and blessings be upon him) why Adam was named thus, and the Prophet (peace and blessings be upon him) replied that it was because Adam had been created from the dust of the Earth.


In [14]:
res2 = qa.invoke({"query": query})
print(res2['result'])

The story goes that Adam, the first man, was named so because he was created from the face of the earth, from dust. In fact, Imam Muhammad al-Baqir and Ja‘far as-Sadiq (peace be upon them) have said that Adam was named ‘Adam’ because he was ‘Adeemul Arz,’ meaning he was created from the dust of the earth. This is mentioned in Chapter 4, "Merits of Adam and Hawwa’ (Eve), Reasons behind naming them so, the beginning of creation."

Furthermore, it is narrated that when ‘Abdullah bin Salaam asked the Messenger of Allah (peace and blessings be upon him) why Adam was named thus, the Prophet (peace and blessings be upon him) replied that it was because Adam was created from the dust of the Earth.


In [None]:
# quick check for any embedding wrapper you plan to use
vec = embeddings.embed_query("test")
print("type:", type(vec))
print("len:", len(vec))   # MUST be 1024 to match your Pinecone index


type: <class 'list'>
len: 384


## **LangChain Qdrant**

In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pinecone import Pinecone
from langchain_huggingface import HuggingFaceEmbeddings
import os 

In [None]:
# qdrant setup
qdrant_client = QdrantClient(
    host="xyz-example.eu-central.aws.cloud.qdrant.io",
    api_key=os.getenv("QDRANT_API_KEY"),
)

In [48]:
# ---------- begin of pipelines ----------
# load pdf
pdf_path = 'Hayat-Qulub-Alama-Majlisi.pdf'
loader = PyPDFLoader(pdf_path)
docs = loader.load()


In [49]:
# split text 
splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200)
chunks = splitter.split_documents(docs)


In [None]:
# create embeddings
embeddings = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L12-v2", model_kwargs={"device": "cuda"}
)

TypeError: object of type 'HuggingFaceEmbeddings' has no len()

In [60]:
# setup vectordb and store embeddings
vector_db_qdrant = qdrant_client.recreate_collection(
    collection_name="hayat_qulub",
    vectors_config=Qdrant.VectorParams(
         size = 768,
        distance=Qdrant.Distance.COSINE
    ),
)


AttributeError: type object 'Qdrant' has no attribute 'VectorParams'

## **LlamaIndex**

In [40]:
from llama_index.core import VectorStoreIndex, StorageContext, SimpleDirectoryReader 
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.openai import OpenAI

from langchain_google_genai import ChatGoogleGenerativeAI

In [41]:
docs = SimpleDirectoryReader('pdf_docs').load_data()
docs


[Document(id_='5155758f-50f4-4cf3-8b9f-ee990722fa51', embedding=None, metadata={'page_label': '1', 'file_name': 'Hayat-Qulub-Alama-Majlisi.pdf', 'file_path': 'c:\\Users\\sadiq\\OneDrive\\Documents\\projects\\rag-for-Hayatal-Qulub\\pdf_docs\\Hayat-Qulub-Alama-Majlisi.pdf', 'file_type': 'application/pdf', 'file_size': 3743474, 'creation_date': '2025-10-15', 'last_modified_date': '2025-09-28'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='Hayatul Qulub - Vol. 1 Stories of the Prophets\nAllamah Muhammad Baqir Al-Majlisi - XKP\nPublished: 2012\nCategorie(s): Non-Fiction, Religion, Islam, Education and Study aids,\nHistory, Philosophy, Religi

In [42]:
# splitter
text_splitter = TokenTextSplitter(chunk_size=1000, chunk_overlap=200)
text_splitter

TokenTextSplitter(include_metadata=True, include_prev_next_rel=True, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x000001D90A129AE0>, id_func=<function default_id_func at 0x000001D9259F2E80>, chunk_size=1000, chunk_overlap=200, separator=' ', backup_separators=['\n'], keep_whitespaces=False)

In [43]:
embeddings = HuggingFaceEmbedding(model_name="all-MiniLM-L12-v2")
embeddings

2025-10-15 13:02:08,749 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L12-v2


HuggingFaceEmbedding(model_name='all-MiniLM-L12-v2', embed_batch_size=10, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x000001D96E338950>, num_workers=None, embeddings_cache=None, max_length=128, normalize=True, query_instruction=None, text_instruction=None, cache_folder=None, show_progress_bar=False)

In [None]:
storage_context = StorageContext.from_defaults(index_store=embeddings)
storage_context

StorageContext(docstore=<llama_index.core.storage.docstore.simple_docstore.SimpleDocumentStore object at 0x000001D96E3311D0>, index_store=HuggingFaceEmbedding(model_name='all-MiniLM-L12-v2', embed_batch_size=10, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x000001D96E338950>, num_workers=None, embeddings_cache=None, max_length=128, normalize=True, query_instruction=None, text_instruction=None, cache_folder=None, show_progress_bar=False), vector_stores={'default': SimpleVectorStore(stores_text=False, is_embedding_query=True, data=SimpleVectorStoreData(embedding_dict={}, text_id_to_ref_doc_id={}, metadata_dict={})), 'image': SimpleVectorStore(stores_text=False, is_embedding_query=True, data=SimpleVectorStoreData(embedding_dict={}, text_id_to_ref_doc_id={}, metadata_dict={}))}, graph_store=<llama_index.core.graph_stores.simple.SimpleGraphStore object at 0x000001D90A4FA5D0>, property_graph_store=None)

In [None]:
vector_store = VectorStoreIndex.from_documents(docs, storage_context=storage_context)
vector_store