In [17]:
from dotenv import load_dotenv

In [18]:
import os
os.environ["HF_TOKEN"]=os.getenv("HF_TOKEN")

In [19]:
from langchain_huggingface import HuggingFaceEmbeddings

embeddings_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

In [33]:
query = embeddings_model.embed_query("Hi I am Shashwat")

In [34]:
len(query)

768

In [21]:
from sklearn.metrics.pairwise import cosine_similarity


In [22]:
document = ["Who is the president of USA?",
            "Who is the Prime Minister of India?"]

In [23]:
text = "Narendra Modi is the Prime Minister of India"

In [24]:
document_embedding = embeddings_model.embed_documents(document)

In [25]:
query_embedding = embeddings_model.embed_query(text)

In [26]:
cosine_similarity([query_embedding],document_embedding)

array([[0.39142982, 0.72165125]])

In [27]:
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore

In [35]:
index=faiss.IndexFlatL2(768)


In [36]:
index

<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x000002CEEE691680> >

In [37]:
vector_store = FAISS(
    embedding_function=embeddings_model,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={}
)

In [38]:
vector_store.add_texts(["AI is future","AI is powerful","Dogs are cute"])

['3451070b-ee1f-4917-b5c1-8f1c28be81ee',
 '1bf0b897-f0ad-462f-8211-1b6f9bc0b640',
 'c9f94710-424b-4cbb-9430-a7fd155a8d78']

In [39]:
vector_store.index_to_docstore_id

{0: '3451070b-ee1f-4917-b5c1-8f1c28be81ee',
 1: '1bf0b897-f0ad-462f-8211-1b6f9bc0b640',
 2: 'c9f94710-424b-4cbb-9430-a7fd155a8d78'}

In [44]:
results = vector_store.similarity_search("Tell me about AI",k=2)

In [45]:
results

[Document(id='3451070b-ee1f-4917-b5c1-8f1c28be81ee', metadata={}, page_content='AI is future'),
 Document(id='1bf0b897-f0ad-462f-8211-1b6f9bc0b640', metadata={}, page_content='AI is powerful')]

In [46]:
from langchain_core.documents import Document

document_1 = Document(
    page_content="The stock market reached an all-time high today.",
    metadata={"source": "news"}
)
document_2 = Document(
    page_content="The local football team won their championship game last night.",
    metadata={"source": "sports"}
)
document_3 = Document(
    page_content="Scientists have discovered a new species of bird in the Amazon.",
    metadata={"source": "news"}
)
document_4 = Document(
    page_content="Just saw the most amazing sunset! #blessed",
    metadata={"source": "tweet"}
)
document_5 = Document(
    page_content="Breaking: Major earthquake shakes the city center.",
    metadata={"source": "news"}
)
document_6 = Document(
    page_content="Can't believe my team lost in the last minute! #sportsfan",
    metadata={"source": "tweet"}
)
document_7 = Document(
    page_content="Olympic swimmer sets new world record in 100m freestyle.",
    metadata={"source": "sports"}
)
document_8 = Document(
    page_content="Elections results are in: incumbent wins by a narrow margin.",
    metadata={"source": "news"}
)
document_9 = Document(
    page_content="Just finished a 5k run, feeling great! #fitness",
    metadata={"source": "tweet"}
)
document_10 = Document(
    page_content="Basketball finals tonightâ€”who's watching?",
    metadata={"source": "tweet"}
)

documents = [
    document_1,
    document_2,
    document_3,
    document_4,
    document_5,
    document_6,
    document_7,
    document_8,
    document_9,
    document_10
]

In [47]:
index=faiss.IndexFlatIP(768)

vector_store = FAISS(
    embedding_function=embeddings_model,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={}
)


In [49]:
vector_store.add_documents(documents=documents)

['20845c39-cc2e-435a-8ab8-85403433f0b1',
 'e7a497e0-a720-4280-a171-ac1d4b6952e8',
 'f40a8d91-c15f-45a3-87ec-deb3a874c8cb',
 '886f48b1-49f5-4269-976c-683528321540',
 'b7ece64a-f1ef-4669-b6c2-c15b737c14ce',
 'da8b6e9f-4eb5-4723-9de8-2fa5d5cd5dfc',
 '2b108bd6-db83-4fcd-a24a-c03b656f5923',
 '855bf228-7588-405f-ad2d-876844a7ada7',
 '2d178e50-cf0c-4581-9fa9-a9b636e37570',
 '922309b6-dd03-4db4-9020-0ba7ec6afbad']

In [52]:
vector_store.similarity_search(
    "Tell me some news",
    # k=2,
    filter={"source":{"$eq":"news"}}
)

[Document(id='855bf228-7588-405f-ad2d-876844a7ada7', metadata={'source': 'news'}, page_content='Elections results are in: incumbent wins by a narrow margin.'),
 Document(id='7cbaa6b3-8d5e-492c-94f4-397a384ed267', metadata={'source': 'news'}, page_content='Elections results are in: incumbent wins by a narrow margin.'),
 Document(id='b7ece64a-f1ef-4669-b6c2-c15b737c14ce', metadata={'source': 'news'}, page_content='Breaking: Major earthquake shakes the city center.'),
 Document(id='60cd2757-d774-4afb-a1c9-2ad60a01b79d', metadata={'source': 'news'}, page_content='Breaking: Major earthquake shakes the city center.')]

In [53]:
retreiver = vector_store.as_retriever(search_kwargs={"k": 3})

In [55]:
retreiver.invoke("Tell me some news")

[Document(id='2d178e50-cf0c-4581-9fa9-a9b636e37570', metadata={'source': 'tweet'}, page_content='Just finished a 5k run, feeling great! #fitness'),
 Document(id='0734ef7f-24f2-43c6-a9c0-5765898aa185', metadata={'source': 'tweet'}, page_content='Just finished a 5k run, feeling great! #fitness'),
 Document(id='855bf228-7588-405f-ad2d-876844a7ada7', metadata={'source': 'news'}, page_content='Elections results are in: incumbent wins by a narrow margin.')]

In [56]:
vector_store.save_local("Today's faiss index")

In [58]:
new_vector_store = FAISS.load_local(
    "Today's faiss index",embeddings_model,allow_dangerous_deserialization=True
)

In [59]:
new_vector_store.similarity_search("Tell something about sports")

[Document(id='da8b6e9f-4eb5-4723-9de8-2fa5d5cd5dfc', metadata={'source': 'tweet'}, page_content="Can't believe my team lost in the last minute! #sportsfan"),
 Document(id='708c4062-d2f8-4f30-82f5-2b87eaa6a7ff', metadata={'source': 'tweet'}, page_content="Can't believe my team lost in the last minute! #sportsfan"),
 Document(id='e7a497e0-a720-4280-a171-ac1d4b6952e8', metadata={'source': 'sports'}, page_content='The local football team won their championship game last night.'),
 Document(id='b6a3f3ae-fef7-4204-8b8d-e464d5b1f462', metadata={'source': 'sports'}, page_content='The local football team won their championship game last night.')]

In [60]:
from langchain_community.document_loaders import PyPDFLoader

In [61]:
FILE_PATH = r"C:\Users\SRJ\SRJ\Work\agentic_ai\2-Langchain Basics\2.4-VectorDatabase\FAISS\data\sample_document.pdf"

In [62]:
loader = PyPDFLoader(FILE_PATH)

In [66]:
pages = loader.load()

In [64]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [65]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size = 500,
    chunk_overlap = 50
)

In [69]:
split_docs = splitter.split_documents(pages)

In [70]:
len(split_docs)

6

In [71]:
index=faiss.IndexFlatIP(768)

vector_store = FAISS(
    embedding_function=embeddings_model,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={}
)

In [72]:
vector_store.add_documents(documents = split_docs)

['d833d1b3-e247-419c-91a6-8d32a7083fe1',
 '555fd574-3df9-4cfe-95b3-d9d4266d43ac',
 'c32d334c-47a4-4f02-a7a0-36daafe665a8',
 '8a924275-e621-4c00-ab30-2de32c3c9417',
 'f6a5f074-23bb-4b13-ac14-3c5e5bd4dea9',
 '21af5227-a2b0-4d75-878c-0168a7f2d379']

In [73]:
retreiver = vector_store.as_retriever(search_kwargs={"k": 3})

In [74]:
retreiver.invoke("Tell me about data ingestion")

[Document(id='c32d334c-47a4-4f02-a7a0-36daafe665a8', metadata={'producer': 'ReportLab PDF Library - www.reportlab.com', 'creator': '(unspecified)', 'creationdate': '2025-08-03T15:53:14+05:00', 'author': '(anonymous)', 'keywords': '', 'moddate': '2025-08-03T15:53:14+05:00', 'subject': '(unspecified)', 'title': '(anonymous)', 'trapped': '/False', 'source': 'C:\\Users\\SRJ\\SRJ\\Work\\agentic_ai\\2-Langchain Basics\\2.4-VectorDatabase\\FAISS\\data\\sample_document.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1'}, page_content='volumes while maintaining performance and reliability.\nBest Practices\n\x7f Data Validation: Implement comprehensive validation rules to ensure data quality \x7f\nError Handling: Design robust error handling mechanisms for failed ingestion\nattempts \x7f Monitoring: Set up monitoring and alerting for ingestion pipelines \x7f\nDocumentation: Maintain clear documentation of data sources, schemas, and\nprocesses \x7f Security: Implement appropriate security measu

In [75]:
from langchain_google_genai import ChatGoogleGenerativeAI
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash")

In [76]:
from langchain import hub
prompt = hub.pull("rlm/rag-prompt")

In [77]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [78]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [80]:
rag_chain = (
    {"context":retreiver | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [81]:
rag_chain.invoke("Tell me about ingestion")

'Data ingestion is a fundamental process in modern data analytics and machine learning workflows. It involves collecting, importing, and processing data from various sources for immediate use or storage in databases. Effective data ingestion forms the foundation of successful data-driven organizations.'