In [17]:
from dotenv import load_dotenv
import numpy as np
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import TextLoader,DirectoryLoader,PyPDFLoader
from langchain_core.prompts import ChatPromptTemplate,PromptTemplate
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel

load_dotenv()
llm_model = ChatGoogleGenerativeAI(model = 'gemini-2.0-flash',max_retries=2)
embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2') ##embedding model

In [14]:
## Document/Directory Loader ##
doc_loaders = DirectoryLoader(
    path="data",
    glob="*.txt",
    loader_cls=TextLoader,
    loader_kwargs={"encoding":'utf-8'}
)
docs = doc_loaders.load()
print(f"Number Documents loaded : {len(docs)}")
for index,doc in enumerate(docs):
    print(f"Document number: {index+1}")
    print(f"Document data preview:\n{doc.page_content[0:100]}")
    print(f"Document metadata:\n{doc.metadata}")
    print("-------------------")

Number Documents loaded : 5
Document number: 1
Document data preview:
ChatGPT and Conversational AI

ChatGPT is one of the most widely known conversational AI models deve
Document metadata:
{'source': 'data\\chatgpt_conversational_ai.txt'}
-------------------
Document number: 2
Document data preview:
Convolutional Neural Networks (CNNs)

Convolutional Neural Networks, or CNNs, are a class of deep le
Document metadata:
{'source': 'data\\convolutional_neural_networks.txt'}
-------------------
Document number: 3
Document data preview:
Google Gemini and the Rise of Multimodal AI

Google Gemini represents a new generation of large-scal
Document metadata:
{'source': 'data\\google_gemini_multimodal_ai.txt'}
-------------------
Document number: 4
Document data preview:
Machine Learning Fundamentals

Machine Learning (ML) is a branch of artificial intelligence focused 
Document metadata:
{'source': 'data\\machine_learning_fundamentals.txt'}
-------------------
Document number: 5
Document data p

In [None]:
### text splitters ####
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 500,
    chunk_overlap = 60
    ,
    length_function=len,
    separators=[" "]
)
doc_chunks = text_splitter.split_documents(docs)
print(f"Total number of chunks created: {len(doc_chunks)}")
print(doc_chunks[10],"\n")
print(doc_chunks[11])

Total number of chunks created: 22
page_content='multiple sources.

Unlike traditional LLMs that are text-only, Gemini can interpret visual inputs, perform cross-modal reasoning, and produce outputs that combine modalities. This makes it ideal for use cases like visual question answering, document analysis, or creative generation. It also integrates with Google Search, Workspace, and YouTube, enabling intelligent summarization, content generation, and recommendation systems.

Technically, Gemini builds upon the Transformer architecture but' metadata={'source': 'data\\google_gemini_multimodal_ai.txt'} 

page_content='systems.

Technically, Gemini builds upon the Transformer architecture but extends it with multimodal embeddings. These embeddings allow the model to represent different types of input (e.g., text and image) in a shared space. This unified understanding improves contextual grounding and factual accuracy. Gemini also benefits from large-scale reinforcement learning and exten

In [22]:
## create the FAISS vectorstore ##
vector_store = FAISS.from_documents(
    documents=doc_chunks,
    embedding=embedding_model
)
print("Total number of vectors created in the vector store:",vector_store.index.ntotal)

##save the faiss vector store##
vector_store.save_local("faiss_index")
print("Vector store saved to faiss_index directory")

Total number of vectors created in the vector store: 22
Vector store saved to faiss_index directory


### RAG PIPELINE

In [23]:
def format_docs(docs):
    '''Creating a context from chunks'''
    return "\n\n".join(doc.page_content for doc in docs)


prompt = ChatPromptTemplate.from_template(
"""
You are an intelligent assisstant, for question-answering tasks,
Use the retrieved context given below to answer the question in a more streamlined way.
Answer in only 3 sentences

Context : {context}
Question : {question}

"""
)

retreiver = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k":3}
)


In [25]:
##RAG CHAIN##
rag_chain =( 
    {'context':retreiver | format_docs, "question":RunnablePassthrough()}
    | prompt
    | llm_model
    | StrOutputParser()        
    )
rag_chain.invoke("Explain Transformer architecture in few sentences")

'The Transformer architecture, introduced in 2017, revolutionized NLP by replacing RNNs with a self-attention mechanism for efficient long-range dependency modeling. It processes input sequences in parallel, offering scalability and flexibility across various domains like vision and audio. Its capabilities have made it the foundation for generative AI, enabling machines to reason and create.'