In [78]:
import os

from dotenv import load_dotenv #Load environmental variables
from langchain.retrievers.multi_query import MultiQueryRetriever #Generate multiple query for user's single query
from langchain_community.document_loaders import PyPDFLoader #To load the pdf document
from langchain_community.vectorstores import FAISS #Faiss vectorstore
from langchain_core.messages import HumanMessage, AIMessage, trim_messages #Interaction between human and AI
from langchain_core.output_parsers import StrOutputParser 
from langchain_core.prompts import PromptTemplate #To create a Instruction prompt
from langchain_core.runnables import RunnableParallel, RunnablePassthrough #Building blocks used to compose the RAG chain
from langchain_groq import ChatGroq #Open source LLM models 
from langchain_huggingface import HuggingFaceEndpoint ,ChatHuggingFace ,HuggingFaceEmbeddings#To initilazie the model
from langchain_text_splitters import RecursiveCharacterTextSplitter #Perform chunking

load_dotenv()


## Step-1: Load the PDF Document

In [79]:
# Read the PDF at file_path and collect every document the loader yields
file_path = "Intro_about_AI.pdf"
loader = PyPDFLoader(file_path)
docs = list(loader.lazy_load())

In [80]:
len(docs) # Number of page-wise documents loaded from the PDF

7

## Step-2: Perform Chunking using RecursiveCharacterTextSplitter

In [4]:
# Configure the splitter: chunk sizes are measured with len, and the
# separators are listed in priority order (paragraph break, line break, period).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=120,
    length_function=len,
    separators=["\n\n", "\n", "."],
)

# Split the loaded documents into chunks
chunks = text_splitter.split_documents(docs)


In [5]:
len(chunks) # Total number of chunks produced by the splitter

12

## Step-3: Load the LLM and Embedding Model

In [6]:
#Loading the embedding model from huggingface
# Embedding model used to vectorise the chunks (and, later, the queries)
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# #loading the llm model from hugging face
# llm = HuggingFaceEndpoint(
#     repo_id = "mistralai/Mistral-7B-Instruct-v0.3", #Mistral model
#     temperature= 0.4,
#     max_new_tokens= 200,  #Maximum number of tokens to be generated in output
# )
# model = ChatHuggingFace(llm=llm)

In [None]:
#load the groq hosted model
# Chat model served by Groq; max_tokens caps the length of each generated reply
model = ChatGroq(
    model_name="Llama-3.3-70b-Versatile",
    max_tokens=1000,
)

## Step-4: Creating a VectorStore

In [None]:
VECTOR_STORE_PATH = "faiss_index"  # Folder that holds the persisted FAISS index

# Build and persist the index when the folder is missing; otherwise reuse the saved copy.
# NOTE: an existing folder is loaded as-is — the load branch never looks at `chunks`,
# so delete the folder to force a rebuild after changing the PDF or the chunking.
if not os.path.exists(VECTOR_STORE_PATH):
    print("Creating new vector store...")
    # Embed the chunks and index them
    vector_store = FAISS.from_documents(documents=chunks, embedding=embedding_model)
    # Persist the index so later runs can skip the embedding step
    vector_store.save_local(VECTOR_STORE_PATH)
    print(f"Vector store saved to {VECTOR_STORE_PATH}")
else:
    print("Loading existing vector store...")
    # NOTE(review): allow_dangerous_deserialization=True explicitly opts in to
    # deserializing the saved index — presumably only safe for folders written by
    # this notebook; confirm before pointing it at an index from elsewhere.
    vector_store = FAISS.load_local(
        VECTOR_STORE_PATH,
        embedding_model,
        allow_dangerous_deserialization=True,
    )

Creating new vector store...
Vector store saved to faiss_index


## Step-5: Creating a prompt template

In [None]:
# Instruction prompt for the answering step.
# The template tells the model to answer only from the supplied context and gives
# a fixed fallback reply for when the context is insufficient.
# Placeholders: {context} (retrieved material) and {input} (the user's question).
prompt_template = PromptTemplate(
    template = """You are a Smart Chat Assistant tasked with answering the user question , based on the context provided. Only answer based on the provided context .
    If the context does not contain enough information to answer, respond: I do not have enough information to answer your question.
    Keep answers concise, clear, and directly relevant to the question.
    'Context':
    {context}
    'Question':
    {input}""",
    input_variables=['context','input']  # must match the placeholders used in the template
)

## Step-6: Creating a retriever component 

In [None]:
# Multi query retriever takes 2 arguemnts : ("which llm to use","which retriever to use")
# MultiQueryRetriever.from_llm takes two pieces: the LLM to use and the
# underlying retriever to use.
multiquery_retriever = MultiQueryRetriever.from_llm(
    llm=model,
    retriever=vector_store.as_retriever(
        search_type="mmr",  # Maximum marginal relevance
        search_kwargs={
            "k": 6,  # number of similar documents to retrieve
            "lambda_mult": 0.3,  # favours diverse documents to reduce redundancy
        },
    ),
)

## Step-7: Creating a RAG_Chain / Pipeline

In [None]:
# Fan the chain input out to the two variables the prompt template expects:
#   'context' -> whatever the multi-query retriever returns for the input
#   'input'   -> the input itself, passed through unchanged
parallel_chain = RunnableParallel({
    'context': multiquery_retriever,
    'input': RunnablePassthrough(),
})

In [81]:
#Final rag chain , conbining : retreivers , template , model and output parser
# Final RAG chain: retrieval step -> prompt template -> model -> plain-string output
rag_chain = (
    parallel_chain
    | prompt_template
    | model
    | StrOutputParser()
)

In [None]:
# To store the Human and AI messages
# Conversation history: human and AI messages are appended here in order.
# Re-running this cell clears the history.
messages = []

In [82]:
#Question asked by user
# Question asked by the user; record it in the history as a human turn
query = "what are the 2 questions ?"
messages.append(HumanMessage(content=query))

In [83]:
# Show the conversation history collected so far
messages

[HumanMessage(content='what is this pdf about ?', additional_kwargs={}, response_metadata={}),
 AIMessage(content='This PDF appears to be an introduction to Artificial Intelligence (AI), covering topics such as the definition of AI, its benefits, examples of AI applications, and the future of AI, including its potential risks and regulations.', additional_kwargs={}, response_metadata={}),
 HumanMessage(content='can you further elaborate it ?', additional_kwargs={}, response_metadata={}),
 AIMessage(content='The PDF discusses the introduction to Artificial Intelligence (AI), covering various aspects such as its definition, benefits, examples, and future implications. \n\nIt starts by defining AI as computer systems that can perform tasks associated with human cognitive functions, such as interpreting speech, playing games, and identifying patterns. The PDF also explains how AI systems learn by processing large amounts of data and looking for patterns.\n\nThe benefits of AI mentioned in 

In [84]:
# Only store past 3 conversation due to token limit
# Only send the most recent turns to the chain, to stay within the token limit
selected_messages = trim_messages(
    messages,
    token_counter=len,  # len counts messages rather than tokens
    max_tokens=6,  # so this keeps at most the last 6 messages (3 exchanges)
    strategy="last",
    start_on="human",
    include_system=True,
    allow_partial=False,
)

# The chain forwards its input to the retriever and to the prompt's {input} slot,
# both of which work with plain text. Handing over the list of message objects
# meant they were given the list's string form (objects plus empty metadata
# fields), so render the trimmed history as a readable "<role>: <text>" transcript.
conversation_text = "\n".join(
    f"{msg.type}: {msg.content}" for msg in selected_messages
)

result = rag_chain.invoke(conversation_text)
messages.append(AIMessage(content=result))

In [85]:
# Print the whole conversation, one message at a time
for msg in messages:
    msg.pretty_print()


what is this pdf about ?

This PDF appears to be an introduction to Artificial Intelligence (AI), covering topics such as the definition of AI, its benefits, examples of AI applications, and the future of AI, including its potential risks and regulations.

can you further elaborate it ?

The PDF discusses the introduction to Artificial Intelligence (AI), covering various aspects such as its definition, benefits, examples, and future implications. 

It starts by defining AI as computer systems that can perform tasks associated with human cognitive functions, such as interpreting speech, playing games, and identifying patterns. The PDF also explains how AI systems learn by processing large amounts of data and looking for patterns.

The benefits of AI mentioned in the PDF include automating repetitive tasks, solving complex problems, reducing human error, and improving customer experience. It also highlights the application of AI in various fields, including healthcare, transportation, a