In [12]:
# install langchain and all other helper libraries
# chromadb is the database to store vector embedings
!pip3 install --upgrade pip
!pip3 install --quiet --upgrade langchain langchain-community langchain-openai chromadb
# pypdf reads pdf using python
# streamlit is the api to build webapplication using python
!pip3 install --quiet  --upgrade pypdf pandas streamlit python-dotenv



# 1. Import all necessary Packages and Modules

In [14]:
# import langchain modules
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import Chroma
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate

import os
import shutil
from dotenv import load_dotenv

In [15]:
# read all key-value pairs from the hidden .env file and set them as environment variables
load_dotenv()
# os.environ.get returns None when OPENAI_API_KEY is not set
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# 2. Define and load llm

In [5]:
# chat model shared by every generation step in this notebook
llm = ChatOpenAI(model="gpt-4o-mini", api_key=OPENAI_API_KEY, temperature=0)

# test the model with a question that does not need the PDF
response = llm.invoke("what are the 7 wonders of the worlds?")
print(response.content)

The term "Seven Wonders of the World" can refer to different lists, but the two most commonly referenced are the **Seven Wonders of the Ancient World** and the **New Seven Wonders of the World**.

### Seven Wonders of the Ancient World:
1. **Great Pyramid of Giza** (Egypt) - The only surviving wonder, it was built as a tomb for the Pharaoh Khufu.
2. **Hanging Gardens of Babylon** (Iraq) - Said to be an extraordinary series of tiered gardens, though its existence is debated.
3. **Statue of Zeus at Olympia** (Greece) - A giant statue made by the sculptor Phidias, honoring the god Zeus.
4. **Temple of Artemis at Ephesus** (Turkey) - A large temple dedicated to the goddess Artemis, known for its grandeur.
5. **Mausoleum at Halicarnassus** (Turkey) - A tomb built for Mausolus, a satrap of the Persian Empire, known for its architectural beauty.
6. **Colossus of Rhodes** (Greece) - A giant statue that stood at the entrance of the harbor of Rhodes.
7. **Lighthouse of Alexandria (Pharos of Alex

# 3. Load the PDF file and split it in chunks

In [6]:
# Read the PDF that RAG will run on; load() returns a list with the pages of the PDF.
loader = PyPDFLoader("data/world_religions.pdf")
document_pages = loader.load()

# Cut the pages into overlapping chunks so that only the relevant passages,
# not the whole document, have to be supplied as context later on.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " "],
    length_function=len,
    chunk_overlap=200,
    chunk_size=1000,
)
chunks = text_splitter.split_documents(document_pages)
print(len(chunks))

Ignoring wrong pointing object 20 0 (offset 0)
Ignoring wrong pointing object 23 0 (offset 0)
Ignoring wrong pointing object 29 0 (offset 0)
Ignoring wrong pointing object 36 0 (offset 0)
Ignoring wrong pointing object 39 0 (offset 0)
Ignoring wrong pointing object 41 0 (offset 0)
Ignoring wrong pointing object 50 0 (offset 0)
Ignoring wrong pointing object 67 0 (offset 0)
Ignoring wrong pointing object 285 0 (offset 0)
Ignoring wrong pointing object 288 0 (offset 0)
Ignoring wrong pointing object 295 0 (offset 0)
Ignoring wrong pointing object 306 0 (offset 0)
Ignoring wrong pointing object 311 0 (offset 0)
Ignoring wrong pointing object 324 0 (offset 0)
Ignoring wrong pointing object 330 0 (offset 0)
Ignoring wrong pointing object 332 0 (offset 0)
Ignoring wrong pointing object 339 0 (offset 0)
Ignoring wrong pointing object 351 0 (offset 0)
Ignoring wrong pointing object 353 0 (offset 0)
Ignoring wrong pointing object 360 0 (offset 0)
Ignoring wrong pointing object 367 0 (offset 0)


59


# 4. Create embeddings for the document chunks and store them in vector DB

In [7]:
# Directory handed to Chroma as persist_directory. Anything already at this
# path is removed first, so the vector store is created in a clean location.
PERSIST_DIR = "vectorstore"
if os.path.exists(PERSIST_DIR):
    shutil.rmtree(PERSIST_DIR)

# this function creates the embeddings object that is used to generate embeddings


def embeddings_factory():
    """Build the embeddings client used to embed the document chunks.

    Returns:
        An OpenAIEmbeddings instance for the "text-embedding-3-small" model,
        authenticated with OPENAI_API_KEY.
    """
    embedding_client = OpenAIEmbeddings(
        model="text-embedding-3-small",
        api_key=OPENAI_API_KEY,
    )
    return embedding_client


embeddings_function = embeddings_factory()

# Optional sanity check (off by default): prints the embedding distance
# between two related terms to see how close they are to each other.
run_embedding_test = False
if run_embedding_test:
    # imported here because it is only needed for this optional check
    from langchain.evaluation import load_evaluator
    evaluator = load_evaluator(
        evaluator="embedding_distance", embeddings=embeddings_function)
    # call the public evaluate_strings() entry point, not the private
    # _evaluate_strings() hook
    print(evaluator.evaluate_strings(prediction="India", reference="Taj Mahal"))

# create a vector store from the chunks, using embeddings_function to embed
# them and PERSIST_DIR as the on-disk location
vectorstore = Chroma.from_documents(
    documents=chunks, embedding=embeddings_function, persist_directory=PERSIST_DIR)
# No explicit vectorstore.persist() call: with Chroma 0.4+ documents are
# persisted automatically when persist_directory is set, and the manual
# method is deprecated (calling it only emits a deprecation warning).

  vectorstore.persist()


# 5. Query for relevant data from the vectorstore

In [8]:
# create a similarity-search retriever over the vector store; it looks up the
# stored chunks that are relevant to a user query
retriever = vectorstore.as_retriever(search_type="similarity")


def search_vectorstore(user_query, retriever):
    """Look up the documents that the retriever finds for a query.

    Args:
        user_query: Question text to search with.
        retriever: Object exposing an invoke(query) method.

    Returns:
        Whatever retriever.invoke(user_query) returns.
    """
    return retriever.invoke(user_query)


# the question asked by the user
user_query = "who is the supreme power according to Hinduism"
# fetch the chunks relevant to the question; they are reused when the prompt is built
searched_docs = search_vectorstore(user_query=user_query, retriever=retriever)


# helper that merges the retrieved documents into a single context string
def join_relevant_docs(docs):
    """Concatenate the page_content of every document into one string.

    Args:
        docs: Iterable of objects that have a page_content string attribute.

    Returns:
        The contents in their original order, separated by a "---" line
        with a blank line on each side. Empty input gives an empty string.
    """
    separator = "\n\n---\n\n"
    contents = [doc.page_content for doc in docs]
    return separator.join(contents)

# 6. Create the prompt for LLM with the relevant info extracted from vectorstore

In [9]:
# create a prompt template for the LLM to follow;
# {question} and {context} are placeholders that are filled in when the prompt is formatted
prompt_template_str = """
You are an assistant for question-answering tasks. 
Strictly use the following pieces of retrieved context to answer the question. 
If you don't know the answer or do not find the relevant information in the context, just say that you don't know. 
Use three sentences maximum and keep the answer concise.

Question: {question} 

Context: {context} 

Answer:
"""

# Build the chat prompt template, then fill it with the user's question and
# the joined retrieved chunks; the result is printed for inspection.
prompt_template = ChatPromptTemplate.from_template(prompt_template_str)
promt = prompt_template.format(
    question=user_query,
    context=join_relevant_docs(searched_docs),
)
print(promt)

Human: 
You are an assistant for question-answering tasks. 
Strictly use the following pieces of retrieved context to answer the question. 
If you don't know the answer or do not find the relevant information in the context, just say that you don't know. 
Use three sentences maximum and keep the answer concise.

Question: who is the supreme power according to Hinduism 

Context: and interpretations of the Vedas.  
 
Religious Beliefs 
At the foundation of Hinduism is the divinity 
(holiness) of the Vedas.  Eventually, multiple deities 
were also introduced.  The three most important 
include Brahma the Creator, Vishnu the Preserver, 
and Shiva the Destroyer.  Although there are three, 
all are considered part of a single, universal spirit or 
supreme being.  These three deities also illustrate the 
belief that the universe is in a constant cycle of 
beginning, continuing, and ending.  Karma is a law of 
Copyright © 2015 Instructomania Pavlovich 
Common Core Literacy

---

Station 7
Pan

# 7. Invoke LLM with the above generated prompt

In [10]:
# send the formatted prompt to the model and print only the text of its reply
response = llm.invoke(promt)
print(response.content)

According to Hinduism, the supreme power is represented by a single, universal spirit or supreme being, which is worshiped in three forms: Brahma the Creator, Vishnu the Preserver, and Shiva the Destroyer. These deities illustrate the belief in the cyclical nature of the universe. All three are considered part of the same divine essence.


# 8. Alternate Method of calling the llm and passing prompt using LangChain Chain

In [11]:
# Runnable pipeline: the input question goes to the retriever, whose results
# are joined into the "context" value, and is also passed through unchanged
# as "question"; both fill prompt_template, and the result is sent to llm.
rag_chain = (
    {"context": retriever | join_relevant_docs, "question": RunnablePassthrough()}
    | prompt_template
    | llm
)
# NOTE(review): this variable reuses the name of LangChain's AIMessage class;
# consider a snake_case name such as ai_message if nothing else reads it.
AIMessage = rag_chain.invoke(user_query)
print(AIMessage.content)

According to Hinduism, the supreme power is represented by a single, universal spirit or supreme being, which is manifested in three primary deities: Brahma the Creator, Vishnu the Preserver, and Shiva the Destroyer. These deities illustrate the belief in a constant cycle of creation, preservation, and destruction. Thus, while there are multiple deities, they are all considered part of the same ultimate reality.
