In [39]:
!pip install beautifulsoup4 lark



In [1]:
from dotenv import load_dotenv

In [2]:
load_dotenv()

True

# Basic RAG using LCEL

In [3]:
from langchain_openai import ChatOpenAI
from langchain_community.document_loaders import PyPDFLoader
import bs4
from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [4]:
llm = ChatOpenAI(model_name="gpt-4o-mini-2024-07-18", temperature=0)

## From PDF

In [5]:
# PDF path
file_path = (
    "PDF/The_Art_Of_War.pdf"
)
loader = PyPDFLoader(file_path)
pages = []
async for page in loader.alazy_load():
    pages.append(page)

In [6]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(pages)
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())

# Retrieve and generate using the relevant snippets of the blog.
retriever = vectorstore.as_retriever(search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.5,"k":3})
prompt = hub.pull("rlm/rag-prompt")
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [8]:
rag_chain.invoke("what is the 5 constant factors")

'The five constant factors are: (1) The Moral Law; (2) Heaven; (3) Earth; (4) The Commander; (5) Method and discipline. These factors are essential for determining the conditions in the field during warfare.'

In [21]:
resp = retriever.invoke("Tell me about Weak Points and Strong")

## From Web

# Self Querying

Jika ingin melakukan querying tidak hanya pada konteks dari dokumen yang diberikan, namun juga dari metadata dari dokumen tersebut

In [10]:
metadata_field_info = [
    AttributeInfo(
        name="source",
        description="Path to the source file",
        type="string",
    ),
    AttributeInfo(
        name="page",
        description="The page number of the document",
        type="integer",
    )
]
document_content_description = "Content of the book"

In [40]:
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectorstore,
    document_content_description,
    metadata_field_info,
)

In [41]:
retriever.invoke("five constant factors in what page ?")

[Document(metadata={'page': 26, 'source': 'PDF/The_Art_Of_War.pdf'}, page_content='34. The five elements (water, fire, wood, metal, earth)\nare not always equally predominant; the four seasonsmake way for each other in turn. There are short daysand long; the moon has its periods of waning and wax-ing.\n24\n/G53/G75/G6E/G20/G54/G7A/G75/G20/G6F/G6E/G20/G74/G68/G65/G20/G41/G72/G74/G20/G6F/G66/G20/G57/G61/G72'),
 Document(metadata={'page': 44, 'source': 'PDF/The_Art_Of_War.pdf'}, page_content='13. These six are the principles connected with Earth.\nThe general who has attained a responsible post mustbe careful to study them.\n14. Now an army is exposed to six several calamities,\nnot arising from natural causes, but from faults forwhich the general is responsible. These are: \n(1) Flight;(2) insubordination; (3) collapse; (4) ruin; (5) disorganization; (6) rout.\n15. Other conditions being equal, if one force is hurled\nagainst another ten times its size, the result will be theflight of th

In [45]:
prompt = ChatPromptTemplate.from_template("""
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to help you answer the question. 
if user question about pages just explain the given context. because the context is the page content.  
                                                                                    
Question: {question} 

Context: {context}
                                          
Use the following metadata if you need it.           

Answer:

                                       
""")

def format_docs(docs):
    context = f"""
context : {"\n\n".join(doc.page_content for doc in docs)+"\n\n".join(str(doc.metadata["page"]) for doc in docs)}
"""
    return context

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [30]:
rag_chain.invoke("tell me about page 20")

'Page 20 discusses the concept of energy in the context of fighting men. It compares their energy to that of rolling logs or stones, explaining that logs or stones remain motionless on level ground but move when on a slope. The text emphasizes that the energy developed by skilled fighters is akin to the momentum of a round stone rolling down a mountain, illustrating the idea of harnessing energy effectively in combat.'

In [47]:
rag_chain.invoke("tell me the concept of energy in fighting men in what page ?")

'The concept of energy in fighting men is discussed on page 22 of the context provided. It describes how combined energy can make fighting men behave like rolling logs or stones, emphasizing the importance of momentum and the nature of energy in battle. The text also compares energy to the bending of a crossbow, highlighting the strategic aspects of maintaining order amidst chaos and utilizing the strengths of individuals effectively.'

In [46]:
rag_chain.invoke("five constant factors in what page ?")

'The five constant factors are discussed on page 1 of the provided context. They are identified as follows: \n\n1. The Moral Law\n2. Heaven\n3. Earth\n4. The Commander\n5. Method and discipline\n\nThese factors are essential for consideration in military strategy and decision-making, as they influence the conditions in the field.'

In [49]:
rag_chain.invoke("five constant factors")

"The five constant factors mentioned in the context are essential elements to consider in the art of war. They are:\n\n1. **The Moral Law**: This factor ensures that the people are in complete accord with their ruler, leading them to follow him courageously, regardless of danger.\n2. **Heaven**: This refers to the natural elements and conditions such as night and day, cold and heat, and the changing of seasons.\n3. **Earth**: This encompasses the geographical and environmental aspects that can affect military operations.\n4. **The Commander**: This factor highlights the importance of the leader's qualities and capabilities in guiding the army.\n5. **Method and Discipline**: This involves the organization, training, and strategies employed by the military forces.\n\nThese factors are crucial for determining the conditions in the field during warfare."