<a href="https://colab.research.google.com/github/rujhannajib/mlbNittanyAI-rujhan/blob/main/Document_answering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Document answering app

- Objective: Develop a RAG that can answer questions based on documents uploaded.
- Model: GPT-40
- Example Document: Document about Malaysia
- Task: RAG
- Library: OpenAI, LangChain

By:
- Name: Muhammad Rujhan Najib Bin Fauzi Najib
- PSU ID: 950118053


In [1]:
# installing important libraries
!pip install -qU pypdf langchain_community langchain-openai langchain_openai


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/298.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m297.0/298.0 kB[0m [31m11.7 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m297.0/298.0 kB[0m [31m11.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m50.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m36.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from langchain_community.document_loaders import PyPDFLoader

# Put your pdf here
file_path = "/content/MCIA-MalaysiaHandbook.pdf"

# load pdf
loader = PyPDFLoader(file_path)

# extract text, metadata
docs = loader.load()

print(len(docs))

180


In [3]:
print(docs[0].page_content[0:100])
print(docs[0].metadata)

Malaysia Country Handbook
1.  This handbook provides basic reference information on Malaysia, includ
{'source': '/content/MCIA-MalaysiaHandbook.pdf', 'page': 0}


In [4]:
import os

# Replace 'your_api_key_here' with your actual API key
os.environ["OPENAI_API_KEY"] = ""

from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o")


In [5]:
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter


# split documents into smaller documents
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
# load smaller chunks into vector store, data stores that enable indexing & retrieving information based on vector representation.
vectorstore = InMemoryVectorStore.from_documents(
    documents=splits, embedding=OpenAIEmbeddings()
)


retriever = vectorstore.as_retriever()

In [6]:
from typing_extensions import final
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# design prompt
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)


question_answer_chain = create_stuff_documents_chain(llm, prompt)
# retrieve document based on similarity
rag_chain = create_retrieval_chain(retriever, question_answer_chain)


In [7]:
# define function to ask question
def ask_question(input_text):

    # results = rag_chain.invoke({"input": input_text})

    # Retrieve relevant documents
    retrieved_documents = retriever.invoke(input_text)

    # Filter out documents with repeated placeholders and join relevant content
    retrieved_context = " ".join(
        [doc.page_content for doc in retrieved_documents if "notes" not in doc.page_content.lower()]
    )

    # Check if there is any retrieved context; if not, default to "I don't know"
    if not retrieved_context.strip():
        final_answer = "I don't know"
    else:
        # Run the RAG model and validate the answer
        results = rag_chain.invoke({"input": input_text, "context": retrieved_context})
        final_answer = results if results else "I don't know"

    print("Question: ", input_text)
    if final_answer == "I don't know":
        print("Answer: I don't know")
    else:
        print(final_answer["answer"])
        print("Sources: ")
        for source in final_answer["context"]:
            print("Page:", source.metadata["page"])
        print("\n")


### Note: Currently having problems with model using outside knowledge instead of the knowledge available in the document uploaded. (see question 3)

In [8]:
print("Question 1")
ask_question("Where is Malaysia?")

print("Question 2")
ask_question("How is the weather in Malaysia?")

print("Question 3")
ask_question("Is Japan a beautiful country?")

Question 1
Question:  Where is Malaysia?
Malaysia is located in Southeast Asia, consisting of peninsular Malaysia on the Malay Peninsula and East Malaysia on the island of Borneo. It shares borders with Thailand to the north, and its east part is bordered by Indonesia and Brunei. Malaysia has maritime boundaries with Vietnam and the Philippines as well.
Sources: 
Page: 8
Page: 10
Page: 19
Page: 13


Question 2
Question:  How is the weather in Malaysia?
Malaysia has a tropical climate influenced by northern monsoons from November to March and southern monsoons from May to October. Average daily temperatures range from 21 to 32° C (70 to 90 ° F) throughout the year, with November and December being the wettest months. The west coast is generally dry, while the east coast and areas like Sarawak and Sabah experience more rainfall, often related to thunderstorms and typhoons.
Sources: 
Page: 13
Page: 13
Page: 16
Page: 14


Question 3
Question:  Is Japan a beautiful country?
I don't know.
So

In [9]:
# results = rag_chain.invoke({"input": input_text})
input_text = "Where is Malaysia?"
# Retrieve relevant documents
retrieved_documents = retriever.invoke(input_text)
# Filter out documents with repeated placeholders and join relevant content
retrieved_context = " ".join(
        [doc.page_content for doc in retrieved_documents if "notes" not in doc.page_content.lower()]
)

# Check if there is any retrieved context; if not, default to "I don't know"
if not retrieved_context.strip():
    final_answer = "I don't know"
else:
        # Run the RAG model and validate the answer
    results = rag_chain.invoke({"input": input_text, "context": retrieved_context})
    final_answer = results if results else "I don't know"

print("Question: ", input_text)
final_answer

Question:  Where is Malaysia?


{'input': 'Where is Malaysia?',
 'context': [Document(id='cedb36a2-de25-4e43-b186-0aca193af6c7', metadata={'source': '/content/MCIA-MalaysiaHandbook.pdf', 'page': 8}, page_content='1\nKEY FACTS\nCountry Name. Malaysia\nCountry Code.  MYS\nChief of State-Paramount Ruler. His Majesty Tuanku Syed Siraj ud-din\nibn al-Marhum Tengku Syed Petra Jamalulail, King of Malaysia; elected\nking on 12 December 2001 and installed 13 December 2001.\nCapital. Kuala Lumpur\nNational Flag.  Fourteen equal horizontal stripes of red (top) alternating\nwith white (bottom). There is a blue rectangle in the upper hoist-side\ncorner bearing a yellow crescent and a yellow, 14-pointed star; the cres-\ncent and the star are traditional symbols of Islam. \nTime Zone. GMT +8\nPopulation. 22,229,040 (2001); average annual growth rate –  1.96 percent.\nLanguages.  Bhasa Malaysia (official), English, Chinese, and Tamil\nCurrency. The local currency is the Malaysian ringgit (RM), which is divided\ninto 100 sen. Notes i