# HW3: Document QA based on RAG
> #### 👀 Attention❗:
> All parameters, prompts, and code appearing in this tutorial are for demonstration purposes only and may not be the best practices.  
> We encourage you to think creatively and try new things!

In [1]:
# %pip install pandas==2.2.3 jupyter==1.1.1 langchain==0.3.23 langchain-community==0.3.21 rich==14.0.0 openai==1.71.0 langchain-groq==0.3.2 langchain-ollama==0.3.1 faiss-gpu==1.7.2 numpy<2 rouge-score 

In [1]:
import logging
import json

from rich.console import Console
from rich.logging import RichHandler

console = Console(stderr=True, record=True)
log_handler = RichHandler(rich_tracebacks=True, console=console, markup=True)
logging.basicConfig(format="%(message)s",datefmt="[%X]",handlers=[log_handler])
log = logging.getLogger("rich")
# log.setLevel(logging.INFO)
log.setLevel(logging.DEBUG)

DEBUG: bool = False
DATASET_PATH: str = "../datasets/public_dataset.json"

USING_MODEL: str = "meta-llama/Meta-Llama-3.1-8B-Instruct"
USING_PORT: int = 8092
API_ENDPOINT: str = f"http://192.168.0.7:{USING_PORT}/v1"
API_KEY: str = "abc"

MODEL_TEMPERATURE: float = 0.3
MODEL_MAX_TOKENS: int = 128
RETRIEVE_TOP_K: int = 5

## Query a LLM

In [2]:
from langchain_community.llms.vllm import VLLMOpenAI

def get_llm(temperature: float=0.7, max_tokens: int=128):
  return VLLMOpenAI(
    base_url=API_ENDPOINT,
    api_key=API_KEY,
    model=USING_MODEL,
    temperature=temperature,
    max_tokens=max_tokens,
    frequency_penalty=0.5,
    presence_penalty=0,
    # streaming=True,
    # model_kwargs={"stop": ["."]},
  )

llm = get_llm(temperature=0.3, max_tokens=32)
response = llm.invoke("How are you?")
print(response)

 I hope you're doing well. I'm not sure if you remember me, but we met at the conference last year. I wanted to reach out and follow


### Using Groq API

In [None]:
from langchain_groq import ChatGroq

llm = ChatGroq(
  model="llama-3.3-70b-versatile",
  # model="meta-llama/llama-4-scout-17b-16e-instruct",
  # model="llama-3.3-70b-specdec",
  # model="llama-3.1-8b-instant",
  api_key="gsk_xxx",
  temperature=0.6,
  max_tokens=128,
  # model_kwargs={"frequency_penalty": 0.8},
)

response = llm.invoke("How are you?")
console.print(response)

## A simple chat chain

In [3]:
# SYSTEM_PROMPT: str = "You are a helpful assistant."
# SYSTEM_PROMPT: str = "You are a helpful assistant. You will try your best to help the user."
SYSTEM_PROMPT: str = """Answer the question based on the context below. If the question cannot be answered using the information provided answer with "I don't know"."""

CHAT_TEMPLATE_CAG = (
f"""system: {SYSTEM_PROMPT}
human: context: {{document}}\nquestion: {{input}}
assistant: """
)
# {{document}} => {document}
print(CHAT_TEMPLATE_CAG)

system: Answer the question based on the context below. If the question cannot be answered using the information provided answer with "I don't know".
human: context: {document}
question: {input}
assistant: 


In [8]:
from langchain_core.language_models.llms import BaseLLM
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate

# LLM Chain
def llm_chat_chain(llm: BaseLLM):
  chat_prompt = PromptTemplate.from_template(template=CHAT_TEMPLATE_CAG)
  llm_chain = chat_prompt | llm | StrOutputParser()
  return llm_chain


llm = get_llm(MODEL_TEMPERATURE, MODEL_MAX_TOKENS)
llm_chain = llm_chat_chain(llm)

## Load and show the Pubilc Dataset

In [6]:
with open(DATASET_PATH, "r") as f:
  dataset = json.load(f)

demo_id = 45
demo_title = dataset[demo_id]["title"]
demo_full_text = dataset[demo_id]["full_text"]
demo_question = dataset[demo_id]["question"]
demo_answer = dataset[demo_id]["answer"]
demo_evidence = dataset[demo_id]["evidence"]

display(dataset[demo_id])
# print(f'title: {demo_title}')
# print(f'question: {demo_question}')
# print(f'answer: {demo_answer}')
# print(f'evidence: {demo_evidence}')
# print(f'\nFull text: \n{demo_full_text}')

{'title': 'Extractive Summarization of Long Documents by Combining Global and Local Context',
 'full_text': 'Abstract\nIn this paper, we propose a novel neural single document extractive summarization model for long documents, incorporating both the global context of the whole document and the local context within the current topic. We evaluate the model on two datasets of scientific papers, Pubmed and arXiv, where it outperforms previous work, both extractive and abstractive models, on ROUGE-1, ROUGE-2 and METEOR scores. We also show that, consistently with our goal, the benefits of our method become stronger as we apply it to longer documents. Rather surprisingly, an ablation study indicates that the benefits of our model seem to come exclusively from modeling the local context, even for the longest documents.\n\n\nIntroduction\nSingle-document summarization is the task of generating a short summary for a given document. Ideally, the generated summaries should be fluent and coherent,

### Here's a quick implementation of Cache-Augmented Generation (CAG)
Let's just `put the whole paper context into the prompt`!

---
To build a multi-stage solution, we can simply change:

`document` in `query = {"document": doc_content, "input": demo_question}` to `history`


In [9]:
from langchain.callbacks.tracers import ConsoleCallbackHandler  # It provides additional details during LLM invocation with chaining.

doc_content = f"{demo_full_text}"
query = {"document": doc_content, "input": demo_question}

print(f"{demo_question = }")
print(f"{demo_answer = }")
response = []
for chunk in llm_chain.stream(query, config={"callbacks": [ConsoleCallbackHandler()]} if DEBUG else None):
  response.append(chunk)
  print(chunk, end="")
print("")

chat_response = "".join(response)
console.print(chat_response)

demo_question = 'What do they mean by global and local context?'
demo_answer = ['global (the whole document) and the local context (e.g., the section/topic) ']
 In this context, global context refers to the overall information or representation of the entire document, while local context refers to the specific information or representation of a particular topic or section within the document. The authors use these terms to describe how their model incorporates both global and local information when deciding which sentences to include in an extractive summary.  In other words, the global context provides a broad understanding of the document as a whole, while the local context provides more focused information about specific topics or sections within that document.  The authors use LSTM-Minus to capture the local context, which is a method for learning embeddings of text spans that can represent


# How to RAG?

## Construct the Document and perform preprocessing
A `Document` in LangChain is a fundamental data structure used to represent and manage text-based information.

In [10]:
from langchain.docstore.document import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Converting texts to documents
documents = demo_full_text.split("\n\n\n")[:-1]
docs = [Document(page_content=doc) for doc in documents]

# Split the Documents
text_splitter = RecursiveCharacterTextSplitter(
  chunk_size=256, # number of characters
  chunk_overlap=128,
  length_function=len,
  add_start_index=True,
)

docs_splits = text_splitter.split_documents(docs)

### Construct the `embedding model` and `vectorstore`

In [15]:
from langchain_ollama import OllamaEmbeddings
from langchain.vectorstores import FAISS
from langchain_core.vectorstores import InMemoryVectorStore

# Using Ollama embedding model
embeddings = OllamaEmbeddings(
  model="snowflake-arctic-embed2:568m-l-fp16",
  keep_alive=3000,
)

# FAISS vector_store
# vector_store = FAISS.from_documents(docs_splits, embeddings)
vector_store = InMemoryVectorStore.from_documents(docs_splits, embeddings)

The function `RetrievalQA.from_chain_type` is outdated.
([see more](https://python.langchain.com/api_reference/langchain/chains/langchain.chains.retrieval_qa.base.RetrievalQA.html))

Feel free to apply new functions to implement the information retrieval. 
Check out [Migrating from RetrievalQA](https://python.langchain.com/docs/versions/migrating_chains/retrieval_qa/)

*For the latest implementation, please refer to the cell two cells after this one.*

In [18]:
# from langchain.chains import RetrievalQA
from langchain.chains.retrieval_qa.base import RetrievalQA

rag_qa_chain = RetrievalQA.from_chain_type(
  llm=llm,
  chain_type="stuff",  # Merging all retrieved documents into a single context.
  retriever=vector_store.as_retriever(search_kwargs={"k": RETRIEVE_TOP_K}, search_type="similarity"),
  return_source_documents=True
)

response = rag_qa_chain.invoke(f"{SYSTEM_PROMPT} {demo_question}")
# response = rag_qa_chain.invoke(f"{demo_question}")
# response
print(f"Q: {demo_question}")
print(f"GT: {demo_answer}\n")
print(f"A: {response['result']}")

Q: What do they mean by global and local context?
GT: ['global (the whole document) and the local context (e.g., the section/topic) ']

A:  In the context of the given text, global context refers to the information about the whole document, while local context refers to the information about a specific sentence or topic segment within that document. 

Note: The question is based on the provided text and requires an understanding of the context in which "global" and "local" are used. The answer should be concise and accurate based on the information given. 

Final Answer: Global context refers to information about the whole document, while local context refers to information about a specific sentence or topic segment within that document. I don't know if you need more of an explanation than that. If you do


latest implementation

In [None]:
from langchain.chains.retrieval import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
# from langchain_core.prompts import ChatPromptTemplate

# PROMPT_RAG: str = (
#   """Use the given context to answer the question. 
# If you don't know the answer, say you don't know. 
# Keep the answer concise. Don't provide irrelevant information."""
# )

# retrieval_qa_prompt = ChatPromptTemplate(
#   [
#     ("system", PROMPT_RAG),
#     ("human", "Context: {context}\nQuestion: {input}\nAnswer:"),
#     # ("human", "Context: {context}\nQuestion: {input}"),
#   ]
# )

CHAT_TEMPLATE_RAG = (
"""human: context: {context}\nquestion: {input}
assistant: """
)
retrieval_qa_prompt = PromptTemplate.from_template(template=CHAT_TEMPLATE_RAG)

# console.print(retrieval_qa_prompt)

combine_docs_chain = create_stuff_documents_chain(llm, retrieval_qa_prompt)
rag_qa_chain = create_retrieval_chain(
  retriever=vector_store.as_retriever(search_kwargs={"k": RETRIEVE_TOP_K}, search_type="similarity"), 
  combine_docs_chain=combine_docs_chain
  )
response_new = rag_qa_chain.invoke({"input": f"{demo_question}"})

# response_new
print(f"Q: {demo_question}")
print(f"GT: {demo_answer}\n")
print(f"A: {response_new['answer']}")

Q: What do they mean by global and local context?
GT: ['global (the whole document) and the local context (e.g., the section/topic) ']

A:  In the context of natural language processing (NLP), "global context" and "local context" refer to different types of information that can be used to understand the meaning of a sentence or a piece of text.

**Local Context:**
Local context refers to the information that is immediately surrounding a word or phrase, such as the words or phrases that appear before and after it. This can include things like:

* The previous sentence or paragraph
* The current sentence or phrase
* The words or phrases that are closely related to the current word or phrase

In other words, local context is about understanding the immediate surroundings of a piece of


### Collect all the retrieved chunk in the document

In [20]:
# retrieved_list = []
# for retrieved_chunk in response["source_documents"]:
#   retrieved_list.append(retrieved_chunk.page_content)
# display(retrieved_list)

# This is the latest implementation, but it may not be significantly different.
retrieved_list = []
for retrieved_chunk in response_new["context"]:
  retrieved_list.append(retrieved_chunk.page_content)
display(retrieved_list)

['information (i.e. local context) is added. And the improvement is even greater when we only consider long documents. Rather surprisingly, this is not the case for the global context. Adding a representation of the whole document (i.e. global context)',
 'surprisingly, this is not the case for the global context. Adding a representation of the whole document (i.e. global context) never significantly improves performance. In essence, it seems that all the benefits of our model come exclusively from modeling',
 'global context are all contextual information of the given sentence, we use an attention mechanism to decide the weight of each context vector, represented as INLINEFORM0  where the INLINEFORM0 is the weighted context vector of each sentence INLINEFORM1 ,',
 'of sentence INLINEFORM3 with topic segment information and global context information. Attentive context As local context and global context are all contextual information of the given sentence, we use an attention mechanis

## Evidence score (ROUGE-L)

In [21]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
fmeasure_scores = []

for chunk in retrieved_list:
  scores = scorer.score_multi(  # using maximum f-measure
    targets=demo_evidence,
    prediction=chunk
    )
  # print(f"{chunk = }")
  # print(f"* f-measure = {scores['rougeL'].fmeasure:.4f}\n")
  fmeasure_scores.append(scores["rougeL"].fmeasure)

final_evidence_score = sum(fmeasure_scores) / len(fmeasure_scores)
print(f"{final_evidence_score = :.4f}")

final_evidence_score = 0.2170


## Using LLM to Judge the answer generated by the RAG system

In [22]:
PROMPT_JUDGEMENT: str = (
  """Assume you are a human expert in grading predictions given by a model. You are given a document, a question and a model prediction. Judge if the prediction matches the ground truth answer by following these steps:
1: Take it as granted that the Ground Truth is always correct.
2: If the Prediction indicates it is not sure about the answer, "score" should be "0"; otherwise, go the next step.
3: If the Prediction exactly matches the Ground Truth, "score" is 1.
4: If the Prediction does not exactly match the Ground Truth, go through the following steps.
5: If the Ground Truth is a number, "score" is 1 if and only if the Prediction gives a number that almost exactly matches the ground truth.
6: If the Prediction is self-contradictory, "score" must be 0.
7: If the prediction is not answering the question, "score" must be 0.
8: If the prediction is a concise and correct summary of the ground truth, "score" is 1.
9: If ground truth contains a set of items, prediction must contain exactly same items for the score to be 1.
10: Otherwise, "score" is 0.
Keep the answer concise. Don't provide irrelevant information.
""")

PROMPT_JUDGE_CONTENT = (
"""document: {document}
question: {question}
Ground Truth: {answer}
Prediction: {prediction}
""")

CHAT_JUDGE_TEMPLATE = (
  f"system: {PROMPT_JUDGEMENT}\n"
  f"human: {PROMPT_JUDGE_CONTENT}\n"
  "assistant: The score is "
)

# print(CHAT_JUDGE_TEMPLATE)

In [37]:
llm_judge = VLLMOpenAI(
  base_url=API_ENDPOINT,
  api_key=API_KEY,
  model=USING_MODEL,
  temperature=0.6,
  max_tokens=128,
  frequency_penalty=1.6,
  presence_penalty=0.8,
  model_kwargs={"stop": ["```", "}}"]},
)

chat_prompt = PromptTemplate.from_template(template=CHAT_JUDGE_TEMPLATE)
llm_judge_chain = chat_prompt | llm_judge | StrOutputParser()

query = {
  # "document": f"Paper title: {demo_title}\n" + demo_full_text.split("\n\n\n")[0] + "\n" + str(demo_evidence),
  "document": f"Paper title: {demo_title}\n" + str(demo_evidence),
  "question": demo_question,
  "answer": " ".join(demo_answer),
  # "prediction": response["result"]
  "prediction": response_new["answer"]
  }

# print(f"{demo_question = }")
# print(f"{demo_answer = }")
_response = ["The score is "]
print("The score is ", end="")
for chunk in llm_judge_chain.stream(query, config={"callbacks": [ConsoleCallbackHandler()]} if DEBUG else None):
  _response.append(chunk)
  print(chunk, end="")
print("")

judge_response = "".join(_response)

The score is 0. The prediction is not concise and does not exactly match the Ground Truth, but it provides additional information that is not present in the Ground Truth. Additionally, while related to the topic, it's more of an explanation than a direct answer to what global and local context mean in this specific paper.  (Note: This response assumes that "score" should be 1 if the prediction matches or closely matches ground truth without extra unrelated information) 
However If I try to squeeze this into one of your steps:
8: Prediction is a concice correct summary -> No
9: Prediction contains same items as GT -> Yes for "global


### Using another LLM to extract the score

In [43]:
llm_judge_refine = VLLMOpenAI(
  base_url=API_ENDPOINT,
  api_key=API_KEY,
  model=USING_MODEL,
  temperature=0.3,
  max_tokens=4,
  frequency_penalty=1.6,
  presence_penalty=0.8,
  model_kwargs={"stop": [" "]},
)

CHAT_JUDGE_REFINE_TEMPLATE = (
"""assistant: {history}
user: {input}
assistant: {{"score": """
)

chat_prompt = PromptTemplate.from_template(template=CHAT_JUDGE_REFINE_TEMPLATE)
llm_judge_refine_chain = chat_prompt | llm_judge_refine | StrOutputParser()

query = {
  "history": judge_response,
  "input": """Please extract the score result from the last conversation and output in the JSON format. e.g., {"score": 0} or {"score": 1}.
  Please don't respond with extra content. Just output a JSON.
  """
  }

# print(f"{demo_question = }")
# print(f"{demo_answer = }")
_response = ['{"score": ']
print('{"score": ', end="")
for chunk in llm_judge_refine_chain.stream(query, config={"callbacks": [ConsoleCallbackHandler()]} if DEBUG else None):
  _response.append(chunk)
  print(chunk, end="")
print("")

judge_refine_response = "".join(_response)
correctness = round(json.loads(judge_refine_response)["score"], 0)
print(f"Answer correctness: {correctness}")


{"score": 0}
Answer correctness: 0


----


# sample_submission.json

In [21]:
dummy_json = {
  "title": "aaa",
  "answer": "bbb",
  "evidence": ["xxx", "yyy"],
}

display(dummy_json)
json_data = json.dumps(dummy_json, indent=2)
sample_submission = [dummy_json for _ in range(100)]  # emulate 100 data in private dataset

with open("sample_submission.json", "w") as f:
  json.dump(sample_submission, f, indent=2)

{'title': 'aaa', 'answer': 'bbb', 'evidence': ['xxx', 'yyy']}