## Evaluate Flan-T5 LLM for answer generation

https://huggingface.co/collections/google/flan-t5-release-65005c39e3201fff885e22fb

In [1]:
# Root directory of the project; replace the placeholder before running.
ProjectRoot = "<PROVIDE PROJECT ROOT PATH>"
# Dataset directory under the project root. The trailing slash is kept so
# that file names can be appended to it directly.
DatasetRoot = f"{ProjectRoot}/Dataset/"

In [2]:
try:
    import bert_score
except ImportError:
    !pip install bert_score

try:
    from evaluate import load
except ImportError:
    !pip install evaluate

try:
    import rouge_score
except ImportError:
    !pip install rouge_score


In [3]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
import pandas as pd
import json
import bert_score
import numpy as np
import re
from evaluate import load
from tqdm import tqdm

In [4]:
# Load the context/question training set (produced earlier with doc2query).
train_df = pd.read_csv(f"{DatasetRoot}q_a_trainset.csv")

In [5]:
# Load the full articles from the raw-knowledge JSON file.
# The encoding is passed explicitly: JSON text is UTF-8, while the default
# used by open() depends on the platform locale (e.g. cp1252 on Windows)
# and could mis-decode non-ASCII characters.
with open(DatasetRoot + 'raw_knowledge.json', 'r', encoding='utf-8') as f:
    raw_text_json = json.load(f)

In [6]:
# Turn the loaded JSON mapping into a two-column frame (key -> 'raw_para_id',
# value -> 'raw_text') and cast the ids to int64, since JSON object keys are
# always strings.
raw_df = pd.DataFrame(
    list(raw_text_json.items()),
    columns=['raw_para_id', 'raw_text'],
).astype({'raw_para_id': 'int64'})

In [7]:
# Attach the raw paragraph text to each training row. The left join keeps
# every row of train_df, even when no raw paragraph has a matching id.
train_df = pd.merge(train_df, raw_df, how='left', on='raw_para_id')

### Evaluation

In [8]:
# Load the Flan-T5 checkpoint and its tokenizer.
# `trust_remote_code` is deliberately not passed: it only applies to the
# Auto* classes, is ignored by the concrete T5 classes (transformers emits a
# warning saying so), and needlessly opts in to executing hub-provided code.
model_name = "google/flan-t5-large"
model = T5ForConditionalGeneration.from_pretrained(model_name, torch_dtype="auto")
tokenizer = T5Tokenizer.from_pretrained(model_name)

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


#### Prompt Engineering

##### Experiment with different prompt templates
NOTE: The following prompts were created with the aid of ChatGPT.

In [9]:
def generate_prompt_fewshot(context, question):
    """Return a prompt with one worked example followed by the actual task.

    The prompt describes the task, shows a single context/question/answer
    example, and ends with *context* and *question* filled in and an open
    "Final Answer" slot.
    """
    return f"""
You are an expert in understanding and interpreting provided text contexts. Given a context and a question, your task is to generate an accurate and informative answer based on the provided context. Here is the structure:

1. **Context:** The detailed text or passage that contains the information needed to answer the question.
2. **Question:** A specific question that needs to be answered based on the context.

Please make sure your response is clear, concise, and directly addresses the question. If the context does not contain sufficient information to answer the question, say I don't know.

**Example:**

**Context:**
"The rainforests of the Amazon are home to a vast diversity of species, including numerous plants, animals, and insects. These forests play a crucial role in regulating the Earth's climate by absorbing carbon dioxide and releasing oxygen. However, deforestation poses a significant threat to these ecosystems, leading to loss of habitat and biodiversity."

**Question:**
"Why are the rainforests of the Amazon important for the Earth's climate?"

**Answer:**
"The rainforests of the Amazon are important for the Earth's climate because they absorb carbon dioxide and release oxygen, helping to regulate the climate."

Please follow this format for each question:

**Context:**
{context}

**Question:**
{question}

**Final Answer:**
"""



In [10]:
def generate_prompt_zeroshot(context, query):
    """Return a zero-shot prompt with *query* and *context* fenced in triple backticks.

    The prompt also tells the model to say so when the context holds no
    relevant information, and to answer concisely.
    """
    template = """Give the answer to the user query delimited by triple backticks ```{query}```
                using the information given in context delimited by triple backticks ```{context}```.
                If there is no relevant information in the provided context, tell user that you did not have any relevant context to base your answer on. Be concise and output the answer.
                """
    return template.format(query=query, context=context)

In [11]:
def generate_prompt_zeroshot_2(context, question):
    """Return a minimal zero-shot prompt: one instruction line, then labelled context and question."""
    layout = """
Given the following context, please answer the question:

Context:
{context}

Question:
{question}
"""
    return layout.format(context=context, question=question)

In [12]:
def generate_prompt_zeroshot_3(context, question):
    """Return a zero-shot prompt with an expert-role preamble and bold section headers.

    *context* and *question* are wrapped in double quotes, and the prompt
    ends with an open "Answer" section. The indentation inside the literal
    is part of the prompt text.
    """
    layout = """
    You are an expert in understanding and interpreting provided text contexts. Given a context and a question, your task is to generate an accurate and informative answer based on the provided context. Here is the structure:

    **Context:**
    "{context}"

    **Question:**
    "{question}"

    **Answer:**
    """
    return layout.format(context=context, question=question)

In [13]:
def generate_prompt_zeroshot_4(context, question):
    """Return a bare zero-shot prompt with backtick-fenced section labels.

    It contains only the quoted *context*, the quoted *question*, and an
    open "Answer" label. The indentation inside the literal is part of the
    prompt text.
    """
    layout = """
    ```Context:```
    "{context}"

    ```Question:```
    "{question}"

    ```Answer:```
    """
    return layout.format(context=context, question=question)

In [14]:
def generate_prompt_zeroshot_5(context, question):
    """Return a zero-shot prompt that puts the context first.

    The context is followed by an instruction to answer clearly, concisely
    and only from the given information, then *question* and an open
    "answer:" slot.
    """
    layout = """
Based on the context:
{context}

respond to the question clearly and concisely, ensuring that the answer is directly supported by the information given.

question:
{question}

answer:
"""
    return layout.format(context=context, question=question)

#### Calculate different metric scores

In [15]:
# LLM inference wrapper
def AskLLM(context, question, prompt_fn=None, max_length=500):
    """Generate the model's answer to *question* based on *context*.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        context: Passage text handed to the prompt builder.
        question: Question text handed to the prompt builder.
        prompt_fn: Callable ``(context, question) -> str`` that builds the
            prompt. Defaults to ``generate_prompt_zeroshot``; pass one of
            the other ``generate_prompt_*`` functions to evaluate a
            different template without editing this function.
        max_length: Length limit forwarded to ``model.generate``.

    Returns:
        The decoded text of the first generated sequence, with special
        tokens stripped.
    """
    if prompt_fn is None:
        prompt_fn = generate_prompt_zeroshot
    prompt = prompt_fn(context, question)

    # Put the encoded inputs on the same device as the model, so generation
    # also works after the model has been moved to an accelerator.
    inputs = tokenizer(prompt, return_tensors="pt", return_attention_mask=True)
    inputs = inputs.to(model.device)

    # Ask LLM to answer
    outputs = model.generate(**inputs, max_length=max_length)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [16]:
# Run the model on every training row, collecting its answers and the
# reference answers in the same row order.
true_answers = []
candidate_answers = []

eval_rows = zip(train_df.raw_text, train_df.question, train_df.Final_answer)
for context, question, reference_answer in tqdm(eval_rows, total=len(train_df)):
    true_answers.append(reference_answer)
    candidate_answers.append(AskLLM(context, question))


100%|██████████| 96/96 [05:36<00:00,  3.50s/it]


In [17]:
# BERTScore of the generated answers against the references, computed with
# the bert_score package and the 'bert-base-uncased' model
# ('roberta-large' is the alternative that was tried here).
bert_metrics = bert_score.score(
    cands=candidate_answers,
    refs=true_answers,
    model_type='bert-base-uncased',
    nthreads=4,
)

# The result is read positionally: [0] precision, [1] recall, [2] F1.
# Report the mean of each over all answers.
for metric_label, metric_values in zip(("Precision", "Recall", "F1 Score"), bert_metrics):
    print(f"Mean {metric_label}: {np.mean(np.array(metric_values))}")



Mean Precision: 0.6054932475090027
Mean Recall: 0.7196041941642761
Mean F1 Score: 0.6455835103988647


In [18]:
# BERTScore again, this time through the Hugging Face `evaluate` metric
# (https://huggingface.co/spaces/evaluate-metric/bertscore).
bertscore = load("bertscore")
bert_metrics2 = bertscore.compute(
    predictions=candidate_answers,
    references=true_answers,
    lang="en",
)

# Report the mean of each per-answer score list.
for metric_label, metric_key in (("Precision", "precision"), ("Recall", "recall"), ("F1 Score", "f1")):
    print(f"Mean {metric_label}: {np.mean(np.array(bert_metrics2[metric_key]))}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mean Precision: 0.8759204472104708
Mean Recall: 0.9076709852864345
Mean F1 Score: 0.8909476660192013


In [19]:
# METEOR through the Hugging Face `evaluate` metric
# (https://huggingface.co/spaces/evaluate-metric/meteor).
meteor = load('meteor')
meteor_score = meteor.compute(
    references=true_answers,
    predictions=candidate_answers,
)

print("METEOR Score: {}".format(meteor_score['meteor']))

[nltk_data] Downloading package wordnet to /home/sangram/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/sangram/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/sangram/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


METEOR Score: 0.46557510152413056


In [20]:
# ROUGE through the Hugging Face `evaluate` metric
# (https://huggingface.co/spaces/evaluate-metric/rouge).
# NOTE(review): the result name `rouge_score` rebinds the `rouge_score`
# module imported in the dependency-check cell; confirm nothing later needs
# the module under that name.
rouge = load("rouge")
rouge_score = rouge.compute(
    references=true_answers,
    predictions=candidate_answers,
)

print("ROUGE Score: {}".format(rouge_score))

ROUGE Score: {'rouge1': 0.4284925086458006, 'rouge2': 0.308228735511381, 'rougeL': 0.393647956576354, 'rougeLsum': 0.3943345327448875}
