### Chatbot_QA_Data_Augmentation_and_Evaluation

This notebook contains the following sections:
1. Import the dataset and then use LLM to generate question-answer example based on context from training data/RAG-chain
2. LLM assisted evaluation on Generated Question and Predicted Answer
3. Manual Evaluation / Review
4. Human Correction
5. Add generated qa pairs into dataset
6. Save the new dataset as a jsonl file for next round's evaluation use
7. Test loading the new dataset to make sure it works


### Import the dataset and then use LLM to generate question-answer example based on context from training data/RAG-chain

In [None]:
#Import libraries

from langchain.evaluation.qa import QAGenerateChain
from langchain_ollama import OllamaLLM
from langchain_community.document_loaders import HuggingFaceDatasetLoader

from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.prompts import PromptTemplate
from langchain.chains import ConversationalRetrievalChain
from langchain.chains import RetrievalQA

import langchain
from langchain.memory import ConversationBufferMemory

from langchain.evaluation.qa import QAEvalChain
from langchain.evaluation.qa import ContextQAEvalChain

from langchain.schema import Document
import copy

import json
from typing import Iterable

from tqdm import tqdm
import time

import pandas as pd
pd.set_option('display.max_colwidth', None) # Set pandas display options to show all characters

import warnings
# warnings.filterwarnings("ignore", category=UserWarning) # Suppress only UserWarnings
warnings.filterwarnings("ignore") # Suppress all Warnings

In [None]:
# Load the customer-support FAQ dataset from the Hugging Face Hub.
# The "answer" column becomes each Document's page_content.
dataset_name = "MakTek/Customer_support_faqs_dataset"
page_content_column = "answer"
loader = HuggingFaceDatasetLoader(
    path=dataset_name,
    page_content_column=page_content_column,
)
data = loader.load()

In [None]:
# Determine batch size

# Number of documents sent through QA generation and evaluation; defaults to the whole dataset.
batch_size = len(data)
# batch_size = 10  # uncomment for a quick trial run on a small subset

In [None]:
# Set model and prompt used in the QA-generation chain

# Prompt asking the LLM to write one QUESTION/ANSWER pair for a single document ({doc}).
custom_qa_gen_prompt_str = """System: You are the manager of Customer Service Support coming up with example questions from customers.
Given the following document, please generate a question and answer based on that document.

Example Format:
<Begin Document>
...
<End Document>
QUESTION: question here
ANSWER: answer here

Instruction: These questions should be detailed and be based explicitly on information in the document. Begin!

<Begin Document>
{doc}
<End Document>"""

# The keyword is `input_variables` (plural); the original `input_variable` was a typo.
custom_qa_gen_prompt = PromptTemplate(input_variables=["doc"], template=custom_qa_gen_prompt_str)

# Kept for anything that reads the library's module-level default prompt.
langchain.evaluation.qa.generate_prompt.PROMPT = custom_qa_gen_prompt

llm_model = "llama3.2"
# Give the chain the custom prompt explicitly. QAGenerateChain.from_llm() uses the default prompt
# object it imported when the library was loaded, so the reassignment above does not reach it.
example_gen_chain = QAGenerateChain(llm=OllamaLLM(model=llm_model), prompt=custom_qa_gen_prompt)

# Show the prompt the generation chain will actually use.
example_gen_chain.prompt

In [None]:
# Generate new QA examples, one document at a time so tqdm can report progress

start_time = time.time()

new_examples = []
for item in tqdm([{"doc": t} for t in data[:batch_size]], desc="Generating QA Examples"):
    # apply_and_parse returns a list of results for the inputs it is given; extending the
    # accumulator directly keeps new_examples flat without a quadratic sum(list_of_lists, []).
    new_examples.extend(example_gen_chain.apply_and_parse([item]))

end_time = time.time()
elapsed_time = (end_time - start_time) / 60
print(f"Elapsed time: {elapsed_time:.2f} minutes")

In [None]:
# Build one record per processed document, pairing the original question/context
# with the question/answer the LLM generated from it.

generated_qa_examples = [
    {
        "Dataset Index": str(idx),
        "Train-set Question": data[idx].metadata['question'],
        "Generated Question": new_examples[idx]['qa_pairs']['query'],
        "Train-set Context": data[idx].page_content,
        "Generated Answer": new_examples[idx]['qa_pairs']['answer'],
    }
    for idx in range(batch_size)
]

generated_qa_examples_df = pd.DataFrame(generated_qa_examples)



In [None]:
# Preview the first generated QA examples next to their source question and context.
generated_qa_examples_df.head()

### LLM assisted evaluation on Generated Question and Predicted Answer

In [None]:
# First let's create a chain.
# Let's use the same kind of chain we used in our customer service chatbot for consistency.
# A memory object is still attached (the prompt has a {chat_history} slot), but it is cleared
# before every question during evaluation so each question is answered independently.

embedding_model_chosen = "hkunlp/instructor-large"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}

hf_embeddings = HuggingFaceEmbeddings(model_name=embedding_model_chosen,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs)

persist_directory = 'docs/chroma/'

# NOTE(review): with a persist_directory, re-running this cell may add the same documents to an
# existing on-disk collection again — confirm, and clear the directory between runs if so.
vectordb = Chroma.from_documents(
    documents=data,
    embedding=hf_embeddings,
    persist_directory=persist_directory
)
# Retrieve 3 documents per question using maximal marginal relevance search.
retriever=vectordb.as_retriever(search_kwargs={"k": 3}, search_type="mmr")

base_prompt_template = """System: You are a ABC-Company customer service representative.
\n\nInstruction: First, if you know the answer: Answer the customer's question based on following context and chat history. Do not mention we have discussed this topic before in the previous conversation or ask any follow up question. Otherwise, if you do not know the answer: simply answer 'I am not sure about the answer, please contact our human service for assistance. Thank You!'.
\n\nContext: {context}
\n\nChat history: {chat_history}
\n\nQuestion: {question}
\n\nOutput Answer: """
prompt_input_list = ["context", "question", "chat_history"]

# Reuse the variable list defined above instead of repeating the literal.
BASE_PROMPT = PromptTemplate(
            template=base_prompt_template, input_variables=prompt_input_list
        )

memory = ConversationBufferMemory(
    memory_key="chat_history",
    input_key='question',
    output_key='answer',
    return_messages=True,
)

qa = ConversationalRetrievalChain.from_llm(
    llm=OllamaLLM(model=llm_model, temperature = 0.01),
    retriever=retriever,
    memory=memory,
    return_source_documents=True,  # the retrieved documents are needed later as grading context
    #return_generated_question=True,
    verbose=False,
    combine_docs_chain_kwargs={'prompt': BASE_PROMPT},
    # condense_question_prompt=condense_question_prompt
)



In [None]:
# Make sure memory is clear

memory.clear()
# Display the stored messages; this is expected to be empty right after clear().
memory.chat_memory.messages

In [None]:
# Collect the generated questions (one per processed document) as a plain list of strings

prediction_question_list = [
    new_examples[idx]['qa_pairs']['query'] for idx in range(batch_size)
]



In [None]:
# Feed each generated question to the chatbot chain and record its answer together with
# the context that was retrieved for it
start_time = time.time()


def _merge_unique_contexts(source_documents):
    """Concatenate the page_content of the retrieved documents, dropping exact duplicates.

    The order of first appearance is kept. Any number of documents is accepted (including
    none), so retrievals returning fewer or more than three documents do not raise.
    """
    unique_contents = dict.fromkeys(doc.page_content for doc in source_documents)
    return "".join(unique_contents)


prediction_answer_list = []
for question in tqdm(prediction_question_list, desc="Predicting answers for generated questions"):
    # Reset the chat history so each question is answered on its own; otherwise the history
    # keeps growing and responses become slow.
    memory.clear()
    answer = qa(question)  # Generate an answer

    # Append question, answer and the de-duplicated source context
    prediction_answer_list.append({"question": question,
                                   "answer": answer['answer'],
                                   "source_context": _merge_unique_contexts(answer['source_documents'])})
end_time = time.time()
elapsed_time = (end_time - start_time) / 60
print(f"Elapsed time: {elapsed_time:.2f} minutes")

In [None]:
# Create a DataFrame from prediction_answer_list

# One row per generated question, with the columns question, answer and source_context.
prediction_answer_list_df = pd.DataFrame(prediction_answer_list)
prediction_answer_list_df.head()

In [None]:
# Create a new grading prompt to edit the roles in the evaluation chain

# The grader sees the customer question ({query}), the retrieved context ({context}) and the
# chatbot's answer ({result}), and must reply with CORRECT or INCORRECT.
custom_grading_prompt_str = """System: You are a Customer Service Manager grading a Customer Service Chatbot's responses to customers' questions.
You are given a question, the context the question is about, and the Customer Service Chatbot's answer. You are asked to score the Chatbot's answer as either CORRECT or INCORRECT, based on the context.

Instruction: Grade the chatbot answer INCORRECT if it makes up more details than the provided context. Grade the chatbot answer as CORRECT even if the chatbot answer misses some details and does not mention all the context information provided. Ignore differences in punctuation and phrasing between the chatbot answer and context. 

Example Format:
CUSTOMER QUESTION: customer question here
CONTEXT: context the question is about here
CUSTOMER SERVICE CHATBOT ANSWER: customer service chatbot's answer here
GRADE: CORRECT or INCORRECT here

CUSTOMER QUESTION: {query}
CONTEXT: {context}
CUSTOMER SERVICE CHATBOT ANSWER: {result}
GRADE: """

# The keyword is `input_variables` (plural); the original `input_variable` was a typo.
custom_grading_prompt = PromptTemplate(input_variables=["query", "context", "result"], template=custom_grading_prompt_str)



In [None]:
# Point the library's module-level default grading prompts at the custom prompt.
# NOTE(review): code that already imported these names with `from ... import PROMPT` keeps the
# old objects, so this reassignment may have no effect on chains built later — confirm.
# The evaluation chain created further down is given the prompt explicitly and does not rely on it.
langchain.evaluation.qa.eval_prompt.CONTEXT_PROMPT = custom_grading_prompt
langchain.evaluation.qa.eval_prompt.PROMPT = custom_grading_prompt

In [None]:
# Check out the prompt used in the context evaluation chain
# langchain.evaluation.qa.eval_prompt.CONTEXT_PROMPT

In [None]:
# Check out the prompt used in the context evaluation chain
# langchain.evaluation.qa.eval_prompt.PROMPT

In [None]:
# Build the grader: a context-based QA evaluation chain driven by the custom grading prompt.
llm = OllamaLLM(model=llm_model)
# eval_chain = QAEvalChain.from_llm(llm)
eval_chain = ContextQAEvalChain.from_llm(
    llm=llm,
    prompt=custom_grading_prompt,
)

In [None]:
# Grade every predicted answer against the context that was retrieved for its question.
# prediction_answer_list serves as both `examples` and `predictions` because each record
# already holds the question, the retrieved context and the chatbot's answer.

start_time = time.time()
print("...Start Evaluating Answers...")
graded_outputs = eval_chain.evaluate(
    examples=prediction_answer_list,
    predictions=prediction_answer_list,
    question_key="question",
    context_key="source_context",
    prediction_key="answer",
)
print("...Done Evaluating Answers...")
end_time = time.time()
elapsed_time = (end_time - start_time) / 60
print(f"Elapsed time: {elapsed_time:.2f} minutes")

In [None]:
# Split the graded answers into a CORRECT list and an INCORRECT (needs human review) list,
# and compute the machine-evaluated score.
total_incorrect_counter = 0
total_correct_counter = 0
total_ungraded_counter = 0  # grader replies containing neither CORRECT nor INCORRECT
human_incorrect_ans_review = []
human_correct_ans_review = []

# Fixed column order; also guarantees the columns exist when one of the lists is empty.
review_columns = ["Generate-Index", "Generated Question", "Context Answer",
                  "Generated Answer", "Predicted Grade"]

for i, eg in enumerate(prediction_answer_list):
    grade_text = graded_outputs[i]['text']

    # "INCORRECT" contains "CORRECT", so it has to be tested first.
    if "INCORRECT" in grade_text:
        total_incorrect_counter += 1
        append_list = human_incorrect_ans_review

    elif "CORRECT" in grade_text:
        total_correct_counter += 1
        append_list = human_correct_ans_review

    else:
        # No recognizable grade: count it as incorrect so it is sent to human review,
        # instead of reusing the previous iteration's list (or failing on the first row).
        total_incorrect_counter += 1
        total_ungraded_counter += 1
        append_list = human_incorrect_ans_review

    append_list.append({"Generate-Index" : i,
                        "Generated Question" : eg['question'],
                        "Context Answer" : eg['source_context'],
                        "Generated Answer" : eg['answer'],
                        "Predicted Grade" : grade_text})

human_incorrect_ans_review_df = pd.DataFrame(human_incorrect_ans_review, columns=review_columns)
human_correct_ans_review_df = pd.DataFrame(human_correct_ans_review, columns=review_columns)

# Avoid a ZeroDivisionError when nothing was graded.
graded_total = len(graded_outputs) or 1

print("Machine Evaluation Total Score: " + str(int(total_correct_counter*100/graded_total)) + "%")
print("Machine Evaulation Total Incorrect Ans: " + str(int(total_incorrect_counter*100/graded_total)) + "%")
if total_ungraded_counter:
    print("Answers without a recognizable grade (counted as incorrect): " + str(total_ungraded_counter))

In [None]:
# Review table built from the answers the LLM grader marked INCORRECT.
human_incorrect_ans_review_df

### Manual Evaluation / Review

In [None]:
# langchain.debug = True
# memory.clear()
# qa.invoke(new_examples[2]['qa_pairs']["query"])

In [None]:
# langchain.debug = False

In [None]:
# Keep the rows of human_incorrect_ans_review_df where the chatbot said it did not know the answer.
# We will add those QA pairs to the data to expand and improve the chatbot's RAG knowledge capacity.

# Literal, case-insensitive match; na=False treats a missing answer as "no match" so the
# boolean mask stays valid.
dunno_ans_df = human_incorrect_ans_review_df[
    human_incorrect_ans_review_df['Generated Answer'].str.contains(
        "I am not sure about the answer", case=False, regex=False, na=False
    )
]

In [None]:
# Renumber the rows 0..n-1. Rebinding to a new frame (rather than an in-place reset on a
# filtered frame) keeps this cell idempotent: the in-place form added another index column
# on every re-run.
dunno_ans_df = dunno_ans_df.reset_index(drop=True)

In [None]:
# Keep the INCORRECT-graded rows where the chatbot did give an answer (i.e. drop the
# "I am not sure" rows). These are reviewed manually:
# - if the answer is actually correct, it is moved to the correct list;
# - if it is actually incorrect, the QA pair is added to the data to expand and improve
#   the chatbot's RAG knowledge capacity.

is_dunno_row = human_incorrect_ans_review_df['Generate-Index'].isin(dunno_ans_df['Generate-Index'])
human_filtered_incorrect_ans_review_df = human_incorrect_ans_review_df[~is_dunno_row]

In [None]:
# Display only: reset_index() returns a new frame and the result is not assigned.
human_filtered_incorrect_ans_review_df.reset_index()

### Human Correction

In [None]:
# Enter the Generate-Index values of the answers that, after manual review, should be moved
# to the correct list. Everything not listed here stays in the incorrect list.
human_correct_list_generate_index = [17, 27, 46, 63, 76, 85, 96, 117, 123, 127, 146, 173, 187]

# Select those rows and overwrite their grade. .assign() returns a new, independent frame,
# so the grade is not written onto a filtered slice of the review table.
human_correct_df = human_filtered_incorrect_ans_review_df[
    human_filtered_incorrect_ans_review_df['Generate-Index'].isin(human_correct_list_generate_index)
].assign(**{"Predicted Grade": "HUMAN REVIEWED GRADE: CORRECT"})

human_correct_df

In [None]:
# Final CORRECT set: answers the LLM grader marked CORRECT plus those a human re-graded as CORRECT.
human_correct_ans_final_df = pd.concat([human_correct_ans_review_df, human_correct_df])

In [None]:
# Inspect the combined CORRECT set.
human_correct_ans_final_df

In [None]:
# Number of answers moved from INCORRECT to CORRECT by the human review.
len(human_correct_df)

In [None]:
# Recompute the chatbot's score after the human review: every answer a human re-graded as
# CORRECT moves from the incorrect tally to the correct tally.

num_human_corrected = len(human_correct_df)
reviewed_correct_counter = total_correct_counter + num_human_corrected
reviewed_incorrect_counter = total_incorrect_counter - num_human_corrected

num_graded = len(graded_outputs)
print(f"Human Reviewed Evaluation Total Score: {int(reviewed_correct_counter*100/num_graded)}%")
print(f"Human Reviewed Evaulation Total Incorrect Ans: {int(reviewed_incorrect_counter*100/num_graded)}%")

### Add generated qa pairs into dataset
- After a human has reviewed the answers that were graded incorrect, we add the selected generated QA pairs to the dataset for the next round of evaluation, in order to improve the LLM chatbot.

In [None]:
# Check the dtypes of the two index columns before they are compared with isin().
print(generated_qa_examples_df["Dataset Index"].dtype)
print(human_correct_ans_final_df["Generate-Index"].dtype)

In [None]:
# Cast both index columns to int so that isin() compares like with like.
generated_qa_examples_df["Dataset Index"] = generated_qa_examples_df["Dataset Index"].astype(int)
human_correct_ans_final_df["Generate-Index"] = human_correct_ans_final_df["Generate-Index"].astype(int)

# Keep the generated QA pairs whose answer is NOT in the final CORRECT set; these are the
# ones to add to the dataset.
answered_correctly = generated_qa_examples_df["Dataset Index"].isin(
    human_correct_ans_final_df["Generate-Index"]
)
data_to_add_df = generated_qa_examples_df[~answered_correctly]

In [None]:
# Drop repeated QA pairs before adding them to the dataset.
# "Dataset Index" differs on every row, so comparing whole rows can never find a duplicate;
# compare only the generated question/answer text that is actually written to the dataset.
data_to_add_df_no_duplicate = (
    data_to_add_df
    .drop_duplicates(subset=["Generated Question", "Generated Answer"])
    .reset_index(drop=True)
)

In [None]:
# Work on a deep copy so the originally loaded `data` list is left untouched.
data_deep_copy = copy.deepcopy(data)
# data_deep_copy

In [None]:
# Append one Document per selected QA pair: the generated question goes into the metadata,
# and the page content holds the question and answer together as a worked example.
for gen_question, gen_answer in zip(data_to_add_df_no_duplicate["Generated Question"],
                                    data_to_add_df_no_duplicate["Generated Answer"]):
    data_to_append = Document(
        metadata={'question': gen_question},
        # page_content=gen_answer
        page_content="Example Question: " + gen_question + " " + "Example Answer: " + gen_answer,
    )
    data_deep_copy.append(data_to_append)



In [None]:
# Sanity check: original dataset size, number of QA pairs selected for adding, and size of
# the extended dataset.
print(len(data))
print(len(data_to_add_df_no_duplicate))
print(len(data_deep_copy))

### Save the new dataset as a jsonl file for next round's evaluation use

In [None]:
def save_docs_to_jsonl(array: "Iterable[Document]", file_path: str) -> None:
    """Write each document in ``array`` to ``file_path`` as one JSON object per line.

    Args:
        array: Documents to serialize; each must provide a ``json()`` method that returns
            its JSON representation as a string.
        file_path: Destination path. An existing file is overwritten.
    """
    # Explicit UTF-8 so the file does not depend on the platform's default encoding.
    with open(file_path, 'w', encoding='utf-8') as jsonl_file:
        jsonl_file.writelines(doc.json() + '\n' for doc in array)

def load_docs_from_jsonl(file_path) -> "Iterable[Document]":
    """Read a JSONL file written by ``save_docs_to_jsonl`` back into a list of Documents.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        A list with one ``Document`` per non-blank line, in file order.
    """
    docs = []
    with open(file_path, 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            # Tolerate blank lines (e.g. a trailing newline added by an editor).
            if not line.strip():
                continue
            record = json.loads(line)
            docs.append(Document(**record))
    return docs

In [None]:
# Write the extended dataset (original documents plus the added QA examples) to disk.
save_docs_to_jsonl(data_deep_copy,'dataset_with_appended_new_qa.jsonl')

In [None]:
# data_to_add_df

In [None]:
# Save as CSV
# data_to_add_df.to_csv("data_to_add_df.csv", index=False)

### Test loading the new dataset to make sure it works

In [None]:
# Round-trip check: reload the saved file and print how many documents it contains.
dataset_with_appended_new_qa=load_docs_from_jsonl('dataset_with_appended_new_qa.jsonl')
print(len(dataset_with_appended_new_qa))

In [None]:
# Show the last 10 reloaded documents; the newly added QA examples were appended at the end.
dataset_with_appended_new_qa[-10:]