### Question-Answer pair for evaluation using GPT-4o mini, Ragas and llama_index

In [23]:
import dotenv, os, csv
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.openai import OpenAI

dotenv.load_dotenv()
Model = os.environ["Model"]

# load the document
llm = OpenAI(model = "gpt-3.5-turbo")
documents = SimpleDirectoryReader(input_files=["../data/Pmg_lds.md"]).load_data()



In [24]:
node_parser = SentenceSplitter(chunk_size=1024, chunk_overlap=50)
nodes = node_parser.get_nodes_from_documents(documents)
index = VectorStoreIndex(nodes)
query_engine = index.as_query_engine(llm=llm)

##### Dataset Generation

In [38]:
from llama_index.core.schema import BaseNode
from llama_index.llms.openai import OpenAI
from llama_index.core.llms import ChatMessage, MessageRole
from llama_index.core import ChatPromptTemplate, PromptTemplate
from typing import Tuple, List
import re

llm = OpenAI(model="gpt-3.5-turbo")
# llm = OpenAI(model = "gpt-4o-mini")

##### define the generate answers function to generate answers based on the context

In [35]:
QA_PROMPT = PromptTemplate(
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, "
    "answer the query.\n"
    "Query: {query_str}\n"
    "Answer: "
)


def generate_answers_for_questions(
    questions: List[str], context: str, llm: OpenAI
) -> str:
    """Generate answers for questions given context."""
    answers = []
    for question in questions:
        fmt_qa_prompt = QA_PROMPT.format(
            context_str=context, query_str=question
        )
        response_obj = llm.complete(fmt_qa_prompt)
        answers.append(str(response_obj))
    return answers

##### generate qa pairs over an entire list of nodes

In [43]:
QUESTION_GEN_USER_TMPL = (
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, "
    "generate the relevant questions. "
)

QUESTION_GEN_SYS_TMPL = """\
Your task is to generate \
{num_questions_per_chunk} thoughtful and relevant questions. \
Each question should be based on the source materials and focus on the text in the document called preach my gospel used by missionaries from the church of Jesus Christ of Latter Day saints. \
Ensure the questions are diverse and cover different aspects of the document. \
For example: How can I effectively find and teach people? \
"""

question_gen_template = ChatPromptTemplate(
    message_templates=[
        ChatMessage(role=MessageRole.SYSTEM, content=QUESTION_GEN_SYS_TMPL),
        ChatMessage(role=MessageRole.USER, content=QUESTION_GEN_USER_TMPL),
    ]
)


def generate_qa_pairs(
    nodes: List[BaseNode], llm: OpenAI, num_questions_per_chunk: int = 10
) -> List[Tuple[str, str]]:
    """Generate questions."""
    qa_pairs = []
    for idx, node in enumerate(nodes):
        print(f"Node {idx}/{len(nodes)}")
        context_str = node.get_content(metadata_mode="all")
        fmt_messages = question_gen_template.format_messages(
            num_questions_per_chunk=10,
            context_str=context_str,
        )
        chat_response = llm.chat(fmt_messages)
        raw_output = chat_response.message.content
        result_list = str(raw_output).strip().split("\n")
        cleaned_questions = [
            re.sub(r"^\d+[\).\s]", "", question).strip()
            for question in result_list
        ]
        answers = generate_answers_for_questions(
            cleaned_questions, context_str, llm
        )
        cur_qa_pairs = list(zip(cleaned_questions, answers))
        qa_pairs.extend(cur_qa_pairs)
    return qa_pairs

In [44]:
qa_pairs = generate_qa_pairs(
    nodes[:3],
    # nodes,
    llm,
    num_questions_per_chunk=10,
)

Node 0/3
Node 1/3
Node 2/3


In [45]:
import pandas as pd

pairs = pd.DataFrame(qa_pairs)
pairs.tail()

Unnamed: 0,0,1
25,How does the First Presidency message suggest ...,The First Presidency message suggests that mis...
26,What is the significance of the reference to M...,The reference to Moses 1:39 in the First Presi...
27,How does the First Presidency message in Preac...,The First Presidency message in Preach My Gosp...
28,In what ways does the First Presidency message...,The First Presidency message indicates that mi...
29,How can missionaries apply the teachings and e...,Missionaries can apply the teachings and exhor...


In [46]:
pairs.head()

Unnamed: 0,0,1
0,"How does the ""Preach My Gospel"" guide help mis...","The ""Preach My Gospel"" guide helps missionarie..."
1,"What are some key principles outlined in the ""...","Some key principles outlined in the ""Preach My..."
2,"How does ""Preach My Gospel"" emphasize the impo...","""Preach My Gospel"" emphasizes the importance o..."
3,In what ways does the document address the imp...,The document may address the importance of bui...
4,"How does ""Preach My Gospel"" guide missionaries...","""Preach My Gospel"" guides missionaries in adap..."


#### initialize the testset generator object with the corresponding generator and critic llms


In [12]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# initialize a l;sit to store Q-A pairs
qa_pairs = []

question_count = 0
max_question = 10

# loop through each document (section) to generate Q-A pairs
for doc in document:
    if question_count >= max_question:
        break
    
    while question_count < max_question:
        
        question = query_engine.query(f"Generate a question based on the following text: {doc.text}")
        answer = query_engine.query(f"Provide an answer to the following question: {question}\nBased on the text: {doc.text}")
        manual_quote = query_engine.query(f"if the {answer} is a direct quote or is the same text from the document source, then add the source. if not do not include the source text.")

        qa_pairs.append([question, answer, manual_quote])
        question_count += 1

print(qa_pairs[0])

NameError: name 'document' is not defined

In [None]:
# print(qa_pairs[0])

### Save Q-A pairs to CSV

In [14]:
csv_file = "./data/qa_dataset.csv"

#write the Q-A pair to the CSV file path

with open(csv_file, mode="w", newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Question", "Answer", "Manual Quote"])
    writer.writerows(qa_pairs)
    
print(f"Data saved successfully to csv file path")

Data saved successfully to csv file path
