# Aula 10_11 - Implementação do RAGAS

- Implementar o RAGAS com o LLaMA-3 70B para avaliar a qualidade das 50 anotações do IIRC usadas no exercício passado.
- O RAGAS utiliza as chaves `context`, `question` e `answer`, todas disponíveis no conjunto de teste do IIRC.

Opcional:
- Avaliar as respostas do exercício da aula 9_10
- Usar multi agents

In [1]:
import json, os
import sentence_transformers
import numpy as np

from groq import Groq

from functools import reduce
from dotenv import load_dotenv
from tqdm import tqdm
from numpy import dot
from numpy.linalg import norm

# Original RAGAs prompts
from ragas.metrics._faithfulness import LONG_FORM_ANSWER_PROMPT, NLI_STATEMENTS_MESSAGE
from ragas.metrics._answer_relevance import QUESTION_GEN
from ragas.metrics._context_relevancy import CONTEXT_RELEVANCE

load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

## Preparação dos dados

In [2]:
# Load the test questions from disk. The context manager closes the handle as
# soon as parsing finishes (the original left it open for the kernel's lifetime),
# and the explicit encoding avoids depending on the platform's default codec.
with open("./test_questions.json", "r", encoding="utf-8") as test_questions_file:
    test_questions = json.load(test_questions_file)

In [3]:
# Module-level dict created by this cell; nothing else in the visible notebook reads it.
full_data = {}

def process_answer(answer):
    """Convert one answer record into a plain string.

    Args:
        answer: dict with a "type" key. Depending on the type the function reads
            "answer_spans" (type "span"), "answer_value" and "answer_unit"
            (type "value"), or "answer_value" (type "binary").

    Returns:
        The text of the first span, "<value> <unit>", or the raw answer value.
        For any other type a message is printed and None is returned.
    """
    kind = answer["type"]

    if kind == "span":
        # Only the first annotated span is used.
        return answer["answer_spans"][0]["text"]

    if kind == "value":
        return answer["answer_value"] + " " + answer["answer_unit"]

    if kind == "binary":
        return answer["answer_value"]

    print("Unsupported type", kind)
    return None

def process_context(context):
    """Concatenate the "text" of every context entry, each followed by '. '.

    Args:
        context: iterable of dicts that each carry a "text" key.

    Returns:
        A single string; empty when `context` has no entries.
    """
    return "".join(passage["text"] + '. ' for passage in context)

# Flatten every raw record into the question/answer/context dict consumed by the
# metric classes; n_context keeps the number of context entries before merging.
questions_list = []
for test_question in test_questions:
    passages = test_question["context"]
    questions_list.append(
        dict(
            question=test_question["question"],
            answer=process_answer(test_question["answer"]),
            context=process_context(passages),
            n_context=len(passages),
        )
    )

questions_list[0]

{'question': 'What is Zeus know for in Greek mythology?',
 'answer': 'sky and thunder god',
 'context': 'he Palici the sons of Zeus. in Greek mythology. Zeus (British English , North American English ; , Zeús ) is the sky and thunder god in ancient Greek religion. ',
 'n_context': 3}

## Groq API

In [4]:
class GroqAPI():
    """Small wrapper around the Groq chat-completions client.

    Args:
        prompt: optional prompt object. When given, its `instruction`,
            `examples`, `input_keys` and `output_key` attributes are turned into
            system messages (the notebook passes the imported ragas prompts).
        json_format: when truthy, the request asks for a JSON object response.
        temperature: sampling temperature forwarded to the API.
    """

    def __init__(self, prompt=None, json_format=False, temperature=0):
        self.prompt = prompt
        self.json_format = json_format
        self.temperature = temperature
        self.model = "llama3-70b-8192"

        # The API key is read from the GROQ_API_KEY1 environment variable.
        self.client = Groq(api_key=os.environ.get("GROQ_API_KEY1"))

    def get_answer(self, prompt : str):
        """Send `prompt` as the user message and return the model's reply text.

        If a prompt object was configured, system messages are sent first, in
        this order: the instruction, the first example (only when examples
        exist), and a description of the expected input/output keys.
        """
        system_texts = []
        if self.prompt:
            system_texts.append(self.prompt.instruction)

            if self.prompt.examples:
                system_texts.append(
                    f"You MUST output in JSON exactly like this example:\n{self.prompt.examples[0]}\n"
                )

            system_texts.append(
                f"You will receive a JSON with the following keys: {self.prompt.input_keys}. You must include the received keys and this additional key {self.prompt.output_key} in the output, just like the given example."
            )

        messages = [{"role": "system", "content": text} for text in system_texts]
        messages.append({"role": "user", "content": prompt})

        # None leaves the response format unconstrained.
        response_format = {"type": "json_object"} if self.json_format else None

        completion = self.client.chat.completions.create(
            messages=messages,
            model=self.model,
            temperature=self.temperature,
            response_format=response_format,
        )

        return completion.choices[0].message.content
    
# Smoke test: with no prompt object configured, only the user message is sent.
api = GroqAPI()
api.get_answer("Opa, beleza?")

'Você está falando português!\n\n"Opa" é uma expressão comum em português, especialmente no Brasil, que pode ser usada para expressar surpresa, entusiasmo ou alegria. Já "beleza" significa "beleza" ou "lindo".\n\nEntão, se você está perguntando "Opa, beleza?", você está perguntando se tudo está bem ou se algo é lindo.'

## RAGAs

### Faithfulness

In [5]:
class Faithfulness():
    """Faithfulness metric: share of answer statements supported by the context.

    Two agents are used: one breaks the answer into simpler statements
    (LONG_FORM_ANSWER_PROMPT), the other judges each statement against the
    context (NLI_STATEMENTS_MESSAGE).

    Args:
        json: forwarded to both agents as `json_format`.
    """

    def __init__(self, json=True) -> None:
        # `json` shadows the json module only inside this constructor.
        self.statement_agent = GroqAPI(
            prompt=LONG_FORM_ANSWER_PROMPT,
            json_format=json
        )

        self.verdict_agent = GroqAPI(
            prompt=NLI_STATEMENTS_MESSAGE,
            json_format=json
        )

    def _sentence_block(self, answer, newline):
        """Number the '.'-separated pieces of `answer`, one per line.

        `newline` is the line-break token to emit: a real newline when the
        result will be JSON-encoded later, or a literal backslash-n when it is
        pasted into an already-quoted string.
        """
        indent = newline + "        "
        numbered = "".join(
            f"{idx}:{sentence}{indent}"
            for idx, sentence in enumerate(answer.split("."))
        )
        return indent + numbered

    def make_sentences(self, answer):
        """Return the numbered sentence block with literal backslash-n separators."""
        return self._sentence_block(answer, "\\n")

    def get_statements(self, question, answer, simpler_format=True, debug=False):
        """Ask the statement agent to split `answer` into simpler statements.

        Args:
            question: question text.
            answer: answer text to decompose.
            simpler_format: when True, return a flat list of all
                "simpler_statements"; otherwise return the parsed reply as-is.
            debug: print the prompt and the result.

        Returns:
            A list of statements (possibly empty) or the raw parsed reply.

        Raises:
            json.JSONDecodeError: if the agent's reply is not valid JSON.
            KeyError: if the reply lacks "analysis" / "simpler_statements".
        """
        # json.dumps escapes quotes, backslashes and control characters, so the
        # prompt stays valid JSON for any text. For text without such characters
        # the output is identical to the previous hand-written template.
        prompt = json.dumps(
            {
                "question": str(question),
                "answer": str(answer),
                "sentences": self._sentence_block(answer, "\n"),
            },
            indent=2,
            ensure_ascii=False,
        ) + "\n"

        if debug:
            print(f"Getting simpler sentences from the answer and question:\n{prompt}")

        agent_answer = json.loads(self.statement_agent.get_answer(prompt))
        if simpler_format:
            # Flatten without reduce(): an empty "analysis" now yields [] instead
            # of raising "reduce() of empty iterable with no initial value".
            agent_answer = [
                statement
                for analysis in agent_answer["analysis"]
                for statement in analysis["simpler_statements"]
            ]

        if debug:
            print(f"Simpler sentences: {json.dumps(agent_answer, indent=2)}")

        return agent_answer

    def get_verdicts(self, statements, context, debug=False):
        """Ask the verdict agent to judge every statement against `context`.

        Args:
            statements: iterable of statement strings.
            context: context text.
            debug: print the prompt and the result.

        Returns:
            The parsed JSON reply of the verdict agent.
        """
        # The previous template joined statements without commas, which produced
        # an invalid JSON array for more than one statement.
        prompt = json.dumps(
            {
                "context": str(context),
                "statements": list(statements),
            },
            indent=2,
            ensure_ascii=False,
        ) + "\n"

        if debug:
            print(f"We're comparing statements (generated from the original answer) to the context\nBy doing so we can see if a given part of the answer (statement) was actually given from the context\n{prompt}")

        verdict = json.loads(self.verdict_agent.get_answer(prompt))

        if debug:
            print(f"Verdicts: {json.dumps(verdict, indent=2)}")

        return verdict

    def calculate_faithfulness(self, sample, debug=False):
        """Return supported statements / judged statements for one sample.

        Args:
            sample: dict with "question", "answer" and "context".
            debug: forwarded to the two agent calls.

        Raises:
            ValueError: if no statements were extracted or no verdicts returned
                (previously surfaced as TypeError / ZeroDivisionError).
        """
        statements = self.get_statements(sample["question"], sample["answer"], simpler_format=True, debug=debug)
        if not statements:
            # Nothing to judge; fail early instead of spending another API call.
            raise ValueError("No statements could be extracted from the answer")

        verdicts = self.get_verdicts(statements, sample["context"], debug=debug)["answer"]
        if not verdicts:
            raise ValueError("The verdict agent returned no verdicts")

        # int() also accepts verdicts that come back as "0"/"1" strings.
        supported = sum(int(verdict["verdict"]) for verdict in verdicts)
        return supported / len(verdicts)


### Answer Relevance

In [6]:
class AnswerRelevance():
    """Answer relevance: similarity between the real question and questions
    generated back from the answer.

    Args:
        json: forwarded to the question-generation agent as `json_format`.
    """

    def __init__(self, json=True):
        # `json` shadows the json module only inside this constructor.
        self.question_generator_agent = GroqAPI(
            prompt=QUESTION_GEN,
            json_format=json,
            temperature=0
        )

        self.embedding_model = sentence_transformers.SentenceTransformer('paraphrase-distilroberta-base-v1')

    def generate_questions(self, answer, context, n=3):
        """Query the agent `n` times and return the parsed JSON replies.

        Args:
            answer: answer text.
            context: context text.
            n: number of agent calls.

        Returns:
            A list with `n` parsed replies.
        """
        # json.dumps escapes quotes/backslashes/control characters, so the
        # prompt stays valid JSON for any text. For text without such characters
        # it is identical to the previous hand-written template.
        prompt = json.dumps(
            {
                "answer": str(answer),
                "context": str(context),
            },
            indent=2,
            ensure_ascii=False,
        ) + "\n"

        return [
            json.loads(self.question_generator_agent.get_answer(prompt))
            for _ in range(n)
        ]

    def calculate_embedding(self, text):
        """Embed `text` with the sentence-transformers model."""
        return self.embedding_model.encode(text, convert_to_tensor=True)

    def calculate_relevance(self, sample):
        """Return the mean cosine similarity between the sample's question and
        the generated question(s).

        Args:
            sample: dict with "question", "answer" and "context".

        Only the generated question text (reply["output"]["question"]) is used;
        any other field in the agent's reply is ignored.
        """
        actual_question_emb = self.calculate_embedding(sample["question"])

        # A single generation is requested per sample.
        generated_questions = self.generate_questions(sample["answer"], sample["context"], n=1)
        generated_questions_emb = [
            self.calculate_embedding(reply["output"]["question"])
            for reply in generated_questions
        ]

        def cosine_sim(a, b):
            # NOTE(review): numpy's dot/norm are applied directly to whatever
            # encode(convert_to_tensor=True) returns — presumably this relies
            # on the embeddings living on the CPU; confirm.
            return dot(a, b)/(norm(a)*norm(b))

        similarities = [cosine_sim(emb, actual_question_emb) for emb in generated_questions_emb]
        return sum(similarities) / len(similarities)


# Spot check on a single sample (index 9) before the full evaluation further down.
answer_relevance = AnswerRelevance(json=True)
answer_relevance.calculate_relevance(questions_list[9])



0.35710054636001587

### Context Relevance

In [7]:
class ContextRelevance():
    """Context relevance: extracted relevant sentences / number of context entries.

    Args:
        json: forwarded to the agent as `json_format`.
    """

    def __init__(self, json=True):
        # `json` shadows the json module only inside this constructor.
        self.context_relevancy_agent = GroqAPI(
            prompt=CONTEXT_RELEVANCE,
            json_format=json,
            temperature=0
        )

    def get_relevant_sentences(self, question, context):
        """Ask the agent which context sentences help answer `question`.

        Returns:
            The parsed JSON reply of the agent.
        """
        # json.dumps escapes quotes/backslashes/control characters, so the
        # prompt stays valid JSON for any text. For text without such characters
        # it is identical to the previous hand-written template.
        prompt = json.dumps(
            {
                "question": str(question),
                "context": str(context),
            },
            indent=2,
            ensure_ascii=False,
        ) + "\n"
        return json.loads(self.context_relevancy_agent.get_answer(prompt))

    def calculate_relevance(self, sample):
        """Return the relevance score for one sample, in the range [0, 1].

        Args:
            sample: dict with "question", "context" and "n_context".

        Returns:
            0 when the sample has no context entries or the agent replied with a
            plain string; otherwise len(sentences) / n_context, capped at 1.
        """
        n_context = sample["n_context"]
        if n_context == 0:
            # No context at all: nothing can be relevant (and avoids dividing by zero).
            return 0

        extracted_sentences = self.get_relevant_sentences(sample["question"], sample["context"])["candidate sentences"]
        if isinstance(extracted_sentences, str): # Insufficient information
            # NOTE(review): every string reply is scored 0, including one that
            # might carry real sentences in a single string — confirm intended.
            return 0

        # The agent may return more sentences than there are context entries;
        # cap the ratio so the score remains a proportion.
        return min(len(extracted_sentences) / n_context, 1.0)

# Spot check on a single sample (index 3) before the full evaluation further down.
ctx = ContextRelevance()
ctx.calculate_relevance(questions_list[3])

0

## Avaliação

### Faithfulness

In [8]:
def eval_faithfulness(json):
    """Compute faithfulness for every entry of `questions_list`.

    Args:
        json: forwarded to `Faithfulness(json=...)`.

    Returns:
        (faithfulness_list, errors): a list of {"idx", "faithfulness"} dicts for
        the samples that were scored, and the indices of the samples that raised.
    """
    faithfulness_agent = Faithfulness(json=json)

    faithfulness_list = []
    errors = []

    # enumerate() has no length, so pass total explicitly to get a real progress bar.
    for idx, sample in tqdm(enumerate(questions_list), total=len(questions_list)):
        try:
            score = faithfulness_agent.calculate_faithfulness(sample, debug=False)
        except Exception:
            # Best-effort: record the failing sample and keep going. Unlike a
            # bare `except:`, this lets KeyboardInterrupt/SystemExit stop the run.
            errors.append(idx)
        else:
            faithfulness_list.append({"idx": idx, "faithfulness": score})

    return faithfulness_list, errors

def print_faithfulness_infos(faithfulness_list, errors):
    """Print "mean +- std" of the faithfulness scores and the error count.

    Args:
        faithfulness_list: list of dicts carrying a "faithfulness" value.
        errors: collection of failed sample indices (only its length is used).
    """
    scores = np.array([entry["faithfulness"] for entry in faithfulness_list])
    mean_score = scores.mean()
    std_score = scores.std()
    print(mean_score, "+-", std_score)
    error_count = len(errors)
    print("#Errors:", error_count)

In [9]:
# Full faithfulness run with json=True, followed by the aggregate statistics.
faithfulness_list_with_json, errors_with_json = eval_faithfulness(True)

print("Result of faithfulness evaluation with JSON casting")
print_faithfulness_infos(faithfulness_list_with_json, errors_with_json)

50it [18:05, 21.72s/it]

Result of faithfulness evaluation with JSON casting
0.7948717948717948 +- 0.38759529379053775
#Errors: 11





In [10]:
# Same faithfulness run with json=False, for comparison with the previous cell.
faithfulness_list_without_json, errors_without_json = eval_faithfulness(False)

print("Result of faithfulness evaluation without JSON casting")
print_faithfulness_infos(faithfulness_list_without_json, errors_without_json)

50it [19:26, 23.33s/it]

Result of faithfulness evaluation without JSON casting
0.8333333333333334 +- 0.3639761427327792
#Errors: 11





### Answer Relevance

In [11]:
def eval_answer_relevance(json):
    """Compute answer relevance for every entry of `questions_list`.

    Args:
        json: forwarded to `AnswerRelevance(json=...)`.

    Returns:
        (answer_relevances, answer_relevance_errors): a list of
        {"idx", "relevance"} dicts for the samples that were scored, and the
        indices of the samples that raised.
    """
    answer_relevances = []
    answer_relevance_errors = []
    answer_relevance = AnswerRelevance(json=json)

    # enumerate() has no length, so pass total explicitly to get a real progress bar.
    for idx, sample in tqdm(enumerate(questions_list), total=len(questions_list)):
        try:
            relevance = answer_relevance.calculate_relevance(sample)
        except Exception:
            # Best-effort: record the failing sample and keep going. Unlike a
            # bare `except:`, this lets KeyboardInterrupt/SystemExit stop the run.
            answer_relevance_errors.append(idx)
        else:
            answer_relevances.append({"idx": idx, "relevance": relevance})

    return answer_relevances, answer_relevance_errors

def print_answer_relevance_infos(answer_relevances, answer_relevance_errors):
    """Print "mean +- std" of the answer-relevance scores and the error count.

    Args:
        answer_relevances: list of dicts carrying a "relevance" value.
        answer_relevance_errors: collection of failed sample indices (only its
            length is used).
    """
    scores = np.array([entry["relevance"] for entry in answer_relevances])
    mean_score = scores.mean()
    std_score = scores.std()
    print(mean_score, "+-", std_score)
    error_count = len(answer_relevance_errors)
    print("#Errors:", error_count)

In [12]:

# Full answer-relevance run with json=True, followed by the aggregate statistics.
answer_relevances_with_json, answer_relevance_errors_with_json = eval_answer_relevance(True)

print("Evaluating Answer Relevance With JSON")
print_answer_relevance_infos(answer_relevances_with_json, answer_relevance_errors_with_json)

50it [06:58,  8.36s/it]

Evaluating Answer Relevance With JSON
0.5111142351876857 +- 0.1648589007466922
#Errors: 7





In [13]:
# Same answer-relevance run with json=False, for comparison with the previous cell.
answer_relevances_without_json, answer_relevance_errors_without_json = eval_answer_relevance(False)

print("Evaluating Answer Relevance Without JSON")
print_answer_relevance_infos(answer_relevances_without_json, answer_relevance_errors_without_json)

50it [07:11,  8.62s/it]

Evaluating Answer Relevance Without JSON
0.4712211094223536 +- 0.16171204232238717
#Errors: 24





### Context Relevance

In [14]:
def eval_context_relevance(json):
    """Compute context relevance for every entry of `questions_list`.

    Args:
        json: forwarded to `ContextRelevance(json=...)`.

    Returns:
        (context_relevances, context_relevance_errors): a list of
        {"idx", "relevance"} dicts for the samples that were scored, and the
        indices of the samples that raised.
    """
    context_relevances = []
    context_relevance_errors = []
    ctx = ContextRelevance(json=json)

    # enumerate() has no length, so pass total explicitly to get a real progress bar.
    for idx, sample in tqdm(enumerate(questions_list), total=len(questions_list)):
        try:
            relevance = ctx.calculate_relevance(sample)
        except Exception:
            # Best-effort: record the failing sample and keep going. Unlike a
            # bare `except:`, this lets KeyboardInterrupt/SystemExit stop the run.
            context_relevance_errors.append(idx)
        else:
            context_relevances.append({"idx": idx, "relevance": relevance})

    return context_relevances, context_relevance_errors

def print_context_relevance_infos(context_relevances, context_relevance_errors):
    """Print "mean +- std" of the context-relevance scores and the error count.

    Args:
        context_relevances: list of dicts carrying a "relevance" value.
        context_relevance_errors: collection of failed sample indices (only its
            length is used).
    """
    scores = np.array([entry["relevance"] for entry in context_relevances])
    mean_score = scores.mean()
    std_score = scores.std()
    print(mean_score, "+-", std_score)
    error_count = len(context_relevance_errors)
    print("#Errors:", error_count)

In [15]:

# Full context-relevance run with json=True, followed by the aggregate statistics.
context_relevances_with_json, context_relevance_errors_with_json = eval_context_relevance(True)

print("Evaluating Context Relevance With JSON")
print_context_relevance_infos(context_relevances_with_json, context_relevance_errors_with_json)

50it [05:53,  7.06s/it]

Evaluating Context Relevance With JSON
0.5755555555555555 +- 0.41321682945899985
#Errors: 5





In [16]:

# Same context-relevance run with json=False, for comparison with the previous cell.
context_relevances_without_json, context_relevance_errors_without_json = eval_context_relevance(False)

print("Evaluating Context Relevance Without JSON")
print_context_relevance_infos(context_relevances_without_json, context_relevance_errors_without_json)

50it [06:05,  7.32s/it]

Evaluating Context Relevance Without JSON
0.5100775193798449 +- 0.22259740877612777
#Errors: 7



