In [1]:
from typing import List, Optional, Dict, Tuple

import sentence_transformers

import toolpy as tp
from toolpy.integrations import groq

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Groq-backed LLM interface for the Llama 3 70B model. n_retry=5 is forwarded to the
# interface (presumably the number of retries on a failed call — confirm in toolpy docs).
groq_interface = groq.GroqInterface(model=groq.GroqModel.LLAMA3_70B, n_retry=5)

# Register the interface under the name "llama3-70b" and flag it as the default model.
registry = tp.llm.LLMRegistry()
registry.registry(model_name="llama3-70b", interface=groq_interface, default=True)

In [3]:
# Inspect which model name the registry holds as default (private attribute, read for inspection only).
registry._default_model

'llama3-70b'

In [4]:
# Single RAG example reused by every cell that follows: the question, the answer being
# evaluated, and the context used to answer it. Every value is a string, including the count.
rag_sample = {'answer': 'sky and thunder god',
            'question': 'What is Zeus know for in Greek mythology?',
            'context': 'Palici: he Palici the sons of Zeus\nPalici: in Greek mythology\nZeus: Zeus (British English , North American English ; , Zeús ) is the sky and thunder god in ancient Greek religion\n',
            'context_sentence_count': '3'}

# Natural-language description of each key in `rag_sample`; it is handed to the adapter
# and the pipeline together with the sample itself.
rag_sample_description = {"answer":"answer to the question", 
                          "question":"question beeing answered", 
                          "context":"context used to answer the question", 
                          "context_sentence_count": "number of sentences in the context"}

In [5]:
class QuestionGenerator(tp.BasicTool):
    '''
    Tool that asks the LLM to generate questions for a given answer.

    The behaviour is defined entirely by the class-level prompt strings below,
    which are forwarded to the `tp.BasicTool` constructor.
    '''

    # System prompt: constrains the model to reply with a JSON object holding a "questions" list.
    _system_message = '''You are a question generator that outputs in JSON. 
The JSON object must use the schema: {'questions':['str', 'str', ...]}

Please use a valid JSON format.'''

    # User prompt template; the `{answer}` placeholder names the "answer" input key.
    _base_prompt = '''Generate questions for the given answer:

Answer: {answer}
'''

    # Short description of what the tool does.
    _description = "Generates questions from a given answer"

    # Description of the keys in the tool's result.
    _return_description = {"questions":"generated questions from the answer"}

    # Description of the keys the tool expects in its query.
    _input_description = {"answer" : "answer to generate questions from"}

    def __init__(self):
        '''
        QuestionGenerator constructor.

        Takes no arguments. Forwards the class-level description, input
        description, base prompt, return description and system message to
        the `tp.BasicTool` constructor, with JSON mode enabled.
        '''

        # The sixth positional argument is None — presumably the model name, which would
        # leave the choice to the registry default; confirm against tp.BasicTool.
        super().__init__(self._description, self._input_description, self._base_prompt, 
                         self._return_description, QuestionGenerator._system_message, None,
                         json_mode=True)

In [6]:
question_generator = QuestionGenerator()

# The whole sample dict is passed as the query; the call yields a (result, result description) pair.
questions, generation_return_description = question_generator(rag_sample)

In [7]:
# Display the generator result: a dict with a single "questions" list.
questions

{'questions': ['Which god is associated with the weather in Norse mythology?',
  'Who is the Norse god of the sky and thunder?',
  'What is the name of the Norse god often depicted with a hammer and controlling the weather?',
  'Which Norse god is known for his ability to control the weather and is often associated with thunderstorms?',
  'In Norse mythology, what is the name of the god who can control the weather and is often depicted carrying a hammer?']}

In [8]:
class QuestionComparator(tp.Tool):
    '''
    Compares one query question against a set of candidate questions.

    Both sides are embedded with a sentence-transformers model and the result
    is the mean cosine similarity between the query question and the candidates.
    '''

    _description = "Comparate questions, returning the similarity between a single query question and questions to compare."

    _return_description = {"score":"similarity score between the query question and the questions to compare"}

    _input_description = {"query_question" : "single question to compare", "questions":"questions to compare with query"}

    def __init__(self, embedder_model:str="all-MiniLM-L6-v2") -> None:
        '''
        QuestionComparator constructor.

        Args:
            embedder_model (str, optional): name of the sentence-transformers model
                used to generate embeddings. Defaults to "all-MiniLM-L6-v2".
        '''
        super().__init__(self._description, self._input_description)

        self._embedder = sentence_transformers.SentenceTransformer(embedder_model)

    def _execute(self, query: Dict[str, str | List[str]], context: Optional[str] = None) -> Tuple[Dict[str, str], Dict[str, str]]:
        '''
        Calculates the mean similarity between one question and a list of questions.

        Args:
            query (Dict[str, str | List[str]]): must contain key "query_question" with
                the question to compare, and key "questions" with the question(s)
                to compare it against (a list of strings, or a single string).
            context (Optional[str], optional): not used. Defaults to None.

        Raises:
            KeyError: if query lacks "query_question" or "questions".
            ValueError: if "questions" is empty.

        Returns:
            Tuple[Dict[str, str], Dict[str, str]]: the result, with a single key
                "score" holding the mean cosine similarity as a string, and the
                description of the result keys.
        '''
        query_question = query["query_question"]
        questions = query["questions"]

        # Fail early with a clear message instead of an obscure encoder or
        # division error further down.
        if not questions:
            raise ValueError("Query must have at least one question to compare (key 'questions' is empty).")

        q_embedding = self._embedder.encode(query_question, convert_to_tensor=True)
        qi_embeddings = self._embedder.encode(questions, convert_to_tensor=True)

        cosine_scores = sentence_transformers.util.cos_sim(q_embedding, qi_embeddings)

        # Average over the number of similarity scores actually produced. Using
        # len(qi_embeddings) is only correct for a list input: for a single string
        # the encoder returns a 1-D tensor whose length is the embedding dimension.
        score = cosine_scores.sum().item() / cosine_scores.numel()

        result = {"score":str(score)}

        return result, self._return_description
        


In [9]:
question_comparator = QuestionComparator()

# Compare the sample's original question with the questions generated from its answer.
comparator_query = {"query_question":rag_sample["question"], "questions":questions["questions"]}
comparator_result, _ = question_comparator(comparator_query)

comparator_result

{'score': '0.3964371204376221'}

In [10]:
class AnswerRelevanceEvaluator(tp.Tool):
    '''
    Evaluates the relevance of an answer to the question it responds to.

    Questions are generated from the answer and then compared with the
    original question; the comparator's score is returned as the relevance.
    '''

    _description = "Evaluates the relevance of a answer considering the question it answers."

    _return_description = {"score": "relevance of the answer considering the question"}

    # Both keys are read by `_execute`, so both are declared as inputs.
    _input_description = {"answer" : "answer to evaluate the relevance",
                          "question" : "question that the answer responds to"}

    def __init__(self, question_generator:Optional[QuestionGenerator]=None, question_comparator:Optional[QuestionComparator]=None) -> None:
        '''
        AnswerRelevanceEvaluator constructor

        Args:
            question_generator (QuestionGenerator, optional): generator to use, if None creates a new with default arguments. Defaults to None.
            question_comparator (QuestionComparator, optional): comparator to use, if None creates a new with default arguments. Defaults to None.
        '''
        super().__init__(self._description, self._input_description)

        if question_generator is None:
            question_generator = QuestionGenerator()
        if question_comparator is None:
            question_comparator = QuestionComparator()

        self._question_generator = question_generator
        self._question_comparator = question_comparator

    def _execute(self, query: Dict[str, str], context: Optional[str]=None) -> Tuple[Dict[str, str], Dict[str,str]]:
        '''
        Evaluates the relevance of an answer.

        Args:
            query (Dict[str, str]): must contain key "answer" with the answer to
                evaluate and key "question" with the question it responds to.
            context (str, optional): not used. Defaults to None.

        Raises:
            ValueError: if query does not contain the "answer" or "question" key.

        Returns:
            Tuple[Dict[str, str], Dict[str, str]]: the result, with key "score"
                holding the score, and the description of the result keys.
        '''

        if "answer" not in query:
            raise ValueError("Query must have the answer to evaluate (query not have key 'answer').")
        # Check the question before calling the generator, so a malformed query
        # does not spend an LLM call only to fail afterwards.
        if "question" not in query:
            raise ValueError("Query must have the question being answered (query not have key 'question').")

        # The full query is forwarded to the generator.
        generator_result, _ = self._question_generator(query)

        comparator_query = {"query_question":query["question"], "questions":generator_result["questions"]}
        comparator_result, _ = self._question_comparator(comparator_query)

        return comparator_result, self._return_description

In [11]:
# No arguments: the evaluator creates its own default generator and comparator.
answer_relevance_evaluator = AnswerRelevanceEvaluator()

In [12]:
# Evaluate the sample end to end; displays the (result, result description) pair.
answer_relevance_evaluator(rag_sample)

({'score': '0.5079158147176107'},
 {'score': 'relevance of the answer considering the question'})

# Adapter

In [13]:
# Adapter used below to build the next tool's input from a previous tool's output.
adapter = tp.tool.Adapter()

In [14]:
# Build the comparator's input from the generator's output plus the original sample.
# Each data dict is passed together with the description of its keys; the call returns
# the query and context to hand to the next tool.
next_query, next_context = adapter.map_tools(previous_tool=question_generator, 
                                             previous_output=questions, 
                                             previous_output_description=generation_return_description, 
                                             next_tool=question_comparator, 
                                             additional_inputs=rag_sample, 
                                             additional_inputs_description=rag_sample_description)

In [15]:
# Run the comparator on the adapter-built query and context; the result description is discarded.
comparator_result, _ = question_comparator(next_query, next_context)

In [16]:
# Display the score obtained through the adapter-built query.
comparator_result

{'score': '0.3964371204376221'}

## Pipeline

In [17]:
# Chain generator -> comparator into one pipeline. global_memory=True is passed through
# (presumably keeps the initial inputs available to later tools — confirm in toolpy docs).
pipeline = tp.Pipeline([question_generator, question_comparator], global_memory=True)

In [18]:
# Run the pipeline on the sample and its key descriptions; yields the final result and its description.
result, result_description = pipeline(rag_sample, rag_sample_description)

In [19]:
# Display the pipeline's final score. NOTE(review): the value need not match earlier runs —
# presumably the pipeline invokes the LLM again and gets a different set of questions; confirm.
result

{'score': '0.45767029126485187'}