# Avaliação automatizada de LLMs utilizando DeepEval

Atenção: 

Executar os comandos abaixo no prompt antes de iniciar os testes:

```bash
.venv\Scripts\activate
deepeval set-local-model \
         --model-name=llama3.2:latest \
         --base-url="http://localhost:11434/v1/" \
         --api-key="ollama"
```

ref.: https://docs.confident-ai.com/docs/metrics-llm-evals

deepeval set-local-model --model-name=phi3:medium --base-url="http://localhost:11434/v1/" --api-key="ollama"

In [None]:
from deepeval.test_case import LLMTestCase
from deepeval.test_case import LLMTestCaseParams
from deepeval.metrics import GEval

## Corretude

In [None]:
# G-Eval metric that scores the factual correctness of the LLM output against
# the expected (reference) answer.
#
# The prompt wording uses "actual output" / "expected output" so that it matches
# the test-case fields selected in `evaluation_params` (ACTUAL_OUTPUT,
# EXPECTED_OUTPUT) instead of a differently named "current output".
metric = GEval(
    name="Correctness",  # Metric name
    # High-level goal of the metric.
    criteria="Determine if the actual output is factually correct based on the expected output.",
    evaluation_steps=[
        # Look for contradictions between the generated answer and the reference.
        "Check if any facts in the 'actual output' contradict any facts in the 'expected output'.",
        # Leaving out important details should cost a lot of score.
        "Heavily penalize the omission of important details.",
        # Vague wording or contradictory opinions count against the answer.
        "Vague language or contradictory opinions are not acceptable.",
    ],
    # Test-case fields used in the evaluation: input, actual output and expected output.
    evaluation_params=[
        LLMTestCaseParams.INPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT,
        LLMTestCaseParams.EXPECTED_OUTPUT,
    ],
)

In [None]:
# Test case bundling the prompt, the generated answer and the reference answer;
# here the generated answer is word-for-word the same as the reference.
test_case = LLMTestCase(
    # Prompt given to the model.
    input="Who was the first president of Brazil?",
    # Answer produced by the model.
    actual_output="The first president of Brazil was Deodoro da Fonseca.",
    # Reference (correct) answer.
    expected_output="The first president of Brazil was Deodoro da Fonseca.",
)

# Run the metric on the test case.
metric.measure(test_case)

# Print the computed score and, on the next line, the reason given for it.
print(f"Score: {metric.score}", f"Reason: {metric.reason}", sep="\n")

In [None]:
# Same question as before, but the generated answer is a shorter sentence that
# names the same person as the reference answer.
test_case = LLMTestCase(
    # Prompt given to the model.
    input="Who was the first president of Brazil?",
    # Answer produced by the model.
    actual_output="It was Deodoro da Fonseca.",
    # Reference (correct) answer.
    expected_output="The first president of Brazil was Deodoro da Fonseca.",
)

# Run the metric on the test case.
metric.measure(test_case)

# Print the computed score and, on the next line, the reason given for it.
print(f"Score: {metric.score}", f"Reason: {metric.reason}", sep="\n")

In [None]:
# Same question again, this time with a generated answer that names a different
# person than the reference answer.
test_case = LLMTestCase(
    # Prompt given to the model.
    input="Who was the first president of Brazil?",
    # Answer produced by the model.
    actual_output="It was the singer Roberto Carlos da Fonseca!",
    # Reference (correct) answer.
    expected_output="The first president of Brazil was Deodoro da Fonseca.",
)

# Run the metric on the test case.
metric.measure(test_case)

# Print the computed score and, on the next line, the reason given for it.
print(f"Score: {metric.score}", f"Reason: {metric.reason}", sep="\n")


## Coerência

In [None]:
# G-Eval metric that scores how coherent the LLM output is: logical consistency
# and a clear sequence of ideas.
#
# The criteria says "actual output" so that it matches the ACTUAL_OUTPUT
# test-case field selected in `evaluation_params`.
metric = GEval(
    name="Coherence",  # Metric name
    # High-level goal of the metric.
    criteria="Evaluate if the actual output is logically consistent and maintains a clear sequence of ideas.",
    evaluation_steps=[
        # The ideas should follow one another logically.
        "Ensure the response maintains a logical progression of thoughts.",
        # Contradictions or breaks in the flow should be pointed out.
        "Identify any contradictions or deviations in the flow of information.",
        # The answer as a whole should be understandable and cohesive.
        "Consider if the response is understandable and follows a cohesive structure.",
    ],
    # Test-case fields used in the evaluation: input and actual output
    # (no expected output is used by this metric).
    evaluation_params=[
        LLMTestCaseParams.INPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT,
    ],
)


In [None]:
# Test case with a multi-sentence explanation of photosynthesis as the
# generated answer; no expected output is supplied.
test_case = LLMTestCase(
    # Prompt given to the model.
    input="Explain the process of photosynthesis.",
    # Answer produced by the model (adjacent literals are joined into one string).
    actual_output="Photosynthesis is the process by which plants convert sunlight into chemical energy. "
    "First, plants absorb water through their roots and carbon dioxide through their leaves. "
    "Sunlight is captured by chlorophyll in the chloroplasts, initiating a reaction that produces glucose and oxygen. "
    "This process is essential for the production of oxygen and is a crucial part of the Earth’s ecosystem.",
)

# Run the metric on the test case.
metric.measure(test_case)

# Print the computed score and, on the next line, the reason given for it.
print(f"Score: {metric.score}", f"Reason: {metric.reason}", sep="\n")

In [None]:
# Test case whose generated answer strings together statements that do not fit
# with each other (sunlight vs. moonlight, animals capturing sunlight); no
# expected output is supplied.
test_case = LLMTestCase(
    # Prompt given to the model.
    input="Explain the process of photosynthesis.",
    # Answer produced by the model (adjacent literals are joined into one string).
    actual_output="Photosynthesis is the process where plants absorb sunlight and produce sugar. "
    "In this process, animals also play a role in capturing sunlight. "
    "This helps plants release water into the atmosphere. "
    "Overall, photosynthesis is about how plants make use of moonlight to grow faster.",
)

# Run the metric on the test case.
metric.measure(test_case)

# Print the computed score and, on the next line, the reason given for it.
print(f"Score: {metric.score}", f"Reason: {metric.reason}", sep="\n")

## Relevância

In [None]:
# G-Eval metric that scores how relevant the LLM output is to the question asked.
#
# The evaluation steps refer to "the question given in the input" rather than to
# one specific question, so the same metric can be applied to any test case; the
# question itself reaches the evaluator through the INPUT entry in
# `evaluation_params`. The criteria says "actual output" to match the
# ACTUAL_OUTPUT test-case field.
metric = GEval(
    name="Relevance",  # Metric name
    # High-level goal of the metric.
    criteria="Evaluate if the actual output is directly related to the main subject of the question or context provided.",
    evaluation_steps=[
        # The answer must address the question from the test-case input.
        "Ensure the response directly addresses the main subject of the question given in the input and provides a clear answer to it.",
        # Off-topic content should be pointed out.
        "Identify if any extraneous or unrelated information is included.",
        # Background information alone, without a direct answer, is penalized.
        "Penalize responses that provide background information without answering the question directly.",
    ],
    # Test-case fields used in the evaluation: input and actual output.
    evaluation_params=[
        LLMTestCaseParams.INPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT,
    ],
)


In [None]:
# Test case whose generated answer responds to the question in one sentence.
test_case = LLMTestCase(
    # Prompt given to the model.
    input="What is the capital of France?",
    # Answer produced by the model.
    actual_output="The capital of France is Paris.",
)

# Run the metric on the test case.
metric.measure(test_case)

# Print the computed score and, on the next line, the reason given for it.
print(f"Score: {metric.score}", f"Reason: {metric.reason}", sep="\n")

In [None]:
# Test case whose generated answer talks about Buenos Aires instead of naming
# the capital of France.
test_case = LLMTestCase(
    # Prompt given to the model.
    input="What is the capital of France?",
    # Answer produced by the model (adjacent literals are joined into one string).
    actual_output=(
        "Buenos Aires is known for its rich culture and famous landmarks... "
        "and some say it looks a lot like France"
    ),
)

# Run the metric on the test case.
metric.measure(test_case)

# Print the computed score and, on the next line, the reason given for it.
print(f"Score: {metric.score}", f"Reason: {metric.reason}", sep="\n")

## Fluência

In [None]:
# G-Eval metric that scores the fluency of the LLM output: grammar, sentence
# structure and natural-sounding language.
#
# The criteria says "actual output" so that it matches the ACTUAL_OUTPUT
# test-case field, which is the only field selected in `evaluation_params`.
metric = GEval(
    name="Fluency",  # Metric name
    # High-level goal of the metric.
    criteria="Evaluate if the actual output is grammatically correct, well-structured, and uses natural language.",
    evaluation_steps=[
        # Check for grammatical errors and name them when present.
        "Check if the response is free of grammatical errors; if errors exist, identify them.",
        # Sentence structure must be clear; structural problems should be specified.
        "Ensure the sentence structure is clear and well-formed; specify any issues with structure.",
        # Point out awkward or unnatural phrasing.
        "Assess if the language sounds natural and fluent; indicate any awkward phrasing or unnatural language.",
    ],
    # Only the actual output is used in the evaluation.
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
)



In [None]:
# Test case with a three-sentence generated answer about the purpose of
# photosynthesis.
test_case = LLMTestCase(
    # Prompt given to the model.
    input="Explain the purpose of photosynthesis in plants.",
    # Answer produced by the model (adjacent literals are joined into one string).
    actual_output=(
        "Photosynthesis allows plants to convert sunlight into energy. "
        "Through this process, plants produce oxygen and glucose, which they use for growth and energy storage. "
        "This process is essential for plant survival and supports life on Earth by producing oxygen."
    ),
)

# Run the metric on the test case.
metric.measure(test_case)

# Print the computed score and, on the next line, the reason given for it.
print(f"Score: {metric.score}", f"Reason: {metric.reason}", sep="\n")

In [None]:
# Test case whose generated answer contains grammatical errors
# (e.g. "Photosynthesis are", "Plant make").
test_case = LLMTestCase(
    # Prompt given to the model.
    input="Explain the purpose of photosynthesis in plants.",
    # Answer produced by the model (adjacent literals are joined into one string).
    actual_output=(
        "Photosynthesis are important it lets plant make food from sun. "
        "Plant make oxygen, food by sunlight and water. "
        "That very important for Earth."
    ),
)

# Run the metric on the test case.
metric.measure(test_case)

# Print the computed score and, on the next line, the reason given for it.
print(f"Score: {metric.score}", f"Reason: {metric.reason}", sep="\n")