# DeepEval Framework Testing

### Date: 23 August, 2024
### Currently on pause
### Goal: Test the DeepEval framework for evaluating LLM responses

In [1]:
# DeepEval imports
from deepeval.test_case import LLMTestCase
from deepeval.models import DeepEvalBaseLLM
from deepeval.metrics import HallucinationMetric
from deepeval.integrations.llama_index import DeepEvalAnswerRelevancyEvaluator

In [2]:
#custom LLM setup
import torch
import json
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
from pydantic import BaseModel
from lmformatenforcer import JsonSchemaParser
from lmformatenforcer.integrations.transformers import (
    build_transformers_prefix_allowed_tokens_fn,
)



In [3]:
class CustomLlama3_70B(DeepEvalBaseLLM):
    """DeepEval custom model wrapping a local Llama 3.1 70B Instruct checkpoint.

    The model and tokenizer are loaded once in ``__init__``. ``generate``
    runs a transformers text-generation pipeline whose decoding is confined
    to the JSON schema of the requested pydantic model (via lm-format-enforcer)
    and returns a validated instance of that model.
    """

    # Hugging Face Hub id shared by the model and tokenizer loaders.
    MODEL_ID = "meta-llama/Meta-Llama-3.1-70B-Instruct"

    def __init__(self):
        """Load the model (bfloat16, ``device_map="auto"``) and its tokenizer."""
        self.model_kwargs = {"torch_dtype": torch.bfloat16}

        # Load the model with the appropriate device map
        self.model = AutoModelForCausalLM.from_pretrained(
            self.MODEL_ID,
            device_map="auto",
            **self.model_kwargs
        )

        # Load the tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.MODEL_ID,
        )

        # Built lazily on the first generate() call and reused afterwards.
        self._pipeline = None

    def load_model(self):
        """Return the already-loaded causal LM."""
        return self.model

    def _get_pipeline(self):
        """Return the text-generation pipeline, building it on first use.

        The pipeline arguments never change between calls, so it is created
        once instead of on every ``generate`` invocation.
        """
        if getattr(self, "_pipeline", None) is None:
            self._pipeline = transformers.pipeline(
                "text-generation",
                model=self.load_model(),
                tokenizer=self.tokenizer,
                use_cache=True,
                device_map="auto",
                max_length=2500,
                truncation=True,
                do_sample=True,
                top_k=5,
                num_return_sequences=1,
                eos_token_id=self.tokenizer.eos_token_id,
                pad_token_id=self.tokenizer.eos_token_id,
            )
        return self._pipeline

    def generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        """Generate a schema-conforming answer for ``prompt``.

        Args:
            prompt: Text passed to the pipeline as-is.
            schema: The pydantic model *class* describing the expected JSON
                object; it is used both to constrain decoding and to build
                the return value.

        Returns:
            An instance of ``schema`` populated from the generated JSON.

        Raises:
            ValueError: If the generated text cannot be parsed as JSON even
                after clean-up. The original decode error is chained.
        """
        pipeline = self._get_pipeline()

        # pydantic v2 deprecates .schema() in favour of .model_json_schema();
        # prefer the new name when it exists, keep the old one as a fallback.
        if hasattr(schema, "model_json_schema"):
            json_schema = schema.model_json_schema()
        else:
            json_schema = schema.schema()

        # Create parser required for JSON confinement using lmformatenforcer
        parser = JsonSchemaParser(json_schema)
        prefix_function = build_transformers_prefix_allowed_tokens_fn(
            pipeline.tokenizer, parser
        )

        # Generate output; the pipeline's generated_text starts with the
        # prompt, so slice it off to keep only the newly generated part.
        output_dict = pipeline(prompt, prefix_allowed_tokens_fn=prefix_function)
        output = output_dict[0]["generated_text"][len(prompt):]

        # Clean up the output to ensure valid JSON
        cleaned_output = self._clean_json_output(output)

        # Try to load the JSON
        try:
            json_result = json.loads(cleaned_output)
        except json.JSONDecodeError as e:
            raise ValueError(
                f"Failed to parse JSON: {e}.\nGenerated output: {cleaned_output}"
            ) from e

        # Return valid JSON object according to the schema DeepEval supplied
        return schema(**json_result)

    async def a_generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        """Async entry point; delegates to the synchronous ``generate``."""
        return self.generate(prompt, schema)

    def _clean_json_output(self, output: str) -> str:
        """Extract a JSON object string from raw generated text.

        First tries to decode one complete JSON object starting at the first
        ``{``; if that succeeds, exactly that object's text is returned, so
        any text before or after it is dropped. Otherwise falls back to
        patching a missing opening/closing brace so truncated output still
        has a chance to parse.
        """
        output = output.strip()

        start = output.find("{")
        if start != -1:
            candidate = output[start:]
            try:
                # raw_decode stops at the end of the first complete JSON value
                # and reports where it ended, ignoring whatever follows.
                _, end = json.JSONDecoder().raw_decode(candidate)
            except json.JSONDecodeError:
                pass
            else:
                return candidate[:end]

        # Fallback: no complete object found, so add the braces that are missing.
        if not output.startswith("{"):
            output = "{" + output
        if not output.endswith("}"):
            output += "}"

        return output

    def get_model_name(self):
        """Return the display name for this model."""
        return "Llama-3.1 70B Instruct"

# Pydantic schema used to confine generation to a JSON object with one string field
class Schema(BaseModel):
    """Structured-output schema with a single string field, ``answer``."""

    answer: str


In [6]:
# Instantiate the wrapper; this loads the model and tokenizer.
custom_llm = CustomLlama3_70B()
user_input = "give me one truth about strawberries. then give me one lie about strawberries"

# Ask for an answer constrained to the Schema model
llm_output = custom_llm.generate(prompt=user_input, schema=Schema)

# Show the returned Schema instance (prints its repr, e.g. answer='...')
print(llm_output)

Downloading shards:   0%|          | 0/30 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

answer='Strawberries are a type of fruit that belongs to the rose family. \nStrawberries are native to the continent of Antarctica. \n\nWhich one is the truth and which one is the lie?'


In [7]:
# `llm_output` is a Schema instance; the test case needs the generated text
# itself, so pass its `answer` field rather than the wrapper object.
test_case = LLMTestCase(
    input=user_input,
    actual_output=llm_output.answer,
    context=["""Strawberries are a type of fruit that belongs to the rose family. Strawberries are native to regions in the Northern Hemisphere, particularly in Europe, North America, and Asia."""],
)

# The same local Llama model acts as the judge for the metric.
# NOTE(review): for HallucinationMetric a higher score appears to mean more
# hallucination (the recorded run scored 1.0 for a fabricated claim), so
# `threshold` is presumably an upper bound for passing — confirm in the docs.
metric = HallucinationMetric(threshold=0.5, model=custom_llm)
metric.measure(test_case)

# Print the evaluation results
print(metric.score)
print(metric.reason)

Output()

1.0
The score is 1.00 because the actual output contains information that is not present in the context, specifically the claim that strawberries are native to Antarctica, which contradicts the provided context that states they are native to regions in the Northern Hemisphere, particularly in Europe, North America, and Asia, not Antarctica. This indicates a complete fabrication of information by the model, hence the perfect hallucination score of 1.00
