# DeepEval Framework Testing

### Date: August 23, 2024
### Author: Selin Kaplanoglu
### Status: Currently on pause
**Goal:** Test the DeepEval framework as a tool for evaluating LLM responses.

In [1]:
# Imports
import re
import json

import transformers
import torch

from transformers import BitsAndBytesConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

from lmformatenforcer import JsonSchemaParser
from lmformatenforcer.integrations.transformers import (
    build_transformers_prefix_allowed_tokens_fn,
)

from deepeval.test_case import LLMTestCase
from deepeval import evaluate
from deepeval.models import DeepEvalBaseLLM
from deepeval.metrics import (HallucinationMetric, 
                              FaithfulnessMetric, 
                              BiasMetric,
                              ToolCorrectnessMetric
                              )

from pydantic import BaseModel

In [3]:
# define custom llm class for deepeval
class CustomLlama3_8B(DeepEvalBaseLLM):
    """DeepEval model wrapper around a 4-bit quantized causal language model.

    The model and tokenizer are loaded once in ``__init__``. ``generate``
    constrains decoding with lm-format-enforcer so that the generated text is
    JSON matching a caller-supplied pydantic schema.
    """

    def __init__(self, model_path: str = None):
        """Load the quantized model and its tokenizer.

        Args:
            model_path: Identifier or path handed to ``from_pretrained`` for
                both the model and the tokenizer. It is required; the
                ``None`` default is kept only for signature compatibility.

        Raises:
            ValueError: If ``model_path`` is missing or empty.
        """
        # NOTE(review): DeepEvalBaseLLM.__init__ is not invoked here; confirm
        # that this is intentional.
        if not model_path:
            raise ValueError(
                "model_path is required, e.g. "
                "'meta-llama/Meta-Llama-3-8B-Instruct'"
            )

        # Quantization settings: 4-bit NF4 weights, float16 compute,
        # double quantization enabled
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
        )

        # Load the 4-bit model and tokenizer
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            device_map="auto",
            quantization_config=quantization_config,
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)

        # Last path component; a trailing "/" is dropped first so that a
        # directory-style path does not produce an empty name
        self.model_name = model_path.rstrip("/").split("/")[-1]

        # The text-generation pipeline is created on first use and then
        # reused, see _get_pipeline()
        self._text_pipeline = None

    def load_model(self):
        """Return the already-loaded model object."""
        return self.model

    def _get_pipeline(self):
        """Return the text-generation pipeline, building it on first use."""
        # getattr keeps this working for instances whose __init__ did not set
        # the cache attribute (e.g. a subclass with its own constructor)
        pipeline = getattr(self, "_text_pipeline", None)
        if pipeline is None:
            pipeline = transformers.pipeline(
                "text-generation",
                model=self.load_model(),
                tokenizer=self.tokenizer,
                use_cache=True,
                device_map="auto",
                max_length=2500,
                do_sample=True,
                top_k=5,
                num_return_sequences=1,
                eos_token_id=self.tokenizer.eos_token_id,
                # The EOS token id doubles as the padding token id
                pad_token_id=self.tokenizer.eos_token_id,
            )
            self._text_pipeline = pipeline
        return pipeline

    def generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        """Generate a schema-conforming answer for ``prompt``.

        Args:
            prompt: Text passed to the text-generation pipeline.
            schema: A pydantic model *class* (not an instance). Its JSON
                schema constrains decoding and the class is instantiated
                with the parsed result.

        Returns:
            An instance of ``schema`` built from the generated JSON.

        Raises:
            json.JSONDecodeError: If the generated text is not valid JSON.
        """
        pipeline = self._get_pipeline()

        # Create parser required for JSON confinement using lmformatenforcer
        parser = JsonSchemaParser(schema.model_json_schema())
        prefix_function = build_transformers_prefix_allowed_tokens_fn(
            pipeline.tokenizer, parser
        )

        # The generated text starts with the prompt itself; slice it off
        output_dict = pipeline(prompt, prefix_allowed_tokens_fn=prefix_function)
        output = output_dict[0]["generated_text"][len(prompt):]

        # Collapse newlines to spaces, trim surrounding whitespace, parse JSON
        output = output.replace('\n', ' ').strip()
        json_result = json.loads(output)

        return schema(**json_result)

    async def a_generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        """Async entry point; delegates to the synchronous ``generate``."""
        return self.generate(prompt, schema)

    def get_model_name(self):
        """Return the model name with hyphens replaced by spaces."""
        return self.model_name.replace('-', ' ')

# Output schema passed to CustomLlama3_8B.generate: its JSON schema constrains
# decoding, and the parsed JSON is used to build an instance of this class.
class Schema(BaseModel):
    # The model's answer as a single string
    answer: str

In [None]:
# Model identifier forwarded to from_pretrained by CustomLlama3_8B
model_path = "meta-llama/Meta-Llama-3-8B-Instruct"

# Build the wrapper; this loads the quantized model and its tokenizer
llama = CustomLlama3_8B(model_path)

# Pydantic model class that generated answers must conform to
schema = Schema

In [None]:
# Prompt sent to the model.
# NOTE(review): the name `input` shadows the Python builtin; it is kept
# because the test-case cell further down reads this variable.
input = "You are a Scientist. Tell me about TAF13."

# Request a schema-constrained answer from the llama wrapper and display it
actual_output = llama.generate(prompt=input, schema=schema)
print(actual_output)

In [None]:
# Reference statements about TAF13 supplied to the test case as context
context = [
    (
        "TAF13, or TATA-Box Binding Protein Associated Factor 13, "
        "is a protein that is encoded by the TAF13 gene in humans."
    ),
    "It is a subunit of the transcription initiation factor TFIID",
    (
        "TAF13 is involved in RNA polymerase II transcription initiation "
        "and promoter clearance: TAF13 is part of the TFIID complex,which "
        "plays a major role in the initiation of transcription that is "
        "dependent on RNA polymerase II."
    ),
    "TAF13 is involved in gene expression.",
    "TAF13 is involved in DNA-binding transcription factor activity.",
]

# Bundle the prompt, the model's answer and the reference material.
# `actual_output` is a schema instance, so the text is read from `.answer`.
test_case = LLMTestCase(
    input=input,
    actual_output=actual_output.answer,
    context=context,
    retrieval_context=["transcription initiation factor"],
)

# Every metric is given the local llama wrapper as its `model`
hallucination_metric = HallucinationMetric(model=llama)
faithfulness_metric = FaithfulnessMetric(model=llama)
bias_metric = BiasMetric(model=llama)

# The evaluation call is currently disabled; uncomment to score the test case
# (more test cases can be added to the first list to evaluate in bulk)
# evaluate([test_case], [hallucination_metric, faithfulness_metric, bias_metric])