This code evaluates our LLM using the DeepEval library.

In [2]:
# Import modules
from transformers import AutoModelForCausalLM, AutoTokenizer

from deepeval.models.base_model import DeepEvalBaseLLM
from deepeval import evaluate
from deepeval.test_case import LLMTestCase
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric

In [3]:
from deepeval.metrics import (
    ContextualPrecisionMetric,
    ContextualRecallMetric,
    ContextualRelevancyMetric
)

In [4]:
# Define the model class
class FinanceLLM(DeepEvalBaseLLM):
    """Adapter exposing a Hugging Face causal LM through DeepEval's LLM interface.

    The instance wraps an already-loaded model and tokenizer; nothing is
    downloaded or loaded by this class itself.

    NOTE(review): the base-class initializer is not invoked, exactly as in the
    original code -- confirm that DeepEval does not require it.
    """

    def __init__(self, model, tokenizer, max_new_tokens: int = 256):
        """Store the model, tokenizer and generation budget.

        @params: model - loaded causal language model
                 tokenizer - tokenizer matching the model
                 max_new_tokens - upper bound on tokens generated per call
        """
        self.model = model
        self.tokenizer = tokenizer
        self.max_new_tokens = max_new_tokens

    def load_model(self):
        """Return the wrapped model object."""
        return self.model

    def generate(self, prompt: str) -> str:
        """Generate a completion for the prompt.

        @params: prompt - str
        @returns: response - str (the completion only, without the prompt)
        """
        inputs = self.tokenizer(prompt, return_tensors="pt")
        prompt_length = inputs["input_ids"].shape[-1]

        # Bound only the *new* tokens. A total-length cap (max_length) counts
        # the prompt as well, so long evaluation prompts would leave no room
        # for an answer.
        outputs = self.model.generate(
            **inputs, max_new_tokens=self.max_new_tokens
        )

        # A causal LM returns the prompt tokens followed by the continuation;
        # drop the prompt so the response is not an echo of the input.
        completion_ids = outputs[0][prompt_length:]
        return self.tokenizer.decode(completion_ids, skip_special_tokens=True)

    async def a_generate(self, prompt: str) -> str:
        """Async entry point; delegates to the synchronous generate()."""
        return self.generate(prompt)

    def get_model_name(self):
        """Return the display name used for this model."""
        return "FinanceLLM"

In [5]:
# Function to evaluate RAG
def rag_evaluation():
    """Score one hand-written RAG test case with FinanceLLM as the judge.

    Builds a single LLMTestCase, loads the FinanceLLM checkpoint, creates the
    three retrieval metrics and the two generation metrics, then prints
    whatever evaluate() returns.
    """
    # Pieces of the single test case
    question = "What is the buy and hold strategy?"
    given_answer = "The buy and hold strategy involves day trading and making quick profits."
    reference_answer = "The buy and hold strategy involves purchasing stocks or other securities and holding them for a long period, regardless of market fluctuations."
    context_passages = [
        """The buy and hold strategy is a long-term investment approach where investors purchase stocks or other securities and retain them for an extended period. This strategy is based on the belief that, despite volatility, the market will generally provide a good return over the long term."""
    ]
    case = LLMTestCase(
        input=question,
        actual_output=given_answer,
        expected_output=reference_answer,
        retrieval_context=context_passages,
    )

    # Judge model: tokenizer first, then weights, then the DeepEval wrapper
    checkpoint = "Ishreet1/FinanceLLM"
    tok = AutoTokenizer.from_pretrained(checkpoint)
    lm = AutoModelForCausalLM.from_pretrained(checkpoint)
    judge = FinanceLLM(model=lm, tokenizer=tok)

    # Retrieval metrics come first, generation metrics last
    metric_classes = (
        ContextualPrecisionMetric,
        ContextualRecallMetric,
        ContextualRelevancyMetric,
        AnswerRelevancyMetric,
        FaithfulnessMetric,
    )
    scorers = [metric_cls(model=judge) for metric_cls in metric_classes]

    # Run the evaluation and show the result
    outcome = evaluate(test_cases=[case], metrics=scorers)
    print(outcome)

# Run the evaluation
# Guarded so that importing this file as a module does not trigger a model
# download and a full evaluation run; running it as a notebook cell or script
# behaves as before.
if __name__ == "__main__":
    rag_evaluation()

: 