In [1]:
#!pip install deepeval
#!pip install rouge-score
#!pip install evaluate

# Classical NLP testing and LLM-as-a-judge testing with DeepEval

## https://github.com/confident-ai/deepeval

In [21]:
from rouge_score import rouge_scorer
import evaluate

# expected = "ana are mere, tu ce ai?"
# predicted = "are ana mere"

# Toy sentence pair: both strings contain the same six words; only the first
# two words ("ana" / "are") appear in swapped order. `predicted` also has a
# double space between "mere" and "tu".
predicted = "ana are mere  tu ce ai"
expected = "are ana mere tu ce ai"

# ROUGE: build a scorer for three variants (rouge1, rouge2, rougeL) and score
# the pair, passing `expected` first and `predicted` second.
# NOTE(review): use_stemmer=True is applied to non-English sample text —
# confirm the stemmer is meaningful for it.
rouge = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
rouge_scores = rouge.score(expected, predicted)

# Only the F-measure of rouge1 and rougeL is reported; rouge2 is computed
# above but not printed.
print("ROUGE-1 F1:", rouge_scores["rouge1"].fmeasure)
print("ROUGE-L F1:", rouge_scores["rougeL"].fmeasure)

# BLEU: load the "bleu" metric through `evaluate` and score the same pair.
# `predictions` holds one candidate; `references` holds, for that candidate,
# a list containing a single reference string.
bleu = evaluate.load("bleu")
bleu_result = bleu.compute(predictions=[predicted], references=[[expected]])

# The result is indexed like a mapping; only its "bleu" entry is shown.
print("BLEU score:", bleu_result["bleu"])

ROUGE-1 F1: 1.0
ROUGE-L F1: 0.8333333333333334
BLEU score: 0.5623413251903491


### We will configure DeepEval to use a locally hosted LLM as the LLM judge

#### Option 1: with LM Studio

In [16]:
# Prerequisite: LM Studio must be running with one model loaded.
# Replace the --model-name value below with the name of the model you loaded.
# The recorded output of this cell reports that a local model is now used for
# all evals that require an LLM.
# NOTE(review): --api-key="test" looks like a placeholder value — confirm the
# local server at --base-url does not validate it.

!deepeval set-local-model --model-name="phi-4@q8_0" --base-url="http://localhost:1234/v1/" --api-key="test"

🙌 Congratulations! You're now using a local model for all evals that require an
LLM.


#### Option 2: create a custom LLM handler (through transformers)

#### https://docs.confident-ai.com/guides/guides-using-custom-llms

In [23]:
from deepeval import assert_test
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

def test_case():
    """Check one hard-coded refund answer against its reference with GEval.

    Builds a GEval metric named "Correctness" (threshold 0.5) whose
    evaluation parameters are the actual and the expected output, wraps a
    single shoe-refund question/answer pair in an LLMTestCase, and passes
    the case together with the metric to assert_test.
    """
    # The two test-case fields the metric is configured to look at.
    compared_fields = [
        LLMTestCaseParams.ACTUAL_OUTPUT,
        LLMTestCaseParams.EXPECTED_OUTPUT,
    ]
    judge_metric = GEval(
        name="Correctness",
        criteria="Determine if the 'actual output' is correct based on the 'expected output'.",
        evaluation_params=compared_fields,
        threshold=0.5,
    )

    # Named `refund_case` so the local does not shadow this function's name.
    # retrieval_context is attached to the case, but it is not among the
    # evaluation parameters listed for the metric above.
    refund_case = LLMTestCase(
        input="What if these shoes don't fit?",
        actual_output="You have 30 days to get a full refund at no extra cost.",
        expected_output="We offer a 30-day full refund at no extra costs.",
        retrieval_context=["All customers are eligible for a 30 day full refund at no extra costs."],
    )

    assert_test(refund_case, [judge_metric])

# Call the function defined above so the evaluation runs as part of this cell.
test_case()

None


Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:06,  6.98s/test case]


## Implement other metrics


### https://docs.confident-ai.com/docs/metrics-introduction#using-local-llm-models

In [25]:
## ....