# Hallucination Detection using Entropy Metrics

This notebook analyzes the effectiveness of entropy and semantic entropy for detecting hallucinations in LLM responses using the HaluEval dataset.

In [3]:
# Install required packages into the current uv-managed project.
# NOTE(review): the import cell of this notebook also imports `model2vec`,
# `dotenv` and `klarity`, none of which appear in the list below -- confirm
# they are installed some other way before relying on Restart & Run All.
!uv add datasets litellm torch numpy pandas scikit-learn tqdm vllm

[2mResolved [1m190 packages[0m [2min 0.96ms[0m[0m
[2mAudited [1m174 packages[0m [2min 0.19ms[0m[0m


In [2]:
import datasets
import litellm
import numpy as np
import pandas as pd
from model2vec import StaticModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
import torch
import math
from dotenv import load_dotenv
from typing import List, Dict, Tuple
from vllm import LLM, SamplingParams

from klarity.core.analyzer import EntropyAnalyzer
from klarity.estimator import UncertaintyEstimator
from klarity.models import TokenInfo

load_dotenv()


  from .autonotebook import tqdm as notebook_tqdm
2025-02-24 11:16:16,855	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


True

In [4]:
# Load the HaluEval dataset
def get_halueval_dataset(split_name: str = "qa"):
    """Load HaluEval from the Hugging Face Hub and print a quick preview.

    Args:
        split_name: Passed as the second positional argument to
            `datasets.load_dataset` for the "notrichardren/HaluEval" repo
            (presumably the task subset such as "qa" -- confirm against the
            dataset card).

    Returns:
        The object returned by `datasets.load_dataset`; its 'train' entry is
        what gets previewed here.
    """
    halueval = datasets.load_dataset("notrichardren/HaluEval", split_name)
    train_rows = halueval['train']

    # Preview: number of training rows, then the first row.
    print(f"Dataset size: {len(train_rows)}")
    print(train_rows[0])
    return halueval

In [5]:
# Keep a handle on the loaded DatasetDict: the processing cell further down
# indexes `dataset['train']`, so the result must not be discarded here.
dataset = get_halueval_dataset()
dataset

Generating train split: 100%|██████████| 10000/10000 [00:00<00:00, 235998.33 examples/s]

Dataset size: 10000
{'knowledge': "Arthur's Magazine (1844–1846) was an American literary periodical published in Philadelphia in the 19th century.First for Women is a woman's magazine published by Bauer Media Group in the USA.", 'question': "Which magazine was started first Arthur's Magazine or First for Women?", 'right_answer': "Arthur's Magazine", 'hallucinated_answer': 'First for Women was started first.', 'task_type': 'QA'}





DatasetDict({
    train: Dataset({
        features: ['knowledge', 'question', 'right_answer', 'hallucinated_answer', 'task_type'],
        num_rows: 10000
    })
})

In [6]:
# Build the klarity entropy analyzer, then hand it to the uncertainty
# estimator configured with top_k=5.
entropy_analyzer = EntropyAnalyzer()
uncertainty_estimator = UncertaintyEstimator(
    analyzer=entropy_analyzer,
    top_k=5,
)


In [26]:
def get_response(
    text: str,
    model: str = "together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-128K",
    top_k: int = 5
) -> Tuple:
    """Send one user message through LiteLLM and return the reply with token log-probabilities.

    Args:
        text: Content of the single user message.
        model: LiteLLM model identifier to query.
        top_k: Value forwarded as the `logprobs` request parameter.

    Returns:
        A 4-tuple `(response, content, tokens, token_logprobs)` where
        `response` is the raw completion object, `content` is the message
        text of the first choice, and `tokens` / `token_logprobs` are read
        from that choice's `logprobs` field.
        If anything fails, the error is printed and
        `(None, None, None, None)` is returned, so callers can always unpack
        four values.
    """
    try:
        response = litellm.completion(
            model=model,
            messages=[{"role": "user", "content": text}],
            logprobs=top_k,
            echo=True,
        )
        first_choice = response.choices[0]
        tokens = first_choice.logprobs.tokens
        token_logprobs = first_choice.logprobs.token_logprobs
        return response, first_choice.message.content, tokens, token_logprobs

    except Exception as e:
        # Best-effort by design: report and return placeholders. The arity
        # must match the success path (four values), otherwise callers that
        # unpack the result crash with a ValueError instead of seeing None.
        print(f"Error processing text: {e}")
        return None, None, None, None

In [7]:
def get_vllm_response(
    llm: LLM,
    sampling_params: SamplingParams,
    text: str,
    top_k: int = 5
):
    """Generate completions for a single prompt with vLLM and flatten the results.

    Args:
        llm: Engine whose `generate` method is called.
        sampling_params: Sampling configuration forwarded to `llm.generate`.
        text: Prompt; submitted as a one-element batch.
        top_k: Not used by this function; kept so existing callers that pass
            it keep working.

    Returns:
        A 4-tuple `(generated_texts, num_generated_tokens, logprobs, token_ids)`
        with one entry per generated sample:
          * generated_texts: the sample's text.
          * num_generated_tokens: number of token ids in the sample.
          * logprobs: for each sample that carries log-probabilities, a list
            (one item per generated token) of the candidate log-prob values
            reported for that position. Samples without log-probabilities
            contribute no entry.
          * token_ids: the sample's token ids as a list.
    """
    vllm_response = llm.generate([text], sampling_params)

    generated_texts = []
    num_generated_tokens = []
    logprobs = []
    token_ids = []

    for request_output in vllm_response:
        for completion in request_output.outputs:
            sample_token_ids = list(completion.token_ids)
            generated_texts.append(completion.text)
            token_ids.append(sample_token_ids)
            num_generated_tokens.append(len(sample_token_ids))

            # Log-probabilities are only present when they were requested in
            # the sampling parameters; otherwise the attribute is None.
            if completion.logprobs is not None:
                logprobs.append(
                    [
                        [candidate.logprob for candidate in step.values()]
                        for step in completion.logprobs
                    ]
                )

    return generated_texts, num_generated_tokens, logprobs, token_ids
    

In [28]:
# Smoke test: send the first sample query and keep only the raw response
# object; the other three returned values are discarded.
sample_queries = [
    "What is the capital of France?",
    "What is the capital of Spain?",
]
r, _, _, _ = get_response(sample_queries[0])

In [29]:
# Display the raw response object obtained for the first sample query.
r

ModelResponse(id='91474dc559739567', created=1739979085, model='together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-128K', object='chat.completion', system_fingerprint=None, choices=[Choices(finish_reason='eos', index=0, message=Message(content='The capital of France is Paris.', role='assistant', tool_calls=None, function_call=None, provider_specific_fields={'refusal': None}, refusal=None), logprobs=ChoiceLogprobs(content=None, refusal=None, token_ids=[791, 6864, 315, 9822, 374, 12366, 13, 128009], tokens=['The', ' capital', ' of', ' France', ' is', ' Paris', '.', '<|eot_id|>'], token_logprobs=[-0.00010919571, -8.34465e-07, -5.9604645e-07, -1.1920929e-07, 0, -6.4373016e-06, -1.9907951e-05, -0.00062179565]))], usage=Usage(completion_tokens=8, prompt_tokens=42, total_tokens=50, completion_tokens_details=None, prompt_tokens_details=None), service_tier=None, prompt=[])

In [17]:
# Show only the message text of the first choice in the response.
r.choices[0].message.content

'The capital of France is Paris.'

In [None]:
# Process a subset of the dataset for testing.
#
# `dataset` is expected to be the DatasetDict returned by
# get_halueval_dataset(). `get_response_metrics` is expected to return an
# (entropy, semantic_entropy) pair for a piece of text.
# NOTE(review): no definition of `get_response_metrics` is visible in this
# notebook -- it has to be provided before this cell can run.
sample_size = 100  # Adjust based on API limits and requirements
results = []

for idx in tqdm(range(sample_size)):
    sample = dataset['train'][idx]

    # HaluEval rows expose 'question', 'right_answer' and
    # 'hallucinated_answer' (see the printed dataset features); the keys
    # 'query', 'correct' and 'hallucinated' do not exist and raise KeyError.

    # Process correct response
    correct_entropy, correct_semantic_entropy = get_response_metrics(sample['right_answer'])

    # Process hallucinated response
    hallu_entropy, hallu_semantic_entropy = get_response_metrics(sample['hallucinated_answer'])

    results.append({
        'query': sample['question'],
        'correct_entropy': correct_entropy,
        'correct_semantic_entropy': correct_semantic_entropy,
        'hallu_entropy': hallu_entropy,
        'hallu_semantic_entropy': hallu_semantic_entropy
    })

# One row per processed sample; built once from the list of records.
results_df = pd.DataFrame(results)

In [None]:
def evaluate_metrics(df: pd.DataFrame, entropy_threshold: float, semantic_threshold: float) -> Dict:
    """Score a pair of thresholds as a hallucination detector.

    A response is flagged when its entropy exceeds `entropy_threshold` OR its
    semantic entropy exceeds `semantic_threshold`.

    Args:
        df: Frame with the columns 'correct_entropy',
            'correct_semantic_entropy', 'hallu_entropy' and
            'hallu_semantic_entropy'.
        entropy_threshold: Strict lower bound on entropy for flagging.
        semantic_threshold: Strict lower bound on semantic entropy for flagging.

    Returns:
        Dict with:
          * 'false_positive_rate': fraction of correct responses flagged.
          * 'hallucination_accuracy': accuracy of the flags on the
            hallucinated responses, scored against all-ones labels.
          * 'hallucination_f1': F1 of those same flags against the same labels.
    """
    def _flagged(entropy_col, semantic_col):
        # Element-wise: either metric strictly above its threshold.
        over_entropy = df[entropy_col].gt(entropy_threshold)
        over_semantic = df[semantic_col].gt(semantic_threshold)
        return over_entropy | over_semantic

    flagged_correct = _flagged('correct_entropy', 'correct_semantic_entropy')
    flagged_hallu = _flagged('hallu_entropy', 'hallu_semantic_entropy')

    # Every row of the hallucinated columns is a positive example.
    all_positive = np.ones(len(df))

    return {
        'false_positive_rate': flagged_correct.mean(),
        'hallucination_accuracy': accuracy_score(all_positive, flagged_hallu),
        'hallucination_f1': f1_score(all_positive, flagged_hallu),
    }

# Grid search over both thresholds: 10 evenly spaced candidates each.
entropy_thresholds = np.linspace(0.5, 2.0, 10)
semantic_thresholds = np.linspace(0.3, 1.5, 10)

best_score = -float('inf')
best_metrics = None

for e_thresh, s_thresh in (
    (e, s) for e in entropy_thresholds for s in semantic_thresholds
):
    metrics = evaluate_metrics(results_df, e_thresh, s_thresh)

    # Reward detecting hallucinations, penalise flagging correct answers.
    score = metrics['hallucination_f1'] - metrics['false_positive_rate']

    # Only a strict improvement replaces the incumbent, so the first
    # threshold pair reaching the best score is the one that is kept.
    if not score > best_score:
        continue

    best_score = score
    best_metrics = {'entropy_threshold': e_thresh, 'semantic_threshold': s_thresh}
    best_metrics.update(metrics)

print("Best Results:")
for key, value in best_metrics.items():
    print(f"{key}: {value:.3f}")