# Hallucination Detection using Entropy Metrics

This notebook analyzes the effectiveness of entropy and semantic entropy for detecting hallucinations in LLM responses using the HaluEval dataset.

In [67]:
# Install required packages
!uv add datasets litellm torch numpy pandas scikit-learn tqdm vllm

/bin/bash: line 1: uv: command not found


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [1]:
import datasets
import litellm
import numpy as np
import pandas as pd
from model2vec import StaticModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
import torch
import math
from dotenv import load_dotenv
from typing import List, Dict, Tuple
from vllm import LLM, SamplingParams

from klarity.core.analyzer import EntropyAnalyzer
from klarity.estimator import UncertaintyEstimator
from klarity.models import TokenInfo

from transformers import AutoTokenizer

load_dotenv()


  from .autonotebook import tqdm as notebook_tqdm
2025-02-24 17:18:06,726	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


True

In [22]:
# Load the HaluEval dataset
def get_halueval_dataset(split_name: str = "qa"
):
    dataset = datasets.load_dataset("notrichardren/HaluEval", split_name)
    print(f"Dataset size: {len(dataset['train'])}")
    return dataset

In [25]:
# Sample the top 100 rows of the dataset
ds = get_halueval_dataset()
ds = ds['train'].select(range(100))

Dataset size: 10000


In [4]:
entropy_analyzer = EntropyAnalyzer()
uncertainty_estimator = UncertaintyEstimator(top_k=5, analyzer=entropy_analyzer)


In [5]:
def get_litellm_response(
    text: str, 
    model: str = "together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-128K",
    top_k: int = 1
) -> str:
    """Get entropy metrics for a given response."""
    try:
        response = litellm.completion(
            model=model,
            messages=[{"role": "user", "content": text}],
            logprobs=top_k,
            echo=True
        )
        tokens = response.choices[0].logprobs.tokens
        logprobs = response.choices[0].logprobs.token_logprobs
        return response, response.choices[0].message.content, tokens, logprobs

    except Exception as e:
        print(f"Error processing text: {e}")
        return None, None, None

In [12]:
r, _, _, _ = get_litellm_response("Who is the president of the United States?")
r

ModelResponse(id='917120c4cfd9474c', created=1740417644, model='together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-128K', object='chat.completion', system_fingerprint=None, choices=[Choices(finish_reason='eos', index=0, message=Message(content='As of my knowledge cutoff in 2023, the President of the United States was Joe Biden. However, please note that my information may not be up to date, and I do not have real-time access to current events. \n\nTo get the most recent and accurate information, I recommend checking a reliable news source or the official website of the White House for the latest updates on the presidency.', role='assistant', tool_calls=None, function_call=None, provider_specific_fields={'refusal': None}, refusal=None), logprobs=ChoiceLogprobs(content=None, refusal=None, token_ids=[2170, 315, 856, 6677, 45379, 304, 220, 2366, 18, 11, 279, 4900, 315, 279, 3723, 4273, 574, 13142, 38180, 13, 4452, 11, 4587, 5296, 430, 856, 2038, 1253, 539, 387, 709, 311, 2457, 11, 323

In [6]:
llm = LLM(model="HuggingFaceTB/SmolLM2-360M-Instruct")

INFO 02-24 17:18:09 __init__.py:207] Automatically detected platform cuda.


INFO 02-24 17:18:13 config.py:549] This model supports multiple tasks: {'reward', 'embed', 'generate', 'score', 'classify'}. Defaulting to 'generate'.
INFO 02-24 17:18:13 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='HuggingFaceTB/SmolLM2-360M-Instruct', speculative_config=None, tokenizer='HuggingFaceTB/SmolLM2-360M-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=HuggingFaceTB/

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  7.24it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  7.21it/s]



INFO 02-24 17:18:14 model_runner.py:1115] Loading model weights took 0.6750 GB
INFO 02-24 17:18:15 worker.py:267] Memory profiling takes 0.39 seconds
INFO 02-24 17:18:15 worker.py:267] the current vLLM instance can use total_gpu_memory (39.38GiB) x gpu_memory_utilization (0.90) = 35.44GiB
INFO 02-24 17:18:15 worker.py:267] model weights take 0.67GiB; non_torch_memory takes 0.09GiB; PyTorch activation peak memory takes 0.47GiB; the rest of the memory reserved for KV Cache is 34.21GiB.
INFO 02-24 17:18:15 executor_base.py:111] # cuda blocks: 56045, # CPU blocks: 6553
INFO 02-24 17:18:15 executor_base.py:116] Maximum concurrency for 8192 tokens per request: 109.46x
INFO 02-24 17:18:17 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_u

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:13<00:00,  2.52it/s]

INFO 02-24 17:18:31 model_runner.py:1562] Graph capturing finished in 14 secs, took 0.23 GiB
INFO 02-24 17:18:31 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 16.22 seconds





In [7]:
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-360M-Instruct")
sampling_params = SamplingParams(
    max_tokens=128,
    temperature=0.0,
    logprobs=5
)

In [8]:
def get_vllm_response(
    llm: LLM,
    tokenizer: AutoTokenizer,
    sampling_params: SamplingParams, 
    text: str,
    top_k: int = 5
):
    mean_entropy = []
    mean_semantic_entropy = []
    messages = [
        {"role": "system", "content": """\
            You are a question answering assistant. Respond with only the answer and no other context"""
        },
        {"role": "user", "content": text}]
    input_text=tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    vllm_response = llm.generate(input_text, sampling_params)
    answer = vllm_response[0].outputs[0].text
    analysis_results = uncertainty_estimator.analyze_generation(vllm_response[0])
    for token_metric in analysis_results.token_metrics:
        mean_entropy.append(token_metric.raw_entropy)
        mean_semantic_entropy.append(token_metric.semantic_entropy)
    return answer, analysis_results, np.mean(mean_entropy), np.mean(mean_semantic_entropy)

In [9]:
sample_queries = ["What is the capital of France?", "What is the capital of Spain?"]
answer, result, mean_entropy, mean_semantic_entropy = get_vllm_response(llm, tokenizer, sampling_params, sample_queries[1])

Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 18.30it/s, est. speed input: 678.17 toks/s, output: 146.60 toks/s]


In [11]:
answer

'The capital of Spain is Madrid.'

In [12]:
ds['train'][0]

{'knowledge': "Arthur's Magazine (1844–1846) was an American literary periodical published in Philadelphia in the 19th century.First for Women is a woman's magazine published by Bauer Media Group in the USA.",
 'question': "Which magazine was started first Arthur's Magazine or First for Women?",
 'right_answer': "Arthur's Magazine",
 'hallucinated_answer': 'First for Women was started first.',
 'task_type': 'QA'}

In [10]:
results = []
predicted_answers = []
mean_entropies = []
mean_semantic_entropies = []
correct_answers = []

In [27]:
ds[0]

{'knowledge': "Arthur's Magazine (1844–1846) was an American literary periodical published in Philadelphia in the 19th century.First for Women is a woman's magazine published by Bauer Media Group in the USA.",
 'question': "Which magazine was started first Arthur's Magazine or First for Women?",
 'right_answer': "Arthur's Magazine",
 'hallucinated_answer': 'First for Women was started first.',
 'task_type': 'QA'}

In [26]:
def check_answers(predicted_answer, correct_answer):
    # Check for exact match first
    if predicted_answer.strip().lower() == correct_answer.strip().lower():
        return True
    
    # If not an exact match, use JudgeLLM
    prompt = f"""
    Question: Are these two answers equivalent in meaning?
    Answer 1: {predicted_answer}
    Answer 2: {correct_answer}
    Please respond with only 'Yes' or 'No'.
    """
    
    _, judge_response, _, _ = get_litellm_response(prompt)
    return judge_response.strip().lower() == 'yes'

# Update the main loop
for item in tqdm(ds):
    print(item)
    correct_answers.append(item['right_answer'])
    combined_text = f"Context: {item['knowledge']}\nQuestion: {item['question']}\n Answer:"
    predicted_answer, result, mean_entropy, mean_semantic_entropy = get_vllm_response(llm, tokenizer, sampling_params, combined_text)
    predicted_answers.append(predicted_answer)
    mean_entropies.append(mean_entropy)
    mean_semantic_entropies.append(mean_semantic_entropy)
    
    # Check if the answers match
    is_correct = check_answers(predicted_answer, item['right_answer'])
    results.append(is_correct)

KeyError: "Column train not in the dataset. Current columns in the dataset: ['knowledge', 'question', 'right_answer', 'hallucinated_answer', 'task_type']"

In [78]:
results[0].token_metrics

[UncertaintyMetrics(raw_entropy=0.5193799723392782, semantic_entropy=0.5193799723392782, token_predictions=[TokenInfo(token='First', token_id=5345, logit=-0.4102168381214142, probability=0.6635063610636857, attention_score=None), TokenInfo(token='Arthur', token_id=27037, logit=-1.5352168083190918, probability=0.2154089836469735, attention_score=None), TokenInfo(token='B', token_id=50, logit=-2.910216808319092, probability=0.05446392035801202, attention_score=None), TokenInfo(token='Both', token_id=12857, logit=-4.535216808319092, probability=0.010724581795883955, attention_score=None), TokenInfo(token='Only', token_id=15017, logit=-5.035216808319092, probability=0.006504787671799594, attention_score=None)], insight=None, attention_metrics=None),
 UncertaintyMetrics(raw_entropy=0.011020957731614189, semantic_entropy=0.0006636399204034579, token_predictions=[TokenInfo(token=' for', token_id=327, logit=-0.0026856327895075083, probability=0.9973179702959892, attention_score=None), TokenInf

In [None]:
def evaluate_metrics(df: pd.DataFrame, entropy_threshold: float, semantic_threshold: float) -> Dict:
    """Evaluate the effectiveness of entropy metrics for hallucination detection."""
    # Predictions for correct responses (false positive rate)
    correct_predictions = (
        (df['correct_entropy'] > entropy_threshold) |
        (df['correct_semantic_entropy'] > semantic_threshold)
    )
    false_positive_rate = correct_predictions.mean()
    
    # Predictions for hallucinated responses
    hallu_predictions = (
        (df['hallu_entropy'] > entropy_threshold) |
        (df['hallu_semantic_entropy'] > semantic_threshold)
    )
    
    true_labels = np.ones(len(df))
    accuracy = accuracy_score(true_labels, hallu_predictions)
    f1 = f1_score(true_labels, hallu_predictions)
    
    return {
        'false_positive_rate': false_positive_rate,
        'hallucination_accuracy': accuracy,
        'hallucination_f1': f1
    }

# Try different thresholds
entropy_thresholds = np.linspace(0.5, 2.0, 10)
semantic_thresholds = np.linspace(0.3, 1.5, 10)

best_metrics = None
best_score = -float('inf')

for e_thresh in entropy_thresholds:
    for s_thresh in semantic_thresholds:
        metrics = evaluate_metrics(results_df, e_thresh, s_thresh)
        
        # Score based on high hallucination detection and low false positives
        score = metrics['hallucination_f1'] - metrics['false_positive_rate']
        
        if score > best_score:
            best_score = score
            best_metrics = {
                'entropy_threshold': e_thresh,
                'semantic_threshold': s_thresh,
                **metrics
            }

print("Best Results:")
for key, value in best_metrics.items():
    print(f"{key}: {value:.3f}")