# Hallucination Detection using Entropy Metrics

This notebook analyzes the effectiveness of entropy and semantic entropy for detecting hallucinations in LLM responses using the HaluEval dataset.

In [67]:
# Install required packages
!uv add datasets litellm torch numpy pandas scikit-learn tqdm vllm

/bin/bash: line 1: uv: command not found


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [30]:
import datasets
import litellm
import numpy as np
import pandas as pd
from model2vec import StaticModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
import torch
import math
from dotenv import load_dotenv
from typing import List, Dict, Tuple
from vllm import LLM, SamplingParams

from klarity.core.analyzer import EntropyAnalyzer
from klarity.estimator import UncertaintyEstimator
from klarity.models import TokenInfo

from transformers import AutoTokenizer

load_dotenv()


True

In [31]:
# Load the HaluEval dataset
def get_halueval_dataset(split_name: str = "qa"
):
    dataset = datasets.load_dataset("notrichardren/HaluEval", split_name)
    print(f"Dataset size: {len(dataset['train'])}")
    return dataset

In [32]:
# Sample the top 100 rows of the dataset
ds = get_halueval_dataset()
ds = ds['train'].select(range(100))

Dataset size: 10000


In [33]:
entropy_analyzer = EntropyAnalyzer()
uncertainty_estimator = UncertaintyEstimator(top_k=5, analyzer=entropy_analyzer)


In [34]:
def get_litellm_response(
    text: str, 
    model: str = "together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-128K",
    top_k: int = 1
) -> str:
    """Get entropy metrics for a given response."""
    try:
        response = litellm.completion(
            model=model,
            messages=[{"role": "user", "content": text}],
            logprobs=top_k,
            echo=True
        )
        tokens = response.choices[0].logprobs.tokens
        logprobs = response.choices[0].logprobs.token_logprobs
        return response, response.choices[0].message.content, tokens, logprobs

    except Exception as e:
        print(f"Error processing text: {e}")
        return None, None, None

In [35]:
r, _, _, _ = get_litellm_response("Who is the president of the United States?")
r

ModelResponse(id='91713f0f29d7e993', created=1740418885, model='together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-128K', object='chat.completion', system_fingerprint=None, choices=[Choices(finish_reason='eos', index=0, message=Message(content='As of my knowledge cutoff in 2023, the President of the United States was Joe Biden. However, please note that my information may not be up to date, and I do not have real-time access to current events. \n\nTo get the most recent and accurate information, I recommend checking a reliable news source or the official website of the White House for the latest updates on the presidency.', role='assistant', tool_calls=None, function_call=None, provider_specific_fields={'refusal': None}, refusal=None), logprobs=ChoiceLogprobs(content=None, refusal=None, token_ids=[2170, 315, 856, 6677, 45379, 304, 220, 2366, 18, 11, 279, 4900, 315, 279, 3723, 4273, 574, 13142, 38180, 13, 4452, 11, 4587, 5296, 430, 856, 2038, 1253, 539, 387, 709, 311, 2457, 11, 323

In [36]:
llm = LLM(model="HuggingFaceTB/SmolLM2-360M-Instruct")

INFO 02-24 17:41:27 config.py:549] This model supports multiple tasks: {'reward', 'embed', 'generate', 'score', 'classify'}. Defaulting to 'generate'.
INFO 02-24 17:41:27 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='HuggingFaceTB/SmolLM2-360M-Instruct', speculative_config=None, tokenizer='HuggingFaceTB/SmolLM2-360M-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=HuggingFaceTB/

INFO 02-24 17:41:27 model_runner.py:1110] Starting to load model HuggingFaceTB/SmolLM2-360M-Instruct...
INFO 02-24 17:41:27 weight_utils.py:254] Using model weights format ['*.safetensors']
INFO 02-24 17:41:27 weight_utils.py:304] No model.safetensors.index.json found in remote.


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  7.18it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  7.15it/s]



INFO 02-24 17:41:28 model_runner.py:1115] Loading model weights took 0.6748 GB
INFO 02-24 17:41:28 worker.py:267] Memory profiling takes 0.32 seconds
INFO 02-24 17:41:28 worker.py:267] the current vLLM instance can use total_gpu_memory (39.38GiB) x gpu_memory_utilization (0.90) = 35.44GiB
INFO 02-24 17:41:28 worker.py:267] model weights take 0.67GiB; non_torch_memory takes 0.00GiB; PyTorch activation peak memory takes 0.46GiB; the rest of the memory reserved for KV Cache is 34.31GiB.
INFO 02-24 17:41:29 executor_base.py:111] # cuda blocks: 56208, # CPU blocks: 6553
INFO 02-24 17:41:29 executor_base.py:116] Maximum concurrency for 8192 tokens per request: 109.78x


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.07 GiB. GPU 0 has a total capacity of 39.38 GiB of which 789.38 MiB is free. Process 1007266 has 38.59 GiB memory in use. Of the allocated memory 37.77 GiB is allocated by PyTorch, with 24.00 MiB allocated in private pools (e.g., CUDA Graphs), and 150.07 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [7]:
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-360M-Instruct")
sampling_params = SamplingParams(
    max_tokens=128,
    temperature=0.0,
    logprobs=5
)

In [8]:
def get_vllm_response(
    llm: LLM,
    tokenizer: AutoTokenizer,
    sampling_params: SamplingParams, 
    text: str,
    top_k: int = 5
):
    mean_entropy = []
    mean_semantic_entropy = []
    messages = [
        {"role": "system", "content": """\
            You are a question answering assistant. Respond with only the answer and no other context"""
        },
        {"role": "user", "content": text}]
    input_text=tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    vllm_response = llm.generate(input_text, sampling_params)
    answer = vllm_response[0].outputs[0].text
    analysis_results = uncertainty_estimator.analyze_generation(vllm_response[0])
    for token_metric in analysis_results.token_metrics:
        mean_entropy.append(token_metric.raw_entropy)
        mean_semantic_entropy.append(token_metric.semantic_entropy)
    return answer, analysis_results, np.mean(mean_entropy), np.mean(mean_semantic_entropy)

In [37]:
sample_queries = ["What is the capital of France?", "What is the capital of Spain?"]
answer, result, mean_entropy, mean_semantic_entropy = get_vllm_response(llm, tokenizer, sampling_params, sample_queries[1])

Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 28.46it/s, est. speed input: 1054.40 toks/s, output: 227.92 toks/s]


In [38]:
answer

'The capital of Spain is Madrid.'

In [40]:
ds[0]

{'knowledge': "Arthur's Magazine (1844–1846) was an American literary periodical published in Philadelphia in the 19th century.First for Women is a woman's magazine published by Bauer Media Group in the USA.",
 'question': "Which magazine was started first Arthur's Magazine or First for Women?",
 'right_answer': "Arthur's Magazine",
 'hallucinated_answer': 'First for Women was started first.',
 'task_type': 'QA'}

In [41]:
results = []
predicted_answers = []
mean_entropies = []
mean_semantic_entropies = []
correct_answers = []

In [42]:
ds[0]

{'knowledge': "Arthur's Magazine (1844–1846) was an American literary periodical published in Philadelphia in the 19th century.First for Women is a woman's magazine published by Bauer Media Group in the USA.",
 'question': "Which magazine was started first Arthur's Magazine or First for Women?",
 'right_answer': "Arthur's Magazine",
 'hallucinated_answer': 'First for Women was started first.',
 'task_type': 'QA'}

In [28]:
def check_answers(predicted_answer, correct_answer):
    # Check for exact match first
    if predicted_answer.strip().lower() == correct_answer.strip().lower():
        return True
    
    # If not an exact match, use JudgeLLM
    prompt = f"""
    Question: Are these two answers equivalent in meaning?
    Answer 1: {predicted_answer}
    Answer 2: {correct_answer}
    Please respond with only 'Yes' or 'No'.
    """
    
    _, judge_response, _, _ = get_litellm_response(prompt)
    return judge_response.strip().lower() == 'yes'

accuracy = []
# Update the main loop
for item in tqdm(ds):
    correct_answers.append(item['right_answer'])
    combined_text = f"Context: {item['knowledge']}\nQuestion: {item['question']}\n Answer:"
    predicted_answer, result, mean_entropy, mean_semantic_entropy = get_vllm_response(llm, tokenizer, sampling_params, combined_text)
    predicted_answers.append(predicted_answer)
    mean_entropies.append(mean_entropy)
    mean_semantic_entropies.append(mean_semantic_entropy)
    
    # Check if the answers match
    is_correct = check_answers(predicted_answer, item['right_answer'])
    accuracy.append(is_correct)

Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 44.16it/s, est. speed input: 4397.24 toks/s, output: 177.59 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 13.28it/s, est. speed input: 1276.46 toks/s, output: 252.59 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 31.23it/s, est. speed input: 5568.46 toks/s, output: 218.92 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 17.42it/s, est. speed input: 3034.36 toks/s, output: 261.54 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 34.64it/s, est. speed input: 3678.09 toks/s, output: 208.13 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 15.89it/s, est. speed input: 2704.12 toks/s, output: 254.47 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 46.06it/s, est. speed input: 7063.45 toks/s, output: 184.59 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 39.74it/s, est. speed input: 9955.34 toks/s, output: 199.03 toks/s]
Processed prompts: 100%|

In [78]:
prinf(f"Accuracy is {np.mean(accuracy)}")

[UncertaintyMetrics(raw_entropy=0.5193799723392782, semantic_entropy=0.5193799723392782, token_predictions=[TokenInfo(token='First', token_id=5345, logit=-0.4102168381214142, probability=0.6635063610636857, attention_score=None), TokenInfo(token='Arthur', token_id=27037, logit=-1.5352168083190918, probability=0.2154089836469735, attention_score=None), TokenInfo(token='B', token_id=50, logit=-2.910216808319092, probability=0.05446392035801202, attention_score=None), TokenInfo(token='Both', token_id=12857, logit=-4.535216808319092, probability=0.010724581795883955, attention_score=None), TokenInfo(token='Only', token_id=15017, logit=-5.035216808319092, probability=0.006504787671799594, attention_score=None)], insight=None, attention_metrics=None),
 UncertaintyMetrics(raw_entropy=0.011020957731614189, semantic_entropy=0.0006636399204034579, token_predictions=[TokenInfo(token=' for', token_id=327, logit=-0.0026856327895075083, probability=0.9973179702959892, attention_score=None), TokenInf

In [29]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc

def plot_metrics(semantic_entropy, raw_entropy, labels):
    # Calculate metrics
    semantic_auroc = roc_auc_score(labels, semantic_entropy)
    raw_auroc = roc_auc_score(labels, raw_entropy)
    
    semantic_precision, semantic_recall, _ = precision_recall_curve(labels, semantic_entropy)
    raw_precision, raw_recall, _ = precision_recall_curve(labels, raw_entropy)
    
    semantic_pr_auc = auc(semantic_recall, semantic_precision)
    raw_pr_auc = auc(raw_recall, raw_precision)

    # Plot ROC curves
    plt.figure(figsize=(15, 5))
    
    plt.subplot(1, 3, 1)
    plt.plot([0, 1], [0, 1], linestyle='--', label='Random')
    plt.plot(semantic_recall, semantic_precision, label=f'Semantic (AUC = {semantic_pr_auc:.2f})')
    plt.plot(raw_recall, raw_precision, label=f'Raw (AUC = {raw_pr_auc:.2f})')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend()

    # Plot PR curves
    plt.subplot(1, 3, 2)
    plt.hist(semantic_entropy, bins=50, alpha=0.5, label='Semantic')
    plt.hist(raw_entropy, bins=50, alpha=0.5, label='Raw')
    plt.xlabel('Entropy')
    plt.ylabel('Frequency')
    plt.title('Entropy Distribution')
    plt.legend()

    # Plot accuracy
    thresholds = np.linspace(0, 1, 100)
    semantic_accuracy = [accuracy_score(labels, semantic_entropy > t) for t in thresholds]
    raw_accuracy = [accuracy_score(labels, raw_entropy > t) for t in thresholds]

    plt.subplot(1, 3, 3)
    plt.plot(thresholds, semantic_accuracy, label='Semantic')
    plt.plot(thresholds, raw_accuracy, label='Raw')
    plt.xlabel('Threshold')
    plt.ylabel('Accuracy')
    plt.title('Accuracy vs Threshold')
    plt.legend()

    plt.tight_layout()
    plt.show()

NameError: name 'results_df' is not defined

In [None]:
plot_metrics(mean_semantic_entropies, mean_entropies, results)