<a href="https://colab.research.google.com/github/ppmurmu/four-kites-hallucination/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install -q transformers torch accelerate sentence-transformers spacy scipy

# Load Model

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer

def load_models(llm_name="Qwen/Qwen2.5-1.5B-Instruct", embed_name="all-MiniLM-L6-v2"):
    """
    Load the causal LLM, its tokenizer and the sentence embedder.

    Args:
        llm_name: Hub id (or path) handed to AutoTokenizer / AutoModelForCausalLM.
        embed_name: Model id handed to SentenceTransformer.

    Returns:
        dict with the keys "model", "tokenizer", "embedder" and "device",
        where "device" is the string "cuda" or "cpu".
    """
    print(f"⏳ Loading LLM: {llm_name}...")
    if torch.cuda.is_available():
        target_device = "cuda"
    else:
        target_device = "cpu"

    # --- LLM + tokenizer -------------------------------------------------
    llm_tokenizer = AutoTokenizer.from_pretrained(llm_name)
    # NOTE(review): recent transformers releases warn that `torch_dtype` is
    # deprecated in favour of `dtype`; kept as-is here.
    llm_load_options = {
        "torch_dtype": "auto",
        "device_map": "auto",  # let accelerate decide placement (GPU when present)
    }
    llm = AutoModelForCausalLM.from_pretrained(llm_name, **llm_load_options)

    # --- Sentence embedder -----------------------------------------------
    print(f"⏳ Loading Embedder: {embed_name}...")
    sentence_embedder = SentenceTransformer(embed_name, device=target_device)

    print("✅ Models Loaded Successfully!")
    return dict(
        model=llm,
        tokenizer=llm_tokenizer,
        embedder=sentence_embedder,
        device=target_device,
    )

# EXECUTE LOAD (Run this once)
# Loads the LLM, tokenizer and embedder; the resulting dict is what
# HallucinationDetector receives further down in the notebook.
global_components = load_models()

⏳ Loading LLM: Qwen/Qwen2.5-1.5B-Instruct...


Access to the secret `HF_TOKEN` has not been granted on this notebook.
You will not be requested again.
Please restart the session if you want to be prompted again.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

⏳ Loading Embedder: all-MiniLM-L6-v2...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Models Loaded Successfully!


# Framework

In [4]:
import numpy as np
from sentence_transformers import util

class HallucinationDetector:
    """
    Scores how likely a short answer produced by the LLM is a hallucination.

    Two signals are combined:
      * intrinsic confidence - mean probability the model assigned to the
        tokens of its own greedy answer;
      * self-consistency - mean pairwise cosine similarity between the
        embeddings of several sampled answers.
    """

    # Weights of the two signals inside the hallucination index (see analyze()).
    CONFIDENCE_WEIGHT = 0.3
    CONSISTENCY_WEIGHT = 0.7

    def __init__(self, components):
        """
        Args:
            components: dict with the keys "model", "tokenizer", "embedder"
                and "device" (the shape returned by load_models()).
        """
        self.model = components["model"]
        self.tokenizer = components["tokenizer"]
        self.embedder = components["embedder"]
        self.device = components["device"]

    def _format_prompt(self, user_query):
        """
        Forces the model into a strict Q&A format to limit output variance.
        """
        # Direct instruction (no few-shot examples) asking for a one-word answer.
        return f"Question: {user_query}\nAnswer in a single word:"

    def get_intrinsic_score(self, prompt, max_new_tokens=3):
        """
        Greedy-decode a short answer and measure the model's confidence in it.

        Args:
            prompt: The raw user question (it is wrapped by _format_prompt).
            max_new_tokens: Upper bound on the number of generated tokens.

        Returns:
            (answer_text, avg_confidence): the decoded newly generated tokens and
            the mean probability of those tokens (0.0 if nothing was generated).
        """
        formatted_prompt = self._format_prompt(prompt)
        inputs = self.tokenizer(formatted_prompt, return_tensors="pt").to(self.device)
        prompt_len = inputs["input_ids"].shape[1]

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                # Be explicit: the checkpoint's generation config may enable
                # sampling by default, which would make the "best guess" and
                # its confidence non-deterministic.
                do_sample=False,
                return_dict_in_generate=True,
                output_scores=True,
                pad_token_id=self.tokenizer.eos_token_id,
            )

        # Decode only the newly generated tokens. Slicing by prompt length is
        # robust even when decode(encode(prompt)) does not reproduce the prompt
        # text exactly, which would defeat a string replace.
        new_tokens = outputs.sequences[0, prompt_len:]
        answer_text = self.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

        # Per-token log-probabilities of the generated tokens.
        transition_scores = self.model.compute_transition_scores(
            outputs.sequences, outputs.scores, normalize_logits=True
        )
        probs = torch.exp(transition_scores)

        # Confidence is the mean probability over ALL generated tokens.
        # numel() (not len()) is used because probs is 2-D: len() would return
        # the batch size and never detect an empty generation.
        avg_confidence = probs.mean().item() if probs.numel() > 0 else 0.0

        return answer_text, avg_confidence

    def get_consistency_score(self, prompt, samples=4, max_new_tokens=3):
        """
        Sample several answers and measure how semantically similar they are.

        If the model knows the fact, the sampled answers should agree.

        Args:
            prompt: The raw user question (it is wrapped by _format_prompt).
            samples: Number of answers to sample.
            max_new_tokens: Upper bound on the number of generated tokens.

        Returns:
            (answers, consistency_score): the lower-cased sampled answers and the
            mean pairwise cosine similarity of their embeddings (0.0 when fewer
            than two answers are available).
        """
        formatted_prompt = self._format_prompt(prompt)
        inputs = self.tokenizer(formatted_prompt, return_tensors="pt").to(self.device)
        prompt_len = inputs["input_ids"].shape[1]

        with torch.no_grad():
            sequences = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                num_return_sequences=samples,
                temperature=0.8,  # High temp to encourage diversity if uncertain
                top_k=50,
                pad_token_id=self.tokenizer.eos_token_id,
            )

        # Keep only the generated continuation of every sampled sequence.
        answers = [
            self.tokenizer.decode(seq[prompt_len:], skip_special_tokens=True).strip().lower()
            for seq in sequences
        ]

        # Semantic Consistency Check
        # If answers are ["paris", "paris", "paris", "paris"], consistency = 1.0
        # If answers are ["1899", "london", "yes", "blue"], consistency = low
        embeddings = self.embedder.encode(answers, convert_to_tensor=True)
        cosine_scores = util.cos_sim(embeddings, embeddings)

        # Strict upper triangle = every unordered pair once, no self-similarity.
        mask = torch.triu(torch.ones_like(cosine_scores), diagonal=1).bool()
        sim_values = cosine_scores[mask]

        consistency_score = sim_values.mean().item() if sim_values.numel() > 0 else 0.0

        return answers, consistency_score

    def analyze(self, query):
        """
        Runs the full check.

        Args:
            query: The raw user question.

        Returns:
            dict with "query", "prediction" (greedy answer), "samples" (sampled
            answers) and "metrics" ("confidence", "consistency" and
            "HALLUCINATION_INDEX", each rounded to two decimals).
        """
        # 1. Intrinsic (Greedy decode for best guess)
        best_answer, confidence = self.get_intrinsic_score(query)

        # 2. Extrinsic (Consistency check)
        samples, consistency = self.get_consistency_score(query)

        # 3. Score
        # Low consistency and low confidence both push the index towards 1;
        # consistency carries the larger weight.
        hallucination_index = 1.0 - (
            self.CONFIDENCE_WEIGHT * confidence + self.CONSISTENCY_WEIGHT * consistency
        )

        return {
            "query": query,
            "prediction": best_answer,
            "samples": samples,
            "metrics": {
                "confidence": round(confidence, 2),
                "consistency": round(consistency, 2),
                "HALLUCINATION_INDEX": round(hallucination_index, 2)
            }
        }

# Testing

In [5]:
# Build the detector from the already-loaded components and score one
# well-known factual question.
detector = HallucinationDetector(global_components)
result_fact = detector.analyze("What is the capital of France?")

# Emit the report in a single call; every field goes on its own line.
print(
    "--- FACT CHECK ---",
    f"Query: {result_fact['query']}",
    f"Prediction: {result_fact['prediction']}",
    f"Samples (Consistency): {result_fact['samples']}",
    f"Scores: {result_fact['metrics']}",
    sep="\n",
)



--- FACT CHECK ---
Query: What is the capital of France?
Prediction: Paris

This
Samples (Consistency): ['paris\n\nthis', 'paris.', 'paris.', 'paris\n\nthe']
Scores: {'confidence': 0.66, 'consistency': 0.88, 'HALLUCINATION_INDEX': 0.19}
