In [None]:
pip install transformers datasets sentence-transformers wikipedia torch scikit-learn tqdm matplotlib pandas


In [None]:
from datasets import load_dataset
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sentence_transformers import SentenceTransformer, util
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch
import wikipedia
import pandas as pd
import os
import json
from tqdm import tqdm

# Set WANDB_DISABLED before any Trainer is built.
# NOTE(review): presumably this turns off Weights & Biases reporting in
# transformers — confirm it is still honoured by the installed version.
os.environ["WANDB_DISABLED"] = "true"

# Load TruthfulQA dataset ("generation" configuration) and keep only the
# question text of every item in its "validation" split.
dataset = load_dataset("truthful_qa", "generation")
questions = [item["question"] for item in dataset["validation"]]

# Generator model: GPT-2 text-generation pipeline, at most 60 new tokens per call.
generator = pipeline("text-generation", model="gpt2", max_new_tokens=60)

# Caching paths (relative to the working directory): JSON files that store the
# generated answers and the retrieved Wikipedia evidence, keyed by question.
gen_cache_path = "generated_answers.json"
evidence_cache_path = "wiki_evidence.json"

# Generate one answer per question, reusing cached answers where available.
# The cache maps question text -> generated text so reruns skip the slow GPT-2
# pass. Questions absent from an existing cache are generated and the cache is
# rewritten, so later lookups by question never miss.
generated_data = {}
if os.path.exists(gen_cache_path):
    with open(gen_cache_path, encoding="utf-8") as f:
        generated_data = json.load(f)

questions_without_answer = [q for q in questions if q not in generated_data]
if questions_without_answer:
    generation_failures = {}  # exception class name -> number of occurrences
    for question in tqdm(questions_without_answer, desc="Generating answers"):
        try:
            answer = generator(question)[0]["generated_text"]
        except Exception as exc:  # not a bare except: Ctrl-C must still stop the loop
            kind = type(exc).__name__
            generation_failures[kind] = generation_failures.get(kind, 0) + 1
            answer = "Error generating answer"
        generated_data[question] = answer

    # One summary line instead of silence, so placeholder answers are visible.
    if generation_failures:
        print(f"Answer generation failed for some questions: {generation_failures}")

    with open(gen_cache_path, "w", encoding="utf-8") as f:
        json.dump(generated_data, f)

# Wikipedia evidence retrieval with caching.
# The cache maps question text -> two-sentence Wikipedia summary. Questions
# absent from an existing cache are looked up and the cache is rewritten, so
# every question is guaranteed an entry afterwards.
evidence_data = {}
if os.path.exists(evidence_cache_path):
    with open(evidence_cache_path, encoding="utf-8") as f:
        evidence_data = json.load(f)

questions_without_evidence = [q for q in questions if q not in evidence_data]
if questions_without_evidence:
    lookup_failures = {}  # exception class name -> number of occurrences
    for question in tqdm(questions_without_evidence, desc="Retrieving Wikipedia evidence"):
        try:
            summary = wikipedia.summary(question, sentences=2)
        except Exception as exc:  # not a bare except: Ctrl-C must still stop the loop
            kind = type(exc).__name__
            lookup_failures[kind] = lookup_failures.get(kind, 0) + 1
            summary = "No relevant Wikipedia summary found."
        evidence_data[question] = summary

    # One summary line instead of silence, so placeholder evidence is visible.
    if lookup_failures:
        print(f"Wikipedia lookup failed for some questions: {lookup_failures}")

    with open(evidence_cache_path, "w", encoding="utf-8") as f:
        json.dump(evidence_data, f)

# Sentence-BERT encoder used to compare an answer with its evidence
sim_model = SentenceTransformer("all-MiniLM-L6-v2")

def is_consistent_with_evidence(answer, evidence, threshold=0.7):
    """Return 1 if the answer and evidence embeddings are similar enough, else 0.

    Both texts are embedded with ``sim_model`` and compared by cosine
    similarity; a similarity at or above ``threshold`` gives 1.
    """
    answer_vec, evidence_vec = (
        sim_model.encode(text, convert_to_tensor=True) for text in (answer, evidence)
    )
    similarity = util.cos_sim(answer_vec, evidence_vec).item()
    if similarity >= threshold:
        return 1
    return 0

# Assemble one labeled record per question: the generated answer, the retrieved
# evidence, and the 0/1 label from the similarity check.
labeled_data = []
for q in tqdm(questions, desc="Labeling"):
    ans, ev = generated_data[q], evidence_data[q]
    label = is_consistent_with_evidence(ans, ev)
    labeled_data.append(dict(question=q, answer=ans, evidence=ev, label=label))

# Tabular view of the records (columns: question, answer, evidence, label)
df = pd.DataFrame(labeled_data)

# Tokenizer matching the classifier checkpoint
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Encode one record as a (question, answer + evidence) pair.

    ``examples`` is indexed by the keys "question", "answer" and "evidence".
    The answer and evidence are joined with a literal " [SEP] " to form the
    second segment; the encoding is padded/truncated to 512 tokens.
    """
    second_segment = " [SEP] ".join((examples["answer"], examples["evidence"]))
    return tokenizer(
        examples["question"],
        second_segment,
        max_length=512,
        truncation=True,
        padding="max_length",
    )

# Encode every row, then pull the per-row fields out into plain Python lists.
tokenized = df.apply(tokenize_function, axis=1)
input_ids = [encoding["input_ids"] for encoding in tokenized]
attention_mask = [encoding["attention_mask"] for encoding in tokenized]
labels = df["label"].tolist()

# Dataset
class HallucinationDataset(torch.utils.data.Dataset):
    """Map-style dataset over parallel sequences of token ids, masks and labels.

    Each item is a dict with the keys "input_ids", "attention_mask" and
    "labels", whose values are tensors built from the entry at that index.
    """

    def __init__(self, input_ids, attention_mask, labels):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __len__(self):
        # The label sequence defines the number of examples.
        return len(self.labels)

    def __getitem__(self, idx):
        columns = (
            ("input_ids", self.input_ids),
            ("attention_mask", self.attention_mask),
            ("labels", self.labels),
        )
        return {key: torch.tensor(values[idx]) for key, values in columns}

# Hold out 20% of the examples for validation; the fixed random_state keeps the
# split identical between runs.
(
    train_ids, val_ids,
    train_mask, val_mask,
    train_labels, val_labels,
) = train_test_split(
    input_ids,
    attention_mask,
    labels,
    test_size=0.2,
    random_state=42,
)

train_dataset, val_dataset = (
    HallucinationDataset(train_ids, train_mask, train_labels),
    HallucinationDataset(val_ids, val_mask, val_labels),
)

# Model
# Loading a base checkpoint for sequence classification attaches a newly
# initialised 2-way classification head. Seed torch first so that head starts
# from the same weights on every run (42 matches the split's random_state).
torch.manual_seed(42)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Metrics
def compute_metrics(pred):
    """Compute classification metrics for the Trainer's evaluation loop.

    Args:
        pred: object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores); the predicted class is the argmax over the
            last axis of ``predictions``.

    Returns:
        dict with "accuracy", "precision", "recall" and "f1". The last three
        are computed for the positive class (``average="binary"``) and are
        reported as 0 when undefined, e.g. when no positive is predicted, so an
        all-negative classifier is not credited with perfect precision.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary", zero_division=0
    )
    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# TrainingArguments: fine-tuning hyperparameters. Checkpoints and other outputs
# go under ./hallucination_detector_full; every setting not listed here is left
# at the library default.
training_args = TrainingArguments(
    output_dir="./hallucination_detector_full",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01
)
# Trainer wires together the classifier, the train/validation datasets and the
# metric callback used during evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune `model` in place, then evaluate on the validation dataset and print
# the resulting metrics dict.
trainer.train()
eval_results = trainer.evaluate()
print(eval_results)


In [None]:
# Report validation metrics for the model fine-tuned in the previous cell.
# This cell used to rebuild the Trainer and call train() a second time. Because
# `model` keeps its weights between cells, that silently continued fine-tuning
# for three more epochs on top of the first run, so the reported numbers
# depended on how many times the cells had been executed. Evaluation alone is
# repeatable and leaves the trained weights untouched.
eval_results = trainer.evaluate()
print(eval_results)
