In [None]:
pip install transformers datasets sentence-transformers wikipedia torch scikit-learn tqdm matplotlib pandas


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinu

In [None]:
from datasets import load_dataset
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sentence_transformers import SentenceTransformer, util
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch
import wikipedia
import pandas as pd
import os
import json
from tqdm import tqdm

# Keep the Hugging Face Trainer from trying to log to Weights & Biases.
os.environ["WANDB_DISABLED"] = "true"

# Load TruthfulQA dataset
dataset = load_dataset("truthful_qa", "generation")
questions = [item["question"] for item in dataset["validation"]]

# Generator model
generator = pipeline("text-generation", model="gpt2", max_new_tokens=60)

# Caching paths
gen_cache_path = "generated_answers.json"
evidence_cache_path = "wiki_evidence.json"

# Load previously generated answers (question -> generated text) if a cache exists.
generated_data = {}
if os.path.exists(gen_cache_path):
    with open(gen_cache_path, encoding="utf-8") as f:
        generated_data = json.load(f)

# Only generate for questions the cache does not cover. With no cache this is every
# question; with a complete cache nothing is generated; with a partial/stale cache the
# gaps are filled so later `generated_data[q]` lookups cannot raise KeyError.
questions_to_generate = [q for q in questions if q not in generated_data]
if questions_to_generate:
    for question in tqdm(questions_to_generate, desc="Generating answers"):
        try:
            answer = generator(question)[0]["generated_text"]
        except Exception as exc:
            # `Exception` (not a bare `except`) so Ctrl-C / kernel interrupt still
            # stops this long loop instead of being recorded as a failed answer.
            tqdm.write(f"Generation failed for {question!r}: {exc!r}")
            answer = "Error generating answer"
        generated_data[question] = answer
    with open(gen_cache_path, "w", encoding="utf-8") as f:
        json.dump(generated_data, f)

# Wikipedia evidence retrieval with caching.
# `evidence_data` maps each question to a two-sentence Wikipedia summary, or to a
# fixed placeholder string when the lookup fails.
evidence_data = {}
if os.path.exists(evidence_cache_path):
    with open(evidence_cache_path, encoding="utf-8") as f:
        evidence_data = json.load(f)

# Fetch only what the cache is missing, so a partial/stale cache is completed
# instead of causing a KeyError when the labeled dataset is built.
questions_to_fetch = [q for q in questions if q not in evidence_data]
if questions_to_fetch:
    for question in tqdm(questions_to_fetch, desc="Retrieving Wikipedia evidence"):
        try:
            summary = wikipedia.summary(question, sentences=2)
        except Exception:
            # Lookup is best-effort (no page, ambiguous title, network error, ...).
            # Catch `Exception` rather than using a bare `except` so that a
            # keyboard/kernel interrupt still aborts the loop.
            summary = "No relevant Wikipedia summary found."
        evidence_data[question] = summary
    with open(evidence_cache_path, "w", encoding="utf-8") as f:
        json.dump(evidence_data, f)

# Sentence-BERT model used to embed answers and evidence for comparison
sim_model = SentenceTransformer("all-MiniLM-L6-v2")

def is_consistent_with_evidence(answer, evidence, threshold=0.7):
    """Compare an answer with its evidence via embedding cosine similarity.

    Args:
        answer: Text of the generated answer.
        evidence: Text of the retrieved evidence.
        threshold: Minimum cosine similarity for the pair to count as consistent.

    Returns:
        1 if the cosine similarity of the two embeddings is >= ``threshold``, else 0.
    """
    answer_vec, evidence_vec = (
        sim_model.encode(text, convert_to_tensor=True) for text in (answer, evidence)
    )
    similarity = util.cos_sim(answer_vec, evidence_vec).item()
    return int(similarity >= threshold)

# Assemble one record per question: the generated answer, the retrieved evidence,
# and the similarity-based consistency label.
labeled_data = []
for q in tqdm(questions, desc="Labeling"):
    ans, ev = generated_data[q], evidence_data[q]
    label = is_consistent_with_evidence(ans, ev)
    labeled_data.append(dict(question=q, answer=ans, evidence=ev, label=label))

# Tabular view of the records (columns: question, answer, evidence, label)
df = pd.DataFrame(labeled_data)

# Tokenizer matching the classifier checkpoint
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize one row as a (question, "answer [SEP] evidence") text pair.

    ``examples`` is indexed by "question", "answer" and "evidence"; below it is
    applied to each DataFrame row. The output is padded/truncated to 512 tokens.
    """
    paired_text = examples["answer"] + " [SEP] " + examples["evidence"]
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(examples["question"], paired_text, **encoding_options)

# One tokenizer output per row, then pull the model inputs out into plain lists.
tokenized = df.apply(tokenize_function, axis=1)
input_ids = [encoding["input_ids"] for encoding in tokenized]
attention_mask = [encoding["attention_mask"] for encoding in tokenized]
labels = df["label"].tolist()

# Torch dataset wrapping the tokenized examples
class HallucinationDataset(torch.utils.data.Dataset):
    """Index-aligned collection of input ids, attention masks and labels."""

    # Attribute names read by __getitem__, in the order they appear in each item.
    _FIELDS = ("input_ids", "attention_mask", "labels")

    def __init__(self, input_ids, attention_mask, labels):
        """Store the three parallel sequences without copying them."""
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors keyed by field name."""
        return {
            field: torch.tensor(getattr(self, field)[idx])
            for field in self._FIELDS
        }

# Hold out 20% of the examples for validation, with a fixed seed.
split_parts = train_test_split(
    input_ids,
    attention_mask,
    labels,
    test_size=0.2,
    random_state=42,
)
# train_test_split returns a (train, test) pair for each input, in input order.
(train_ids, val_ids,
 train_mask, val_mask,
 train_labels, val_labels) = split_parts

train_dataset = HallucinationDataset(
    input_ids=train_ids, attention_mask=train_mask, labels=train_labels
)
val_dataset = HallucinationDataset(
    input_ids=val_ids, attention_mask=val_mask, labels=val_labels
)

# Two-class sequence classifier initialised from the pretrained checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

# Metrics
def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for a batch of predictions.

    Args:
        pred: Object exposing ``predictions`` (per-class scores; the class is
            taken as the argmax over the last axis) and ``label_ids`` (the
            reference labels). This function is passed to ``Trainer`` as
            ``compute_metrics``.

    Returns:
        dict with keys "accuracy", "precision", "recall" and "f1". Precision,
        recall and F1 are for the positive class (``average="binary"``).
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = accuracy_score(labels, preds)
    # zero_division=0: when the model never predicts the positive class, precision is
    # undefined; report 0.0 rather than a misleading perfect 1.0.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary", zero_division=0
    )
    # Return every computed metric; previously precision/recall/F1 were computed
    # and then dropped, leaving accuracy as the only reported number.
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# TrainingArguments
training_args = TrainingArguments(
    output_dir="./hallucination_detector_full",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    # Supported way to switch off external loggers (W&B etc.); the WANDB_DISABLED
    # environment variable is deprecated according to the library's own warning.
    report_to="none",
    # The default logging interval (500 steps) is too coarse for a run this
    # short to show training loss; log every 50 steps instead.
    logging_steps=50,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune, then score the held-out validation split.
trainer.train()
eval_results = trainer.evaluate()
print(eval_results)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/9.59k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/223k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/817 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu
Generating answers:   0%|          | 0/817 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating answers:   0%|          | 1/817 [00:04<58:52,  4.33s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating answers:   0%|          | 2/817 [00:08<54:54,  4.04s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating answers:   0%|          | 3/817 [00:12<56:05,  4.13s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating answers:   0%|          | 4/817 [00:16<54:14,  4.00s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating answers:   1%|          | 5/817 [00:20<53:11,  3.93s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating answers:   1%|          | 6/817 [00:24<56:42,  4.20s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating an

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Labeling: 100%|██████████| 817/817 [01:17<00:00, 10.54it/s]


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss


{'eval_loss': 0.10383295267820358, 'eval_accuracy': 0.9817073170731707, 'eval_precision': 1.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_runtime': 291.9748, 'eval_samples_per_second': 0.562, 'eval_steps_per_second': 0.072, 'epoch': 3.0}


In [None]:
# Re-run of the fine-tuning experiment.
# Start from a freshly initialised classifier: the `model` object left by the
# previous cell has already been fine-tuned, so reusing it would silently continue
# training and make this cell's result depend on how often it has been executed.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

training_args = TrainingArguments(
    output_dir="./hallucination_detector_full",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    # Supported replacement for the deprecated WANDB_DISABLED environment variable.
    report_to="none",
    # The default interval (500 steps) is too coarse for a run this short to
    # show training loss, so log every 50 steps.
    logging_steps=50,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune, then score the held-out validation split.
trainer.train()
eval_results = trainer.evaluate()
print(eval_results)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss


{'eval_loss': 0.10587287694215775, 'eval_accuracy': 0.9817073170731707, 'eval_precision': 1.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_runtime': 4.5614, 'eval_samples_per_second': 35.954, 'eval_steps_per_second': 4.604, 'epoch': 3.0}
