In [1]:
!pip install transformers torch



In [2]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm import tqdm

In [3]:
# Mount Google Drive at /content/drive so the dataset CSVs stored under
# MyDrive can be read and written by the cells below (Colab-only API).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Location of the paraphrased summarization dataset on the mounted Drive.
path = "/content/drive/MyDrive/NLP/LLM_Hallucination/halueval_summarization_paraphrased.csv"
# One row per example; the cells below read the `summary` and
# `summary_paraphrased` columns.
df = pd.read_csv(path)

# Preview the first rows as a sanity check on the load.
df.head()

Unnamed: 0,document,summary,summary_paraphrased
0,It has been 40 years since the historic evacua...,Ba Van Nguyen piloted a military helicopter on...,Ba Van Nguyen piloted a military helicopter on...
1,What if someone took your greatest insecuritie...,Dove's latest ad campaign shows actresses bull...,Dove's latest ad campaign shows actresses bull...
2,"So, you'd like a ""Full House"" reunion and spin...","Show will return with a one-hour special, foll...","The show will return with a one-hour special, ..."
3,Ahead of a weekend featuring drama from the Ba...,Crystal Palace is set to face West Bromwich Al...,Crystal Palace is set to face West Bromwich Al...
4,The dawn of a new era of space travel may be u...,His Blue Origin company completed a successful...,His Blue Origin company completed a successful...


In [5]:
# Hugging Face checkpoint id. Kept under its own name so that `model` only ever
# refers to the loaded network (previously it was first a string, then a module).
MODEL_NAME = "roberta-large-mnli"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
# Inference only: eval() puts the module in evaluation mode (dropout disabled).
# As the last expression of the cell it also displays the module structure.
model.eval()

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/688 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=Tru

In [6]:
# Class index -> label name for the NLI head's output logits.
# NOTE(review): this order is assumed to match `model.config.id2label` of the
# loaded checkpoint -- confirm if the checkpoint is ever changed.
label_map = {0: "contradiction", 1: "neutral", 2: "entailment"}

def nli_probs(premise: str, hypothesis: str):
    """Return the NLI class probabilities for a single premise/hypothesis pair.

    Relies on the notebook-level `tokenizer` and `model` objects.

    Args:
        premise: Text treated as the premise.
        hypothesis: Text whose relation to the premise is scored.

    Returns:
        dict with the keys "contradiction", "neutral" and "entailment", each
        mapped to a Python float; the values are a softmax over the logits.
    """
    inputs = tokenizer(
        premise,
        hypothesis,
        truncation=True,
        padding=True,
        return_tensors="pt",
        max_length=512,
    )
    # Put the encoded tensors on the same device as the model so the function
    # keeps working if the model is moved to an accelerator.
    device = model.device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits

    # Batch of one: take the single row and convert it to plain Python floats.
    probs = torch.softmax(logits, dim=-1)[0].tolist()

    # Build the result from label_map so the index/name pairing lives in one place.
    return {name: float(probs[class_id]) for class_id, name in label_map.items()}

In [34]:
# Score every (summary, paraphrase) pair with the NLI model in both directions
# and collect the entailment / contradiction probabilities into one frame.
results = []

pair_iter = zip(df.index, df["summary"], df["summary_paraphrased"])
for idx, raw_summary, raw_paraphrase in tqdm(pair_iter, total=len(df)):
    summary_text = str(raw_summary)
    paraphrase_text = str(raw_paraphrase)

    # Direction 1: the original summary is the premise.
    forward = nli_probs(summary_text, paraphrase_text)
    # Direction 2: the paraphrase is the premise.
    backward = nli_probs(paraphrase_text, summary_text)

    results.append(
        dict(
            index=idx,
            summary=summary_text,
            summary_paraphrased=paraphrase_text,
            sp_entail=forward["entailment"],
            sp_contra=forward["contradiction"],
            ps_entail=backward["entailment"],
            ps_contra=backward["contradiction"],
        )
    )

nli_df = pd.DataFrame(results)

100%|██████████| 1000/1000 [09:58<00:00,  1.67it/s]


In [35]:
# A pair is "suspicious" when the contradiction probability reaches this
# cut-off in at least one of the two NLI directions.
CONTRA_THRESHOLD = 0.3

contra_scores = nli_df[["sp_contra", "ps_contra"]]
nli_df["suspicious_contra"] = contra_scores.ge(CONTRA_THRESHOLD).any(axis=1)

# Independent copy of the flagged rows for the LLM review step.
suspicious_df = nli_df.loc[nli_df["suspicious_contra"]].copy()

print("Suspicious pairs:", len(suspicious_df), "out of", len(nli_df))
suspicious_df.head()


Suspicious pairs: 20 out of 1000


Unnamed: 0,index,summary,summary_paraphrased,sp_entail,sp_contra,ps_entail,ps_contra,suspicious_contra
0,0,Ba Van Nguyen piloted a military helicopter on...,Ba Van Nguyen piloted a military helicopter on...,0.849562,0.026522,0.01802,0.38343,True
47,47,Historians have criticized the BBC drama Wolf ...,Historians have criticized the BBC drama Wolf ...,0.138235,0.051826,0.011353,0.469424,True
49,49,David Cameron is secretly planning to cut Chil...,The Conservatives have been accused of keeping...,0.019321,0.833669,0.043859,0.024442,True
136,136,The South American boa constrictor has been ca...,The South American boa constrictor has been ca...,0.606599,0.348393,0.233074,0.53723,True
150,150,A Scottish man who scammed a dying architect o...,A Scottish man who scammed a dying architect o...,0.062103,0.880807,0.003391,0.035931,True


In [40]:
from google.colab import userdata
import os
from openai import OpenAI
import json

# Read the OpenAI key from Colab's secret store (never hard-coded in the
# notebook) and build the client used by the LLM judging step.
API_KEY = userdata.get('OPENAI_API_KEY')
client = OpenAI(api_key=API_KEY)

In [41]:
def _parse_judgment(content):
  """Decode the judge's reply into a dict, tolerating text around the JSON object."""
  if content is None:
      # The API can return a message without text content; fail with a clear error
      # instead of an AttributeError on None.strip().
      raise ValueError("LLM response contained no message content to parse")

  text = content.strip()
  try:
      data = json.loads(text)
  except json.JSONDecodeError:
      # Fallback: the reply may wrap the object in prose or a Markdown code
      # fence, so retry on the outermost {...} span.
      start = text.find("{")
      end = text.rfind("}")
      if start == -1 or end < start:
          raise json.JSONDecodeError(
              "No JSON object found in LLM response", text, 0
          ) from None
      data = json.loads(text[start:end + 1])

  if not isinstance(data, dict):
      raise ValueError(
          f"Expected a JSON object from the LLM, got {type(data).__name__}"
      )

  # Normalise casing/whitespace so downstream exact-match filters on the label
  # (e.g. isin([...])) are not defeated by cosmetic differences.
  label = data.get("label")
  if isinstance(label, str):
      data["label"] = label.strip().lower()

  return data


def judge_pair_llm(summary_a: str, summary_b: str):
  """
  Uses an LLM to check for factual drift between summaries

  Relies on the notebook-level OpenAI `client`.

  Args:
    summary_a: The original summary (reference).
    summary_b: The paraphrased summary that is compared against `summary_a`.

  Returns:
    dict parsed from the model's JSON reply. The prompt requests the keys
    "label" (one of "no_drift", "added_facts", "removed_facts",
    "contradiction") and "explanation"; a string label is lower-cased and
    stripped, but its value is not otherwise validated.

  Raises:
    ValueError: if the reply has no text content or is not a JSON object
      (json.JSONDecodeError, a ValueError subclass, when no JSON can be decoded).
  """
  system_msg = (
    "You are a careful factual consistency checker. "
    "Your job is to detect any factual drift between two summaries of the same document."
  )

  user_msg = f"""
    Summary A (original):
    {summary_a}

    Summary B (paraphrased):
    {summary_b}

    Compare Summary B against Summary A ONLY.

    Answer these questions:
    1. Does B introduce any new factual claims not supported by A?
    2. Does B remove any factual claims clearly stated in A?
    3. Does B contradict any factual claims in A?

    Respond in VALID JSON exactly in this format:

    {{
    "label": "no_drift" | "added_facts" | "removed_facts" | "contradiction",
    "explanation": "short explanation (1-2 sentences)"
    }}
  """

  resp = client.chat.completions.create(
      model="gpt-4.1-mini",
      messages=[
          {"role": "system", "content": system_msg},
          {"role": "user", "content": user_msg},
      ],
      temperature=0,
      # JSON mode: ask the API to emit a syntactically valid JSON object
      # (the prompt already instructs the model to answer in JSON).
      response_format={"type": "json_object"},
  )

  return _parse_judgment(resp.choices[0].message.content)

In [42]:
# Ask the LLM judge about every NLI-flagged pair, tagging each verdict with the
# row's original "index" value so it can be joined back onto the flagged frame.
judgments = [
    {
        **judge_pair_llm(pair["summary"], pair["summary_paraphrased"]),
        "index": pair["index"],
    }
    for _, pair in tqdm(suspicious_df.iterrows(), total=len(suspicious_df))
]

judge_df = pd.DataFrame(judgments)
suspicious_with_judgments = suspicious_df.merge(judge_df, how="left", on="index")

# Distribution of the judge's verdicts.
label_counts = suspicious_with_judgments["label"].value_counts()
print(label_counts)

100%|██████████| 20/20 [00:33<00:00,  1.66s/it]

label
removed_facts    16
no_drift          3
contradiction     1
Name: count, dtype: int64





When a paraphrase removes facts, it may be removing the hallucinated content itself, so the row's hallucination label may no longer be valid. We therefore drop the rows judged `removed_facts`, together with those judged `contradiction`, from our test set.

In [44]:
# Rows whose LLM verdict says the paraphrase lost or contradicted facts.
is_bad_label = suspicious_with_judgments["label"].isin(["removed_facts", "contradiction"])
bad_rows = suspicious_with_judgments.loc[is_bad_label]

# Distinct original row ids to remove from the test set.
bad_indices = bad_rows["index"].unique()
print("Dropping ", len(bad_rows), " rows:", bad_indices)

Dropping  17  rows: [  0  47  49 136 150 214 228 252 254 341 390 541 624 827 862 908 921]


In [46]:
# Load the test inputs and the label file from Drive, then print both row counts.
X_test, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/MyDrive/NLP/LLM_Hallucination/halueval_summarization_paraphrased.csv",
        "/content/drive/MyDrive/NLP/LLM_Hallucination/y_test.csv",
    )
)

print(len(X_test), len(y_test), sep="\n")

1000
1000


In [48]:
# Features and labels are matched purely by row position, so removing the same
# index labels from both is only valid when the two frames have equal length.
assert len(X_test) == len(y_test), "X_test and y_test must have the same number of rows"

# Drop the flagged rows from both frames and renumber from 0 so they stay aligned.
X_test_clean = X_test.drop(index=bad_indices).reset_index(drop=True)
y_test_clean = y_test.drop(index=bad_indices).reset_index(drop=True)

# Every flagged index must have been removed exactly once from each frame.
expected_rows = len(X_test) - len(bad_indices)
assert len(X_test_clean) == expected_rows, "unexpected number of rows dropped from X_test"
assert len(y_test_clean) == expected_rows, "unexpected number of rows dropped from y_test"

print(len(X_test_clean))
print(len(y_test_clean))

983
983


In [49]:
# Persist the cleaned features and labels next to the source data on Drive
# (index=False: the renumbered row index is not written out).
for frame, out_path in (
    (X_test_clean, "/content/drive/MyDrive/NLP/LLM_Hallucination/X_test_clean.csv"),
    (y_test_clean, "/content/drive/MyDrive/NLP/LLM_Hallucination/y_test_clean.csv"),
):
    frame.to_csv(out_path, index=False)