In [1]:
import os
import random

import pandas as pd
import re
from transformers import pipeline
from concurrent.futures import ThreadPoolExecutor
from datasets import Dataset
import wikipedia
from sklearn.metrics import classification_report

from groq import Groq
from tqdm import tqdm

In [2]:
df = pd.read_csv("hallucination_output.csv")
df.columns

Index(['question', 'right_answer', 'hallucinated_answer',
       'together_llama_answer', 'together_llama_is_hallucinated',
       'together_gemma_answer', 'together_gemma_is_hallucinated',
       'groq_qwen_answer', 'groq_qwen_is_hallucinated'],
      dtype='object')

In [3]:
df_valid_llama = df.dropna(subset=["question", "right_answer", "together_llama_answer"]).copy()
df_valid_gemma = df.dropna(subset=["question", "right_answer", "together_gemma_answer"]).copy()
df_valid_qwen = df.dropna(subset=["question", "right_answer", "groq_qwen_answer"]).copy()

In [4]:
# Load NLI pipelines
nli_pipe = pipeline("text-classification", model="roberta-large-mnli", device=0)
fact_checker = pipeline("text-classification", model="ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli", device=0)

# Define vague phrases
vague_phrases = [
    "i don't know", "i do not know", "i'm not sure", "unable to", "no information",
    "not enough information", "not available", "unfortunately", "i cannot verify",
    "cannot be confirmed", "data is missing", "unknown", "n/a", "none found",
    "there is no", "i do not have access"
]

def is_vague(text):
    if pd.isna(text):
        return False
    text = text.lower()
    return any(phrase in text for phrase in vague_phrases)

def clean_markdown(text):
    return re.sub(r'[*_~`]+', '', text)

def get_first_sentence(text):
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
    return sentences[0] if sentences else text

def verify_claim_with_wikipedia(claim):
    try:
        search_results = wikipedia.search(claim)
        if not search_results:
            return "NO WIKI RESULTS", None
        page = wikipedia.page(search_results[0])
        premise = page.summary
        nli_input = f"{premise} [SEP] {claim}"
        result = fact_checker(nli_input, truncation=True)[0]
        return result['label'], result['score']
    except Exception as e:
        return "ERROR", str(e)

def run_nli_with_vagueness_and_factual_check(data, answer_column, batch_size=16, name='llama'):
    def build_input(row):
        evidence = f"The answer to the question is '{row['right_answer']}'."
        claim = row[answer_column]
        return f"{evidence} [SEP] {claim}"

    data["nli_input"] = data.apply(build_input, axis=1)
    hf_dataset = Dataset.from_pandas(data)

    def run_nli_batch(batch):
        results = nli_pipe(batch["nli_input"], truncation=True)
        return {
            "nli_label": [r["label"] for r in results],
            "nli_score": [r["score"] for r in results]
        }

    hf_dataset = hf_dataset.map(run_nli_batch, batched=True, batch_size=batch_size, desc=f"NLI for {name}")

    def factuality_callback(row):
        label = row["nli_label"]
        score = row["nli_score"]
        claim_text = row["claim_text"]
        vague = is_vague(claim_text)
    
        if label.lower() == "neutral":
            first_sentence = get_first_sentence(clean_markdown(claim_text))
            wiki_label, wiki_score = verify_claim_with_wikipedia(first_sentence)
            label = f"NEUTRAL → {wiki_label}"
            score = wiki_score if isinstance(wiki_score, float) else 0.0
    
        return {"label": label, "score": score, "is_vague": vague}

    # Parallel processing
    rows = [{
        "nli_label": hf_dataset[i]["nli_label"],
        "nli_score": hf_dataset[i]["nli_score"],
        "claim_text": row[answer_column]
    } for i, row in data.iterrows()]
    
    with ThreadPoolExecutor(max_workers=10) as executor:
        results = list(tqdm(executor.map(factuality_callback, rows), total=len(rows)))
    
    results_df = pd.DataFrame(results)
    results_df.to_csv(f"./nli_predictions/nli_{name}_results.csv", index=False)
    return results_df

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of the model checkpoint at ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassific

In [5]:
gemma_nli_results = run_nli_with_vagueness_and_factual_check(df_valid_gemma.copy(), "together_gemma_answer", name="gemma").to_dict(orient="records")

NLI for gemma:   0%|          | 0/10000 [00:00<?, ? examples/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


  lis = BeautifulSoup(html).find_all('li')
100%|██████████| 10000/10000 [11:58<00:00, 13.92it/s] 


In [15]:
def evaluate_nli(results, human_verified, threshold=0.65):
    def nli_to_pred(entry):
        if entry.get("is_vague", False):  # override for vague responses
            return True
        return ("contradiction" in entry["label"].lower()) and entry["score"] >= threshold

    predicted_labels = [nli_to_pred(r) for r in results]
    true_labels = human_verified.tolist()

    false_negatives = [
        i for i, (true, pred) in enumerate(zip(true_labels, predicted_labels))
        if true and not pred
    ]

    return classification_report(true_labels, predicted_labels), false_negatives

In [16]:
'''print("LLAMA NLI Metrics")
print(evaluate_nli(llama_nli_results, df_valid_llama['together_llama_is_hallucinated']))'''
print("Gemma NLI Metrics")
gemma_report, gemma_false_negs = evaluate_nli(gemma_nli_results, df_valid_gemma['together_gemma_is_hallucinated'])
print(gemma_report)
'''print("QWEN NLI Metrics")
print(evaluate_nli(qwen_nli_results, df_valid_qwen['groq_qwen_is_hallucinated']))'''

Gemma NLI Metrics
              precision    recall  f1-score   support

       False       0.55      0.91      0.69      5268
        True       0.63      0.17      0.26      4732

    accuracy                           0.56     10000
   macro avg       0.59      0.54      0.48     10000
weighted avg       0.59      0.56      0.49     10000



'print("QWEN NLI Metrics")\nprint(evaluate_nli(qwen_nli_results, df_valid_qwen[\'groq_qwen_is_hallucinated\']))'

In [13]:
import wikipedia
from transformers import pipeline
import re

def clean_markdown(text):
    return re.sub(r'[*_~`]+', '', text)  # removes markdown symbols like **, __, ~~ etc.

# Load fact-checking model
fact_checker = pipeline("text-classification", model="ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli")

# Function to verify a claim
def verify_claim_with_wikipedia(claim, top_k=1):
    try:
        # Search Wikipedia
        search_results = wikipedia.search(claim)
        if not search_results:
            return "NO WIKI RESULTS", None

        # Fetch top result's summary
        page = wikipedia.page(search_results[0])
        premise = page.summary

        # Prepare input for NLI
        nli_input = f"{premise} [SEP] {claim}"
        result = fact_checker(nli_input)[0]

        return result['label'], result['score']
    except Exception as e:
        return "ERROR", str(e)

# Example claim
claim = "Cadmium chloride is slightly soluble in **water**."
claim = clean_markdown(claim)

label, score = verify_claim_with_wikipedia(claim)
print(f"Claim: {claim}\nLabel: {label} (Confidence: {score:.5f})")


Some weights of the model checkpoint at ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


Claim: Cadmium chloride is slightly soluble in water.
Label: contradiction (Confidence: 0.99820)
