In [1]:
!pip install evaluate bert-score datasets transformers torch tqdm

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate, bert-score
Successfully installed bert-score-0.3.13 evaluate-0.4.6


In [23]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer
import json
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import f1_score, roc_auc_score, average_precision_score, accuracy_score, classification_report
import numpy as np
import torch
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from bert_score import score as bertscore
from transformers import pipeline, BertForSequenceClassification, BertTokenizer

### Import Data

In [4]:
# HaluEval configuration and split to fetch from the Hugging Face Hub.
cfg = "summarization_samples"
split = "data"

ds = load_dataset("pminervini/HaluEval", cfg, split=split)
print(ds)
print(ds.features)

# Preview: print every field of the first three rows.
for i in range(3):
    sample = ds[i]
    print(f"\nSample {i}:")
    for k, v in sample.items():
        print(f"{k}: {v}")

Dataset({
    features: ['document', 'summary', 'hallucination'],
    num_rows: 10000
})
{'document': Value('string'), 'summary': Value('string'), 'hallucination': Value('string')}

Sample 0:
document: Marseille, France (CNN)The French prosecutor leading an investigation into the crash of Germanwings Flight 9525 insisted Wednesday that he was not aware of any video footage from on board the plane. Marseille prosecutor Brice Robin told CNN that "so far no videos were used in the crash investigation." He added, "A person who has such a video needs to immediately give it to the investigators." Robin's comments follow claims by two magazines, German daily Bild and French Paris Match, of a cell phone video showing the harrowing final seconds from on board Germanwings Flight 9525 as it crashed into the French Alps. All 150 on board were killed. Paris Match and Bild reported that the video was recovered from a phone at the wreckage site. The two publications described the supposed video, but 

### EDA

In [5]:
# Convert the dataset to a DataFrame, then list its columns and the
# distribution of the 'hallucination' flag (missing values included).
df = pd.DataFrame(ds)
print(df.columns)
hallucination_counts = df['hallucination'].value_counts(dropna=False)
print(hallucination_counts)

Index(['document', 'summary', 'hallucination'], dtype='object')
hallucination
yes    5010
no     4990
Name: count, dtype: int64


In [9]:
# Map the HaluEval 'hallucination' flag onto the label strings used for the
# FactCC classification: a hallucinated summary is INCORRECT, otherwise CORRECT.
HALLUCINATION_TO_LABEL = {'yes': "INCORRECT", 'no': "CORRECT"}
df['label'] = df['hallucination'].map(HALLUCINATION_TO_LABEL)

# Series.map yields NaN for any value that is not a key of the mapping;
# fail loudly here instead of carrying NaN labels forward.
unmapped_values = df.loc[df['label'].isna(), 'hallucination'].unique()
if len(unmapped_values) > 0:
    raise ValueError(f"Unexpected 'hallucination' values: {unmapped_values.tolist()}")

print(df['label'].value_counts(dropna=False))

label
INCORRECT    5010
CORRECT      4990
Name: count, dtype: int64


### Prepare Data for Baseline Model

In [10]:
# Pull the three columns out as plain Python lists (row order preserved)
# for the baseline model.
documents, summaries, labels = (
    df[column].tolist() for column in ('document', 'summary', 'label')
)

### Build Model

In [11]:
# Checkpoint identifier handed to from_pretrained for both tokenizer and model.
model_path = "manueldeprada/FactCC"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Smoke-test inputs: a source passage and a summary that contradicts it.
text = '''The US has "passed the peak" on new coronavirus cases, the White House reported. They predict that some states would reopen this month.
The US has over 637,000 confirmed Covid-19 cases and over 30,826 deaths, the highest for any country in the world.'''
wrong_summary = '''The pandemic has almost not affected the US'''

# Encode the (document, summary) pair; only the document may be truncated.
input_dict = tokenizer(
    text,
    wrong_summary,
    max_length=512,
    padding='max_length',
    truncation='only_first',
    return_tensors='pt',
)
logits = model(**input_dict).logits
pred = torch.argmax(logits, dim=1)

# Last expression of the cell, so the predicted label is displayed.
model.config.id2label[pred.item()] # prints: INCORRECT

'INCORRECT'

In [15]:
# Seed the NumPy and PyTorch random number generators with the same value.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Run on the GPU when one is available, otherwise on the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Tokenizer length limit and number of pairs per forward pass.
MAX_LENGTH = 512
BATCH_SIZE = 16

In [16]:
class FactCCScorer:
    """Batched wrapper around a sequence-classification checkpoint.

    Loads the tokenizer and model found at ``model_path`` and labels
    (document, summary) pairs with the strings in the model's ``id2label``.
    """

    def __init__(self, model_path, batch_size, max_length, device):
        """Load tokenizer and model, and move the model to ``device``.

        Args:
            model_path: Identifier or path passed to ``from_pretrained``.
            batch_size: Number of pairs sent through the model per forward pass.
            max_length: Token length every encoded pair is padded/truncated to.
            device: Device the model and the encoded inputs are placed on.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)
        # The scorer only ever runs inference, so make evaluation mode explicit.
        self.model.eval()
        self.batch_size = batch_size
        self.max_length = max_length
        self.device = device

    def _get_prediction(self, document_batch, summary_batch):
        """Return one label string per (document, summary) pair in the batch.

        Args:
            document_batch: List of source documents.
            summary_batch: List of summaries, aligned with ``document_batch``.

        Returns:
            List of ``id2label`` strings, in input order.
        """
        # NOTE(review): the single-example sanity check in this notebook uses
        # truncation='only_first'; truncation=True is kept here so results do
        # not change -- confirm which strategy is intended.
        inputs = self.tokenizer(
            document_batch,
            summary_batch,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        inputs = {name: tensor.to(self.device) for name, tensor in inputs.items()}
        with torch.no_grad():
            logits = self.model(**inputs).logits
        # Highest-logit class index per row, mapped to its label string.
        class_ids = logits.argmax(dim=1).tolist()
        id2label = self.model.config.id2label
        return [id2label[class_id] for class_id in class_ids]

    def score_batch(self, documents, summaries):
        """Label every (document, summary) pair, ``batch_size`` pairs at a time.

        Args:
            documents: Sequence of source documents.
            summaries: Sequence of summaries, aligned with ``documents``.

        Returns:
            List of label strings, one per pair, in input order.

        Raises:
            ValueError: If ``documents`` and ``summaries`` differ in length.
        """
        # Without this check, surplus summaries would be silently ignored
        # because batching is driven by len(documents) alone.
        if len(documents) != len(summaries):
            raise ValueError(
                f"documents and summaries must have the same length, "
                f"got {len(documents)} and {len(summaries)}"
            )

        predictions = []
        for start in tqdm(range(0, len(documents), self.batch_size), desc="Scoring batches"):
            stop = start + self.batch_size
            predictions.extend(
                self._get_prediction(documents[start:stop], summaries[start:stop])
            )
        return predictions

In [17]:
# Build the scorer from the configured constants and label every pair.
factcc_scorer = FactCCScorer(
    model_path=model_path,
    batch_size=BATCH_SIZE,
    max_length=MAX_LENGTH,
    device=DEVICE,
)
predictions = factcc_scorer.score_batch(documents, summaries)

# Store the predicted labels as a new DataFrame column.
df['factcc_prediction'] = predictions

Scoring batches: 100%|██████████| 625/625 [02:30<00:00,  4.15it/s]


In [24]:
# Gold labels and FactCC predictions, both as CORRECT / INCORRECT strings.
y_true = df['label'].tolist()
y_pred = df['factcc_prediction'].tolist()

# Compute all metrics first, then report them.
accuracy = accuracy_score(y_true, y_pred)
f1_incorrect = f1_score(y_true, y_pred, pos_label='INCORRECT')
f1_correct = f1_score(y_true, y_pred, pos_label="CORRECT")

print(f"Accuracy: {accuracy:.4f}")
print(f"F1-score for 'INCORRECT' (Hallucination): {f1_incorrect:.4f}")
print(f"F1-score for 'CORRECT' (No Hallucination): {f1_correct:.4f}")

# Per-class precision / recall / F1 table.
print(classification_report(y_true, y_pred))

Accuracy: 0.4766
F1-score for 'INCORRECT' (Hallucination): 0.6084
F1-score for 'CORRECT' (No Hallucination): 0.2110
              precision    recall  f1-score   support

     CORRECT       0.43      0.14      0.21      4990
   INCORRECT       0.49      0.81      0.61      5010

    accuracy                           0.48     10000
   macro avg       0.46      0.48      0.41     10000
weighted avg       0.46      0.48      0.41     10000

