In [3]:
!pip install selfcheckgpt datasets sentencepiece

Collecting selfcheckgpt
  Using cached selfcheckgpt-0.1.4-py3-none-any.whl
Collecting datasets
  Using cached datasets-2.16.1-py3-none-any.whl (507 kB)
Collecting sentencepiece
  Using cached sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
Collecting transformers>=4.11.3 (from selfcheckgpt)
  Using cached transformers-4.36.2-py3-none-any.whl (8.2 MB)
Collecting bert-score (from selfcheckgpt)
  Using cached bert_score-0.3.13-py3-none-any.whl (61 kB)
Collecting nltk (from selfcheckgpt)
  Using cached nltk-3.8.1-py3-none-any.whl (1.5 MB)
Collecting pyarrow-hotfix (from datasets)
  Using cached pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
Collecting aiohttp (from datasets)
  Using cached aiohttp-3.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
Collecting huggingface-hub>=0.19.4 (from datasets)
  U

## Based on the official SelfCheckGPT documentation

In [4]:
import ast

import torch
import spacy

from datasets import load_dataset
from sklearn.metrics import precision_recall_curve, auc

from selfcheckgpt.modeling_selfcheck import SelfCheckBERTScore, SelfCheckNgram, SelfCheckNLI

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
!pip install spacy
!python -m spacy download en_core_web_sm

[0mCollecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m40.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.5.0
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [6]:
# Use the GPU when PyTorch reports one as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

## Passage Example

In [7]:
# LLM's text (e.g. GPT-3 response) to be evaluated at the sentence level
nlp = spacy.load("en_core_web_sm")
passage = "Michael Alan Weiner (born March 31, 1942) is an American radio host. He is the host of The Savage Nation."

# Split the passage into sentences with spaCy; the checkers below receive this list.
sentences = [sent.text.strip() for sent in nlp(passage).sents]
print(sentences)
# Expected output:
# ['Michael Alan Weiner (born March 31, 1942) is an American radio host.', 'He is the host of The Savage Nation.']

# Other samples generated by the same LLM to perform self-check for consistency
sample1 = "Michael Alan Weiner (born March 31, 1942) is an American radio host. He is the host of The Savage Country."
sample2 = "Michael Alan Weiner (born January 13, 1960) is a Canadian radio host. He works at The New York Times."
sample3 = "Michael Alan Weiner (born March 31, 1942) is an American radio host. He obtained his PhD from MIT."

['Michael Alan Weiner (born March 31, 1942) is an American radio host.', 'He is the host of The Savage Nation.']


### SelfCheck-Ngram model

In [8]:
# Build the n-gram checker: n=1 selects a unigram model, n=2 a bigram model, etc.
selfcheck_ngram = SelfCheckNgram(n=1)

# SelfCheck-Ngram reports negative log-probabilities at sentence and document level.
# Values lie in [0.0, +inf) and a higher value means non-factual; unlike
# SelfCheck-MQAG and SelfCheck-BERTScore, the score has no upper bound.
sent_scores_ngram = selfcheck_ngram.predict(
    sentences=sentences,
    passage=passage,
    sampled_passages=[sample1, sample2, sample3],
)
print(sent_scores_ngram)

SelfCheck-1gram initialized
{'sent_level': {'avg_neg_logprob': [3.184312427726156, 3.279774864365169], 'max_neg_logprob': [3.476098689835273, 4.574710978503383]}, 'doc_level': {'avg_neg_logprob': 3.218678904916201, 'avg_max_neg_logprob': 4.025404834169327}}


### SelfCheck-BERTScore model

In [9]:
# BERTScore-based checker, constructed with baseline rescaling switched on.
selfcheck_bertscore = SelfCheckBERTScore(rescale_with_baseline=True)

# SelfCheck-BERTScore: one score per sentence, in [0.0, 1.0]; a high value means non-factual.
sent_scores_bertscore = selfcheck_bertscore.predict(
    sentences=sentences,                           # list of sentences
    sampled_passages=[sample1, sample2, sample3],  # list of sampled passages
)
print(sent_scores_bertscore)
# Reference values quoted with this example: [0.0695562  0.45590915]

SelfCheck-BERTScore initialized
[0.05884961 0.53198812]


In [None]:
### SelfCheck NLI model

In [10]:
# NLI-based checker, placed on `device` (pass a CUDA device when a GPU is available).
selfcheck_nli = SelfCheckNLI(device=device)

# Score each sentence of the passage against the sampled passages.
sent_scores_nli = selfcheck_nli.predict(
    sentences=sentences,                           # list of sentences
    sampled_passages=[sample1, sample2, sample3],  # list of sampled passages
)
print(sent_scores_nli)
# Reference values for the example above: [0.334014 0.975106 ]

SelfCheck-NLI initialized to device cuda
[0.33401403 0.97510584]


# Amazon Products Test dataset (20 rows)

In [14]:
import pandas as pd

# Read the Amazon products test CSV into a DataFrame.
data = pd.read_csv(
    's3://d3-data-bucket/labs/trustworthy_ai/data/test.csv'
)

In [16]:
from datasets import Dataset, load_dataset

# Wrap the pandas DataFrame in a `datasets.Dataset`.
amazon_dataset = Dataset.from_pandas(data)

In [None]:
# Display the dataset's repr to inspect what was built from the DataFrame.
amazon_dataset

In [18]:
# Iterable view of the dataset; the cells below consume it row by row in for-loops.
amazon_dataset_iter = amazon_dataset.to_iterable_dataset()

In [20]:
# Peek at the first record only: the loop exits after a single iteration.
for x in amazon_dataset_iter:
    sample = x
    print(x)
    break

{'title': 'X. L. Carbon Fiber Money Clip, made in the USA', 'brand': 'Roar Carbon', 'feature': "['Real Carbon Fiber', 'Made in USA', '5 year warranty against delamination', 'Holds Cash and Cards', 'You can return this item for any reason and get a full refund: no shipping charges. The item must be returned in new and unused condition.', 'Read the full returns policy', 'Go to Your Orders to start the return', 'Print the return shipping label', 'Ship it!', 'Package Dimensions:\\n                    \\n2.5 x 2.2 x 0.6 inches', 'Shipping Weight:\\n                    \\n0.32 ounces (View shipping rates and policies)']", 'description': "['When you pull out your extra large carbon fiber money clip it gives you that status as you know whats new and has people asking what it is and where did you get it? Each carbon fiber money clip is made from the highest quality Carbon Fiber in America. Each money clip is infused with an epoxy resin giving you a strong and light weight money clip. This gives

In [21]:
# Binary targets: 0.0 for an accurate label, 1.0 for either grade of inaccuracy.
label_mapping = dict(
    accurate=0.0,
    minor_inaccurate=1.0,
    major_inaccurate=1.0,
)

In [None]:
def _as_passage_list(value):
    """Return ``value`` as a list of passages.

    A list or tuple is returned as a list. A string is first parsed as a
    Python literal; if that yields a list or tuple, its items are returned as
    strings. Any other value is wrapped as a single-element list.

    NOTE(review): the layout of the 'sample_generated_description' column is
    not visible here. Other list-valued columns in this CSV appear as
    stringified lists, so both layouts are accepted -- confirm against the data.
    """
    if isinstance(value, (list, tuple)):
        return list(value)
    if isinstance(value, str):
        try:
            parsed = ast.literal_eval(value)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            parsed = None
        if isinstance(parsed, (list, tuple)):
            return [str(item) for item in parsed]
    return [value]


# Sentence-level SelfCheck-NLI scores, accumulated over every row of the dataset.
scores = []
for sample in amazon_dataset_iter:
    # predict() returns one score per element of `sentences`, so the generated
    # description is split with the same spaCy pipeline as the passage example
    # instead of being passed as one raw string.
    description_sentences = [
        sent.text.strip() for sent in nlp(sample['generated_description']).sents
    ]
    sent_scores_nli = selfcheck_nli.predict(
        sentences = description_sentences,
        sampled_passages = _as_passage_list(sample['sample_generated_description']),
    )
    # Extend inside the loop so every row contributes, not only the last one.
    scores.extend(sent_scores_nli)

In [None]:
# Show the NLI scores collected by the scoring loop.
print(scores)