In [2]:
# !pip install selfcheckgpt transformers spacy

import torch
from selfcheckgpt.modeling_selfcheck import SelfCheckMQAG, SelfCheckBERTScore, SelfCheckNgram
from transformers import pipeline
import spacy

nlp = spacy.load("en_core_web_sm")

def set_seed(seed):
    """Seed PyTorch's random number generators.

    Args:
        seed: Integer seed handed to ``torch.manual_seed`` and, when CUDA is
            available, to ``torch.cuda.manual_seed_all`` as well.
    """
    torch.manual_seed(seed)
    # Nothing more to do on a CPU-only machine.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


def self_check_llm(passage, sample_list, device=None):
    """Score each sentence of ``passage`` against passages sampled from the same LLM.

    Runs three SelfCheckGPT variants (MQAG, BERTScore and unigram n-gram),
    prints each result, and returns all three.

    Args:
        passage: The LLM response to evaluate, as a single string. It is split
            into sentences with the module-level spaCy pipeline ``nlp``.
        sample_list: Non-empty iterable of other passages sampled from the same
            LLM. Every element is used, so any number of samples (>= 1) works.
        device: Device passed to ``SelfCheckMQAG``. When ``None`` (the default),
            CUDA is used if ``torch.cuda.is_available()``, otherwise CPU.

    Returns:
        A tuple ``(sent_scores_mqag, sent_scores_bertscore, sent_scores_ngram)``:
        the values returned by the ``predict`` method of each scorer, in that
        order.

    Raises:
        ValueError: If ``sample_list`` is empty.
    """
    # Validate before constructing any scorer so bad input fails fast.
    sampled_passages = list(sample_list)
    if not sampled_passages:
        raise ValueError("sample_list must contain at least one sampled passage")

    # Resolve the device locally instead of depending on a notebook global
    # that may not have been defined yet on a fresh kernel.
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # NOTE: the scorers are constructed on every call.
    selfcheck_mqag = SelfCheckMQAG(device=device)
    selfcheck_bertscore = SelfCheckBERTScore(rescale_with_baseline=True)
    selfcheck_ngram = SelfCheckNgram(n=1)  # n=1 means Unigram, n=2 means Bigram, etc.

    # --------------------------------------------------------------------------------------------------------------- #
    # Split passage into sentences (spaCy sentence tokenization).
    sentences = [sent.text.strip() for sent in nlp(passage).sents]

    # --------------------------------------------------------------------------------------------------------------- #
    # SelfCheck-MQAG: Score for each sentence where value is in [0.0, 1.0] and high value means non-factual
    # Additional params for each scoring_method:
    # -> counting: AT (answerability threshold, i.e. questions with answerability_score < AT are rejected)
    # -> bayes: AT, beta1, beta2
    # -> bayes_with_alpha: beta1, beta2
    sent_scores_mqag = selfcheck_mqag.predict(
        sentences=sentences,                  # list of sentences
        passage=passage,                      # passage (before sentence-split)
        sampled_passages=sampled_passages,    # list of sampled passages
        num_questions_per_sent=5,             # number of questions to be drawn
        scoring_method='bayes_with_alpha',    # options = 'counting', 'bayes', 'bayes_with_alpha'
        beta1=0.8, beta2=0.8,                 # additional params depending on scoring_method
    )
    print("SelfCheck-MQAG:", sent_scores_mqag)

    # --------------------------------------------------------------------------------------------------------------- #
    # SelfCheck-BERTScore: Score for each sentence where value is in [0.0, 1.0] and high value means non-factual
    sent_scores_bertscore = selfcheck_bertscore.predict(
        sentences=sentences,                  # list of sentences
        sampled_passages=sampled_passages,    # list of sampled passages
    )
    print("SelfCheck-BERTScore:", sent_scores_bertscore)

    # --------------------------------------------------------------------------------------------------------------- #
    # SelfCheck-Ngram: Score at sentence- and document-level where value is in [0.0, +inf) and high value means non-factual
    # as opposed to SelfCheck-MQAG and SelfCheck-BERTScore, SelfCheck-Ngram's score is not bounded
    sent_scores_ngram = selfcheck_ngram.predict(
        sentences=sentences,
        passage=passage,
        sampled_passages=sampled_passages,
    )
    print("SelfCheck-Ngram scores:", sent_scores_ngram)

    return sent_scores_mqag, sent_scores_bertscore, sent_scores_ngram


# Example of the SelfCheck-Ngram result structure (values from the example run):
# {'sent_level': { # sentence-level score similar to MQAG and BERTScore variant
#     'avg_neg_logprob': [3.184312, 3.279774],
#     'max_neg_logprob': [3.476098, 4.574710]
#     },
#  'doc_level': {  # document-level score such that avg_neg_logprob is computed over all tokens
#     'avg_neg_logprob': 3.218678904916201,
#     'avg_max_neg_logprob': 4.025404834169327
#     }
# }

# Seed PyTorch and pick the compute device before any scorer is built.
set_seed(42)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# LLM's text (e.g. a GPT-3 response) to be evaluated at the sentence level.
passage = "Michael Alan Weiner (born March 31, 1942) is an American radio host. He is the host of The Savage Nation."

# Other responses sampled from the same LLM; the passage is checked for consistency against these.
sample1 = "Michael Alan Weiner (born March 31, 1942) is an American radio host. He is the host of The Savage Country."
sample2 = "Michael Alan Weiner (born January 13, 1960) is a Canadian radio host. He works at The New York Times."
sample3 = "Michael Alan Weiner (born March 31, 1942) is an American radio host. He obtained his PhD from MIT."
sample_list = [sample1, sample2, sample3]

# Left as the last expression so the notebook also displays the returned tuple.
self_check_llm(passage, sample_list)





The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


SelfCheck-MQAG initialized to device cuda
SelfCheck-BERTScore initialized
SelfCheck-1gram initialized
['Michael Alan Weiner (born March 31, 1942) is an American radio host.', 'He is the host of The Savage Nation.']
SelfCheck-MQAG: [0.21206286 0.4177945 ]
SelfCheck-BERTScore: [0.05884961 0.53198777]
SelfCheck-Ngram scores: {'sent_level': {'avg_neg_logprob': [3.184312427726156, 3.279774864365169], 'max_neg_logprob': [3.476098689835273, 4.574710978503383]}, 'doc_level': {'avg_neg_logprob': 3.218678904916201, 'avg_max_neg_logprob': 4.025404834169327}}


(array([0.21206286, 0.4177945 ]),
 array([0.05884961, 0.53198777]),
 {'sent_level': {'avg_neg_logprob': [3.184312427726156, 3.279774864365169],
   'max_neg_logprob': [3.476098689835273, 4.574710978503383]},
  'doc_level': {'avg_neg_logprob': 3.218678904916201,
   'avg_max_neg_logprob': 4.025404834169327}})

In [5]:
from selfcheckgpt.modeling_selfcheck import SelfCheckNLI
# Build the NLI-based checker on the device chosen in the earlier cell.
selfcheck_nli = SelfCheckNLI(device=device)

# Inputs: the sampled passages and the spaCy sentence split of the evaluated passage.
sample_list = [sample1, sample2, sample3]
sentences = [span.text.strip() for span in nlp(passage).sents]
print(sentences)

# Score the sentences against the sampled passages.
sent_scores_nli = selfcheck_nli.predict(sentences=sentences, sampled_passages=sample_list)
print(sent_scores_nli)


SelfCheck-NLI initialized to device cuda
['Michael Alan Weiner (born March 31, 1942) is an American radio host.', 'He is the host of The Savage Nation.']
[0.33401404 0.97510576]


In [6]:
# Option1: open-source model
from selfcheckgpt.modeling_selfcheck import SelfCheckLLMPrompt
# Resolve the device again so this cell sets it explicitly.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Model identifier handed to the prompt-based checker.
llm_model = "mistralai/Mistral-7B-Instruct-v0.2"
selfcheck_prompt = SelfCheckLLMPrompt(llm_model, device)

# `sentences` and `sample_list` come from the previous cell; verbose=True shows a progress bar.
sent_scores_prompt = selfcheck_prompt.predict(sentences=sentences, sampled_passages=sample_list, verbose=True)

print(sent_scores_prompt)
# [0.33333333, 0.66666667] -- based on the example above


# Option2: API access (currently only supports client_type="openai")
# from selfcheckgpt.modeling_selfcheck_apiprompt import SelfCheckAPIPrompt
# selfcheck_prompt = SelfCheckAPIPrompt(client_type="openai", model="gpt-3.5-turbo")

# sent_scores_prompt = selfcheck_prompt.predict(
#     sentences = sentences,                          # list of sentences
#     sampled_passages = [sample1, sample2, sample3], # list of sampled passages
#     verbose = True, # whether to show a progress bar
# )
# print(sent_scores_prompt)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

SelfCheck-LLMPrompt (mistralai/Mistral-7B-Instruct-v0.2) initialized to device cuda


100%|██████████| 2/2 [00:01<00:00,  1.56it/s]

[0.33333333 0.66666667]



