## Setup

In [1]:
# For transformer models
!pip install -q accelerate
# !pip install -q bitsandbytes
!pip install -i https://pypi.org/simple/ bitsandbytes
# !pip install -q flash-attn --no-build-isolation

# For sentence similarity
!pip install sentence_transformers

# For web queries
!pip install googlesearch-python

# For Retrieval Augmentated Generation (RAG) since HF doesn't have great support for it
!pip install langchain
!pip install chromadb

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/297.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/297.6 kB[0m [31m960.2 kB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/297.6 kB[0m [31m1.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━[0m [32m235.5/297.6 kB[0m [31m2.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m297.0/297.6 kB[0m [31m2.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hLooking in indexes: https://pypi.org/simple/
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0

In [2]:
# Import libraries
import os
import json
import requests

import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig

import torch.nn.functional as F

from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

from collections import Counter

In [3]:
# Set up colab environment variables
from google.colab import userdata

os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')
os.environ['SERPER_API_KEY'] = userdata.get('SERPER_API_KEY')

In [8]:
# Set up HuggingFace authentication
from huggingface_hub import login, notebook_login
# notebook_login()
login(os.environ.get('HF_TOKEN'))

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [9]:
# Define the quantization configuration (ref: https://huggingface.co/blog/4bit-transformers-bitsandbytes)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load pre-trained model and tokenizer (might take a while to download model weights)
model_name = "mistralai/Mistral-7B-Instruct-v0.2"  # [meta-llama/Llama-2-13b, allenai/OLMo-7B]
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True, # Trust the model weights from the remote server
    device_map="auto", # Use all RAM from GPU, CPU, disk, in that order (ref: https://huggingface.co/docs/accelerate/en/usage_guides/big_modeling#using--accelerate)
    quantization_config=bnb_config, # Quantize the model using bitsandbytes
    # attn_implementation='flash_attention_2', # Use flash attention 2 (ref: https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one?install=NVIDIA#flashattention-2)
)

# Test the model for fun
prompt = "Tell me a joke aboutlarge language models"
input_ids = tokenizer.encode(prompt, return_tensors="pt")
output_ids = model.generate(input_ids, max_length=1024, num_return_sequences=1, early_stopping=True)
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(output_text)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Tell me a joke aboutlarge language models...

Why did the large language model go to the party? It wanted to expand its social circle.

(I'm a large language model, so I thought I'd share a joke about us!)

(Also, I'm a large language model, so I can make this joke as many times as I want without getting tired!)

(But seriously, I hope you found that joke amusing. I'll be here if you need any assistance or just want to chat!)


In [10]:
# Utility function to retry a function until it succeeds
def retry_function(fn, num_retries=5):
    for i in range(num_retries):
        try:
            return fn()
        except Exception as e:
            print(f"Failed attempt {i+1}/{num_retries}: {e}")
            continue
        break
    raise Exception(f"Failed after {num_retries} attempts")

# Utility function to convert a multiline string to a list
# Python's eval() function doesn't support this
def multiline_string_to_list(string):
    # Remove leading and trailing whitespace and newlines
    string = string.strip()

    # Check if the string starts with '[' and ends with ']'
    if string.startswith('[') and string.endswith(']'):
        # Remove the opening and closing brackets
        string = string[1:-1]

        # Split the string by commas and newlines
        items = string.split(',')

        # Strip whitespace and single/double quotes from each item
        cleaned_items = [item.strip().strip("'").strip('"') for item in items]

        return cleaned_items
    else:
        raise ValueError("Invalid input format. The string should represent a valid Python list.")

In [11]:
!wget https://raw.githubusercontent.com/shayantist/LLM-FactChecker/main/data/examples.json

--2024-04-29 05:24:13--  https://raw.githubusercontent.com/shayantist/LLM-FactChecker/main/data/examples.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5325 (5.2K) [text/plain]
Saving to: ‘examples.json’


2024-04-29 05:24:13 (51.9 MB/s) - ‘examples.json’ saved [5325/5325]



In [12]:
# Load examples from JSON file
with open('examples.json', 'r') as f:
    examples = json.load(f)

# Load Sentence Transformer model for sentence/example similarity
sentence_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Function to select the best few-shot examples
def select_best_examples(input, examples, example_key, num_examples=3):
    """
    Selects the best few-shot examples based on semantic similarity to the input.

    Args:
        claim (str): The input claim.
        examples (list): A list of examples.
        example_key (str): The key to use for comparison to the input.
        template (str): The prompt template.
        num_examples (int): The number of examples to return.

    Returns:
        list: The best few-shot examples.
    """
    # Extract the specific sentences to compare to the input
    example_inputs = [example[example_key] for example in examples]

    # Calculate sentence embeddings for the input sentence and the examples
    input_embeddings = sentence_model.encode(input)
    example_embeddings = sentence_model.encode(example_inputs)

    # Calculate cosine similarity scores between them
    similarity_scores = cos_sim(input_embeddings, example_embeddings).flatten()

    # Filter out any examples that are too similar to the input
    similarity_scores = similarity_scores[similarity_scores < 1]

    # Select the top k similar examples
    best_example_idx = similarity_scores.topk(num_examples).indices

    best_examples = [examples[idx] for idx in best_example_idx]
    return best_examples

# # Example usage
# claim = "The United States has had two black presidents: Barack Obama, who served two terms from 2009 to 2017, and Donald Trump, who served one term from 2017 to 2021."
# best_examples = select_best_examples(claim, examples["claim_atomization_examples"], "statement", 3)
# best_examples

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## Subtasks

### Task 1: Claim Atomization


In [13]:
## Claim Atomization

# Define prompt template (ref: https://docs.mistral.ai/guides/prompting_capabilities/)
claim_atomization_template = """
You are a helpful assistant. Your task is to break down a set of statements given after <<<>>> into a minimal number of atomic claims.
These atomic claims need to be comprehensible, coherent, and context-independent.

Segmentation Criteria:
1. Each sub-claim should focus on a single idea or concept.
2. Sub-claims should be independent of each other and not rely heavily on the context of the original statement.
3. Aim for clarity and coherence in the segmented sub-claims.

You will only respond with the atomic claims in the format of a single, one-dimensional Python list of string objects in exactly one line.
Do not provide any explanations or notes.

###
Here are some examples:
{examples}
###

<<<
Statements: {statements}
>>>
Atomic Claims: ["""

def generate_atomic_claims(statements, num_examples=3):
    """
    Generates atomic claims for the input statements.

    Args:
        claim (str): The input statements.
        num_examples (int, optional): The number of few-shot examples to include in the prompt. Defaults to 3.

    Returns:
        str: The generated atomic claims.
    """
    if num_examples > 0: # Populate the prompt with few-shot examples (w/ proper formatting)
        examples_text = ""
        best_examples = select_best_examples(statements, examples["claim_atomization_examples"], "statement", num_examples)

        # Add each example to the prompt
        for example in best_examples:
            examples_text += f"Statements: {example['statement']}\n"
            examples_text += f"Atomic Claims: {example['atomic_claims']}\n"

        # Finally, fill in the prompt template with the examples and the input statements
        prompt = claim_atomization_template.format(examples=examples_text.strip(), statements=statements).strip()
    else: # Otherwise leave the examples section of the prompt template blank and only include the input statements
        prompt = claim_atomization_template.format(examples="", statements=statements.strip()).strip()

    # Print the entire prompt for debugging purposes
    # print(prompt)

    # Tokenize the prompt
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)

    # Generate the response using the model
    output_ids = model.generate(
        input_ids,
        max_new_tokens=256,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.unk_token_id,
        num_return_sequences=1,
        early_stopping=True
    )
    # Decode the generated text
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Extract only the list of claims from the model's output
    try:
        # Assuming output format directly returns Python list
        atomic_claims = multiline_string_to_list(output_text.split('Atomic Claims:')[-1].strip())
        # POST-PROCESSING ERROR HANDLING: If list contains lists, return a flattened list
        if isinstance(atomic_claims[0], list):
            atomic_claims = [item for sublist in atomic_claims for item in sublist]
        return atomic_claims
    except:
        print(f"Error parsing model output: {output_text}")
        return ["Error parsing model output"]

# # Example usage for claim atomization
# statement = 'After a 2022 law, the vast majority of colleges in New York State do not have on-campus poll sites.'
# atomic_claims = generate_atomic_claims(statement, num_examples=3)
# print(f"Statement: {statement}")
# print(f"Atomic Claims: {atomic_claims}")

### Task 2: Question Generation

In [14]:
## Question Generation

# Define prompt template
question_generation_template = """
You are a helpful assistant. Your task is to provide a set of unique, independent questions to search on the web to verify the claim given after <<<>>>.

Question generation criteria:
1. Each question should be context-independent and answered independently (i.e., without access to claim)
1. Each question should be able to be fact-checked by a True/False.
2. Be as specific and concise as possible. Try to minimize the number of questions.
4. Include enough details to ensure that the claim can be verified.

You will only respond with the generated questions in the format of a single, one-dimensional Python list in exactly one line (no multi-line lists).
Do not provide any explanations or notes.

###
Here are some examples:
{examples}
###

<<<
Claim: {claim}
>>>
Questions: ["""

def generate_questions(claim, num_examples=3):
    """
    Generates questions to verify the factuality of the input claim.

    Args:
        claim (str): The input claim.
        num_examples (int, optional): The number of few-shot examples to include in the prompt. Defaults to 3.

    Returns:
        str: The generated questions.
    """
    if num_examples > 0: # Populate the prompt with few-shot examples (w/ proper formatting)
        examples_text = ""
        best_examples = select_best_examples(claim, examples["question_generation_examples"], "claim", num_examples)

        # Add each example to the prompt
        for example in best_examples:
            examples_text += f"Claim: {example['claim']}\n"
            examples_text += f"Questions: {example['questions']}\n"

        # Finally, fill in the prompt template with the examples and the input claim
        prompt = question_generation_template.format(examples=examples_text.strip(), claim=claim).strip()
    else: # Otherwise leave the examples section of the prompt template blank and only include the input claim
        prompt = question_generation_template.format(examples="", claim=claim).strip()

    # Print the entire prompt for debugging purposes
    # print(prompt)

    # Tokenize the prompt
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)

    # Generate the response using the model
    output_ids = model.generate(
        input_ids,
        max_new_tokens=256,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.unk_token_id,
        num_return_sequences=1,
        early_stopping=True
    )

    # Decode the generated text
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Extract only the list of questions from the model's output
    try:
        # Assuming output format directly returns Python list
        questions = multiline_string_to_list(output_text.split('Questions:')[-1].strip())
        return questions
    except:
        print(f"Error parsing model output: {output_text}")
        return ["Error parsing model output"]

# # Example usage for question generation
# claim = "Donald Trump said ‘Crime is down in Venezuela by 67%'"
# questions = generate_questions(claim, num_examples=2)
# print(f"Claim: {claim}")
# print(f"Questions: {questions}")

### Task 3: Web Querying & Scraping

In [15]:
## Web Querying & Scraping
import json
import requests
import pprint
import re
from bs4 import BeautifulSoup

# Make sure we don't scrape from known fact checking websites
SOURCE_BLACKLIST = ['politifact.org', 'factcheck.org']

def extract_website_name(url):
    """Extracts the website name from a given URL using regex"""
    match = re.search(r'(?P<url>https?://[^\s]+)', url)
    if match:
        url = match.group('url')
        return url.split('//')[1].split('/')[0].lower().replace('www.', '')
    return None

def scrape_text_from_website(url):
    """Scrapes text and metadata from a given website URL."""
    try:
        response = requests.get(url, timeout=5)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')

            # Remove script and style tags
            for script in soup(["script", "style"]):
                script.decompose()

            # Extract all text from the website
            text = soup.get_text()

            # Clean up whitespace
            text = re.sub(r'\s+', ' ', text).strip()

            return text
        else:
            print(f"Failed to retrieve content from the URL: {url}")
            return None
    except Exception as e:
        print(f"Error during website scraping: {e}")
        return None

def fetch_search_results(question, scrape_website=False):
    """
    Fetches search results for a given question using an API.

    Args:
        question (str): The question to search for.
        scrape_website (bool, optional): Whether to scrape the website content. Defaults to False.

    Returns:
        list: A list of organic search results.
    """
    api_key = os.environ.get("SERPER_API_KEY")

    headers = {
        "X-API-KEY": api_key,
        "Content-Type": "application/json",
    }

    payload = json.dumps({"q": question})
    try:
        response = requests.post("https://google.serper.dev/search", headers=headers, data=payload)
        result = json.loads(response.text)

        # Extract the organic search results and transform them into our desired format
        results = []
        for item in result['organic']:
            # ALSO while iterating through the results, remove any websites on our source blacklist
            source = extract_website_name(item.get('link', ''))
            if source in SOURCE_BLACKLIST: continue
            website_text = scrape_text_from_website(item.get('link', '')) if scrape_website else item.get('snippet', '')
            if website_text is None or website_text == '': # if we failed to scrape the website, use the snippet
                website_text = item.get('snippet', '')
            results.append({
                "title": item.get('title', ''),
                "source": source,
                "date_published": item.get('date', ''),
                "relevant_excerpt": item.get('snippet', ''),
                "text": website_text,
                "search_position": item.get('position', -1),
                "url": item.get('link', ''),
            })
        return results

    except Exception as e:
        print(f"Failed to fetch information: {e}")
        return []

# # Example usage
# question = "What is the estimated cost of the Green New Deal according to its proponents?"
# search_results = fetch_search_results(question, scrape_website=True)
# search_results

### Task 4: Retrieval Augmented Generation (RAG) Retriever

In [16]:
## Retrieval Augmented Generation (RAG) Retriever
from langchain.docstore.document import Document
from langchain.vectorstores import Chroma
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
import torch

import copy

# Initialize embedding model for retrieval (sentence similarity)
BATCH_SIZE = 32
device = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
retriever_model_id='sentence-transformers/all-MiniLM-L6-v2'
retriever_model = HuggingFaceEmbeddings(
    model_name=retriever_model_id,
    model_kwargs={'device': device},
    encode_kwargs={'device': device, 'batch_size': BATCH_SIZE},
)

def retrieve_relevant_documents_using_rag(search_results, content_key, question, chunk_size=512, chunk_overlap=128, top_k=10):
    """
    Takes in search results and a query question, processes and splits the documents,
    and retrieves relevant documents using a RAG approach.

    Args:
        search_results (list of dict): A list of dictionaries containing web-scraped data.
        question (str): The query question for retrieving relevant documents.
        content_key (str): The key in the dictionary containing the text content.
        chunk_size (int): The maximum size of the text chunks.
        chunk_overlap (int): The overlap between consecutive text chunks.
        top_k (int): The number of relevant documents to retrieve.

    Returns:
        list: A list of relevant document chunks.
    """
    # Create LangChain documents from search results
    documents = []
    for result in search_results:
        page_content = result.pop(content_key, None)  # Extract the text content, remaining keys are metadata
        if page_content is not None:
            documents.append(Document(page_content=page_content, metadata=result))

    # Split documents into smaller chunks (if needed, based on document size)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    split_documents = text_splitter.split_documents(documents)

    # Initialize ChromaDB vector store to index the document chunks
    db = Chroma.from_documents(
        documents=split_documents,
        embedding=retriever_model,
    )

    # Retrieve the most relevant chunks for the given question
    relevant_docs = db.max_marginal_relevance_search(question, k=top_k)

    return relevant_docs

# # Example usage
# question = "What is the estimated cost of the Green New Deal according to its proponents?"
# relevant_docs = retrieve_relevant_documents_using_rag(search_results, 'text', question)
# relevant_docs

### Task 5: RAG-based Question Answering

In [17]:
## RAG-based Question Answering

# Define prompt template
answer_synthesis_template = """
You are a helpful assistant. Your task is to synthesize the documents (along with their source metadata) provided below to answer the question given after <<<>>>.
Only use the documents below to answer the question. In a separate section below your answer titled "Sources:", cite the relevant documents you used to answer the question as a Python list."
If you cannot answer the question given the relevant documents, just say that you don't have enough information to answer the question. Do not make up an answer or sources.

Here are the relevant documents:
{documents}

<<<
Question: {question}
>>>
Answer: """

def synthesize_answer(relevant_docs, question, return_sources=True):
    """
    Synthesizes an answer to a given question using the relevant documents.

    Args:
        relevant_docs (list of dict): A list of relevant document chunks.
        question (str): The question to answer.

    Returns:
        str: The synthesized answer.
    """
    # Format the relevant documents for the prompt
    documents_text = ""
    for doc in relevant_docs:
        documents_text += f"Title: {doc.metadata.get('title', '')}\n"
        documents_text += f"URL: {doc.metadata.get('url', '')}\n"
        documents_text += f"Text: {doc.page_content.strip()}\n"
        documents_text += f"Date Published: {doc.metadata.get('date_published', '')}\n\n"

    # Fill in the prompt template with the relevant documents and the question
    prompt = answer_synthesis_template.format(documents=documents_text.strip(), question=question).strip()
    prompt = prompt.replace('\n\n\n', '\n')

    # Print the entire prompt for debugging purposes
    # print(prompt)

    # Tokenize the prompt
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)

    # Generate the response using the model
    output_ids = model.generate(
        input_ids,
        max_new_tokens=1024,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.unk_token_id,
        num_return_sequences=1,
    )

    # Decode the generated text
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Extract the answer and sources separately from the model's output
    try:
        answer = output_text.split('Answer:')[-1].split('Sources:')[0].strip()
        sources = output_text.split('Sources:')[-1].strip()
        if return_sources: return answer, sources
        return answer
    except:
        raise ValueError(f"Error parsing model output: {output_text}")

# # Example usage for RAG-based question answering (intentionally mismatched with the search results above for testing)
# question = "What is the estimated cost of the Green New Deal?"
# answer, sources = synthesize_answer(relevant_docs, question)
# print(f"Question: {question}")
# print(f"Answer: {answer}")
# print(f"Sources: {sources}")

### Task 6: Claim Classification

In [18]:
## Claim Classification

# Define prompt template for reasoning and classification
claim_classification_template = """
You are a logical reasoning assistant. Given the original claim, a set of questions to help verify the claim, and their answers, use logical reasoning to come to a verdict on whether the claim is true or false.
Think step-by-step about your reasoning process.
Return the verdict after "Verdict:" and provide a clear explanation after "Reasoning:"
For the verdict, only classify the claim as "True" or "False".

Claim: {claim}

{questions_and_answers}

Verdict: """

def classify_claim(claim, questions, answers, return_reasoning=True):
    """
    Uses a chain-of-thought approach to classify the original claim as true or false based on the answers to generated questions.

    Args:
        claim (str): The original claim.
        questions (list): List of questions related to the claim.
        answers (list): List of answers corresponding to the questions.

    Returns:
        str: The conclusion whether the claim is true or false with reasoning.
    """
    # Format the questions and answers into a single string
    questions_and_answers = ""
    for question, answer in zip(questions, answers):
        questions_and_answers += f"Question: {question}\nAnswer: {answer}\n\n"

    # Fill in the prompt template with the claim and formatted questions and answers
    prompt = claim_classification_template.format(claim=claim, questions_and_answers=questions_and_answers)

    # Print the entire prompt for debugging purposes
    # print(prompt)

    # Tokenize the prompt
    input_ids = tokenizer(prompt, return_tensors="pt")

    # Generate the response using the model
    output_ids = model.generate(
        input_ids=input_ids["input_ids"],
        max_new_tokens=512,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        num_return_sequences=1,
    )

    # Decode the generated text
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Extract the verdict and reasoning separately from the model's output
    try:
        verdict = output_text.split('Verdict:')[-1].split('Reasoning:')[0].strip()
        reasoning = output_text.split('Reasoning:')[-1].strip()
        if return_reasoning: return verdict, reasoning
        return verdict
    except:
        raise ValueError(f"Error parsing model output: {output_text}")

# # Example usage
# claim = "The Green New Deal would cost American taxpayers over $90 trillion."
# questions = ["What is the estimated cost of the Green New Deal?", "How will the Green New Deal be funded?"]
# answers = ["The estimated cost is around $93 trillion according to some experts.", "It would be funded through various taxes and government budgets."]
# verdict, reasoning  = classify_claim(claim, questions, answers)
# print(f"Claim: {claim}")
# print(f"Verdict: {verdict}")
# print(f"Reasoning: {reasoning}")

### Task 7: Generate a FactScore for the Original Statement

In [19]:
## Generate Fact Score Label for Statement (Statement Classification)

def generate_fact_score_label(verdicts):
    """
    Generates a fact score label based on the verdicts provided. The fact score label can be one of the following:
    - True: All atomic claims are true.
    - Mostly True: More than half of the atomic claims are true.
    - Half True: Half of the atomic claims are true.
    - Mostly False: More than half of the atomic claims are false.
    - Pants on Fire: All atomic claims are false.
    - Unverifiable: The number of unverifiable atomic claims is greater than or equal to the number of true/false atomic claims.

    Args:
        verdicts (list): A list of verdicts (True/False/Unverifiable) for each atomic claim within a statement.

    Returns:
        str: The fact score label.
    """

    label = 'Unknown'
    perc_unverified = 0
    v_cleaned = verdicts
    if 'Unveriable' in verdicts:
        v_cleaned = verdicts.remove('Unverifiable')
        perc_unverified = Counter(verdicts)['Unverifiable'] / len(verdicts)
    perc_true = Counter(verdicts)['True'] / len(verdicts)
    perc_false = Counter(verdicts)['False'] / len(verdicts)
    perc = [perc_true, perc_false, perc_unverified]
    winner = np.argwhere(perc == np.amax(perc))

    if len(winner) == 3: # three-way tie
        label = "Unverifiable"

    elif len(winner) == 2: # two-way tie
        if 0 in winner and 1 in winner: # half true
            label = 'Half True'
        elif 0 in winner and 2 in winner: # true & unverifable
            label = "Unverifiable"
        elif 1 in winner and 2 in winner: # false & unverifable
            label = "Unverifiable"

    elif winner == 0:
        if perc_true == 1: # all true
            label = "True"
        elif Counter(v_cleaned)['True'] / len(v_cleaned) > 0.5: # mostly true
            label = "Mostly True"

    elif winner == 1:
        if perc_false == 1: # all false
            label = "Pants on Fire"
        elif Counter(v_cleaned)['False'] / len(v_cleaned) > 0.5: # mostly false
            label = "Mostly False"

    elif winner == 2:
        label = 'Unverifiable'
    return label

## Putting It All Together

In [20]:
# Final Code Block: Putting It All Together
def verify_statement(statement, num_examples=3):
    """
    Runs the entire fact-checking pipeline for the input claim.

    Args:
        statement (str): The input statement(s).
        num_examples (int, optional): The number of few-shot examples to include in the prompts. Defaults to 3.

    Returns:
        tuple: A tuple containing the atomic claims, questions, and reasoning/verification for the claim.
    """
    # Write out the whole pipeline and be verbose about what's happening (print out the steps)
    atomic_claims = generate_atomic_claims(statement, num_examples=num_examples)
    print("Atomic Claims generated:", len(atomic_claims))

    results = []  # List to store all the info for each atomic claim (claim, questions, answers, verdict, reasoning)
    verdicts = []

    for i, claim in enumerate(atomic_claims[1:], start=1):
        print(f"Processing Atomic Claim {i}/{len(atomic_claims)}:")
        print("\tClaim:", claim)

        res = {}
        res['claim'] = claim

        questions = generate_questions(claim, num_examples=num_examples)
        print("\tQuestions generated:", len(questions))

        res['qa-pairs'] = []
        answers = []
        for j, question in enumerate(questions, start=1):
            print(f"\n\t\tQuestion {j}/{len(questions)}:", question)

            search_results = fetch_search_results(question)
            relevant_docs = retrieve_relevant_documents_using_rag(search_results, 'relevant_excerpt', question)

            answer, source = synthesize_answer(relevant_docs, question)
            answers.append(answer)

            res['qa-pairs'].append({'question': question, 'answer': answer, 'source': source})

            print(f"\t\tAnswer {j}/{len(questions)}:", answer)
            # print(f"\t\tSources {j}:", source)

        verdict, reasoning = classify_claim(claim, questions, answers)
        verdicts.append(verdict)
        res['verdict'] = verdict
        res['reasoning'] = reasoning

        print("\tVerdict:", verdict)
        print("\tReasoning:", reasoning)

        results.append(res)

    print("\nVerdicts:", verdicts)

    fact_score = generate_fact_score_label(verdicts)
    print("\nFact Score:", fact_score)

    return fact_score, results

In [21]:
import pandas as pd

df = pd.read_csv('pilot.csv', index_col=0)
df

Unnamed: 0_level_0,statement_originator,statement,questions to verify the statement,statement_date,statement_source,factchecker,factcheck_date,factcheck_analysis_link,Unnamed: 9
verdict,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
FALSE,Instagram posts,"""Chicken and cows have high levels of estrogen...",x,2/26/2024,Social Media,Sofia Ahmed,3/1/2024,https://www.politifact.com/factchecks/2024/mar...,
FALSE,Joe Biden,"""He[Trump] has caused the deficit with China t...",x,?,debate,CNN,,https://www.cnn.com/factsfirst/politics/factch...,
TRUE,Joe Biden,"""Millions of Arizonans will soon live under an...",x,4/9/2024,speech,,,https://www.nytimes.com/2024/04/09/us/politics...,
MOSTLY TRUE,Jonathan Becker,"After a 2022 law, “The vast majority of colleg...",x,2/16/2024,written,Politifact,4/8/2024,https://www.politifact.com/factchecks/2024/apr...,
FALSE,X user viral post,Social Security Administration data shows the ...,x,4/4/2024,Social Media,AP news,,,
FALSE,X user viral post,Immigrants in the U.S. illegally on Tuesday st...,x,4/16/2024,Social Media,AP news,4/17/2024,https://apnews.com/article/fact-check-immigran...,
FALSE,Donald Trump,"""This year, the typical family’s tax bill is t...",What is the average American family tax bill i...,4/15/2024,Truth Social,factcheck.org,4/17/2024,https://www.factcheck.org/2024/04/trumps-unfou...,
MOSTLY FALSE,"Nicole Shanahan, Robert F. Kennedy Jr.’s choic...",“I discovered that women’s fertility is in pre...,What is the global average fertility rate? Why...,3/26/2024,Speech,factcheck.org,4/4/2024,https://www.factcheck.org/2024/04/scicheck-sha...,
MOSTLY TRUE,Joe Biden,"""Billionares pay an average federal tax rate o...",What is the average tax rate for a U.S. Billio...,,,,,,
MOSTLY TRUE,Joe Biden,"""100 million Americans can no longer be denied...",Does President Trump want to repeal the ACA? H...,3/9/2024,State of the Union speech,factcheck.org,,https://www.factcheck.org/2024/04/familiar-cla...,


In [None]:
# Example usage of entire pipeline
statement = "Gen Z is divided 50-50 on the issue of support for Hamas or Israel."
fact_score, results = verify_statement(statement)



Number of Atomic Claims generated: 3
Processing Atomic Claim 1/3:
	Claim: Gen Z is divided on the issue of support for Hamas or Israel.
	Number of questions generated: 5
		Question 1/5: What percentage of Gen Z supports Hamas?
		Answer 1: According to the Harvard-Harris poll cited in the articles, 48% of 18-to-24 year olds are neutral on the issue of supporting Hamas or Israel. Therefore, it is not accurate to say that 50% of Gen Z supports Hamas based on this information alone.
		Question 2/5: What percentage of Gen Z supports Israel?
		Answer 2: According to the Axios articles, 48% of Gen Z and millennials believe the U.S. should publicly voice support of Israel. This percentage can be interpreted as an indication of Gen Z's support for Israel, although it does not directly state the percentage of Gen Z that supports Israel per se.
		Question 3/5: What is the stance of Gen Z towards Hamas?
		Answer 3: According to the Harvard-Harris poll mentioned in the PolitiFact articles, Gen Z is

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


		Answer 5: Yes, according to the documents provided, a Harvard-Harris poll found that among 18-to-24 year olds, 48% sympathize more with Palestinians and 42% sympathize more with Israel. However, it's important to note that this does not necessarily mean they support Hamas or Israel, but rather where their sympathies lie.




	Verdict: True
	Reasoning: The claim that Gen Z is divided on the issue of support for Hamas or Israel is supported by the evidence provided. The answers to the questions indicate that there is a significant portion of Gen Z that is neutral, sympathizes with Palestine, or opposes Israel. This aligns with the claim that Gen Z is divided on the issue.
Processing Atomic Claim 2/3:
	Claim: The division among Gen Z on this issue is approximately 50-50.
	Number of questions generated: 5
		Question 1/5: What percentage of Gen Z supports this issue?
		Answer 1: The documents do not provide enough information to answer the question with certainty. Some documents mention specific issues that Gen Z has strong opinions on, such as defunding the police, access to birth control, and LGBT rights. However, none of the documents provide a percentage of Gen Z that supports a particular issue beyond the specific examples given. Therefore, it is not possible to answer the question with the information pro

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


		Answer 5: I cannot answer the question given the relevant documents. The documents do not provide enough information to determine what "etc." refers to in the context of handling parsing errors in Python LangChain.
	Verdict: False
	Reasoning: The claim states that the division among Gen Z on a specific issue is approximately 50-50. However, the documents provided do not give enough information to verify this claim. The documents only provide information about Gen Z's opinions on certain issues, but they do not give a percentage of Gen Z that supports or opposes these issues beyond the specific examples given. Therefore, it is not possible to determine if the claim is true or false based on the information provided.

Verdicts: ['True', 'False']

Fact Score: Half True


In [None]:
import json
print(json.dumps(results, indent=2))

[
  {
    "claim": "Gen Z is divided on the issue of support for Hamas or Israel.",
    "qa-pairs": [
      {
        "question": "What percentage of Gen Z supports Hamas?",
        "answer": "According to the Harvard-Harris poll cited in the articles, 48% of 18-to-24 year olds are neutral on the issue of supporting Hamas or Israel. Therefore, it is not accurate to say that 50% of Gen Z supports Hamas based on this information alone.",
        "source": "- [\"Fact check: Is Gen Z is divided '50-50' on supporting Hamas or Israel?\", wral.com, Nov 3, 2023]\n- [\"Fact check: Is it true that 50% of Gen Zers support Hamas?\", statesman.com, Nov 4, 2023]\n- [\"Fact check: Is it true that 50% of Gen Zers support Hamas?\", statesman.com, Nov 4, 2023]\n- [\"Fact check: Is it true that 50% of Gen Zers support Hamas?\", statesman.com, Nov 4, 2023]\n- [\"Fact check: Is it true that 50% of Gen Zers support Hamas?\", statesman.com, Nov 4, 2023]\n- [\"PolitiFact: Is it true that 50% of Gen Zers suppo