## Setup

In [1]:
# For transformer models
!pip install -q accelerate
# !pip install -q bitsandbytes
!pip install -i https://pypi.org/simple/ bitsandbytes
# !pip install -q flash-attn --no-build-isolation

# For sentence similarity
!pip install -q sentence_transformers

# For web queries
!pip install -q googlesearch-python

# For Retrieval Augmentated Generation (RAG) since HF doesn't have great support for it
!pip install -q langchain chromadb

# For using the Unofficial HuggingChat Python API: https://github.com/Soulter/hugging-chat-api
!pip install -q hugchat

!pip install -q mistralai

In [91]:
import os
import sys
import json

import dotenv

# Load environment variables from .env file
dotenv.load_dotenv()

# Add the parent directory to sys.path so we can import other py files
sys.path.append('../')

# Hide warnings cuz they're annoying
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load examples from JSON file
with open('../data/examples.json', 'r') as f:
    examples = json.load(f)

In [79]:
# DeepInfra API Setup
from openai import OpenAI

# Create an OpenAI client with your deepinfra token and endpoint
client = OpenAI(
    api_key=os.environ["DEEPINFRA_API_TOKEN"],
    base_url="https://api.deepinfra.com/v1/openai",
)

model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
def get_LLM_response(prompt: str, model_name: str = model_name) -> str:
    chat_response = client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2,
        top_p=0.3,
        n=1,
    )
    return chat_response.choices[0].message.content

## Subtasks

In [88]:
from prompts import claim_atomization_template
from utils.nlp_utils import select_best_examples
from utils.code_utils import multiline_string_to_list

def generate_atomic_claims(statements, num_examples=3):
    """
    Generates atomic claims for the input statements.

    Args:
        claim (str): The input statements.
        num_examples (int, optional): The number of few-shot examples to include in the prompt. Defaults to 3.

    Returns:
        str: The generated atomic claims.
    """
    if num_examples > 0: # Populate the prompt with few-shot examples (w/ proper formatting)
        examples_text = ""
        best_examples = select_best_examples(statements, examples["claim_atomization_examples"], "statement", num_examples)

        # Add each example to the prompt
        for example in best_examples:
            examples_text += f"Statements: {example['statement']}\n"
            examples_text += f"Atomic Claims: {example['atomic_claims']}\n"

        # Finally, fill in the prompt template with the examples and the input statements
        prompt = claim_atomization_template.format(examples=examples_text.strip(), statements=statements).strip()
    else: # Otherwise leave the examples section of the prompt template blank and only include the input statements
        prompt = claim_atomization_template.format(examples="", statements=statements.strip()).strip()

    # # Print the entire prompt for debugging purposes
    # print(prompt)

    # Generate atomic claims using the API
    # output_text = get_LLM_response(prompt)
    output_text = """
    [
    "There exists a person who is the top donor to a major super PAC supporting Donald Trump for president in 2024",
    "This same person is also the top donor to the super PAC supporting Robert F. Kennedy Jr. for president."
    ]
    """

    # Extract only the list of claims from the model's output
    # Assuming output format directly returns Python list
    try:
        output_text = output_text.split('Atomic Claims:')[-1].strip()
        atomic_claims = multiline_string_to_list(output_text)
        return atomic_claims
    except:
        print(f"Error parsing model output: {output_text}\nRetrying...")
        return generate_atomic_claims(statements, num_examples)

# Example usage
statements = 'After a 2022 law, “The vast majority of colleges in New York State do not have on-campus poll sites.”'
atomic_claims = generate_atomic_claims(statements, num_examples=3)
print(f'Statement: {statements}')
print(f'Atomic Claims: {atomic_claims}')

Statement: After a 2022 law, “The vast majority of colleges in New York State do not have on-campus poll sites.”
Atomic Claims: ['There exists a person who is the top donor to a major super PAC supporting Donald Trump for president in 2024', 'This same person is also the top donor to the super PAC supporting Robert F. Kennedy Jr. for president.']


In [6]:
## Question Generation
from prompts import question_generation_template

def generate_questions(claim, num_examples=3):
    """
    Generates questions to verify the factuality of the input claim.

    Args:
        claim (str): The input claim.
        num_examples (int, optional): The number of few-shot examples to include in the prompt. Defaults to 3.

    Returns:
        str: The generated questions.
    """
    if num_examples > 0: # Populate the prompt with few-shot examples (w/ proper formatting)
        examples_text = ""
        best_examples = select_best_examples(claim, examples["question_generation_examples"], "claim", num_examples)

        # Add each example to the prompt
        for example in best_examples:
            examples_text += f"Claim: {example['claim']}\n"
            examples_text += f"Questions: {example['questions']}\n"

        # Finally, fill in the prompt template with the examples and the input claim
        prompt = question_generation_template.format(examples=examples_text.strip(), claim=claim).strip()
    else: # Otherwise leave the examples section of the prompt template blank and only include the input claim
        prompt = question_generation_template.format(examples="", claim=claim).strip()

    # Print the entire prompt for debugging purposes
    # print(prompt)

    # Generate questions using the API
    output_text = get_LLM_response(prompt)

    # Extract only the list of questions from the model's output
    try:
        # Assuming output format directly returns Python list
        questions = multiline_string_to_list(output_text.split('Questions:')[-1].strip())
        return questions
    except:
        print(f"Error parsing model output: {output_text}\nRetrying...")
        return generate_questions(claim, num_examples)
    
# # Example usage for question generation
# claim_question = dict()
# for i, claim in enumerate(atomic_claims):
#   questions = generate_questions(claim)
#   claim_question[claim] = questions
#   print(f"Claim: {claim}")
#   print(f"Questions: {claim_question[claim]}")

In [7]:
## Web Querying & Scraping
import json
import requests
import re
from bs4 import BeautifulSoup

SOURCE_BLACKLIST = ['politifact.com', 'factcheck.org', 'snopes.com']

def extract_website_name(url):
    """Extracts the website name from a given URL using regex"""
    match = re.search(r'(?P<url>https?://[^\s]+)', url)
    if match:
        url = match.group('url')
        return url.split('//')[1].split('/')[0].lower().replace('www.', '')
    return None

def scrape_text_from_website(url):
    """Scrapes text and metadata from a given website URL."""
    try:
        response = requests.get(url, timeout=5)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser', from_encoding="iso-8859-1")

            # Remove script and style tags
            for script in soup(["script", "style"]):
                script.decompose()

            # Extract all text from the website
            text = soup.get_text()

            # Clean up whitespace
            text = re.sub(r'\s+', ' ', text).strip()

            return text
        else:
            # print(f"Failed to retrieve content from the URL: {url}")
            return None
    except Exception as e:
        # print(f"Error during website scraping: {e}")
        return None

def fetch_search_results(question, scrape_website=False):
    """
    Fetches search results for a given question using an API.

    Args:
        question (str): The question to search for.
        scrape_website (bool, optional): Whether to scrape the website content. Defaults to False.

    Returns:
        list: A list of organic search results.
    """
    api_key = os.environ.get("SERPER_API_KEY")

    headers = {
        "X-API-KEY": api_key,
        "Content-Type": "application/json",
    }

    payload = json.dumps({"q": question})
    try:
        response = requests.post("https://google.serper.dev/search", headers=headers, data=payload)
        result = json.loads(response.text)

        # Extract the organic search results and transform them into our desired format
        results = []
        for item in result['organic']:
            # ALSO while iterating through the results, remove any websites on our source blacklist
            source = extract_website_name(item.get('link', '')).lower()
            if source in SOURCE_BLACKLIST: continue
            website_text = scrape_text_from_website(item.get('link', '')) if scrape_website else item.get('snippet', '')
            if website_text is None or website_text == '': # if we failed to scrape the website, use the snippet
                website_text = item.get('snippet', '')
            results.append({
                "title": item.get('title', ''),
                "source": source,
                "date_published": item.get('date', ''),
                "relevant_excerpt": item.get('snippet', ''),
                "text": website_text,
                "search_position": item.get('position', -1),
                "url": item.get('link', ''),
            })
        return results

    except Exception as e:
        print(f"Failed to fetch information: {e}")
        return []

# # Example usage:
# question = '''Has the National Guard been historically involved in disaster response?'''
# search_results = fetch_search_results(question)
# search_results

In [8]:
## Retrieval Augmented Generation (RAG) Retriever
from langchain.docstore.document import Document
from langchain.vectorstores import Chroma
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
import torch

import copy

# Initialize embedding model for retrieval (sentence similarity)
BATCH_SIZE = 32
device = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
retriever_model_id='sentence-transformers/all-MiniLM-L6-v2'
retriever_model = HuggingFaceEmbeddings(
    model_name=retriever_model_id,
    model_kwargs={'device': device},
    encode_kwargs={'device': device, 'batch_size': BATCH_SIZE},
)

def retrieve_relevant_documents_using_rag(search_results, content_key, question, chunk_size=512, chunk_overlap=128, top_k=10):
    """
    Takes in search results and a query question, processes and splits the documents,
    and retrieves relevant documents using a RAG approach.

    Args:
        search_results (list of dict): A list of dictionaries containing web-scraped data.
        question (str): The query question for retrieving relevant documents.
        content_key (str): The key in the dictionary containing the text content.
        chunk_size (int): The maximum size of the text chunks.
        chunk_overlap (int): The overlap between consecutive text chunks.
        top_k (int): The number of relevant documents to retrieve.

    Returns:
        list: A list of relevant document chunks.
    """
    # Create LangChain documents from search results
    documents = []
    for result in search_results:
        page_content = result.pop(content_key, None)  # Extract the text content, remaining keys are metadata
        if page_content is not None:
            documents.append(Document(page_content=page_content, metadata=result))

    # Split documents into smaller chunks (if needed, based on document size)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    split_documents = text_splitter.split_documents(documents)

    # Initialize ChromaDB vector store to index the document chunks
    try: 
        db = Chroma.from_documents(
            documents=split_documents,
            embedding=retriever_model,
        )
    except Exception as e:
        print(f"Failed to initialize ChromaDB: {e}")
        search_results = fetch_search_results(question, scrape_website=True)
        return retrieve_relevant_documents_using_rag(search_results, content_key, question, chunk_size, chunk_overlap, top_k)

    # Retrieve the most relevant chunks for the given question
    top_k = min(top_k, len(split_documents))  # Ensure we don't request more documents than available
    relevant_docs = db.similarity_search(question, k=top_k)

    return relevant_docs

# relevant_docs = retrieve_relevant_documents_using_rag(search_results, 'text', question)
# relevant_docs

In [9]:
## RAG-based Question Answering
from prompts import answer_synthesis_template

def synthesize_answer(relevant_docs, question, return_sources=True):
    """
    Synthesizes an answer to a given question using the relevant documents.

    Args:
        relevant_docs (list of dict): A list of relevant document chunks.
        question (str): The question to answer.

    Returns:
        str: The synthesized answer.
    """
    # Format the relevant documents for the prompt
    documents_text = ""
    for doc in relevant_docs:
        documents_text += f"Title: {doc.metadata.get('title', '')}\n"
        documents_text += f"URL: {doc.metadata.get('url', '')}\n"
        documents_text += f"Text: {doc.page_content.strip()}\n"
        documents_text += f"Date Published: {doc.metadata.get('date_published', '')}\n\n"

    # Fill in the prompt template with the relevant documents and the question
    prompt = answer_synthesis_template.format(documents=documents_text.strip(), question=question).strip()
    prompt = prompt.replace('\n\n\n', '\n')

    # Print the entire prompt for debugging purposes
    # print(prompt)

    # Generate the answer using the API
    output_text = get_LLM_response(prompt)
    
    # Extract the answer and sources separately from the model's output
    try:
        answer = output_text.split('Answer:')[-1].split('Sources:')[0].strip()
        sources = output_text.split('Sources:')[-1].strip()
        if return_sources: return answer, sources
        return answer
    except:
        print(f"Error parsing model output: {output_text}\nRetrying...")
        return synthesize_answer(relevant_docs, question, return_sources)
    
# # Example usage for RAG-based question answering
# answer, sources = synthesize_answer(relevant_docs, question)

In [10]:
## Claim Classification
from prompts import claim_classification_template

def classify_claim(claim, questions, answers, return_reasoning=True):
    """
    Uses a chain-of-thought approach to classify the original claim as true or false based on the answers to generated questions.

    Args:
        claim (str): The original claim.
        questions (list): List of questions related to the claim.
        answers (list): List of answers corresponding to the questions.

    Returns:
        str: The conclusion whether the claim is true or false with reasoning.
    """
    # Format the questions and answers into a single string
    questions_and_answers = ""
    for question, answer in zip(questions, answers):
        questions_and_answers += f"Question: {question}\nAnswer: {answer}\n\n"

    # Fill in the prompt template with the claim and formatted questions and answers
    prompt = claim_classification_template.format(claim=claim, questions_and_answers=questions_and_answers)

    # Print the entire prompt for debugging purposes
    # print(prompt)

    # Generate the classification using the  API
    output_text = get_LLM_response(prompt)

    # Extract the verdict and reasoning separately from the model's output
    try:
        verdict = output_text.split('Verdict:')[-1].split('Reasoning:')[0].strip()
        reasoning = output_text.split('Reasoning:')[-1].strip()
        if return_reasoning: return verdict, reasoning
        return verdict
    except:
        print(f"Error parsing model output: {output_text}\nRetrying...")
        return classify_claim(claim, questions, answers, return_reasoning)

In [11]:
## Claim Classification
from prompts import statement_classification_template

def classify_statement(statement, claims, verdicts, reasonings, return_reasoning=True):
    """
    Uses a chain-of-thought approach to classify the original statement into one of the one of the five following labels: [True, Mostly True, Half True, Mostly False, False, Unverifiable]. 

    Args:
        claim (str): The original claim.
        questions (list): List of questions related to the claim.
        answers (list): List of answers corresponding to the questions.

    Returns:
        str: The conclusion whether the claim is true or false with reasoning.
    """
    # Format the questions and answers into a single string
    claims_verdicts_reasonings = ""
    for claim, verdict, reasoning in zip(claims, verdicts, reasonings):
        claims_verdicts_reasonings += f"Claim: {claim}\n"
        # claims_verdicts_reasonings += f"Verdict: {verdict}\n"
        claims_verdicts_reasonings += f"Reasoning: {reasoning}\n\n"

    # Fill in the prompt template with the claim and formatted questions and answers
    prompt = statement_classification_template.format(statement=statement, claims_verdicts_reasonings=claims_verdicts_reasonings)

    # Print the entire prompt for debugging purposes
    # print(prompt)

    # Generate the classification using the  API
    output_text = get_LLM_response(prompt)

    # Extract the verdict and reasoning separately from the model's output
    try:
        verdict = output_text.split('Verdict:')[-1].split('Reasoning:')[0].strip()
        reasoning = output_text.split('Reasoning:')[-1].strip()
        if return_reasoning: return verdict, reasoning
        return verdict
    except:
        print(f"Error parsing model output: {output_text}\nRetrying...")
        return classify_statement(statement, claims, verdicts, reasonings, return_reasoning)

## Putting It All Together

In [12]:
def verify_statement(statement, num_examples=3):
    """
    Runs the entire fact-checking pipeline for the input claim.

    Args:
        statement (str): The input statement(s).
        num_examples (int, optional): The number of few-shot examples to include in the prompts. Defaults to 0.

    Returns:
        tuple: A tuple containing the verdicts, output dictionary, fact score, and reasoning for the claim.
    """
    # Write out the whole pipeline and be verbose about what's happening (print out the steps)
    print("Statement:", statement)

    atomic_claims = generate_atomic_claims(statement, num_examples)
    print(f"{len(atomic_claims)} Atomic Claims generated: {atomic_claims}")

    output_dict = []  # List to store all the info for each atomic claim (claim, questions, answers, verdict, reasoning)
    verdicts = []
    reasonings = []

    for i, claim in enumerate(atomic_claims, start=1):
        print(f"Processing Atomic Claim {i}/{len(atomic_claims)}:")
        print("\tClaim:", claim)

        claim_output = {}
        claim_output['claim'] = claim

        questions = generate_questions(claim, num_examples)
        print(f"\t{len(questions)} Questions generated: {questions}")

        claim_output['qa-pairs'] = {}
        claim_output['qa-pairs']['questions'] = questions
        answers = []
        sources = []
        for j, question in enumerate(questions, start=1):
            print(f"\t\tQuestion {j}/{len(questions)}:", question)

            search_results = fetch_search_results(question, scrape_website=True)
            relevant_docs = retrieve_relevant_documents_using_rag(search_results, 'relevant_excerpt', question)

            answer, source = synthesize_answer(relevant_docs, question)
            answers.append(answer)
            sources.append(source)

            print(f"\t\tAnswer {j}/{len(questions)}:", answer)
            # print(f"\t\tSources {j}:", source)

        claim_output['qa-pairs']['answers'] = answers
        claim_output['qa-pairs']['sources'] = sources

        verdict, reasoning = classify_claim(claim, questions, answers)
        verdicts.append(verdict)
        reasonings.append(reasoning)
        claim_output['verdict'] = verdict
        claim_output['reasoning'] = reasoning

        print("\tVerdict:", verdict)
        print("\tReasoning:", reasoning)

        output_dict.append(claim_output)

    print("Verdicts:", verdicts)

    fact_label, reasoning = classify_statement(statement, atomic_claims, verdicts, reasonings)
    print("Fact Label:", fact_label)
    print("Reasoning:", reasoning)
    print('\n')

    reasoning = ""
    for claim, reasoning in zip(atomic_claims, reasonings):
        reasoning += f"Claim: {claim}\nReasoning: {reasoning}\n"

    return verdicts, output_dict, fact_label, reasoning

In [13]:
import pandas as pd

base_dir = '../data/' # TODO: modify this to the correct path for you!
filename = 'cleaned_pilot.csv'
full_path = os.path.join(base_dir, filename)
df = pd.read_csv(full_path)

df['statement'] = df['statement'].astype(str)
df['statement'].head()

0    “The National Guard in the HISTORY of its life...
1    "On Jan. 6, 2021, U.S. Capitol 'protestors car...
2        "Not even one rocket (from Iran) hit Israel."
3    "326,000 migrants were flown to Florida with t...
4    "Crime is down in Venezuela by 67% because the...
Name: statement, dtype: object

In [58]:
import os
import pickle

save_filename = '../pkl_for_final_dataset/fact_check_samples_mixtral.pkl'
if os.path.exists(save_filename):
    fact_check_samples = pickle.load(open(save_filename, 'rb'))
else: 
    fact_check_samples = []

In [90]:
fact_check_samples

[(['True', 'False'],
  [{'claim': 'The National Guard is typically called in after a disaster',
    'qa-pairs': {'questions': ['Is the National Guard commonly activated following a disaster?',
      "Is the National Guard's role typically associated with disaster response?"],
     'answers': ['Yes, the National Guard is commonly activated following a disaster. They play a vital role in saving lives, stemming suffering, and protecting property during disaster relief operations. This includes responding to incidents such as winter storms, tornados, earthquakes, floods, wildland fires, oil spills, and even terrorist attacks. The President can activate the National Guard to participate in federal missions, both domestically and overseas, in addition to state active duty missions which can last from 15-60 days after a disaster. The response time of the National Guard usually depends on when the Governor activates them, but it is generally within a minimum of 24 hours.',
      "Yes, the Nati

In [89]:
statement = "The top donor to a major super PAC supporting Donald Trump for president in 2024 is also the top donor to the super PAC supporting Robert F. Kennedy Jr. for president."
verdicts, output_dict, fact_label, reasoning = verify_statement(statement, num_examples=3)
print(f"Statement: {statement}")
print(f"Verdicts: {verdicts}")
print(f"Fact Label: {fact_label}")
print(f"Reasoning: {reasoning}")

Statement: The top donor to a major super PAC supporting Donald Trump for president in 2024 is also the top donor to the super PAC supporting Robert F. Kennedy Jr. for president.
2 Atomic Claims generated: ['There exists a person who is the top donor to a major super PAC supporting Donald Trump for president in 2024', 'This same person is also the top donor to the super PAC supporting Robert F. Kennedy Jr. for president.']
Processing Atomic Claim 1/2:
	Claim: There exists a person who is the top donor to a major super PAC supporting Donald Trump for president in 2024
	3 Questions generated: ["Who is the top donor to a major super PAC supporting Donald Trump's 2024 presidential campaign?", 'How much has the top donor contributed to the super PAC?', "Is there public confirmation that the top donor's contributions support Donald Trump's 2024 presidential campaign?"]
		Question 1/3: Who is the top donor to a major super PAC supporting Donald Trump's 2024 presidential campaign?
		Answer 1/3

In [15]:
from tqdm.auto import tqdm
df_subset = df[45:50]

for index, row in tqdm(df_subset.iterrows(), total=len(df_subset)):
    statement = row['statement']
    result = verify_statement(statement, num_examples=3)
    fact_check_samples.append(result)

  0%|          | 0/5 [00:00<?, ?it/s]

Statement: "China’s eyes and ears — dangerously close, too dangerous to lead."
3 Atomic Claims generated: ['China’s actions can be seen as a threat', 'China’s influence is close and extensive', "China's leadership is considered dangerous"]
Processing Atomic Claim 1/3:
	Claim: China’s actions can be seen as a threat
	2 Questions generated: ['What actions has China taken recently that are considered a threat?', "Have China's recent actions violated international law?', 'What is the global community's response to China's recent actions?"]
		Question 1/2: What actions has China taken recently that are considered a threat?
		Answer 1/2: China has been involved in several recent activities that are considered threats. According to the FBI, the Chinese government poses a "broad and unrelenting" threat to U.S. critical infrastructure (Source: https://www.fbi.gov/news/stories/chinese-government-poses-broad-and-unrelenting-threat-to-u-s-critical-infrastructure-fbi-director-says). This threat is 

In [39]:
len(fact_check_samples)

50

In [40]:
fact_check_samples[-1]

(['False',
  'Unverifiable',
  'True',
  'False',
  'Unverifiable',
  'False',
  'Unverifiable',
  'Unverifiable'],
 [{'claim': 'Dollar Tree has had mass store closures during Joe Biden’s presidency.',
   'qa-pairs': {'questions': ['How many Dollar Tree stores have closed during Joe Biden’s presidency?',
     'What is the reason for Dollar Tree store closures since Joe Biden became president?'],
    'answers': ["Based on the provided documents, there is no specific number mentioned for Dollar Tree store closures during Joe Biden's presidency. The documents only mention that Dollar Tree, Inc., which owns Dollar Tree and Family Dollar, will close about 1000 stores, but they do not specify how many of these are Dollar Tree stores.",
     'Dollar Tree store closures since Joe Biden became president have been due to poor acquisition outcomes, specifically the acquisition of Family Dollar. The company has faced challenges such as declining sales, economic headwinds, and rampant theft, leadin

In [34]:
# fact_check_samples.pop()

('Mostly True',
 "The National Guard's primary mission is to provide assistance to federal, state, and local authorities during emergencies. They are typically called upon after a disaster has occurred to help with recovery efforts, such as providing security, rebuilding infrastructure, and delivering essential supplies. However, there are instances where the National Guard can be called in before an event to provide support, such as during natural disasters like hurricanes, where they might be used for evacuation efforts or preparing critical infrastructure for the incoming event. Nevertheless, the general statement that the National Guard is usually called in after a disaster is mostly true.\n\nSources:\n1. National Guard: https://www.nationalguard.mil/\n2. Congressional Research Service: https://crsreports.congress.gov/product/pdf/R/R45194\n3. Federal Emergency Management Agency (FEMA): https://www.fema.gov/")

In [24]:
import pickle
with open(save_filename, 'wb') as f:
    pickle.dump(fact_check_samples, f)

In [41]:
mixtral_sample = pd.DataFrame({
    "verdicts": [x[0] for x in fact_check_samples],
    "fact_score": [x[2] for x in fact_check_samples],
    "output": [x[1] for x in fact_check_samples],
    "reasonings": [x[3] for x in fact_check_samples]
})
mixtral_sample.head()

Unnamed: 0,verdicts,fact_score,output,reasonings
0,"[True, False]",Mostly False,[{'claim': 'The National Guard is typically ca...,The claim that the National Guard is not usual...
1,"[True, True, False]",False,"[{'claim': 'On January 6, 2021, individuals ga...",The claim that 'Protestors' at the U.S. Capito...
2,[False],Mostly False,[{'claim': 'No rockets from Iran reached Israe...,The claim states that no rockets from Iran rea...
3,"[False, True, Unverifiable, Unverifiable]",Mostly False,[{'claim': 'The number of migrants flown to Fl...,The claim is that the migrant flight program m...
4,"[False, False, True]",False,[{'claim': 'The rate of crime in Venezuela has...,The second answer provides evidence that the U...


In [None]:
# Save the results to an Excel file
filename = '../pkl_for_final_dataset/final-mixtral-samples.csv'
full_path = os.path.join(base_dir, filename)
mixtral_sample.to_csv(full_path)

## Baseline

In [50]:
statement_verification_template = """
You are a fact-checking, logical-reasoning assistant. Let's think step-by-step. 
Given a statement, verify whether the statement is factual by classifying it to one of the five labels given (True/Mostly True/Half True/Mostly False/False/Unverifiable). 
Explain your reasoning in a logical manner. Only answer from your own knowledge base and don't search the internet for additional information.
In your response, return only the label after "Verdict:" and return an explanation after "Reasoning:".

Reasoning Criteria:
1. Rate the statement over both plausibility and truthfulness.
2. If not enough information is provided, always err on the side of caution instead of blind guessing.

Statement: {statement}

Format your message in the exact format below.
Verdict: (True/Mostly True/Half True/Mostly False/False/Unverifiable)
Reasoning: """

def verify_statement_baseline(statement):
    print("Statement:", statement)

    # Fill in the prompt template with the relevant documents and the question
    prompt = statement_verification_template.format(statement=statement)

    output_text = get_LLM_response(prompt)

    try:
        verdict = output_text.split('Verdict:')[-1].split('Reasoning:')[0].strip()
        reasoning = output_text.split('Reasoning:')[-1].strip()

        print("Verdict:", verdict)
        print("Reasoning:", reasoning)
        
        return verdict, reasoning
    except:
        print(f"Error parsing model output: {output_text}")
        return verify_statement_baseline(statement)

In [51]:
import pandas as pd

base_dir = '../data/' # TODO: modify this to the correct path for you!
filename = 'cleaned_pilot.csv'
full_path = os.path.join(base_dir, filename)
df = pd.read_csv(full_path)

df['statement'] = df['statement'].astype(str)
df['statement'].head()

0    “The National Guard in the HISTORY of its life...
1    "On Jan. 6, 2021, U.S. Capitol 'protestors car...
2        "Not even one rocket (from Iran) hit Israel."
3    "326,000 migrants were flown to Florida with t...
4    "Crime is down in Venezuela by 67% because the...
Name: statement, dtype: object

In [52]:
import os
import pickle

save_filename = '../pkl_for_final_dataset/fact_check_samples_mixtral_baseline.pkl'
if os.path.exists(save_filename):
    fact_check_samples = pickle.load(open(save_filename, 'rb'))
else: 
    fact_check_samples = []

In [53]:
from tqdm.auto import tqdm
df_subset = df[:50]

for index, row in tqdm(df_subset.iterrows(), total=len(df_subset)):
    statement = row['statement']
    result = verify_statement_baseline(statement)
    fact_check_samples.append(result)

  0%|          | 0/50 [00:00<?, ?it/s]

Statement: “The National Guard in the HISTORY of its life, gets called in AFTER a disaster, not BEFORE something happens.”
Verdict: Mostly True
Reasoning: The National Guard is typically called in after a disaster or emergency has occurred, to help with recovery and restoration efforts. While it's possible that there could be exceptions depending on the specific circumstances, the general rule is that they are not regularly called in before a disaster to prevent it from happening. This statement is mostly true because it accurately reflects the typical role of the National Guard, but there may be rare instances where the Guard is called in before an event to provide security or other support services.
Statement: "On Jan. 6, 2021, U.S. Capitol 'protestors carried no weapons.' "
Verdict: Mostly False
Reasoning: According to news reports, while many protesters at the U.S. Capitol on Jan. 6, 2021, did not carry weapons, some indeed did. For instance, police found Molotov cocktails, a knife

In [54]:
len(fact_check_samples)

50

In [55]:
fact_check_samples[-1]

('Mostly True',
 "A majority of the companies listed have indeed closed stores during Joe Biden's presidency. However, the statement does not specify the number of store closures or whether these closures were directly linked to Biden's presidency. Dollar Tree closed 390 Family Dollar stores in 2020, Walgreens closed 200 stores in 2021, Macy's closed around 30 stores in 2020, Foot Locker closed 400 stores in 2")

In [None]:
# fact_check_samples.pop()

In [56]:
import pickle
with open(save_filename, 'wb') as f:
    pickle.dump(fact_check_samples, f)

In [None]:
# hugchat_sample = pd.DataFrame({
#     "verdicts": [x[0] for x in fact_check_samples],
#     "fact_score": [x[2] for x in fact_check_samples],
#     "output": [x[1] for x in fact_check_samples],
#     "reasonings": [x[3] for x in fact_check_samples]
# })
# hugchat_sample

mixtral_baseline_sample = pd.DataFrame({
    "fact_score": [x[0] for x in fact_check_samples],
    "reasonings": [x[1] for x in fact_check_samples]
})
mixtral_baseline_sample.head()

Unnamed: 0,fact_score,reasonings
0,Mostly True,The National Guard's primary mission is to pro...
1,Mostly False,While it's true that many of the individuals w...
2,True,"According to a Reuters fact check article, dur..."
3,Mostly False,The statement combines several claims that nee...
4,False,The claim that crime is down in Venezuela by 6...
5,Mostly False,"The first part of the statement about ""no barr..."
6,Mostly False,The claim that the average salary in the semic...
7,Mostly False,The statement is partially true but mostly fal...
8,Mostly True,To fact-check the claim that tens of thousands...
9,Mostly False,The productivity of a congress can be measured...


In [None]:
mixtral_baseline_sample['fact_score'].value_counts()

fact_score
Mostly False    19
Unverifiable    12
Mostly True      8
False            6
True             5
Name: count, dtype: int64

In [None]:
# Save the results to an Excel file
filename = '../pkl_for_final_dataset/final-mixtral-samples.xlsx'
full_path = os.path.join(base_dir, filename)
mixtral_baseline_sample.to_excel(full_path)