In [21]:
%reload_ext autoreload
%autoreload 2
from termcolor import colored
import dotenv
import sys
import dspy
import os

from tqdm.auto import tqdm

sys.path.append('../pipeline_v2/')
import main 
dotenv.load_dotenv('../.env')

from utils import print_header

import pandas as pd

def print_final_result(statement, verdict, confidence, reasoning, gold_verdict=None):
    """Print a colour-coded summary of a single fact-check result.

    Args:
        statement: The statement that was fact-checked.
        verdict: The overall verdict to display.
        confidence: The overall confidence; converted with ``str`` before colouring.
        reasoning: The overall reasoning text.
        gold_verdict: Optional reference verdict. Its line is only printed
            when the value is truthy.
    """
    print("\nFinal Fact-Check Result:")

    # (label, value, colour) for each summary line, in display order.
    summary_fields = [
        ("Statement", statement, 'white'),
        ("Overall Verdict", verdict, 'green'),
        ("Overall Confidence", str(confidence), 'yellow'),
        ("Overall Reasoning", reasoning, 'cyan'),
    ]
    if gold_verdict:
        summary_fields.append(("Gold Verdict", gold_verdict, 'green'))

    for label, value, colour in summary_fields:
        print_header(f"{label}: {colored(value, colour)}", level=1)

In [None]:
### Load data
# Resume from the cached results pickle when it exists; otherwise build the
# working frame from the raw pilot CSV.
if not os.path.exists('results_v2.pkl'):
    df = (
        pd.read_csv('../data/pilot_updated_v2.csv')
        # Drop unneeded columns
        .drop(columns=['Assignee', 'questions to verify the statement', 'Gold Label', 'factcheck_date'])
        # Reformat dates as e.g. "April 02, 2024"
        .assign(
            statement_date=lambda frame: pd.to_datetime(frame['statement_date']).dt.strftime("%B %d, %Y")
        )
    )
else:
    df = pd.read_pickle('results_v2.pkl')

df

In [17]:
# Set custom constants for whole pipeline
main.VERBOSE = True # Print intermediate results
# main.VERDICTS=["Supported", "Refuted", "Not Enough Evidence", "Conflicting Evidence/Cherry-picking"]

# Sampling temperature for the language model.
LM_TEMPERATURE = 0.3

# Initialize DSPy
# Alternative back-ends, kept commented out for quick switching:
# lm = dspy.LM('gemini/gemini-1.5-flash', api_key=os.getenv('GOOGLE_GEMINI_API_KEY'), cache=False)
# lm = dspy.LM('ollama_chat/mistral', api_base='http://localhost:11434', api_key='', cache=False)
# lm = dspy.LM('openrouter/mistralai/mistral-7b-instruct:free', api_key=os.getenv('OPENROUTER_API_KEY'), cache=False)
# lm = dspy.LM('ollama_chat/llama3.1:8b', api_base='http://localhost:11434', api_key='', cache=False)
# lm = dspy.LM('openrouter/meta-llama/llama-3.1-8b-instruct:free', api_key=os.getenv('OPENROUTER_API_KEY'), cache=False)
# lm = dspy.LM('ollama_chat/deepseek-r1:7b', api_base='http://localhost:11434', api_key='', cache=False)
# lm = dspy.LM('openrouter/deepseek/deepseek-r1:free', api_key=os.getenv('OPENROUTER_API_KEY'), cache=False)

# The temperature is set on the LM itself: dspy.LM builds its request kwargs
# from its own constructor arguments, so a `temperature` entry that lives only
# in dspy.settings would not reach the model.
# cache=False so repeated trials of the same statement are not served from cache.
lm = dspy.LM(
    'openrouter/deepseek/deepseek-r1-distill-llama-70b:free',
    api_key=os.getenv('OPENROUTER_API_KEY'),
    temperature=LM_TEMPERATURE,
    cache=False,
)
# `temperature` is kept in the global settings as well, in case other code reads it from there.
dspy.settings.configure(lm=lm, temperature=LM_TEMPERATURE)

# NOTE(review): `model_name` receives the LM object rather than a model-name
# string -- confirm this is what FactCheckPipeline expects.
pipeline = main.FactCheckPipeline(
    search_provider=main.SearchProvider(provider="duckduckgo"),
    model_name=lm,
    embedding_model=main.EMBEDDING_MODEL,
    retriever_k=5
)

In [18]:
# Test with a single statement from dataset
index = -1
row = df.iloc[index]
statement = row['statement']
statement_originator = row['statement_originator']
statement_date = row['statement_date']
gold_verdict = row['verdict']

# Originator and date are folded into the statement text itself.
# Other phrasings that were tried:
#   f"According to {statement_originator} on {statement_date}, {statement}"
#   the bare statement, with
#   context=f"Statement Originator: {statement_originator}, Date Claim Was Made: {statement_date}"
attributed_statement = f"On {statement_date}, {statement_originator} claimed: {statement}"

verdict, confidence, reasoning, claims = pipeline.fact_check(statement=attributed_statement)
print_final_result(statement, verdict, confidence, reasoning, gold_verdict)

[36m===== Starting Fact Check Pipeline =====[0m
[36m Original Statement: [97mOn January 10, 2024, Deputy White House press secretary Andrew Bates claimed: "House Republicans took numerous votes that would have damaged economic growth and harmed our national security, like attempting to eliminate over 2,000 Border Patrol agents."[0m [0m
[36m  ===== Atomic Claim Extraction =====[0m
[36m   Extracted Claims (1):  [0m
[36m     1. [97mHouse Republicans took numerous votes that would have damaged economic growth and harmed our national security, like attempting to eliminate over 2,000 Border Patrol agents.[0m [0m
[36m    ===== Question Generation [1/1] =====[0m
[36m       Decomposed Components (Questions + Search Queries) (2): [0m
[36m         1. Question: [33mWhat votes have House Republicans taken that could potentially harm economic growth?[0m [0m
[36m            Search Queries: [33m['House Republicans votes economic growth impact', 'House GOP economic growth harmfu

                                                          

[36m        ===== Web Search for Query [2/2] =====[0m
[36m         Query: [33mHouse GOP economic growth harmful votes[0m [0m
[36m         Retrieved 10 Sources: [0m
[36m           1. Club for Growth names the best, and worst, Republicans on the economy [0m
[36m           URL: https://perry.house.gov/news/documentsingle.aspx?DocumentID=402876 [0m
[36m           Excerpt: The scorecard does not include all legislation but rather what the Club for Growth considers "key bills that either promote or undermine the principles of economic freedom and limited government." Of the more than 1,000 combined floor votes between the House and Senate, the organization included 17 House votes and 14 Senate votes. [0m
[36m           2. Conservative ire threatens to jeopardize key vote on Donald Trump ... [0m
[36m           URL: https://thehill.com/homenews/house/5122385-conservative-ire-threatens-to-jeopardize-key-vote-on-trump-agenda-bill/ [0m
[36m           Excerpt: Speaker Mike Johns

                                                          

[36m      ===== Synthesizing Answer [1/2] =====[0m
[36m         Question: [33mWhat votes have House Republicans taken that could potentially harm economic growth?[0m [0m
[36m         Search Queries: [33m['House Republicans votes economic growth impact', 'House GOP economic growth harmful votes'][0m [0m




[36m         Answer: [32m{
    "text": "House Republicans have taken several votes that could potentially harm economic growth. They voted to permanently extend individual tax cuts [2], which may increase the deficit and lead to higher inflation. Additionally, they passed the Limit, Save, Grow Act aimed at reducing spending, which might cut investments in growth-stimulating areas [3]. They also passed the REIN IN Act, requiring inflation impact assessments, potentially delaying pro-growth regulations [5]. These actions, while intended to boost the economy, carry risks that could negatively impact economic growth."
    "citations": [
        {
            "snippet": "The House on Friday voted to permanently extend the individual rate cuts in the GOP's $1.5 trillion tax-cut law as part of Republicans' 'Tax reform 2.0' effort...",
            "source_url": "https://apnews.com/article/business-laws-bills-tax-reform-kevin-brady-dc7e3eb56a0a185f9c38c67e90a24076",
            "source_title"

                                                          

[36m        ===== Web Search for Query [2/2] =====[0m
[36m         Query: [33mImpact of reducing Border Patrol agents on national security[0m [0m
[36m         Retrieved 10 Sources: [0m
[36m           1. Border Crisis: CBP's Response - U.S. Customs and Border Protection [0m
[36m           URL: https://www.cbp.gov/frontline/border-crisis-cbp-s-response [0m
[36m           Excerpt: "Border security is national security — there is no difference — and the crisis on our southwest border puts our national security at risk," U.S. Border Patrol Chief Carla Provost told a House subcommittee dealing with Homeland Security, June 20. ... significantly reducing agents' presence on the border. "On a daily basis, agents ... [0m
[36m           2. CBP Releases December 2024 Monthly Update | U.S. Customs and Border ... [0m
[36m           URL: https://www.cbp.gov/newsroom/national-media-release/cbp-releases-december-2024-monthly-update [0m
[36m           Excerpt: WASHINGTON - U.S. Custom

                                                          

[36m      ===== Synthesizing Answer [2/2] =====[0m
[36m         Question: [33mDid House Republicans vote to eliminate over 2,000 Border Patrol agents, and how would this impact national security?[0m [0m
[36m         Search Queries: [33m['House Republicans Border Patrol agents elimination vote', 'Impact of reducing Border Patrol agents on national security'][0m [0m




[36m         Answer: [32mHouse Republicans did not vote to eliminate over 2,000 Border Patrol agents, but they did oppose funding for hiring additional agents [1]. This could impact national security by potentially reducing border control effectiveness and increasing risks like drug smuggling [2]. 

Reasoning: The claim that House Republicans voted to eliminate over 2,000 Border Patrol agents originated from the White House, but fact-checks indicate this is misleading. Republicans actually voted against funding increases for hiring more agents, which could affect future border security. Reducing the number of Border Patrol agents could lead to increased national security risks, such as heightened illegal activities and drug trafficking.[0m [0m
[36m         Citations:  [0m
[36m           [1] [33mFACT: Despite tough talk on border security, Republicans voted against necessary funding, opposing $7.2 billion for Border Patrol operations, including for hiring; $65 million for 300 mo

In [None]:
# Debugging aid: show the 10 most recent entries of the LM's call history.
lm.inspect_history(n=10)

In [None]:
# Run the pipeline `num_trials` times per statement and store the per-trial
# results as a list of dicts in the `<model>_results` column of `df`.
#
# NOTE(review): `model` only names the results column; the LM that actually
# runs is whatever `pipeline` was configured with. Keep the two in sync.
model = 'gemini-1.5-pro'
num_trials = 3
results_col = f'{model}_results'

# If column doesn't exist, create it. Object dtype lets each cell hold a list.
if results_col not in df.columns: df[results_col] = None
df[results_col] = df[results_col].astype(object)

# Iterate over index labels and use label-based access throughout, so the
# loop stays correct even when the frame does not have a default RangeIndex.
for index in tqdm(df.index):
    # Resume from previously stored trials. Anything that is not a list
    # (None, or NaN after a reload) counts as "no results yet".
    stored = df.at[index, results_col]
    results = list(stored) if isinstance(stored, list) else []

    # Skip rows that already have enough trials.
    if len(results) >= num_trials:
        continue

    # Row fields are identical for every trial, so read them once.
    row = df.loc[index]
    statement = row['statement']
    statement_originator = row['statement_originator']
    statement_date = row['statement_date']
    gold_verdict = row['verdict']

    # Run only the missing trials, so a partially completed row is topped up
    # to `num_trials` instead of getting a full extra set.
    for trial_i in tqdm(range(len(results), num_trials), leave=False):
        verdict, confidence, reasoning, claims = pipeline.fact_check(
            statement=statement, 
            context=f"Statement Originator: {statement_originator}, Date Claim Was Made: {statement_date}"
        )
        results.append({
            'verdict': verdict,
            'confidence': confidence,
            'reasoning': reasoning,
            'claims': claims
        })

        # Update the dataframe after every trial so an interruption
        # (rate limit, network error) keeps the work already done.
        df.at[index, results_col] = results

        print_final_result(statement, verdict, confidence, reasoning, gold_verdict)

In [11]:
# Persist the results frame; the "Load data" cell reads this same pickle to resume a run.
df.to_pickle('results_v2.pkl')

## Analysis

In [19]:
import pandas as pd

# Load the saved Gemini Pro benchmark results for analysis.
# NOTE(review): this rebinds the notebook-global `df` that the benchmark and
# save cells also use; re-running those cells after this one would operate on
# (and save) this frame instead.
df = pd.read_pickle('results_v2_gemini-pro.pkl')

# Benchmarking plan: compare the model's verdicts against the gold label (the `verdict` column).
# Each statement is fact-checked several times, so each row holds a list of predicted verdicts.
# Idea: add a boolean column saying whether the model's verdict is "in the top 3", then group by
# the gold label and report how often that holds.
# TODO: `model_verdict` and `gold_verdict_top_3` are not columns of this frame yet; define them
# before enabling the line below.

# df['model_verdict_in_top_3'] = df.apply(lambda row: row['model_verdict'] in row['gold_verdict_top_3'], axis=1)
df

Unnamed: 0,verdict,statement_originator,statement,statement_date,context,factchecker,factcheck_date,factcheck_analysis_link,gemini_results
0,FALSE,Instagram posts,“The National Guard in the HISTORY of its life...,"April 02, 2024",Social Media,Politifact,4/8/2024,https://www.politifact.com/factchecks/2024/apr...,"[{'verdict': 'MOSTLY FALSE', 'confidence': 0.8..."
1,PANTS ON FIRE,ROBERT F. Kennedy Jr.,"""On Jan. 6, 2021, U.S. Capitol 'protestors car...","April 05, 2024",Written Copy on Website,Politifact,04/05/2024,,"[{'verdict': 'FALSE', 'confidence': 1.0, 'reas..."
2,FALSE,Threads Post,"""Not even one rocket (from Iran) hit Israel.""","April 14, 2024",Social Media,Politifact,4/15/2024,https://www.politifact.com/factchecks/2024/apr...,"[{'verdict': 'FALSE', 'confidence': 0.9, 'reas..."
3,FALSE,Instagram Post,"""326,000 migrants were flown to Florida with t...","April 04, 2024",Social Media,Politifact,4/12/2024,https://www.politifact.com/factchecks/2024/apr...,"[{'verdict': 'MOSTLY FALSE', 'confidence': 0.9..."
4,FALSE,Donald Trump,"""Crime is down in Venezuela by 67% because the...","April 02, 2024",Speech,Politifact,4/10/2024,https://www.politifact.com/factchecks/2024/apr...,"[{'verdict': 'FALSE', 'confidence': 1.0, 'reas..."
...,...,...,...,...,...,...,...,...,...
78,FALSE,Nicole Shanahan,"""I will be the the youngest vice president in ...","March 26, 2024",Speech,factcheck.org,3/27/2024,https://factcheck.org/2024/03/factchecking-rfk...,
79,FALSE,Nicole Shanahan,"""Pharmaceutical medicine” was one of “three ma...","March 26, 2024",Speech,factcheck.org,3/27/2024,factcheck.org/2024/03/factchecking-rfk-jr-s-v-...,
80,FALSE,Donald Trump,"""This year, the typical family’s tax bill is t...","April 15, 2024",Truth Social,factcheck.org,4/17/2024,https://www.factcheck.org/2024/04/trumps-unfou...,
81,MOSTLY FALSE,Robert F. Kennedy Jr.,“Those policies that both of them engineered t...,"March 26, 2024",Speech,factcheck.org,3/27/2024,https://factcheck.org/2024/03/factchecking-rfk...,


In [9]:
import pandas as pd


def _extract_verdicts(results):
    """Return the predicted verdicts from one cell of per-trial results.

    Args:
        results: A list (or tuple) of per-trial dicts, each with a
            'verdict' key, or a missing value (None / NaN).

    Returns:
        A list of the verdicts in trial order, or None when the cell is
        missing or empty.
    """
    # A type check rather than plain truthiness: NaN is truthy, so `if results`
    # would let it through and then fail when iterating a float.
    if not isinstance(results, (list, tuple)) or not results:
        return None
    return [result['verdict'] for result in results]


df_pro = pd.read_pickle('results_v2_gemini-pro.pkl')
df_flash = pd.read_pickle('results_v2_gemini.pkl')

# Extract the verdicts from the gemini_results column
df_pro['pred_verdicts_gemini-pro'] = df_pro['gemini_results'].apply(_extract_verdicts)
df_flash['pred_verdicts_gemini-flash'] = df_flash['gemini_results'].apply(_extract_verdicts)

# Merge the two dataframes on the statement and gold verdict columns; other
# columns that exist in both frames get the _pro / _flash suffixes.
df_merged = pd.merge(df_pro, df_flash, on=['statement', 'verdict'], suffixes=('_pro', '_flash'))
df_merged[['statement', 'verdict', 'pred_verdicts_gemini-pro', 'pred_verdicts_gemini-flash']].head(20)

Unnamed: 0,statement,verdict,pred_verdicts_gemini-pro,pred_verdicts_gemini-flash
0,“The National Guard in the HISTORY of its life...,FALSE,"[MOSTLY FALSE, MOSTLY TRUE, MOSTLY TRUE]","[UNVERIFIABLE, UNVERIFIABLE, UNVERIFIABLE]"
1,"""On Jan. 6, 2021, U.S. Capitol 'protestors car...",PANTS ON FIRE,"[FALSE, FALSE, FALSE]","[FALSE, FALSE, FALSE]"
2,"""Not even one rocket (from Iran) hit Israel.""",FALSE,"[FALSE, FALSE, FALSE]","[FALSE, FALSE, FALSE]"
3,"""326,000 migrants were flown to Florida with t...",FALSE,"[MOSTLY FALSE, FALSE, MOSTLY TRUE]","[FALSE, FALSE, FALSE]"
4,"""Crime is down in Venezuela by 67% because the...",FALSE,"[FALSE, MOSTLY FALSE, MOSTLY FALSE]","[MOSTLY FALSE, MOSTLY FALSE, FALSE]"
5,"""In New York, there are no barriers to law enf...",MOSTLY FALSE,"[HALF TRUE, HALF TRUE, UNVERIFIABLE]",[UNVERIFIABLE]
6,"""Speaking of semiconductor industry jobs, ""Kno...",MOSTLY FALSE,"[HALF TRUE, HALF TRUE, MOSTLY TRUE]",
7,"""Starting in 2025 ""no matter what your total b...",MOSTLY TRUE,"[MOSTLY TRUE, MOSTLY TRUE, HALF TRUE]",
8,“Tens of thousands of auto jobs were lost nati...,HALF TRUE,"[MOSTLY FALSE, HALF TRUE, MOSTLY FALSE]",
9,"""The current Congress is “the least productive...",MOSTLY TRUE,"[MOSTLY TRUE, MOSTLY TRUE, UNVERIFIABLE]",
