# Performance Evaluation

## Helper Functions

In [1]:
# import necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from tqdm.notebook import tqdm
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

In [2]:
# Ordered verdict classes: a label's position in this list is its integer class id.
class_labels = ["unverifiable", "false", "mostly false", "half true", "mostly true", "true"]
# id -> label lookup, built straight from the list positions
index_2_class = dict(enumerate(class_labels))
# label -> id lookup, the inverse of the mapping above
class_2_index = {name: idx for idx, name in index_2_class.items()}
# Integer ids of all classes, in label order: [0, 1, 2, 3, 4, 5]
num_labels = list(range(len(class_labels)))

In [3]:
def generate_cm(y_true, y_pred, labels=None):
    '''
    Compute the confusion matrix of true vs. predicted classes.

    No cost matrix is applied here; this is a thin wrapper around
    sklearn's `confusion_matrix`.

    Parameters:
        y_true: ground-truth class values, one per sample.
        y_pred: predicted class values, one per sample.
        labels: optional list of class values fixing the rows/columns (and
            their order) of the matrix. With the default `None`, sklearn
            infers the classes from the values present in `y_true`/`y_pred`,
            so the matrix shape depends on the data. Pass the full class list
            to always get a fixed-size matrix.

    Returns:
        The confusion matrix as returned by `sklearn.metrics.confusion_matrix`.
    '''
    return confusion_matrix(y_true, y_pred, labels=labels)

def generate_metrics(y_true, y_pred, average='weighted'):
    '''
    Compute precision, recall and F1 over the classes in `num_labels`.

    Parameters:
        y_true: ground-truth class ids, one per sample.
        y_pred: predicted class ids, one per sample.
        average: averaging mode forwarded to the sklearn scorers.
            With the default 'weighted' (or any other scalar-producing mode)
            each metric is a single number, which pandas broadcasts to every
            row, so all rows of the returned frame are identical and are NOT
            per-class scores. Pass `average=None` to get one score per class,
            aligned with `class_labels`.

    Returns:
        A DataFrame indexed by `class_labels` with the columns
        'Precision', 'Recall' and 'F1'.
    '''
    # Same scorer configuration for all three metrics
    scorer_args = dict(average=average, labels=num_labels)
    precision = precision_score(y_true, y_pred, **scorer_args)
    recall = recall_score(y_true, y_pred, **scorer_args)
    f1 = f1_score(y_true, y_pred, **scorer_args)
    metrics = pd.DataFrame({'Precision': precision, 'Recall': recall, 'F1': f1}, index=class_labels)
    return metrics

## Label Generation

In [4]:
# Read the pilot claims sheet and collect the raw statements to fact-check
df = pd.read_csv('../data/[FINAL] Pilot - Pilot Claims copy.csv')
statements = df['statement'].tolist()

In [28]:
# Build the fact-checking pipeline used by the generation cells below.
# Statement order matters: autoreload is enabled before `main` is imported,
# the pipeline directory is put on sys.path before `import main`, and the
# .env file is loaded before the API key is read from the environment.
%reload_ext autoreload
%autoreload 2
import dotenv
import sys
import dspy
import os
# Make ../pipeline_v2/ importable so that `import main` resolves to the pipeline module
sys.path.append('../pipeline_v2/')
import main 
# Load environment variables (including the Gemini API key read below) from ../.env
dotenv.load_dotenv('../.env')

# Initialize search provider
main.NUM_SEARCH_RESULTS = 10 # Number of search results to retrieve
main.SCRAPE_TIMEOUT = 5 # Timeout for scraping a webpage (in seconds)
search_provider = main.SearchProvider(provider="duckduckgo")

# Initialize DSPy
# NOTE: os.getenv returns None when GOOGLE_GEMINI_API_KEY is not set; nothing here checks for that.
lm = dspy.LM('gemini/gemini-1.5-flash', api_key=os.getenv('GOOGLE_GEMINI_API_KEY'))
# lm = dspy.LM('ollama_chat/mistral', api_base='http://localhost:11434', api_key='')
dspy.settings.configure(lm=lm)

# Initialize pipeline
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
# The settings below are module-level globals on `main`, set before the pipeline is constructed
main.VERBOSE = False # Print intermediate results
main.INTERACTIVE = False # Allow the user to provide feedback
main.USE_BM25 = True # Use BM25 for retrieval (in addition to cosine similarity)
main.BM25_WEIGHT = 0.5 # Weight for BM25 in the hybrid retrieval

# NOTE(review): `model_name` receives the dspy.LM object, not a model-name string;
# confirm against main.FactCheckPipeline that this is what it expects.
# `retriever_k` is presumably the number of passages retrieved per query (TODO confirm).
pipeline = main.FactCheckPipeline(
    search_provider=search_provider,
    model_name=lm,
    embedding_model=embedding_model,
    retriever_k=2
)

# Example statement to fact-check
# statement = """And then there's the reality of the Trump economy, 
# where wages adjusted for inflation were rising. The wage gap between 
# rich and poor was shrinking. The savings rate for black Americans was 
# the highest in the history of our country."""

# statement = """The US economy is in a recession now in 2024."""
import time  # needed only for the retry backoff in this cell

MAX_ATTEMPTS = 5      # tries per statement before giving up
RETRY_BASE_DELAY = 1  # seconds; doubled after every failed attempt (1, 2, 4, 8)

# Fact-check every statement. `results[k]` ends up as either
#   (verdict, confidence, reasoning, claims)  on success, or
#   k (the statement's integer index)         as a placeholder when all attempts failed.
results = []
for index, statement in enumerate(tqdm(statements)):
    verdict = None
    for i in range(MAX_ATTEMPTS):
        try:
            verdict, confidence, reasoning, claims = pipeline.fact_check(statement)
        except Exception as e:
            print(f"Error {e}: retrying for statement {index}, attempt {i+1}")
            # Back off before the next try so transient provider errors
            # (e.g. HTTP 503 "model is overloaded") have time to clear.
            if i + 1 < MAX_ATTEMPTS:
                time.sleep(RETRY_BASE_DELAY * 2 ** i)
            continue
        break

    if verdict is None:
        # Every attempt failed: keep the index as a marker so it can be regenerated later
        results.append(index)
    else:
        results.append((verdict, confidence, reasoning, claims))
    # Checkpoint after every statement so an interrupted run keeps its progress
    with open('results_v2.pkl', 'wb') as f:
        pickle.dump(results, f)

In [30]:
### REGENERATE RESULTS
# Re-run the pipeline only for statements whose stored result is still an
# integer placeholder (i.e. every earlier attempt failed), writing each new
# result back into the SAME position so `results[k]` stays aligned with
# `statements[k]`.
import time  # needed only for the retry backoff in this cell

MAX_ATTEMPTS = 5      # tries per statement before giving up
RETRY_BASE_DELAY = 1  # seconds; doubled after every failed attempt (1, 2, 4, 8)

# NOTE(review): this reads 'results.pkl' but checkpoints to 'results_v2.pkl'
# (the file the generation cell writes) - confirm the input file is the intended one.
# pickle.load executes arbitrary code from the file; only load files you produced yourself.
with open('results.pkl', 'rb') as f:
    results = pickle.load(f)

if len(results) > len(statements):
    print(f"Warning: {len(results)} stored results for {len(statements)} statements; "
          "entries beyond the last statement are left untouched")

# An interrupted run leaves a short list: add placeholders for the statements
# that were never processed so they are generated below as well.
results.extend(range(len(results), len(statements)))

for index, statement in enumerate(tqdm(statements)):
    if not isinstance(results[index], int):
        continue  # already has a real result
    verdict = None
    for i in range(MAX_ATTEMPTS):
        try:
            verdict, confidence, reasoning, claims = pipeline.fact_check(statement)
        except Exception as e:
            print(f"Error {e}: retrying for statement {index}, attempt {i+1}")
            # Back off before the next try so transient provider errors
            # (e.g. HTTP 503 "model is overloaded") have time to clear.
            if i + 1 < MAX_ATTEMPTS:
                time.sleep(RETRY_BASE_DELAY * 2 ** i)
            continue
        break

    if verdict is not None:
        # Replace the placeholder in place; on failure the placeholder stays
        results[index] = (verdict, confidence, reasoning, claims)
    # Checkpoint after every regenerated statement
    with open('results_v2.pkl', 'wb') as f:
        pickle.dump(results, f)

  0%|          | 0/83 [00:00<?, ?it/s]

[36m         Query: [33mNational Guard Hurricane Katrina deployment dates[0m [0m


Processing sources: 100%|██████████| 10/10 [00:00<00:00, 25206.15it/s]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[36m         Query: [33mHurricane Katrina timeline National Guard response[0m [0m


Processing sources: 100%|██████████| 10/10 [00:00<00:00, 95325.09it/s]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[36m         Query: [33mNational Guard deployment 9/11 date[0m [0m


Processing sources: 100%|██████████| 10/10 [00:00<00:00, 117817.53it/s]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[36m         Query: [33m9/11 timeline National Guard response[0m [0m


Processing sources: 100%|██████████| 10/10 [00:00<00:00, 110086.72it/s]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[36m         Query: [33mNational Guard California wildfires 2020 deployment dates[0m [0m


Processing sources: 100%|██████████| 10/10 [00:00<00:00, 123361.88it/s]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[36m         Query: [33mCalifornia wildfires 2020 timeline National Guard response[0m [0m


Processing sources: 100%|██████████| 10/10 [00:00<00:00, 92589.49it/s]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[36m         Query: [33mJanuary 6th Capitol attack weapons seized[0m [0m


Processing sources: 100%|██████████| 10/10 [00:00<00:00, 96866.14it/s]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[36m         Query: [33mweapons found on January 6th rioters[0m [0m


Processing sources: 100%|██████████| 10/10 [00:00<00:00, 45051.60it/s]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[36m         Query: [33mimprovised weapons January 6th Capitol riot[0m [0m


Processing sources: 100%|██████████| 10/10 [00:00<00:00, 7691.74it/s]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[36m         Query: [33mJanuary 6th Capitol attack improvised explosive devices[0m [0m


Processing sources: 100%|██████████| 10/10 [00:00<00:00, 126334.46it/s]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Error litellm.InternalServerError: litellm.InternalServerError: VertexAIException - {
  "error": {
    "code": 503,
    "message": "The model is overloaded. Please try again later.",
    "status": "UNAVAILABLE"
  }
}
: retrying for statement 1, attempt 1
[36m         Query: [33mJanuary 6th Capitol attack weapons seized[0m [0m


Processing sources: 100%|██████████| 10/10 [00:00<00:00, 109798.53it/s]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[36m         Query: [33mweapons found on January 6th rioters[0m [0m


Processing sources: 100%|██████████| 10/10 [00:00<00:00, 78692.38it/s]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Error litellm.InternalServerError: litellm.InternalServerError: VertexAIException - {
  "error": {
    "code": 503,
    "message": "The model is overloaded. Please try again later.",
    "status": "UNAVAILABLE"
  }
}
: retrying for statement 1, attempt 2
[36m         Query: [33mJanuary 6th Capitol attack weapons seized[0m [0m


Processing sources: 100%|██████████| 10/10 [00:00<00:00, 96866.14it/s]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[36m         Query: [33mweapons found on January 6th rioters[0m [0m


Processing sources: 100%|██████████| 10/10 [00:00<00:00, 126334.46it/s]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Error litellm.InternalServerError: litellm.InternalServerError: VertexAIException - {
  "error": {
    "code": 503,
    "message": "The model is overloaded. Please try again later.",
    "status": "UNAVAILABLE"
  }
}
: retrying for statement 1, attempt 3
[36m         Query: [33mJanuary 6th Capitol attack weapons seized[0m [0m


Processing sources: 100%|██████████| 10/10 [00:00<00:00, 112447.83it/s]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[36m         Query: [33mweapons found on January 6th rioters[0m [0m


Processing sources: 100%|██████████| 10/10 [00:00<00:00, 105120.40it/s]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[36m         Query: [33mimprovised weapons January 6th Capitol riot[0m [0m


Processing sources: 100%|██████████| 10/10 [00:00<00:00, 104077.02it/s]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[36m         Query: [33mJanuary 6th Capitol attack improvised explosive devices[0m [0m


Processing sources: 100%|██████████| 10/10 [00:00<00:00, 124830.48it/s]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [27]:
# Number of statements whose stored result is still an integer placeholder (failed runs)
sum(1 for entry in results if type(entry) is int)

55

In [6]:
# Turn on the pipeline's intermediate-result printing for the debugging cells below
main.VERBOSE = True

In [62]:
# Scratch input: a single statement for ad-hoc pipeline runs.
# Note that another cell in this notebook also assigns `s`; whichever ran last wins.
s = 'In New York, there are no barriers to law enforcement to work with the federal government on immigration laws, and there are 100 crimes where migrants can be handed over.'

In [9]:
# Scratch input: a single statement for ad-hoc pipeline runs (overwrites any earlier `s`)
s = 'Support for Roe is higher today in America than it has ever been.'
# Create a new Gemini LM handle. This cell relies on `dspy` and `os` having been
# imported by the setup cell, and it does not call dspy.settings.configure itself.
lm = dspy.LM('gemini/gemini-1.5-flash', api_key=os.getenv('GOOGLE_GEMINI_API_KEY'))

In [18]:
# Rebuild the fact-checking pipeline for the debugging cells that follow.
# This repeats the setup from the label-generation cell. Statement order
# matters: autoreload is enabled before `main` is imported, the pipeline
# directory is put on sys.path before `import main`, and the .env file is
# loaded before the API key is read from the environment.
%reload_ext autoreload
%autoreload 2
import dotenv
import sys
import dspy
import os
# Make ../pipeline_v2/ importable so that `import main` resolves to the pipeline module
sys.path.append('../pipeline_v2/')
import main 
# Load environment variables (including the Gemini API key read below) from ../.env
dotenv.load_dotenv('../.env')

# Initialize search provider
main.NUM_SEARCH_RESULTS = 10 # Number of search results to retrieve
main.SCRAPE_TIMEOUT = 5 # Timeout for scraping a webpage (in seconds)
search_provider = main.SearchProvider(provider="duckduckgo")

# Initialize DSPy
# NOTE: os.getenv returns None when GOOGLE_GEMINI_API_KEY is not set; nothing here checks for that.
lm = dspy.LM('gemini/gemini-1.5-flash', api_key=os.getenv('GOOGLE_GEMINI_API_KEY'))
# lm = dspy.LM('ollama_chat/mistral', api_base='http://localhost:11434', api_key='')
dspy.settings.configure(lm=lm)

# Initialize pipeline
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
# This resets main.VERBOSE to False, undoing an earlier `main.VERBOSE = True` if this cell runs afterwards
main.VERBOSE = False # Print intermediate results
main.INTERACTIVE = False # Allow the user to provide feedback
main.USE_BM25 = True # Use BM25 for retrieval (in addition to cosine similarity)
main.BM25_WEIGHT = 0.5 # Weight for BM25 in the hybrid retrieval

# NOTE(review): `model_name` receives the dspy.LM object, not a model-name string;
# confirm against main.FactCheckPipeline that this is what it expects.
pipeline = main.FactCheckPipeline(
    search_provider=search_provider,
    model_name=lm,
    embedding_model=embedding_model,
    retriever_k=2
)

# Example statement to fact-check
# statement = """And then there's the reality of the Trump economy, 
# where wages adjusted for inflation were rising. The wage gap between 
# rich and poor was shrinking. The savings rate for black Americans was 
# the highest in the history of our country."""

# Print final result
# NOTE(review): only the header line is printed; no fact-check is run or shown in this cell.
print("\nFinal Fact-Check Result:")


Final Fact-Check Result:


In [21]:
# Run the pipeline once on the scratch statement `s` and unpack its 4-tuple result.
# There is no retry or error handling here, so any pipeline exception propagates
# (the recorded run below failed with "Failed to extract claim").
verdict, confidence, reasoning, claims = pipeline.fact_check(s)

Exception: Failed to extract claim: The statement "Support for Roe is higher today in America than it has ever been" is a claim about the level of public support for Roe v. Wade throughout US history.  To extract a verifiable claim, I need to specify a time period for comparison.  Since the statement implies a comparison across all of US history, it's impossible to verify without access to comprehensive historical polling data across the entire history of the United States.  Therefore, I cannot create a claim that meets the criteria of being specific, testable, and containing sufficient context for verification.  The claim is too broad.

In [75]:
# Debugging aid: dump up to the 10 most recent prompt/response exchanges recorded on `lm`
lm.inspect_history(n=10)





[34m[2025-01-29T23:27:52.255546][0m

[31mSystem message:[0m

Your input fields are:
1. `text` (str): The text to extract claims from

Your output fields are:
1. `reasoning` (str)
2. `claims` (str): JSON object containing:
    {
        "claims": [
            {
                "text": string, # Extracted claim containing required context for independent verification (e.g., "The wage gap between rich and poor was shrinking during the Trump administration in 2016-2020.")
            }
        ]
    }

All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## text ## ]]
{text}

[[ ## reasoning ## ]]
{reasoning}

[[ ## claims ## ]]
{claims}

[[ ## completed ## ]]

In adhering to this structure, your objective is: 
        Extract specific, testable factual claims from the given text.
        Requirements:
        1. Each claim must contain the required context to verify it (e.g., specific time period in years, location, entities involve

In [54]:
# Try the pipeline on a trivial statement, giving it up to five attempts;
# stop at the first attempt that completes without raising.
for i in range(5):
    try:
        verdict, confidence, reasoning, claims = pipeline.fact_check("water is not wet")
    except Exception as e:
        # Report the failure and fall through to the next attempt
        print(f"Error: {e}, attempt {i+1}")
    else:
        break

[36m===== Starting Fact Check Pipeline =====[0m
[36m Original Statement: [97mwater is not wet[0m [0m
[36m  ===== Atomic Claim Extraction =====[0m
[36m   Extracted Claims (0):  [0m
Error: list index out of range, attempt 1
[36m===== Starting Fact Check Pipeline =====[0m
[36m Original Statement: [97mwater is not wet[0m [0m
[36m  ===== Atomic Claim Extraction =====[0m
[36m   Extracted Claims (0):  [0m
Error: list index out of range, attempt 2
[36m===== Starting Fact Check Pipeline =====[0m
[36m Original Statement: [97mwater is not wet[0m [0m
[36m  ===== Atomic Claim Extraction =====[0m
[36m   Extracted Claims (0):  [0m
Error: list index out of range, attempt 3
[36m===== Starting Fact Check Pipeline =====[0m
[36m Original Statement: [97mwater is not wet[0m [0m
[36m  ===== Atomic Claim Extraction =====[0m
[36m   Extracted Claims (0):  [0m
Error: list index out of range, attempt 4
[36m===== Starting Fact Check Pipeline =====[0m
[36m Original Stateme