# Performance Evaluation

## Helper Functions

In [40]:
### import necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os
from tqdm.notebook import tqdm
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

In [2]:
### defining labels
# Verdict labels in the fixed order used for indexing.
class_labels = [
    "unverifiable",
    "false",
    "mostly false",
    "half true",
    "mostly true",
    "true",
]
# Lookup tables between a label and its position in class_labels.
index_2_class = dict(enumerate(class_labels))
class_2_index = {label: position for position, label in index_2_class.items()}
num_labels = list(range(len(class_labels)))  # [0, 1, 2, 3, 4, 5]

In [2]:
### functions to generate confusion matrix and metrics
def generate_cm(y_true, y_pred):
    '''
    Build the confusion matrix of the predictions against the gold labels.

    Thin wrapper around sklearn's confusion_matrix called with its default
    arguments; no cost matrix or label list is passed in.
    '''
    return confusion_matrix(y_true, y_pred)

def generate_metrics(y_true, y_pred):
    '''
    Compute weighted-average precision, recall and F1 (average='weighted')
    restricted to the labels in num_labels.

    Returns a DataFrame indexed by class_labels with the columns
    'Precision', 'Recall' and 'F1'.
    '''
    # Arguments shared by the three sklearn scorers.
    shared_kwargs = dict(average='weighted', sample_weight=None, labels=num_labels)
    scorers = {'Precision': precision_score, 'Recall': recall_score, 'F1': f1_score}
    scores = {
        column: scorer(y_true, y_pred, **shared_kwargs)
        for column, scorer in scorers.items()
    }
    # NOTE(review): every score is one scalar, so it is repeated on each
    # class row of the frame -- confirm per-class scores were not intended.
    return pd.DataFrame(scores, index=class_labels)

## Label Generation

In [2]:
### load the data
df = pd.read_csv('../data/[FINAL] Pilot - Pilot Claims copy.csv')
gold = df['verdict'].to_list()
statements = df['statement'].to_list()
statement_originators = df['statement_originator'].to_list()
statement_dates = df['statement_date'].to_list()

# One string per claim holding the statement, its source and its date.
# The three fields are joined by " \n " (space, newline, space):
#   Statement: {statement}
#   According to: {source}
#   Date: {date}
statements_agg = [
    f"""Statement: {text} \n According to: {originator} \n Date: {when}"""
    for text, originator, when in zip(statements, statement_originators, statement_dates)
]

In [38]:
### generate results
def generate_results(statements: list,
                     old_file: str = None,
                     new_file: str = None):
    '''
    generate and store results for a list of statements

    results is a list aligned with statements: entry i is either the tuple
    (verdict, confidence, reasoning, claims) unpacked from
    pipeline.fact_check, or the integer i when statement i could not be
    fact-checked (all attempts raised, or the verdict came back as None).

    make sure to:
    (1) load the old results if they exist
    (2) initialize the pipeline and define LM before running
    (3) run %%capture to suppress output

    statements: statements to fact-check, in order
    old_file:   optional pickle of earlier results to resume from; ignored
                when None or when the file does not exist
    new_file:   optional pickle path, rewritten after every processed
                statement; nothing is saved when None
    '''
    # load results if it exists
    # NOTE: pickle.load can execute arbitrary code -- only resume from files
    # written by this notebook.
    results = []
    if old_file is not None and os.path.exists(old_file):
        with open(old_file, 'rb') as f:
            results = pickle.load(f)

    for index, statement in enumerate(tqdm(statements)):
        has_entry = index < len(results)
        # skip statements that already have a real result; an int entry is a
        # failure marker from an earlier run and is tried again
        if has_entry and not isinstance(results[index], int):
            continue

        entry = index  # stays the failure marker unless an attempt succeeds

        # retry 5 times if there is an error
        for i in range(5):
            try:
                verdict, confidence, reasoning, claims = pipeline.fact_check(statement)
            except Exception as e:
                print(f"Error {e}: retrying for statement {index}, attempt {i+1}")
                continue
            if verdict is not None:
                entry = (verdict, confidence, reasoning, claims)
            break

        # overwrite an earlier failure in place so results stays aligned with
        # statements; otherwise this is the next new entry
        if has_entry:
            results[index] = entry
        else:
            results.append(entry)

        # checkpoint after every statement so an interrupted run can resume
        if new_file is not None:
            with open(new_file, 'wb') as f:
                pickle.dump(results, f)

In [39]:
### generate results dataframe
def generate_results_df(results_filename: str,
                        df_filename: str):
    '''
    convert results to a dataframe and save it

    results_filename: pickle holding a list whose entries are either
                      (verdict, confidence, reasoning, claims) tuples or an
                      int marking a statement that could not be fact-checked
    df_filename:      path of the csv file to write

    returns the dataframe; failed statements become rows of missing values
    so the row positions still line up with the pickled list

    raises FileNotFoundError if results_filename does not exist
    '''
    # NOTE: pickle.load can execute arbitrary code -- only read files written
    # by this notebook.
    with open(results_filename, 'rb') as f:
        results = pickle.load(f)

    columns = ['verdict', 'confidence', 'reasoning', 'claims']
    empty_row = (None,) * len(columns)
    # an int entry cannot be spread over four columns, so give it an empty row
    rows = [empty_row if isinstance(entry, int) else entry for entry in results]

    results_df = pd.DataFrame(data=rows, columns=columns)
    results_df.to_csv(df_filename)
    return results_df

### Gemini

In [33]:
%reload_ext autoreload
%autoreload 2

### load fact-checking pipeline
import dotenv
import sys
import dspy
import os
sys.path.append('../pipeline_v2/')
import main 
dotenv.load_dotenv('../.env')

# Gemini configuration: builds the global `pipeline` object whose
# fact_check method generate_results calls.

# initialize search provider
# These module-level settings on `main` are assigned before the search
# provider and pipeline objects are created.
main.NUM_SEARCH_RESULTS = 10 # Number of search results to retrieve
main.SCRAPE_TIMEOUT = 5 # Timeout for scraping a webpage (in seconds)
search_provider = main.SearchProvider(provider="duckduckgo")

# initialize DSPy
# The API key is read from the GOOGLE_GEMINI_API_KEY environment variable.
lm = dspy.LM('gemini/gemini-1.5-flash', api_key=os.getenv('GOOGLE_GEMINI_API_KEY'), cache=False)
# Local alternative, kept for reference:
#lm = dspy.LM('ollama_chat/mistral', api_base='http://localhost:11434', api_key='')
dspy.settings.configure(lm=lm)

# initialize pipeline
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
main.VERBOSE = True # Print intermediate results
main.INTERACTIVE = False # Allow the user to provide feedback
main.USE_BM25 = True # Use BM25 for retrieval (in addition to cosine similarity)
main.BM25_WEIGHT = 0.5 # Weight for BM25 in the hybrid retrieval

# NOTE(review): model_name receives the dspy.LM object itself, not a model
# name string -- confirm this is what FactCheckPipeline expects.
pipeline = main.FactCheckPipeline(
    search_provider=search_provider,
    model_name=lm,
    embedding_model=embedding_model,
    retriever_k=2
)

### Mistral

In [35]:
%reload_ext autoreload
%autoreload 2
import dotenv
import sys
import dspy
import os
sys.path.append('../pipeline_v2/')
import main 
dotenv.load_dotenv('../.env')

# Mistral configuration: rebinds the globals `search_provider`, `lm` and
# `pipeline`, so running this cell replaces any pipeline configured earlier.

# Initialize search provider
main.NUM_SEARCH_RESULTS = 10 # Number of search results to retrieve
main.SCRAPE_TIMEOUT = 5 # Timeout for scraping a webpage (in seconds)
search_provider = main.SearchProvider(provider="duckduckgo")

# Initialize DSPy
# Points at http://localhost:11434 with an empty API key
# (presumably a local Ollama server -- verify it is running first).
lm = dspy.LM('ollama_chat/mistral', api_base='http://localhost:11434', api_key='')
dspy.settings.configure(lm=lm)

# Initialize pipeline
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
main.VERBOSE = False # Print intermediate results
main.INTERACTIVE = False # Allow the user to provide feedback
main.USE_BM25 = True # Use BM25 for retrieval (in addition to cosine similarity)
main.BM25_WEIGHT = 0.5 # Weight for BM25 in the hybrid retrieval

# NOTE(review): model_name receives the dspy.LM object itself, not a model
# name string -- confirm this is what FactCheckPipeline expects.
pipeline = main.FactCheckPipeline(
    search_provider=search_provider,
    model_name=lm,
    embedding_model=embedding_model,
    retriever_k=2
)


## AVeriTeC

In [24]:
import json

# Read data_dev.json and keep its first 20 entries
with open('data_dev.json') as dev_file:
    averitec = json.load(dev_file)

averitec_20 = averitec[:20]

In [68]:
averitec_20

[{'claim': 'In a letter to Steve Jobs, Sean Connery refused to appear in an apple commercial.',
  'required_reannotation': False,
  'label': 'Refuted',
  'justification': 'The answer and sources show that the claim was published in a fake news site so the claim is refuted.',
  'claim_date': '31-10-2020',
  'speaker': None,
  'original_claim_url': None,
  'fact_checking_article': 'https://web.archive.org/web/20201130144023/https://checkyourfact.com/2020/11/03/fact-check-sean-connery-letter-steve-jobs-apple-1998/',
  'reporting_source': 'Facebook',
  'location_ISO_code': None,
  'claim_types': ['Event/Property Claim'],
  'fact_checking_strategies': ['Written Evidence'],
  'questions': [{'question': 'Where was the claim first published',
    'answers': [{'answer': 'It was first published on Sccopertino',
      'answer_type': 'Abstractive',
      'source_url': 'https://web.archive.org/web/20201129141238/https://scoopertino.com/exposed-the-imac-disaster-that-almost-was/',
      'source_medi

In [71]:
# Write the 20-entry subset to its own JSON file
with open("data_dev_1.json", "w") as subset_file:
    json.dump(averitec_20, subset_file)

In [8]:
import json
def format_prediction(questions, answers, urls, pred_label):
    '''
    Bundle question/answer/url triples and a predicted label into one dict.

    questions, answers and urls are paired up positionally with zip, so the
    evidence list is as long as the shortest of the three.

    Returns {"evidence": [{"question", "answer", "url"}, ...],
             "pred_label": pred_label}.
    '''
    evidence = []
    for question, answer, url in zip(questions, answers, urls):
        evidence.append({"question": question, "answer": answer, "url": url})
    return {"evidence": evidence, "pred_label": pred_label}

# Example usage
# Needs `claims` and `verdict` from an earlier pipeline.fact_check(...) call;
# run on its own, this cell stops with a NameError.
questions = [component.question for claim in claims for component in claim.components]  # List of questions
answers = [component.answer.text for claim in claims for component in claim.components]  # List of corresponding answers
urls = ['x'] * len(questions)  # Placeholder URL for every question
pred_label = verdict

src = format_prediction(questions, answers, urls, pred_label)

NameError: name 'claims' is not defined

In [85]:
# Fact-check every statement and collect the predictions as dicts built by
# format_prediction.
# NOTE(review): this loops over the global `statements`, not averitec_20 --
# confirm `statements` holds the AVeriTeC claims before running.
averitec_results = []
for s in statements:
    verdict, confidence, reasoning, claims = pipeline.fact_check(s)
    # Flatten the components of all claims into parallel question/answer lists
    questions = [component.question for claim in claims for component in claim.components]
    answers = [component.answer.text for claim in claims for component in claim.components]
    urls = ['x'] * len(questions)  # Placeholder URL for every question
    pred_label = verdict
    print('='*20, pred_label, sep='\n')
    averitec_results.append(format_prediction(questions, answers, urls, pred_label))

20

In [84]:
[c['pred_label'] for c in averitec_results]

['Refuted',
 'Refuted',
 'Refuted',
 'Refuted',
 'Refuted',
 'Not Enough Evidence',
 'Supported',
 'Supported',
 'Not Enough Evidence',
 'Supported',
 'Conflicting Evidence/Cherry-picking',
 'Not Enough Evidence',
 'Refuted',
 'Not Enough Evidence',
 'Not Enough Evidence',
 'Not Enough Evidence',
 'Not Enough Evidence',
 'Not Enough Evidence',
 'Not Enough Evidence',
 'Refuted']

In [86]:
# Write the collected predictions to my_data.json
with open("my_data.json", "w") as predictions_file:
    json.dump(averitec_results, predictions_file)