# Setup

In [1]:
# Controlling gpu usage for NLI model: only devices "1" and "2" are exposed
# to this process. The variable is set here, ahead of the remaining imports.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"

In [2]:
import streamlit as st
import json
import firebase_admin
from firebase_admin import credentials, firestore
import pandas as pd

In [3]:
def initialize_firebase():
    """
    Build service-account credentials from Streamlit secrets and return a Firestore client.

    The Firebase app is only initialized when ``firebase_admin`` has no app
    registered yet, so the function can be called more than once.

    Returns:
        The object returned by ``firestore.client()``.
    """
    # Alternative (call once on creation): load the key file directly, e.g.
    # cred = credentials.Certificate("YOUR/PATH/TO/firebase-adminsdk-XXXXXX-XXXXXXXXXX.json")
    firebase_secrets = st.secrets["firebase"]
    field_names = (
        "type",
        "project_id",
        "private_key_id",
        "private_key",
        "client_email",
        "client_id",
        "auth_uri",
        "token_uri",
        "auth_provider_x509_cert_url",
        "client_x509_cert_url",
    )
    service_account_info = {name: firebase_secrets[name] for name in field_names}
    # Turn literal backslash-n sequences in the key into real newlines.
    service_account_info["private_key"] = service_account_info["private_key"].replace("\\n", "\n")

    if firebase_admin._apps:
        # An app is already registered; reuse it.
        return firestore.client()

    firebase_admin.initialize_app(credentials.Certificate(service_account_info))
    return firestore.client()

# Initialize Firestore: module-level client used by the data-fetching helpers below
db = initialize_firebase()

# Data preprocessing

In [None]:
def get_all_collections():
    """
    List the IDs of the collections returned by the module-level ``db`` client.

    Returns:
        list: The ``id`` of each collection yielded by ``db.collections()``.
    """
    collection_ids = []
    for collection_ref in db.collections():
        collection_ids.append(collection_ref.id)
    return collection_ids


def get_all_documents_from_collection(collection_name):
    """
    Stream a Firestore collection and return its documents as dictionaries.

    Args:
        collection_name (str): The name of the Firestore collection.

    Returns:
        list: One dictionary per streamed document, holding the document's
            fields plus an ``'id'`` entry set to the document ID.
    """
    snapshots = db.collection(collection_name).stream()
    # The 'id' entry is written last so it overrides any field named 'id'.
    return [{**snapshot.to_dict(), 'id': snapshot.id} for snapshot in snapshots]


# Fetch all collections and documents into one flat list
collections = get_all_collections()
all_data = []

for collection in collections:
    print(f"Processing collection: {collection}")
    documents = get_all_documents_from_collection(collection)
    all_data.extend(documents)

display(f'{len(all_data)=}')
# Preview the first document only when something was fetched; indexing an
# empty list would raise IndexError and abort the cell.
if all_data:
    display(f'{all_data[0]=}')

In [None]:
# Apply some filtering criteria on your data if needed
filtered_data = []

for sample in all_data:
    # Placeholder: every sample is kept. Add a condition here to drop samples.
    filtered_data.append(sample)

# Use a context manager so the file is flushed and closed before the next
# cell reads it back.
with open("0_human_eval_mix_labelled.json", "w") as labelled_file:
    json.dump(filtered_data, labelled_file, indent=4)

In [None]:
# Combining the questions, docs and responses with ratings

# Ratings exported from firebase
with open("0_human_eval_mix_labelled.json") as labelled_file:
    labelled = json.load(labelled_file)
# json with your questions, docs and responses (you must have this prior to starting this)
with open("0_human_eval_mix_unlabelled.json") as unlabelled_file:
    unlabelled = json.load(unlabelled_file)

# Index the unlabelled samples by question text. setdefault keeps the first
# occurrence of each question, which matches a first-match linear scan.
unlabelled_by_question = {}
for unlabel_sample in unlabelled:
    unlabelled_by_question.setdefault(unlabel_sample['question'], unlabel_sample)

for label_sample in labelled:
    unlabel_sample = unlabelled_by_question.get(label_sample['question'])
    if unlabel_sample is None:
        # No unlabelled sample shares this question
        print("missing")
        continue
    label_sample['pos'] = unlabel_sample['GAns']
    label_sample['neg'] = unlabel_sample['output']
    label_sample['docs'] = unlabel_sample['docs']

In [None]:
# Getting the dataset labels from raw dataset

# Load raw data from different datasets (context managers close each file
# as soon as it has been parsed)
with open("asqa_error_instruction.json") as asqa_file:
    asqa_raw_data = json.load(asqa_file)
with open("eli5_error_instruction.json") as eli5_file:
    eli5_raw_data = json.load(eli5_file)
with open("qampari_error_instruction.json") as qampari_file:
    qampari_raw_data = json.load(qampari_file)

# Add dataset labels to each sample for identification
for sample in asqa_raw_data:
    sample["dataset"] = "ASQA"
for sample in eli5_raw_data:
    sample["dataset"] = "ELI5"
for sample in qampari_raw_data:
    sample["dataset"] = "QAMPARI"

# Combine all raw data into a single list
combined_raw_data = asqa_raw_data + eli5_raw_data + qampari_raw_data

# Bookkeeping for the matching loop that follows
missing = 0
missing_samples = []
final_data = []

# Predicate used to pair a labelled sample with a raw sample
def match_sample(label_sample, unlabel_sample):
    """Compare the ``'question'`` values of both samples and return the result."""
    labelled_question = label_sample['question']
    raw_question = unlabel_sample['question']
    return labelled_question == raw_question

for label_sample in labelled:
    for raw_sample in combined_raw_data:
        if not match_sample(label_sample, raw_sample):
            continue
        # First matching raw sample wins: copy its dataset label and stop searching.
        # label_sample['eval_metrics'] = raw_sample['eval_metrics']
        label_sample['dataset'] = raw_sample['dataset']
        final_data.append(label_sample)
        break
    else:
        # The inner loop finished without a break, so no raw sample matched.
        missing += 1
        missing_samples.append(label_sample)

print(f'{missing=}')
print(f'{len(final_data)=}')


In [None]:
# To (re)write the file instead of loading it:
# with open("human_eval_mix_labelled_complete.json", "w") as complete_file:
#     json.dump(updated_final_data, complete_file, indent=4)
# Load through a context manager so the file handle is closed after parsing.
with open("human_eval_mix_labelled_complete.json", "r") as complete_file:
    updated_final_data = json.load(complete_file)

In [None]:
from utils import *

# get NLI rating

# Run AutoAIS evaluation for "pos" and "neg" responses
updated_final_data = compute_autoais(updated_final_data, "pos")
updated_final_data = compute_autoais(updated_final_data, "neg")

# Write through a context manager so the output is flushed and closed.
with open("1_human_eval_mix_labelled_complete.json", "w") as nli_file:
    json.dump(updated_final_data, nli_file, indent=4)

In [None]:
# Reorder and clean up data
reordered_data = []

for sample in updated_final_data:
    # Add correctness values to the NLI scores: "pos" is marked correct (1)
    # and "neg" is marked wrong (0). This mutates sample['nli'] in place.
    sample['nli']['pos']['correctness'] = 1
    sample['nli']['neg']['correctness'] = 0

    # Reorder keys and structure the sample; the human ratings stored under
    # 'response' are exposed as 'human'.
    reordered_sample = {
        'question_set': sample['question_set'],
        'question': sample['question'],
        'docs': sample['docs'],
        'pos': sample['pos'],
        'neg': sample['neg'],
        'human': {
            'pos': sample['response']['pos'],
            'neg': sample['response']['neg']
        },
        'nli': sample['nli'],
        'dataset': sample['dataset'],
        'id': sample['id']
    }
    reordered_data.append(reordered_sample)

# Write through a context manager so the output is flushed and closed.
with open("2_human_eval_mix_labelled_cleaned.json", "w") as cleaned_file:
    json.dump(reordered_data, cleaned_file, indent=4)

In [None]:
# Convert the text-based human ratings into binary values
def binarize_nli(data):
    """
    Replace text labels inside each item's ``'human'`` ratings with 0/1 values.

    The rating dictionaries are updated in place. Values that are not one of
    the known labels are left unchanged, and items without a ``'human'``
    entry (or without a ``'pos'`` / ``'neg'`` entry inside it) are skipped.

    Args:
        data: Iterable of sample dictionaries.

    Returns:
        The same ``data`` object that was passed in.
    """
    # Any level of support counts as 1; "No support" / "Wrong" count as 0.
    label_to_binary = {
        "Full support": 1,
        "Partial support": 1,
        "No support": 0,
        "Correct": 1,
        "Wrong": 0
    }

    for entry in data:
        if 'human' not in entry:
            continue
        human_ratings = entry['human']
        for side in ('pos', 'neg'):
            if side not in human_ratings:
                continue
            side_ratings = human_ratings[side]
            # Only existing keys are reassigned, so the dict size is stable
            # while it is being iterated.
            for rating_name, label in side_ratings.items():
                side_ratings[rating_name] = label_to_binary.get(label, label)

    return data

# binarize_nli updates the rating dictionaries in place and returns its argument
binarized_data = binarize_nli(reordered_data)
# Write through a context manager so the output is flushed and closed.
with open("3_human_eval_mix_labelled_binarized.json", "w") as binarized_file:
    json.dump(binarized_data, binarized_file, indent=4)


# Analysis

In [None]:
def _ratio_line(label, correct, total, empty_message):
    """Return "<label>: <correct> out of <total> (<percent>)", or *empty_message* when *total* is 0."""
    if total > 0:
        return f"{label}: {correct} out of {total} ({correct / total:.2%})"
    return empty_message


def analyze_samples(data):
    """
    Print how often the human ratings agree with the NLI ratings.

    Each sample must provide ``sample['human'][t]`` and ``sample['nli'][t]``
    for ``t`` in ``'pos'`` and ``'neg'``, each containing a ``'correctness'``
    entry. Every other key found in both the human and NLI ratings is treated
    as a citation rating; keys containing ``"prec"`` or ``"recall"`` are
    additionally tallied separately.

    A positive sample counts as correct when both ratings equal 1, a negative
    sample when both equal 0.

    Args:
        data: Sized collection (``len()`` is used) of sample dictionaries.

    Returns:
        None. Results are printed. When ``data`` is empty a single
        "No samples available" line is printed instead of dividing by zero.
    """
    sample_count = len(data)
    if sample_count == 0:
        print("No samples available")
        return

    # Initialize counters
    correct_positive_samples = 0
    correct_negative_samples = 0
    correct_citation_ratings = 0
    total_citation_ratings = 0
    correct_citation_prec = 0
    total_citation_prec = 0
    correct_citation_rec = 0
    total_citation_rec = 0

    for sample in data:
        # Check if positive and negative samples are correctly rated
        if sample['human']['pos']['correctness'] == sample['nli']['pos']['correctness'] == 1:
            correct_positive_samples += 1
        if sample['human']['neg']['correctness'] == sample['nli']['neg']['correctness'] == 0:
            correct_negative_samples += 1

        # Calculate citation ratings accuracy
        for response_type in ('pos', 'neg'):
            nli_ratings = sample['nli'][response_type]
            for key, human_rating in sample['human'][response_type].items():
                # Exclude "correctness" as it is not a citation, and skip
                # ratings that have no NLI counterpart.
                if key == "correctness" or key not in nli_ratings:
                    continue
                ratings_match = human_rating == nli_ratings[key]

                # Overall citation accuracy
                total_citation_ratings += 1
                if ratings_match:
                    correct_citation_ratings += 1

                # Separate into precision and recall metrics
                if "prec" in key:
                    total_citation_prec += 1
                    if ratings_match:
                        correct_citation_prec += 1
                elif "recall" in key:
                    total_citation_rec += 1
                    if ratings_match:
                        correct_citation_rec += 1

    # Output results
    correct_samples = correct_positive_samples + correct_negative_samples
    print(_ratio_line("Correct samples", correct_samples, sample_count * 2, "No samples available"))
    print(_ratio_line("Correct positive samples", correct_positive_samples, sample_count, "No samples available"))
    print(_ratio_line("Correct negative samples", correct_negative_samples, sample_count, "No samples available"))
    # Each citation line falls back to a plain message when its total is 0, so
    # a string is never formatted as a percentage.
    print(_ratio_line("Correct citation ratings", correct_citation_ratings, total_citation_ratings,
                      "No citation ratings available"))
    print(_ratio_line("Correct citation precision ratings", correct_citation_prec, total_citation_prec,
                      "No citation precision ratings available"))
    print(_ratio_line("Correct citation recall ratings", correct_citation_rec, total_citation_rec,
                      "No citation recall ratings available"))

# Example usage with the binarized data: these samples carry the 'human' and
# 'nli' ratings the analysis reads (the entries of final_data do not).
analyze_samples(binarized_data)


In [None]:
def compute_cohen_kappa(data):
    """
    Print Cohen's Kappa between human and NLI ``'correctness'`` ratings.

    For every sample, the ``'pos'`` and ``'neg'`` ratings found at
    ``sample['human'][t]['correctness']`` and
    ``sample['nli'][t]['correctness']`` are tallied into a 2x2 contingency
    table. Rating pairs whose values are not both 0 or 1 are ignored.

    Args:
        data: Iterable of sample dictionaries.

    Returns:
        None. The kappa value is printed. It is reported as 0.0 when no
        rating pair was counted, and as 0 when the expected agreement is 1.
    """
    # Contingency table values
    agree_correct = 0  # Both human and model say "Correct"
    agree_wrong = 0    # Both human and model say "Wrong"
    human_correct_model_wrong = 0  # Human says "Correct", model says "Wrong"
    human_wrong_model_correct = 0  # Human says "Wrong", model says "Correct"

    for sample in data:
        for response_type in ('pos', 'neg'):
            human_rating = sample['human'][response_type]['correctness']
            model_rating = sample['nli'][response_type]['correctness']
            if human_rating == 1 and model_rating == 1:
                agree_correct += 1
            elif human_rating == 0 and model_rating == 0:
                agree_wrong += 1
            elif human_rating == 1 and model_rating == 0:
                human_correct_model_wrong += 1
            elif human_rating == 0 and model_rating == 1:
                human_wrong_model_correct += 1

    # Total number of ratings
    total = agree_correct + agree_wrong + human_correct_model_wrong + human_wrong_model_correct

    if total > 0:
        # Observed agreement
        observed_agreement = (agree_correct + agree_wrong) / total

        # Expected agreement by chance
        prob_human_correct = (agree_correct + human_correct_model_wrong) / total
        prob_model_correct = (agree_correct + human_wrong_model_correct) / total
        prob_human_wrong = (agree_wrong + human_wrong_model_correct) / total
        prob_model_wrong = (agree_wrong + human_correct_model_wrong) / total

        expected_agreement = (prob_human_correct * prob_model_correct) + (prob_human_wrong * prob_model_wrong)

        # Cohen's Kappa; undefined when expected agreement is 1, reported as 0
        kappa = (observed_agreement - expected_agreement) / (1 - expected_agreement) if (1 - expected_agreement) != 0 else 0
    else:
        # Nothing was counted: report 0.0 instead of dividing by zero, which
        # is the value compute_cohen_kappa_citations yields for this case.
        kappa = 0.0

    # Display results
    print("Cohen's Kappa:", kappa)

# Example usage with the binarized data: these samples carry the 'human' and
# 'nli' ratings the computation reads (the entries of final_data do not).
compute_cohen_kappa(binarized_data)


In [None]:
def compute_cohen_kappa_citations(data):
    """
    Print Cohen's Kappa between human and NLI citation judgments.

    For each sample and each of ``'pos'`` / ``'neg'``, every key of the human
    ratings other than ``"correctness"`` that also exists in the NLI ratings
    is compared. Pairs whose values are not both 0 or 1 are ignored.

    Args:
        data: Iterable of sample dictionaries with ``'human'`` and ``'nli'``
            entries.

    Returns:
        None. The kappa value is printed.
    """
    # 2x2 contingency table, named <human verdict>_<model verdict>
    both_one = 0        # human 1, model 1
    both_zero = 0       # human 0, model 0
    human_only_one = 0  # human 1, model 0
    model_only_one = 0  # human 0, model 1

    for sample in data:
        for response_type in ('pos', 'neg'):
            human_judgments = sample['human'][response_type]
            model_judgments = sample['nli'][response_type]

            for key, human_value in human_judgments.items():
                # Citation fields only, and only those the model also rated
                if key == "correctness" or key not in model_judgments:
                    continue
                model_value = model_judgments[key]

                if human_value == 1:
                    if model_value == 1:
                        both_one += 1
                    elif model_value == 0:
                        human_only_one += 1
                elif human_value == 0:
                    if model_value == 0:
                        both_zero += 1
                    elif model_value == 1:
                        model_only_one += 1

    # Total number of citation judgments
    total = both_one + both_zero + human_only_one + model_only_one

    if total > 0:
        # Observed agreement and the marginal probabilities of each verdict
        observed_agreement = (both_one + both_zero) / total
        prob_human_correct = (both_one + human_only_one) / total
        prob_model_correct = (both_one + model_only_one) / total
        prob_human_wrong = (both_zero + model_only_one) / total
        prob_model_wrong = (both_zero + human_only_one) / total
    else:
        # Nothing to compare: every quantity falls back to 0
        observed_agreement = 0
        prob_human_correct = 0
        prob_model_correct = 0
        prob_human_wrong = 0
        prob_model_wrong = 0

    # Expected agreement by chance
    expected_agreement = (prob_human_correct * prob_model_correct) + (prob_human_wrong * prob_model_wrong)

    # Cohen's Kappa; reported as 0 when the expected agreement is 1
    chance_gap = 1 - expected_agreement
    if chance_gap != 0:
        kappa = (observed_agreement - expected_agreement) / chance_gap
    else:
        kappa = 0

    # Display results
    print("Cohen's Kappa for Citation Judgments:", kappa)

# Example usage with binarized data: these samples carry the 'human' and
# 'nli' ratings the computation reads (the entries of final_data do not).
compute_cohen_kappa_citations(binarized_data)
