In [2]:
import json

import pandas as pd
from scipy import stats
from src.column import Column
from src.experiments.load_experiment import Experiment, load_experiment_df
from src.models.citation_faithfulness import CitationFaithfulness
from src.models.claim_recall import ClaimRecall
from src.models.bert_score import BertScore
from src.models.manual_annotations import EvaluationBatch
from src.models.rouge_score import RougeScore
from src.models.citations_em import CitationsExactMatch

In [3]:
# spearmanr utils
def to_ranks(data: list[float]) -> list[float]:
    """Convert values to 1-based ranks, giving tied values their average rank.

    The smallest value receives rank 1. Values that compare equal share the
    mean of the rank positions they occupy, e.g. ``[1, 2, 2, 1, 4]`` becomes
    ``[1.5, 3.5, 3.5, 1.5, 5.0]``.

    Args:
        data: Values to rank (ints or floats); the input is not modified.

    Returns:
        A list of float ranks aligned with ``data``; empty for empty input.
    """
    # Count every distinct value once instead of rescanning the list for each value.
    occurrences: dict[float, int] = {}
    for value in data:
        occurrences[value] = occurrences.get(value, 0) + 1

    to_rank_map: dict[float, float] = {}
    current_rank = 1
    for value in sorted(occurrences):
        count = occurrences[value]
        # Positions current_rank .. current_rank + count - 1 average to this value.
        to_rank_map[value] = current_rank + (count - 1) / 2
        current_rank += count

    return [to_rank_map[value] for value in data]

def values_to_ranks(data: dict[str, int]) -> dict[str, int]:
    """Replace every value of ``data`` by its rank among all values.

    Ranks come from ``to_ranks`` (smallest value gets the lowest rank, ties
    share an average rank). Keys and their order are kept.
    """
    rank_per_position = to_ranks(list(data.values()))
    return dict(zip(data.keys(), rank_per_position))

# Quick visual checks of the rank helpers, including inputs with ties.
for sample in ([1, 2, 1, 1, 3], [1, 2, 3, 4, 5], [1, 1, 1, 1, 1], [1, 2, 2, 1, 4]):
    print(to_ranks(sample))

print(values_to_ranks({'a': 1, 'b': 2, 'c': 1, 'd': 1, 'e': 3}))

[2.0, 4.0, 2.0, 2.0, 5.0]
[1.0, 2.0, 3.0, 4.0, 5.0]
[3.0, 3.0, 3.0, 3.0, 3.0]
[1.5, 3.5, 3.5, 1.5, 5.0]
{'a': 2.0, 'b': 4.0, 'c': 2.0, 'd': 2.0, 'e': 5.0}


In [41]:
# Auto results retrieval methods

# Post-hoc experiment -> base experiment whose results frame is loaded instead
# by the correctness getters below (claim recall, BERTScore, ROUGE).
TO_BASE_MAP = dict(
    [
        (Experiment.POST_HOC_LLAMA_8B, Experiment.BASE_LLAMA_8B),
        (Experiment.POST_HOC_MISTRAL_7B, Experiment.BASE_MISTRAL_7B),
        (Experiment.POST_HOC_SAUL_7B, Experiment.BASE_SAUL_7B),
        (Experiment.POST_HOC_LLAMA_70B, Experiment.BASE_LLAMA_70B),
    ]
)

def get_claim_recall(e: Experiment, row_number: int):
    """Return the claim-recall score stored in row ``row_number`` for experiment ``e``.

    Experiments listed in ``TO_BASE_MAP`` are read from their mapped base
    experiment. A missing (NaN) cell is reported on stdout and counted as 0.
    """
    source_experiment = TO_BASE_MAP.get(e, e)
    frame, _ = load_experiment_df(source_experiment)
    raw_value = frame.loc[row_number][Column.CLAIM_RECALL]

    if pd.isna(raw_value):
        print(f"Claim recall not found for {e}")
        return 0

    # The cell holds a JSON-serialised ClaimRecall model.
    return ClaimRecall.model_validate_json(raw_value).claim_recall

def get_correctness_bert(e: Experiment, row_number: int):
    """Return the BERTScore F1 stored in row ``row_number`` for experiment ``e``.

    Experiments listed in ``TO_BASE_MAP`` are read from their mapped base
    experiment. A missing (NaN) cell is reported on stdout and counted as 0.
    """
    frame, _ = load_experiment_df(TO_BASE_MAP[e] if e in TO_BASE_MAP else e)
    cell = frame.loc[row_number][Column.CORRECTNESS_BERT]

    if not pd.isna(cell):
        # The cell holds a JSON-serialised BertScore model.
        return BertScore.model_validate_json(cell).f1

    print(f"Bert Score not found for {e}")
    return 0

def get_correctness_rouge(e: Experiment, row_number: int):
    """Return the ROUGE-L score stored in row ``row_number`` for experiment ``e``.

    Experiments listed in ``TO_BASE_MAP`` are read from their mapped base
    experiment. A missing (NaN) cell is reported on stdout and counted as 0.
    """
    lookup_experiment = TO_BASE_MAP.get(e, e)
    results, _ = load_experiment_df(lookup_experiment)
    serialized = results.loc[row_number][Column.CORRECTNESS_ROUGE]

    if pd.isna(serialized):
        print(f"Rouge Score not found for {e}")
        return 0

    # The cell holds a JSON-serialised RougeScore model.
    parsed = RougeScore.model_validate_json(serialized)
    return parsed.rouge_l

def get_citation_faithfulness_rec(e: Experiment, row_number: int):
    """Return the ``citation_recall`` field of the citation-faithfulness result.

    The value is read from row ``row_number`` of experiment ``e``. If the
    column is absent or the cell is NaN, a message is printed and 0 is returned.
    """
    frame, _ = load_experiment_df(e)
    record = frame.loc[row_number]
    column = Column.CITATION_FAITHFULNESS

    # Short-circuit: the cell is only inspected when the column exists.
    if column not in record or pd.isna(record[column]):
        print(f"Citation faithfulness not found for {e}")
        return 0

    return CitationFaithfulness.model_validate_json(record[column]).citation_recall

def get_citation_faithfulness_prec(e: Experiment, row_number: int):
    """Return the ``citation_precision`` field of the citation-faithfulness result.

    The value is read from row ``row_number`` of experiment ``e``. If the
    column is absent or the cell is NaN, a message is printed and 0 is returned.
    """
    results, _ = load_experiment_df(e)
    entry = results.loc[row_number]

    has_column = Column.CITATION_FAITHFULNESS in entry
    # The cell is only inspected when the column exists.
    if not has_column or pd.isna(entry[Column.CITATION_FAITHFULNESS]):
        print(f"Citation faithfulness not found for {e}")
        return 0

    parsed = CitationFaithfulness.model_validate_json(entry[Column.CITATION_FAITHFULNESS])
    return parsed.citation_precision

def get_citation_similarity(e, row_number):
    """Return the NLI citation-similarity value of row ``row_number`` for experiment ``e``.

    The cell is returned as stored (no model parsing). If the column is absent
    or the cell is NaN, a message is printed and 0 is returned.
    """
    frame, _ = load_experiment_df(e)
    record = frame.loc[row_number]
    column = Column.CITATION_SIMILARITY_NLI

    # Short-circuit: the cell is only inspected when the column exists.
    if column not in record or pd.isna(record[column]):
        print(f"Citation similarity not found for {e}")
        return 0

    return record[column]

def get_citation_similarity_em(e, row_number):
    """Return the exact-match citation F1 of row ``row_number`` for experiment ``e``.

    If the column is absent or the cell is NaN, a message is printed and 0 is
    returned.
    """
    results, _ = load_experiment_df(e)
    entry = results.loc[row_number]
    column = Column.CITATION_SIMILARITY_EM

    # Short-circuit: the cell is only inspected when the column exists.
    if column not in entry or pd.isna(entry[column]):
        print(f"Citation similarity not found for {e}")
        return 0

    # The cell holds a JSON-serialised CitationsExactMatch model.
    return CitationsExactMatch.model_validate_json(entry[column]).f1

In [56]:
# Utils
def get_average_ranks(ranks: list[dict[str, int]]):
    """Average a list of per-batch rank mappings key by key.

    Each dict becomes one row of a DataFrame (keys are columns); the result is
    the column-wise mean as a pandas Series indexed by key.
    """
    rank_table = pd.DataFrame(ranks)
    per_key_mean = rank_table.mean()
    return per_key_mean


def get_annotation_data(file_name: str) -> list[EvaluationBatch]:
    """Load the annotated evaluation batches from a JSON file.

    Args:
        file_name: Path to a JSON file containing a list of batch objects,
            each with an ``annotation`` entry.

    Returns:
        The batches validated as ``EvaluationBatch`` models. Entries whose raw
        ``annotation`` is null, or whose validated ``annotation`` is ``None``,
        are dropped.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(file_name, encoding="utf-8") as f:
        raw_batches = json.load(f)

    # The file is already closed here; validation does not need the handle.
    batches = [
        EvaluationBatch.model_validate(d)
        for d in raw_batches
        if d['annotation'] is not None
    ]
    return [batch for batch in batches if batch.annotation is not None]

# Annotation file analysed by the cells below.
# Alternative input: 'data/expert_annotations_rag.json'
data = get_annotation_data(file_name='data/expert_annotations_mistral.json')
print(len(data))

20


In [57]:
def get_expert_correctness_order(e: EvaluationBatch):
    """Map each generation's experiment to its expert correctness position.

    Generations of batch ``e`` are paired by index with the entries of
    ``e.annotation.claim_order``; each entry is converted with ``int``.
    Pairing uses ``zip``, so the batch may hold any number of generations
    rather than exactly four.

    Returns:
        dict of experiment -> integer position, in generation order.
    """
    return {
        generation.experiment: int(position)
        for generation, position in zip(e.generations, e.annotation.claim_order)
    }

# Mean expert correctness rank per experiment over all annotated batches.
correctness_avg_ranks = get_average_ranks(
    [values_to_ranks(expert_order) for expert_order in map(get_expert_correctness_order, data)]
)

def get_expert_groundedness_order(e: EvaluationBatch):
    """Map each generation's experiment to its expert groundedness position.

    Generations of batch ``e`` are paired by index with the entries of
    ``e.annotation.citation_faithfulness_order``; each entry is converted
    with ``int``. Pairing uses ``zip``, so the batch may hold any number of
    generations rather than exactly four.

    Returns:
        dict of experiment -> integer position, in generation order.
    """
    return {
        generation.experiment: int(position)
        for generation, position in zip(e.generations, e.annotation.citation_faithfulness_order)
    }

# Mean expert groundedness rank per experiment over all annotated batches.
groundedness_avg_ranks = get_average_ranks(
    [values_to_ranks(expert_order) for expert_order in map(get_expert_groundedness_order, data)]
)

def get_expert_citation_relevance_order(e: EvaluationBatch):
    """Map each generation's experiment to its expert citation-relevance position.

    Generations of batch ``e`` are paired by index with the entries of
    ``e.annotation.citation_similarity_order``; each entry is converted with
    ``int``. Pairing uses ``zip``, so the batch may hold any number of
    generations rather than exactly four.

    Returns:
        dict of experiment -> integer position, in generation order.
    """
    return {
        generation.experiment: int(position)
        for generation, position in zip(e.generations, e.annotation.citation_similarity_order)
    }

# Mean expert citation-relevance rank per experiment over all annotated batches.
citation_relevance_avg_ranks = get_average_ranks(
    [values_to_ranks(expert_order) for expert_order in map(get_expert_citation_relevance_order, data)]
)

In [58]:
def get_automatic_correctness_ranks(e: EvaluationBatch):
    """Rank the generations of batch ``e`` by automatic claim recall.

    The score of every generation is looked up with ``get_claim_recall`` at
    row ``e.question_number``. Works for any number of generations (the
    previous version indexed exactly four).

    Returns:
        dict of experiment -> rank, where rank 1 is the highest claim recall
        and ties share an average rank.
    """
    row_number = e.question_number
    # we negate numbers because values_to_ranks assumes lower is better
    negated_scores = {
        generation.experiment: -get_claim_recall(generation.experiment, row_number)
        for generation in e.generations
    }
    return values_to_ranks(negated_scores)

# Mean automatic claim-recall rank per experiment over all annotated batches.
avg_claim_recall_ranks = get_average_ranks(
    list(map(get_automatic_correctness_ranks, data))
)

def get_automatic_groundedness_prec_ranks(e: EvaluationBatch):
    """Rank the generations of batch ``e`` by ``get_citation_faithfulness_rec``.

    The score of every generation is looked up at row ``e.question_number``.
    Works for any number of generations (the previous version indexed exactly
    four).

    Returns:
        dict of experiment -> rank, where rank 1 is the highest score and
        ties share an average rank.
    """
    # Note: in code prec and rec are switched
    row_number = e.question_number
    # Scores are negated because values_to_ranks gives the lowest rank to the smallest value.
    negated_scores = {
        generation.experiment: -get_citation_faithfulness_rec(generation.experiment, row_number)
        for generation in e.generations
    }
    return values_to_ranks(negated_scores)

def get_automatic_groundedness_rec_ranks(e: EvaluationBatch):
    """Rank the generations of batch ``e`` by ``get_citation_faithfulness_prec``.

    The score of every generation is looked up at row ``e.question_number``.
    Works for any number of generations (the previous version indexed exactly
    four).

    Returns:
        dict of experiment -> rank, where rank 1 is the highest score and
        ties share an average rank.
    """
    # Note: in code prec and rec are switched
    row_number = e.question_number
    # Scores are negated because values_to_ranks gives the lowest rank to the smallest value.
    negated_scores = {
        generation.experiment: -get_citation_faithfulness_prec(generation.experiment, row_number)
        for generation in e.generations
    }
    return values_to_ranks(negated_scores)

# Mean automatic groundedness rank per experiment over all annotated batches.
avg_citation_faithfulness_ranks = get_average_ranks(
    list(map(get_automatic_groundedness_rec_ranks, data))
)

def get_automatic_citation_relevance_ranks(e: EvaluationBatch):
    """Rank the generations of batch ``e`` by NLI citation similarity.

    The score of every generation is looked up with
    ``get_citation_similarity`` at row ``e.question_number``. Works for any
    number of generations (the previous version indexed exactly four).

    Returns:
        dict of experiment -> rank, where rank 1 is the highest similarity
        and ties share an average rank.
    """
    row_number = e.question_number
    # Scores are negated because values_to_ranks gives the lowest rank to the smallest value.
    negated_scores = {
        generation.experiment: -get_citation_similarity(generation.experiment, row_number)
        for generation in e.generations
    }
    return values_to_ranks(negated_scores)

# Mean automatic citation-relevance rank per experiment over all annotated batches.
avg_automatic_citation_relevance_ranks = get_average_ranks(
    list(map(get_automatic_citation_relevance_ranks, data))
)


In [59]:
# Print my nice latex table 
# Row labels for the method comparison (all Mistral-7B based experiments).
mapping1 = {
    experiment.value: label
    for experiment, label in (
        (Experiment.RAG_GTR_k_10_MISTRAL_7B, "RAG"),
        (Experiment.LLATRIEVAL_GTR_k_10_MISTRAL_7B, "Llatrieval"),
        (Experiment.POST_HOC_MISTRAL_7B, "Post-hoc"),
        (Experiment.RARR_MISTRAL_7B, "RARR"),
    )
}

# Row labels for the model comparison (RAG experiments); not used by the loop below.
mapping2 = {
    experiment.value: label
    for experiment, label in (
        (Experiment.RAG_GTR_k_10_MISTRAL_7B, "Mistral-7B"),
        (Experiment.RAG_GTR_k_10_SAUL_7B, "SaulLM-7B"),
        (Experiment.RAG_GTR_k_10_LLAMA_8B, "Llama-3-8B"),
        (Experiment.RAG_GTR_k_10_LLAMA_70B, "Llama-3-70B"),
    )
}

# One LaTeX row per experiment: expert rank followed by automatic rank for each criterion.
for k in mapping1:
    print(f"{mapping1[k]} & {correctness_avg_ranks[k]:.2f} & {avg_claim_recall_ranks[k]:.2f} & {groundedness_avg_ranks[k]:.2f} & {avg_citation_faithfulness_ranks[k]:.2f} & {citation_relevance_avg_ranks[k]:.2f} & {avg_automatic_citation_relevance_ranks[k]:.2f} \\\\")


RAG & 2.45 & 2.10 & 2.15 & 1.70 & 2.15 & 2.67 \\
Llatrieval & 1.80 & 2.00 & 2.10 & 1.43 & 2.15 & 2.20 \\
Post-hoc & 2.90 & 2.95 & 3.30 & 3.90 & 3.35 & 3.15 \\
RARR & 2.85 & 2.95 & 2.45 & 2.98 & 2.35 & 1.98 \\


In [46]:
# lets get the correlation between expert correctness and claim recall

def sort_by_keys(d: dict[str, int]) -> dict[str, int]:
    """Return a new dict with the entries of ``d`` inserted in ascending key order."""
    ordered = {}
    for key in sorted(d):
        ordered[key] = d[key]
    return ordered

def transform_to_flat_ranks(func: callable):
    """Build one flat rank list over all batches in the global ``data``.

    For every batch, ``func(batch)`` is ranked with ``values_to_ranks`` and
    ordered by key, so lists produced with different ``func`` line up
    position by position. The per-batch values are concatenated and the
    concatenation is ranked once more with ``to_ranks``.
    """
    flat_values = []
    for batch in data:
        batch_ranks = sort_by_keys(values_to_ranks(func(batch)))
        flat_values.extend(batch_ranks.values())
    return to_ranks(flat_values)

# Spearman correlation between automatic claim-recall ranks and expert correctness ranks.
expert_correctness_ranks = transform_to_flat_ranks(func=get_expert_correctness_order)
auto_claim_recall_ranks = transform_to_flat_ranks(func=get_automatic_correctness_ranks)
cr_spearman = stats.spearmanr(auto_claim_recall_ranks, expert_correctness_ranks)
cr_corr, cr_p = cr_spearman
print(f"Claim Recall vs Expert Correctness: {cr_corr}, {cr_p}")

Claim Recall vs Expert Correctness: 0.09310996649714702, 0.24157076977032546


In [47]:
# lets get the correlation between expert correctness and bert score

def get_automatic_bert_ranks(e: EvaluationBatch):
    """Rank the generations of batch ``e`` by BERTScore F1.

    The score of every generation is looked up with ``get_correctness_bert``
    at row ``e.question_number``. Works for any number of generations (the
    previous version indexed exactly four).

    Returns:
        dict of experiment -> rank, where rank 1 is the highest score and
        ties share an average rank.
    """
    row_number = e.question_number
    # we negate numbers because values_to_ranks assumes lower is better
    negated_scores = {
        generation.experiment: -get_correctness_bert(generation.experiment, row_number)
        for generation in e.generations
    }
    return values_to_ranks(negated_scores)


# Spearman correlation between automatic BERTScore ranks and expert correctness ranks.
auto_bert_ranks = transform_to_flat_ranks(func=get_automatic_bert_ranks)

bert_spearman = stats.spearmanr(auto_bert_ranks, expert_correctness_ranks)
bert_corr, bert_p = bert_spearman
print(f"Bert vs Expert Correctness: {bert_corr}, {bert_p}")

Bert vs Expert Correctness: 0.23450358479614078, 0.0028382086911872915


In [48]:
# rouge l score vs expert correctness

def get_automatic_rouge_ranks(e: EvaluationBatch):
    """Rank the generations of batch ``e`` by ROUGE-L.

    The score of every generation is looked up with ``get_correctness_rouge``
    at row ``e.question_number``. Works for any number of generations (the
    previous version indexed exactly four).

    Returns:
        dict of experiment -> rank, where rank 1 is the highest score and
        ties share an average rank.
    """
    row_number = e.question_number
    # we negate numbers because values_to_ranks assumes lower is better
    negated_scores = {
        generation.experiment: -get_correctness_rouge(generation.experiment, row_number)
        for generation in e.generations
    }
    return values_to_ranks(negated_scores)

# Spearman correlation between automatic ROUGE-L ranks and expert correctness ranks.
auto_rouge_ranks = transform_to_flat_ranks(func=get_automatic_rouge_ranks)

rouge_spearman = stats.spearmanr(auto_rouge_ranks, expert_correctness_ranks)
rouge_corr, rouge_p = rouge_spearman
print(f"Rouge vs Expert Correctness: {rouge_corr}, {rouge_p}")

Rouge vs Expert Correctness: 0.09458406019460154, 0.23416719435925173


In [49]:
# Correlation between expert groundedness and citation faithfulness

expert_groundedness_ranks = transform_to_flat_ranks(get_expert_groundedness_order)

# NOTE(review): this cell used to call `get_automatic_groundedness_ranks`, which is
# not defined anywhere in the notebook and fails with NameError on a fresh kernel.
# The "Rec" label below and the separate "Prec" cell point to the `_rec_` variant;
# confirm that it reproduces the reported number.
auto_citation_faithfulness_ranks = transform_to_flat_ranks(get_automatic_groundedness_rec_ranks)

cf_corr, cf_p = stats.spearmanr(auto_citation_faithfulness_ranks, expert_groundedness_ranks)
print(f"Citation Faithfulness Rec vs Expert Groundedness: {cf_corr}, {cf_p}")

Citation Faithfulness Rec vs Expert Groundedness: 0.3195694922029133, 3.80330930979127e-05


In [50]:
# Same comparison as the previous cell, using the "prec" groundedness ranks.
# Note: this rebinds auto_citation_faithfulness_ranks from the previous cell.
auto_citation_faithfulness_ranks = transform_to_flat_ranks(func=get_automatic_groundedness_prec_ranks)

cfp_spearman = stats.spearmanr(auto_citation_faithfulness_ranks, expert_groundedness_ranks)
cfp_corr, cfp_p = cfp_spearman
print(f"Citation Faithfulness Prec vs Expert Groundedness: {cfp_corr}, {cfp_p}")

Citation Faithfulness Prec vs Expert Groundedness: 0.2628380116576318, 0.0007855483491362403


In [20]:
# Correlation between expert citation relevance and citation similarity

# Flat expert ranks are reused by the exact-match comparison in the next cell.
expert_citation_relevance_ranks = transform_to_flat_ranks(func=get_expert_citation_relevance_order)

auto_citation_similarity_ranks = transform_to_flat_ranks(func=get_automatic_citation_relevance_ranks)

cs_spearman = stats.spearmanr(auto_citation_similarity_ranks, expert_citation_relevance_ranks)
cs_corr, cs_p = cs_spearman
print(f"Citation Similarity vs Expert Citation Relevance: {cs_corr}, {cs_p}")

Citation Similarity vs Expert Citation Relevance: 0.3839582852951068, 5.383449629975013e-07


In [21]:
# Correlation between expert citation relevance and citation similarity em

def get_automatic_citation_relevance_em_ranks(e: EvaluationBatch):
    """Rank the generations of batch ``e`` by exact-match citation F1.

    The score of every generation is looked up with
    ``get_citation_similarity_em`` at row ``e.question_number``. Works for
    any number of generations (the previous version indexed exactly four).

    Returns:
        dict of experiment -> rank, where rank 1 is the highest score and
        ties share an average rank.
    """
    row_number = e.question_number
    # Scores are negated because values_to_ranks gives the lowest rank to the smallest value.
    negated_scores = {
        generation.experiment: -get_citation_similarity_em(generation.experiment, row_number)
        for generation in e.generations
    }
    return values_to_ranks(negated_scores)

# Spearman correlation between exact-match citation ranks and expert citation-relevance ranks.
auto_citation_similarity_em_ranks = transform_to_flat_ranks(func=get_automatic_citation_relevance_em_ranks)

cs_em_spearman = stats.spearmanr(auto_citation_similarity_em_ranks, expert_citation_relevance_ranks)
cs_em_corr, cs_em_p = cs_em_spearman
print(f"Citation Similarity EM vs Expert Citation Relevance: {cs_em_corr}, {cs_em_p}")

Citation Similarity EM vs Expert Citation Relevance: 0.10353380421399883, 0.192627199609641


In [51]:
# print as latex table

# Collect the rows first, then emit them in one call; the printed text is unchanged.
latex_rows = [
    f"ROUGE-L & {rouge_corr:.2f} & {rouge_p} \\\\",
    f"BERTScore & {bert_corr:.2f} & {bert_p} \\\\",
    f"Claim Recall & {cr_corr:.2f} & {cr_p} \\\\",
    # Note that in code Prec and Rec are swapped compared to the paper!!!
    f"Groundedness (Rec) & {cfp_corr:.2f} & {cfp_p} \\\\",
    f"Groundedness (Prec) & {cf_corr:.2f} & {cf_p} \\\\",
    f"Citation EM & {cs_em_corr:.2f} & {cs_em_p} \\\\",
    f"Citation Recall & {cs_corr:.2f} & {cs_p} \\\\",
]
print("\n".join(latex_rows))

ROUGE-L & 0.09 & 0.23416719435925173 \\
BERTScore & 0.23 & 0.0028382086911872915 \\
Claim Recall & 0.09 & 0.24157076977032546 \\
Groundedness (Rec) & 0.26 & 0.0007855483491362403 \\
Groundedness (Prec) & 0.32 & 3.80330930979127e-05 \\
Citation EM & 0.10 & 0.192627199609641 \\
Citation Recall & 0.38 & 5.383449629975013e-07 \\


In [40]:
# Better than target

def get_better_than_target(e: EvaluationBatch):
    """Tell, per experiment, whether its generation was marked better than the target.

    Generations are numbered "1", "2", ... in the order they appear in
    ``e.generations``; a generation counts as better when its number is
    contained in ``e.annotation.better_than_target``. Works for any number
    of generations (the previous version hard-coded four positions).

    Returns:
        dict of experiment -> bool, in generation order.
    """
    better_than_target = e.annotation.better_than_target
    return {
        generation.experiment: str(position) in better_than_target
        for position, generation in enumerate(e.generations, start=1)
    }

better_than_target = [get_better_than_target(d) for d in data]

# How many times do we get better than target
better_total = sum(1 for d in better_than_target for v in d.values() if v)
total = sum(len(d) for d in better_than_target)
print(f"Total & {(better_total / total * 100):.2f}\\% \\\\")

# Iterate experiments in first-seen order: iterating the set directly gives a
# row order that can change between kernel restarts (string hash randomisation).
ordered_experiments = list(dict.fromkeys(e for b in better_than_target for e in b.keys()))
experiments = set(ordered_experiments)

for e in ordered_experiments:
    better_total = sum(1 for d in better_than_target if e in d and d[e])
    total = sum(1 for d in better_than_target if e in d)
    print(f"{e} & {better_total / total * 100:.2f}\\% \\\\", better_total, total)


Total & 18.75\% \\
rag_gtr_k_10_llama_8b & 20.00\% \\ 4 20
rag_gtr_k_10_mistral_7b & 15.00\% \\ 6 40
llatrieval_gtr_k_10_mistral_7b & 25.00\% \\ 5 20
rarr_mistral_7b & 15.00\% \\ 3 20
post_hoc_mistral_7b & 15.00\% \\ 3 20
rag_gtr_k_10_saul_7b & 30.00\% \\ 6 20
rag_gtr_k_10_llama_70b & 15.00\% \\ 3 20
