## Results

Helper functions for computing the result metrics of an evaluation run and printing them as LaTeX table rows.

In [1]:
import json
import tiktoken
import spacy
import pandas as pd

from src.column import Column
from src.models import BertScore, RougeScore, ClaimRecall
from src.experiments.load_experiment import load_experiment_df, Experiment
from src.models.citation_faithfulness import CitationFaithfulness
from src.models.citation import SentenceWithCitations
from src.models.citations_em import CitationsExactMatch

# Evaluation of Answer Correctness

In [None]:
def calc_base_metrics(df: pd.DataFrame):
    """Print the answer-correctness metrics of one run as a LaTeX table row.

    Each of the three metric columns holds JSON strings that are parsed with
    the matching model class. The per-row values are averaged, scaled to a
    percentage and rounded to one decimal place.

    Args:
        df: Experiment results containing the ``Column.CORRECTNESS_ROUGE``,
            ``Column.CORRECTNESS_BERT`` and ``Column.CLAIM_RECALL`` columns.

    Returns:
        None. The row `` & <ROUGE-L> & <BERT F1> & <claim recall> \\\\`` is
        written to stdout.
    """

    def mean_as_percent(values):
        # Arithmetic mean, expressed in percent with one decimal place.
        return round(sum(values) / len(values) * 100, 1)

    # Correctness ROUGE-L
    rouge_scores = [
        RougeScore.model_validate_json(raw) for raw in df[Column.CORRECTNESS_ROUGE]
    ]
    rouge_l_pct = mean_as_percent([score.rouge_l for score in rouge_scores])

    # Correctness BERT (F1)
    bert_scores = [
        BertScore.model_validate_json(raw) for raw in df[Column.CORRECTNESS_BERT]
    ]
    bert_f1_pct = mean_as_percent([score.f1 for score in bert_scores])

    # Claim Recall (NLI)
    recall_scores = [
        ClaimRecall.model_validate_json(raw) for raw in df[Column.CLAIM_RECALL]
    ]
    claim_recall_pct = mean_as_percent([score.claim_recall for score in recall_scores])

    print(f" & {rouge_l_pct} & {bert_f1_pct} & {claim_recall_pct} \\\\")

# Load the RAG_GTR_k_10_LLAMA_70B run (the second return value is not needed
# here) and print its answer-correctness row.
df, _ = load_experiment_df(Experiment.RAG_GTR_k_10_LLAMA_70B)

calc_base_metrics(df)

# Evaluation of Citation Faithfulness and Evidence Similarity

Note: The definitions of citation faithfulness recall and precision are swapped between the code and the paper.

In [10]:

def get_citation_metrics(experiment: Experiment):
    """Print the citation metrics of one experiment as a LaTeX table row.

    Loads the experiment's results and reports, in this order: mean citation
    recall, mean citation precision, mean exact-match F1 of the citations and
    mean NLI-based citation similarity. All four are percentages rounded to
    one decimal place.

    Args:
        experiment: The experiment whose stored results should be summarised.

    Returns:
        None. The row is written to stdout.
    """

    def mean_as_percent(values):
        # Arithmetic mean, expressed in percent with one decimal place.
        return round(sum(values) / len(values) * 100, 1)

    results, _ = load_experiment_df(experiment)

    # Citation faithfulness: recall and precision come from the same parsed objects.
    faithfulness = [
        CitationFaithfulness.model_validate_json(raw)
        for raw in results[Column.CITATION_FAITHFULNESS]
    ]
    recall_pct = mean_as_percent([item.citation_recall for item in faithfulness])
    precision_pct = mean_as_percent([item.citation_precision for item in faithfulness])

    # Evidence similarity via exact match of the citations.
    exact_matches = [
        CitationsExactMatch.model_validate_json(raw)
        for raw in results[Column.CITATION_SIMILARITY_EM]
    ]
    em_f1_pct = mean_as_percent([item.f1 for item in exact_matches])

    # Evidence similarity via NLI. The dataframe stores -1 when the TA had no
    # citations; those rows are counted as a similarity of 0.
    nli_scores = [
        0 if score == -1 else score
        for score in results[Column.CITATION_SIMILARITY_NLI]
    ]
    nli_pct = mean_as_percent(nli_scores)

    print(f" & {recall_pct} & {precision_pct} & {em_f1_pct} & {nli_pct} \\\\")

# Experiments grouped by approach; each group is printed as one block of
# LaTeX rows, with an empty line between consecutive groups.
_citation_metric_groups = (
    (
        Experiment.RAG_GTR_k_10_MISTRAL_7B,
        Experiment.RAG_GTR_k_10_SAUL_7B,
        Experiment.RAG_GTR_k_10_LLAMA_8B,
        Experiment.RAG_GTR_k_10_LLAMA_70B,
    ),
    (
        Experiment.LLATRIEVAL_GTR_k_10_MISTRAL_7B,
        Experiment.LLATRIEVAL_GTR_k_10_SAUL_7B,
        Experiment.LLATRIEVAL_GTR_k_10_LLAMA_8B,
        Experiment.LLATRIEVAL_GTR_k_10_LLAMA_70B,
    ),
    (
        Experiment.POST_HOC_MISTRAL_7B,
        Experiment.POST_HOC_SAUL_7B,
        Experiment.POST_HOC_LLAMA_8B,
        Experiment.POST_HOC_LLAMA_70B,
    ),
    (
        Experiment.RARR_MISTRAL_7B,
        Experiment.RARR_SAUL_7B,
        Experiment.RARR_LLAMA_8B,
        Experiment.RARR_LLAMA_70B,
    ),
)

for _group_index, _group in enumerate(_citation_metric_groups):
    if _group_index > 0:
        print()
    for _experiment in _group:
        get_citation_metrics(_experiment)

 & 61.4 & 54.2 & 6.0 & 46.0 \\
 & 17.8 & 14.6 & 3.6 & 28.5 \\
 & 70.3 & 67.3 & 6.5 & 43.8 \\
 & 72.2 & 68.5 & 7.3 & 45.7 \\

 & 76.7 & 69.7 & 5.7 & 49.0 \\
 & 19.2 & 16.9 & 2.4 & 27.2 \\
 & 74.2 & 72.4 & 5.6 & 39.7 \\
 & 73.2 & 70.3 & 7.8 & 46.7 \\

 & 4.4 & 4.4 & 0.3 & 25.4 \\
 & 4.3 & 4.3 & 0.3 & 19.3 \\
 & 4.0 & 4.0 & 0.3 & 32.3 \\
 & 4.1 & 4.1 & 0.2 & 24.4 \\

 & 25.5 & 21.5 & 3.0 & 70.7 \\
 & 19.2 & 16.6 & 3.1 & 58.9 \\
 & 23.6 & 21.2 & 3.0 & 59.0 \\
 & 31.7 & 25.6 & 3.1 & 66.6 \\


## Some more analytic metrics

In [9]:
# spaCy pipeline that get_sentences uses to split a response into sentences.
nlp = spacy.load("en_core_web_trf")
# tiktoken encoding that get_tokens uses to count the tokens of a text.
enc = tiktoken.get_encoding("cl100k_base")


def get_sentences(response):
    """Count the sentences in ``response``.

    The text is segmented with the module-level spaCy pipeline ``nlp``;
    sentences that are empty after stripping whitespace are not counted.

    Args:
        response: The text to analyse. Anything that is not a ``str``
            (for example a missing value) counts as zero sentences.

    Returns:
        The number of non-blank sentences as an ``int``.
    """
    if isinstance(response, str):
        return sum(1 for sentence in nlp(response).sents if sentence.text.strip())
    # Non-string input: nothing to segment.
    return 0

def get_tokens(text):
    """Count the tokens of ``text`` under the module-level encoding ``enc``.

    Args:
        text: The text to encode. Anything that is not a ``str`` counts as
            zero tokens.

    Returns:
        The number of tokens as an ``int``.
    """
    return len(enc.encode(text)) if isinstance(text, str) else 0

def get_analytics(experiment: Experiment):
    """Print size statistics of an experiment's answers as a LaTeX table row.

    Reports, each rounded to one decimal place: the mean number of tokens per
    generated answer, the mean number of sentences per generated answer, and
    the mean number of distinct citations per answer.

    Args:
        experiment: The experiment whose stored results should be summarised.

    Returns:
        None. The row, prefixed with the experiment itself, is written to
        stdout.
    """
    results, _ = load_experiment_df(experiment)

    # Answer length in tokens and in sentences.
    mean_tokens = round(results[Column.GENERATED_ANSWER].apply(get_tokens).mean(), 1)
    mean_sentences = round(
        results[Column.GENERATED_ANSWER].apply(get_sentences).mean(), 1
    )

    # Each cell is a JSON list of sentences-with-citations; a citation that is
    # used by several sentences of the same answer is only counted once.
    distinct_citation_counts = []
    for raw_sentences in results[Column.GENERATED_CITATIONS]:
        parsed = [
            SentenceWithCitations.model_validate(item)
            for item in json.loads(raw_sentences)
        ]
        distinct = {
            citation for sentence in parsed for citation in sentence.citations
        }
        distinct_citation_counts.append(len(distinct))

    mean_citations = round(
        sum(distinct_citation_counts) / len(distinct_citation_counts), 1
    )

    print(f"{experiment} & {mean_tokens} & {mean_sentences} & {mean_citations} \\\\")


# Print one analytics row per experiment, in table order.
_analytics_experiments = (
    Experiment.RAG_GTR_k_10_MISTRAL_7B,
    Experiment.RAG_GTR_k_10_SAUL_7B,
    Experiment.RAG_GTR_k_10_LLAMA_8B,
    Experiment.RAG_GTR_k_10_LLAMA_70B,
    Experiment.LLATRIEVAL_GTR_k_10_MISTRAL_7B,
    Experiment.LLATRIEVAL_GTR_k_10_SAUL_7B,
    Experiment.LLATRIEVAL_GTR_k_10_LLAMA_8B,
    Experiment.LLATRIEVAL_GTR_k_10_LLAMA_70B,
    Experiment.POST_HOC_MISTRAL_7B,
    Experiment.POST_HOC_SAUL_7B,
    Experiment.POST_HOC_LLAMA_8B,
    Experiment.POST_HOC_LLAMA_70B,
    Experiment.RARR_MISTRAL_7B,
    Experiment.RARR_SAUL_7B,
    Experiment.RARR_LLAMA_8B,
    Experiment.RARR_LLAMA_70B,
)

for _experiment in _analytics_experiments:
    get_analytics(_experiment)

Experiment.RAG_GTR_k_10_MISTRAL_7B & 265.3 & 6.1 & 6.0 \\
Experiment.LLATRIEVAL_GTR_k_10_MISTRAL_7B & 251.9 & 5.7 & 4.7 \\
Experiment.LLATRIEVAL_GTR_k_10_SAUL_7B & 313.9 & 6.5 & 2.6 \\
Experiment.LLATRIEVAL_GTR_k_10_LLAMA_8B & 223.8 & 4.8 & 3.8 \\
Experiment.LLATRIEVAL_GTR_k_10_LLAMA_70B & 174.6 & 3.7 & 4.7 \\
Experiment.POST_HOC_MISTRAL_7B & 479.1 & 15.7 & 3.4 \\
Experiment.POST_HOC_SAUL_7B & 297.8 & 10.4 & 2.4 \\
Experiment.POST_HOC_LLAMA_8B & 505.5 & 20.1 & 4.3 \\
Experiment.POST_HOC_LLAMA_70B & 514.3 & 21.4 & 3.5 \\
Experiment.RARR_MISTRAL_7B & 479.1 & 15.7 & 18.0 \\
Experiment.RARR_SAUL_7B & 297.8 & 10.4 & 12.7 \\
Experiment.RARR_LLAMA_8B & 505.5 & 20.1 & 11.9 \\
Experiment.RARR_LLAMA_70B & 514.3 & 21.4 & 15.5 \\


In [5]:
# Let's check how many results have a claim recall of 0

def get_claim_recalls(e: Experiment):
    """Return the per-row claim-recall values of an experiment.

    Args:
        e: The experiment whose stored results should be read.

    Returns:
        A list with one ``claim_recall`` value per row, taken from the parsed
        JSON in the ``Column.CLAIM_RECALL`` column.
    """
    results, _ = load_experiment_df(e)
    return [
        ClaimRecall.model_validate_json(raw).claim_recall
        for raw in results[Column.CLAIM_RECALL]
    ]

# Pool the claim-recall values of all listed experiments into one flat list,
# keeping the order in which the experiments are listed.
_claim_recall_experiments = (
    Experiment.RAG_GTR_k_10_MISTRAL_7B,
    Experiment.RAG_GTR_k_10_SAUL_7B,
    Experiment.RAG_GTR_k_10_LLAMA_8B,
    Experiment.RAG_GTR_k_10_LLAMA_70B,
    Experiment.LLATRIEVAL_GTR_k_10_MISTRAL_7B,
    Experiment.LLATRIEVAL_GTR_k_10_SAUL_7B,
    Experiment.LLATRIEVAL_GTR_k_10_LLAMA_8B,
    Experiment.LLATRIEVAL_GTR_k_10_LLAMA_70B,
    Experiment.BASE_MISTRAL_7B,
    Experiment.BASE_SAUL_7B,
    Experiment.BASE_LLAMA_8B,
    Experiment.BASE_LLAMA_70B,
    Experiment.RARR_MISTRAL_7B,
    Experiment.RARR_SAUL_7B,
    Experiment.RARR_LLAMA_8B,
    Experiment.RARR_LLAMA_70B,
)

claim_recalls = [
    recall
    for _experiment in _claim_recall_experiments
    for recall in get_claim_recalls(_experiment)
]

print(claim_recalls)

[0.16666666666666666, 0.0, 0.3333333333333333, 0.14285714285714285, 0.0, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3333333333333333, 0.0, 0.0, 0.0, 0.0, 0.0, 0.25, 0.0, 0.16666666666666666, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.6666666666666666, 0.0, 0.6666666666666666, 0.0, 0.0, 0.0, 0.0, 0.0, 0.16666666666666666, 0.2222222222222222, 0.0, 0.0, 0.5, 0.0, 0.0, 0.25, 0.0, 0.0, 0.0, 0.0, 0.09090909090909091, 0.2, 0.125, 0.0, 0.0, 0.2, 0.0, 0.0, 0.0, 0.0, 0.3333333333333333, 0.0, 0.08333333333333333, 0.0, 0.0, 0.16666666666666666, 0.16666666666666666, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3333333333333333, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.25, 0.0, 0.2, 0.0, 0.0, 0.2857142857142857, 0.0, 0.0, 0.3333333333333333, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.16666666666666666, 0.125, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.25, 0.0, 0.0, 0.25, 0.0, 0.0, 0.0, 0.3333333333333333, 0.0, 0.2857142857142857, 0.0, 0.357142857

In [6]:
# Share of the pooled claim-recall values that are exactly 0, in percent.

total = len(claim_recalls)
zeroes = claim_recalls.count(0)

percentage = zeroes / total * 100

print(f"Percentage of 0s: {percentage}%")

Percentage of 0s: 73.89328395892947%
