# Determine Statistics for Annotation Procedure

In [85]:
from pathlib import Path
import json
import pandas as pd
import numpy as np
from nltk.metrics.agreement import AnnotationTask
from nltk.tokenize import wordpunct_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/s_hegs02/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [86]:
# Maps short keys to the full label names that appear in the 'label' field of the annotations
# (the short keys are presumably annotation-tool shortcuts — TODO confirm)
labels = {'c': 'condition_unsupported', 'p': 'procedure_unsupported', 'm': 'medication_unsupported', 't': 'time_unsupported', 'l': 'location_unsupported', 'n': 'number_unsupported', 'na': 'name_unsupported', 'w': 'word_unsupported', 'o': 'other_unsupported', 'co': 'contradicted_fact', 'i': 'incorrect_fact'}

In [87]:
# Read labelings in BioC format
# Root directory of the annotation data (machine-specific absolute path)
data_path = '/home/s_hegs02/MedTator'
data_path = Path(data_path)

# Use jsonl files generated from BioC files exported with MedTator
# Maps a dataset name to its jsonl file: per experiment one file per annotator plus the agreed labeling
dataset_paths = {
    # Experiment 1: label mimic summaries
    'hallucinations_100_mimic_annotator_1': data_path / '10_label_silver_examples_annotator_1' / 'hallucinations_100_mimic_annotator_1.jsonl',
    'hallucinations_100_mimic_annotator_2': data_path / '11_label_silver_examples_annotator_2' / 'hallucinations_100_mimic_annotator_2.jsonl',
    'hallucinations_100_mimic_agreed': data_path / '12_agreed_label_silver_examples' / 'hallucinations_100_mimic_agreed.jsonl',
    # 'hallucinations_10_valid_mimic_agreed': data_path / '13_agreed_label_silver_validation_examples' / 'hallucinations_10_valid_mimic_agreed.jsonl',
    # Experiment 2: label generated summaries
    'hallucinations_100_generated_annotator_1': data_path / '20_label_halus_qualitative_annotator_1' / 'hallucinations_100_generated_annotator_1.jsonl',
    'hallucinations_100_generated_annotator_2': data_path / '21_label_halus_qualitative_annotator_2' / 'hallucinations_100_generated_annotator_2.jsonl',
    'hallucinations_100_generated_agreed': data_path / '22_label_halus_qualitative_agreed' / 'hallucinations_100_generated_agreed.jsonl',
}

# Read jsonl files
def read_jsonl(file_name):
    """Read a JSON Lines file and return its records as a list.

    Args:
        file_name: Path (str or path-like) of the jsonl file.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 so that decoding does not depend on the platform locale
    with open(file_name, "r", encoding="utf-8") as f:
        # Skip blank lines (e.g. an empty line at the end of the file) instead of failing on them
        return [json.loads(line) for line in f if line.strip()]
    
# Load every dataset into memory, keyed by its name in dataset_paths
datasets = {k: read_jsonl(v) for k, v in dataset_paths.items()}

In [88]:
# Print basic stats

def print_stats(name, dataset):
    """Print the number of examples and the total number of labels of a dataset.

    Args:
        name: Name of the dataset, used only in the printed line.
        dataset: Sequence of examples; every example is a mapping with a 'labels' sequence.
    """
    num_examples = len(dataset)
    num_labels = sum(len(example['labels']) for example in dataset)
    print(f"Dataset {name}: {num_examples} examples with {num_labels} labels")
    
# Report the size of every loaded dataset
for name, dataset in datasets.items():
    print_stats(name, dataset)

Dataset hallucinations_100_mimic_annotator_1: 100 examples with 239 labels
Dataset hallucinations_100_mimic_annotator_2: 100 examples with 282 labels
Dataset hallucinations_100_mimic_agreed: 100 examples with 286 labels
Dataset hallucinations_100_generated_annotator_1: 100 examples with 123 labels
Dataset hallucinations_100_generated_annotator_2: 100 examples with 118 labels
Dataset hallucinations_100_generated_agreed: 100 examples with 114 labels


In [89]:
# Read statistics collected during agreement
# Labeling 1:
# data = pd.read_csv(data_path / '12_agreed_label_silver_examples' / 'agreement_statistics.csv', index_col=0)
# 
# # Add column to dataframe with number of annotations of annotator 1, annotator 2 and agreement
# data['entities_annotator_1'] = [len(ex['labels']) for ex in datasets['hallucinations_100_mimic_annotator_1']]
# data['entities_annotator_2'] = [len(ex['labels']) for ex in datasets['hallucinations_100_mimic_annotator_2']]
# data['entities_agreement'] = [len(ex['labels']) for ex in datasets['hallucinations_100_mimic_agreed']]

# Labeling 2:
# Agreement statistics with one row per summary; the index holds '<doc_id>-<model_id>' strings
data_permuted = pd.read_csv(data_path / '22_label_halus_qualitative_agreed' / 'agreement_statistics.csv', index_col=0)
# Experiment two was ordered into 20 examples per model, the agreement statistics contain all 5 models per summary consecutively
# So re-order agreement statistics to match the order of the data
# Split the index into its two numeric parts so that rows can be sorted and looked up by them
data_permuted['doc_id'] = data_permuted.index.str.split('-').str[0].astype(int)
data_permuted['model_id'] = data_permuted.index.str.split('-').str[1].astype(int)
# Group all rows of one model together, ordered by document within each model
data_permuted = data_permuted.sort_values(['model_id', 'doc_id'])
# Now also undo permutation
# 20: [4, 0, 2, 3, 1]
hallucination_random_models = {0: [0, 4, 3, 1, 2], 1: [4, 1, 3, 2, 0], 2: [0, 2, 1, 3, 4], 3: [1, 0, 3, 4, 2], 4: [3, 2, 4, 1, 0], 5: [3, 2, 1, 4, 0], 6: [1, 2, 3, 0, 4], 7: [1, 3, 2, 4, 0], 8: [4, 0, 1, 3, 2], 9: [0, 3, 2, 4, 1], 10: [0, 4, 3, 2, 1], 11: [1, 0, 4, 3, 2], 12: [2, 4, 1, 0, 3], 13: [3, 1, 0, 4, 2], 14: [4, 2, 0, 1, 3], 15: [0, 2, 4, 3, 1], 16: [1, 4, 2, 3, 0], 17: [2, 3, 1, 0, 4], 18: [4, 0, 3, 2, 1], 19: [0, 3, 1, 2, 4], 20: [4, 0, 2, 3, 1], 21: [0, 4, 2, 1, 3], 22: [0, 2, 4, 3, 1], 23: [1, 0, 3, 4, 2], 24: [3, 1, 0, 4, 2], 25: [2, 0, 3, 4, 1], 26: [4, 3, 0, 1, 2], 27: [3, 4, 2, 1, 0], 28: [4, 2, 3, 1, 0], 29: [4, 1, 3, 0, 2], 30: [2, 3, 0, 1, 4], 31: [4, 2, 0, 3, 1], 32: [3, 0, 2, 1, 4], 33: [2, 3, 4, 1, 0], 34: [4, 1, 3, 2, 0], 35: [0, 4, 1, 3, 2], 36: [4, 1, 3, 0, 2], 37: [3, 1, 0, 4, 2], 38: [3, 2, 4, 1, 0], 39: [1, 0, 3, 4, 2], 40: [4, 3, 0, 1, 2], 41: [2, 3, 4, 0, 1], 42: [2, 4, 3, 1, 0], 43: [4, 1, 2, 0, 3], 44: [0, 4, 3, 1, 2], 45: [3, 2, 0, 1, 4], 46: [2, 4, 0, 3, 1], 47: [2, 1, 0, 4, 3], 48: [4, 2, 3, 1, 0], 49: [3, 1, 4, 2, 0]}
# Work on a copy; data_permuted stays untouched and serves as the lookup source below
data = data_permuted.copy()
# Agreement columns whose values have to be moved to the de-permuted model position
replaced_columns = ['same_entities_same_labels', 'same_entities_different_labels', 'entities_only_annotator_1', 'entities_only_annotator_2']
# Blank these columns before they are re-filled row by row
# NOTE(review): this builds a one-row DataFrame against the full index and so relies on pandas broadcasting it — confirm this works on the pandas version in use
data[replaced_columns] = pd.DataFrame([[None, None, None, None]], index=data.index)
# Iterate over all rows and copy values from data_permuted to data
for i, row in data.iterrows():
    # Set all four columns in data
    doc_id = row['doc_id']
    # Index of this row's model within the permutation list of its document
    perm_model_id = hallucination_random_models[doc_id].index(row['model_id'])
    # Look up the source row in data_permuted once per target row instead of once per column
    source_row = data_permuted.loc[(data_permuted['doc_id'] == doc_id) & (data_permuted['model_id'] == perm_model_id)]
    for col in replaced_columns:
        data.at[i, col] = source_row[col].values[0]
    # Keep the permuted id for reference
    data.at[i, 'perm_model_id'] = perm_model_id
    
# Add column to dataframe with number of annotations of annotator 1, annotator 2 and agreement
# NOTE(review): assigned positionally, so this assumes the examples in each jsonl file are stored in the
# same order as the rows of data (sorted by model_id, then doc_id) — TODO confirm
data['entities_annotator_1'] = [len(ex['labels']) for ex in datasets['hallucinations_100_generated_annotator_1']]
data['entities_annotator_2'] = [len(ex['labels']) for ex in datasets['hallucinations_100_generated_annotator_2']]
data['entities_agreement'] = [len(ex['labels']) for ex in datasets['hallucinations_100_generated_agreed']]



# Add counts for the number of annotations removed per annotator during agreement:
# all annotations of an annotator minus those that are part of the agreed labeling
# (computed column-wise instead of cell by cell in a row loop)
shared_entities = data['same_entities_same_labels'] + data['same_entities_different_labels']
data['entities_removed_annotator_1'] = data['entities_annotator_1'] - shared_entities - data['entities_only_annotator_1']
data['entities_removed_annotator_2'] = data['entities_annotator_2'] - shared_entities - data['entities_only_annotator_2']

# Convert all columns of the dataframe to int
data = data.astype(int)
    
# Sanity checks on the bookkeeping of every row:
#  - the removed counts are non-negative
#  - the annotations of each annotator are fully accounted for by removed + shared + only-this-annotator
#  - the agreed labeling consists of exactly the four agreement categories
for i, row in data.iterrows():
    # Check counts
    assert data.at[i, 'entities_removed_annotator_1'] >= 0,f"wrong count annotator 1 {i} {row}"
    assert data.at[i, 'entities_removed_annotator_2'] >= 0,f"wrong count annotator 2 {i} {row}"
    assert data.at[i, 'entities_annotator_1'] == data.at[i, 'entities_removed_annotator_1'] + data.at[i, 'same_entities_same_labels'] + data.at[i, 'same_entities_different_labels'] + data.at[i, 'entities_only_annotator_1'],f"wrong count annotator 1 {i} {row}"
    assert data.at[i, 'entities_annotator_2'] == data.at[i, 'entities_removed_annotator_2'] + data.at[i, 'same_entities_same_labels'] + data.at[i, 'same_entities_different_labels'] + data.at[i, 'entities_only_annotator_2'],f"wrong count annotator 2 {i} {row}"
    assert data.at[i, 'entities_agreement'] == data.at[i, 'same_entities_same_labels'] + data.at[i, 'same_entities_different_labels'] + data.at[i, 'entities_only_annotator_1'] + data.at[i, 'entities_only_annotator_2'],f"wrong count agreement {i} {row}"

print(data.describe())

       same_entities_same_labels  same_entities_different_labels  \
count                 100.000000                        100.0000   
mean                    0.430000                          0.2400   
std                     0.781801                          0.6215   
min                     0.000000                          0.0000   
25%                     0.000000                          0.0000   
50%                     0.000000                          0.0000   
75%                     1.000000                          0.0000   
max                     3.000000                          3.0000   

       entities_only_annotator_1  entities_only_annotator_2      doc_id  \
count                 100.000000                 100.000000  100.000000   
mean                    0.130000                   0.340000   38.550000   
std                     0.366667                   0.589813    7.201817   
min                     0.000000                   0.000000   20.000000   
25%         

In [90]:
# Format statistics as latex table
# Row caption -> column of data summarized in that row; the order defines the row order of the table
stat_columns = {
    'Annotations annotator 1': 'entities_annotator_1',
    'Removed in agreement 1': 'entities_removed_annotator_1',
    'Annotations annotator 2': 'entities_annotator_2',
    'Removed in agreement 2': 'entities_removed_annotator_2',
    'Annotations agreement': 'entities_agreement',
    'Both annotators, same label': 'same_entities_same_labels',
    'Both annotators, different label': 'same_entities_different_labels',
    'Only annotator 1': 'entities_only_annotator_1',
    'Only annotator 2': 'entities_only_annotator_2',
}
# Every cell is rendered as "mean (std)" with two decimals
statistics_dict = {
    caption: f"{data[column].mean():.2f} ({data[column].std():.2f})"
    for caption, column in stat_columns.items()
}

# Dict to dataframe
statistics_df = pd.DataFrame.from_dict(statistics_dict, orient='index', columns=['Mean (std)'])
# Print as latex, convert second column to centered
print(statistics_df.to_latex(column_format='lc', escape=False))

    

\begin{tabular}{lc}
\toprule
 & Mean (std) \\
\midrule
Annotations annotator 1 & 1.23 (1.52) \\
Removed in agreement 1 & 0.43 (0.67) \\
Annotations annotator 2 & 1.18 (1.58) \\
Removed in agreement 2 & 0.17 (0.40) \\
Annotations agreement & 1.14 (1.70) \\
Both annotators, same label & 0.43 (0.78) \\
Both annotators, different label & 0.24 (0.62) \\
Only annotator 1 & 0.13 (0.37) \\
Only annotator 2 & 0.34 (0.59) \\
\bottomrule
\end{tabular}



In [91]:
# Get table for error types
def get_label_count(dataset):
    """Count how often every label name occurs in a dataset.

    Args:
        dataset: Iterable of examples; every example has a 'labels' sequence
            whose entries carry their label name under the key 'label'.

    Returns:
        A dict with one entry per value of the module-level ``labels`` mapping
        (in that order), holding the number of occurrences (0 if unused).

    Raises:
        KeyError: If an annotation uses a label name that is not a value of ``labels``.
    """
    # Start every known label at zero so that unused labels still show up in the result
    counts = dict.fromkeys(labels.values(), 0)
    for example in dataset:
        for annotation in example["labels"]:
            counts[annotation["label"]] += 1
    return counts

# Labeling 1
# label_counts_1 = get_label_count(datasets['hallucinations_100_mimic_annotator_1'])
# label_counts_2 = get_label_count(datasets['hallucinations_100_mimic_annotator_2'])
# label_counts_agreement = get_label_count(datasets['hallucinations_100_mimic_agreed'])

# Labeling 2
label_counts_1 = get_label_count(datasets['hallucinations_100_generated_annotator_1'])
label_counts_2 = get_label_count(datasets['hallucinations_100_generated_annotator_2'])
label_counts_agreement = get_label_count(datasets['hallucinations_100_generated_agreed'])
        
# Convert each count dict to a one-column dataframe with the label names as index
label_counts_1_df = pd.DataFrame.from_dict(label_counts_1, orient='index', columns=['A 1'])
label_counts_2_df = pd.DataFrame.from_dict(label_counts_2, orient='index', columns=['A 2'])
label_counts_agreement_df = pd.DataFrame.from_dict(label_counts_agreement, orient='index', columns=['Agree'])
# Concatenate side by side: one column per annotator plus the agreed labeling
label_counts_df = pd.concat([label_counts_1_df, label_counts_2_df, label_counts_agreement_df], axis=1)

# Add row total
label_counts_df.loc['Total'] = label_counts_df.sum(axis=0)

# Replace underscore in label names with space
label_counts_df.index = label_counts_df.index.str.replace('_', ' ')

# Output as latex table (left-aligned label column, three centered count columns)
print(label_counts_df.to_latex(column_format='lccc', escape=False))

\begin{tabular}{lccc}
\toprule
 & A 1 & A 2 & Agree \\
\midrule
condition unsupported & 25 & 27 & 27 \\
procedure unsupported & 12 & 3 & 4 \\
medication unsupported & 13 & 10 & 10 \\
time unsupported & 3 & 3 & 2 \\
location unsupported & 13 & 4 & 12 \\
number unsupported & 0 & 4 & 3 \\
name unsupported & 5 & 6 & 5 \\
word unsupported & 49 & 48 & 44 \\
other unsupported & 1 & 5 & 0 \\
contradicted fact & 2 & 8 & 7 \\
incorrect fact & 0 & 0 & 0 \\
Total & 123 & 118 & 114 \\
\bottomrule
\end{tabular}



In [93]:
# Determine inter-annotator agreement for hallucination annotations
# Determine Krippendorff's alpha based on the annotation counts per summary
# interval_distance is not part of the notebook's top-level imports, so import it where it is used
from nltk.metrics.distance import interval_distance

# One (coder, item, value) triple per annotator and summary; the value is the number of annotations
annotation_triples = []
for i, row in data.iterrows():
    annotation_triples.append(('coder_1', i, row['entities_annotator_1']))
    annotation_triples.append(('coder_2', i, row['entities_annotator_2']))

# The counts are numeric, hence the interval distance between the two values
t = AnnotationTask(annotation_triples, distance=interval_distance)
print(f"Krippendorf's alpha: {t.alpha():.3f}")

Krippendorf's alpha: 0.826


In [149]:
# Determine word-level F1 overlap

# Define tokenizer
# Thin wrapper around NLTK's wordpunct_tokenize; referenced as a global by the word-level conversion helpers
tokenizer = lambda x: wordpunct_tokenize(x)

# Taken from hallucination evaluation script
def character_labels_to_word_labels(text, labels):
    """Convert character-offset labels into token-offset labels.

    Args:
        text: The full text the labels refer to.
        labels: Iterable of dicts with the keys 'label', 'start' (character
            offset of the span in ``text``) and 'text' (the labeled span).

    Returns:
        A new list with one dict per input label holding 'label', 'start',
        'end' (exclusive) and 'length' in tokens, plus the original,
        untokenized 'text'.

    Raises:
        AssertionError: If the tokens selected by the converted offsets do not
            spell out the label text (ignoring spaces), e.g. because the label
            does not start on a token boundary.
    """
    # Tokenize the full text once; it is only needed for the consistency check of each label
    text_tokens = tokenizer(text)
    new_labels = []
    for label in labels:
        # The number of tokens in front of the span is the token index of its first token
        start = len(tokenizer(text[:label['start']]))
        length = len(tokenizer(label['text']))
        end = start + length
        # The old text is copied over untokenized, so check that the token span contains the same text without whitespace
        assert ''.join(text_tokens[start:end]) == label['text'].replace(' ', ''), (
            f"token span {text_tokens[start:end]} does not match label text {label['text']!r}"
        )
        new_labels.append({
            'label': label['label'],
            'start': start,
            'end': end,
            'length': length,
            'text': label['text'],
        })
    return new_labels

def change_labels_to_single_words(labels):
    """Split token-level labels into one label per token.

    Args:
        labels: Iterable of dicts with the keys 'label', 'start' and 'length'
            (both in tokens) and 'text' (the labeled span).

    Returns:
        A new list that contains, for every input label, ``label['length']``
        dicts of length 1: the i-th one starts at ``label['start'] + i`` and
        carries the i-th token of the label text.

    Raises:
        IndexError: If 'length' exceeds the number of tokens in the label text.
    """
    new_labels = []
    for label in labels:
        # Tokenize the label text once instead of once per token
        tokens = tokenizer(label['text'])
        for offset in range(label['length']):
            new_labels.append({
                'label': label['label'],
                'start': label['start'] + offset,
                'end': label['start'] + offset + 1,
                'length': 1,
                'text': tokens[offset],
            })
    return new_labels

# Convert all labeling to word level
# Labeling 1
# annotator_1_word_labels = datasets['hallucinations_100_mimic_annotator_1']
# annotator_2_word_labels = datasets['hallucinations_100_mimic_annotator_2']
# # Manually fix non-word aligned labels - only first character of word was missed - does not affect evaluation
# annotator_2_word_labels[49]['labels'][1] = {'start': 209, 'end': 257, 'length': 48, 'label': 'word_unsupported', 'text': 'The reports of these tests were provided to you.'}
# annotator_2_word_labels[49]['labels'][6] = {'start': 577, 'end': 670, 'length': 93, 'label': 'word_unsupported', 'text': 'Your primary care doctor, ___ your gastroenterologist, Dr. ___ aware of your hospitalization.'}
# annotator_2_word_labels[87]['labels'][0] = {'start': 432, 'end': 513, 'length': 81, 'label': 'other_unsupported', 'text': 'may be a manifestation of stress and may be related to your psychiatric illnesses'}

# Labeling 2
# NOTE: these names refer to the same lists that are stored in datasets, so the manual fix below
# also modifies the loaded dataset in place
annotator_1_word_labels = datasets['hallucinations_100_generated_annotator_1']
annotator_2_word_labels = datasets['hallucinations_100_generated_annotator_2']
# Manually replace label 3 of example 24 of annotator 2
# (presumably a label that was not word aligned, like the fixes listed for labeling 1 — TODO confirm)
annotator_2_word_labels[24]['labels'][3] = {'start': 635, 'end': 651, 'length': 16, 'label': 'condition_unsupported', 'text': 'which was normal'}

# Convert to word labels (from here on the names hold one list of token-level labels per example)
annotator_1_word_labels = [character_labels_to_word_labels(ex['summary'], ex['labels']) for ex in annotator_1_word_labels]
annotator_2_word_labels = [character_labels_to_word_labels(ex['summary'], ex['labels']) for ex in annotator_2_word_labels]
# Convert to single word labels
annotator_1_word_labels_single = [change_labels_to_single_words(labels) for labels in annotator_1_word_labels]
annotator_2_word_labels_single = [change_labels_to_single_words(labels) for labels in annotator_2_word_labels]

def determine_word_level_f1(in_annotator_1, in_annotator_2):
    """Compute the overlap F1 between the labels of two annotators.

    Labels are compared per document with ``in``, i.e. by equality, and the
    counts are pooled over all documents before the ratios are taken.

    Args:
        in_annotator_1: One list of labels per document for annotator 1.
        in_annotator_2: One list of labels per document for annotator 2,
            aligned with ``in_annotator_1``.

    Returns:
        A tuple ``(f1, precision, recall)`` where precision is the share of
        annotator 1's labels that annotator 2 also has and recall is the share
        of annotator 2's labels that annotator 1 also has. A ratio whose
        denominator is zero is reported as 0.0.

    Raises:
        ValueError: If the two inputs do not contain the same number of documents.
    """
    if len(in_annotator_1) != len(in_annotator_2):
        raise ValueError(
            f"Expected the same number of documents, got {len(in_annotator_1)} and {len(in_annotator_2)}"
        )
    labels_1_in_labels_2 = 0
    total_labels_1 = 0
    labels_2_in_labels_1 = 0
    total_labels_2 = 0
    for doc_labels_1, doc_labels_2 in zip(in_annotator_1, in_annotator_2):
        labels_1_in_labels_2 += sum(1 for label in doc_labels_1 if label in doc_labels_2)
        total_labels_1 += len(doc_labels_1)
        labels_2_in_labels_1 += sum(1 for label in doc_labels_2 if label in doc_labels_1)
        total_labels_2 += len(doc_labels_2)
    # Guard every division: an annotator without labels or no overlap at all would otherwise raise ZeroDivisionError
    precision = labels_1_in_labels_2 / total_labels_1 if total_labels_1 else 0.0
    recall = labels_2_in_labels_1 / total_labels_2 if total_labels_2 else 0.0
    if precision + recall == 0:
        return 0.0, precision, recall
    f1 = 2 * precision * recall / (precision + recall)
    return f1, precision, recall

# Class aware
# Whole label dicts are compared, so a word only counts as shared if both annotators gave it the same label
f1, precision, recall = determine_word_level_f1(annotator_1_word_labels_single, annotator_2_word_labels_single)
print(f"Word level F1 (class aware): {f1:.3f}")

# Class agnostic
# Only keep start positions of labels as these are unique identifiers
annotator_1_word_labels_starts = [[label['start'] for label in labels] for labels in annotator_1_word_labels_single]
annotator_2_word_labels_starts = [[label['start'] for label in labels] for labels in annotator_2_word_labels_single]
f1, precision, recall = determine_word_level_f1(annotator_1_word_labels_starts, annotator_2_word_labels_starts)
print(f"Word level F1 (class agnostic): {f1:.3f}")

Word level F1 (class aware): 0.271
Word level F1 (class agnostic): 0.440


In [38]:
# Additional analysis for labeling2 - create label wise hallucination counts for each model
# The agreed dataset is processed in consecutive groups of 20 summaries per model (0-19, 20-39, ...)
models = [0,1,2,3,4]
model_names = ['llama_70b_original', 'llama_70b_cleaned', 'gpt4_zero_shot', 'gpt4_orig', 'gpt4_cleaned']
summaries_per_model = 20
# Collect one single-column dataframe of label counts per model
label_counts_df = []
for model in models:
    range_idx = (model * summaries_per_model, (model + 1) * summaries_per_model)
    label_counts_agreement = get_label_count(datasets['hallucinations_100_generated_agreed'][range_idx[0]:range_idx[1]])
    label_counts_agreement_df = pd.DataFrame.from_dict(label_counts_agreement, orient='index', columns=[model_names[model]])
    label_counts_df.append(label_counts_agreement_df)

# Concat
label_counts_df = pd.concat(label_counts_df, axis=1)

# Add row total
label_counts_df.loc['Total'] = label_counts_df.sum(axis=0)

# Replace underscore in label names with space
label_counts_df.index = label_counts_df.index.str.replace('_', ' ')

# Reorder columns
label_counts_df = label_counts_df[['llama_70b_original', 'llama_70b_cleaned', 'gpt4_orig', 'gpt4_cleaned', 'gpt4_zero_shot']]

# Output as latex table
# One 'l' for the label column plus one 'c' per model column; the former fixed 'lccc' declared
# only four columns for this six-column table
# NOTE(review): with escape=False the underscores of the model names are emitted unescaped in the header — confirm this is handled downstream
print(label_counts_df.to_latex(column_format='l' + 'c' * len(label_counts_df.columns), escape=False))
    

\begin{tabular}{lccc}
\toprule
 & llama_70b_original & llama_70b_cleaned & gpt4_orig & gpt4_cleaned & gpt4_zero_shot \\
\midrule
condition unsupported & 16 & 7 & 2 & 1 & 1 \\
procedure unsupported & 2 & 2 & 0 & 0 & 0 \\
medication unsupported & 9 & 1 & 0 & 0 & 0 \\
time unsupported & 1 & 1 & 0 & 0 & 0 \\
location unsupported & 4 & 5 & 1 & 2 & 0 \\
number unsupported & 3 & 0 & 0 & 0 & 0 \\
name unsupported & 3 & 1 & 1 & 0 & 0 \\
word unsupported & 11 & 10 & 10 & 5 & 8 \\
other unsupported & 0 & 0 & 0 & 0 & 0 \\
contradicted fact & 3 & 4 & 0 & 0 & 0 \\
incorrect fact & 0 & 0 & 0 & 0 & 0 \\
Total & 52 & 31 & 14 & 8 & 9 \\
\bottomrule
\end{tabular}



In [39]:
# Additional analysis for labeling2 - how many hallucinations for each model
# Check for row groups of 20 (0-19, 20-39, ...) the mean and std of the number of hallucinations (column: entities_agreement)
models = [0,1,2,3,4]
model_names = ['llama_70b_original', 'llama_70b_cleaned', 'gpt4_zero_shot', 'gpt4_orig', 'gpt4_cleaned']
summaries_per_model = 20
for model in models:
    first_row = model * summaries_per_model
    range_idx = (first_row, first_row + summaries_per_model)
    # Agreed annotation counts of the rows that belong to this model
    model_counts = data.iloc[range_idx[0]:range_idx[1]]['entities_agreement']
    mean_count = model_counts.mean()
    std_count = model_counts.std()
    print(f"Model {model_names[model]}: ${mean_count:.2f}$ (${std_count:.2f}$)")

Model llama_70b_original: $2.60$ ($2.39$)
Model llama_70b_cleaned: $1.55$ ($1.99$)
Model gpt4_zero_shot: $0.45$ ($0.60$)
Model gpt4_orig: $0.70$ ($0.86$)
Model gpt4_cleaned: $0.40$ ($0.75$)


In [40]:
# Additional analysis for labeling2 - sanity check for model output
# Print num labels for example 33 and 34 because there every model has different number of hallucinations
def print_num_labels(doc_id):
    """Print, for every model, the agreed number of annotations of one document.

    Reads the module-level ``data``, ``models`` and ``model_names``.

    Args:
        doc_id: Value of the 'doc_id' column that selects the document.
    """
    # Row mask of the requested document, shared by all models
    is_doc = data['doc_id'] == doc_id
    for model in models:
        counts = data.loc[is_doc & (data['model_id'] == model)]['entities_agreement'].values
        print(f"Model {model_names[model]}: {counts}")

print_num_labels(33)
print()
print_num_labels(34)

Model llama_70b_original: [1]
Model llama_70b_cleaned: [7]
Model gpt4_zero_shot: [2]
Model gpt4_orig: [2]
Model gpt4_cleaned: [0]

Model llama_70b_original: [6]
Model llama_70b_cleaned: [3]
Model gpt4_zero_shot: [0]
Model gpt4_orig: [1]
Model gpt4_cleaned: [1]


In [41]:
# Additional analysis for labeling2 - get number of words per summary
def get_num_words(text):
    """Return the number of whitespace-separated words in ``text``."""
    # Deliberately simplistic: the same counting was used for the performance table
    words = text.split()
    return len(words)

# Word statistics are computed on the summaries of the agreed labeling
dataset_words = datasets['hallucinations_100_generated_agreed']
models = [0,1,2,3,4]
model_names = ['llama_70b_original', 'llama_70b_cleaned', 'gpt4_zero_shot', 'gpt4_orig', 'gpt4_cleaned']
summaries_per_model = 20
for model in models:
    # Consecutive block of summaries_per_model examples that belongs to this model
    range_idx = (model * summaries_per_model, (model + 1) * summaries_per_model)
    # Print index of summaries for each model
    num_words = np.array([get_num_words(ex['summary']) for ex in dataset_words[range_idx[0]:range_idx[1]]])
    # print(range_idx)
    assert len(num_words) == summaries_per_model
    # Mean and std over all summaries
    # NOTE(review): np.std defaults to the population standard deviation (ddof=0), while the pandas .std()
    # calls used for the other tables default to the sample standard deviation (ddof=1) — confirm which is intended
    print(f"Model {model_names[model]}: ${np.mean(num_words):.2f}$ (${np.std(num_words):.2f}$)")



Model llama_70b_original: $97.90$ ($36.73$)
Model llama_70b_cleaned: $96.20$ ($31.82$)
Model gpt4_zero_shot: $165.05$ ($22.75$)
Model gpt4_orig: $151.10$ ($19.42$)
Model gpt4_cleaned: $158.80$ ($23.27$)


In [42]:
# Additional analysis for labeling2 - determine mean and std of key facts, medical jargon, and words

# One Excel sheet with ratings per annotator
key_jargon_paths = {
    # Experiment 3: label key facts and medical jargon in the hallucination reduction task
    'key_jargon_annotator_1': data_path / '30_label_key_jargon_annotator_1' / 'key_jargon_annotator_1.xlsx',
    'key_jargon_annotator_2': data_path / '31_label_key_jargon_annotator_2' / 'key_jargon_annotator_2.xlsx',
}

# Rating columns
# Expected columns: 'KP i' and 'MJ i' for every i in 0..num_models-1 plus 'Total KP'
# (KP / MJ presumably stand for key facts / medical jargon — TODO confirm)
num_models = 5
columns = [[f"KP {i}", f"MJ {i}"] for i in range(0, num_models)]
# Flatten the per-model pairs into one list
columns = [item for sublist in columns for item in sublist] + ['Total KP']

def prepare_df(path):
    """Load one annotator's rating sheet and validate its content.

    Reads the first sheet of the Excel file with the header taken from row
    index 11, drops all columns whose name starts with 'Unnamed' and uses the
    'id' column as index. Prints the number of documents.

    Args:
        path: Path of the Excel file.

    Returns:
        The cleaned dataframe, indexed by 'id'.

    Raises:
        AssertionError: If a value lies outside of [0, 15], a value is missing,
            or the columns differ from the module-level ``columns``.
    """
    raw = pd.read_excel(path, sheet_name=0, header=11)
    # Remove columns starting with Unnamed
    is_unnamed = raw.columns.str.contains('^Unnamed')
    df = raw.loc[:, ~is_unnamed].set_index('id')
    ratings = df.values
    assert ratings.min() >= 0 and ratings.max() <= 15
    assert not df.isnull().values.any()
    assert set(columns) == set(df.columns)
    print(f"  Number of documents: {len(df)}")
    return df

hallucination_random_models = {0: [0, 4, 3, 1, 2], 1: [4, 1, 3, 2, 0], 2: [0, 2, 1, 3, 4], 3: [1, 0, 3, 4, 2], 4: [3, 2, 4, 1, 0], 5: [3, 2, 1, 4, 0], 6: [1, 2, 3, 0, 4], 7: [1, 3, 2, 4, 0], 8: [4, 0, 1, 3, 2], 9: [0, 3, 2, 4, 1], 10: [0, 4, 3, 2, 1], 11: [1, 0, 4, 3, 2], 12: [2, 4, 1, 0, 3], 13: [3, 1, 0, 4, 2], 14: [4, 2, 0, 1, 3], 15: [0, 2, 4, 3, 1], 16: [1, 4, 2, 3, 0], 17: [2, 3, 1, 0, 4], 18: [4, 0, 3, 2, 1], 19: [0, 3, 1, 2, 4], 20: [4, 0, 2, 3, 1], 21: [0, 4, 2, 1, 3], 22: [0, 2, 4, 3, 1], 23: [1, 0, 3, 4, 2], 24: [3, 1, 0, 4, 2], 25: [2, 0, 3, 4, 1], 26: [4, 3, 0, 1, 2], 27: [3, 4, 2, 1, 0], 28: [4, 2, 3, 1, 0], 29: [4, 1, 3, 0, 2], 30: [2, 3, 0, 1, 4], 31: [4, 2, 0, 3, 1], 32: [3, 0, 2, 1, 4], 33: [2, 3, 4, 1, 0], 34: [4, 1, 3, 2, 0], 35: [0, 4, 1, 3, 2], 36: [4, 1, 3, 0, 2], 37: [3, 1, 0, 4, 2], 38: [3, 2, 4, 1, 0], 39: [1, 0, 3, 4, 2], 40: [4, 3, 0, 1, 2], 41: [2, 3, 4, 0, 1], 42: [2, 4, 3, 1, 0], 43: [4, 1, 2, 0, 3], 44: [0, 4, 3, 1, 2], 45: [3, 2, 0, 1, 4], 46: [2, 4, 0, 3, 1], 47: [2, 1, 0, 4, 3], 48: [4, 2, 3, 1, 0], 49: [3, 1, 4, 2, 0]}

def undo_randomization(df):
    """Reorder the per-model rating columns from randomized order to model order.

    Uses the module-level ``hallucination_random_models`` (document id ->
    permutation list) and ``num_models``. For every document and model ``i``,
    the result columns 'KP i' and 'MJ i' receive the values of the input
    columns 'KP p' and 'MJ p', where ``p`` is the index of ``i`` in the
    permutation list of that document. 'Total KP' is copied unchanged.

    Args:
        df: Dataframe indexed by document id with the columns 'KP i' and
            'MJ i' for every i in 0..num_models-1 and 'Total KP'.

    Returns:
        A new dataframe with the same index and columns as ``df``; ``df``
        itself is not modified.

    Raises:
        KeyError: If a document id is missing in ``hallucination_random_models``.
        ValueError: If a model index is missing in a permutation list.
    """
    # Copy dataframe
    df_unrandomized = df.copy()
    # Overwrite everything with a sentinel so that a cell that is not filled below is easy to spot
    df_unrandomized[:] = -1
    # doc_id instead of id: do not shadow the builtin
    for doc_id, row in df.iterrows():
        df_unrandomized.loc[doc_id, 'Total KP'] = row['Total KP']
        permutation = hallucination_random_models[doc_id]
        for i in range(0, num_models):
            # Look up the randomized position of model i once and use it for both metrics
            position = permutation.index(i)
            df_unrandomized.loc[doc_id, f"KP {i}"] = row[f"KP {position}"]
            df_unrandomized.loc[doc_id, f"MJ {i}"] = row[f"MJ {position}"]
    return df_unrandomized

# Load and validate the rating sheets of both annotators (still in randomized column order)
print("Annotator 1:")
rand_key_jargon_annotator_1 = prepare_df(key_jargon_paths['key_jargon_annotator_1'])
print("Annotator 2:")
rand_key_jargon_annotator_2 = prepare_df(key_jargon_paths['key_jargon_annotator_2'])

# Bring the per-model columns back into model order
key_rand_annotator_1 = undo_randomization(rand_key_jargon_annotator_1)
key_rand_annotator_2 = undo_randomization(rand_key_jargon_annotator_2)

# First take the average over both dataframes, then calculate mean and std and output as latex table
# This makes it easier to interpret the SD per example
# Stack both annotators and average the two rows that share the same index value
key_rand_mean = pd.concat([key_rand_annotator_1, key_rand_annotator_2]).groupby(level=0).mean()
# Assert that same columns and still 20 rows
assert set(columns) == set(key_rand_mean.columns) and len(key_rand_mean) == 20
# print(key_rand_mean)

# NOTE(review): this rebinds models from the list of model indices used in the earlier cells to the list of
# model names; running print_num_labels after this cell would index model_names with strings
models = ['llama_70b_original', 'llama_70b_cleaned', 'gpt4_zero_shot', 'gpt4_orig', 'gpt4_cleaned']

# One table row per model with "mean (std)" over the documents for both metrics
latex_results = []
for model in range(0, num_models):
    row = []
    for metric in ["KP", "MJ"]:
        key = f"{metric} {model}"
        # With SD
        row.append(f"${key_rand_mean[key].mean():.2f}$ (${key_rand_mean[key].std():.2f}$)")
        # row.append(f"${key_rand_mean[key].mean():.2f}$")
    latex_results.append([models[model]] + row)

latex_results = pd.DataFrame(latex_results, columns=["Model", "KP", "MJ"])
print(latex_results.to_latex(index=False, escape=False, column_format="lcc"))

Annotator 1:
  Number of documents: 20
Annotator 2:
  Number of documents: 20
\begin{tabular}{lcc}
\toprule
Model & KP & MJ \\
\midrule
llama_70b_original & $3.77$ ($1.33$) & $1.05$ ($0.84$) \\
llama_70b_cleaned & $3.73$ ($1.45$) & $1.68$ ($1.23$) \\
gpt4_zero_shot & $0.82$ ($0.61$) & $0.70$ ($1.03$) \\
gpt4_orig & $0.93$ ($0.80$) & $1.07$ ($0.99$) \\
gpt4_cleaned & $0.97$ ($0.80$) & $1.25$ ($1.18$) \\
\bottomrule
\end{tabular}

