# Build datasets for Hallucination Evaluation and Qualitative Evaluation

In [1]:
import json
import random
import os
import shutil
import datetime
import zipfile
import re

In [2]:
# Read files
# Read jsonl files
def read_jsonl(file_name):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        file_name: Path of the file to read; one JSON document per line.

    Returns:
        A list with one ``json.loads`` result per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the locale.
    with open(file_name, "r", encoding="utf-8") as f:
        # Blank lines (for example a stray empty line at the end of the file)
        # carry no record and would make json.loads fail, so they are skipped.
        return [json.loads(line) for line in f if line.strip()]
    
def read_txt(file_name):
    """Read a text file that holds one summary per line.

    Args:
        file_name: Path of the text file to read.

    Returns:
        A list of ``{"summary": line}`` dicts, one per line of the file, with
        the trailing newline removed. An empty file gives an empty list.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the locale.
    with open(file_name, "r", encoding="utf-8") as f:
        # Iterating the file splits on newlines only. str.splitlines() also
        # breaks on characters such as form feed or U+2028, which would turn
        # one summary into several entries and shift all following ones.
        return [{"summary": line.rstrip("\n")} for line in f]

def format_gpt4(gpt4_examples):
    """Convert GPT-4 result records to the ``text``/``summary`` layout.

    Args:
        gpt4_examples: Iterable of dicts with ``"question"`` and ``"summary"`` keys.

    Returns:
        A new list of dicts in which ``"text"`` holds each record's
        ``"question"`` value and ``"summary"`` is carried over unchanged.
        Any other keys of the input records are dropped.
    """
    formatted = []
    for record in gpt4_examples:
        formatted.append({"text": record["question"], "summary": record["summary"]})
    return formatted

# Hallucination evaluation
# Each variable holds the list of records read from one model's prediction file.
# The two 5-shot GPT-4 files go through format_gpt4, which maps their "question"
# key to "text"; the zero-shot file is used as read.
# NOTE(review): presumably the zero-shot file already has a "summary" key per
# record (later cells read it) - confirm against the file.
llama_70b_original = read_jsonl("/home/s_hegs02/patient_summaries_with_llms/data/hallucination_evaluation/llama_70b_original_predictions_test_dict.jsonl")
llama_70b_cleaned = read_jsonl("/home/s_hegs02/patient_summaries_with_llms/data/hallucination_evaluation/llama_70b_cleaned_predictions_test_dict.jsonl")
gpt4_zero_shot = read_jsonl("/home/s_hegs02/patient_summaries_with_llms/data/hallucination_evaluation/gpt-4_exp4_results_prompt3.1_0shot.jsonl")
gpt4_orig = format_gpt4(read_jsonl("/home/s_hegs02/patient_summaries_with_llms/data/hallucination_evaluation/gpt-4_exp4_results_prompt3_5shot.jsonl"))
gpt4_cleaned = format_gpt4(read_jsonl("/home/s_hegs02/patient_summaries_with_llms/data/hallucination_evaluation/gpt-4_exp5_results_prompt3_5shot.jsonl"))
# The position of a model in this list is the model index used by the
# per-example permutations in the later cells.
hallucination_models = [llama_70b_original, llama_70b_cleaned, gpt4_zero_shot, gpt4_orig, gpt4_cleaned]
num_hallucination_models = len(hallucination_models)

# Qualitative evaluation
# original_examples also supplies the source "text" of every example for both
# evaluations in the later cells.
# NOTE(review): this file has a .json extension but is read line by line with
# read_jsonl - presumably it contains one JSON object per line; confirm.
original_examples = read_jsonl("/home/s_hegs02/patient_summaries_with_llms/data/qualitative_evaluation/orig_test_4000_600_chars_last_50.json")
led_large_original = read_txt("/home/s_hegs02/patient_summaries_with_llms/data/qualitative_evaluation/led-large_predictions_test_dict.txt")
llama_70b_cleaned_improved = read_jsonl("/home/s_hegs02/patient_summaries_with_llms/data/qualitative_evaluation/llama_70b_cleaned_improved_predictions_test_dict.jsonl")
# This rebinds the name gpt4_zero_shot. hallucination_models keeps its reference
# to the list loaded from the hallucination_evaluation folder, so it is unaffected.
gpt4_zero_shot = read_jsonl("/home/s_hegs02/patient_summaries_with_llms/data/qualitative_evaluation/gpt-4_exp4_results_prompt3.1_0shot.jsonl")
gpt4_cleaned_improved = format_gpt4(read_jsonl("/home/s_hegs02/patient_summaries_with_llms/data/qualitative_evaluation/gpt-4_exp6_results_prompt3_5shot.jsonl"))
# As above, the list position is the model index used by the permutations.
qualitative_models = [original_examples, led_large_original, llama_70b_cleaned_improved, gpt4_zero_shot, gpt4_cleaned_improved]
num_qualitative_models = len(qualitative_models)


num_examples = len(original_examples)
# Assert each model has the same number of examples
# (the later cells index every model list with the same example ids).
# Note that assert statements are skipped when Python runs with -O.
assert all(len(model) == num_examples for model in qualitative_models)
assert all(len(model) == num_examples for model in hallucination_models)

In [None]:
# Debug: print summaries with a certain id
# Print the summaries at positions 33 and 34 for all hallucination models
# Was done to check that the hallucination ratios are correct after the permutation -> labeling -> reverse permutation round trip

# for model in hallucination_models:
#     print(model[33]["summary"])
# print()
# for model in hallucination_models:
#     print(model[34]["summary"])

In [3]:
# Count the number of newlines and repeated spaces in each summary
def count_newlines_and_repeated_spaces(examples):
    """Tally formatting artefacts over the summaries of one model.

    Args:
        examples: Iterable of dicts with a ``"summary"`` string.

    Returns:
        A dict with three totals over all summaries:
        ``"summary prefixes"`` - summaries that start with ``summary:``
        followed by a newline (compared case-insensitively),
        ``"newlines"`` - newline characters,
        ``"repeated_spaces"`` - non-overlapping occurrences of two spaces.
    """
    prefix_hits = 0
    newline_hits = 0
    double_space_hits = 0
    for example in examples:
        text = example["summary"]
        if text.lower().startswith("summary:\n"):
            prefix_hits += 1
        newline_hits += text.count("\n")
        double_space_hits += text.count("  ")
    return {
        "summary prefixes": prefix_hits,
        "newlines": newline_hits,
        "repeated_spaces": double_space_hits,
    }

# Report the statistics of every model, one dict per line, in list order.
print("Hallucination models: ")
print(*(str(count_newlines_and_repeated_spaces(model)) for model in hallucination_models), sep="\n")
print("Qualitative models: ")
print(*(str(count_newlines_and_repeated_spaces(model)) for model in qualitative_models), sep="\n")

# Clean all summaries from newlines and repeated spaces, change to single spaces
def clean_summaries(examples):
    """Normalize the whitespace of every summary in place.

    A leading ``summary:`` marker followed by a newline (compared
    case-insensitively) is removed first: some GPT-4 outputs start with it and
    it would reveal which model wrote the summary. After that every run of
    whitespace, newlines included, becomes a single space and leading and
    trailing whitespace is dropped.

    Args:
        examples: Iterable of dicts with a ``"summary"`` string; the dicts are
            modified in place.

    Returns:
        The same ``examples`` object that was passed in.
    """
    marker = "summary:\n"
    for example in examples:
        text = example["summary"]
        if text.lower().startswith(marker):
            text = text[len(marker):]
        # str.split() without arguments splits on any whitespace run.
        example["summary"] = " ".join(text.split())
    return examples

print("\nCleaning summaries...")
# clean_summaries edits the records in place and returns the list it was given,
# so the rebuilt outer lists keep the same model order.
hallucination_models = list(map(clean_summaries, hallucination_models))
qualitative_models = list(map(clean_summaries, qualitative_models))

Hallucination models: 
{'summary prefixes': 0, 'newlines': 0, 'repeated_spaces': 2}
{'summary prefixes': 0, 'newlines': 0, 'repeated_spaces': 9}
{'summary prefixes': 0, 'newlines': 0, 'repeated_spaces': 0}
{'summary prefixes': 0, 'newlines': 0, 'repeated_spaces': 0}
{'summary prefixes': 3, 'newlines': 3, 'repeated_spaces': 0}
Qualitative models: 
{'summary prefixes': 0, 'newlines': 0, 'repeated_spaces': 0}
{'summary prefixes': 0, 'newlines': 0, 'repeated_spaces': 0}
{'summary prefixes': 0, 'newlines': 0, 'repeated_spaces': 6}
{'summary prefixes': 0, 'newlines': 0, 'repeated_spaces': 0}
{'summary prefixes': 3, 'newlines': 3, 'repeated_spaces': 0}

Cleaning summaries...


In [4]:
# Debugging
# Hard-codes the number of examples handled by the following cells, replacing
# the value taken from len(original_examples) when the data was loaded.
# NOTE(review): the later cells index every model list with range(num_examples),
# so this value must not exceed the number of loaded examples.
num_examples = 50

In [5]:
# Add randomness
# Fixed seed so the shuffled model order is the same on every run
random.seed(2)


def get_random_permutation(max_num):
    """Return the integers ``0 .. max_num - 1`` in random order.

    Args:
        max_num: Number of indices to permute.

    Returns:
        A list containing every value of ``range(max_num)`` exactly once, in
        an order drawn from the module-level ``random`` generator.
    """
    # Debug: return list(range(max_num)) instead to keep the original order.
    population = range(max_num)
    return random.sample(population, k=max_num)

# Per-example permutation of the model indices: entry k of the list stored
# under an example id is the index of the model whose summary is placed at
# position k for that example.
hallucination_random_models = {}
qualitative_random_models = {}

# The loop variable is named example_id rather than id so that the builtin id()
# is not shadowed at module level. Both draws stay interleaved per example:
# drawing them in a different order would change the permutations obtained
# from the fixed seed.
for example_id in range(num_examples):
    hallucination_random_models[example_id] = get_random_permutation(num_hallucination_models)
    qualitative_random_models[example_id] = get_random_permutation(num_qualitative_models)

print("hallucination_random_models:", hallucination_random_models)
print("qualitative_random_models:", qualitative_random_models)

hallucination_random_models: {0: [0, 4, 3, 1, 2], 1: [4, 1, 3, 2, 0], 2: [0, 2, 1, 3, 4], 3: [1, 0, 3, 4, 2], 4: [3, 2, 4, 1, 0], 5: [3, 2, 1, 4, 0], 6: [1, 2, 3, 0, 4], 7: [1, 3, 2, 4, 0], 8: [4, 0, 1, 3, 2], 9: [0, 3, 2, 4, 1], 10: [0, 4, 3, 2, 1], 11: [1, 0, 4, 3, 2], 12: [2, 4, 1, 0, 3], 13: [3, 1, 0, 4, 2], 14: [4, 2, 0, 1, 3], 15: [0, 2, 4, 3, 1], 16: [1, 4, 2, 3, 0], 17: [2, 3, 1, 0, 4], 18: [4, 0, 3, 2, 1], 19: [0, 3, 1, 2, 4], 20: [4, 0, 2, 3, 1], 21: [0, 4, 2, 1, 3], 22: [0, 2, 4, 3, 1], 23: [1, 0, 3, 4, 2], 24: [3, 1, 0, 4, 2], 25: [2, 0, 3, 4, 1], 26: [4, 3, 0, 1, 2], 27: [3, 4, 2, 1, 0], 28: [4, 2, 3, 1, 0], 29: [4, 1, 3, 0, 2], 30: [2, 3, 0, 1, 4], 31: [4, 2, 0, 3, 1], 32: [3, 0, 2, 1, 4], 33: [2, 3, 4, 1, 0], 34: [4, 1, 3, 2, 0], 35: [0, 4, 1, 3, 2], 36: [4, 1, 3, 0, 2], 37: [3, 1, 0, 4, 2], 38: [3, 2, 4, 1, 0], 39: [1, 0, 3, 4, 2], 40: [4, 3, 0, 1, 2], 41: [2, 3, 4, 0, 1], 42: [2, 4, 3, 1, 0], 43: [4, 1, 2, 0, 3], 44: [0, 4, 3, 1, 2], 45: [3, 2, 0, 1, 4], 46: [2, 4, 0, 

In [6]:
def _build_shuffled_examples(models, permutations, num_models):
    """Pair every source text with its model summaries in shuffled order.

    Replaces two copies of the same loop and avoids a module-level loop
    variable named ``id``, which would shadow the builtin.

    Args:
        models: List of per-model example lists; ``models[m][k]["summary"]`` is
            read as the summary of model ``m`` for example ``k``.
        permutations: Mapping from example id to a list of model indices; the
            index at position ``p`` selects the model shown at position ``p``.
        num_models: Number of summaries collected per example.

    Returns:
        A list with one ``{"text": ..., "summaries": [...]}`` dict for each
        example id in ``range(num_examples)``. The text is taken from the
        module-level ``original_examples``.
    """
    return [
        {
            "text": original_examples[example_id]["text"],
            "summaries": [
                models[permutations[example_id][position]][example_id]["summary"]
                for position in range(num_models)
            ],
        }
        for example_id in range(num_examples)
    ]


# Get hallucination examples
hallucination_summaries = _build_shuffled_examples(
    hallucination_models, hallucination_random_models, num_hallucination_models
)

# Get qualitative examples
qualitative_summaries = _build_shuffled_examples(
    qualitative_models, qualitative_random_models, num_qualitative_models
)

In [None]:
# Print hallucination examples: the source text followed by every summary
# in its shuffled position.

for example_idx in range(num_examples):
    example = hallucination_summaries[example_idx]
    print(f"Text:\n{example['text']}\n")
    for slot in range(num_hallucination_models):
        print(f"Summary {slot}:\n{example['summaries'][slot]}\n")
    print("\n\n")

In [None]:
# Print qualitative examples: the source text followed by every summary
# in its shuffled position.

for example_idx in range(num_examples):
    example = qualitative_summaries[example_idx]
    print(f"Text:\n{example['text']}\n")
    for slot in range(num_qualitative_models):
        print(f"Summary {slot}:\n{example['summaries'][slot]}\n")
    print("\n\n")

In [34]:
# Clean results folder and store examples in the folder
# Output directories for the per-example text files. Both are deleted and
# recreated further down in this cell, so anything already stored in them is lost.
hallucination_folder = "/home/s_hegs02/patient_summaries_with_llms/data/results_hallucinations"
qualitative_folder = "/home/s_hegs02/patient_summaries_with_llms/data/results_qualitative"

# Clean results folder
def clean_results_folder(folder):
    """Reset ``folder`` to an empty directory.

    If something already exists at ``folder`` the whole directory tree is
    removed, then the directory (with any missing parents) is created.

    Args:
        folder: Path of the directory to recreate.
    """
    folder_is_present = os.path.exists(folder)
    if folder_is_present:
        shutil.rmtree(folder)
    os.makedirs(folder)
    
# Start from empty output directories, hallucination folder first.
for results_folder in (hallucination_folder, qualitative_folder):
    clean_results_folder(results_folder)

# Store hallucination examples: one text file per example holding the source
# text followed by the summaries in their shuffled positions.
for example_idx in range(num_examples):
    target_path = os.path.join(hallucination_folder, f"{example_idx}_hallucination.txt")
    with open(target_path, "w") as out:
        example = hallucination_summaries[example_idx]
        out.write(f"Text:\n{example['text']}\n\n")
        for slot in range(num_hallucination_models):
            out.write(f"Summary {slot}:\n{example['summaries'][slot]}\n\n")
        out.write("\n\n")

# Store qualitative examples in the same layout.
for example_idx in range(num_examples):
    target_path = os.path.join(qualitative_folder, f"{example_idx}_qualitative.txt")
    with open(target_path, "w") as out:
        example = qualitative_summaries[example_idx]
        out.write(f"Text:\n{example['text']}\n\n")
        for slot in range(num_qualitative_models):
            out.write(f"Summary {slot}:\n{example['summaries'][slot]}\n\n")
        out.write("\n\n")

def zipdir(path, ziph):
    """Add every file below ``path`` to an open zip archive.

    Each file is stored under its path relative to ``path``. Files directly
    inside ``path`` are therefore stored under their bare file name, while
    files in sub-directories keep the sub-directory prefix. Using only the
    bare file name for nested files would store same-named files from
    different sub-directories under one duplicate entry name.

    Args:
        path: Directory whose contents are archived.
        ziph: Writable ``zipfile.ZipFile`` handle; it is not closed here.
    """
    for root, dirs, files in os.walk(path):
        # Sorting makes the entry order independent of the order in which the
        # file system lists entries. dirs is sorted in place so that os.walk
        # visits the sub-directories in that order.
        dirs.sort()
        for file_name in sorted(files):
            full_path = os.path.join(root, file_name)
            ziph.write(full_path, os.path.relpath(full_path, path))
            
# Take the timestamp once so both archive names always carry the same value.
zip_timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
zip_hallucination_file_name = f"hallucinations_{zip_timestamp}.zip"
zip_qualitative_file_name = f"qualitative_{zip_timestamp}.zip"

# The archives are written to the current working directory (relative names).
# The with-blocks close each archive even if adding a file raises.
with zipfile.ZipFile(zip_hallucination_file_name, 'w', zipfile.ZIP_DEFLATED) as zipf_hallucination:
    zipdir(hallucination_folder, zipf_hallucination)
with zipfile.ZipFile(zip_qualitative_file_name, 'w', zipfile.ZIP_DEFLATED) as zipf_qualitative:
    zipdir(qualitative_folder, zipf_qualitative)