In [1]:
# General
import json
import numpy as np
from collections import defaultdict
import evaluate
from rouge_score import rouge_scorer

In [2]:
# Custom ROUGE helper: ROUGE-3 and ROUGE-4 are not offered by the Hugging Face metric,
# so the `rouge_score` package is called directly.
def get_rouge_score(gold, pred):
    """Score one prediction against one reference with several ROUGE variants.

    Args:
        gold: Reference text (passed to the scorer as the target).
        pred: Predicted text.

    Returns:
        Dict with the keys 'rouge1', 'rouge2', 'rouge3', 'rouge4' and 'rougeL',
        each holding the F-measure of that variant multiplied by 100.
    """
    variants = ['rouge1', 'rouge2', 'rouge3', 'rouge4', 'rougeL']
    # Stemming is enabled, so the scorer compares stemmed tokens.
    result = rouge_scorer.RougeScorer(variants, use_stemmer=True).score(gold, pred)

    percentages = {}
    for name in variants:
        percentages[name] = result[name].fmeasure * 100
    return percentages

def compute_custom_metrics(srcs, golds, preds, device):
    """Compute corpus-level summarization metrics for a set of predictions.

    Args:
        srcs: Source texts, one per example (only used for SARI).
        golds: Reference summaries, aligned by position with ``srcs``.
        preds: Predicted summaries, aligned by position with ``srcs``.
        device: Device passed through to the BERTScore computation (e.g. "cuda").

    Returns:
        A plain dict mapping metric name to value, in this order:
        'rouge1', 'rouge2', 'rouge3', 'rouge4', 'rougeL' (mean per-example
        F-measure x 100), 'words' (mean prediction length), 'bert_score' and
        'bert_score_deberta-large' (mean F1 x 100), and 'sari'.

    Raises:
        ValueError: If ``srcs``, ``golds`` and ``preds`` differ in length.
    """
    # Fail fast, before any model is loaded: zip() below would otherwise silently
    # truncate to the shorter input and average ROUGE over the wrong set of examples.
    if not (len(srcs) == len(golds) == len(preds)):
        raise ValueError(
            "srcs, golds and preds must have the same length, "
            f"got {len(srcs)}, {len(golds)} and {len(preds)}"
        )

    bertscore = evaluate.load("bertscore")
    sari = evaluate.load("sari")

    # For rouge and length go over examples one by one and determine mean
    per_example = defaultdict(list)
    for gold, pred in zip(golds, preds):
        for k, v in get_rouge_score(gold, pred).items():
            per_example[k].append(v)
        # NOTE(review): split(' ') only splits on single spaces, so newlines do not
        # separate words and consecutive spaces add empty tokens. Kept as is so that
        # previously reported numbers stay comparable.
        per_example['words'].append(len(pred.split(' ')))

    # Plain dict (not the defaultdict) so that looking up a misspelled metric name
    # raises KeyError instead of silently yielding an empty list.
    scores = {k: np.mean(v) for k, v in per_example.items()}

    # This is the default call using model_type="roberta-large"
    # This is the same as in the paper "Generation of Patient After-Visit Summaries to Support Physicians" (AVS_gen/eval_summarization.py) using the library SummerTime
    scores['bert_score'] = np.mean((bertscore.compute(predictions=preds, references=golds, lang="en", device=device))['f1']) * 100
    # BERTScore authors recommend "microsoft/deberta-large-mnli" (https://github.com/Tiiiger/bert_score)
    scores['bert_score_deberta-large'] = np.mean((bertscore.compute(predictions=preds, references=golds, device=device, model_type="microsoft/deberta-large-mnli"))['f1']) * 100
    # SARI takes a list of references per example; there is exactly one gold summary each.
    scores['sari'] = sari.compute(sources=srcs, predictions=preds, references=[[g] for g in golds])['sari']
    # Importing readability for a readability score (presumably Dale-Chall) is not working: https://pypi.org/project/py-readability-metrics/

    return scores

def print_metrics_as_latex(metrics):
    """Print the metrics as the cells of a single LaTeX table row.

    Args:
        metrics: Mapping that contains a numeric value for each of the keys
            'rouge1', 'rouge2', 'rouge3', 'rouge4', 'rougeL', 'bert_score',
            'bert_score_deberta-large', 'sari' and 'words'.

    Each value is formatted with two decimals, wrapped in ``$...$`` and the
    cells are joined with `` & `` in the fixed column order listed above.
    """
    columns = (
        'rouge1', 'rouge2', 'rouge3', 'rouge4', 'rougeL',
        'bert_score', 'bert_score_deberta-large', 'sari', 'words',
    )
    cells = []
    for name in columns:
        cells.append('$' + format(metrics[name], '.2f') + '$')
    print(' & '.join(cells))

In [12]:
# Files
# Every input lives under one checkout of the project; REPO_DIR is the single place
# to change when the notebook is run from a different machine or user account.
REPO_DIR = "/home/s_hegs02/patient_summaries_with_llms"
SUMMARIZATION_DATA_DIR = f"{REPO_DIR}/gpt-4/summarization_data"
PERFORMANCE_RESULTS_DIR = f"{REPO_DIR}/gpt-4/performance_results"

# Alternative inputs (uncomment to switch):
# test_data_file = f"{SUMMARIZATION_DATA_DIR}/exp_4_test.json"
# validation_examples = "2_short_BHC_summary_prediction/valid_4000_600_chars.json"
# preds_data_file = f"{REPO_DIR}/data/hallucination_evaluation/gpt-4_exp4_results_3shot.jsonl"
# preds_data_file = f"{REPO_DIR}/data/qualitative_evaluation/gpt-4_exp6_results_3shot.jsonl"

# Experiment 1 and 2
test_data_file = f"{SUMMARIZATION_DATA_DIR}/exp_1_test.json"
# preds_data_file = f"{PERFORMANCE_RESULTS_DIR}/gpt-4_exp1_results_prompt3.1_0shot.jsonl"
preds_data_file = f"{PERFORMANCE_RESULTS_DIR}/gpt-4_exp1_results_prompt3_5shot.jsonl"
# test_data_file = f"{SUMMARIZATION_DATA_DIR}/exp_2_test.json"
# preds_data_file = f"{PERFORMANCE_RESULTS_DIR}/gpt-4_exp2_results_prompt3.1_0shot.jsonl"
# preds_data_file = f"{PERFORMANCE_RESULTS_DIR}/gpt-4_exp2_results_prompt3_5shot.jsonl"



# Read jsonl files
def read_jsonl(file_name):
    """Read a JSON Lines file and return its records as a list.

    Args:
        file_name: Path of the file to read; it is decoded as UTF-8.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit encoding: without it the platform locale decides how bytes are decoded.
    with open(file_name, "r", encoding="utf-8") as f:
        # Skip blank lines (e.g. a trailing empty line), which json.loads would reject.
        return [json.loads(line) for line in f if line.strip()]
    
# Load the test examples first, then the predictions (one JSON record per line each)
test_data, preds_data = [read_jsonl(path) for path in (test_data_file, preds_data_file)]

In [None]:
# Preview the first few examples with their index: source text, reference summary
# and predicted summary, followed by a blank line.
NUM_PREVIEW_EXAMPLES = 3
# Slices are clamped to the available length, so this does not raise IndexError
# when fewer than NUM_PREVIEW_EXAMPLES examples or predictions were loaded.
for i, (example, prediction) in enumerate(
    zip(test_data[:NUM_PREVIEW_EXAMPLES], preds_data[:NUM_PREVIEW_EXAMPLES])
):
    print(i)
    print(example["text"])
    print(example["summary"])
    print(prediction["summary"])
    print()

In [14]:
# Collect the inputs by position: source text and reference summary come from the
# test set, the predicted summary from the predictions file.
srcs = [e["text"] for e in test_data]
golds = [e["summary"] for e in test_data]
preds = [e["summary"] for e in preds_data]
metrics_test = compute_custom_metrics(srcs, golds, preds, "cuda")

# Round to two decimals and convert to built-in floats: rounding a NumPy scalar
# keeps the NumPy type, which NumPy >= 2 prints as np.float64(...) inside a dict.
metrics_test = {k: float(round(v, 2)) for k, v in metrics_test.items()}
print("Test metrics rounded:")
print(metrics_test)
print_metrics_as_latex(metrics_test)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Test metrics rounded:
{'rouge1': 38.8, 'rouge2': 10.78, 'rouge3': 3.55, 'rouge4': 1.12, 'rougeL': 21.98, 'words': 131.86, 'bert_score': 86.67, 'bert_score_deberta-large': 61.3, 'sari': 42.88}
$38.80$ & $10.78$ & $3.55$ & $1.12$ & $21.98$ & $86.67$ & $61.30$ & $42.88$ & $131.86$
