# Evaluate GPT-4 prompts

In [1]:
from pathlib import Path
import json
import numpy as np
from collections import defaultdict
import evaluate
from rouge_score import rouge_scorer

In [2]:
# Result files to evaluate. Each active entry is read from `prefix` and yields one row of the printed LaTeX table, in list order.
# The commented-out entries are the earlier runs for prompts 1-3 with 1, 3 and 5 in-context (IC) examples;
# only the prompt 3 zero-shot and 5-shot runs are currently active.
# NOTE(review): `prefix` is an absolute, machine-specific path - adjust it when running elsewhere.
prefix = Path('/home/s_hegs02/patient_summaries_with_llms/gpt-4/prompt_tuning/')
files_paths = [
    # 'gpt-4_exp4_results_prompt1_1shot.jsonl',
    # 'gpt-4_exp4_results_prompt1_3shot.jsonl',
    # 'gpt-4_exp4_results_prompt1_5shot.jsonl',
    # 'gpt-4_exp4_results_prompt2_1shot.jsonl',
    # 'gpt-4_exp4_results_prompt2_3shot.jsonl',
    # 'gpt-4_exp4_results_prompt2_5shot.jsonl',
    # # Missing
    # 'gpt-4_exp4_results_prompt3_1shot.jsonl',
    # 'gpt-4_exp4_results_prompt3_3shot.jsonl',
    # 'gpt-4_exp4_results_prompt3_5shot.jsonl',
    'gpt-4_exp4_results_prompt3_0shot.jsonl',
    'gpt-4_exp4_results_prompt3_5shot.jsonl',
]

# Read jsonl files
def read_jsonl(file_name):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        file_name: Path (str or path-like) of the file; one JSON document per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 so decoding does not depend on the machine's locale.
    with open(file_name, "r", encoding="utf-8") as f:
        # Skip blank lines (e.g. a trailing newline at EOF) instead of crashing in json.loads.
        return [json.loads(line) for line in f if line.strip()]
    
# One list of records per results file, in the same order as `files_paths`
files = list(map(read_jsonl, (prefix / name for name in files_paths)))

# Test split providing the source texts and reference summaries
test_data_file = "/home/s_hegs02/patient_summaries_with_llms/gpt-4/summarization_data/exp_4_test.json"
test_data = read_jsonl(test_data_file)

In [3]:
# Custom rouge helper: also yields rouge 3/4, which are not available in huggingface
def get_rouge_score(gold, pred):
    """Score one prediction against one reference with ROUGE-1/2/3/4/L.

    Args:
        gold: Reference text.
        pred: Predicted text.

    Returns:
        Dict mapping each ROUGE type name to its F-measure multiplied by 100.
    """
    rouge_types = ['rouge1', 'rouge2', 'rouge3', 'rouge4', 'rougeL']
    per_type = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True).score(gold, pred)
    percentages = {}
    for rouge_type in rouge_types:
        percentages[rouge_type] = per_type[rouge_type].fmeasure * 100
    return percentages

def compute_custom_metrics(srcs, golds, preds, device):
    """Compute corpus-level summarization metrics for one set of predictions.

    ROUGE-1/2/3/4/L (via `get_rouge_score`) and the word count are computed per
    example and averaged. BERTScore F1 is averaged over examples and multiplied
    by 100, once with the default English model and once with
    "microsoft/deberta-large-mnli". SARI is taken from the `sari` metric.

    Args:
        srcs: Source documents, one per example (only used for SARI).
        golds: Reference summaries, aligned with `srcs`.
        preds: Predicted summaries, aligned with `srcs`.
        device: Device string forwarded to BERTScore (e.g. "cuda").

    Returns:
        Plain dict with the keys 'rouge1', 'rouge2', 'rouge3', 'rouge4', 'rougeL',
        'words', 'bert_score', 'bert_score_deberta-large' and 'sari'.

    Raises:
        ValueError: If `srcs`, `golds` and `preds` differ in length.
    """
    # Fail fast: zip() below would silently truncate to the shorter input and the
    # mismatch would only surface later, after the expensive per-example pass.
    if not (len(srcs) == len(golds) == len(preds)):
        raise ValueError(
            "srcs, golds and preds must have the same length, "
            f"got {len(srcs)}, {len(golds)} and {len(preds)}"
        )

    bertscore = evaluate.load("bertscore")
    sari = evaluate.load("sari")

    # For rouge and length go over examples one by one and determine mean
    per_example = defaultdict(list)
    for gold, pred in zip(golds, preds):
        for k, v in get_rouge_score(gold, pred).items():
            per_example[k].append(v)
        # NOTE(review): split(' ') only breaks on single spaces, so newlines do not separate
        # words and repeated spaces add empty tokens. Left unchanged to keep the 'words'
        # column comparable with earlier runs.
        per_example['words'].append(len(pred.split(' ')))

    # Build a plain dict so a missing metric raises KeyError instead of silently yielding [].
    scores = {k: np.mean(v) for k, v in per_example.items()}

    # This is the default call using model_type="roberta-large"
    # This is the same as in the paper "Generation of Patient After-Visit Summaries to Support Physicians" (AVS_gen/eval_summarization.py) using the library SummerTime
    scores['bert_score'] = np.mean((bertscore.compute(predictions=preds, references=golds, lang="en", device=device))['f1']) * 100
    # BERTScore authors recommend "microsoft/deberta-large-mnli" (https://github.com/Tiiiger/bert_score)
    scores['bert_score_deberta-large'] = np.mean((bertscore.compute(predictions=preds, references=golds, device=device, model_type="microsoft/deberta-large-mnli"))['f1']) * 100
    # SARI expects a list of references per prediction, hence the single-element lists.
    scores['sari'] = sari.compute(sources=srcs, predictions=preds, references=[[g] for g in golds])['sari']
    # Importing readability for dallc score not working: https://pypi.org/project/py-readability-metrics/

    return scores

def get_metrics_as_latex(metrics):
    """Format the metrics as the cells of one LaTeX table row.

    Args:
        metrics: Mapping that contains every metric name listed in `columns`.

    Returns:
        The values in fixed column order, each rendered as `$x.xx$` and joined by ' & '.
    """
    columns = ('rouge1', 'rouge2', 'rouge3', 'rouge4', 'rougeL', 'bert_score', 'bert_score_deberta-large', 'sari', 'words')
    cells = []
    for column in columns:
        cells.append('$' + format(metrics[column], '.2f') + '$')
    return ' & '.join(cells)

In [4]:
# Source texts and gold summaries, cut to the number of examples in the first results file
srcs, golds = (
    [example[field] for example in test_data][:len(files[0])]
    for field in ("text", "summary")
)

In [6]:
# Score every results file and collect one LaTeX table row per file
results = []
for file_records in files:
    preds = [record["summary"] for record in file_records]
    raw_metrics = compute_custom_metrics(srcs, golds, preds, "cuda")
    metrics = {name: round(value, 2) for name, value in raw_metrics.items()}
    results.append(get_metrics_as_latex(metrics))
    print()

print(*results, sep='\n')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.





Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



$42.50$ & $11.95$ & $4.37$ & $2.09$ & $21.49$ & $86.30$ & $61.36$ & $45.70$ & $214.40$
$41.99$ & $12.83$ & $5.22$ & $2.26$ & $22.67$ & $86.95$ & $62.35$ & $43.55$ & $138.70$


In [None]:
# Show each example's source and gold summary followed by the summary from every results file

for i in range(len(files[0])):
    print(f"Example {i + 1}")
    print(f"Source: {srcs[i]}\n")
    print(f"Gold: {golds[i]}\n")
    for summary_no, records in enumerate(files, start=1):
        # Collapse all whitespace runs so each summary prints on a single line
        one_line = ' '.join(records[i]['summary'].split())
        print(f"Summary {summary_no}: {one_line}\n")
    print()