In [1]:
import pandas as pd
import re
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer
from transformers import BertTokenizer, BertForMaskedLM, BertModel
from bert_score import BERTScorer

def extract_text(results):
    pattern = re.compile(r'Response\s*:\s*({.*?})', re.DOTALL)
    extracted_texts = []
    count = 0
    for text in results:
        match = re.search(pattern, text)
        if match:
            extracted_texts.append(match.group(1))
        else:
            extracted_texts.append("")
            count += 1
    print(f"Pattern not found for {count} results.")
    return extracted_texts

def calculate_bleu_score(machine_results, reference_texts):
    bleu_score = corpus_bleu([[ref.split()] for ref in reference_texts], [gen.split() for gen in machine_results])
    print(f'BLEU Score: {bleu_score}')

def calculate_rouge_scores(generated_answers, ground_truth):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    total_rouge1, total_rouge2, total_rougeL = 0, 0, 0
    for gen, ref in zip(generated_answers, ground_truth):
        scores = scorer.score(gen, ref)
        total_rouge1 += scores['rouge1'].fmeasure
        total_rouge2 += scores['rouge2'].fmeasure
        total_rougeL += scores['rougeL'].fmeasure
    average_rouge1 = total_rouge1 / len(generated_answers)
    average_rouge2 = total_rouge2 / len(generated_answers)
    average_rougeL = total_rougeL / len(generated_answers)
    print(f'Average ROUGE-1: {average_rouge1}')
    print(f'Average ROUGE-2: {average_rouge2}')
    print(f'Average ROUGE-L: {average_rougeL}')

def calculate_bert_score(generated_answers, ground_truth):
    scorer = BERTScorer(model_type='bert-base-uncased')
    P, R, F1 = scorer.score(generated_answers, ground_truth)
    # for i, (p, r, f1) in enumerate(zip(P, R, F1)):
    #     print(f"Pair {i + 1} - BERTScore Precision: {p.mean():.4f}, Recall: {r.mean():.4f}, F1: {f1.mean():.4f}")
    avg_precision = sum(p.mean() for p in P) / len(P)
    avg_recall = sum(r.mean() for r in R) / len(R)
    avg_f1 = sum(f1.mean() for f1 in F1) / len(F1)
    print(f"\nAverage BERTScore - Precision: {avg_precision:.4f}, Recall: {avg_recall:.4f}, F1: {avg_f1:.4f}")


In [3]:
# Read data
df = pd.read_csv('data_final_test.csv')
print(len(df))
machine_results_Finetuned = list(df["Result_Finetuned"])
machine_results_Non_Finetuned = list(df["Result_Non_Finetuned"])
reference_texts = list(df["Ground_Truth"])



800


In [7]:
machine_results_Finetuned[10]

"[INST]Answer the following question using only the text delimited by triple backticks below. What is the patient’s cancer diagnosis? Include only the diagnosis and the laterality. Output the response in JSON format with a single key called 'Diagnosis'. If you do not know the answer to the question, reply with 'Not Reported'.```PATIENT REVIEWED C/O CA BREAST + CA OVARY ON BEV MAINTANCE NO FRESH COMPLAINTS PLT COUNT / PT-INR / APTT/ URINE R/M : NORMAL ADVICE: REVIEW WITH PET CT SCAN REPORT ```[/INST] Response: {'Diagnosis': 'Carcinoma Right Breast'}"

In [8]:
machine_results_Non_Finetuned[10]

'[INST]Answer the following question using only the text delimited by triple backticks below. What is the patient’s cancer diagnosis? Include only the diagnosis and the laterality. Output the response in JSON format with a single key called \'Diagnosis\'. If you do not know the answer to the question, reply with \'Not Reported\'.```PATIENT REVIEWED C/O CA BREAST + CA OVARY ON BEV MAINTANCE NO FRESH COMPLAINTS PLT COUNT / PT-INR / APTT/ URINE R/M : NORMAL ADVICE: REVIEW WITH PET CT SCAN REPORT ```[/INST] Response:\n\n{\n"Diagnosis": {\n"Cancer": "Breast and Ovary",\n"Laterality": "Bilateral"\n}\n}'

In [9]:
reference_texts[10]

"{'Diagnosis': '1.CA BREAST (2018) POST SURGERY POST HORMONAL THERAPY 2.(2019 )HIGH GRADE SEROUS CARCINOMA OVARY'}"

In [10]:
# Extract text from results
machine_results_Finetuned_Copy = extract_text(machine_results_Finetuned)
machine_results_Non_Finetuned_Copy = extract_text(machine_results_Non_Finetuned)


Pattern not found for 230 results.
Pattern not found for 278 results.


In [11]:
machine_results_Non_Finetuned_Copy

['{\n"Diagnosis": {\n"Cancer": "Recurrent CA Ovary",\n"Laterality": "Not Reported"\n}',
 '{\n"Diagnosis": {\n"Laterality": "Ovary",\n"Type": "CIS/Gem"\n}',
 '{\n"Diagnosis": {\n"Cancer": "Not Reported"\n}',
 '{\n"Diagnosis": {\n"Cancer": "PR",\n"Laterality": "provisional"\n}',
 '{\n"Diagnosis": {\n"Cancer": {\n"Laterality": "Breast",\n"Metastatic": true\n}',
 '{\n"Diagnosis": {\n"SR B-ALL": {\n"Laterality": "IIA"\n}',
 '{\n"Diagnosis": {\n"Cancer Type": "CARCINOMA",\n"Laterality": "LEFT"\n}',
 '{\n"Diagnosis": {\n"Cancer": {\n"Laterality": "Left"\n}',
 '{\n"Diagnosis": {\n"T-NHL Stage (BM +)": {\n"Laterality": "Not Reported"\n}',
 '{\n"Diagnosis": {\n"Cancer Type": "Breast Cancer",\n"Laterality": "Right"\n}',
 '{\n"Diagnosis": {\n"Cancer": "Breast and Ovary",\n"Laterality": "Bilateral"\n}',
 '{\n"Diagnosis": {\n"Laterality": "Both",\n"Type": "Breast Cancer"\n}',
 '{\n"Diagnosis": {\n"SR B-ALL": {\n"Laterality": "IIA"\n}',
 '{\n"Diagnosis": "Recurrent Metastatic Breast Cancer (Stage 4)"

In [12]:
# Calculate BLEU score
print("BLEU Score for Finetuned:")
calculate_bleu_score(machine_results_Finetuned_Copy, reference_texts)
print("BLEU Score for Non-Finetuned:")
calculate_bleu_score(machine_results_Non_Finetuned_Copy, reference_texts)



BLEU Score for Finetuned:
BLEU Score: 0.2285170809637519
BLEU Score for Non-Finetuned:
BLEU Score: 0.022782186423563496


In [13]:
# Calculate ROUGE scores
print("ROUGE Scores for Finetuned:")
calculate_rouge_scores(machine_results_Finetuned_Copy, reference_texts)
print("ROUGE Scores for Non-Finetuned:")
calculate_rouge_scores(machine_results_Non_Finetuned_Copy, reference_texts)



ROUGE Scores for Finetuned:
Average ROUGE-1: 0.5826536223554938
Average ROUGE-2: 0.5200221612761202
Average ROUGE-L: 0.5766581341870203
ROUGE Scores for Non-Finetuned:
Average ROUGE-1: 0.3709419876455277
Average ROUGE-2: 0.25890286420600533
Average ROUGE-L: 0.3652855860585418


In [14]:
# Calculate BERTScore
print("BERTScores for Finetuned:")
calculate_bert_score(machine_results_Finetuned_Copy, reference_texts)


BERTScores for Finetuned:

Average BERTScore - Precision: 0.6566, Recall: 0.6599, F1: 0.6559




In [15]:
print("BERTScores for Non-Finetuned:")
calculate_bert_score(machine_results_Non_Finetuned_Copy, reference_texts)

BERTScores for Non-Finetuned:

Average BERTScore - Precision: 0.4932, Recall: 0.5208, F1: 0.5041


