In [1]:
# Install required packages
!pip install torch transformers pandas tqdm sentence-transformers rouge-score sacremoses sacrebleu bert-score

# Imports
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
from transformers import BioGptTokenizer, BioGptForCausalLM
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer, util
from bert_score import score as bertscore
import sacrebleu

# Load model and tokenizer
tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
model = BioGptForCausalLM.from_pretrained("microsoft/biogpt")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load data
df = pd.read_csv("/kaggle/input/precription/health prescription data.csv")
df = df[['SUBJECT_ID', 'ROW_ID', 'HADM_ID', 'CATEGORY', 'ADMISSION_TYPE', 'DIAGNOSIS', 'TEXT']].dropna(subset=['TEXT']).reset_index(drop=True)

# Sample subset
sample_df = df.reset_index(drop=True)

# Clarification function
def clarify_text(text, max_new_tokens=200):
    prompt = f"Summarize clearly this medical report:\n\n{text}\n\nSummary:"
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=1024 - max_new_tokens,
        padding="max_length"
    ).to(device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.7,
        top_p=0.9
    )

    clarified_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return clarified_text.replace(prompt, "").strip()

# Generate summaries
clarified_summaries = []
for report in tqdm(sample_df['TEXT'], desc="Clarifying Medical Reports"):
    clarified = clarify_text(report)
    clarified_summaries.append(clarified)

sample_df['clarified_summary'] = clarified_summaries

# Initialize evaluation tools
rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
embed_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

rougeL_scores = []
cosine_scores = []
original_texts = sample_df['TEXT'].tolist()
clarified_texts = sample_df['clarified_summary'].tolist()

# Compute embeddings
original_embeddings = embed_model.encode(original_texts, convert_to_tensor=True)
clarified_embeddings = embed_model.encode(clarified_texts, convert_to_tensor=True)

# Compute ROUGE-L and Cosine Similarity
for orig, clar, orig_emb, clar_emb in tqdm(zip(original_texts, clarified_texts, original_embeddings, clarified_embeddings), total=len(original_texts)):
    rougeL_score = rouge.score(orig, clar)['rougeL'].fmeasure
    rougeL_scores.append(rougeL_score)
    cosine_sim = util.cos_sim(orig_emb, clar_emb).item()
    cosine_scores.append(cosine_sim)

avg_rougeL = np.mean(rougeL_scores)
avg_cosine_similarity = np.mean(cosine_scores)

print(f"\nAverage ROUGE-L: {avg_rougeL:.4f}")
print(f"Average Cosine Similarity: {avg_cosine_similarity:.4f}")

# BLEU evaluation
bleu = sacrebleu.corpus_bleu(clarified_texts, [original_texts])
bleu_score = bleu.score
print(f"Average BLEU: {bleu_score:.4f}")

# BERTScore evaluation
P, R, F1 = bertscore(clarified_texts, original_texts, lang="en")
avg_bertscore_f1 = F1.mean().item()
print(f"Average BERTScore F1: {avg_bertscore_f1:.4f}")

# Save results
sample_df.to_csv("clarified_medical_reports_full_evaluation.csv", index=False)
print("Saved extended evaluations to clarified_medical_reports_full_evaluation.csv")


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cud

2025-07-28 16:24:24.063292: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753719864.254696      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753719864.322010      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/595 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.56G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.56G [00:00<?, ?B/s]


Clarifying Medical Reports:   0%|          | 0/30 [00:00<?, ?it/s][A
Clarifying Medical Reports:   3%|▎         | 1/30 [00:02<01:25,  2.94s/it][A
Clarifying Medical Reports:   7%|▋         | 2/30 [00:05<01:18,  2.81s/it][A
Clarifying Medical Reports:  10%|█         | 3/30 [00:06<00:48,  1.78s/it][A
Clarifying Medical Reports:  13%|█▎        | 4/30 [00:06<00:31,  1.22s/it][A
Clarifying Medical Reports:  17%|█▋        | 5/30 [00:07<00:28,  1.14s/it][A
Clarifying Medical Reports:  20%|██        | 6/30 [00:08<00:23,  1.04it/s][A
Clarifying Medical Reports:  23%|██▎       | 7/30 [00:08<00:17,  1.34it/s][A
Clarifying Medical Reports:  27%|██▋       | 8/30 [00:08<00:12,  1.72it/s][A
Clarifying Medical Reports:  30%|███       | 9/30 [00:08<00:09,  2.21it/s][A
Clarifying Medical Reports:  33%|███▎      | 10/30 [00:11<00:21,  1.08s/it][A
Clarifying Medical Reports:  37%|███▋      | 11/30 [00:12<00:20,  1.08s/it][A
Clarifying Medical Reports:  40%|████      | 12/30 [00:12<00:14,  1.2

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 30/30 [00:07<00:00,  4.27it/s]



Average ROUGE-L: 0.5030
Average Cosine Similarity: 0.9830
Average BLEU: 9.1410


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Average BERTScore F1: 0.9008
Saved extended evaluations to clarified_medical_reports_full_evaluation.csv


In [1]:
# Install required packages
!pip install torch transformers pandas tqdm sentence-transformers rouge-score sacremoses sacrebleu bert-score --quiet

# Imports
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer, util
from bert_score import score as bertscore
import sacrebleu

# Load model and tokenizer
biomedlm_name = "stanford-crfm/BioMedLM"
tokenizer = AutoTokenizer.from_pretrained(biomedlm_name)
tokenizer.pad_token = tokenizer.eos_token

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForCausalLM.from_pretrained(biomedlm_name).to(device)
model.eval()

# Load data
df = pd.read_csv("/kaggle/input/precription/health prescription data.csv")
df = df[['SUBJECT_ID', 'ROW_ID', 'HADM_ID', 'CATEGORY', 'ADMISSION_TYPE', 'DIAGNOSIS', 'TEXT']].dropna(subset=['TEXT']).reset_index(drop=True)

# Sample subset
sample_df = df.reset_index(drop=True)

# Clarification function
def clarify_text(text, max_new_tokens=200):
    prompt = f"Summarize clearly this medical report:\n\n{text}\n\nSummary:"
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=1024 - max_new_tokens,
        padding="max_length"
    ).to(device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id
    )

    clarified_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return clarified_text.replace(prompt, "").strip()

# Generate summaries
clarified_summaries = []
for report in tqdm(sample_df['TEXT'], desc="Clarifying Medical Reports"):
    try:
        clarified = clarify_text(report)
    except Exception:
        clarified = ""
    clarified_summaries.append(clarified)

sample_df['clarified_summary'] = clarified_summaries

# Initialize evaluation tools
rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
embed_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

rougeL_scores = []
cosine_scores = []
original_texts = sample_df['TEXT'].tolist()
clarified_texts = sample_df['clarified_summary'].tolist()

# Compute embeddings
original_embeddings = embed_model.encode(original_texts, convert_to_tensor=True)
clarified_embeddings = embed_model.encode(clarified_texts, convert_to_tensor=True)

# Compute ROUGE-L and Cosine Similarity
for orig, clar, orig_emb, clar_emb in tqdm(zip(original_texts, clarified_texts, original_embeddings, clarified_embeddings), total=len(original_texts)):
    rougeL_score = rouge.score(orig, clar)['rougeL'].fmeasure
    rougeL_scores.append(rougeL_score)
    cosine_sim = util.cos_sim(orig_emb, clar_emb).item()
    cosine_scores.append(cosine_sim)

avg_rougeL = np.mean(rougeL_scores)
avg_cosine_similarity = np.mean(cosine_scores)
print(f"\nAverage ROUGE-L: {avg_rougeL:.4f}")
print(f"Average Cosine Similarity: {avg_cosine_similarity:.4f}")

# BLEU evaluation
bleu = sacrebleu.corpus_bleu(clarified_texts, [original_texts])
bleu_score = bleu.score
print(f"Average BLEU: {bleu_score:.4f}")

# BERTScore evaluation
P, R, F1 = bertscore(clarified_texts, original_texts, lang="en")
avg_bertscore_f1 = F1.mean().item()
print(f"Average BERTScore F1: {avg_bertscore_f1:.4f}")

# Show sample output
print(sample_df[['TEXT', 'clarified_summary']].head())

# Save results
sample_df.to_csv("clarified_medical_reports_full_evaluation.csv", index=False)
print("Saved extended evaluations to clarified_medical_reports_full_evaluation.csv")


  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m106.7 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m82.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m45.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━

2025-07-28 16:42:42.331072: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753720962.509901      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753720962.561092      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/267 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/876 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/10.7G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/10.7G [00:00<?, ?B/s]


Clarifying Medical Reports:   0%|          | 0/30 [00:00<?, ?it/s][A
Clarifying Medical Reports:   3%|▎         | 1/30 [02:37<1:16:04, 157.41s/it][A
Clarifying Medical Reports:   7%|▋         | 2/30 [05:13<1:13:08, 156.74s/it][A
Clarifying Medical Reports:  10%|█         | 3/30 [07:49<1:10:25, 156.51s/it][A
Clarifying Medical Reports:  13%|█▎        | 4/30 [10:26<1:07:45, 156.35s/it][A
Clarifying Medical Reports:  17%|█▋        | 5/30 [13:01<1:05:03, 156.15s/it][A
Clarifying Medical Reports:  20%|██        | 6/30 [15:37<1:02:24, 156.00s/it][A
Clarifying Medical Reports:  23%|██▎       | 7/30 [18:13<59:46, 155.92s/it]  [A
Clarifying Medical Reports:  27%|██▋       | 8/30 [20:48<57:08, 155.84s/it][A
Clarifying Medical Reports:  30%|███       | 9/30 [23:24<54:32, 155.82s/it][A
Clarifying Medical Reports:  33%|███▎      | 10/30 [26:00<51:56, 155.81s/it][A
Clarifying Medical Reports:  37%|███▋      | 11/30 [28:36<49:20, 155.79s/it][A
Clarifying Medical Reports:  40%|████      |

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 30/30 [00:06<00:00,  4.66it/s]



Average ROUGE-L: 0.4312
Average Cosine Similarity: 0.9830
Average BLEU: 6.3302


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Average BERTScore F1: 0.9876
                                                TEXT  \
0  Admission Date:  [**2192-5-20**]              ...   
1  Admission Date:  [**2137-12-5**]              ...   
2  Admission Date:  [**2152-10-19**]             ...   
3  Admission Date:  [**2129-5-23**]              ...   
4  Admission Date:  [**2184-6-23**]              ...   

                                   clarified_summary  
0  Summarize clearly this medical report:\n\nAdmi...  
1  Summarize clearly this medical report:\n\nAdmi...  
2  Summarize clearly this medical report:\n\nAdmi...  
3  Summarize clearly this medical report:\n\nAdmi...  
4  Summarize clearly this medical report:\n\nAdmi...  
Saved extended evaluations to clarified_medical_reports_full_evaluation.csv


In [1]:
# Install required packages
!pip install torch transformers pandas tqdm sentence-transformers rouge-score sacremoses sacrebleu bert-score --quiet

import pandas as pd
import torch
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer, util
from bert_score import score as bertscore
import sacrebleu

# Load MedAlpaca model and tokenizer
model_name = "medalpaca/medalpaca-7b"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
tokenizer.pad_token = tokenizer.eos_token

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto"
)
model.eval()

# Load dataset and sample subset
df = pd.read_csv("/kaggle/input/precription/health prescription data.csv")
df = df[['SUBJECT_ID', 'ROW_ID', 'HADM_ID', 'CATEGORY', 'ADMISSION_TYPE', 'DIAGNOSIS', 'TEXT']].dropna(subset=['TEXT']).reset_index(drop=True)
sample_df = df.reset_index(drop=True)

# Clarification function
def clarify_text_medalpaca(text, max_new_tokens=100):
    prompt = f"### Instruction:\nSummarize the following medical report clearly and concisely.\n\n### Input:\n{text}\n\n### Response:"
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=1024 - max_new_tokens,
        padding="max_length"
    ).to(device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id
    )

    clarified_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return clarified_text.replace(prompt, "").strip()

# Clarify medical reports
clarified_summaries = []
for report in tqdm(sample_df['TEXT'], desc="Clarifying Medical Reports"):
    try:
        clarified = clarify_text_medalpaca(report)
    except Exception:
        clarified = ""
    clarified_summaries.append(clarified)

sample_df['clarified_summary'] = clarified_summaries

# Initialize evaluation tools
rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
embed_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# Compute embeddings
original_texts = sample_df['TEXT'].tolist()
clarified_texts = sample_df['clarified_summary'].tolist()

original_embeddings = embed_model.encode(original_texts, convert_to_tensor=True, batch_size=16)
clarified_embeddings = embed_model.encode(clarified_texts, convert_to_tensor=True, batch_size=16)

rougeL_scores = []
cosine_scores = []

for orig, clar, orig_emb, clar_emb in tqdm(zip(original_texts, clarified_texts, original_embeddings, clarified_embeddings), total=len(original_texts)):
    rougeL_score = rouge.score(orig, clar)['rougeL'].fmeasure
    rougeL_scores.append(rougeL_score)
    cosine_sim = util.cos_sim(orig_emb, clar_emb).item()
    cosine_scores.append(cosine_sim)

avg_rougeL = np.mean(rougeL_scores)
avg_cosine_similarity = np.mean(cosine_scores)

print(f"\nAverage ROUGE-L: {avg_rougeL:.4f}")
print(f"Average Cosine Similarity: {avg_cosine_similarity:.4f}")

# BLEU evaluation
bleu = sacrebleu.corpus_bleu(clarified_texts, [original_texts])
print(f"Average BLEU: {bleu.score:.4f}")

# BERTScore evaluation
P, R, F1 = bertscore(clarified_texts, original_texts, lang="en")
print(f"Average BERTScore F1: {F1.mean().item():.4f}")

# Show example outputs
print(sample_df[['TEXT', 'clarified_summary']].head())

# Save to CSV
sample_df.to_csv("clarified_medical_reports_medalpaca_evaluation.csv", index=False)
print("Saved evaluations to clarified_medical_reports_medalpaca_evaluation.csv")


  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m48.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m41.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m43.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K  

2025-07-28 18:35:03.656131: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753727703.841503      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753727703.897960      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/260 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message


config.json:   0%|          | 0.00/542 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/7.18G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/9.89G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/9.88G [00:00<?, ?B/s]

The following generation flags are not valid and may be ignored: ['pad_token_id']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['pad_token_id']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Clarifying Medical Reports: 100%|██████████| 30/30 [03:53<00:00,  7.79s/it]


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 30/30 [00:06<00:00,  4.76it/s]



Average ROUGE-L: 0.4272
Average Cosine Similarity: 0.9574
Average BLEU: 4.9406


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Average BERTScore F1: 0.9752
                                                TEXT  \
0  Admission Date:  [**2192-5-20**]              ...   
1  Admission Date:  [**2137-12-5**]              ...   
2  Admission Date:  [**2152-10-19**]             ...   
3  Admission Date:  [**2129-5-23**]              ...   
4  Admission Date:  [**2184-6-23**]              ...   

                                   clarified_summary  
0  ### Instruction:\nSummarize the following medi...  
1  ### Instruction:\nSummarize the following medi...  
2  ### Instruction:\nSummarize the following medi...  
3  ### Instruction:\nSummarize the following medi...  
4  ### Instruction:\nSummarize the following medi...  
Saved evaluations to clarified_medical_reports_medalpaca_evaluation.csv


In [1]:
# Install required packages
!pip install torch transformers pandas tqdm sentence-transformers rouge-score sacremoses sacrebleu bert-score --quiet

import pandas as pd
import torch
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer, util
from bert_score import score as bertscore
import sacrebleu

# Load EleutherAI GPT-Neo model and tokenizer
model_name = "EleutherAI/gpt-neo-1.3B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
model.resize_token_embeddings(len(tokenizer))
model.eval()

# Load dataset and sample subset
df = pd.read_csv("/kaggle/input/precription/health prescription data.csv")
df = df[['SUBJECT_ID', 'ROW_ID', 'HADM_ID', 'CATEGORY', 'ADMISSION_TYPE', 'DIAGNOSIS', 'TEXT']].dropna(subset=['TEXT']).reset_index(drop=True)
sample_df = df.reset_index(drop=True)

# Clarification function for GPT-Neo
def clarify_text_gptneo(text, max_new_tokens=100):
    prompt = f"Summarize the following medical report clearly and concisely:\n{text}\nSummary:"
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=1024 - max_new_tokens,
        padding="max_length"
    ).to(device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id
    )

    clarified_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return clarified_text.replace(prompt, "").strip()

# Clarify medical reports
clarified_summaries = []
for report in tqdm(sample_df['TEXT'], desc="Clarifying Medical Reports"):
    try:
        clarified = clarify_text_gptneo(report)
    except Exception:
        clarified = ""
    clarified_summaries.append(clarified)

sample_df['clarified_summary'] = clarified_summaries

# Evaluation tools
rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
embed_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# Compute embeddings
original_texts = sample_df['TEXT'].tolist()
clarified_texts = sample_df['clarified_summary'].tolist()

original_embeddings = embed_model.encode(original_texts, convert_to_tensor=True, batch_size=16)
clarified_embeddings = embed_model.encode(clarified_texts, convert_to_tensor=True, batch_size=16)

rougeL_scores = []
cosine_scores = []

for orig, clar, orig_emb, clar_emb in tqdm(zip(original_texts, clarified_texts, original_embeddings, clarified_embeddings), total=len(original_texts)):
    rougeL_score = rouge.score(orig, clar)['rougeL'].fmeasure
    rougeL_scores.append(rougeL_score)
    cosine_sim = util.cos_sim(orig_emb, clar_emb).item()
    cosine_scores.append(cosine_sim)

avg_rougeL = np.mean(rougeL_scores)
avg_cosine_similarity = np.mean(cosine_scores)

print(f"\nAverage ROUGE-L: {avg_rougeL:.4f}")
print(f"Average Cosine Similarity: {avg_cosine_similarity:.4f}")

# BLEU evaluation
bleu = sacrebleu.corpus_bleu(clarified_texts, [original_texts])
print(f"Average BLEU: {bleu.score:.4f}")

# BERTScore evaluation
P, R, F1 = bertscore(clarified_texts, original_texts, lang="en")
print(f"Average BERTScore F1: {F1.mean().item():.4f}")

# Show example outputs
print(sample_df[['TEXT', 'clarified_summary']].head())

# Save to CSV
sample_df.to_csv("clarified_medical_reports_gptneo_evaluation.csv", index=False)
print("Saved evaluations to clarified_medical_reports_gptneo_evaluation.csv")


  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m95.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m79.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━

2025-07-29 06:23:40.237037: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753770220.395920      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753770220.443712      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/5.31G [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
Clarifying Medical Reports: 100%|██████████| 30/30 [01:29<00:00,  2.98s/it]


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 30/30 [00:07<00:00,  4.24it/s]



Average ROUGE-L: 0.4793
Average Cosine Similarity: 0.9732
Average BLEU: 8.5063


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Average BERTScore F1: 0.9827
                                                TEXT  \
0  Admission Date:  [**2192-5-20**]              ...   
1  Admission Date:  [**2137-12-5**]              ...   
2  Admission Date:  [**2152-10-19**]             ...   
3  Admission Date:  [**2129-5-23**]              ...   
4  Admission Date:  [**2184-6-23**]              ...   

                                   clarified_summary  
0  Summarize the following medical report clearly...  
1  Summarize the following medical report clearly...  
2  Summarize the following medical report clearly...  
3  Summarize the following medical report clearly...  
4  Summarize the following medical report clearly...  
Saved evaluations to clarified_medical_reports_gptneo_evaluation.csv


In [1]:
# Install required packages
!pip install torch transformers pandas tqdm sentence-transformers rouge-score sacremoses sacrebleu bert-score --quiet

import pandas as pd
import torch
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer, util
from bert_score import score as bertscore
import sacrebleu

# Load MedLLaMA2 model and tokenizer
model_name = "llSourcell/medllama2_7b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name).to("cuda" if torch.cuda.is_available() else "cpu")
model.eval()

# Load dataset and sample 30 entries
df = pd.read_csv("/kaggle/input/precription/health prescription data.csv")
df = df[['SUBJECT_ID', 'ROW_ID', 'HADM_ID', 'CATEGORY', 'ADMISSION_TYPE', 'DIAGNOSIS', 'TEXT']].dropna(subset=['TEXT']).reset_index(drop=True)
sample_df = df.reset_index(drop=True)

# Clarification function for MedLLaMA2
def clarify_text_medllama(text, max_new_tokens=100):
    prompt = f"### Instruction:\nSummarize the following medical report clearly and concisely.\n\n### Input:\n{text}\n\n### Response:"
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=1024 - max_new_tokens,
        padding="max_length"
    ).to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id
    )

    clarified_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return clarified_text.replace(prompt, "").strip()

# Clarify medical reports
clarified_summaries = []
for report in tqdm(sample_df['TEXT'], desc="Clarifying Medical Reports"):
    try:
        clarified = clarify_text_medllama(report)
    except Exception:
        clarified = ""
    clarified_summaries.append(clarified)

sample_df['clarified_summary'] = clarified_summaries

# Evaluation tools
rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
embed_model = SentenceTransformer('all-MiniLM-L6-v2', device=model.device)

# Compute embeddings
original_texts = sample_df['TEXT'].tolist()
clarified_texts = sample_df['clarified_summary'].tolist()

original_embeddings = embed_model.encode(original_texts, convert_to_tensor=True, batch_size=16)
clarified_embeddings = embed_model.encode(clarified_texts, convert_to_tensor=True, batch_size=16)

rougeL_scores = []
cosine_scores = []

for orig, clar, orig_emb, clar_emb in tqdm(zip(original_texts, clarified_texts, original_embeddings, clarified_embeddings), total=len(original_texts)):
    rougeL_score = rouge.score(orig, clar)['rougeL'].fmeasure
    rougeL_scores.append(rougeL_score)
    cosine_sim = util.cos_sim(orig_emb, clar_emb).item()
    cosine_scores.append(cosine_sim)

avg_rougeL = np.mean(rougeL_scores)
avg_cosine_similarity = np.mean(cosine_scores)

print(f"\nAverage ROUGE-L: {avg_rougeL:.4f}")
print(f"Average Cosine Similarity: {avg_cosine_similarity:.4f}")

# BLEU evaluation
bleu = sacrebleu.corpus_bleu(clarified_texts, [original_texts])
print(f"Average BLEU: {bleu.score:.4f}")

# BERTScore evaluation
P, R, F1 = bertscore(clarified_texts, original_texts, lang="en")
print(f"Average BERTScore F1: {F1.mean().item():.4f}")

# Show example outputs
print(sample_df[['TEXT', 'clarified_summary']].head())

# Save to CSV
sample_df.to_csv("clarified_medical_reports_medllama2_evaluation.csv", index=False)
print("Saved evaluations to clarified_medical_reports_medllama2_evaluation.csv")


  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m83.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m37.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m39.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━

2025-07-29 08:03:58.722906: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753776239.058304      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753776239.158088      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/434 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/610 [00:00<?, ?B/s]

pytorch_model.bin.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/174 [00:00<?, ?B/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Clarifying Medical Reports:   0%|          | 0/30 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Clarifying Medical Reports:   3%|▎         | 1/30 [04:46<2:18:41, 286.96s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Clarifying Medical Reports:   7%|▋         | 2/30 [08:52<2:02:35, 262.68s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Clarifying Medical Reports:  10%|█         | 3/30 [12:52<1:53:32,

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 30/30 [00:07<00:00,  4.07it/s]



Average ROUGE-L: 0.4287
Average Cosine Similarity: 0.9574
Average BLEU: 5.0633


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Average BERTScore F1: 0.9752
                                                TEXT  \
0  Admission Date:  [**2192-5-20**]              ...   
1  Admission Date:  [**2137-12-5**]              ...   
2  Admission Date:  [**2152-10-19**]             ...   
3  Admission Date:  [**2129-5-23**]              ...   
4  Admission Date:  [**2184-6-23**]              ...   

                                   clarified_summary  
0  ### Instruction:\nSummarize the following medi...  
1  ### Instruction:\nSummarize the following medi...  
2  ### Instruction:\nSummarize the following medi...  
3  ### Instruction:\nSummarize the following medi...  
4  ### Instruction:\nSummarize the following medi...  
Saved evaluations to clarified_medical_reports_medllama2_evaluation.csv
