In [1]:
import pandas as pd

# Load data
file_path = '/kaggle/input/lab-test-results/Thyroid_Diff.csv'
data = pd.read_csv(file_path)

# Function to convert row to text summary excluding Stage and Response
def create_text_for_stage_response(row):
    return (
        f"Age: {row['Age']}, Gender: {row['Gender']}, Smoking: {row['Smoking']}, "
        f"History of Smoking: {row['Hx Smoking']}, History of Radiotherapy: {row['Hx Radiothreapy']}, "
        f"Thyroid Function: {row['Thyroid Function']}, Physical Examination: {row['Physical Examination']}, "
        f"Adenopathy: {row['Adenopathy']}, Pathology: {row['Pathology']}, "
        f"Focality: {row['Focality']}, Risk: {row['Risk']}, "
        f"T: {row['T']}, N: {row['N']}, M: {row['M']}."
        " Predict the Stage and Response."
    )

# Apply to all rows
data['Text_Summary'] = data.apply(create_text_for_stage_response, axis=1)

# Select columns for output
output_data = data[['Text_Summary', 'Stage', 'Response']]

# Save the new dataset to CSV
output_path = '/kaggle/working/agle_output_stage_response.csv'
output_data.to_csv(output_path, index=False)

print(f"Data saved to {output_path}")


Data saved to /kaggle/working/agle_output_stage_response.csv


In [2]:
!pip install transformers torch rouge-score scikit-learn sacremoses -q nltk bert-score

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m93.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m71.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01

In [None]:
# Step 1: Install required packages
# Already assumed installed

# Step 2: Imports
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from rouge_score import rouge_scorer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
from bert_score import score as bert_score
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')

# Step 3: Load dataset
data = pd.read_csv('agle_output_stage_response.csv')

# Step 4: Load llSourcell MedLLaMA2 7B model and tokenizer (on CPU)
device = torch.device("cpu")
model_name = "llSourcell/medllama2_7b"

tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model = AutoModelForCausalLM.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))
model.to(device)
model.eval()

# Step 5: Text generation
def generate_prediction(text, max_new_tokens=50):
    inputs = tokenizer.encode(text, return_tensors="pt", truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_new_tokens=max_new_tokens,
            num_beams=5,
            early_stopping=True,
            pad_token_id=tokenizer.pad_token_id
        )
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return decoded

# Step 6: Embedding extraction using hidden states
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model.base_model(**inputs, output_hidden_states=True, return_dict=True)
    last_hidden_state = outputs.hidden_states[-1]
    embedding = last_hidden_state.mean(dim=1).cpu().numpy()
    return embedding

# Step 7: Initialize scorers
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
rouge_l_scores, cosine_sims, bleu_scores, bert_scores = [], [], [], []

# Step 8: Evaluation loop
for idx, row in data.head(30).iterrows():
    input_text = row['Text_Summary']
    true_stage = row['Stage']
    true_response = row['Response']

    pred_text = generate_prediction(input_text)

    # Attempt to extract predicted stage and response
    pred_stage, pred_response = '', ''
    for part in pred_text.split('.'):
        if 'stage' in part.lower():
            pred_stage = part.strip()
        if 'response' in part.lower():
            pred_response = part.strip()

    reference_text = f"Stage: {true_stage}. Response: {true_response}."
    prediction_text = f"{pred_stage}. {pred_response}"

    # ROUGE-L
    score_rouge = scorer.score(reference_text, prediction_text)
    rouge_l_scores.append(score_rouge['rougeL'].fmeasure)

    # Cosine similarity
    emb_ref = get_embedding(reference_text)
    emb_pred = get_embedding(prediction_text)
    cos_sim = cosine_similarity(emb_ref, emb_pred)[0][0]
    cosine_sims.append(cos_sim)

    # BLEU
    ref_tokens = word_tokenize(reference_text)
    pred_tokens = word_tokenize(prediction_text)
    bleu = sentence_bleu([ref_tokens], pred_tokens, smoothing_function=SmoothingFunction().method1)
    bleu_scores.append(bleu)

    # BERTScore
    P, R, F1 = bert_score([prediction_text], [reference_text], lang="en", verbose=False, device="cpu")
    bert_scores.append(F1[0].item())

    # Print individual result
    print(f"Input: {input_text}")
    print(f"Prediction: {prediction_text}")
    print(f"Reference: {reference_text}")
    print(f"ROUGE-L: {score_rouge['rougeL'].fmeasure:.4f}, Cosine Sim: {cos_sim:.4f}, BLEU: {bleu:.4f}, BERTScore: {F1[0].item():.4f}")
    print("-" * 80)

# Step 9: Final averages
print(f"\nAverage ROUGE-L: {np.mean(rouge_l_scores):.4f}")
print(f"Average Cosine Similarity: {np.mean(cosine_sims):.4f}")
print(f"Average BLEU: {np.mean(bleu_scores):.4f}")
print(f"Average BERTScore: {np.mean(bert_scores):.4f}")


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


tokenizer_config.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/434 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/610 [00:00<?, ?B/s]

2025-07-28 11:39:24.484151: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753702764.737302      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753702764.806747      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/174 [00:00<?, ?B/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The attention mask is not set and cannot be inferred from 

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: Age: 34, Gender: F, Smoking: No, History of Smoking: Yes, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.5045, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: Age: 30, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.5045, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: Age: 62, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.5045, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: Age: 62, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Multi-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.5045, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: Age: 52, Gender: M, Smoking: Yes, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Multi-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Indeterminate.
ROUGE-L: 0.2857, Cosine Sim: 0.5573, BLEU: 0.0248, BERTScore: 0.8433
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: Age: 41, Gender: F, Smoking: No, History of Smoking: Yes, History of Radiotherapy: No, Thyroid Function: Clinical Hyperthyroidism, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.5045, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: Age: 46, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.5045, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: Age: 51, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.5045, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: Age: 40, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.5045, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: Age: 75, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.5045, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: Age: 59, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-left, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.5045, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: Age: 49, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.5045, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: Age: 50, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Clinical Hyperthyroidism, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.5045, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: Age: 76, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Clinical Hypothyroidism, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.5045, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: Age: 42, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Indeterminate.
ROUGE-L: 0.2857, Cosine Sim: 0.5573, BLEU: 0.0248, BERTScore: 0.8433
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: Age: 40, Gender: F, Smoking: No, History of Smoking: Yes, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.5045, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: Age: 44, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Indeterminate.
ROUGE-L: 0.2857, Cosine Sim: 0.5573, BLEU: 0.0248, BERTScore: 0.8433
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: Age: 43, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.5045, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: Age: 52, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Multi-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Indeterminate.
ROUGE-L: 0.2857, Cosine Sim: 0.5573, BLEU: 0.0248, BERTScore: 0.8433
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: Age: 41, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-left, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.5045, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


In [3]:
# Step 1: Install required packages
!pip install -q transformers torch rouge-score scikit-learn sacremoses nltk bert-score

# Step 2: Imports
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from rouge_score import rouge_scorer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
from bert_score import score as bert_score
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')

# Step 3: Load dataset
data = pd.read_csv('agle_output_stage_response.csv')

# Step 4: Load EleutherAI GPT-Neo 1.3B model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
model.resize_token_embeddings(len(tokenizer))
model = model.to("cuda" if torch.cuda.is_available() else "cpu")
model.eval()

# Step 5: Text generation
def generate_prediction(text, max_new_tokens=50):
    inputs = tokenizer.encode(text, return_tensors="pt", truncation=True, max_length=512).to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_new_tokens=max_new_tokens,
            num_beams=5,
            early_stopping=True,
            pad_token_id=tokenizer.pad_token_id
        )
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return decoded

# Step 6: Embedding extraction using hidden states
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(model.device)
    with torch.no_grad():
        outputs = model.base_model(**inputs, output_hidden_states=True)
    last_hidden_state = outputs.hidden_states[-1]
    embedding = last_hidden_state.mean(dim=1).cpu().numpy()
    return embedding

# Step 7: Initialize scorers
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
rouge_l_scores, cosine_sims, bleu_scores, bert_scores = [], [], [], []

# Step 8: Evaluation loop
for idx, row in data.head(30).iterrows():
    input_text = row['Text_Summary']
    true_stage = row['Stage']
    true_response = row['Response']

    # Generate prediction
    pred_text = generate_prediction(input_text)

    # Extract predicted stage and response
    pred_stage, pred_response = '', ''
    for part in pred_text.split('.'):
        if 'stage' in part.lower():
            pred_stage = part.strip()
        if 'response' in part.lower():
            pred_response = part.strip()

    reference_text = f"Stage: {true_stage}. Response: {true_response}."
    prediction_text = f"{pred_stage}. {pred_response}"

    # ROUGE-L
    score_rouge = scorer.score(reference_text, prediction_text)
    rouge_l_scores.append(score_rouge['rougeL'].fmeasure)

    # Cosine similarity
    emb_ref = get_embedding(reference_text)
    emb_pred = get_embedding(prediction_text)
    cos_sim = cosine_similarity(emb_ref, emb_pred)[0][0]
    cosine_sims.append(cos_sim)

    # BLEU
    ref_tokens = word_tokenize(reference_text)
    pred_tokens = word_tokenize(prediction_text)
    bleu = sentence_bleu([ref_tokens], pred_tokens, smoothing_function=SmoothingFunction().method1)
    bleu_scores.append(bleu)

    # BERTScore
    P, R, F1 = bert_score([prediction_text], [reference_text], lang="en", verbose=False)
    bert_scores.append(F1[0].item())

    # Print results
    print(f"Input: {input_text}")
    print(f"Prediction: {prediction_text}")
    print(f"Reference: {reference_text}")
    print(f"ROUGE-L: {score_rouge['rougeL'].fmeasure:.4f}, Cosine Sim: {cos_sim:.4f}, BLEU: {bleu:.4f}, BERTScore: {F1[0].item():.4f}")
    print("-" * 80)

# Step 9: Final averages
print(f"\nAverage ROUGE-L: {np.mean(rouge_l_scores):.4f}")
print(f"Average Cosine Similarity: {np.mean(cosine_sims):.4f}")
print(f"Average BLEU: {np.mean(bleu_scores):.4f}")
print(f"Average BERTScore: {np.mean(bert_scores):.4f}")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


tokenizer_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

2025-07-28 06:28:55.702527: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753684135.919598      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753684135.986267      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/5.31G [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 27, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-left, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Indeterminate.
ROUGE-L: 0.2857, Cosine Sim: 0.8785, BLEU: 0.0248, BERTScore: 0.8433
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 34, Gender: F, Smoking: No, History of Smoking: Yes, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8977, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 30, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8977, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 62, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8977, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 62, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Multi-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8977, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 52, Gender: M, Smoking: Yes, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Multi-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Indeterminate.
ROUGE-L: 0.2857, Cosine Sim: 0.8785, BLEU: 0.0248, BERTScore: 0.8433
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 41, Gender: F, Smoking: No, History of Smoking: Yes, History of Radiotherapy: No, Thyroid Function: Clinical Hyperthyroidism, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8977, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 46, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8977, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 51, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8977, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 40, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8977, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 75, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8977, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 59, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-left, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8977, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 49, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8977, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 50, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Clinical Hyperthyroidism, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8977, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 76, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Clinical Hypothyroidism, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8977, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 42, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Indeterminate.
ROUGE-L: 0.2857, Cosine Sim: 0.8785, BLEU: 0.0248, BERTScore: 0.8433
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 40, Gender: F, Smoking: No, History of Smoking: Yes, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8977, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 44, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Indeterminate.
ROUGE-L: 0.2857, Cosine Sim: 0.8785, BLEU: 0.0248, BERTScore: 0.8433
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 43, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8977, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 52, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Multi-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Indeterminate.
ROUGE-L: 0.2857, Cosine Sim: 0.8785, BLEU: 0.0248, BERTScore: 0.8433
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 41, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-left, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8977, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 44, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-left, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8977, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 36, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8977, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 70, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8977, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 60, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Multi-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8977, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 33, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8977, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 43, Gender: M, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Subclinical Hyperthyroidism, Physical Examination: Single nodular goiter-left, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8977, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 26, Gender: M, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-left, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8977, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 41, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8977, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 37, Gender: F, Smoking: No, History of Smoking: Yes, History of Radiotherapy: No, Thyroid Function: Subclinical Hypothyroidism, Physical Examination: Normal, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8977, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------

Average ROUGE-L: 0.2857
Average Cosine Similarity: 0.8945
Average BLEU: 0.0248
Average BERTScore: 0.8415


In [3]:
# Step 1: Install required packages
!pip install -q transformers torch rouge-score scikit-learn sacremoses nltk bert-score

# Step 2: Imports
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from rouge_score import rouge_scorer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
from bert_score import score as bert_score
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')

# Step 3: Load dataset
data = pd.read_csv('agle_output_stage_response.csv')

# Step 4: Load MedAlpaca model and tokenizer
model_name = "medalpaca/medalpaca-7b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # required for LLaMA/MedAlpaca
    device_map="auto"
)
model.eval()

# Step 5: Text generation function
def generate_prediction(text, max_new_tokens=50):
    inputs = tokenizer.encode(text, return_tensors="pt", truncation=True, max_length=512).to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_new_tokens=max_new_tokens,
            num_beams=5,
            early_stopping=True,
            pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id else tokenizer.eos_token_id
        )
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return decoded

# Step 6: Embedding extraction using hidden states
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(model.device)
    with torch.no_grad():
        outputs = model.base_model(**inputs, output_hidden_states=True)
    last_hidden_state = outputs.hidden_states[-1]  # (batch, seq_len, hidden_size)
    embedding = last_hidden_state.mean(dim=1).cpu().numpy()  # mean pooling
    return embedding

# Step 7: Evaluation setup
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
rouge_l_scores, cosine_sims, bleu_scores, bert_scores = [], [], [], []

# Step 8: Evaluation loop
for idx, row in data.head(30).iterrows():
    input_text = row['Text_Summary']
    true_stage = row['Stage']
    true_response = row['Response']

    # Generate prediction
    pred_text = generate_prediction(input_text)

    # Extract predicted stage and response
    pred_stage, pred_response = '', ''
    for part in pred_text.split('.'):
        if 'stage' in part.lower():
            pred_stage = part.strip()
        if 'response' in part.lower():
            pred_response = part.strip()

    reference_text = f"Stage: {true_stage}. Response: {true_response}."
    prediction_text = f"{pred_stage}. {pred_response}"

    # ROUGE-L
    score_rouge = scorer.score(reference_text, prediction_text)
    rouge_l_scores.append(score_rouge['rougeL'].fmeasure)

    # Cosine similarity
    emb_ref = get_embedding(reference_text)
    emb_pred = get_embedding(prediction_text)
    cos_sim = cosine_similarity(emb_ref, emb_pred)[0][0]
    cosine_sims.append(cos_sim)

    # BLEU
    ref_tokens = word_tokenize(reference_text)
    pred_tokens = word_tokenize(prediction_text)
    bleu = sentence_bleu([ref_tokens], pred_tokens, smoothing_function=SmoothingFunction().method1)
    bleu_scores.append(bleu)

    # BERTScore
    P, R, F1 = bert_score([prediction_text], [reference_text], lang="en", verbose=False)
    bert_scores.append(F1[0].item())

    # Print results
    print(f"Input: {input_text}")
    print(f"Prediction: {prediction_text}")
    print(f"Reference: {reference_text}")
    print(f"ROUGE-L: {score_rouge['rougeL'].fmeasure:.4f}, Cosine Sim: {cos_sim:.4f}, BLEU: {bleu:.4f}, BERTScore: {F1[0].item():.4f}")
    print("-" * 80)

# Step 9: Print average metrics
print(f"\nAverage ROUGE-L: {np.mean(rouge_l_scores):.4f}")
print(f"Average Cosine Similarity: {np.mean(cosine_sims):.4f}")
print(f"Average BLEU: {np.mean(bleu_scores):.4f}")
print(f"Average BERTScore: {np.mean(bert_scores):.4f}")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


tokenizer_config.json:   0%|          | 0.00/260 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggin

config.json:   0%|          | 0.00/542 [00:00<?, ?B/s]

2025-07-28 06:16:54.990249: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753683415.227642      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753683415.295578      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/7.18G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/9.88G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/9.89G [00:00<?, ?B/s]

The following generation flags are not valid and may be ignored: ['pad_token_id']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['pad_token_id']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 27, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-left, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Indeterminate.
ROUGE-L: 0.2857, Cosine Sim: 0.4891, BLEU: 0.0248, BERTScore: 0.8433
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 34, Gender: F, Smoking: No, History of Smoking: Yes, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Stage: Stage IA, Response: Unknown. Stage: Stage IA, Response: Unknown
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.4454, BLEU: 0.0364, BERTScore: 0.8478
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 30, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Stage: Stage IA, Response: Unknown. Stage: Stage IA, Response: Unknown
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.4454, BLEU: 0.0364, BERTScore: 0.8478
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 62, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Stage: Stage IA, Response: Unknown. Stage: Stage IA, Response: Unknown
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.4454, BLEU: 0.0364, BERTScore: 0.8478
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 62, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Multi-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Stage: Stage IA, Response: Unknown. Stage: Stage IA, Response: Unknown
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.4454, BLEU: 0.0364, BERTScore: 0.8478
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 52, Gender: M, Smoking: Yes, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Multi-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Stage: IA, Response: Unknown. Stage: IA, Response: Unknown
Reference: Stage: I. Response: Indeterminate.
ROUGE-L: 0.3333, Cosine Sim: 0.7904, BLEU: 0.0418, BERTScore: 0.8868
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 41, Gender: F, Smoking: No, History of Smoking: Yes, History of Radiotherapy: No, Thyroid Function: Clinical Hyperthyroidism, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.4478, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 46, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Stage: Stage IA, Response: Unknown. Stage: Stage IA, Response: Unknown
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.4454, BLEU: 0.0364, BERTScore: 0.8478
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 51, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Stage: Stage IA, Response: Unknown. Stage: Stage IA, Response: Unknown
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.4454, BLEU: 0.0364, BERTScore: 0.8478
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 40, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Stage: Stage IA, Response: Unknown. Stage: Stage IA, Response: Unknown
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.4454, BLEU: 0.0364, BERTScore: 0.8478
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 75, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Stage: Stage IA, Response: Unknown. Stage: Stage IA, Response: Unknown
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.4454, BLEU: 0.0364, BERTScore: 0.8478
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 59, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-left, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Stage: Stage IA, Response: Unknown. Stage: Stage IA, Response: Unknown
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.4454, BLEU: 0.0364, BERTScore: 0.8478
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 49, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Stage: Stage IA, Response: Unknown. Stage: Stage IA, Response: Unknown
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.4454, BLEU: 0.0364, BERTScore: 0.8478
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 50, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Clinical Hyperthyroidism, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.4478, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 76, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Clinical Hypothyroidism, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.4478, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 42, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Stage: Stage IA, Response: Unknown. Stage: Stage IA, Response: Unknown
Reference: Stage: I. Response: Indeterminate.
ROUGE-L: 0.2857, Cosine Sim: 0.4888, BLEU: 0.0364, BERTScore: 0.8703
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 40, Gender: F, Smoking: No, History of Smoking: Yes, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Stage: Stage IA, Response: Unknown. Stage: Stage IA, Response: Unknown
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.4454, BLEU: 0.0364, BERTScore: 0.8478
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 44, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Stage: Stage IA, Response: Unknown. Stage: Stage IA, Response: Unknown
Reference: Stage: I. Response: Indeterminate.
ROUGE-L: 0.2857, Cosine Sim: 0.4888, BLEU: 0.0364, BERTScore: 0.8703
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 43, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Stage: Stage IA, Response: Unknown. Stage: Stage IA, Response: Unknown
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.4454, BLEU: 0.0364, BERTScore: 0.8478
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 52, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Multi-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Indeterminate.
ROUGE-L: 0.2857, Cosine Sim: 0.4891, BLEU: 0.0248, BERTScore: 0.8433
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 41, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-left, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Stage: Stage IA, Response: Unknown. Stage: Stage IA, Response: Unknown
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.4454, BLEU: 0.0364, BERTScore: 0.8478
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 44, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-left, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Stage: Stage IA, Response: Unknown. Stage: Stage IA, Response: Unknown
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.4454, BLEU: 0.0364, BERTScore: 0.8478
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 36, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Stage: Stage IA, Response: Unknown. Stage: Stage IA, Response: Unknown
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.4454, BLEU: 0.0364, BERTScore: 0.8478
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 70, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Stage: Stage IA, Response: Unknown. Stage: Stage IA, Response: Unknown
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.4454, BLEU: 0.0364, BERTScore: 0.8478
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 60, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Multi-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.4478, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 33, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Stage: Stage IA, Response: Unknown. Stage: Stage IA, Response: Unknown
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.4454, BLEU: 0.0364, BERTScore: 0.8478
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 43, Gender: M, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Subclinical Hyperthyroidism, Physical Examination: Single nodular goiter-left, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.4478, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 26, Gender: M, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-left, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.4478, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 41, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Stage: Stage IA, Response: Unknown. Stage: Stage IA, Response: Unknown
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.4454, BLEU: 0.0364, BERTScore: 0.8478
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 37, Gender: F, Smoking: No, History of Smoking: Yes, History of Radiotherapy: No, Thyroid Function: Subclinical Hypothyroidism, Physical Examination: Normal, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.4478, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------

Average ROUGE-L: 0.2873
Average Cosine Similarity: 0.4632
Average BLEU: 0.0331
Average BERTScore: 0.8487


In [3]:
# Install required packages
!pip install -q transformers torch rouge-score scikit-learn sacremoses nltk bert-score

# Imports
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from rouge_score import rouge_scorer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
from bert_score import score as bert_score
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')

# Load dataset
data = pd.read_csv('agle_output_stage_response.csv')

# Load BioGPT model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")
model = AutoModelForCausalLM.from_pretrained("microsoft/biogpt")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Text generation
def generate_prediction(text, max_new_tokens=50):
    inputs = tokenizer.encode(text, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_new_tokens=max_new_tokens,
            num_beams=5,
            early_stopping=True
        )
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return decoded

# Embedding extraction using BioGPT
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model.base_model(**inputs, output_hidden_states=True)
    last_hidden_state = outputs.hidden_states[-1]
    embedding = last_hidden_state.mean(dim=1).cpu().numpy()
    return embedding

# Initialize evaluators
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
rouge_l_scores, cosine_sims, bleu_scores, bert_scores = [], [], [], []

# Evaluation loop
for idx, row in data.head(30).iterrows():
    input_text = row['Text_Summary']
    true_stage = row['Stage']
    true_response = row['Response']

    # Generate prediction
    pred_text = generate_prediction(input_text)

    # Extract stage and response from prediction
    pred_stage, pred_response = '', ''
    for part in pred_text.split('.'):
        if 'stage' in part.lower():
            pred_stage = part.strip()
        if 'response' in part.lower():
            pred_response = part.strip()

    reference_text = f"Stage: {true_stage}. Response: {true_response}."
    prediction_text = f"{pred_stage}. {pred_response}"

    # ROUGE-L
    score_rouge = scorer.score(reference_text, prediction_text)
    rouge_l_scores.append(score_rouge['rougeL'].fmeasure)

    # Cosine similarity
    emb_ref = get_embedding(reference_text)
    emb_pred = get_embedding(prediction_text)
    cos_sim = cosine_similarity(emb_ref, emb_pred)[0][0]
    cosine_sims.append(cos_sim)

    # BLEU score
    ref_tokens = word_tokenize(reference_text)
    pred_tokens = word_tokenize(prediction_text)
    bleu = sentence_bleu([ref_tokens], pred_tokens, smoothing_function=SmoothingFunction().method1)
    bleu_scores.append(bleu)

    # BERTScore (use single-sample batching)
    P, R, F1 = bert_score([prediction_text], [reference_text], lang="en", verbose=False)
    bert_scores.append(F1[0].item())

    # Print results
    print(f"Input: {input_text}")
    print(f"Prediction: {prediction_text}")
    print(f"Reference: {reference_text}")
    print(f"ROUGE-L: {score_rouge['rougeL'].fmeasure:.4f}, Cosine Sim: {cos_sim:.4f}, BLEU: {bleu:.4f}, BERTScore: {F1[0].item():.4f}")
    print("-" * 80)

# Final averages
print(f"\nAverage ROUGE-L: {np.mean(rouge_l_scores):.4f}")
print(f"Average Cosine Similarity: {np.mean(cosine_sims):.4f}")
print(f"Average BLEU: {np.mean(bleu_scores):.4f}")
print(f"Average BERTScore: {np.mean(bert_scores):.4f}")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


config.json:   0%|          | 0.00/595 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

2025-07-28 05:43:32.402377: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753681412.629269      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753681412.691015      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/1.56G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.56G [00:00<?, ?B/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 27, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-left, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Indeterminate.
ROUGE-L: 0.2857, Cosine Sim: 0.9119, BLEU: 0.0248, BERTScore: 0.8433
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 34, Gender: F, Smoking: No, History of Smoking: Yes, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.9222, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 30, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.9222, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 62, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.9222, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 62, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Multi-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.9222, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 52, Gender: M, Smoking: Yes, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Multi-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Indeterminate.
ROUGE-L: 0.2857, Cosine Sim: 0.9119, BLEU: 0.0248, BERTScore: 0.8433
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 41, Gender: F, Smoking: No, History of Smoking: Yes, History of Radiotherapy: No, Thyroid Function: Clinical Hyperthyroidism, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.9222, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 46, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.9222, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 51, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.9222, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 40, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.9222, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 75, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.9222, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 59, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-left, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.9222, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 49, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.9222, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 50, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Clinical Hyperthyroidism, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.9222, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 76, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Clinical Hypothyroidism, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.9222, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 42, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Indeterminate.
ROUGE-L: 0.2857, Cosine Sim: 0.9119, BLEU: 0.0248, BERTScore: 0.8433
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 40, Gender: F, Smoking: No, History of Smoking: Yes, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.9222, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 44, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Indeterminate.
ROUGE-L: 0.2857, Cosine Sim: 0.9119, BLEU: 0.0248, BERTScore: 0.8433
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 43, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.9222, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 52, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Multi-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Indeterminate.
ROUGE-L: 0.2857, Cosine Sim: 0.9119, BLEU: 0.0248, BERTScore: 0.8433
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 41, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-left, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.9222, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 44, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-left, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.9222, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 36, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.9222, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 70, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.9222, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 60, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Multi-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.9222, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 33, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.9222, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 43, Gender: M, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Subclinical Hyperthyroidism, Physical Examination: Single nodular goiter-left, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.9222, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 26, Gender: M, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-left, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.9222, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 41, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.9222, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 37, Gender: F, Smoking: No, History of Smoking: Yes, History of Radiotherapy: No, Thyroid Function: Subclinical Hypothyroidism, Physical Examination: Normal, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.9222, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------

Average ROUGE-L: 0.2857
Average Cosine Similarity: 0.9205
Average BLEU: 0.0248
Average BERTScore: 0.8415


In [3]:
# Install required packages
!pip install -q transformers torch rouge-score scikit-learn sacremoses nltk bert-score

# Imports
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from rouge_score import rouge_scorer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
from bert_score import score as bert_score
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')

# Load dataset
data = pd.read_csv('agle_output_stage_response.csv')

# Load BioMedLM model and tokenizer
biomedlm_name = "stanford-crfm/BioMedLM"
tokenizer = AutoTokenizer.from_pretrained(biomedlm_name)
tokenizer.pad_token = tokenizer.eos_token  # Set pad token if missing
model = AutoModelForCausalLM.from_pretrained(biomedlm_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Text generation
def generate_prediction(text, max_new_tokens=50):
    inputs = tokenizer.encode(text, return_tensors="pt", truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_new_tokens=max_new_tokens,
            num_beams=5,
            early_stopping=True,
            pad_token_id=tokenizer.pad_token_id
        )
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return decoded

# Embedding extraction using BioMedLM
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model.base_model(**inputs, output_hidden_states=True)
    last_hidden_state = outputs.hidden_states[-1]  # (batch, seq_len, hidden_size)
    embedding = last_hidden_state.mean(dim=1).cpu().numpy()  # mean pooling
    return embedding

# Initialize scorers
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
rouge_l_scores, cosine_sims, bleu_scores, bert_scores = [], [], [], []

# Evaluation loop
for idx, row in data.head(30).iterrows():
    input_text = row['Text_Summary']
    true_stage = row['Stage']
    true_response = row['Response']

    # Generate prediction
    pred_text = generate_prediction(input_text)

    # Extract predicted stage and response
    pred_stage, pred_response = '', ''
    for part in pred_text.split('.'):
        if 'stage' in part.lower():
            pred_stage = part.strip()
        if 'response' in part.lower():
            pred_response = part.strip()

    reference_text = f"Stage: {true_stage}. Response: {true_response}."
    prediction_text = f"{pred_stage}. {pred_response}"

    # ROUGE-L
    score_rouge = scorer.score(reference_text, prediction_text)
    rouge_l_scores.append(score_rouge['rougeL'].fmeasure)

    # Cosine similarity
    emb_ref = get_embedding(reference_text)
    emb_pred = get_embedding(prediction_text)
    cos_sim = cosine_similarity(emb_ref, emb_pred)[0][0]
    cosine_sims.append(cos_sim)

    # BLEU
    ref_tokens = word_tokenize(reference_text)
    pred_tokens = word_tokenize(prediction_text)
    bleu = sentence_bleu([ref_tokens], pred_tokens, smoothing_function=SmoothingFunction().method1)
    bleu_scores.append(bleu)

    # BERTScore
    P, R, F1 = bert_score([prediction_text], [reference_text], lang="en", verbose=False)
    bert_scores.append(F1[0].item())

    # Print results
    print(f"Input: {input_text}")
    print(f"Prediction: {prediction_text}")
    print(f"Reference: {reference_text}")
    print(f"ROUGE-L: {score_rouge['rougeL'].fmeasure:.4f}, Cosine Sim: {cos_sim:.4f}, BLEU: {bleu:.4f}, BERTScore: {F1[0].item():.4f}")
    print("-" * 80)

# Final averages
print(f"\nAverage ROUGE-L: {np.mean(rouge_l_scores):.4f}")
print(f"Average Cosine Similarity: {np.mean(cosine_sims):.4f}")
print(f"Average BLEU: {np.mean(bleu_scores):.4f}")
print(f"Average BERTScore: {np.mean(bert_scores):.4f}")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


tokenizer_config.json:   0%|          | 0.00/267 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/876 [00:00<?, ?B/s]

2025-07-28 05:55:49.821635: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753682150.011705      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753682150.066598      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/10.7G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/10.7G [00:00<?, ?B/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 27, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-left, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Indeterminate.
ROUGE-L: 0.2857, Cosine Sim: 0.8277, BLEU: 0.0248, BERTScore: 0.8433
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 34, Gender: F, Smoking: No, History of Smoking: Yes, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8136, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 30, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8136, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 62, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8136, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 62, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Multi-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8136, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 52, Gender: M, Smoking: Yes, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Multi-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Indeterminate.
ROUGE-L: 0.2857, Cosine Sim: 0.8277, BLEU: 0.0248, BERTScore: 0.8433
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 41, Gender: F, Smoking: No, History of Smoking: Yes, History of Radiotherapy: No, Thyroid Function: Clinical Hyperthyroidism, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8136, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 46, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8136, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 51, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8136, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 40, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8136, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 75, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8136, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 59, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-left, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8136, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 49, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8136, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 50, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Clinical Hyperthyroidism, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8136, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 76, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Clinical Hypothyroidism, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8136, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 42, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Indeterminate.
ROUGE-L: 0.2857, Cosine Sim: 0.8277, BLEU: 0.0248, BERTScore: 0.8433
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 40, Gender: F, Smoking: No, History of Smoking: Yes, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8136, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 44, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Indeterminate.
ROUGE-L: 0.2857, Cosine Sim: 0.8277, BLEU: 0.0248, BERTScore: 0.8433
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 43, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8136, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 52, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Multi-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Indeterminate.
ROUGE-L: 0.2857, Cosine Sim: 0.8277, BLEU: 0.0248, BERTScore: 0.8433
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 41, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-left, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8136, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 44, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-left, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8136, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 36, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8136, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 70, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8136, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 60, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Multinodular goiter, Adenopathy: No, Pathology: Micropapillary, Focality: Multi-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8136, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 33, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8136, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 43, Gender: M, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Subclinical Hyperthyroidism, Physical Examination: Single nodular goiter-left, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8136, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 26, Gender: M, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-left, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8136, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 41, Gender: F, Smoking: No, History of Smoking: No, History of Radiotherapy: No, Thyroid Function: Euthyroid, Physical Examination: Single nodular goiter-right, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8136, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: Age: 37, Gender: F, Smoking: No, History of Smoking: Yes, History of Radiotherapy: No, Thyroid Function: Subclinical Hypothyroidism, Physical Examination: Normal, Adenopathy: No, Pathology: Micropapillary, Focality: Uni-Focal, Risk: Low, T: T1a, N: N0, M: M0. Predict the Stage and Response.
Prediction: Predict the Stage and Response. Predict the Stage and Response
Reference: Stage: I. Response: Excellent.
ROUGE-L: 0.2857, Cosine Sim: 0.8136, BLEU: 0.0248, BERTScore: 0.8412
--------------------------------------------------------------------------------

Average ROUGE-L: 0.2857
Average Cosine Similarity: 0.8159
Average BLEU: 0.0248
Average BERTScore: 0.8415
