In [1]:
# Install required libraries
!pip install -q transformers sentence-transformers rouge-score nltk bert_score

# Imports
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer, util
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from bert_score import score as bertscore
import nltk
from tqdm import tqdm

# Download NLTK tokenizer
nltk.download('punkt')

# Load dataset from Kaggle input
df = pd.read_csv("/kaggle/input/healthcare-emotion/emotion-emotion_69k.csv")  
df = df[['Situation', 'labels']].dropna().drop_duplicates()
df.columns = ['question', 'true_answer']

# Load BioMedLM model
model_name = "stanford-crfm/biomedlm"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()
torch.no_grad()

# Generate answers with BioMedLM
def ask_biomedlm(question, max_new_tokens=60):
    inputs = tokenizer(question, return_tensors="pt")
    outputs = model.generate(**inputs, max_length=inputs["input_ids"].shape[1] + max_new_tokens)
    return tokenizer.decode(outputs[0], skip_special_tokens=True).replace(question, '').strip()

tqdm.pandas()
df['biomedlm_answer'] = df['question'].progress_apply(ask_biomedlm)

# Load Sentence-BERT model
sbert = SentenceTransformer('all-MiniLM-L6-v2')

# Compute Cosine Similarity
def cosine_similarity(a, b):
    emb1 = sbert.encode(a, convert_to_tensor=True)
    emb2 = sbert.encode(b, convert_to_tensor=True)
    return util.cos_sim(emb1, emb2).item()

df['cosine'] = df.apply(lambda row: cosine_similarity(row['true_answer'], row['biomedlm_answer']), axis=1)

# Compute ROUGE-L
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

def compute_rouge(a, b):
    return scorer.score(a, b)['rougeL'].fmeasure

df['rougeL'] = df.apply(lambda row: compute_rouge(row['true_answer'], row['biomedlm_answer']), axis=1)

# Compute BLEU
def compute_bleu(reference, hypothesis):
    ref_tokens = nltk.word_tokenize(reference.lower())
    hyp_tokens = nltk.word_tokenize(hypothesis.lower())
    smoothing = SmoothingFunction().method1
    return sentence_bleu([ref_tokens], hyp_tokens, smoothing_function=smoothing)

df['bleu'] = df.apply(lambda row: compute_bleu(row['true_answer'], row['biomedlm_answer']), axis=1)

# Compute BERTScore
P, R, F1 = bertscore(
    df['biomedlm_answer'].tolist(),
    df['true_answer'].tolist(),
    lang='en',
    rescale_with_baseline=True
)
df['bertscore_f1'] = F1

# Print evaluation scores
print("Average Cosine Similarity:", df['cosine'].mean())
print("Average ROUGE-L:", df['rougeL'].mean())
print("Average BLEU:", df['bleu'].mean())
print("Average BERTScore-F1:", df['bertscore_f1'].mean())

# Save results
df.to_csv("biomedlm_qa_results_with_bleu_bertscore.csv", index=False)
df[['question', 'true_answer', 'biomedlm_answer', 'cosine', 'rougeL', 'bleu', 'bertscore_f1']].head(10)


  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m105.7 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m75.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━

2025-07-27 14:54:20.607331: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753628060.789926      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753628060.848456      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


tokenizer_config.json:   0%|          | 0.00/267 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/876 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/10.7G [00:00<?, ?B/s]

  0%|          | 0/30 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.
  7%|▋         | 2/30 [02:03<28:48, 61.73s/it]Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.
 10%|█         | 3/30 [04:28<43:23, 96.44s/it]Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.
 13%|█▎        | 4/30 [06:36<46:51, 108.13s/it]Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.
 17%|█▋        | 5/30 [09:22<53:31, 128.48s/it]Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.
 20%|██        | 6/30 [12:24<58:23, 145.99s/it]Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.
 23%|██▎       | 7/30 [14:56<56:44, 148.00s/it]Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.
 27%|██▋       | 8/30 [18:07<59:15, 161.61s/it]Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.
 30%|███       | 9/30 [20:33<54:54, 156.87s/it]Setting `pad

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Average Cosine Similarity: 0.07993002686028679
Average ROUGE-L: 0.04614606486856325
Average BLEU: 0.004443256575764966
Average BERTScore-F1: -1.0426409




Unnamed: 0,question,true_answer,biomedlm_answer,cosine,rougeL,bleu,bertscore_f1
49098,my sister is having her first baby,Oh fun! A little girl will be a great addition...,", and the other half of the family members, an...",0.093562,0.064516,0.008361,0.00668
60057,I woke up this morning and saw a scratch on my...,I hope so too. Its really annoying when that h...,I think it is important to be able to do it. I...,0.173824,0.035714,0.004635,-0.053166
36989,Getting my things ready for florida,The family and I are headed to florida in a co...,**,0.093602,0.0,0.0,-0.268575
21737,I was a bit shocked that someone was so nice t...,I'm off that train so they aren't coming back ...,I think it's a bit of a problem. I think it's ...,0.092706,0.028169,0.003616,-0.046193
61907,I injured my foot a couple of days ago (muscle...,"don't worry, with people like this karma hits ...",,0.104818,0.0,0.0,-4.925075
34677,Each month I am scared to look at my bank acco...,Why do you say that?,to be considered as a potential source of info...,0.086509,0.0,0.0,-0.134753
55676,When my boyfriend and I had been dating about ...,Well that's good. I'm glad so far it has worke...,I was not sure I was going to go to the hospit...,0.074471,0.061538,0.007367,-0.03508
50788,"Boy is it hard to be alone sometimes, this pas...",I know how you feel. What helped me was gettin...,I think that the authors have a good idea of w...,0.061033,0.166667,0.011876,0.192653
12507,I am nervous because I am having surgery tomorrow,is it a major or minor surgery?,", I will be able to do it. I am not sure how t...",0.059553,0.111111,0.004459,0.016527
51177,Water is not working in the house. I have been...,I have been waiting on the plumber all day and...,The authors declare no conflict of interest.\n...,0.04107,0.058824,0.008301,-0.272538


In [1]:
# Install required libraries
!pip install -q transformers sentence-transformers rouge-score nltk bert_score

# Imports
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer, util
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from bert_score import score as bertscore
import nltk
from tqdm import tqdm

# Download NLTK tokenizer
nltk.download('punkt')

# Load dataset from Kaggle input
df = pd.read_csv("/kaggle/input/mental/emotion-emotion_69k.csv")  
df = df[['Situation', 'labels']].dropna().drop_duplicates()
df.columns = ['question', 'true_answer']

# Load MedAlpaca model
tokenizer = AutoTokenizer.from_pretrained("medalpaca/medalpaca-7b")
model = AutoModelForCausalLM.from_pretrained("medalpaca/medalpaca-7b", torch_dtype=torch.float16, device_map="auto")
model.eval()
torch.no_grad()

# Generate answers with MedAlpaca
def generate_answer(question, max_new_tokens=100):
    inputs = tokenizer(question, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_length=inputs["input_ids"].shape[1] + max_new_tokens,
        do_sample=True,
        top_p=0.9,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id
    )
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer.replace(question, '').strip()

tqdm.pandas()
df['medalpaca_answer'] = df['question'].progress_apply(generate_answer)

# Load Sentence-BERT model
sbert = SentenceTransformer('all-MiniLM-L6-v2')

# Compute Cosine Similarity
def cosine_similarity(a, b):
    emb1 = sbert.encode(a, convert_to_tensor=True)
    emb2 = sbert.encode(b, convert_to_tensor=True)
    return util.cos_sim(emb1, emb2).item()

df['cosine'] = df.apply(lambda row: cosine_similarity(row['true_answer'], row['medalpaca_answer']), axis=1)

# Compute ROUGE-L
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

def compute_rouge(a, b):
    return scorer.score(a, b)['rougeL'].fmeasure

df['rougeL'] = df.apply(lambda row: compute_rouge(row['true_answer'], row['medalpaca_answer']), axis=1)

# Compute BLEU
def compute_bleu(reference, hypothesis):
    ref_tokens = nltk.word_tokenize(reference.lower())
    hyp_tokens = nltk.word_tokenize(hypothesis.lower())
    smoothing = SmoothingFunction().method1
    return sentence_bleu([ref_tokens], hyp_tokens, smoothing_function=smoothing)

df['bleu'] = df.apply(lambda row: compute_bleu(row['true_answer'], row['medalpaca_answer']), axis=1)

# Compute BERTScore (filtered with rescaling)
valid_df = df[(df['medalpaca_answer'].str.strip() != '') & (df['true_answer'].str.strip() != '')]
P, R, F1 = bertscore(
    valid_df['medalpaca_answer'].tolist(),
    valid_df['true_answer'].tolist(),
    lang='en',
    rescale_with_baseline=True
)
df.loc[valid_df.index, 'bertscore_f1'] = F1.tolist()

# Print evaluation scores
print("Average Cosine Similarity:", df['cosine'].mean())
print("Average ROUGE-L:", df['rougeL'].mean())
print("Average BLEU:", df['bleu'].mean())
print("Average BERTScore-F1:", df['bertscore_f1'].mean())

# Save results
df.to_csv("medalpaca_qa_results_with_bleu_bertscore.csv", index=False)
df[['question', 'true_answer', 'medalpaca_answer', 'cosine', 'rougeL', 'bleu', 'bertscore_f1']].head(10)


  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m724.0 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m38.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m39.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m38.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[

2025-07-27 17:39:30.610781: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753637970.811345      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753637970.874390      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


tokenizer_config.json:   0%|          | 0.00/260 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggin

config.json:   0%|          | 0.00/542 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/7.18G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/9.89G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/9.88G [00:00<?, ?B/s]

The following generation flags are not valid and may be ignored: ['pad_token_id']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['pad_token_id']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

100%|██████████| 30/30 [02:07<00:00,  4.27s/it]


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Average Cosine Similarity: 0.19836801588535308
Average ROUGE-L: 0.07977471718462321
Average BLEU: 0.0075221327018880485
Average BERTScore-F1: -0.0161176186054945


Unnamed: 0,question,true_answer,medalpaca_answer,cosine,rougeL,bleu,bertscore_f1
49098,my sister is having her first baby,Oh fun! A little girl will be a great addition...,in August and she was so excited to have a boy...,0.422627,0.131868,0.005858,0.057731
60057,I woke up this morning and saw a scratch on my...,I hope so too. Its really annoying when that h...,"I am a single mother of three children, I am a...",-0.006505,0.076923,0.002945,0.021634
36989,Getting my things ready for florida,The family and I are headed to florida in a co...,and then getting ready for the wedding!\n✔️ Pa...,0.543381,0.12,0.013948,0.026431
21737,I was a bit shocked that someone was so nice t...,I'm off that train so they aren't coming back ...,"Anyway, I am so excited to be going home for T...",0.149257,0.102564,0.00722,-0.025581
61907,I injured my foot a couple of days ago (muscle...,"don't worry, with people like this karma hits ...","I'm not crippled, but I can't run as fast as I...",0.167847,0.123077,0.005809,0.094833
34677,Each month I am scared to look at my bank acco...,Why do you say that?,for my bills and I have to borrow from Peter t...,-0.077851,0.0,0.0,-0.206115
55676,When my boyfriend and I had been dating about ...,Well that's good. I'm glad so far it has worke...,The apartment complex was run by a slumlord. T...,0.070849,0.068182,0.005488,-0.065807
50788,"Boy is it hard to be alone sometimes, this pas...",I know how you feel. What helped me was gettin...,"I went to the doctor, I took some medicine, I ...",0.181119,0.09009,0.005768,0.100056
12507,I am nervous because I am having surgery tomorrow,is it a major or minor surgery?,. The surgery will hopefully help my back and ...,0.389701,0.042105,0.002558,-0.014695
51177,Water is not working in the house. I have been...,I have been waiting on the plumber all day and...,The plumber said he would be here between 8 an...,0.568783,0.204082,0.046473,0.228326


In [1]:
# Install required libraries
!pip install -q transformers sentence-transformers rouge-score nltk bert_score

# Imports
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer, util
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from bert_score import score as bertscore
import nltk
from tqdm import tqdm

# Download NLTK tokenizer
nltk.download('punkt')

# Load dataset from Kaggle input
df = pd.read_csv("/kaggle/input/mental/emotion-emotion_69k.csv")  
df = df[['Situation', 'labels']].dropna().drop_duplicates()
df.columns = ['question', 'true_answer']

# Load MedLLaMA2 model
tokenizer = AutoTokenizer.from_pretrained("llSourcell/medllama2_7b")
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model = AutoModelForCausalLM.from_pretrained("llSourcell/medllama2_7b")
model.resize_token_embeddings(len(tokenizer))
model = model.to("cuda" if torch.cuda.is_available() else "cpu")
model.eval()
torch.no_grad()

# Generate answers with MedLLaMA2
def generate_answer(question, max_new_tokens=100):
    inputs = tokenizer(question, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_length=inputs["input_ids"].shape[1] + max_new_tokens,
        do_sample=True,
        top_p=0.9,
        temperature=0.7,
        pad_token_id=tokenizer.pad_token_id
    )
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer.replace(question, '').strip()

tqdm.pandas()
df['medllama2_answer'] = df['question'].progress_apply(generate_answer)

# Load Sentence-BERT model
sbert = SentenceTransformer('all-MiniLM-L6-v2')

# Compute Cosine Similarity
def cosine_similarity(a, b):
    emb1 = sbert.encode(a, convert_to_tensor=True)
    emb2 = sbert.encode(b, convert_to_tensor=True)
    return util.cos_sim(emb1, emb2).item()

df['cosine'] = df.apply(lambda row: cosine_similarity(row['true_answer'], row['medllama2_answer']), axis=1)

# Compute ROUGE-L
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

def compute_rouge(a, b):
    return scorer.score(a, b)['rougeL'].fmeasure

df['rougeL'] = df.apply(lambda row: compute_rouge(row['true_answer'], row['medllama2_answer']), axis=1)

# Compute BLEU
def compute_bleu(reference, hypothesis):
    ref_tokens = nltk.word_tokenize(reference.lower())
    hyp_tokens = nltk.word_tokenize(hypothesis.lower())
    smoothing = SmoothingFunction().method1
    return sentence_bleu([ref_tokens], hyp_tokens, smoothing_function=smoothing)

df['bleu'] = df.apply(lambda row: compute_bleu(row['true_answer'], row['medllama2_answer']), axis=1)

# Compute BERTScore (filtered with rescaling)
valid_df = df[(df['medllama2_answer'].str.strip() != '') & (df['true_answer'].str.strip() != '')]
P, R, F1 = bertscore(
    valid_df['medllama2_answer'].tolist(),
    valid_df['true_answer'].tolist(),
    lang='en',
    rescale_with_baseline=True
)
df.loc[valid_df.index, 'bertscore_f1'] = F1.tolist()

# Print evaluation scores
print("Average Cosine Similarity:", df['cosine'].mean())
print("Average ROUGE-L:", df['rougeL'].mean())
print("Average BLEU:", df['bleu'].mean())
print("Average BERTScore-F1:", df['bertscore_f1'].mean())

# Save results
df.to_csv("medllama2_qa_results_with_bleu_bertscore.csv", index=False)
df[['question', 'true_answer', 'medllama2_answer', 'cosine', 'rougeL', 'bleu', 'bertscore_f1']].head(10)


  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m79.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m65.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━

2025-07-27 18:03:26.436890: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753639406.697545      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753639406.777010      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


tokenizer_config.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/434 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/610 [00:00<?, ?B/s]

pytorch_model.bin.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/174 [00:00<?, ?B/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
100%|██████████| 30/30 [1:03:54<00:00, 127.82s/it]


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Average Cosine Similarity: 0.21874920365711054
Average ROUGE-L: 0.07752352280547284
Average BLEU: 0.005516076281162819
Average BERTScore-F1: 0.02490196693688631


Unnamed: 0,question,true_answer,medllama2_answer,cosine,rougeL,bleu,bertscore_f1
49098,my sister is having her first baby,Oh fun! A little girl will be a great addition...,. I’m so excited! I’m going to be an auntie! 😍...,0.443977,0.074074,0.005698,0.045094
60057,I woke up this morning and saw a scratch on my...,I hope so too. Its really annoying when that h...,What a bummer. I just washed it last week. I t...,0.177117,0.083333,0.006079,0.045175
36989,Getting my things ready for florida,The family and I are headed to florida in a co...,. I'm so excited!\nWe are planning a trip to f...,0.690341,0.141414,0.006898,0.136251
21737,I was a bit shocked that someone was so nice t...,I'm off that train so they aren't coming back ...,But it's important to remember that people can...,0.071981,0.08,0.003218,0.026313
61907,I injured my foot a couple of days ago (muscle...,"don't worry, with people like this karma hits ...",It's really upsetting. What can I do? [/INST...,0.160316,0.043478,0.003639,0.009117
34677,Each month I am scared to look at my bank acco...,Why do you say that?,to cover all my expenses. I have to use my cre...,-0.037879,0.023529,0.002842,-0.1589
55676,When my boyfriend and I had been dating about ...,Well that's good. I'm glad so far it has worke...,We found a really affordable one and he was so...,0.043999,0.088235,0.007949,0.028772
50788,"Boy is it hard to be alone sometimes, this pas...",I know how you feel. What helped me was gettin...,"I hope you are doing well, I know it's not eas...",0.31693,0.141414,0.006582,0.116356
12507,I am nervous because I am having surgery tomorrow,is it a major or minor surgery?,. I have been feeling a little anxious and am ...,0.22064,0.075949,0.003263,-0.014699
51177,Water is not working in the house. I have been...,I have been waiting on the plumber all day and...,"It's so hot outside, and I'm feeling so uncomf...",0.028761,0.101695,0.005978,0.145245


In [None]:
# Install required libraries
!pip install -q transformers sentence-transformers rouge-score nltk bert_score

# Imports
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer, util
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from bert_score import score as bertscore
import nltk
from tqdm import tqdm

# Download NLTK tokenizer
nltk.download('punkt')

# Load dataset from Kaggle input
df = pd.read_csv("/kaggle/input/mental/emotion-emotion_69k.csv")  
df = df[['Situation', 'labels']].dropna().drop_duplicates()
df.columns = ['question', 'true_answer']

# Load EleutherAI GPT-Neo 1.3B model
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
model.resize_token_embeddings(len(tokenizer))
model = model.to("cuda" if torch.cuda.is_available() else "cpu")
model.eval()
torch.no_grad()

# Generate answers with GPT-Neo
def generate_answer(question, max_new_tokens=100):
    inputs = tokenizer(question, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_length=inputs["input_ids"].shape[1] + max_new_tokens,
        do_sample=True,
        top_p=0.9,
        temperature=0.7,
        pad_token_id=tokenizer.pad_token_id
    )
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer.replace(question, '').strip()

tqdm.pandas()
df['gptneo_answer'] = df['question'].progress_apply(generate_answer)

# Load Sentence-BERT model
sbert = SentenceTransformer('all-MiniLM-L6-v2')

# Compute Cosine Similarity
def cosine_similarity(a, b):
    emb1 = sbert.encode(a, convert_to_tensor=True)
    emb2 = sbert.encode(b, convert_to_tensor=True)
    return util.cos_sim(emb1, emb2).item()

df['cosine'] = df.apply(lambda row: cosine_similarity(row['true_answer'], row['gptneo_answer']), axis=1)

# Compute ROUGE-L
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

def compute_rouge(a, b):
    return scorer.score(a, b)['rougeL'].fmeasure

df['rougeL'] = df.apply(lambda row: compute_rouge(row['true_answer'], row['gptneo_answer']), axis=1)

# Compute BLEU
def compute_bleu(reference, hypothesis):
    ref_tokens = nltk.word_tokenize(reference.lower())
    hyp_tokens = nltk.word_tokenize(hypothesis.lower())
    smoothing = SmoothingFunction().method1
    return sentence_bleu([ref_tokens], hyp_tokens, smoothing_function=smoothing)

df['bleu'] = df.apply(lambda row: compute_bleu(row['true_answer'], row['gptneo_answer']), axis=1)

# Compute BERTScore (filtered with rescaling)
valid_df = df[(df['gptneo_answer'].str.strip() != '') & (df['true_answer'].str.strip() != '')]
P, R, F1 = bertscore(
    valid_df['gptneo_answer'].tolist(),
    valid_df['true_answer'].tolist(),
    lang='en',
    rescale_with_baseline=True
)
df.loc[valid_df.index, 'bertscore_f1'] = F1.tolist()

# Print evaluation scores
print("Average Cosine Similarity:", df['cosine'].mean())
print("Average ROUGE-L:", df['rougeL'].mean())
print("Average BLEU:", df['bleu'].mean())
print("Average BERTScore-F1:", df['bertscore_f1'].mean())

# Save results
df.to_csv("gptneo_qa_results_with_bleu_bertscore.csv", index=False)
df[['question', 'true_answer', 'gptneo_answer', 'cosine', 'rougeL', 'bleu', 'bertscore_f1']].head(10)


  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0m00:01[0m:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m80.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m46.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━

2025-07-27 19:22:00.191749: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753644120.416736      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753644120.472542      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


tokenizer_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/5.31G [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
100%|██████████| 30/30 [01:09<00:00,  2.31s/it]


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Install required libraries
!pip install -q transformers sentence-transformers rouge-score nltk bert_score

# Imports
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer, util
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from bert_score import score as bertscore
import nltk
from tqdm import tqdm

# Download NLTK tokenizer
nltk.download('punkt')

# Load dataset from Kaggle input
df = pd.read_csv("/kaggle/input/mental/emotion-emotion_69k.csv")  
df = df[['Situation', 'labels']].dropna().drop_duplicates()
df.columns = ['question', 'true_answer']

# Load BioGPT model
biogpt_name = "microsoft/BioGPT-Large"
tokenizer = AutoTokenizer.from_pretrained(biogpt_name)
model = AutoModelForCausalLM.from_pretrained(biogpt_name).to("cuda" if torch.cuda.is_available() else "cpu")
model.eval()
torch.no_grad()

# Generate answers with BioGPT
def generate_answer(question, max_new_tokens=100):
    inputs = tokenizer(question, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_length=inputs["input_ids"].shape[1] + max_new_tokens,
        do_sample=True,
        top_p=0.9,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id
    )
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer.replace(question, '').strip()

tqdm.pandas()
df['biogpt_answer'] = df['question'].progress_apply(generate_answer)

# Load Sentence-BERT model
sbert = SentenceTransformer('all-MiniLM-L6-v2')

# Compute Cosine Similarity
def cosine_similarity(a, b):
    emb1 = sbert.encode(a, convert_to_tensor=True)
    emb2 = sbert.encode(b, convert_to_tensor=True)
    return util.cos_sim(emb1, emb2).item()

df['cosine'] = df.apply(lambda row: cosine_similarity(row['true_answer'], row['biogpt_answer']), axis=1)

# Compute ROUGE-L
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

def compute_rouge(a, b):
    return scorer.score(a, b)['rougeL'].fmeasure

df['rougeL'] = df.apply(lambda row: compute_rouge(row['true_answer'], row['biogpt_answer']), axis=1)

# Compute BLEU
def compute_bleu(reference, hypothesis):
    ref_tokens = nltk.word_tokenize(reference.lower())
    hyp_tokens = nltk.word_tokenize(hypothesis.lower())
    smoothing = SmoothingFunction().method1
    return sentence_bleu([ref_tokens], hyp_tokens, smoothing_function=smoothing)

df['bleu'] = df.apply(lambda row: compute_bleu(row['true_answer'], row['biogpt_answer']), axis=1)

# Compute BERTScore (filtered with rescaling)
valid_df = df[(df['biogpt_answer'].str.strip() != '') & (df['true_answer'].str.strip() != '')]
P, R, F1 = bertscore(
    valid_df['biogpt_answer'].tolist(),
    valid_df['true_answer'].tolist(),
    lang='en',
    rescale_with_baseline=True
)
df.loc[valid_df.index, 'bertscore_f1'] = F1.tolist()

# Print evaluation scores
print("Average Cosine Similarity:", df['cosine'].mean())
print("Average ROUGE-L:", df['rougeL'].mean())
print("Average BLEU:", df['bleu'].mean())
print("Average BERTScore-F1:", df['bertscore_f1'].mean())

# Save results
df.to_csv("biogpt_qa_results_with_bleu_bertscore.csv", index=False)
df[['question', 'true_answer', 'biogpt_answer', 'cosine', 'rougeL', 'bleu', 'bertscore_f1']].head(10)
