In [1]:
!pip install -U transformers datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency r

In [3]:
!pip install scikit-learn



In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
from tqdm import tqdm

In [11]:
model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
model.eval().cuda()  # Use .cpu() if no GPU


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [12]:
dataset = load_dataset("pubmed_qa", "pqa_artificial", split="train[:1000]")  # 1000 samples for quick test
label2id = {'yes': 0, 'no': 1, 'maybe': 2}
id2label = {v: k for k, v in label2id.items()}


train-00000-of-00001.parquet:   0%|          | 0.00/233M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/211269 [00:00<?, ? examples/s]

In [15]:
def preprocess(example):
    context_text = " ".join(example["context"]["contexts"])
    text = example["question"] + " " + context_text
    encoding = tokenizer(text, padding="max_length", truncation=True, max_length=256, return_tensors="pt")
    encoding["label"] = label2id[example["final_decision"]]
    return encoding


In [16]:
processed = [preprocess(example) for example in dataset]


In [19]:
preds, true_labels, pred_probs = [], [], []
from scipy.special import softmax

with torch.no_grad():
    for item in tqdm(processed):
        input_ids = item["input_ids"].squeeze(0).cuda()
        attention_mask = item["attention_mask"].squeeze(0).cuda()
        label = item["label"]

        outputs = model(input_ids.unsqueeze(0), attention_mask=attention_mask.unsqueeze(0))
        logits = outputs.logits

        probs = softmax(logits.detach().cpu().numpy(), axis=1).squeeze()

        pred_probs.append(probs)
        pred_label = int(np.argmax(probs))
        preds.append(pred_label)
        true_labels.append(label)


100%|██████████| 1000/1000 [00:15<00:00, 63.93it/s]


In [25]:
accuracy = accuracy_score(true_labels, preds)
f1_macro = f1_score(true_labels, preds, average='macro')
f1 = f1_score(true_labels, preds, average='weighted')

print(f"✅ Accuracy: {accuracy:.4f}")
print(f"✅ F1 Score (Macro): {f1_macro:.4f}")
print(f"✅ F1 Score (weighted): {f1:.4f}")

# Exact Match
pred_texts = [id2label[p] for p in preds]
true_texts = [id2label[t] for t in true_labels]
exact_match_score = np.mean([p == t for p, t in zip(pred_texts, true_texts)])
print(f"✅ Exact Match: {exact_match_score:.4f}")


✅ Accuracy: 0.7820
✅ F1 Score (Macro): 0.3395
✅ F1 Score (weighted): 0.8287
✅ Exact Match: 0.7820


In [21]:
def mean_reciprocal_rank(probabilities, labels):
    rr_total = 0
    for i, probs in enumerate(probabilities):
        sorted_indices = np.argsort(probs)[::-1]
        rank = list(sorted_indices).index(labels[i]) + 1
        rr_total += 1.0 / rank
    return rr_total / len(probabilities)

def mean_average_precision(probabilities, labels):
    ap_total = 0
    for i, probs in enumerate(probabilities):
        sorted_indices = np.argsort(probs)[::-1]
        relevant = [1 if idx == labels[i] else 0 for idx in sorted_indices]
        precisions = [sum(relevant[:j+1]) / (j+1) for j in range(len(relevant)) if relevant[j]]
        ap_total += sum(precisions) / 1 if precisions else 0
    return ap_total / len(probabilities)

mrr = mean_reciprocal_rank(pred_probs, true_labels)
map_score = mean_average_precision(pred_probs, true_labels)
print(f"✅ MRR: {mrr:.4f}")
print(f"✅ MAP: {map_score:.4f}")


✅ MRR: 0.8823
✅ MAP: 0.8823


In [22]:
from bert_score import score as bertscore

# You need text-based explanations for BERTScore — you can use `context` or reference answers
# Here, we use question + context vs predicted/true labels
pred_texts = [id2label[p] for p in preds]
true_texts = [id2label[t] for t in true_labels]
P, R, F1 = bertscore(pred_texts, true_texts, lang="en", model_type="bert-base-uncased")
print(f"✅ BERTScore F1: {F1.mean().item():.4f}")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

✅ BERTScore F1: 0.9399
