<a href="https://colab.research.google.com/github/seloooselin/citation-analysis-project/blob/main/notebooks/medbert_implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers torch datasets evaluate


Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia

In [2]:
import json
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize

from google.colab import drive
drive.mount('/content/drive')

claims_file = "/content/drive/MyDrive/colab_data/citation_project/claims-test.jsonl"
corpus_file = "/content/drive/MyDrive/colab_data/citation_project/corpus.jsonl"

# Load JSONL
def load_jsonl(file_path):
    with open(file_path, 'r') as f:
        return [json.loads(line) for line in f]

claims_data = load_jsonl(claims_file)
corpus_data = load_jsonl(corpus_file)

corpus_dict = {str(doc["doc_id"]): " ".join(doc["abstract"]) for doc in corpus_data}

label_mapping = {
    "ACCURATE": "ACCURATE",
    "INDIRECT": "ACCURATE",
    "CONTRADICT": "NOT_ACCURATE",
    "NOT_SUBSTANTIATE": "NOT_ACCURATE",
    "OVERSIMPLIFY": "NOT_ACCURATE",
    "MISQUOTE": "NOT_ACCURATE",
    "ETIQUETTE": "NOT_ACCURATE",
    "IRRELEVANT": "IRRELEVANT",
    "INDIRECT_NOT_REVIEW": "IRRELEVANT"
}

def extract_evidence_text(evidence):
    extracted_text, extracted_labels = [], []
    for doc_id, details in evidence.items():
        doc_id = str(doc_id)
        if doc_id in corpus_dict:
            full_text = sent_tokenize(corpus_dict[doc_id])
            for entry in details:
                if "label" in entry and "sentences" in entry:
                    label = label_mapping.get(entry["label"], "UNKNOWN")
                    extracted_labels.append(label)
                    for sent_id in entry["sentences"]:
                        if sent_id < len(full_text):
                            extracted_text.append(full_text[sent_id])
    if not extracted_text:
        return "NO EVIDENCE", "IRRELEVANT"
    final_label = max(set(extracted_labels), key=extracted_labels.count)
    return " ".join(extracted_text), final_label

processed_data = []
for claim in claims_data:
    evidence_text, final_label = extract_evidence_text(claim["evidence"])
    data_point = {"text": claim["claim"] + " [SEP] " + evidence_text, "label": final_label}
    processed_data.append(data_point)


df = pd.DataFrame(processed_data)

# Save the cleaned dataset as a CSV clearly in your Google Drive folder
df.to_csv("/content/drive/MyDrive/colab_data/citation_project/citation_classification_dataset.csv", index=False)

# Quick check
df.head()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Mounted at /content/drive


Unnamed: 0,text,label
0,FMO3 and TMAO have emerged as key components o...,NOT_ACCURATE
1,In apoliprotein E-deficient mice fed a diet wi...,NOT_ACCURATE
2,"Dietary L-carnitine and choline, compounds abu...",ACCURATE
3,"While higher plasma levels of -carnitine, in ...",ACCURATE
4,TMAO could be derived from increased consumpti...,ACCURATE


In [3]:
from datasets import load_dataset

# Load your newly created CSV dataset
data_path = "/content/drive/MyDrive/colab_data/citation_project/citation_classification_dataset.csv"
dataset = load_dataset('csv', data_files=data_path)

# Quickly verify the loaded dataset
print(dataset)


Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 606
    })
})


In [4]:
# Import AutoTokenizer for MedBERT tokenisation

from transformers import AutoTokenizer

# Load the tokenizer for MedBERT (specialised for biomedical texts)

tokenizer = AutoTokenizer.from_pretrained('microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext')

# Function to tokenize dataset text fields

def tokenize_function(example):
    return tokenizer(example['text'], padding='max_length', truncation=True, max_length=512)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Quick verification
print(tokenized_dataset)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Map:   0%|          | 0/606 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 606
    })
})


In [5]:
# Import MedBERT model for sequence classification tasks
from transformers import AutoModelForSequenceClassification

# Clearly define label mappings based on your dataset
labels = ["ACCURATE", "NOT_ACCURATE", "IRRELEVANT"]

# Load pre-trained MedBERT model configured for your classification task
model = AutoModelForSequenceClassification.from_pretrained(
    'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext',
    num_labels=len(labels),                  # Number of labels
    id2label={i: label for i, label in enumerate(labels)},  # Numeric to text label
    label2id={label: i for i, label in enumerate(labels)}   # Text to numeric label
)


pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
import torch
from transformers import DataCollatorWithPadding

# Convert text labels into numerical format for model training
label2id = {"ACCURATE": 0, "NOT_ACCURATE": 1, "IRRELEVANT": 2}

# Correct label conversion for batched mapping
def encode_labels(batch):
    batch["label"] = [label2id[label] for label in batch["label"]]
    return batch

tokenized_dataset = tokenized_dataset.map(encode_labels, batched=True)

# Remove unnecessary columns
tokenized_dataset = tokenized_dataset.remove_columns(["text"])

# Convert dataset into PyTorch format
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Define a collator to dynamically pad sequences during training
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Quick verification of dataset format
print(tokenized_dataset["train"][0])


Map:   0%|          | 0/606 [00:00<?, ? examples/s]

{'label': tensor(1), 'input_ids': tensor([    2,  5620,  8583,  1930, 18391,  1037,  2162,  9287,  1966,  3834,
         4178,  1927,    43,  2796,  5432, 14042,  3986,  1930,  1920,  7660,
        13665,  1956,  9824,    32,    70,  2779,    41, 12366,    70,    34,
            3, 18391,  1037,  1982,  5326,  2522,  1942,  1998,  2321,  1965,
         3911,  3880,  1930,  2458,  1956,  5321,  7481,    18,    29,  2144,
         3056, 13701,  1920,  5341,  1988,  2303,  6096, 11312, 19705,    43,
        21849,  3812,  3170,  2278,  2222,  5655, 18391,  1037,  2037,  7660,
         9825,  1930,  5527, 10843,  9824,    18,    12,    43,    13,  3170,
         1927, 21118,  1930,  7363,  1927, 21118,  1930, 15440,  4714,  1942,
        18391,  1037,    18,    54,    17, 21118,  1930, 15440,    12,  2032,
         2321,  6096, 21849, 14286,  1988,  2112,  1998, 19654,  2007,  9825,
         1942, 18391,    18, 18391,  1977,  2759,  2520, 11339,  1942, 18391,
         1037,  2007, 26758,  

In [8]:
from transformers import TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir="./outputs",         # Save model checkpoints here
    evaluation_strategy="epoch",    # Evaluate model at the end of each epoch
    save_strategy="epoch",          # Save model checkpoints at every epoch
    learning_rate=2e-5,             # Standard learning rate for fine-tuning BERT
    per_device_train_batch_size=8,  # Adjust if running out of GPU memory
    per_device_eval_batch_size=8,
    num_train_epochs=3,             # Training duration (increase if needed)
    weight_decay=0.01,              # Regularization for stability
    logging_steps=10,               # Frequency of logging
    push_to_hub=False               # Not needed for now
)

# Quick check
print(training_args)




TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.EPOCH,
eval_

In [10]:
from datasets import DatasetDict

# Split dataset: 80% train, 20% validation
train_test_split = tokenized_dataset["train"].train_test_split(test_size=0.2)

# Create dataset with separate train & eval sets
tokenized_dataset = DatasetDict({
    "train": train_test_split["train"],
    "eval": train_test_split["test"]
})

# Update training arguments to include evaluation
training_args = TrainingArguments(
    output_dir="./outputs",
    evaluation_strategy="epoch",  # Enable evaluation
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=10,
    push_to_hub=False
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["eval"],  # Now we have an eval set
    tokenizer=tokenizer,
    data_collator=data_collator
)


  trainer = Trainer(


In [11]:
from datasets import DatasetDict

# Split dataset: 80% train, 20% validation
train_test_split = tokenized_dataset["train"].train_test_split(test_size=0.2)

# Recreate dataset with separate train & eval sets
tokenized_dataset = DatasetDict({
    "train": train_test_split["train"],
    "eval": train_test_split["test"]
})

# Quick verification
print(tokenized_dataset)


DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 387
    })
    eval: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 97
    })
})


In [12]:
from transformers import Trainer

# Define the trainer with both train and eval datasets
trainer = Trainer(
    model=model,                         # MedBERT model
    args=training_args,                   # Training arguments
    train_dataset=tokenized_dataset["train"],  # Training dataset
    eval_dataset=tokenized_dataset["eval"],    # Validation dataset
    tokenizer=tokenizer,                  # Tokenizer
    data_collator=data_collator            # Handles dynamic padding
)

# Quick check
print(trainer)


<transformers.trainer.Trainer object at 0x7992e4d7b2d0>


  trainer = Trainer(


In [17]:
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "offline"
os.environ["WANDB_SILENT"] = "true"

import wandb
wandb.init(mode="disabled")  # Initialize wandb in disabled mode



In [18]:

# Start training MedBERT
trainer.train()


Epoch,Training Loss,Validation Loss
1,0.4862,0.453168
2,0.3245,0.485931
3,0.2825,0.453857


TrainOutput(global_step=147, training_loss=0.4138522229227079, metrics={'train_runtime': 140.6567, 'train_samples_per_second': 8.254, 'train_steps_per_second': 1.045, 'total_flos': 305474677982208.0, 'train_loss': 0.4138522229227079, 'epoch': 3.0})

In [19]:
# Define the save path in Google Drive
save_path = "/content/drive/MyDrive/colab_data/citation_project/medbert_finetuned/"

# Save model and tokenizer
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"Model saved to: {save_path}")


Model saved to: /content/drive/MyDrive/colab_data/citation_project/medbert_finetuned/
