In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Subset
from datasets import load_dataset
from transformers import BertTokenizer, BertForMaskedLM, DataCollatorForLanguageModeling, AutoTokenizer
from tqdm.notebook import tqdm
import time
import numpy as np
from sklearn.metrics import accuracy_score

2024-07-18 15:59:12.117168: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-18 15:59:12.117273: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-18 15:59:12.247617: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"

trainset_range = list(range(18000, 38000))
num_epochs = 15
batch_size = 64
lr = 2e-5

In [3]:
# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# model = BertForMaskedLM.from_pretrained("bert-base-uncased")

tokenizer = BertTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = BertForMaskedLM.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [4]:
def filter_none(example):
    return example["exp"] is not None


def mlm_map_function(rows):
    # Tokenize the text with specified tokenizer parameters
    input_info = tokenizer(
        rows["exp"],
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    return {**input_info, "labels": input_info["input_ids"]}


# load MedMCQA
dataset = load_dataset("openlifescienceai/medmcqa")
# mlm_dataset = Subset(dataset["train"], trainset_range)
mlm_dataset = dataset["train"].select(trainset_range)
mlm_dataset = mlm_dataset.filter(filter_none).select_columns(["exp"])
mlm_dataset = mlm_dataset.map(
    mlm_map_function,
    batched=True,
    num_proc=2,
)
print(mlm_dataset)

collate_fn = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)
# train_loader = DataLoader(
#     mlm_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn
# )

Downloading readme:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/85.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/936k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.48M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/182822 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6150 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4183 [00:00<?, ? examples/s]

Filter:   0%|          | 0/20000 [00:00<?, ? examples/s]

  self.pid = os.fork()


Map (num_proc=2):   0%|          | 0/17612 [00:00<?, ? examples/s]

Dataset({
    features: ['exp', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 17612
})


# Hugging Face Trainer

In [5]:
from transformers import Trainer, TrainingArguments


training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    num_train_epochs=num_epochs,
    logging_steps=len(mlm_dataset) // batch_size,  # Log per epoch
    report_to=[],  # Disable wandb logging
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=mlm_dataset,
)

trainer.train()

Step,Training Loss
275,2.0317
550,1.8002
825,1.6919
1100,1.634
1375,1.5809
1650,1.5329
1925,1.507
2200,1.4965
2475,1.4688
2750,1.4491


TrainOutput(global_step=4140, training_loss=1.5548751637555551, metrics={'train_runtime': 3837.0108, 'train_samples_per_second': 68.85, 'train_steps_per_second': 1.079, 'total_flos': 1.738305225086976e+16, 'train_loss': 1.5548751637555551, 'epoch': 15.0})

In [6]:
checkpoint_file = "BioClinicalBert-MLM-Finetuned-20k-15epoch.pth"
torch.save(
    {
        "model_state_dict": model.state_dict(),
    },
    checkpoint_file,
)

In [7]:
from huggingface_hub import HfApi

# generate a token from Profile > Setting > Access Tokens with write access
api = HfApi(
    token="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",
)
api.upload_file(
    path_or_fileobj="./BioClinicalBert-MLM-Finetuned-20k-15epoch.pth",
    path_in_repo="BioClinicalBert-MLM-Finetuned-20k-15epoch.pth",
    repo_id="alibababeig/nlp-hw4",
    repo_type="model",
)

BioClinicalBert-MLM-Finetuned-20k-15epoch.pth:   0%|          | 0.00/433M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/alibababeig/nlp-hw4/commit/84aadc1e2d3261525651b051de2538a500119e53', commit_message='Upload BioClinicalBert-MLM-Finetuned-20k-15epoch.pth with huggingface_hub', commit_description='', oid='84aadc1e2d3261525651b051de2538a500119e53', pr_url=None, pr_revision=None, pr_num=None)