In [1]:
import torch
from datasets import load_dataset
from transformers import BertTokenizer, BertForMaskedLM, DataCollatorForLanguageModeling
from huggingface_hub import HfApi

2024-07-20 12:36:04.160386: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-20 12:36:04.160504: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-20 12:36:04.291905: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# ---- Data and model locations ----
# Dataset identifier passed to `load_dataset`, and the row indices of its
# train split (18000 .. 57999, i.e. 40k rows) selected for MLM fine-tuning.
medmcqa_dataset_path = "openlifescienceai/medmcqa"
trainset_range = list(range(18_000, 58_000))

# Pretrained checkpoint that both the tokenizer and the MLM model start from.
base_bert_path = "emilyalsentzer/Bio_ClinicalBERT"

# ---- Outputs ----
# Local file the fine-tuned weights are written to, and the Hub repository
# they are uploaded to when `push_model_to_huggingface` is switched on.
checkpoint_file = "BioClinicalBert-MLM-Finetuned-40k-25epoch-exp-25epoch-questions.pth"
repo_id = "alibababeig/nlp-hw4"
push_model_to_huggingface = False

# ---- Optimisation hyper-parameters ----
num_epochs = 25
batch_size = 64
lr = 2e-4

# ---- Hardware ----
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
# Load the tokenizer and the masked-LM model from the same pretrained
# checkpoint so that the vocabulary and the embedding matrix agree.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path=base_bert_path,
)
model = BertForMaskedLM.from_pretrained(
    pretrained_model_name_or_path=base_bert_path,
)

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [5]:
def filter_none(example):
    """Tell whether a dataset row is usable for training.

    Args:
        example: Mapping with an ``"exp"`` entry; its ``"question"`` entry is
            only read once the ``"exp"`` checks have passed.

    Returns:
        ``True`` when ``example["exp"]`` is not ``None`` and has a length
        greater than 20, and ``example["question"]`` is not ``None``;
        ``False`` otherwise.
    """
    explanation = example["exp"]
    # Reject rows without an explanation before measuring its length.
    if explanation is None:
        return False
    # Explanations of 20 items or fewer are discarded.
    if len(explanation) <= 20:
        return False
    return example["question"] is not None


def mlm_map_function(rows, max_length=128):
    """Tokenize a batch of explanations into fixed-length MLM features.

    Written to be passed to ``Dataset.map(..., batched=True)``.

    Args:
        rows: Batch mapping whose ``"exp"`` entry holds the texts to tokenize.
        max_length: Length every sequence is truncated and padded to.
            Defaults to 128.

    Returns:
        dict: Everything the tokenizer returned, plus a ``"labels"`` entry
        that refers to the same values as ``"input_ids"``.
    """
    # `return_tensors` is deliberately not set: the mapped dataset stores the
    # result as plain columns, so building torch tensors here would only add
    # a tensor -> array conversion for every batch.
    input_info = tokenizer(
        rows["exp"],
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )
    # NOTE(review): with `mlm=True` the language-modeling collator is expected
    # to rebuild `labels` from the positions it masks, which would make this
    # column redundant -- confirm before removing it.
    return {**input_info, "labels": input_info["input_ids"]}


# Collator that applies masking to 15% of the tokens of each batch.
collate_fn = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# Train-split slice -> drop unusable rows -> keep only the explanation text
# -> tokenize in batches using two worker processes.
dataset = load_dataset(medmcqa_dataset_path)
mlm_dataset = (
    dataset["train"]
    .select(trainset_range)
    .filter(filter_none)
    .select_columns(["exp"])
    .map(mlm_map_function, batched=True, num_proc=2)
)
print(mlm_dataset)

Downloading readme:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/85.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/936k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.48M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/182822 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6150 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4183 [00:00<?, ? examples/s]

Filter:   0%|          | 0/40000 [00:00<?, ? examples/s]

  self.pid = os.fork()


Map (num_proc=2):   0%|          | 0/40000 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 40000
})


# Hugging Face Trainer

In [7]:
from transformers import Trainer, TrainingArguments

# Training configuration: no checkpoints are written, no experiment tracker
# is used, and the training loss is logged once per epoch.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    num_train_epochs=num_epochs,
    save_strategy="no",  # Disable checkpointing
    # Log at every epoch boundary. A hand-computed
    # `len(mlm_dataset) // batch_size` step count only lines up with an epoch
    # when the dataset size is an exact multiple of the batch size on a
    # single device, and it becomes 0 for a dataset smaller than one batch.
    logging_strategy="epoch",
    report_to=[],  # Disable wandb logging
)

# The collator performs the MLM masking on each batch drawn from the
# pre-tokenized dataset.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=mlm_dataset,
)

results = trainer.train()
print(f"Training Results:\n{results}")

Step,Training Loss
625,2.1361
1250,1.8661
1875,1.7382
2500,1.6441
3125,1.5884
3750,1.5095
4375,1.4281
5000,1.3767
5625,1.3297
6250,1.2473


training results=
TrainOutput(global_step=15625, training_loss=1.199193708984375, metrics={'train_runtime': 14493.5266, 'train_samples_per_second': 68.996, 'train_steps_per_second': 1.078, 'total_flos': 6.5800031232e+16, 'train_loss': 1.199193708984375, 'epoch': 25.0})


In [8]:
# Persist only the model weights (its state dict), wrapped in a dict under
# the "model_state_dict" key, to the configured checkpoint file.
checkpoint_payload = {"model_state_dict": model.state_dict()}
torch.save(checkpoint_payload, checkpoint_file)

In [10]:
if push_model_to_huggingface:
    # Credentials must never be written into the notebook. With no explicit
    # `token`, HfApi uses the credentials configured outside the code: the
    # HF_TOKEN environment variable or the token cached by
    # `huggingface-cli login`. Create a token with write access under
    # Profile > Settings > Access Tokens and provide it one of those ways.
    # SECURITY: any token that was previously committed in this cell is
    # compromised and should be revoked.
    api = HfApi()
    api.upload_file(
        path_or_fileobj=f"./{checkpoint_file}",
        path_in_repo=checkpoint_file,
        repo_id=repo_id,
        repo_type="model",
    )

BioClinicalBert-MLM-Finetuned-40k-25epoch-questions.pth:   0%|          | 0.00/433M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/alibababeig/nlp-hw4/commit/c872571a8ed8960f518ff8936c36c95c6dc21223', commit_message='Upload BioClinicalBert-MLM-Finetuned-40k-25epoch-questions.pth with huggingface_hub', commit_description='', oid='c872571a8ed8960f518ff8936c36c95c6dc21223', pr_url=None, pr_revision=None, pr_num=None)