In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Subset
from datasets import load_dataset
from transformers import BertTokenizer, BertForMaskedLM, DataCollatorForLanguageModeling, AutoTokenizer
from tqdm.notebook import tqdm
import time
import numpy as np
from sklearn.metrics import accuracy_score

2024-07-18 12:37:18.276733: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-18 12:37:18.276834: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-18 12:37:18.419535: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"

trainset_range = list(range(18000, 28000))
num_epochs = 5
batch_size = 32
lr = 2e-5

In [3]:
# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# model = BertForMaskedLM.from_pretrained("bert-base-uncased")

tokenizer = BertTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = BertForMaskedLM.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [4]:
def filter_none(example):
    return example["exp"] is not None


def mlm_map_function(rows):
    # Tokenize the text with specified tokenizer parameters
    input_info = tokenizer(
        rows["exp"],
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    return {**input_info, "labels": input_info["input_ids"]}


# load MedMCQA
dataset = load_dataset("openlifescienceai/medmcqa")
# mlm_dataset = Subset(dataset["train"], trainset_range)
mlm_dataset = dataset["train"].select(trainset_range)
mlm_dataset = mlm_dataset.filter(filter_none).select_columns(["exp"])
mlm_dataset = mlm_dataset.map(
    mlm_map_function,
    batched=True,
    num_proc=2,
)
print(mlm_dataset)

collate_fn = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)
# train_loader = DataLoader(
#     mlm_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn
# )

Downloading readme:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/85.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/936k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.48M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/182822 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6150 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4183 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10000 [00:00<?, ? examples/s]

  self.pid = os.fork()


Map (num_proc=2):   0%|          | 0/8803 [00:00<?, ? examples/s]

Dataset({
    features: ['exp', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 8803
})


# Hugging Face Trainer

In [5]:
from transformers import Trainer, TrainingArguments


training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    num_train_epochs=num_epochs,
    logging_steps=len(mlm_dataset) // batch_size,  # Log per epoch
    report_to=[],  # Disable wandb logging
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=mlm_dataset,
)

trainer.train()

Step,Training Loss
275,2.1178
550,1.8725
825,1.7832
1100,1.7203
1375,1.7194


TrainOutput(global_step=1380, training_loss=1.8421367507050004, metrics={'train_runtime': 668.1489, 'train_samples_per_second': 65.876, 'train_steps_per_second': 2.065, 'total_flos': 2896188374676480.0, 'train_loss': 1.8421367507050004, 'epoch': 5.0})

In [11]:
checkpoint_file = "BioClinicalBert-MLM-Finetuned.pth"
torch.save(
    {
        "model_state_dict": model.state_dict(),
    },
    checkpoint_file,
)

In [13]:
from huggingface_hub import HfApi

# generate a token from Profile > Setting > Access Tokens with write access
api = HfApi(
    token="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",
)
api.upload_file(
    path_or_fileobj="./BioClinicalBert-MLM-Finetuned.pth",
    path_in_repo="BioClinicalBert-MLM-Finetuned.pth",
    repo_id="alibababeig/nlp-hw4-dataset",
    repo_type="model",
)

BioClinicalBert-MLM-Fintuned.pth:   0%|          | 0.00/433M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/alibababeig/nlp-hw4-dataset/commit/d206173d9eabdd95c4c608069896e48e596a0ba4', commit_message='Upload BioClinicalBert-MLM-Fintuned.pth with huggingface_hub', commit_description='', oid='d206173d9eabdd95c4c608069896e48e596a0ba4', pr_url=None, pr_revision=None, pr_num=None)

In [6]:
# # works for both train and test
# def run_epoch(
#     model,
#     optimizer,
#     data_loader,
#     criterion,
#     device,
#     results,
#     score_funcs=None,
#     prefix="",
#     desc=None,
# ):
#     running_loss = []
#     y_true = []
#     y_pred = []
#     start = time.time()
#     for inputs, labels in tqdm(data_loader, desc=desc, leave=False):
        
#         inputs = inputs.to(device)
#         labels = labels.to(device)

#         y_hat = model(inputs)
#         loss = criterion(y_hat, labels)

#         if model.training:
#             loss.backward()
#             optimizer.step()
#             optimizer.zero_grad()

#         # Store metrics
#         running_loss.append(loss.item())
#         if score_funcs is None:
#             score_funcs = {}
#         if len(score_funcs) > 0 and isinstance(labels, torch.Tensor):
#             # moving labels & predictions back to CPU for computing / storing predictions
#             labels = labels.detach().cpu().numpy()
#             y_hat = y_hat.detach().cpu().numpy()
#             # add to predictions so far
#             y_true.extend(labels.tolist())
#             y_pred.extend(y_hat.tolist())

#     end = time.time()

#     y_pred = np.asarray(y_pred)
#     if (
#         len(y_pred.shape) == 2 and y_pred.shape[1] > 1
#     ):  # We have a classification problem, convert to labels
#         y_pred = np.argmax(y_pred, axis=1)
#     # Else, we assume we are working on a regression problem

#     results[prefix + " loss"].append(np.mean(running_loss))
#     for name, score_func in score_funcs.items():
#         try:
#             results[prefix + " " + name].append(score_func(y_true, y_pred))
#         except:
#             results[prefix + " " + name].append(float("NaN"))
#     return end - start


# def train_model(
#     model,
#     criterion,
#     train_loader,
#     val_loader=None,
#     test_loader=None,
#     score_funcs=None,
#     epochs=50,
#     device="cpu",
#     checkpoint_file=None,
#     lr_scheduler=None,
#     optimizer=None,
#     disable_tqdm=False,
#     log_items=(),
# ):
#     to_track = ["epoch", "total time", "train loss"]
#     if val_loader is not None:
#         to_track.append("val loss")
#     if test_loader is not None:
#         to_track.append("test loss")
#     if score_funcs is not None:
#         for eval_score in score_funcs:
#             to_track.append("train " + eval_score)
#             if val_loader is not None:
#                 to_track.append("val " + eval_score)
#             if test_loader is not None:
#                 to_track.append("test " + eval_score)

#     # Initialization
#     total_train_time = 0
#     results = {}
#     for item in to_track:
#         results[item] = []

#     if optimizer == None:
#         optimizer = torch.optim.AdamW(model.parameters())

#     model.to(device)
#     pbar = tqdm(range(epochs), desc="Epoch", disable=disable_tqdm)
#     for epoch in pbar:
#         model = model.train()

#         total_train_time += run_epoch(
#             model=model,
#             optimizer=optimizer,
#             data_loader=train_loader,
#             criterion=criterion,
#             device=device,
#             results=results,
#             score_funcs=score_funcs,
#             prefix="train",
#             desc="Training",
#         )

#         results["epoch"].append(epoch)
#         results["total time"].append(total_train_time)

#         # Predict the validation set
#         if val_loader is not None:
#             model = model.eval()
#             with torch.no_grad():
#                 run_epoch(
#                     model=model,
#                     optimizer=optimizer,
#                     data_loader=val_loader,
#                     criterion=criterion,
#                     device=device,
#                     results=results,
#                     score_funcs=score_funcs,
#                     prefix="val",
#                     desc="Validating",
#                 )

#         if lr_scheduler is not None:
#             if isinstance(lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
#                 lr_scheduler.step(results["val loss"][-1])
#             else:
#                 lr_scheduler.step()

#         # Predict the validation set
#         if test_loader is not None:
#             model = model.eval()
#             with torch.no_grad():
#                 run_epoch(
#                     model=model,
#                     optimizer=optimizer,
#                     data_loader=test_loader,
#                     criterion=criterion,
#                     device=device,
#                     results=results,
#                     score_funcs=score_funcs,
#                     prefix="test",
#                     desc="Testing",
#                 )

#         log_postfix = {}
#         log_str = f"Epoch [{epoch+1}]: "
#         for log_item in log_items:
#             log_postfix[log_item] = results[log_item][-1]
#             log_str += f"{log_item}: {results[log_item][-1]:.4f}, "

#         pbar.set_postfix(log_postfix)
#         print(log_str)

#     if checkpoint_file is not None:
#         torch.save(
#             {
#                 "epoch": epoch,
#                 "model_state_dict": model.state_dict(),
#                 "optimizer_state_dict": optimizer.state_dict(),
#                 "results": results,
#             },
#             checkpoint_file,
#         )

#     return results

In [7]:
# optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
# score_funcs = {"acc": accuracy_score}
# checkpoint_file = "model_checkpoint.pth"

# results = train_model(
#     model=model,
#     criterion=criterion,
#     train_loader=train_loader,
#     val_loader=None,
#     test_loader=None,
#     optimizer=optimizer,
#     lr_scheduler=scheduler,
#     score_funcs=score_funcs,
#     epochs=num_epochs,
#     device=device,
#     checkpoint_file=checkpoint_file,
#     disable_tqdm=False,
#     log_items=("train loss", "val loss", "train acc", "val acc"),
# )

In [8]:
# # Define training loop with progress bar
# optimizer =   # Adjust learning rate as needed
# num_epochs = 3  # Adjust number of epochs for training

# for epoch in tqdm(range(num_epochs)):
#     model.train()
#     losses = []

#     for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}"):
#         optimizer.zero_grad()
#         outputs = model(**batch)  # Unpack batch data using the model
#         loss = outputs.loss
#         loss.backward()
#         optimizer.step()
#         losses.append(loss.item())

#     avg_loss = sum(losses) / len(losses)
#     print(f"Epoch {epoch + 1} - Average Loss: {avg_loss:.4f}")

# # Save the fine-tuned model (optional)
# model.save_pretrained("fine-tuned_bert_model")
