<a href="https://colab.research.google.com/github/schwarzmarcel/MasterThesis_MSchwarz/blob/main/DAPT_full_abstracts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install and import necessary libraries


In [None]:
!pip install transformers
!pip install datasets

In [None]:
import numpy as np 
import pandas as pd
import torch
from torch.utils import data 
from sklearn.model_selection import train_test_split
import datasets
from datasets import Dataset
from datasets import load_dataset
from transformers import (
    CONFIG_MAPPING,
    MODEL_FOR_MASKED_LM_MAPPING,
    AutoConfig,
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    set_seed,
    RobertaModel, 
    RobertaTokenizerFast, 
    RobertaForMaskedLM
)
from transformers.trainer_utils import get_last_checkpoint, is_main_process
import seaborn as sns
import matplotlib.pyplot as plt
from pylab import rcParams
from matplotlib import rc

In [None]:
# One seed for NumPy and PyTorch; the same constant is reused further down for
# the train/test split and the training arguments.
RANDOM_SEED = 42

np.random.seed(seed=RANDOM_SEED)
torch.manual_seed(seed=RANDOM_SEED)

# Check for GPU

A Tesla P100 GPU is recommended, since this procedure takes a few hours.

In [None]:
# Show which NVIDIA GPU (if any) is attached to this runtime
!nvidia-smi

In [None]:
# Use the first CUDA device when one is visible, otherwise fall back to the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Echo the selected device as the cell output
device

# Prepare Data


*   Load the unlabeled dataset with >31,500 abstracts.
*   Reduce the dataset to 80% of its size. This is done because training on the full set takes too long with RoBERTa; it is not necessary with DistilBERT.



In [None]:
# Read the raw export, drop the columns that are not used in this notebook and
# expose the "Sentences" column under the name "text".
df = (
    pd.read_json("pubmed_papers.json")
    .drop(columns=["Doi", "PMID", "Authors", "Title", "Abstract", "Extractive", "Abstractive", "Methods"])
    .rename(columns={"Sentences": "text"})
)
# Preview the last rows as the cell output
df.tail()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
df_train, df_test = train_test_split(
    df,
    random_state=RANDOM_SEED,
    test_size=0.2,
)

In [None]:
# Renumber both splits from 0 and discard the row labels left over from the split
df_train, df_test = (
    split.reset_index(drop=True) for split in (df_train, df_test)
)

In [None]:
# reduce dataset: keep the first 80% of each split.
# The fraction is computed from the actual split sizes instead of hard-coded row
# counts (20176 / 5044 are exactly 80% of 25220 / 6305 rows), so the cell stays
# correct if the input file changes. Integer arithmetic avoids float rounding.
KEEP_PERCENT = 80
# .copy() yields independent frames, so assigning columns on them later does not
# operate on a slice of another DataFrame (pandas SettingWithCopyWarning).
df_train = df_train.iloc[: len(df_train) * KEEP_PERCENT // 100].copy()
df_test = df_test.iloc[: len(df_test) * KEEP_PERCENT // 100].copy()

In [None]:
# Collapse each abstract's list of sentences into one space-separated string
df_train["text"] = df_train["text"].apply(" ".join)
df_test["text"] = df_test["text"].apply(" ".join)

In [None]:
# Wrap both pandas splits as Hugging Face `Dataset` objects
dataset_train, dataset_test = (
    Dataset.from_pandas(split) for split in (df_train, df_test)
)

# Define model and tokenization function

This implementation relies heavily on ready-made functions from the Hugging Face libraries, which makes the training process easier. It works much the same way as the previous DAPT notebook that I used for BERT.

The tokenization function uses the same special token positioning as for the fine-tuning task.

In [None]:
# Checkpoint that is adapted to the domain.
PRE_TRAINED_MODEL_NAME = "roberta-base" # "distilbert-base-uncased"
# AutoTokenizer picks the tokenizer class that belongs to the checkpoint name, so
# switching PRE_TRAINED_MODEL_NAME does not require editing this line as well.
# use_fast=True requests the fast tokenizer implementation explicitly.
tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME, use_fast=True)

In [None]:
# Load the checkpoint through the masked-LM auto class
model = AutoModelForMaskedLM.from_pretrained(PRE_TRAINED_MODEL_NAME)
# Move the model to the selected device; the returned module is shown as cell output
model.to(device)

In [None]:
# Length (in tokens) that tokenize_function truncates or pads every example to
MAX_LEN = 512

In [None]:
def tokenize_function(examples):
  """Encode one example into a fixed-length masked-LM input.

  Every text segment is tokenized on its own, so each segment carries the
  special tokens the tokenizer adds per call. The per-segment encodings are
  concatenated and the result is truncated or padded to exactly ``MAX_LEN``
  positions. Uses the module-level ``tokenizer`` and ``MAX_LEN``.

  Args:
    examples: Mapping with a ``"text"`` entry. A single string is encoded as
      one sequence of at most ``MAX_LEN`` tokens; an iterable of sentence
      strings is encoded sentence by sentence with a 128-token cap each.

  Returns:
    Dict with ``"input_ids"``, ``"attention_mask"`` and
    ``"special_tokens_mask"``, each a list of length ``MAX_LEN``.
  """
  text = examples["text"]
  if isinstance(text, str):
    # Iterating over a string would yield single characters, each of which
    # would be encoded as its own "sentence". Treat it as one segment instead
    # and let it use the whole length budget.
    segments = [text]
    segment_limit = MAX_LEN
  else:
    segments = text
    segment_limit = 128

  input_ids = []
  attention_mask = []
  special_tokens_mask = []
  for segment in segments:
    encoded = tokenizer(
        segment,
        truncation=True,
        max_length=segment_limit,
        return_special_tokens_mask=True
    )
    input_ids.extend(encoded["input_ids"])
    attention_mask.extend(encoded["attention_mask"])
    special_tokens_mask.extend(encoded["special_tokens_mask"])

  if len(input_ids) > MAX_LEN:
    input_ids = input_ids[:MAX_LEN]
    attention_mask = attention_mask[:MAX_LEN]
    special_tokens_mask = special_tokens_mask[:MAX_LEN]
    # The cut may fall inside a segment: close it with a separator token and
    # flag that position as special.
    input_ids[MAX_LEN-1] = tokenizer.sep_token_id
    special_tokens_mask[MAX_LEN-1] = 1
  else:
    pad_len = MAX_LEN - len(input_ids)
    # Take the pad id from the tokenizer instead of assuming it is 1, which
    # only holds for some vocabularies.
    input_ids.extend([tokenizer.pad_token_id] * pad_len)
    attention_mask.extend([0] * pad_len)
    # Padding positions are special tokens, independent of the pad id value.
    special_tokens_mask.extend([1] * pad_len)

  return {
      "input_ids": input_ids,
      "attention_mask": attention_mask,
      "special_tokens_mask": special_tokens_mask
  }

In [None]:
# Tokenize the training split; the raw "text" column is removed from the result
tokenized_train = dataset_train.map(tokenize_function, remove_columns=["text"])

In [None]:
# Tokenize the held-out split the same way; the raw "text" column is removed
tokenized_test = dataset_test.map(tokenize_function, remove_columns=["text"])

In [None]:
# Collator for language modelling, configured with a masking probability of 0.15
data_collator = DataCollatorForLanguageModeling(
    mlm_probability=0.15,
    tokenizer=tokenizer,
)

# Training and validation

This is where the actual DAPT happens. The training hyperparameters are defined first, and the Hugging Face Trainer then does the work for us. During training, some metrics are reported after every epoch. On a Tesla P100 this should take ~5 hours for RoBERTa and ~3.5 hours for DistilBERT with the full dataset.

In [None]:
# Hyperparameters for the domain-adaptive pre-training run
training_args = TrainingArguments(**{
    # NOTE(review): empty path - checkpoints presumably end up in the current
    # working directory; confirm this is intended.
    "output_dir": "",
    "overwrite_output_dir": True,
    "seed": RANDOM_SEED,
    # evaluation once per epoch, checkpoint interval of 5000 steps
    "evaluation_strategy": "epoch",
    "save_steps": 5000,
    # batch sizes and run length
    "per_device_train_batch_size": 10,
    "per_device_eval_batch_size": 10,
    "num_train_epochs": 10.0,
    # optimizer and schedule
    "learning_rate": 6e-4,  # alternatives noted by the author: 1e-4, 3e-5
    "warmup_ratio": 0.06,
    "weight_decay": 0.01,
    "adam_beta2": 0.98,
    "adam_epsilon": 1e-6,
    # NOTE(review): 0.0 presumably turns gradient clipping off - verify
    "max_grad_norm": 0.0,
})

In [None]:
# Bundle model, hyperparameters, both tokenized splits and the masking collator
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)

In [None]:
# Optional locations to resume training from; both unset by default
last_checkpoint = model_path = None

In [None]:
%%time

# last_checkpoint takes precedence over model_path; if neither is set,
# None is passed and no checkpoint is resumed.
checkpoint = last_checkpoint if last_checkpoint is not None else model_path
train_result = trainer.train(resume_from_checkpoint=checkpoint)

# Print the metrics attached to the training result
metrics = train_result.metrics
print(metrics)

In [None]:
# Save only the base model (the part without the masked-LM head).
# `base_model` resolves to the architecture-specific attribute (`model.roberta`
# for a RoBERTa checkpoint), so this line keeps working when
# PRE_TRAINED_MODEL_NAME points to a different architecture.
model.base_model.save_pretrained("roberta_dapt")