### Fine-tuning roberta large on Squad dataset for question-answering using adapter-lora concept -Hemant


In [2]:
!pip install -U adapter-transformers datasets

Collecting adapter-transformers
  Downloading adapter_transformers-3.2.1-py3-none-any.whl (6.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.4/6.4 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from adapter-transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m34.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from adapter-transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m62.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from

In [1]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, AdapterTrainer, EvalPrediction
from datasets import load_dataset, load_metric

raw_datasets = load_dataset('squad_v2')

In [2]:
!mkdir outputs
!mkdir saved_model
!mkdir outputs/output_dir
!mkdir outputs/logging_dir

mkdir: cannot create directory ‘outputs’: File exists
mkdir: cannot create directory ‘saved_model’: File exists
mkdir: cannot create directory ‘outputs/output_dir’: File exists
mkdir: cannot create directory ‘outputs/logging_dir’: File exists


In [2]:
do_train = True # False
do_eval = True

# epochs, bs, GA
#evaluation_strategy = "epoch" # no
evaluation_strategy = "no"

# fp16
fp16_opt_level = 'O1'
fp16_backend = "auto"
fp16_full_eval = False

# optimizer (AdamW)
weight_decay = 0.01 # 0.0
adam_beta1 = 0.9
adam_beta2 = 0.999

# scheduler
lr_scheduler_type = 'linear'
warmup_ratio = 0.0
warmup_steps = 0

# logs
logging_strategy = "steps"
logging_first_step = True # False
logging_steps = 500     # if strategy = "steps"
eval_steps = logging_steps # logging_steps

# checkpoints
#save_strategy = "epoch" # steps
save_strategy = "steps" # steps
save_steps = 1000 # if save_strategy = "steps"
save_total_limit = 1 # None

# no cuda, seed
no_cuda = False
seed = 42

# bar
disable_tqdm = False # True
remove_unused_columns = True
path_to_outputs = "./outputs"

# subfolder for model outputs
output_dir = path_to_outputs + '/output_dir'
overwrite_output_dir = True # False

# logs
logging_dir = path_to_outputs + '/logging_dir'
batch_size = 4#16
gradient_accumulation_steps = 1

learning_rate = 1e-4
#num_train_epochs = 6
max_steps = 10

adam_epsilon = 1e-7

fp16 = True

# best model
load_best_model_at_end = True
metric_for_best_model = "loss"
greater_is_better = False

In [3]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-large")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [4]:
model = AutoModelForSeq2SeqLM.from_pretrained("t5-large")

In [5]:
embedding_size = model.get_input_embeddings().weight.shape[0]
if len(tokenizer) > embedding_size:
  model.resize_token_embeddings(len(tokenizer))

if model.config.decoder_start_token_id is None:
  raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined")

In [6]:
max_length = 300 # The maximum length of a feature (question and context)
doc_stride = 128
max_answer_length = 30
padding = "max_length"
max_seq_length = min(max_length, tokenizer.model_max_length)
column_names = raw_datasets["train"].column_names
question_column = "question"
context_column = "context"
answer_column = "answers"

In [7]:
train_dataset = raw_datasets["train"]
eval_examples = raw_datasets["validation"]

In [8]:
max_seq_length

300

In [9]:
def preprocess_squad_batch(examples, question_column: str, context_column: str, answer_column: str):
  questions = examples[question_column]
  contexts = examples[context_column]
  answers = examples[answer_column]
  def generate_input(_question, _context):
    return " ".join(["question:", _question.lstrip(), "context:", _context.lstrip()])
  inputs = [generate_input(question, context) for question, context in zip(questions, contexts)]
  targets = [answer["text"][0] if len(answer["text"]) > 0 else "" for answer in answers]
  return inputs, targets

def preprocess_function(examples):
  inputs, targets = preprocess_squad_batch(examples, question_column, context_column, answer_column)

  model_inputs = tokenizer(inputs, max_length=max_seq_length, padding=padding, truncation=True)
  # Tokenize targets with text_target=...
  labels = tokenizer(text_target=targets, max_length=max_answer_length, padding=padding, truncation=True)

  # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
  # padding in the loss.
  if padding == "max_length":
    labels["input_ids"] = [[(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]]
  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

def preprocess_validation_function(examples):
  inputs, targets = preprocess_squad_batch(examples, question_column, context_column, answer_column)
  model_inputs = tokenizer(inputs, max_length=max_seq_length, padding=padding, truncation=True,
                           return_overflowing_tokens=True,return_offsets_mapping=True)
  # Tokenize targets with the `text_target` keyword argument
  labels = tokenizer(text_target=targets, max_length=max_answer_length, padding=padding, truncation=True)

  # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
  # padding in the loss.
  if padding == "max_length":
      labels["input_ids"] = [
          [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
      ]

  # Since one example might give us several features if it has a long context, we need a map from a feature to
  # its corresponding example. This key gives us just that.
  sample_mapping = model_inputs.pop("overflow_to_sample_mapping")

  # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the
  # corresponding example_id and we will store the offset mappings.
  model_inputs["example_id"] = []
  # Augment the overflowing tokens to the labels
  labels_out = []

  for i in range(len(model_inputs["input_ids"])):
      # One example can give several spans, this is the index of the example containing this span of text.
      sample_index = sample_mapping[i]
      model_inputs["example_id"].append(examples["id"][sample_index])
      labels_out.append(labels["input_ids"][sample_index])

  model_inputs["labels"] = labels_out
  return model_inputs

In [10]:
train_dataset = train_dataset.map(preprocess_function, batched=True,
                                  remove_columns=column_names, desc="Running tokenizer on train dataset")

In [11]:
eval_dataset = eval_examples.map(preprocess_validation_function, batched=True,
                                 remove_columns=column_names, desc="Running tokenizer on validation dataset",)

Running tokenizer on validation dataset:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [12]:
 for name, p in model.named_parameters():
   if p.requires_grad == True:
     print(name)

shared.weight
encoder.block.0.layer.0.SelfAttention.q.weight
encoder.block.0.layer.0.SelfAttention.k.weight
encoder.block.0.layer.0.SelfAttention.v.weight
encoder.block.0.layer.0.SelfAttention.o.weight
encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight
encoder.block.0.layer.0.layer_norm.weight
encoder.block.0.layer.1.DenseReluDense.wi.weight
encoder.block.0.layer.1.DenseReluDense.wo.weight
encoder.block.0.layer.1.layer_norm.weight
encoder.block.1.layer.0.SelfAttention.q.weight
encoder.block.1.layer.0.SelfAttention.k.weight
encoder.block.1.layer.0.SelfAttention.v.weight
encoder.block.1.layer.0.SelfAttention.o.weight
encoder.block.1.layer.0.layer_norm.weight
encoder.block.1.layer.1.DenseReluDense.wi.weight
encoder.block.1.layer.1.DenseReluDense.wo.weight
encoder.block.1.layer.1.layer_norm.weight
encoder.block.2.layer.0.SelfAttention.q.weight
encoder.block.2.layer.0.SelfAttention.k.weight
encoder.block.2.layer.0.SelfAttention.v.weight
encoder.block.2.layer.0.SelfAttentio

In [13]:
new_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
new_params

737668096

In [14]:
from transformers.adapters.configuration import ParallelConfig

adapter_non_linearity = 'relu'
adapter_reduction_factor = 64
leave_out = []
task_name = "squad2"
adapter_config = ParallelConfig(scaling="learned", non_linearity=adapter_non_linearity,
                                reduction_factor=adapter_reduction_factor,)
model.add_adapter(task_name, config=adapter_config)
model.set_active_adapters(task_name)
model.train_adapter([task_name])

In [15]:
for name, p in model.named_parameters():
  if p.requires_grad == True:
    print(name)

encoder.block.0.layer.1.adapters.squad2.scaling
encoder.block.0.layer.1.adapters.squad2.adapter_down.0.weight
encoder.block.0.layer.1.adapters.squad2.adapter_down.0.bias
encoder.block.0.layer.1.adapters.squad2.adapter_up.weight
encoder.block.0.layer.1.adapters.squad2.adapter_up.bias
encoder.block.1.layer.1.adapters.squad2.scaling
encoder.block.1.layer.1.adapters.squad2.adapter_down.0.weight
encoder.block.1.layer.1.adapters.squad2.adapter_down.0.bias
encoder.block.1.layer.1.adapters.squad2.adapter_up.weight
encoder.block.1.layer.1.adapters.squad2.adapter_up.bias
encoder.block.2.layer.1.adapters.squad2.scaling
encoder.block.2.layer.1.adapters.squad2.adapter_down.0.weight
encoder.block.2.layer.1.adapters.squad2.adapter_down.0.bias
encoder.block.2.layer.1.adapters.squad2.adapter_up.weight
encoder.block.2.layer.1.adapters.squad2.adapter_up.bias
encoder.block.3.layer.1.adapters.squad2.scaling
encoder.block.3.layer.1.adapters.squad2.adapter_down.0.weight
encoder.block.3.layer.1.adapters.squad

In [16]:
new_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
new_params

1622832

In [17]:
import numpy as np
from datasets import load_metric

training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        do_train=do_train,
        do_eval=do_eval,
        evaluation_strategy=evaluation_strategy,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        adam_beta1=adam_beta1,
        adam_beta2=adam_beta2,
        adam_epsilon=adam_epsilon,
        #num_train_epochs=num_train_epochs,
        max_steps=max_steps,
        lr_scheduler_type=lr_scheduler_type,
        warmup_ratio=warmup_ratio,
        warmup_steps=warmup_steps,
        logging_dir=logging_dir,         # directory for storing logs
        logging_strategy=evaluation_strategy,
        logging_steps=logging_steps,     # if strategy = "steps"
        save_strategy=evaluation_strategy,          # model checkpoint saving strategy
        save_steps=logging_steps,        # if strategy = "steps"
        save_total_limit=save_total_limit,
        fp16=fp16,
        eval_steps=logging_steps,        # if strategy = "steps"
        load_best_model_at_end=load_best_model_at_end,
        metric_for_best_model=metric_for_best_model,
        greater_is_better=greater_is_better
        )

In [18]:
from transformers import default_data_collator

label_pad_token_id = -100
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=label_pad_token_id,
                                       pad_to_multiple_of=8)
train_adapter=True
#do_save_full_model=train_adapter, # save full model as we finetuned head + embeddings
trainer = AdapterTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
    )

max_steps is given, it will override any value given in num_train_epochs
Using cuda_amp half precision backend


In [19]:
trainer.train()

***** Running training *****
  Num examples = 130319
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 10
  Number of trainable parameters = 1622832
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=10, training_loss=0.0, metrics={'train_runtime': 6.9578, 'train_samples_per_second': 5.749, 'train_steps_per_second': 1.437, 'total_flos': 51538275102720.0, 'train_loss': 0.0, 'epoch': 0.0})

In [21]:
raw_predictions = trainer.predict(eval_dataset)

The following columns in the test set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: example_id, offset_mapping. If example_id, offset_mapping are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 13095
  Batch size = 4


OutOfMemoryError: ignored

In [22]:
!nvidia-smi

Mon Oct  2 16:18:28 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P0    32W /  70W |  13323MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
#validation_features.set_format(type=validation_features.format["type"], columns=list(validation_features.features.keys()))


In [None]:
def compute_metrics(p):
  return metric.compute(predictions=p.predictions, references=p.label_ids)

# Post-processing:
def post_processing_function(examples, features, outputs, stage="eval"):
  # Decode the predicted tokens.
  preds = outputs.predictions
  if isinstance(preds, tuple):
      preds = preds[0]
  decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

  # Build a map example to its corresponding features.
  example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
  feature_per_example = {example_id_to_index[feature["example_id"]]: i for i, feature in enumerate(features)}
  predictions = {}
  # Let's loop over all the examples!
  for example_index, example in enumerate(examples):
      # This is the index of the feature associated to the current example.
      feature_index = feature_per_example[example_index]
      predictions[example["id"]] = decoded_preds[feature_index]

  # Format the result to the format the metric expects.
  formatted_predictions = [{"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items()]
  references = [{"id": ex["id"], "answers": ex[answer_column]} for ex in examples]
  return EvalPrediction(predictions=formatted_predictions, label_ids=references)

In [None]:
final_predictions = postprocess_qa_predictions(raw_datasets["validation"], validation_features, raw_predictions.predictions)


Post-processing 11873 example predictions split into 12711 features.


100%|███████████████████████████████████| 11873/11873 [00:08<00:00, 1370.27it/s]


In [None]:
metric = load_metric("squad_v2")
#formatted_predictions = [{"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in final_predictions.items()]
#references = [{"id": ex["id"], "answers": ex["answers"]} for ex in raw_datasets["validation"]]
compute_metrics(final_predictions)


  metric = load_metric("squad_v2")


{'exact': 83.17190263623347,
 'f1': 86.18698935577652,
 'total': 11873,
 'HasAns_exact': 78.66059379217273,
 'HasAns_f1': 84.69941373500916,
 'HasAns_total': 5928,
 'NoAns_exact': 87.67031118587047,
 'NoAns_f1': 87.67031118587047,
 'NoAns_total': 5945,
 'best_exact': 83.17190263623347,
 'best_exact_thresh': 0.0,
 'best_f1': 86.18698935577635,
 'best_f1_thresh': 0.0}

In [None]:
trainer.model.save_adapter("./saved_model", adapter_name="squad", with_head=True)

Configuration saved in ./saved_model/adapter_config.json
Module weights saved in ./saved_model/pytorch_adapter.bin
Configuration saved in ./saved_model/head_config.json
Module weights saved in ./saved_model/pytorch_model_head.bin
