# Explain

I've changed the transformers/models/roberta/modeling_roberta.py file which contains classes for Roberta model.

I added a new adapter class and used it befor layerNorms in the RobertaModel class.

I sent the changed file as well.

changes are in lines 66-81, 308, 312, 391, 396

# Import and setup

In [1]:
from google.colab import output

In [2]:
!pip install datasets
!pip install transformers
!pip install --upgrade accelerate
output.clear()

In [2]:
from transformers import AutoTokenizer, RobertaForSequenceClassification
from datasets import load_dataset
import numpy as np
from datasets import load_metric
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
import torch
import torch.nn as nn

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

# Dataset

In [4]:
imdb = load_dataset("imdb")



  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
train_dataset = imdb["train"].shuffle(seed=42)
test_dataset = imdb["test"].shuffle(seed=42).select([i for i in list(range(300))])
t= train_dataset.train_test_split(test_size=0.2)
train_dataset = t['train'].select([i for i in list(range(1000))])
eval_dataset = t['test'].select([i for i in list(range(300))])



# Loading and Tokenizing

In [6]:
check_point = 'roberta-base'
tokenizer = AutoTokenizer.from_pretrained(check_point)
sentiment_model = RobertaForSequenceClassification.from_pretrained(check_point, num_labels=2).to(device)
output.clear()

In [7]:
sentiment_model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [8]:
for params in sentiment_model.roberta.parameters():
  params.requires_grad = False

In [9]:
for i in range(12):
  for params in sentiment_model.roberta.encoder.layer[i].attention.output.adapter.parameters():
    params.requires_grad = True
  for params in sentiment_model.roberta.encoder.layer[i].output.adapter.parameters():
    params.requires_grad = True

for params in sentiment_model.classifier.parameters():
  params.requires_grad = True

In [10]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)


tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True) 




Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]



# Training

In [11]:
%pip install evaluate
output.clear()

In [12]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [13]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [14]:
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs = 5,
)

trainer = Trainer(
    model= sentiment_model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset= tokenized_eval_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.692258,0.49
2,No log,0.683748,0.683333
3,No log,0.678181,0.796667
4,No log,0.673843,0.8


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.692258,0.49
2,No log,0.683748,0.683333
3,No log,0.678181,0.796667
4,No log,0.673843,0.8


# Testing

In [21]:
import torch
input = tokenizer('This movie was amazing.')
sentiment_model.cpu()
sentiment_model(torch.tensor(input['input_ids']).view(1,-1))

SequenceClassifierOutput(loss=None, logits=tensor([[0.1879, 0.1278]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [22]:
sentiment_model.to(device)
predictions = trainer.predict(tokenized_test_dataset)
preds = np.argmax(predictions.predictions, axis=-1)

In [23]:
metric = evaluate.load('accuracy')
acc= metric.compute(predictions=preds, references=predictions.label_ids)
metric = evaluate.load('recall')
rec = metric.compute(predictions=preds, references=predictions.label_ids)
metric = evaluate.load('precision')
pre = metric.compute(predictions=preds, references=predictions.label_ids)
metric = evaluate.load('f1')
f1 = metric.compute(predictions=preds, references=predictions.label_ids)
print(f1, rec, pre, acc, end='\n')

{'f1': 0.7315175097276264} {'recall': 0.6266666666666667} {'precision': 0.8785046728971962} {'accuracy': 0.77}
