In [1]:
import datasets
import numpy as np
import torch
import transformers as trf

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class configs:
    """Notebook-wide settings, read as plain class attributes (never instantiated here)."""

    # Hugging Face Hub dataset identifier passed to `datasets.load_dataset`
    dataset = "yelp_review_full"
    # Pretrained checkpoint used for both the tokenizer and the classifier
    encoder = "bert-base-uncased"
    # Directory handed to `TrainingArguments(output_dir=...)`
    out_dir = "results"

# Finetuning

In [3]:
# Load every split of the configured dataset; the bare expression below
# renders the resulting DatasetDict summary as the cell output.
data_set = datasets.load_dataset(path=configs.dataset)
data_set

Reusing dataset yelp_review_full (/home/shahad/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf)
100%|██████████| 2/2 [00:00<00:00, 112.82it/s]


DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})

In [4]:
# Single source of truth for the sequence length used by the tokenizer and by `tokenize`.
MAX_SEQ_LEN = 128

# `model_max_length` is the tokenizer's length-limit option. The previously used
# `max_seq_length` keyword is not a tokenizer parameter and had no effect.
tokenizer = trf.AutoTokenizer.from_pretrained(configs.encoder, model_max_length=MAX_SEQ_LEN)


def tokenize(text):
    """Tokenize the ``"text"`` field of a dataset example or batch.

    Despite its name, ``text`` is the mapping handed over by ``Dataset.map``
    (a batch when ``batched=True``), not a raw string; only its ``"text"``
    entry is read.

    Every sequence is truncated and padded to exactly ``MAX_SEQ_LEN`` tokens so
    that all examples have the same length.

    Returns:
        The tokenizer output for ``text["text"]``.
    """
    return tokenizer(text["text"], max_length=MAX_SEQ_LEN, padding="max_length", truncation=True)

In [5]:
# Work on a small, reproducibly shuffled subset of each split to keep experiments fast.
SUBSET_SIZE = 1000
SHUFFLE_SEED = 42

subset_rows = range(SUBSET_SIZE)
small_train_dataset = data_set["train"].shuffle(seed=SHUFFLE_SEED).select(subset_rows)
small_eval_dataset = data_set["test"].shuffle(seed=SHUFFLE_SEED).select(subset_rows)

Loading cached shuffled indices for dataset at /home/shahad/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf/cache-a0e621c27d9b360e.arrow
Loading cached shuffled indices for dataset at /home/shahad/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf/cache-61e0da4d9cd46a2c.arrow


In [6]:
# Apply `tokenize` to both subsets in batched mode; the identical options are shared
# through one kwargs dict so the two calls cannot drift apart.
map_options = dict(batched=True, writer_batch_size=32)
small_train_dataset = small_train_dataset.map(tokenize, **map_options)
small_eval_dataset = small_eval_dataset.map(tokenize, **map_options)

100%|██████████| 1/1 [00:00<00:00,  1.21ba/s]
100%|██████████| 1/1 [00:00<00:00,  1.60ba/s]


In [7]:
# Training hyperparameters: a single epoch with 4 examples per device,
# with the run's output directory taken from `configs.out_dir`.
training_options = {
    "output_dir": configs.out_dir,
    "num_train_epochs": 1,
    "per_device_train_batch_size": 4,
}
training_args = trf.TrainingArguments(**training_options)

In [8]:
# Pretrained encoder with a 5-way sequence-classification head.
# `output_attentions=True` makes forward passes return attention weights,
# which the visualization cells further down rely on.
model = trf.AutoModelForSequenceClassification.from_pretrained(
    configs.encoder,
    output_attentions=True,
    num_labels=5,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [9]:
# Wire the model, hyperparameters and data into a Trainer.
# The tokenized evaluation subset was prepared earlier but never handed to the
# Trainer; registering it here makes `trainer.evaluate()` usable. With the
# default evaluation strategy it does not alter the training run itself.
trainer = trf.Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
)

In [10]:
# Run fine-tuning; keep the returned summary under a name and display it as the cell output.
train_result = trainer.train()
train_result

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1000
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 250


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=250, training_loss=1.59669482421875, metrics={'train_runtime': 33.7404, 'train_samples_per_second': 29.638, 'train_steps_per_second': 7.41, 'total_flos': 65779535616000.0, 'train_loss': 1.59669482421875, 'epoch': 1.0})

# Visualization

In [11]:
import bertviz as bv

In [12]:
# Get the BERT layers from the whole model
# Take the underlying BERT encoder (the `.bert` attribute) out of the fine-tuned
# classification model; the cells below call it directly to obtain attention weights.
bert = model.bert

In [13]:
text = 'I love this restaurant'
# Encode to a batch of one sequence of token ids, placed on the same device as the encoder.
encoded = tokenizer.encode(text, return_tensors="pt").to(bert.device)

# The model has just been fine-tuned, so it may still be in training mode, where
# dropout is active and would randomly perturb the attention maps being visualized.
# Switch the encoder to inference mode to get deterministic attentions.
bert.eval()
with torch.no_grad():  # inference only: no need to track gradients
    outs = bert(encoded, output_attentions=True, return_dict=True)

# Read the attentions by name rather than by position in the output.
attention = outs.attentions

In [14]:
# Map the ids of the single encoded sequence back to token strings,
# then render the per-head attention view for them.
token_ids = encoded[0]
tokens = tokenizer.convert_ids_to_tokens(token_ids)
bv.head_view(attention=attention, tokens=tokens)

<IPython.core.display.Javascript object>

In [15]:
# Render the whole-model attention view for the same sentence.
bv.model_view(attention=attention, tokens=tokens)

<IPython.core.display.Javascript object>