In [1]:
import datasets
import numpy as np
import transformers as trf

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class configs:
    """Notebook-wide settings, read as plain class attributes."""

    # Dataset identifier handed to `datasets.load_dataset`.
    dataset = "yelp_review_full"
    # Pretrained checkpoint used for both the tokenizer and the model.
    encoder = "bert-base-uncased"
    # Directory passed to `TrainingArguments` as `output_dir`.
    out_dir = "results"

# Finetuning

In [3]:
# Load the dataset named in the config (served from the local cache when present).
data_set = datasets.load_dataset(path=configs.dataset)
# Bare trailing expression so the notebook renders the split summary.
data_set

Reusing dataset yelp_review_full (/home/shahad/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf)
100%|██████████| 2/2 [00:00<00:00, 73.74it/s]


DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})

In [4]:
# Tokenizer belonging to the pretrained encoder checkpoint named in the config.
tokenizer = trf.AutoTokenizer.from_pretrained(configs.encoder)


def tokenize(text):
    """Tokenize the ``"text"`` field of a dataset example or batch.

    Despite its name, ``text`` is a mapping (as supplied by ``Dataset.map``)
    whose ``"text"`` entry holds the raw review string(s).

    Returns:
        The tokenizer output, with sequences padded to the maximum length
        and truncated when they exceed it.
    """
    raw_reviews = text["text"]
    return tokenizer(raw_reviews, truncation=True, padding="max_length")

In [5]:
# Work on a fixed-size, seeded random subset of each split so that
# experiments run faster.
small_train_dataset = (
    data_set["train"]
    .shuffle(seed=42)
    .select(range(10000))
)
small_eval_dataset = (
    data_set["test"]
    .shuffle(seed=42)
    .select(range(2000))
)

Loading cached shuffled indices for dataset at /home/shahad/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf/cache-a0e621c27d9b360e.arrow
Loading cached shuffled indices for dataset at /home/shahad/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf/cache-61e0da4d9cd46a2c.arrow


In [6]:
# Tokenize both subsets in batches: train split first, then the eval split.
small_train_dataset, small_eval_dataset = (
    subset.map(tokenize, batched=True)
    for subset in (small_train_dataset, small_eval_dataset)
)

100%|██████████| 10/10 [00:16<00:00,  1.68s/ba]
100%|██████████| 2/2 [00:02<00:00,  1.13s/ba]


In [7]:
metric = datasets.load_metric("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass of the trainer.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` is either
            an array of class scores or a tuple/list whose first element is
            that array.

    Returns:
        The result of ``metric.compute`` for the arg-max predictions.
    """
    logits, labels = eval_pred
    # The model in this notebook is loaded with output_attentions=True, so the
    # trainer can hand over (logits, attentions, ...) instead of a bare logits
    # array. Only the first element carries the class scores.
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [8]:
# Write run artifacts to the configured output directory and evaluate
# once per epoch.
training_args = trf.TrainingArguments(
    evaluation_strategy="epoch",
    output_dir=configs.out_dir,
)

In [9]:
# Classification head with five labels on top of the pretrained encoder.
# Attention weights are requested because the visualization cells further
# down read them from the model output.
model = trf.AutoModelForSequenceClassification.from_pretrained(
    configs.encoder,
    output_attentions=True,
    num_labels=5,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [10]:
# Wire the model, the tokenized subsets, the training arguments and the
# accuracy callback into a single Trainer instance.
trainer = trf.Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=small_eval_dataset,
    train_dataset=small_train_dataset,
)

In [None]:
# The model is loaded with output_attentions=True (needed by the
# visualization section). Unless that output key is ignored, every per-epoch
# evaluation would gather the attention tensors of the whole eval set next to
# the logits and forward them to compute_metrics.
trainer.train(ignore_keys_for_eval=["attentions"])

# Visualization

In [None]:
import bertviz as bv

In [None]:
# Get the bare encoder out of the classification model. `base_model` resolves
# to `model.bert` for BERT checkpoints and keeps working when `configs.encoder`
# names a different architecture.
bert = model.base_model

In [None]:
text = 'I love this restaurant'
encoded = tokenizer.encode(text, return_tensors="pt").to(bert.device)
# Switch off dropout so the attention weights do not vary between runs.
bert.eval()
# Request the attentions explicitly and read them by name, rather than relying
# on the model config flag and on their position in the output.
outs = bert(encoded, output_attentions=True, return_dict=True)
attention = outs.attentions

In [None]:
# Map the ids of the single encoded sentence back to token strings and
# render the per-head attention view.
tokens = tokenizer.convert_ids_to_tokens(encoded[0])
bv.head_view(attention=attention, tokens=tokens)

In [None]:
# Render the model-wide attention view for the same sentence.
bv.model_view(attention=attention, tokens=tokens)