In [1]:
import datasets
import numpy as np
import transformers as trf

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class configs:
    """Notebook-wide settings, read directly as class attributes."""

    # Directory handed to the training arguments as the output location.
    out_dir = "results"
    # Checkpoint name used for both the tokenizer and the classifier.
    encoder = "bert-base-uncased"
    # Dataset identifier passed to `datasets.load_dataset`.
    dataset = "yelp_review_full"

# Finetuning

In [3]:
# Load every split of the configured dataset; the bare name on the last line
# makes the notebook display the resulting object.
data_set = datasets.load_dataset(path=configs.dataset)
data_set

Reusing dataset yelp_review_full (/home/shahad/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf)
100%|██████████| 2/2 [00:01<00:00,  1.86it/s]


DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})

In [4]:
# Tokenizer that matches the encoder checkpoint selected in `configs`.
tokenizer = trf.AutoTokenizer.from_pretrained(configs.encoder)


def tokenize(text):
    """Tokenize the ``"text"`` field of the given example(s).

    Args:
        text: Mapping indexed with the key ``"text"``. Despite the parameter
            name this is not a bare string; in this notebook the function is
            applied through ``Dataset.map(..., batched=True)``, so it is
            presumably a batch of rows -- confirm against the caller.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that field, with
        truncation enabled and padding set to ``"max_length"``.
    """
    raw_texts = text["text"]
    encoding = tokenizer(raw_texts, truncation=True, padding="max_length")
    return encoding

In [None]:
# Work on a fixed-size, seeded sample of each split so experiments run faster:
# shuffle with seed 42, then keep the first 10000 train / 2000 test rows.
small_train_dataset = (
    data_set["train"]
    .shuffle(seed=42)
    .select(range(10000))
)
small_eval_dataset = (
    data_set["test"]
    .shuffle(seed=42)
    .select(range(2000))
)

In [None]:
# Run `tokenize` over both subsets in batched mode (train first, then eval)
# and rebind the same two names to the tokenized results.
small_train_dataset, small_eval_dataset = (
    subset.map(tokenize, batched=True)
    for subset in (small_train_dataset, small_eval_dataset)
)

In [None]:
def compute_metrics(eval_pred):
    """Compute classification accuracy for the Trainer's evaluation loop.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` is either
            the logits array itself or a tuple/list whose first element is
            the logits. The latter happens when the model returns extra
            outputs, e.g. attentions, because this notebook loads the model
            with ``output_attentions=True``.

    Returns:
        dict: ``{"accuracy": <float>}``, the fraction of rows whose arg-max
        class equals the reference label.
    """
    logits, labels = eval_pred
    # With extra model outputs the predictions arrive as (logits, ...);
    # np.argmax cannot handle that ragged tuple, so keep only the logits.
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)
    # Plain numpy accuracy: avoids `datasets.load_metric`, which is deprecated
    # and no longer available in newer `datasets` releases.
    accuracy = float(np.mean(predictions == np.asarray(labels)))
    return {"accuracy": accuracy}

In [None]:
# Training configuration: results go to the configured output directory and
# evaluation is scheduled with the "epoch" strategy.
training_args = trf.TrainingArguments(
    evaluation_strategy="epoch",
    output_dir=configs.out_dir,
)

In [None]:
# Sequence classifier built on the configured encoder with a 5-label head.
# output_attentions=True makes forward passes also return attention weights,
# which the visualization section reads from the encoder output.
model = trf.AutoModelForSequenceClassification.from_pretrained(
    configs.encoder,
    output_attentions=True,
    num_labels=5,
)

In [None]:
# Wire the model, the training configuration, both tokenized subsets and the
# metric callback into a single Trainer.
trainer = trf.Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=small_eval_dataset,
    train_dataset=small_train_dataset,
)

In [None]:
# The model is loaded with output_attentions=True, so every evaluation forward
# pass also returns attention maps. Without this filter the Trainer gathers
# them for the whole eval set (very large in memory) and hands compute_metrics
# a tuple instead of the logits array. Only evaluation is affected; the model
# still returns attentions for the visualization section.
trainer.train(ignore_keys_for_eval=["attentions"])

# Visualization

In [None]:
import bertviz as bv

In [None]:
# Alias for the `bert` attribute of the classifier; the cells below run this
# object directly on encoded text and read attention weights from its output.
bert = model.bert

In [None]:
# Switch the encoder to inference mode so dropout cannot perturb the attention
# weights being visualized (the module may still be in training mode).
bert.eval()

text = 'I love this restaurant'
# Encode the sentence as a PyTorch tensor on the same device as the encoder.
encoded = tokenizer.encode(text, return_tensors="pt").to(bert.device)
# Request attentions explicitly and read them by name instead of relying on a
# load-time config flag and on attentions being the last output.
outs = bert(encoded, output_attentions=True)
# `attention` is a tuple with one tensor per layer (not a single tensor, so it
# has no `.shape`); inspect e.g. len(attention) and attention[0].shape.
attention = outs.attentions

In [None]:
# Map the ids of the first (only) encoded sequence back to token strings so
# the attention view can label its axes, then render the per-head view.
tokens = tokenizer.convert_ids_to_tokens(encoded[0])
bv.head_view(attention=attention, tokens=tokens)

In [None]:
# Render the whole-model attention view for the same sentence and tokens.
bv.model_view(attention=attention, tokens=tokens)