In [1]:
# Let's compare BERT with XLNET
from transformers import BertTokenizer, BertModel
  
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

bert_model = BertModel.from_pretrained("bert-base-cased")


INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
one_encoded = bert_tokenizer.encode_plus('How much will this cost?', add_special_tokens=True, return_tensors='pt')
two_encoded = bert_tokenizer.encode_plus('Is it expensive?', add_special_tokens=True, return_tensors='pt')


In [3]:
# the CLS token is at the beginning in BERT
one_embedded = bert_model(**one_encoded).last_hidden_state[:,0,:]
two_embedded = bert_model(**two_encoded).last_hidden_state[:,0,:]


In [4]:
import torch

torch.nn.CosineSimilarity()(one_embedded, two_embedded)

tensor([0.9723], grad_fn=<DivBackward0>)

In [5]:
from transformers import XLNetTokenizer, XLNetModel
  
xlnet_tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")

xlnet_model = XLNetModel.from_pretrained("xlnet-base-cased")


Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetModel: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
xlnet_model

XLNetModel(
  (word_embedding): Embedding(32000, 768)
  (layer): ModuleList(
    (0): XLNetLayer(
      (rel_attn): XLNetRelativeAttention(
        (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): XLNetFeedForward(
        (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (layer_1): Linear(in_features=768, out_features=3072, bias=True)
        (layer_2): Linear(in_features=3072, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): XLNetLayer(
      (rel_attn): XLNetRelativeAttention(
        (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): XLNetFeedForward(
        (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (layer_1): Linear(in_features=768, out_features=3072, b

In [8]:
one_encoded = xlnet_tokenizer.encode_plus('How much will this cost?', add_special_tokens=True, return_tensors='pt')
two_encoded = xlnet_tokenizer.encode_plus('Is it expensive?', add_special_tokens=True, return_tensors='pt')

In [11]:
xlnet_tokenizer.convert_ids_to_tokens(one_encoded['input_ids'][0])

['▁How', '▁much', '▁will', '▁this', '▁cost', '?', '<sep>', '<cls>']

In [12]:
# the CLS token is at the end in XLNET
one_embedded = xlnet_model(**one_encoded).last_hidden_state[:,-1,:]
two_embedded = xlnet_model(**two_encoded).last_hidden_state[:,-1,:]


In [13]:
torch.nn.CosineSimilarity()(one_embedded, two_embedded)

tensor([0.9734], grad_fn=<DivBackward0>)

## Fine-tuning XLNET

In [34]:
from transformers import XLNetTokenizer, XLNetForSequenceClassification
from datasets import Dataset

In [35]:
# Ingest 100 tweets from the Kaggle disaster tweet comopetition
import pandas as pd

tweets = pd.read_csv('../data/disaster_sample.csv')

tweets.head(2)

Unnamed: 0,index,id,keyword,location,text,target,label
0,7138,10224,volcano,,@MrMikeEaton @Muazimus_Prime hill hill mountai...,1,1
1,2151,3086,deaths,Blackpool,Cancers equate for around 25% of all deaths in...,1,1


In [41]:
tweet_dataset = Dataset.from_pandas(tweets)

# We will pad our dataset so that our input matrices are the same length and truncate anything longer than 512 tokens
def preprocess(data):
    return tokenizer(data['text'], padding=True, truncation=True)

tweet_dataset = tweet_dataset.map(preprocess, batched=True, batch_size=len(tweet_dataset))

# Dataset has a built in train test split method

tweet_dataset = tweet_dataset.train_test_split(test_size=0.2)

  0%|          | 0/1 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [42]:
xlnet_sequence_classification_model = XLNetForSequenceClassification.from_pretrained(
    'xlnet-base-cased', num_labels=2
)


Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [49]:
from transformers import TrainingArguments, Trainer
import numpy as np

training_args = TrainingArguments(
    output_dir='./xlnet_clf',
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    load_best_model_at_end=True,
    warmup_steps=len(tweet_dataset['train']) // 5,  # number of warmup steps for learning rate scheduler,
    weight_decay = 0.05,
    logging_steps=1,
    log_level='info',
    evaluation_strategy='epoch',
    save_strategy='epoch'
)

# Define accuracy metric:

from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Define the trainer:

trainer = Trainer(
    model=xlnet_sequence_classification_model,
    args=training_args,
    train_dataset=tweet_dataset['train'],
    eval_dataset=tweet_dataset['test'],
    compute_metrics=compute_metrics
)

# Get initial metrics
trainer.evaluate()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the evaluation set  don't have a corresponding argument in `XLNetForSequenceClassification.forward` and have been ignored: id, location, target, text, index, keyword.
***** Running Evaluation *****
  Num examples = 40
  Batch size = 32


{'eval_loss': 0.7227197885513306,
 'eval_accuracy': 0.525,
 'eval_runtime': 7.1168,
 'eval_samples_per_second': 5.62,
 'eval_steps_per_second': 0.281}

In [50]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `XLNetForSequenceClassification.forward` and have been ignored: id, location, target, text, index, keyword.
***** Running training *****
  Num examples = 160
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 50


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6619,0.701364,0.55
2,0.6219,0.661617,0.575
3,0.6361,0.61425,0.65
4,0.4532,0.570361,0.725
5,0.3847,0.571109,0.775
6,0.3734,0.495012,0.825
7,0.4251,0.658546,0.825
8,0.247,0.775116,0.725
9,0.2106,0.788002,0.825
10,0.1801,0.721048,0.85


The following columns in the evaluation set  don't have a corresponding argument in `XLNetForSequenceClassification.forward` and have been ignored: id, location, target, text, index, keyword.
***** Running Evaluation *****
  Num examples = 40
  Batch size = 32
Saving model checkpoint to ./xlnet_clf/checkpoint-5
Configuration saved in ./xlnet_clf/checkpoint-5/config.json
Model weights saved in ./xlnet_clf/checkpoint-5/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `XLNetForSequenceClassification.forward` and have been ignored: id, location, target, text, index, keyword.
***** Running Evaluation *****
  Num examples = 40
  Batch size = 32
Saving model checkpoint to ./xlnet_clf/checkpoint-10
Configuration saved in ./xlnet_clf/checkpoint-10/config.json
Model weights saved in ./xlnet_clf/checkpoint-10/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `XLNetForSequenceClassification.for

TrainOutput(global_step=50, training_loss=0.4311085373163223, metrics={'train_runtime': 1017.7722, 'train_samples_per_second': 1.572, 'train_steps_per_second': 0.049, 'total_flos': 70329819014400.0, 'train_loss': 0.4311085373163223, 'epoch': 10.0})

In [51]:
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `XLNetForSequenceClassification.forward` and have been ignored: id, location, target, text, index, keyword.
***** Running Evaluation *****
  Num examples = 40
  Batch size = 32


{'eval_loss': 0.49501198530197144,
 'eval_accuracy': 0.825,
 'eval_runtime': 7.3625,
 'eval_samples_per_second': 5.433,
 'eval_steps_per_second': 0.272,
 'epoch': 10.0}

In [None]:
# We used the same dataset in a previous BERT lesson and our final accuracy on the validation set of 80%