In [1]:
#%tensorflow_version 2.x
import tensorflow as tf
print(tf.__version__)

2.0.0


In [2]:
from transformers import (TFBertForSequenceClassification, 
                          BertTokenizer,
                          TFRobertaForSequenceClassification, 
                          RobertaTokenizer)

In [3]:
bert_model = TFBertForSequenceClassification.from_pretrained("bert-base-cased")
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [4]:
roberta_model = TFRobertaForSequenceClassification.from_pretrained("roberta-base")
roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

In [5]:
sequence = "Systolic arrays are cool. This 🐳 is cool too."

bert_tokenized_sequence = bert_tokenizer.tokenize(sequence)
roberta_tokenized_sequence = roberta_tokenizer.tokenize(sequence)

print("BERT:", bert_tokenized_sequence)
print("RoBERTa:", roberta_tokenized_sequence)

BERT: ['S', '##ys', '##to', '##lic', 'array', '##s', 'are', 'cool', '.', 'This', '[UNK]', 'is', 'cool', 'too', '.']
RoBERTa: ['Sy', 'st', 'olic', 'Ġarrays', 'Ġare', 'Ġcool', '.', 'ĠThis', 'ĠðŁ', 'Ĳ', '³', 'Ġis', 'Ġcool', 'Ġtoo', '.']


In [6]:
import tensorflow_datasets
data = tensorflow_datasets.load("glue/mrpc")

train_dataset = data["train"]
validation_dataset = data["validation"]

INFO:absl:Overwrite dataset info from restored data version.
INFO:absl:Reusing dataset glue (/home/jupyter/tensorflow_datasets/glue/mrpc/0.0.2)
INFO:absl:Constructing tf.data.Dataset for split None, from /home/jupyter/tensorflow_datasets/glue/mrpc/0.0.2


In [7]:
example = list(train_dataset.__iter__())[0]
print('',
    'idx:      ', example['idx'],       '\n',
    'label:    ', example['label'],     '\n',
    'sentence1:', example['sentence1'], '\n',
    'sentence2:', example['sentence2'],
)

 idx:       tf.Tensor(201, shape=(), dtype=int32) 
 label:     tf.Tensor(1, shape=(), dtype=int64) 
 sentence1: tf.Tensor(b'Tibco has used the Rendezvous name since 1994 for several of its technology products , according to the Palo Alto , California company .', shape=(), dtype=string) 
 sentence2: tf.Tensor(b'Tibco has used the Rendezvous name since 1994 for several of its technology products , it said .', shape=(), dtype=string)


In [8]:
seq0 = example['sentence1'].numpy().decode('utf-8')  # Obtain bytes from tensor and convert it to a string
seq1 = example['sentence2'].numpy().decode('utf-8')  # Obtain bytes from tensor and convert it to a string

print("First sequence:", seq0)
print("Second sequence:", seq1)

First sequence: Tibco has used the Rendezvous name since 1994 for several of its technology products , according to the Palo Alto , California company .
Second sequence: Tibco has used the Rendezvous name since 1994 for several of its technology products , it said .


In [9]:
encoded_bert_sequence = bert_tokenizer.encode(seq0, seq1, add_special_tokens=True, max_length=128)
encoded_roberta_sequence = roberta_tokenizer.encode(seq0, seq1, add_special_tokens=True, max_length=128)


In [10]:
print("BERT tokenizer separator, cls token id:   ", bert_tokenizer.sep_token_id, bert_tokenizer.cls_token_id)
print("RoBERTa tokenizer separator, cls token id:", roberta_tokenizer.sep_token_id, roberta_tokenizer.cls_token_id)


BERT tokenizer separator, cls token id:    102 101
RoBERTa tokenizer separator, cls token id: 2 0


In [11]:
bert_special_tokens = [bert_tokenizer.sep_token_id, bert_tokenizer.cls_token_id]
roberta_special_tokens = [roberta_tokenizer.sep_token_id, roberta_tokenizer.cls_token_id]


In [12]:
def print_in_red(string):
    print("\033[91m" + str(string) + "\033[0m", end=' ')


In [13]:
print("\nBERT tokenized sequence")
output = [print_in_red(tok) if tok in bert_special_tokens else print(tok, end=' ') for tok in encoded_bert_sequence]



BERT tokenized sequence
[91m101[0m 157 13292 2528 1144 1215 1103 16513 15125 11944 1271 1290 1898 1111 1317 1104 1157 2815 2982 117 2452 1106 1103 19585 2858 17762 117 1756 1419 119 [91m102[0m 157 13292 2528 1144 1215 1103 16513 15125 11944 1271 1290 1898 1111 1317 1104 1157 2815 2982 117 1122 1163 119 [91m102[0m 

In [14]:
print("\n\nRoBERTa tokenized sequence")
output = [print_in_red(tok) if tok in roberta_special_tokens else print(tok, end=' ') for tok in encoded_roberta_sequence]



RoBERTa tokenized sequence
[91m0[0m 565 1452 876 34 341 5 29110 42057 766 187 8148 13 484 9 63 806 785 2156 309 7 5 21065 18402 2156 886 138 479 [91m2[0m [91m2[0m 565 1452 876 34 341 5 29110 42057 766 187 8148 13 484 9 63 806 785 2156 24 26 479 [91m2[0m 

In [15]:
from transformers import glue_convert_examples_to_features

bert_train_dataset = glue_convert_examples_to_features(train_dataset, bert_tokenizer, 128, 'mrpc')
bert_train_dataset = bert_train_dataset.shuffle(100).batch(32).repeat(2)

bert_validation_dataset = glue_convert_examples_to_features(validation_dataset, bert_tokenizer, 128, 'mrpc')
bert_validation_dataset = bert_validation_dataset.batch(64)

In [22]:
def token_type_ids_removal(example, label):
    del example["token_type_ids"]
    return example, label

roberta_train_dataset = glue_convert_examples_to_features(train_dataset, roberta_tokenizer, 128, 'mrpc')
roberta_train_dataset = roberta_train_dataset.map(token_type_ids_removal)
roberta_train_dataset = roberta_train_dataset.shuffle(100).batch(32).repeat(2)

roberta_validation_dataset = glue_convert_examples_to_features(validation_dataset, roberta_tokenizer, 128, 'mrpc')
roberta_validation_dataset = roberta_validation_dataset.map(token_type_ids_removal)
roberta_validation_dataset = roberta_validation_dataset.batch(64)

In [23]:
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

In [24]:
bert_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
roberta_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [25]:
print("Fine-tuning BERT on MRPC")
bert_history = bert_model.fit(bert_train_dataset, epochs=3, validation_data=bert_validation_dataset)


Fine-tuning BERT on MRPC
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [26]:
print("\nFine-tuning RoBERTa on MRPC")
roberta_history = roberta_model.fit(roberta_train_dataset, epochs=3, validation_data=roberta_validation_dataset)


Fine-tuning RoBERTa on MRPC


ValueError: in converted code:
    relative to /usr/local/lib/python3.5/dist-packages:

    transformers/modeling_tf_roberta.py:364 call  *
        outputs = self.roberta(inputs, **kwargs)
    tensorflow_core/python/keras/engine/base_layer.py:842 __call__
        outputs = call_fn(cast_inputs, *args, **kwargs)
    transformers/modeling_tf_bert.py:514 call  *
        token_type_ids = tf.fill(input_shape, 0)
    tensorflow_core/python/ops/array_ops.py:171 fill
        result = gen_array_ops.fill(dims, value, name=name)
    tensorflow_core/python/ops/gen_array_ops.py:3602 fill
        "Fill", dims=dims, value=value, name=name)
    tensorflow_core/python/framework/op_def_library.py:545 _apply_op_helper
        (input_name, err))

    ValueError: Tried to convert 'dims' to a tensor and failed. Error: Cannot convert a partially known TensorShape to a Tensor: (None, 128)


In [27]:
print("Evaluating the BERT model")
bert_model.evaluate(bert_validation_dataset)


Evaluating the BERT model


[0.5527119423661914, 0.84068626]