<a href="https://colab.research.google.com/github/sourcecode369/TensorFlow-2.0/blob/master/tensorflow_2.0_docs/TensorFlow%20Core/Tutorials/Text/Transformers%20Sequence%20Classification/HuggingFace_Transformers_Sequence_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%tensorflow_version 2.x
import tensorflow as tf
print(tf.__version__)

TensorFlow 2.x selected.
2.0.0


In [2]:
# Install the Hugging Face Transformers package into the notebook runtime.
# NOTE(review): no version is pinned; the run recorded in this notebook reported
# TensorFlow 2.0.0 — consider pinning a transformers release verified against it.
!pip install transformers



In [0]:
from transformers import TFBertForSequenceClassification, BertTokenizer, TFRobertaForSequenceClassification, RobertaTokenizer, TFGPT2Model, GPT2Tokenizer

In [4]:
# Checkpoint names for the two encoders compared in this notebook.
BERT_CHECKPOINT = "bert-base-cased"
ROBERTA_CHECKPOINT = "roberta-base"

# Each classifier is loaded alongside the tokenizer of the same checkpoint name.
bert_model = TFBertForSequenceClassification.from_pretrained(BERT_CHECKPOINT)
bert_tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)

roberta_model = TFRobertaForSequenceClassification.from_pretrained(ROBERTA_CHECKPOINT)
roberta_tokenizer = RobertaTokenizer.from_pretrained(ROBERTA_CHECKPOINT)

A sequence with no special tokens has been passed to the RoBERTa model. This model requires special tokens in order to work. Please specify add_special_tokens=True in your encoding.
A sequence with no special tokens has been passed to the RoBERTa model. This model requires special tokens in order to work. Please specify add_special_tokens=True in your encoding.


In [0]:
# Sample text used to compare how the two tokenizers split words.
sequence = "Systolic arrays are cool. This is cool too. Artificial Intelligence is the coolest of all."

# `tokenize` yields token strings only (no ids); BERT first, then RoBERTa.
bert_tokenized_sequence, roberta_tokenized_sequence = (
    tokenizer.tokenize(sequence)
    for tokenizer in (bert_tokenizer, roberta_tokenizer)
)

In [6]:
# Show both tokenizations, one per line, so they can be compared side by side.
print(
    f"Bert: {bert_tokenized_sequence}",
    f"Robert: {roberta_tokenized_sequence}",
    sep="\n",
)

Bert: ['S', '##ys', '##to', '##lic', 'array', '##s', 'are', 'cool', '.', 'This', 'is', 'cool', 'too', '.', 'Art', '##ific', '##ial', 'Intelligence', 'is', 'the', 'cool', '##est', 'of', 'all', '.']
Robert: ['Sy', 'st', 'olic', 'Ġarrays', 'Ġare', 'Ġcool', '.', 'ĠThis', 'Ġis', 'Ġcool', 'Ġtoo', '.', 'ĠArtificial', 'ĠIntelligence', 'Ġis', 'Ġthe', 'Ġcoolest', 'Ġof', 'Ġall', '.']


In [0]:
# !pip install --upgrade tensorflow-gpu

In [8]:
import tensorflow as tf
import tensorflow_datasets as tfds
tfds.disable_progress_bar()
print("Tensorflow version: ",tf.__version__)

Tensorflow version:  2.0.0


In [9]:
# Load the GLUE MRPC task; the returned object is indexed by split name.
data = tfds.load('glue/mrpc')

# Keep handles to the two splits used for fine-tuning and validation.
train_dataset, validation_dataset = data["train"], data["validation"]

INFO:absl:Overwrite dataset info from restored data version.
INFO:absl:Reusing dataset glue (/root/tensorflow_datasets/glue/mrpc/0.0.2)
INFO:absl:Constructing tf.data.Dataset for split None, from /root/tensorflow_datasets/glue/mrpc/0.0.2


In [0]:
# Grab only the first training example. `next(iter(...))` stops after a single
# element instead of reading the whole split into a Python list just to index [0].
example = next(iter(train_dataset))

In [11]:
example

{'idx': <tf.Tensor: id=13353, shape=(), dtype=int32, numpy=201>,
 'label': <tf.Tensor: id=13354, shape=(), dtype=int64, numpy=1>,
 'sentence1': <tf.Tensor: id=13355, shape=(), dtype=string, numpy=b'Tibco has used the Rendezvous name since 1994 for several of its technology products , according to the Palo Alto , California company .'>,
 'sentence2': <tf.Tensor: id=13356, shape=(), dtype=string, numpy=b'Tibco has used the Rendezvous name since 1994 for several of its technology products , it said .'>}

In [12]:
train_dataset.element_spec

{'idx': TensorSpec(shape=(), dtype=tf.int32, name=None),
 'label': TensorSpec(shape=(), dtype=tf.int64, name=None),
 'sentence1': TensorSpec(shape=(), dtype=tf.string, name=None),
 'sentence2': TensorSpec(shape=(), dtype=tf.string, name=None)}

In [13]:
next(iter(train_dataset))

{'idx': <tf.Tensor: id=28030, shape=(), dtype=int32, numpy=201>,
 'label': <tf.Tensor: id=28031, shape=(), dtype=int64, numpy=1>,
 'sentence1': <tf.Tensor: id=28032, shape=(), dtype=string, numpy=b'Tibco has used the Rendezvous name since 1994 for several of its technology products , according to the Palo Alto , California company .'>,
 'sentence2': <tf.Tensor: id=28033, shape=(), dtype=string, numpy=b'Tibco has used the Rendezvous name since 1994 for several of its technology products , it said .'>}

In [0]:
# The sentence tensors carry raw bytes; convert both halves of the pair to text
# so they can be handed to the tokenizers.
seq0 = str(example['sentence1'].numpy(), 'utf8')
seq1 = str(example['sentence2'].numpy(), 'utf8')

In [0]:
# Options shared by both tokenizers: insert the model's special tokens and
# pass max_length=128.
encode_options = dict(add_special_tokens=True, max_length=128)

# Encode the sentence pair into a single id sequence per tokenizer.
encoded_bert_sequence = bert_tokenizer.encode(seq0, seq1, **encode_options)
encoded_roberta_sequence = roberta_tokenizer.encode(seq0, seq1, **encode_options)

In [16]:
# Show the special-token ids of each tokenizer.
# Despite the "separator" label, each line prints two ids:
# sep_token_id first, then cls_token_id.
print(f"Bert tokenizer separator: {bert_tokenizer.sep_token_id} {bert_tokenizer.cls_token_id}")
print(f"RoBerta tokenizer separator: {roberta_tokenizer.sep_token_id} {roberta_tokenizer.cls_token_id}")

Bert tokenizer separator: 102 101
RoBerta tokenizer separator: 2 0


In [0]:
# For each tokenizer, collect the ids to highlight when printing an encoded
# sequence: [cls_token_id, sep_token_id].
bert_special_tokens, roberta_special_tokens = (
    [tokenizer.cls_token_id, tokenizer.sep_token_id]
    for tokenizer in (bert_tokenizer, roberta_tokenizer)
)

In [0]:
def print_in_red(string):
  """Print ``string`` in red without ending the current line.

  The argument is converted with ``str()``, wrapped in the ANSI colour-on
  (code 91) and reset (code 0) escape sequences, and followed by a single
  space instead of a newline so that successive calls stay on one line.
  """
  colour_on, colour_off = "\033[91m", "\033[0m"
  print(colour_on, str(string), colour_off, sep='', end=' ')

In [19]:
def print_token_ids(title, token_ids, special_ids):
  """Print ``title`` and then every id in ``token_ids`` on a single line.

  Args:
    title: heading printed on its own line before the ids.
    token_ids: iterable of token ids to print, separated by spaces.
    special_ids: ids that are printed in red via ``print_in_red``.
  """
  print(title)
  for tok in token_ids:
    if tok in special_ids:
      print_in_red(tok)
    else:
      print(tok, end=' ')


# A plain loop is used for printing; nothing is collected, so no list of
# ``None`` results is built along the way.
print_token_ids("\nBERT tokenized sequence", encoded_bert_sequence, bert_special_tokens)
print_token_ids("\nRoBERTa tokenized sequence", encoded_roberta_sequence, roberta_special_tokens)


BERT tokenized sequence
[91m101[0m 157 13292 2528 1144 1215 1103 16513 15125 11944 1271 1290 1898 1111 1317 1104 1157 2815 2982 117 2452 1106 1103 19585 2858 17762 117 1756 1419 119 [91m102[0m 157 13292 2528 1144 1215 1103 16513 15125 11944 1271 1290 1898 1111 1317 1104 1157 2815 2982 117 1122 1163 119 [91m102[0m 
RoBERTa tokenized sequence
[91m0[0m 565 1452 876 34 341 5 29110 42057 766 187 8148 13 484 9 63 806 785 2156 309 7 5 21065 18402 2156 886 138 479 [91m2[0m [91m2[0m 565 1452 876 34 341 5 29110 42057 766 187 8148 13 484 9 63 806 785 2156 24 26 479 [91m2[0m 

In [0]:
from transformers import glue_convert_examples_to_features

# Training pipeline for BERT: convert the MRPC examples to model features
# (max_length 128), then shuffle with a 100-element buffer, batch by 32,
# repeat the batched data twice and prefetch.
# NOTE(review): because of repeat(2), a single pass over this dataset covers
# the training split twice — confirm that is intended alongside epochs=1.
bert_train_dataset = (
    glue_convert_examples_to_features(train_dataset, bert_tokenizer, max_length=128, task='mrpc')
    .shuffle(100)
    .batch(32)
    .repeat(2)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

# Validation pipeline: same feature conversion, batched by 64, no shuffling.
bert_validation_dataset = (
    glue_convert_examples_to_features(validation_dataset, bert_tokenizer, max_length=128, task='mrpc')
    .batch(64)
)

In [0]:
def delete_token_type_ids(example, label):
  """Drop the ``token_type_ids`` feature from one ``(features, label)`` pair.

  Args:
    example: mapping from feature name to value.
    label: label belonging to ``example``; passed through unchanged.

  Returns:
    A ``(features, label)`` tuple where ``features`` is a new dict holding
    every entry of ``example`` except ``token_type_ids``. If that key is not
    present, the features are returned as an unchanged copy.
  """
  # Build a new dict rather than deleting from the argument, so the mapping
  # passed in by the caller is never modified.
  features = {name: value for name, value in example.items() if name != "token_type_ids"}
  return features, label

In [0]:
# Training pipeline for RoBERTa: convert the MRPC examples to model features
# (max_length 128), strip `token_type_ids` from every element, then shuffle
# with a 100-element buffer, batch by 32, repeat twice and prefetch.
roberta_train_dataset = (
    glue_convert_examples_to_features(train_dataset, roberta_tokenizer, max_length=128, task='mrpc')
    .map(delete_token_type_ids, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .shuffle(100)
    .batch(32)
    .repeat(2)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

# Validation pipeline: same conversion and feature stripping, batched by 64.
roberta_validation_dataset = (
    glue_convert_examples_to_features(validation_dataset, roberta_tokenizer, max_length=128, task='mrpc')
    .map(delete_token_type_ids, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .batch(64)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [0]:
def make_optimizer():
  """Return a new Adam optimizer (learning rate 3e-5, epsilon 1e-08, clipnorm 1.0)."""
  return tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)


def make_accuracy_metric():
  """Return a new sparse categorical accuracy metric reported as 'accuracy'."""
  return tf.keras.metrics.SparseCategoricalAccuracy('accuracy')


# The loss is computed from logits; one instance is reused for both models.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Optimizers and metrics accumulate state while a model trains, so each model
# is compiled with its own instances. `optimizer` and `metric` are the ones
# attached to the BERT model.
optimizer = make_optimizer()
metric = make_accuracy_metric()

bert_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
roberta_model.compile(optimizer=make_optimizer(), loss=loss, metrics=[make_accuracy_metric()])

In [24]:
# Now for the beauty of TensorFlow and Keras
# Fine-tune BERT with a single `fit` call: one epoch over the training
# pipeline, validating on the validation pipeline, with TensorBoard logs
# written under bert_logs/.
bert_tensorboard = tf.keras.callbacks.TensorBoard(log_dir='bert_logs/')

bert_history = bert_model.fit(
    bert_train_dataset,
    epochs=1,
    validation_data=bert_validation_dataset,
    callbacks=[bert_tensorboard],
)







In [0]:
# roberta_history = roberta_model.fit(roberta_train_dataset, 
#                               epochs=1, 
#                               validation_data=roberta_validation_dataset,
#                               callbacks = [
#                                            tf.keras.callbacks.TensorBoard(log_dir='roberta_logs/'),
#                               ])

In [30]:
# Score the fine-tuned BERT model on the validation batches. The displayed list
# is [loss, accuracy], matching the loss and metric passed to `compile`.
bert_model.evaluate(bert_validation_dataset, verbose=1)



[0.4082875038896288, 0.8308824]