In [1]:
import os
# Restrict CUDA to the GPU with index 6. The variable is assigned here, before
# TensorFlow is imported in the next cell, so the mask is already in place when
# the framework is loaded.
os.environ['CUDA_VISIBLE_DEVICES'] = '6'
from interaction_effects import utils
# Project-local helper whose body is not visible in this notebook.
# NOTE(review): presumably configures TensorFlow/GPU runtime settings — confirm
# in interaction_effects.utils.
utils.set_up_environment()

In [2]:
import tensorflow as tf
import tensorflow_datasets
from transformers import *

# Hyperparameters for the MRPC fine-tuning run. They are named once here so the
# batch sizes, epoch count, dataset repetition and step counts below cannot
# drift apart when one of them is edited.
MAX_SEQ_LENGTH = 128
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 64
NUM_EPOCHS = 2
SHUFFLE_BUFFER_SIZE = 100

# Load dataset, tokenizer, model from pretrained model/vocabulary.
# with_info=True additionally returns the DatasetInfo, which carries the number
# of examples in each split; `data` is still the dict of splits.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
data, data_info = tensorflow_datasets.load('glue/mrpc', with_info=True)

# Derive the step counts from the split sizes instead of hard-coding them.
# -(-a // b) is integer ceiling division, so the final partial batch is counted.
# For the standard GLUE/MRPC splits (3,668 train / 408 validation examples)
# this reproduces the previously hard-coded 115 and 7 steps.
num_train_examples = data_info.splits['train'].num_examples
num_valid_examples = data_info.splits['validation'].num_examples
train_steps_per_epoch = -(-num_train_examples // TRAIN_BATCH_SIZE)
validation_steps = -(-num_valid_examples // VALID_BATCH_SIZE)

# Prepare dataset for GLUE as a tf.data.Dataset instance
train_dataset = glue_convert_examples_to_features(
    data['train'], tokenizer, max_length=MAX_SEQ_LENGTH, task='mrpc')
valid_dataset = glue_convert_examples_to_features(
    data['validation'], tokenizer, max_length=MAX_SEQ_LENGTH, task='mrpc')
# Repeat once per epoch so the dataset yields exactly
# NUM_EPOCHS * train_steps_per_epoch batches for fit() to consume.
train_dataset = (train_dataset
                 .shuffle(SHUFFLE_BUFFER_SIZE)
                 .batch(TRAIN_BATCH_SIZE)
                 .repeat(NUM_EPOCHS))
valid_dataset = valid_dataset.batch(VALID_BATCH_SIZE)

# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
# from_logits=True: the loss applies the softmax itself, so the model output is
# passed in as raw logits.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Train and evaluate using tf.keras.Model.fit()
history = model.fit(train_dataset, epochs=NUM_EPOCHS,
                    steps_per_epoch=train_steps_per_epoch,
                    validation_data=valid_dataset,
                    validation_steps=validation_steps)

INFO:absl:Overwrite dataset info from restored data version.
INFO:absl:Reusing dataset glue (/homes/gws/psturm/tensorflow_datasets/glue/mrpc/0.0.2)
INFO:absl:Constructing tf.data.Dataset for split None, from /homes/gws/psturm/tensorflow_datasets/glue/mrpc/0.0.2


Train for 115 steps, validate for 7 steps
Epoch 1/2
Epoch 2/2


In [3]:
# Peek at a single raw record from the training split, before any tokenization.
raw_preview = data['train'].take(1)
for item in raw_preview:
    print(item)

{'idx': <tf.Tensor: id=57251, shape=(), dtype=int32, numpy=201>, 'label': <tf.Tensor: id=57252, shape=(), dtype=int64, numpy=1>, 'sentence1': <tf.Tensor: id=57253, shape=(), dtype=string, numpy=b'Tibco has used the Rendezvous name since 1994 for several of its technology products , according to the Palo Alto , California company .'>, 'sentence2': <tf.Tensor: id=57254, shape=(), dtype=string, numpy=b'Tibco has used the Rendezvous name since 1994 for several of its technology products , it said .'>}


In [4]:
# Peek at the first element of the prepared (shuffled, batched, repeated)
# training pipeline to inspect the tensors that are fed to the model.
batch_preview = train_dataset.take(1)
for item in batch_preview:
    print(item)

({'input_ids': <tf.Tensor: id=57263, shape=(32, 128), dtype=int32, numpy=
array([[  101,  2892,  1209, ...,     0,     0,     0],
       [  101,  1130,   170, ...,     0,     0,     0],
       [  101,   138, 19959, ...,     0,     0,     0],
       ...,
       [  101,   138,  2370, ...,     0,     0,     0],
       [  101,  1124,  7005, ...,     0,     0,     0],
       [  101,  1153,  4567, ...,     0,     0,     0]], dtype=int32)>, 'attention_mask': <tf.Tensor: id=57262, shape=(32, 128), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>, 'token_type_ids': <tf.Tensor: id=57264, shape=(32, 128), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0,