In [1]:
!pip install transformers



In [23]:
# CURRENT ISSUE:
# TPU sets loss to NAN for some examples, which is not the case for CPU (what about GPU?)
# Apparently, this may be related to TPU not working properly without batch dimensions or 
# for small batches

# PLAN:
# Switch to GPU if possible, otherwise run (*very* slow) on CPU
# Ask someone to look at code?
# Maybe average losses across batches

# URGENT TO DO:
# Check that it trains properly

# MINOR ISSUES IN CURRENT IMPLEMENTATION:
# Pad to maximum and subset to make training faster and avoid retracing?
# Add batching (i.e. update loss every n examples)
# Check metrics
# Implement validation
# Add checkpoints
# Add baseline bag of words model
# Try model2?


In [2]:
from tensorflow import keras
import tensorflow as tf
import numpy as np
import keras.backend as K
from tensorflow.keras import layers, Input
from transformers import TFDistilBertModel
import glob
from tqdm import tqdm
from tensorflow.keras.utils import Progbar
from google.colab import drive
import os

In [3]:
drive.mount('/content/drive/')
%cd 'drive/My Drive/personality_reddit'

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/My Drive/personality_reddit


In [4]:
from tools.datasets import load_tfrecord_triplet_nn1

In [5]:
try:
  tpu_address = 'grpc://' + os.environ['COLAB_TPU_ADDR']
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver(tpu_address)
  tf.config.experimental_connect_to_cluster(tpu)
  tf.tpu.experimental.initialize_tpu_system(tpu)
  strategy = tf.distribute.experimental.TPUStrategy(tpu)
  print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])  
  print("Number of accelerators: ", strategy.num_replicas_in_sync)
except ValueError:
  print('TPU failed to initialize.')

INFO:absl:Entering into master device scope: /job:worker/replica:0/task:0/device:CPU:0


INFO:tensorflow:Initializing the TPU system: grpc://10.78.63.114:8470


INFO:tensorflow:Initializing the TPU system: grpc://10.78.63.114:8470


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


Running on TPU  ['10.78.63.114:8470']
Number of accelerators:  8


In [6]:
encoder_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_transform', 'activation_13', 'vocab_projector', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


## Set hyperparameters

In [7]:
loss_margin = .5
batch_size = 16
learning_rate = 10e-5
dense_shape = 128 # FFN
n_dense = 3 # FFN

## Define triplet loss

In [8]:
class TripletLoss(keras.losses.Loss):

    ''' Compute triplet loss for Bert encodings'''

    def __init__(self, margin=0, raw_enc=True):
        super().__init__()
        self.margin = margin
        self.raw_enc = raw_enc
    
    def call(self, y_true, y_pred):
      
        if self.raw_enc:
            a_emb = tf.reduce_mean(y_pred.last_hidden_state[:, :-2, :], axis=1)
            p_emb = y_pred.last_hidden_state[:, -2, :]
            n_emb = y_pred.last_hidden_state[:, -1, :]
        else:
            a_emb = y_pred[0]
            p_emb = y_pred[1]
            n_emb = y_pred[2]
        
        d_pos = tf.reduce_sum(tf.square(a_emb - p_emb), 1)
        d_neg = tf.reduce_sum(tf.square(a_emb - n_emb), 1)
        loss_val = tf.maximum(0.0, self.margin + d_pos - d_neg)
        loss_val = tf.reduce_mean(loss_val)
        return loss_val

In [9]:
class TripletAccuracy(tf.keras.metrics.Metric):
      
    ''' Metric tracking whether model assigns lower distance to
        positive example '''
    
    def __init__(self, name='accuracy', margin=.5, **kwargs):
        super(TripletAccuracy, self).__init__(name=name, **kwargs)
        self.correct = tf.Variable(0, dtype=tf.float32)
        self.margin = tf.constant(margin, dtype=tf.float32)
        
    def update_state(self, y_true, y_pred, sample_weight=None):
        corr = tf.cast(tf.math.greater(self.margin, y_pred), tf.float32)
        self.correct.assign(tf.reduce_mean(corr))

    def result(self):
        return self.correct

    def reset_states(self):
        self.correct.assign(0.)

## NNet 1

In [10]:
AUTO = tf.data.experimental.AUTOTUNE

In [14]:
input_ids = Input(shape=(512), dtype='int32', name='input_ids')
encoder_model.trainable = True
encodings_trainable = encoder_model(input_ids)
encoder_model.trainable = False
encodings_frozen = encoder_model(input_ids)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: <cyfunction Socket.send at 0x7f1fcbf72e58> is not a module, class, method, function, traceback, frame, or code object
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: <cyfunction Socket.send at 0x7f1fcbf72e58> is not a module, class, method, function, traceback, frame, or code object


The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).


Cause: while/else statement not yet supported


The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.


Cause: while/else statement not yet supported


The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.


In [15]:
model1.trainable = True
model1.summary()

NameError: ignored

## NNet 2

In [13]:
class FFNModel(keras.Model):

    ''' BERT encodings are passed to feedforward layers 
        Number and size can vary '''

    def __init__(self, encoder, 
                dense_shape=128, n_dense=3, 
                trainable=False,
                dense_act='relu', name=None):
        super(FFNModel, self).__init__(name=name)

        self.encoder = encoder
        self.encoder.trainable = trainable
        self.dense_layers = keras.Sequential([keras.layers.Dense(dense_shape, 
                                                                 activation=dense_act) 
                                            for _ in range(n_dense)])
        self.output_signature = tf.float32

    def call(self, input):
        enc = self.encoder(input)
        x_p = self.dense_layers(enc.last_hidden_state[:,-2,:])
        x_n = self.dense_layers(enc.last_hidden_state[:,-1,:])
        x_a = keras.backend.mean(enc.last_hidden_state[:,:-2,:], axis=1)
        x_a = self.dense_layers(x_a)
        return x_a, x_p, x_n

In [14]:
ffn_out = FFNModel(encoder_model, dense_shape=dense_shape, n_dense=n_dense)(input_ids)
model2 = keras.Model(input_ids, ffn_out)

The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.


In [15]:
model2.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_ids (InputLayer)       [(None, 512)]             0         
_________________________________________________________________
ffn_model (FFNModel)         ((None, 128), (None, 128) 66494336  
Total params: 66,494,336
Trainable params: 131,456
Non-trainable params: 66,362,880
_________________________________________________________________


## Load and split dataset

In [13]:
from google.colab import auth
auth.authenticate_user()
!gcloud config set project thematic-cursor-254011

Updated property [core/project].


In [14]:
fre = 'gs://personality_reddit/triplet_nn1/*-of-999.tfrecord'
fnames = tf.io.gfile.glob(fre)

In [15]:
ds_triplet = load_tfrecord_triplet_nn1(filenames=fnames,
                                       deterministic=False,
                                       num_parallel_calls=AUTO)
ds_triplet = ds_triplet.prefetch(AUTO) # Map

In [16]:
n_examples = 167320
n_train = int(167320 * .7)
n_val = int(167320 * .1)
n_test = 167320 - (n_train + n_val)

In [17]:
#ds_triplet = ds_triplet.shuffle(100, seed=0)
#ds_triplet = ds_triplet.map(lambda x: x['input_ids'][:2,:])
ds_train = ds_triplet.take(n_train)
ds_test = ds_triplet.skip(n_train + n_val)
ds_val = ds_triplet.skip(n_train).take(n_val)

In [18]:
ds_train = strategy.experimental_distribute_dataset(ds_train)
ds_val = strategy.experimental_distribute_dataset(ds_val)
ds_test = strategy.experimental_distribute_dataset(ds_test)

## Training protocol

In [138]:
metric = TripletAccuracy(margin=.5)
loss_model1 = TripletLoss(margin=.5, raw_enc=True)
loss_model2 = TripletLoss(margin=.5, raw_enc=False)
optimizer = keras.optimizers.Adam(learning_rate=10e-5)
#model2 = keras.Model(input_ids, ffn_out)

In [19]:
with strategy.scope():

  model = keras.Model(input_ids, encodings_trainable)
  metric_obj = TripletAccuracy(margin=loss_margin)
  loss_obj = TripletLoss(margin=loss_margin, raw_enc=True)
  loss_obj.reduction = tf.keras.losses.Reduction.NONE
  optimizer = keras.optimizers.Adam(learning_rate=learning_rate)

  def compute_loss(enc):
    l = loss_obj(y_true=None, y_pred=enc)
    return tf.math.divide(l, strategy.num_replicas_in_sync)
  
  @tf.function
  def distributed_train_step(dataset_inputs):
    pr_losses, pr_metrics = strategy.run(train_step, args=(dataset_inputs,))
    red_fn = lambda x: strategy.reduce(tf.distribute.ReduceOp.SUM, x, axis=None)
    return red_fn(pr_losses), red_fn(pr_metrics)

  def train_step(inputs):
    with tf.GradientTape() as tape:
      enc = model(inputs)
      loss = compute_loss(enc)
    
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    metric_obj.update_state(y_true=None, y_pred=loss)
    return loss, metric_obj.result()

In [22]:
EPOCHS = 1

with strategy.scope():
  for epoch in range(EPOCHS):

    pb_i = Progbar(n_train, stateful_metrics=['loss', 'correct'])
    tf.print(f'Epoch {int(epoch)}')

    epoch_losses = []
    epoch_metrics = []
    num_batches = 0

    for x in ds_train:
      loss, correct = distributed_train_step(x)
      epoch_losses.append(loss)
      epoch_metrics.append(correct)
      num_batches += 1

      pb_i.add(1, values=[('loss', loss), ('correct', int(correct))])

    metric_obj.reset_states()  







The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.


StagingError: ignored

### Legacy code snippets

In [None]:
  #@tf.function
  #def distributed_test_step(dataset_inputs):
  #  strategy.run(test_step, args=(dataset_inputs,))
  #def test_step(inputs):
  #  images, labels = inputs
  #
  #  predictions = model(images)
  #  loss = loss_object(labels, predictions)
  #
  #  test_loss.update_state(loss)
  #  test_accuracy.update_state(labels, predictions)

In [None]:
#def forward_step(x, model, loss_obj):
#    enc = model(x)
#    loss = loss_obj(y_true=None, y_pred=enc)
#    return enc, loss

In [None]:
    # TESTING LOOP
    #for x in get_validation_dataset():
    #  distributed_test_step(x)

    #template = ("Epoch {}, Loss: {:.2f}, Accuracy: {:.2f}, Test Loss: {:.2f}, "
    #            "Test Accuracy: {:.2f}")
    #print (template.format(epoch+1, train_loss,
    #                       train_accuracy.result()*100, test_loss.result() / strategy.num_replicas_in_sync,
    #                       test_accuracy.result()*100))

    #test_loss.reset_states()
    #rain_accuracy.reset_states()
    #test_accuracy.reset_states()

In [None]:
# Possibly change batch loop into more efficient map function?
def run_epoch(opt, model, loss_obj, metric_obj, 
              training_data, n_examples, forward_only):
    ''' Run single training epoch and return list of losses and metrics'''
    batch_losses, batch_metrics = [], []
    losses, metrics = [], []
    c = 0
    pb_i = Progbar(n_examples, stateful_metrics=['loss', 'correct'])

    # Also edit this (should also be according to batch size)
    if forward_only is True:
      pb_i = Progbar(n_examples, stateful_metrics=['loss', 'correct'])
      for x in training_data:
        enc, loss = forward_step(x, model, loss_obj)
        losses.append(loss)
        metric_obj.update_state(y_true=None, y_pred=loss)
        metrics.append(metric_obj.result())
        pb_i.add(1, values=[('loss', loss), ('correct', int(metric_obj.result()))])
    
    # 
    else:
      pb_i = Progbar(int(n_examples/batch_size), 
                     stateful_metrics=['loss', 'correct'])
      for x in training_data:
        c += 1
        if c % batch_size != 0:
          enc, loss = forward_step(x, model, loss_obj)
          batch_losses.append(loss)
          metric_obj.update_state(y_true=None, y_pred=loss)
          batch_metrics.append(metric_obj.result())
        else:

          with tf.GradientTape() as tape:
              enc, loss = forward_step(x, model, loss_obj)
              batch_losses.append(loss)
              batch_loss = tf.math.reduce_mean(batch_losses)
              gradients = tape.gradient(batch_loss, model.trainable_weights)
              opt.apply_gradients(zip(gradients, model.trainable_weights))
              metric_obj.update_state(y_true=None, y_pred=loss)
              batch_metrics.append(metric_obj.result())
              batch_metric = tf.math.reduce_mean(batch_metrics)

          losses.append(batch_loss)
          batch_losses = []
          metrics.append(batch_metric)
          batch_metrics = []
          pb_i.add(1, values=[('loss', batch_loss), ('correct', int(batch_metric))])
    return losses, metrics

Define general training function

In [None]:
def run_training(n_epochs, 
                 optimizer, model, 
                 loss_fn, metric_obj,
                 training_data, 
                 n_examples,
                 forward_only=False):
    ''' Run full training loop 
    Args:
        n_epochs (int): number of epochs to train
        optimizer (keras.optimizers or str): optimizer
        model (keras.Model): model ot train
        loss_fn (keras.losses.Loss, function or str): loss function
        training_data (TFDataset): dataset to train on 
        forward_only (bool): if True, does not update gradients
    '''
    for epoch in range(n_epochs):
        tf.print(f'Epoch {int(epoch)}')
        losses_train, metrics_train = run_epoch(optimizer, model, 
                                                loss_fn, metric_obj,
                                                training_data, 
                                                n_examples,
                                                forward_only)
        metric_obj.reset_states()
    return losses_train, metrics_train

Train FNN 

In [None]:
losses_nn1_nt, metric_nn1_nt = run_training(n_epochs=1, 
                                            optimizer=optimizer, 
                                            model=model1, 
                                            loss_fn=loss_model1, 
                                            metric_obj=metric,
                                            training_data=ds_train.take(1600),
                                            n_examples=1600,
                                            forward_only=True)

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt
sns.scatterplot(x=range(len(losses_nn1_nt)), y=[l.numpy() for l in losses_nn1_nt])
plt.show()

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt
sns.scatterplot(x=range(len(metric_nn1_nt)), y=[l.numpy() for l in metric_nn1_nt])
plt.show()

In [None]:
losses_nn1, metrics_nn1 = run_training(n_epochs=1, 
                                      optimizer=optimizer, 
                                      model=model1, 
                                      loss_fn=loss_model1, 
                                      metric_obj=metric,
                                      training_data=ds_train.take(10),
                                      n_examples=10,
                                      forward_only=False)

  [n for n in tensors.keys() if n not in ref_input_names])


In [None]:
import seaborn as sns
from matplotlib import pyplot as plt
sns.scatterplot(x=range(len(losses_nn1)), y=[l.numpy() for l in losses_nn1])
plt.show()

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt
sns.scatterplot(x=range(len(metrics_nn1)), y=[l.numpy() for l in metrics_nn1])
plt.show()

In [None]:
losses_nn2, metric_nn2 = run_training(n_epochs=1, 
                                      optimizer=optimizer, 
                                      model=model2, 
                                      loss_fn=loss_model2, 
                                      metric_obj=metric,
                                      training_data=ds_train.take(10), 
                                      n_examples=10)
# Fix:     
# /Users/rr48396/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/eager/def_function.py:702 invalid_creator_scope
# "tf.function-decorated function tried to create "
# ValueError: tf.function-decorated function tried to create variables on non-first call.