In [1]:
from tensorflow import keras
import tensorflow as tf
import numpy as np
import keras.backend as K
from tensorflow.keras import layers, Input
from transformers import TFDistilBertModel
import glob
from tqdm import tqdm
from tools.datasets import load_tfrecord_triplet_nn1
from tensorflow.keras.utils import Progbar

Using TensorFlow backend.


In [None]:
# OUTSTANDING ISSUES (TODO)
# - Why is the model re-initialized in the graph?
# - Is tf.function being called (traced) several times?
# - Set up parallelization strategies
# - Add prefetching to the input pipeline?
# - Fix repeated training for model2

In [2]:
# Show which devices TensorFlow can see before any model is built.
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:XLA_CPU:0', device_type='XLA_CPU')]

In [3]:
# Load the pretrained DistilBERT encoder from the 'distilbert-base-uncased' checkpoint.
# The same encoder object is reused by both model1 and model2 below.
encoder_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['activation_13', 'vocab_transform', 'vocab_projector', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [5]:
class TripletLoss(keras.losses.Loss):

    ''' Hinge-style triplet loss on squared Euclidean distances.

        Args:
            margin: value added to (d_pos - d_neg) before clipping at zero.
            raw_enc: if True, `y_pred` must expose `last_hidden_state` and the
                anchor / positive / negative vectors are sliced out of it;
                if False, `y_pred` is indexed as (anchor, positive, negative).
    '''

    def __init__(self, margin=0, raw_enc=True):
        super().__init__()
        self.margin = margin
        self.raw_enc = raw_enc

    def call(self, y_true, y_pred):
        ''' Return the mean over examples of max(0, margin + d_pos - d_neg).

            `y_true` is accepted for API compatibility and never read.
        '''
        if self.raw_enc:
            hidden = y_pred.last_hidden_state
            # Anchor: mean over every position of axis 1 except the last two.
            anchor = tf.reduce_mean(hidden[:, :-2, :], axis=1)
            # Positive: second-to-last position; negative: last position.
            positive = hidden[:, -2, :]
            negative = hidden[:, -1, :]
        else:
            anchor, positive, negative = y_pred[0], y_pred[1], y_pred[2]

        def squared_distance(u, v):
            # Squared Euclidean distance, summed over axis 1.
            return tf.reduce_sum(tf.square(u - v), 1)

        dist_pos = squared_distance(anchor, positive)
        dist_neg = squared_distance(anchor, negative)
        hinge = tf.maximum(0.0, self.margin + dist_pos - dist_neg)
        return tf.reduce_mean(hinge)

In [6]:
class TripletAccuracy(tf.keras.metrics.Metric):

    ''' Metric flagging whether the value passed as `y_pred` is below `margin`.

        Note: each `update_state` call overwrites the stored value; nothing is
        accumulated across calls, so `result()` reflects the latest update only.
    '''

    def __init__(self, name='accuracy', margin=.5, **kwargs):
        super().__init__(name=name, **kwargs)
        # Latest fraction of `y_pred` entries that were below the margin.
        self.correct = tf.Variable(0, dtype=tf.float32)
        self.margin = tf.constant(margin, dtype=tf.float32)

    def update_state(self, y_true, y_pred, sample_weight=None):
        ''' Store the mean of (margin > y_pred); `y_true` and
            `sample_weight` are ignored. '''
        below_margin = tf.math.greater(self.margin, y_pred)
        fraction = tf.reduce_mean(tf.cast(below_margin, tf.float32))
        self.correct.assign(fraction)

    def result(self):
        ''' Return the value stored by the latest update '''
        return self.correct

    def reset_states(self):
        ''' Set the stored value back to zero '''
        self.correct.assign(0.)

## NNet 1

In [7]:
# `shape` must be a tuple that excludes the batch dimension; `(512)` is just the
# int 512, so spell out the one-element tuple: 512 token ids per example.
input_ids = Input(shape=(512,), dtype='int32', name='input_ids')
# NNet 1 is the bare encoder: its raw output is fed straight to the loss.
encodings = encoder_model(input_ids)
model1 = keras.Model(input_ids, encodings)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.


In [8]:
# Fine-tune the whole of model1 (all encoder weights trainable), then inspect it.
model1.trainable = True
model1.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_ids (InputLayer)       [(None, 512)]             0         
_________________________________________________________________
tf_distil_bert_model (TFDist TFBaseModelOutput(last_hi 66362880  
Total params: 66,362,880
Trainable params: 66,362,880
Non-trainable params: 0
_________________________________________________________________


In [9]:
# model1 outputs the raw encoder result, so the loss slices the triplet itself (raw_enc=True).
loss_model1 = TripletLoss(margin=.5, raw_enc=True)
# One Adam instance; it is passed to every run_training call further down.
optimizer = keras.optimizers.Adam(learning_rate=5e-5)

## NNet 2

In [10]:
# Hyper-parameters of the feed-forward head used by NNet 2.
dense_shape = 128  # units in each dense layer
n_dense = 3  # number of stacked dense layers

In [11]:
class FFNModel(keras.Model):

    ''' Encoder followed by a shared stack of dense layers.

        The same dense stack is applied to the anchor, positive and negative
        vectors taken from the encoder's `last_hidden_state`.

        Args:
            encoder: callable model whose output exposes `last_hidden_state`.
            dense_shape: number of units in every dense layer.
            n_dense: how many dense layers to stack.
            trainable: value assigned to `encoder.trainable`. This is set on
                the encoder object that was passed in, so any other model
                sharing that encoder object is affected as well.
            dense_act: activation used by every dense layer.
            name: optional model name.
    '''

    def __init__(self, encoder, 
                 dense_shape=128, n_dense=3, 
                 trainable=False,
                 dense_act='relu', name=None):
        super().__init__(name=name)

        self.encoder = encoder
        self.encoder.trainable = trainable

        head_layers = [
            keras.layers.Dense(dense_shape, activation=dense_act)
            for _ in range(n_dense)
        ]
        self.dense_layers = keras.Sequential(head_layers)
        self.output_signature = tf.float32

    def call(self, input):
        ''' Return the (anchor, positive, negative) outputs of the dense stack '''
        hidden = self.encoder(input).last_hidden_state
        # Second-to-last position -> positive, last position -> negative.
        x_p = self.dense_layers(hidden[:, -2, :])
        x_n = self.dense_layers(hidden[:, -1, :])
        # Anchor: mean over every remaining position of axis 1.
        anchor_mean = keras.backend.mean(hidden[:, :-2, :], axis=1)
        x_a = self.dense_layers(anchor_mean)
        return x_a, x_p, x_n

In [12]:
# NNet 2: the shared encoder plus the feed-forward head, wrapped as a functional model.
# FFNModel is built with its default trainable=False, which sets
# `encoder_model.trainable = False` on the shared encoder object.
ffn_out = FFNModel(encoder_model, dense_shape=dense_shape, n_dense=n_dense)(input_ids)
model2 = keras.Model(input_ids, ffn_out)

The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.


In [13]:
# Inspect model2; the trainable/non-trainable split shows whether the encoder is frozen.
model2.summary()

Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_ids (InputLayer)       [(None, 512)]             0         
_________________________________________________________________
ffn_model (FFNModel)         ((None, 128), (None, 128) 66494336  
Total params: 66,494,336
Trainable params: 131,456
Non-trainable params: 66,362,880
_________________________________________________________________


In [14]:
# model2 already returns an (anchor, positive, negative) tuple, hence raw_enc=False.
loss_model2 = TripletLoss(margin=.5, raw_enc=False)

## Load and split dataset

In [4]:
# glob returns paths in arbitrary, filesystem-dependent order; sort them so the
# seeded shuffle and the train/val/test split below are reproducible across runs.
fnames_triplet = sorted(glob.glob('datasets/example/triplet_nn1/*'))
ds_triplet = load_tfrecord_triplet_nn1(filenames=fnames_triplet, 
                                       compression_type='GZIP')

In [15]:
# Total number of examples in the dataset; every split size derives from it so
# that changing this single value keeps the three splits consistent.
n_examples = 167320
n_train = int(n_examples * .7)  # 70% for training
n_val = int(n_examples * .1)  # 10% for validation
n_test = n_examples - (n_train + n_val)  # remainder for testing

In [16]:
# Shuffle with an order that stays fixed across iterations.
# reshuffle_each_iteration defaults to True, in which case every pass over the
# shuffled dataset yields a new order; the take()/skip() pipelines below would
# then no longer partition the data and examples could leak between splits.
ds_triplet = ds_triplet.shuffle(100, seed=0, reshuffle_each_iteration=False)
ds_train = ds_triplet.take(n_train)
ds_val = ds_triplet.skip(n_train).take(n_val)
ds_test = ds_triplet.skip(n_train + n_val)

## Training protocol

In [17]:
# Single metric instance passed to every training run; its margin equals the loss margin (.5).
metric = TripletAccuracy(margin=.5)

In [54]:
def forward_step(x, model, loss_obj):
    ''' Run `model` on `x` and score its output.

    Returns:
        (model output, loss); the loss is computed with `y_true=None`.
    '''
    outputs = model(x)
    return outputs, loss_obj(y_true=None, y_pred=outputs)

In [60]:
@tf.function(experimental_relax_shapes=True)
def training_step(opt, model, loss_obj, x, forward_only):
    ''' Training step with gradient tape'''
    if forward_only:
        enc, loss = forward_step(x, model, loss_obj)     
    else:
        with tf.GradientTape() as tape:
            enc, loss = forward_step(x, model, loss_obj)
            gradients = tape.gradient(loss, model.trainable_weights)
            opt.apply_gradients(zip(gradients, model.trainable_weights))
    return enc, loss

In [61]:
def run_epoch(opt, model, loss_obj, metric_obj, 
              training_data, n_examples, forward_only):
    ''' Iterate once over `training_data`, one training step per element.

    After every step the metric is updated with the step loss and a progress
    bar (sized by `n_examples`) is advanced by one.

    Returns:
        (losses, metrics): per-step loss values and per-step metric results.
    '''
    progress = Progbar(n_examples, stateful_metrics=['loss', 'correct'])
    step_losses = []
    step_metrics = []

    for batch in training_data:
        # The model output is not needed here, only the loss.
        _, batch_loss = training_step(opt, model, loss_obj, batch, forward_only)
        metric_obj.update_state(y_true=None, y_pred=batch_loss)

        step_losses.append(batch_loss)
        step_metrics.append(metric_obj.result())

        bar_values = [('loss', batch_loss),
                      ('correct', int(metric_obj.result()))]
        progress.add(1, values=bar_values)
    return step_losses, step_metrics

Define general training function

In [62]:
def run_training(n_epochs, 
                 optimizer, model, 
                 loss_fn, metric_obj,
                 training_data, 
                 n_examples,
                 forward_only=False):
    ''' Run the full training loop.

    Args:
        n_epochs (int): number of epochs to train
        optimizer (keras.optimizers): optimizer applied at every step
        model (keras.Model): model to train
        loss_fn (keras.losses.Loss): loss object, called with y_true=None
        metric_obj (keras.metrics.Metric): metric updated with each step loss;
            it is reset at the end of every epoch
        training_data (TFDataset): dataset to train on
        n_examples (int): number of steps per epoch, used to size the progress bar
        forward_only (bool): if True, does not update the weights
    Returns:
        (losses, metrics) of the LAST epoch only; two empty lists if
        n_epochs is 0.
    '''
    # Bind the results up front so that n_epochs == 0 returns empty lists
    # instead of raising UnboundLocalError.
    losses_train, metrics_train = [], []
    for epoch in range(n_epochs):
        tf.print(f'Epoch {int(epoch)}')
        losses_train, metrics_train = run_epoch(optimizer, model, 
                                                loss_fn, metric_obj,
                                                training_data, 
                                                n_examples,
                                                forward_only)
        metric_obj.reset_states()
    return losses_train, metrics_train

Train the models (NNet 1, then NNet 2)

In [63]:
# One epoch of model1 on the first 10 training examples.
# NOTE(review): forward_only=False means the weights ARE updated in this run.
# If the `_nt` suffix stands for "not trained", this should probably be
# forward_only=True — confirm.
losses_nn1_nt, metric_nn1_nt = run_training(n_epochs=1, 
                                            optimizer=optimizer, 
                                            model=model1, 
                                            loss_fn=loss_model1, 
                                            metric_obj=metric,
                                            training_data=ds_train.take(10),
                                            n_examples=10,
                                            forward_only=False)

Epoch 0
The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.
 1/10 [==>...........................] - ETA: 40s - loss: 0.3590 - correct: 1.0000The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.
 2/10 [=====>........................] - ETA: 35s - loss: 0.1741 - correct: 1.0000The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config 

In [None]:
# Train model1 for one epoch on the first 10 training examples
# (forward_only is left at its default, False, so the weights are updated).
losses_nn1, metric_nn1 = run_training(n_epochs=1, 
                                      optimizer=optimizer, 
                                      model=model1, 
                                      loss_fn=loss_model1, 
                                      metric_obj=metric,
                                      training_data=ds_train.take(10),
                                      n_examples=10)

In [None]:
# Train model2 for one epoch, reusing the optimizer and metric instances used for model1.
losses_nn2, metric_nn2 = run_training(n_epochs=1, 
                                      optimizer=optimizer, 
                                      model=model2, 
                                      loss_fn=loss_model2, 
                                      metric_obj=metric,
                                      training_data=ds_train.take(10), 
                                      n_examples=10)
# FIXME: when this cell was last run it failed with
#   ValueError: tf.function-decorated function tried to create variables on non-first call.
# raised from tensorflow/python/eager/def_function.py:702 (invalid_creator_scope).

In [None]:
# Other TODOs:
# - Add a baseline model
# - Add validation