In [60]:
from tensorflow import keras
import tensorflow as tf
import numpy as np
import keras.backend as K
from tensorflow.keras import layers, Input
from transformers import TFDistilBertModel
import glob
from tqdm import tqdm
from tools.datasets import load_tfrecord_triplet_nn1

In [97]:
# Need to add baseline model
# Define validation loop
# Check log metrics
# Check if loss significantly reduces speed

In [62]:
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:XLA_CPU:0', device_type='XLA_CPU')]

In [63]:
encoder_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_transform', 'vocab_layer_norm', 'vocab_projector', 'activation_13']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [95]:
class TripletLoss(keras.losses.Loss):

    ''' Compute triplet loss for Bert encodings'''

    def __init__(self, margin=0, raw_enc=True):
        super().__init__()
        self.margin = tf.constant(margin, dtype=tf.float32)
        self.raw_enc = raw_enc

    def call(self, y_true, y_pred):
        if self.raw_enc:
            a_emb = keras.backend.mean(y_pred.last_hidden_state[:, :-2, :], axis=1)
            p_emb = y_pred.last_hidden_state[:, -2, :]
            n_emb = y_pred.last_hidden_state[:, -1, :]
        else:
            a_emb = y_pred[0]
            p_emb = y_pred[1]
            n_emb = y_pred[2]
        print(a_emb.shape)
        d_pos = tf.reduce_sum(tf.square(a_emb - p_emb), 1)
        d_neg = tf.reduce_sum(tf.square(a_emb - n_emb), 1)
        loss_val = tf.maximum(0.0, self.margin + d_pos - d_neg)
        return tf.reduce_mean(loss_val)

In [96]:
class TripletAccuracy(tf.keras.metrics.Metric):
      
    ''' Metric tracking whether model assigns lower distance to
        positive example '''
    
    def __init__(self, name='accuracy', **kwargs):
        super(TripletAccuracy, self).__init__(name=name, **kwargs)
        self.correct = tf.Variable(0, dtype=tf.float32)
        
    def update_state(self, y_true, y_pred, sample_weight=None):
        corr = 1. if y_pred == 0.0 else 0.
        self.correct.assign_add(tf.reduce_mean(corr))

    def result(self):
        return self.correct

    def reset_states(self):
        self.correct.assign(0)

## NNet 1

In [66]:
input_ids = Input(shape=(512), dtype='int32', name='input_ids')
encodings = encoder_model(input_ids)
model1 = keras.Model(input_ids, encodings)

The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.


In [67]:
model1.trainable = True
model1.summary()

Model: "functional_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_ids (InputLayer)       [(None, 512)]             0         
_________________________________________________________________
tf_distil_bert_model_1 (TFDi TFBaseModelOutput(last_hi 66362880  
Total params: 66,362,880
Trainable params: 66,362,880
Non-trainable params: 0
_________________________________________________________________


In [68]:
loss_model1 = TripletLoss(margin=.5, raw_enc=True)
optimizer = keras.optimizers.Adam(learning_rate=5e-5)

## NNet 2

In [70]:
dense_shape = 128
n_dense = 3

In [71]:
class FFNModel(keras.Model):

    ''' BERT encodings are passed to feedforward layers 
        Number and size can vary '''

    def __init__(self, encoder, 
                dense_shape=128, n_dense=3, 
                trainable=False,
                dense_act='relu', name=None):
        super(FFNModel, self).__init__(name=name)

        self.encoder = encoder
        self.encoder.trainable = trainable
        self.dense_layers = keras.Sequential([keras.layers.Dense(dense_shape, 
                                                                 activation=dense_act) 
                                            for _ in range(n_dense)])
        self.output_signature = tf.float32

    def call(self, input):
        enc = self.encoder(input)
        x_p = self.dense_layers(enc.last_hidden_state[:,-2,:])
        x_n = self.dense_layers(enc.last_hidden_state[:,-1,:])
        x_a = keras.backend.mean(enc.last_hidden_state[:,:-2,:], axis=1)
        x_a = self.dense_layers(x_a)
        return x_a, x_p, x_n

In [72]:
ffn_out = FFNModel(encoder_model, dense_shape=dense_shape, n_dense=n_dense)(input_ids)
model2 = keras.Model(input_ids, ffn_out)

The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.


In [73]:
model2.summary()

Model: "functional_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_ids (InputLayer)       [(None, 512)]             0         
_________________________________________________________________
ffn_model_2 (FFNModel)       ((None, 128), (None, 128) 66494336  
Total params: 66,494,336
Trainable params: 131,456
Non-trainable params: 66,362,880
_________________________________________________________________


In [74]:
loss_model2 = TripletLoss(margin=.5, raw_enc=False)

## Load the datasets

In [75]:
fnames_triplet = glob.glob('datasets/example/triplet_nn1/*')
ds_triplet = load_tfrecord_triplet_nn1(filenames=fnames_triplet, 
                                       compression_type='GZIP')

## Training protocol

In [76]:
metric = TripletAccuracy()

In [77]:
def apply_gradient(opt, model, loss_obj, x):
    with tf.GradientTape() as tape:
        enc = model(x)
        loss = loss_obj(y_true=None, y_pred=enc)
    gradients = tape.gradient(loss, model.trainable_weights)
    opt.apply_gradients(zip(gradients, model.trainable_weights))
    return enc, loss

In [82]:
#@tf.function
def run_epoch(opt, model, loss_obj, training_data):
    losses = []
    metrics = []
    pbar = tqdm(total=len(list(enumerate(training_data))), leave=False)
    for step, x in enumerate(training_data):
        enc, loss = apply_gradient(opt, model, loss_obj, x)
        losses.append(loss)
        
        metric.update_state(y_true=None, y_pred=loss)
        metric.append(metric.result)

        pbar.set_description(f'Step {int(step)}: loss={float(loss)}')
        pbar.update()
    return losses

Define general training function

In [83]:
def run_training(n_epochs, optimizer, model, loss_fn, training_data):
    for epoch in range(n_epochs):
        print(f'Start of epoch: {int(epoch)}')
        losses_train = run_epoch(optimizer, model, 
                                 loss_fn, training_data, training)
        losses_train_mean = np.mean(losses_train)
        metric.reset_states()
        print(f'Epoch {int(epoch)}; \
              train loss {losses_train_mean}')
    return losses_train, metric

Train FNN 

In [84]:
losses_nn1_nt, metric_nn1_nt = run_training(n_epochs=1, 
                                            optimizer=optimizer, 
                                            model=model1, 
                                            loss_fn=loss_model1, 
                                            training_data=ds_triplet.take(10)) 
# run with validation loop

Start of epoch: 0


  0%|          | 0/10 [00:00<?, ?it/s]

Step 0: loss=0.49529749155044556:   0%|          | 0/10 [00:14<?, ?it/s]

Step 0: loss=0.49529749155044556:  10%|█         | 1/10 [00:14<02:07, 14.17s/it]

Step 1: loss=0.6795383095741272:  10%|█         | 1/10 [00:20<02:07, 14.17s/it] 

Step 1: loss=0.6795383095741272:  20%|██        | 2/10 [00:20<01:34, 11.81s/it]

Step 2: loss=1.2755873203277588:  20%|██        | 2/10 [00:33<01:34, 11.81s/it]

Step 2: loss=1.2755873203277588:  30%|███       | 3/10 [00:33<01:25, 12.16s/it]

Step 3: loss=0.2298920899629593:  30%|███       | 3/10 [00:37<01:25, 12.16s/it]

Step 3: loss=0.2298920899629593:  40%|████      | 4/10 [00:37<00:57,  9.62s/it]

Step 4: loss=2.9019365310668945:  40%|████      | 4/10 [00:43<00:57,  9.62s/it]

Step 4: loss=2.9019365310668945:  50%|█████     | 5/10 [00:43<00:43,  8.61s/it]

Step 5: loss=0.1821185052394867:  50%|█████     | 5/10 [00:49<00:43,  8.61s/it]

Step 5: loss=0.1821185052394867:  60%|██████    | 6/10

<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=0.0>

In [None]:
losses_nn1, metric_nn1 = run_training(n_epochs=1, 
                                      optimizer=optimizer, 
                                      model=model1, 
                                      loss_fn=loss_model1, 
                                      training_data=ds_triplet.take(10))

In [None]:
losses_nn2, metric_nn2 = run_training(n_epochs=1, 
                                      optimizer=optimizer, 
                                      model=model2, 
                                      loss_fn=loss_model2, 
                                      training_data=ds_triplet.take(10))