In [1]:
import json
import tensorflow as tf
import time
from tf_transformers.utils.tokenization import BasicTokenizer
from absl import logging

# Enable absl INFO-level logging; the "INFO:absl:" lines in the cell outputs
# of this notebook are emitted at this level.
logging.set_verbosity("INFO")

In [2]:
from tf_transformers.data.squad_utils import (
    read_squad_examples,
    post_clean_train_squad,
    example_to_features_using_fast_sp_alignment,
)

In [3]:
from transformers import RobertaTokenizer
# Load the pretrained 'roberta-base' tokenizer.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# Case-preserving BasicTokenizer (do_lower_case=False); it is handed to
# post_clean_train_squad in a later cell.
basic_tokenizer = BasicTokenizer(do_lower_case=False)

In [4]:
# Marker character passed to the feature builder as its SPECIAL_PIECE argument.
# Presumably the word-start symbol of RoBERTa's byte-level BPE vocabulary — TODO confirm.
# NOTE(review): the variable name misspells "PIECE"; it is kept because a later
# cell references this exact name.
SPECIAL_PEICE = 'Ġ'

In [5]:
# Location of the SQuAD v1.1 training JSON (absolute, machine-specific path).
input_file_path = '/Users/PRVATE/official_datasets/squad_v1/train-v1.1.json'
is_training = True

# Step 1: parse the raw JSON into example objects and report how long it took.
start_time = time.time()
train_examples = read_squad_examples(
    input_file=input_file_path,
    is_training=is_training,
    version_2_with_negative=False,
    translated_input_folder=None,
)
end_time = time.time()
print('Time taken {}'.format(end_time - start_time))

# Step 2: post-process the first 100 examples only (cleans text to avoid some
# unwanted unicode characters); examples that fail come back separately.
train_examples_updated, failed_examples = post_clean_train_squad(
    train_examples[:100], basic_tokenizer
)

# Step 3: convert question, context and answer into tokenized features.
# The converter is a generator, so drain it into a list.
all_features = list(
    example_to_features_using_fast_sp_alignment(
        tokenizer,
        train_examples_updated,
        is_training,
        max_seq_length=384,
        max_query_length=64,
        doc_stride=128,
        SPECIAL_PIECE=SPECIAL_PEICE,
    )
)
# start_time is not reset above, so this figure is the time elapsed since the
# JSON read started (read + clean + featurize).
end_time = time.time()
print("time taken {} seconds".format(end_time - start_time))

INFO:absl:Time taken 0.08691596984863281


Time taken 0.7810819149017334
time taken 1.1453638076782227 seconds


In [6]:
# Convert tokens to ids and add the remaining model inputs
# (input_type_ids, input_mask).
# This step is user-specific / tokenizer-specific.
# E.g. RoBERTa uses input_type_ids = 0 everywhere, whereas BERT uses input_type_ids in [0, 1].

def process_features(features=None, tokens_to_ids=None):
    """Yield one model-ready record per tokenized feature.

    Each record holds the token ids plus an all-zero ``input_type_ids`` vector
    and an all-one ``input_mask`` vector of the same length, together with the
    feature's ``start_position`` and ``end_position`` passed through unchanged.

    Args:
        features: iterable of dicts with keys ``'input_ids'`` (a list of token
            strings), ``'start_position'`` and ``'end_position'``. Defaults to
            the notebook-level ``all_features``.
        tokens_to_ids: callable mapping a list of tokens to a list of ids.
            Defaults to ``tokenizer.convert_tokens_to_ids`` from the notebook
            namespace.

    Yields:
        dict: a new dict for every feature. A single dict must not be reused
        across yields, otherwise a consumer that buffers records would see
        every buffered entry aliased to the last one.
    """
    if features is None:
        features = all_features
    if tokens_to_ids is None:
        tokens_to_ids = tokenizer.convert_tokens_to_ids

    for f in features:
        input_ids = tokens_to_ids(f['input_ids'])
        num_tokens = len(input_ids)
        # Plain Python lists give the same values as a zeros_like/ones_like
        # tensor converted back with .numpy().tolist().
        yield {
            'input_ids': input_ids,
            'input_type_ids': [0] * num_tokens,
            'input_mask': [1] * num_tokens,
            'start_position': f['start_position'],
            'end_position': f['end_position'],
        }
        

# Serialize the processed features to TFRecord files with TFWriter.
# (For smaller data, TFProcessor can be used instead.)
from tf_transformers.data import TFWriter

# Every field shares the same ("var_len", "int") declaration.
schema = {
    field_name: ("var_len", "int")
    for field_name in (
        'input_ids',
        'input_type_ids',
        'input_mask',
        'start_position',
        'end_position',
    )
}
tfwriter = TFWriter(
    schema=schema,
    file_name='squad_tfrecord',
    model_dir='squad_tfrecord_roberta',
    tag='train',
    overwrite=True,
)
# process_features() is called here, so the writer receives the generator itself.
tfwriter.process(parse_fn=process_features())

INFO:absl:Total individual observations/examples written is 100
INFO:absl:All writer objects closed


In [7]:
# Read the TFRecord files back as a batched dataset.
from tf_transformers.data import TFReader
import glob

# Collect every .tfrecord file in the output directory.
all_files = glob.glob("squad_tfrecord_roberta/*.tfrecord")
tf_reader = TFReader(schema=schema, tfrecord_files=all_files)

# x_keys become the model inputs, y_keys the labels.
x_keys = ['input_ids', 'input_type_ids', 'input_mask']
y_keys = ['start_position', 'end_position']
tf_dataset = tf_reader.read_record(
    auto_batch=True,
    batch_size=5,
    x_keys=x_keys,
    y_keys=y_keys,
)

In [9]:
# Pull a single batch for inspection and stop; `sample_inputs` is reused in a
# later cell for a forward pass through the span-selection model.
for sample_inputs, sample_labels in tf_dataset:
    print(sample_inputs, sample_labels)
    break

{'input_ids': <tf.Tensor: shape=(5, 338), dtype=int32, numpy=
array([[    0, 10777,   141, ...,     0,     0,     0],
       [    0,  1121, 35284, ...,     0,     0,     0],
       [    0,  6179,   171, ...,  7077,     4,     2],
       [    0,   970,    32, ...,     0,     0,     0],
       [    0,  6179,   171, ...,  7077,     4,     2]], dtype=int32)>, 'input_mask': <tf.Tensor: shape=(5, 338), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1]], dtype=int32)>, 'input_type_ids': <tf.Tensor: shape=(5, 338), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)>} {'end_position': <tf.Tensor: shape=(5, 1), dtype=int32, numpy=
array([[58],
       [21],
       [25],
       [88],
       [31]], dtype=int32)>, 'start_position': 

In [10]:
from tf_transformers.models import RobertaModel
# RobertaModel(...) is unpacked here into three values, named layer, model and config.
model_layer, model, config = RobertaModel(model_name='roberta-base')
# Load weights from a local checkpoint directory.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
model.load_checkpoint("/Users/PRVATE/tf_transformers_models/roberta-base/")

INFO:absl:We are overwriding `is_training` is False to `is_training` to True with `use_dropout` is False, no effects on your inference pipeline
INFO:absl:Initialized Variables
INFO:absl:Succesful: Model checkpoints matched


In [21]:
from tf_transformers.core import LegacyModel, LegacyLayer


class Span_Selection(LegacyLayer):
    """Span-prediction head on top of a base model.

    A single 2-unit Dense layer is applied to token embeddings produced by the
    wrapped model; output channel 0 is returned as the start logits and
    channel 1 as the end logits.
    """

    def __init__(self, model, use_all_layers=False, activation="tanh", **kwargs):
        """Wrap `model` and create the shared logits projection.

        Args:
            model: base model whose call result is indexed with
                "token_embeddings" / "all_layer_token_embeddings".
            use_all_layers: when True, `call` produces logits for every entry
                of "all_layer_token_embeddings" instead of only the final
                "token_embeddings".
            activation: activation of the 2-unit Dense projection.
            **kwargs: forwarded to the LegacyLayer constructor.
        """
        super().__init__(**kwargs)
        self.model = model
        # The config is read from a different attribute depending on the
        # wrapper type. For any other type, `model_config` is left unset.
        if isinstance(model, LegacyModel):
            self.model_config = model.model_config
        elif isinstance(model, tf.keras.layers.Layer):
            self.model_config = model._config_dict
        self.use_all_layers = use_all_layers
        self.logits_layer = tf.keras.layers.Dense(
            2,
            activation=activation,
            use_bias=True,
            kernel_initializer="glorot_uniform",
            bias_initializer="zeros",
        )

    def _project_to_span_logits(self, token_embeddings):
        """Run the Dense head and return the (start_logits, end_logits) pair."""
        span_logits = self.logits_layer(token_embeddings)
        return span_logits[:, :, 0], span_logits[:, :, 1]

    def call(self, inputs):
        """Compute start/end logits from the wrapped model's embeddings.

        Returns:
            dict with keys "start_logits" and "end_logits". Each value is a
            single tensor, or a list with one tensor per entry of
            "all_layer_token_embeddings" when `use_all_layers` is set.
        """
        result = self.model(inputs)

        if not self.use_all_layers:
            # Last-layer token embeddings only.
            start_logits, end_logits = self._project_to_span_logits(
                result["token_embeddings"]
            )
            return {"start_logits": start_logits, "end_logits": end_logits}

        # One (start, end) pair per layer, all sharing the same Dense head.
        start_logits_outputs, end_logits_outputs = [], []
        for layer_embeddings in result["all_layer_token_embeddings"]:
            start_logits, end_logits = self._project_to_span_logits(layer_embeddings)
            start_logits_outputs.append(start_logits)
            end_logits_outputs.append(end_logits)
        return {'start_logits': start_logits_outputs, 'end_logits': end_logits_outputs}

    def get_model(self):
        """Build a LegacyModel from the wrapped model's inputs to this layer's outputs."""
        base_inputs = self.model.input
        model = LegacyModel(
            inputs=base_inputs,
            outputs=self(base_inputs),
            name='span_selection',
        )
        # Carry the base model's config over to the new model.
        model.model_config = self.model_config
        return model
        

In [22]:
# Reset Keras global state before building the span-selection head.
# NOTE(review): `model` was created in an earlier cell, before this reset —
# confirm that clearing the session at this point is intended.
tf.keras.backend.clear_session()
# use_all_layers=True makes the head emit logits for every layer's embeddings;
# is_training is not a named parameter of Span_Selection, so it is forwarded
# to the LegacyLayer constructor via **kwargs.
span_selection_layer = Span_Selection(model=model,
                                      use_all_layers=True, 
                                      is_training=True)
# Wrap the layer into a model that maps the base model's inputs to the logits.
span_selection_model = span_selection_layer.get_model()

In [41]:
def span_loss(position, logits):
    """Mean sparse softmax cross-entropy between gold positions and logits.

    Args:
        position: integer tensor of gold indices with one value per row of
            `logits`; it may carry extra size-1 dimensions (assumed to be
            e.g. (batch, 1) — TODO confirm against the dataset).
        logits: float tensor whose last axis is the axis being classified
            over (assumed (batch, seq_len) — TODO confirm).

    Returns:
        Scalar tensor: the cross-entropy averaged over all rows.
    """
    # Reshape the labels to the leading dimensions of `logits` instead of
    # calling tf.squeeze with no axis. An axis-less squeeze also removes a
    # batch dimension of size 1, which leaves the labels with the wrong rank
    # for a single-example batch.
    labels = tf.reshape(position, tf.shape(logits)[:-1])
    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels)
    )
    return loss

    
def start_span_loss_all_layers(y_true_dict, y_pred_dict):
    """Average the start-position loss over all entries of y_pred_dict['start_logits'].

    Args:
        y_true_dict: labels dict; 'start_position' is read from it.
        y_pred_dict: model outputs dict; 'start_logits' is iterated, one
            logits tensor per entry.

    Returns:
        Mean of the per-entry `span_loss` values.
    """
    per_layer_losses = [
        span_loss(y_true_dict['start_position'], layer_logits)
        for layer_logits in y_pred_dict['start_logits']
    ]
    return tf.reduce_mean(per_layer_losses)

def end_span_loss_all_layers(y_true_dict, y_pred_dict):
    """Average the end-position loss over all entries of y_pred_dict['end_logits'].

    Args:
        y_true_dict: labels dict; 'end_position' is read from it.
        y_pred_dict: model outputs dict; 'end_logits' is iterated, one
            logits tensor per entry.

    Returns:
        Mean of the per-entry `span_loss` values.
    """
    per_layer_losses = [
        span_loss(y_true_dict['end_position'], layer_logits)
        for layer_logits in y_pred_dict['end_logits']
    ]
    return tf.reduce_mean(per_layer_losses)

In [42]:
# Smoke test: one forward pass on the sample batch captured in an earlier cell.
results = span_selection_model(sample_inputs)

In [43]:
# One loss function per output key; each averages its loss across the
# per-layer logits lists produced by the model.
loss_fn = {'start_logits': start_span_loss_all_layers, 
           'end_logits': end_span_loss_all_layers}
# NOTE(review): compile2 looks like a tf_transformers compile variant that
# takes the loss dict through `custom_loss` — confirm against the library.
span_selection_model.compile2(optimizer=tf.keras.optimizers.Adam(), 
                            loss=None, 
                            custom_loss=loss_fn)

In [44]:
# Short training run: a single epoch capped at 10 steps.
span_selection_model.fit(tf_dataset, epochs=1, steps_per_epoch=10)



















<tensorflow.python.keras.callbacks.History at 0x15ad95f70>