In [8]:
import tensorflow as tf
from tensorflow import keras
from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering

In [9]:
%%time
model_dir = "pretrained/google/electra-small-discriminator"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
print(f"""{repr(tokenizer)}
model_input_names={repr(tokenizer.model_input_names)}
""")

PreTrainedTokenizerFast(name_or_path='pretrained/google/electra-small-discriminator', vocab_size=30522, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
model_input_names=['input_ids', 'token_type_ids', 'attention_mask']

CPU times: user 15.6 ms, sys: 0 ns, total: 15.6 ms
Wall time: 23.1 ms


In [10]:
%%time
model = TFAutoModelForQuestionAnswering.from_pretrained(model_dir)
print(repr(model.config))

Some layers from the model checkpoint at pretrained/google/electra-small-discriminator were not used when initializing TFElectraForQuestionAnswering: ['discriminator_predictions']
- This IS expected if you are initializing TFElectraForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFElectraForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFElectraForQuestionAnswering were not initialized from the model checkpoint at pretrained/google/electra-small-discriminator and are newly initialized: ['qa_outputs']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ElectraConfig {
  "_name_or_path": "pretrained/google/electra-small-discriminator",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "transformers_version": "4.5.1",
  "type_vocab_size": 2,
  "vocab_size": 30522
}

CPU times: user 750 ms, sys: 141 ms, total: 891 ms
Wall time: 619 ms


In [11]:
# Configure the model for Keras training and list its layers.
model.compile(
    # Adam with a fixed learning rate of 5e-5.
    optimizer=keras.optimizers.Adam(learning_rate=5e-5),
    # The model emits raw logits, hence from_logits=True.
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    # Referenced via `keras` to match the optimizer and loss above.
    metrics=keras.metrics.SparseCategoricalAccuracy(),
)
model.summary()

Model: "tf_electra_for_question_answering_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
electra (TFElectraMainLayer) multiple                  13483008  
_________________________________________________________________
qa_outputs (Dense)           multiple                  514       
Total params: 13,483,522
Trainable params: 13,483,522
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Write the model (still carrying its freshly initialized QA head) to the
# local "tmp" directory.
model.save_pretrained(save_directory="tmp")

In [13]:
# Encode one question/context pair as TensorFlow tensors and show the encoding.
question = "Who was Jim Henson?"
text = "Jim Henson was a nice puppet"
inputs = tokenizer(text=question, text_pair=text, return_tensors="tf")
print(f"{inputs!r}")

{'input_ids': <tf.Tensor: shape=(1, 14), dtype=int32, numpy=
array([[  101,  2040,  2001,  3958, 27227,  1029,   102,  3958, 27227,
         2001,  1037,  3835, 13997,   102]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(1, 14), dtype=int32, numpy=array([[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 14), dtype=int32, numpy=array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int32)>}


In [14]:
# Run a forward pass on the encoded pair and show the returned start/end logits.
# NOTE(review): the QA head was reported as newly initialized when the model was
# loaded, so these logits presumably carry no meaning until the model is fine-tuned.
outputs = model(inputs)
print(f"{outputs!r}")

TFQuestionAnsweringModelOutput(loss=None, start_logits=<tf.Tensor: shape=(1, 14), dtype=float32, numpy=
array([[ 0.14587751,  0.10114153, -0.02014393, -0.17181236, -0.2846812 ,
        -0.25314462,  0.14477026, -0.54829764, -0.48553377, -0.03554715,
        -0.11357768, -0.2037542 ,  0.11834311,  0.14511745]],
      dtype=float32)>, end_logits=<tf.Tensor: shape=(1, 14), dtype=float32, numpy=
array([[-0.05195689,  0.00217859,  0.04442274, -0.17873889, -0.22025372,
         0.15250057, -0.05077677, -0.20025066, -0.3070005 , -0.09159248,
        -0.21574506, -0.00376507, -0.12839645, -0.05079529]],
      dtype=float32)>, hidden_states=None, attentions=None)
