In [183]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering
from tqdm import tqdm

In [184]:
pd.set_option("use_inf_as_na", True)
pd.set_option("max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option('max_colwidth', 9999)

In [185]:
%%time
model_dir = "pretrained/google/electra-small-discriminator"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
print(f"""{repr(tokenizer)}
model_input_names={repr(tokenizer.model_input_names)}
""")

PreTrainedTokenizerFast(name_or_path='pretrained/google/electra-small-discriminator', vocab_size=30522, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
model_input_names=['input_ids', 'token_type_ids', 'attention_mask']

CPU times: user 31.2 ms, sys: 0 ns, total: 31.2 ms
Wall time: 29.2 ms


In [186]:
train = pd.read_parquet("input/squad/train.parquet")
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130319 entries, 0 to 130318
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             130319 non-null  object
 1   is_impossible  130319 non-null  int8  
 2   question       130319 non-null  object
 3   answer_start   130319 non-null  int16 
 4   answer_end     130319 non-null  int16 
 5   answer_text    130319 non-null  object
 6   context        130319 non-null  object
dtypes: int16(2), int8(1), object(4)
memory usage: 4.6+ MB


In [187]:
def preprocess(df, tokenizer):
    def _gen():
        for t in df.itertuples():
            q = getattr(t, "question")
            c = getattr(t, "context")
            inputs = tokenizer(q, c, max_length=512, truncation=False, return_tensors="np")
            answer_start, answer_end = 0, 0
            if getattr(t, "is_impossible") == 0:
                pass
            yield (
                {
                    "input_ids": inputs["input_ids"][0],
                    "token_type_ids": inputs["token_type_ids"][0],
                    "attention_mask": inputs["attention_mask"][0],
                },
                {
                    "answer_start": answer_start,
                    "answer_end": answer_end,
                },
            )
    
    return tf.data.Dataset.from_generator(
        _gen,
        output_signature=(
            {
                "input_ids": tf.TensorSpec(shape=[None], dtype=tf.int32),
                "token_type_ids": tf.TensorSpec(shape=[None], dtype=tf.int32),
                "attention_mask": tf.TensorSpec(shape=[None], dtype=tf.int32),
            },
            {
                "answer_start": tf.TensorSpec(shape=[], dtype=tf.int32),
                "answer_end": tf.TensorSpec(shape=[], dtype=tf.int32),
            }
        )
    )

In [188]:
train = preprocess(train, tokenizer=tokenizer)
list(train.take(2))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[({'input_ids': <tf.Tensor: shape=(174,), dtype=int32, numpy=
   array([  101,  2043,  2106, 20773,  2707,  3352,  2759,  1029,   102,
          20773, 21025, 19358, 22815,  1011,  5708,  1006,  1013, 12170,
          23432, 29715,  3501, 29678, 12325, 29685,  1013, 10506,  1011,
          10930,  2078,  1011,  2360,  1007,  1006,  2141,  2244,  1018,
           1010,  3261,  1007,  2003,  2019,  2137,  3220,  1010,  6009,
           1010,  2501,  3135,  1998,  3883,  1012,  2141,  1998,  2992,
           1999,  5395,  1010,  3146,  1010,  2016,  2864,  1999,  2536,
           4823,  1998,  5613,  6479,  2004,  1037,  2775,  1010,  1998,
           3123,  2000,  4476,  1999,  1996,  2397,  4134,  2004,  2599,
           3220,  1997,  1054,  1004,  1038,  2611,  1011,  2177, 10461,
           1005,  1055,  2775,  1012,  3266,  2011,  2014,  2269,  1010,
          25436, 22815,  1010,  1996,  2177,  2150,  2028,  1997,  1996,
           2088,  1005,  1055,  2190,  1011,  4855,  2611,  29

In [189]:
%%time
model = TFAutoModelForQuestionAnswering.from_pretrained(model_dir)
print(repr(model.config))

Some layers from the model checkpoint at pretrained/google/electra-small-discriminator were not used when initializing TFElectraForQuestionAnswering: ['discriminator_predictions']
- This IS expected if you are initializing TFElectraForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFElectraForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFElectraForQuestionAnswering were not initialized from the model checkpoint at pretrained/google/electra-small-discriminator and are newly initialized: ['qa_outputs']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ElectraConfig {
  "_name_or_path": "pretrained/google/electra-small-discriminator",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "transformers_version": "4.5.1",
  "type_vocab_size": 2,
  "vocab_size": 30522
}

CPU times: user 594 ms, sys: 172 ms, total: 766 ms
Wall time: 548 ms


In [190]:
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=5e-5),
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=tf.metrics.SparseCategoricalAccuracy(),
)
model.summary()

Model: "tf_electra_for_question_answering_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
electra (TFElectraMainLayer) multiple                  13483008  
_________________________________________________________________
qa_outputs (Dense)           multiple                  514       
Total params: 13,483,522
Trainable params: 13,483,522
Non-trainable params: 0
_________________________________________________________________


In [191]:
model.fit(train.shuffle(1000).batch(16).prefetch(tf.data.experimental.AUTOTUNE), epochs=2)

Epoch 1/2


ValueError: in user code:

    /mnt/s/dev/seahrh/kaggle-coleridge-initiative/venv/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:864 train_function  *
        return step_function(self, iterator)
    /mnt/s/dev/seahrh/kaggle-coleridge-initiative/venv/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:851 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /mnt/s/dev/seahrh/kaggle-coleridge-initiative/venv/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:1308 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /mnt/s/dev/seahrh/kaggle-coleridge-initiative/venv/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:2824 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /mnt/s/dev/seahrh/kaggle-coleridge-initiative/venv/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:3545 _call_for_each_replica
        return fn(*args, **kwargs)
    /mnt/s/dev/seahrh/kaggle-coleridge-initiative/venv/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:842 run_step  **
        outputs = model.train_step(data)
    /mnt/s/dev/seahrh/kaggle-coleridge-initiative/venv/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:812 train_step
        self.compiled_metrics.update_state(y, y_pred, sample_weight)
    /mnt/s/dev/seahrh/kaggle-coleridge-initiative/venv/lib/python3.7/site-packages/tensorflow/python/keras/engine/compile_utils.py:403 update_state
        self.build(y_pred, y_true)
    /mnt/s/dev/seahrh/kaggle-coleridge-initiative/venv/lib/python3.7/site-packages/tensorflow/python/keras/engine/compile_utils.py:330 build
        y_pred, self._get_metric_objects, self._metrics, y_true, y_pred
    /mnt/s/dev/seahrh/kaggle-coleridge-initiative/venv/lib/python3.7/site-packages/tensorflow/python/util/nest.py:1212 map_structure_up_to
        **kwargs
    /mnt/s/dev/seahrh/kaggle-coleridge-initiative/venv/lib/python3.7/site-packages/tensorflow/python/util/nest.py:1295 map_structure_with_tuple_paths_up_to
        expand_composites=expand_composites,
    /mnt/s/dev/seahrh/kaggle-coleridge-initiative/venv/lib/python3.7/site-packages/tensorflow/python/util/nest.py:916 assert_shallow_structure
        input_length=len(input_tree), shallow_length=len(shallow_tree)

    ValueError: The two structures don't have the same sequence length. Input structure has length 4, while shallow structure has length 2.


In [None]:
model.save_pretrained("tmp")

In [None]:
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
inputs = tokenizer(question, text, return_tensors="tf")
print(repr(inputs))

In [None]:
outputs = model(inputs)
print(repr(outputs))