In [204]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering
from tqdm import tqdm

In [205]:
pd.set_option("use_inf_as_na", True)
pd.set_option("max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option('max_colwidth', 9999)

In [206]:
%%time
model_dir = "pretrained/google/electra-small-discriminator"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
print(f"""{repr(tokenizer)}
model_input_names={repr(tokenizer.model_input_names)}
""")

PreTrainedTokenizerFast(name_or_path='pretrained/google/electra-small-discriminator', vocab_size=30522, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
model_input_names=['input_ids', 'token_type_ids', 'attention_mask']

CPU times: user 31.2 ms, sys: 0 ns, total: 31.2 ms
Wall time: 22.9 ms


In [207]:
train = pd.read_parquet("input/squad/train.parquet")
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130319 entries, 0 to 130318
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             130319 non-null  object
 1   is_impossible  130319 non-null  int8  
 2   question       130319 non-null  object
 3   answer_start   130319 non-null  int16 
 4   answer_end     130319 non-null  int16 
 5   answer_text    130319 non-null  object
 6   context        130319 non-null  object
dtypes: int16(2), int8(1), object(4)
memory usage: 4.6+ MB


In [208]:
def preprocess(df, tokenizer):
    def _gen():
        for t in df.itertuples():
            q = getattr(t, "question")
            c = getattr(t, "context")
            inputs = tokenizer(q, c, max_length=512, truncation=False, return_tensors="np")
            answer_start, answer_end = 0, 0
            if getattr(t, "is_impossible") == 0:
                pass
            yield (
                {
                    "input_ids": inputs["input_ids"][0],
                    "attention_mask": inputs["attention_mask"][0],
                },
                (answer_start, answer_end,)
            )
    
    return tf.data.Dataset.from_generator(
        _gen,
        output_signature=(
            {
                "input_ids": tf.TensorSpec(shape=[None], dtype=tf.int32),
                "token_type_ids": tf.TensorSpec(shape=[None], dtype=tf.int32),
                "attention_mask": tf.TensorSpec(shape=[None], dtype=tf.int32),
            },
            (tf.TensorSpec(shape=[], dtype=tf.int32), tf.TensorSpec(shape=[], dtype=tf.int32),)
        )
    )

In [209]:
train = preprocess(train, tokenizer=tokenizer)
list(train.take(2))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


InvalidArgumentError: TypeError: `generator` yielded an element that did not match the expected structure. The expected structure was ({'input_ids': tf.int32, 'token_type_ids': tf.int32, 'attention_mask': tf.int32}, (tf.int32, tf.int32)), but the yielded element was ({'input_ids': array([  101,  2043,  2106, 20773,  2707,  3352,  2759,  1029,   102,
       20773, 21025, 19358, 22815,  1011,  5708,  1006,  1013, 12170,
       23432, 29715,  3501, 29678, 12325, 29685,  1013, 10506,  1011,
       10930,  2078,  1011,  2360,  1007,  1006,  2141,  2244,  1018,
        1010,  3261,  1007,  2003,  2019,  2137,  3220,  1010,  6009,
        1010,  2501,  3135,  1998,  3883,  1012,  2141,  1998,  2992,
        1999,  5395,  1010,  3146,  1010,  2016,  2864,  1999,  2536,
        4823,  1998,  5613,  6479,  2004,  1037,  2775,  1010,  1998,
        3123,  2000,  4476,  1999,  1996,  2397,  4134,  2004,  2599,
        3220,  1997,  1054,  1004,  1038,  2611,  1011,  2177, 10461,
        1005,  1055,  2775,  1012,  3266,  2011,  2014,  2269,  1010,
       25436, 22815,  1010,  1996,  2177,  2150,  2028,  1997,  1996,
        2088,  1005,  1055,  2190,  1011,  4855,  2611,  2967,  1997,
        2035,  2051,  1012,  2037, 14221,  2387,  1996,  2713,  1997,
       20773,  1005,  1055,  2834,  2201,  1010, 20754,  1999,  2293,
        1006,  2494,  1007,  1010,  2029,  2511,  2014,  2004,  1037,
        3948,  3063,  4969,  1010,  3687,  2274,  8922,  2982,  1998,
        2956,  1996,  4908,  2980,  2531,  2193,  1011,  2028,  3895,
        1000,  4689,  1999,  2293,  1000,  1998,  1000,  3336,  2879,
        1000,  1012,   102]), 'attention_mask': array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])}, (0, 0)).
Traceback (most recent call last):

  File "/mnt/s/dev/seahrh/kaggle-coleridge-initiative/venv/lib/python3.7/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 959, in generator_py_func
    flattened_values = nest.flatten_up_to(output_types, values)

  File "/mnt/s/dev/seahrh/kaggle-coleridge-initiative/venv/lib/python3.7/site-packages/tensorflow/python/data/util/nest.py", line 407, in flatten_up_to
    assert_shallow_structure(shallow_tree, input_tree)

  File "/mnt/s/dev/seahrh/kaggle-coleridge-initiative/venv/lib/python3.7/site-packages/tensorflow/python/data/util/nest.py", line 334, in assert_shallow_structure
    shallow_branch, input_branch, check_types=check_types

  File "/mnt/s/dev/seahrh/kaggle-coleridge-initiative/venv/lib/python3.7/site-packages/tensorflow/python/data/util/nest.py", line 319, in assert_shallow_structure
    % (len(input_tree), len(shallow_tree))

ValueError: The two structures don't have the same sequence length. Input structure has length 2, while shallow structure has length 3.


During handling of the above exception, another exception occurred:


Traceback (most recent call last):

  File "/mnt/s/dev/seahrh/kaggle-coleridge-initiative/venv/lib/python3.7/site-packages/tensorflow/python/ops/script_ops.py", line 252, in __call__
    ret = func(*args)

  File "/mnt/s/dev/seahrh/kaggle-coleridge-initiative/venv/lib/python3.7/site-packages/tensorflow/python/autograph/impl/api.py", line 642, in wrapper
    return func(*args, **kwargs)

  File "/mnt/s/dev/seahrh/kaggle-coleridge-initiative/venv/lib/python3.7/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 968, in generator_py_func
    sys.exc_info()[2],

  File "/mnt/s/dev/seahrh/kaggle-coleridge-initiative/venv/lib/python3.7/site-packages/six.py", line 753, in reraise
    raise value.with_traceback(tb)

  File "/mnt/s/dev/seahrh/kaggle-coleridge-initiative/venv/lib/python3.7/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 959, in generator_py_func
    flattened_values = nest.flatten_up_to(output_types, values)

  File "/mnt/s/dev/seahrh/kaggle-coleridge-initiative/venv/lib/python3.7/site-packages/tensorflow/python/data/util/nest.py", line 407, in flatten_up_to
    assert_shallow_structure(shallow_tree, input_tree)

  File "/mnt/s/dev/seahrh/kaggle-coleridge-initiative/venv/lib/python3.7/site-packages/tensorflow/python/data/util/nest.py", line 334, in assert_shallow_structure
    shallow_branch, input_branch, check_types=check_types

  File "/mnt/s/dev/seahrh/kaggle-coleridge-initiative/venv/lib/python3.7/site-packages/tensorflow/python/data/util/nest.py", line 319, in assert_shallow_structure
    % (len(input_tree), len(shallow_tree))

TypeError: `generator` yielded an element that did not match the expected structure. The expected structure was ({'input_ids': tf.int32, 'token_type_ids': tf.int32, 'attention_mask': tf.int32}, (tf.int32, tf.int32)), but the yielded element was ({'input_ids': array([  101,  2043,  2106, 20773,  2707,  3352,  2759,  1029,   102,
       20773, 21025, 19358, 22815,  1011,  5708,  1006,  1013, 12170,
       23432, 29715,  3501, 29678, 12325, 29685,  1013, 10506,  1011,
       10930,  2078,  1011,  2360,  1007,  1006,  2141,  2244,  1018,
        1010,  3261,  1007,  2003,  2019,  2137,  3220,  1010,  6009,
        1010,  2501,  3135,  1998,  3883,  1012,  2141,  1998,  2992,
        1999,  5395,  1010,  3146,  1010,  2016,  2864,  1999,  2536,
        4823,  1998,  5613,  6479,  2004,  1037,  2775,  1010,  1998,
        3123,  2000,  4476,  1999,  1996,  2397,  4134,  2004,  2599,
        3220,  1997,  1054,  1004,  1038,  2611,  1011,  2177, 10461,
        1005,  1055,  2775,  1012,  3266,  2011,  2014,  2269,  1010,
       25436, 22815,  1010,  1996,  2177,  2150,  2028,  1997,  1996,
        2088,  1005,  1055,  2190,  1011,  4855,  2611,  2967,  1997,
        2035,  2051,  1012,  2037, 14221,  2387,  1996,  2713,  1997,
       20773,  1005,  1055,  2834,  2201,  1010, 20754,  1999,  2293,
        1006,  2494,  1007,  1010,  2029,  2511,  2014,  2004,  1037,
        3948,  3063,  4969,  1010,  3687,  2274,  8922,  2982,  1998,
        2956,  1996,  4908,  2980,  2531,  2193,  1011,  2028,  3895,
        1000,  4689,  1999,  2293,  1000,  1998,  1000,  3336,  2879,
        1000,  1012,   102]), 'attention_mask': array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])}, (0, 0)).


	 [[{{node PyFunc}}]]

In [None]:
%%time
model = TFAutoModelForQuestionAnswering.from_pretrained(model_dir)
print(repr(model.config))

In [None]:
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=5e-5),
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=tf.metrics.SparseCategoricalAccuracy(),
)
model.summary()

In [None]:
model.fit(train.shuffle(1000).batch(16).prefetch(tf.data.experimental.AUTOTUNE), epochs=2)

In [None]:
model.save_pretrained("tmp")

In [None]:
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
inputs = tokenizer(question, text, return_tensors="tf")
print(repr(inputs))

In [None]:
outputs = model(inputs)
print(repr(outputs))