# Imports and dataset loading

In [52]:
from datasets import load_dataset
from transformers import AutoTokenizer
import tensorflow as tf

dataset = load_dataset("squad_v2")["train"][:50000]

In [41]:
dataset["context"][0]

'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".'

# Second model - can a pair of targets be generated?

# BART

In [4]:
from transformers import BartTokenizerFast

In [5]:
article_length = 512
summary_length = 64
batch_size     = 4

tokenizer = BartTokenizerFast.from_pretrained('facebook/bart-base')

In [9]:
dataset["answers"][0]

{'text': ['in the late 1990s'], 'answer_start': [269]}

In [6]:
tokenizer.add_tokens("<ans>")

1

In [14]:
dataset["answers"][0]["text"][0]

'in the late 1990s'

In [53]:
len(dataset["answers"])

50000

In [None]:
answers = []
for answer in dataset["answers"]:
    if answer["text"] != []:
        answers.append(answer["text"][0])
    else:
        answers.append("")

qs_and_as = []
for i in range(len(answers)):
    qs_and_as.append(dataset["question"][i] + " <ans> " + answers[i])

In [58]:
context_tokens = tokenizer(dataset["context"], padding="max_length", truncation=True, max_length = article_length)
#question_tokens = tokenizer(dataset["question"], truncation=True, max_length=32)
#answer_tokens = tokenizer(answers, truncation=True, max_length=32)
qs_and_as_tokens = tokenizer(qs_and_as, padding="max_length", truncation=True, max_length = summary_length)

In [None]:
# sep token
dict(sorted({v:k for k, v in tokenizer.vocab.items()}.items()))

In [None]:
qs_and_as_tokens.input_ids

In [None]:
tokenizer.vocab["<s>"]

In [None]:
# pad input_ids with 1s in BART 
# pad attention_mask with 0s
# decoder_input_ids = []
# decoder_attention_masks = []
# for i in range(len(question_tokens["input_ids"])):
#     decoder_input_ids.append(question_tokens.input_ids[i] + answer_tokens.input_ids[i][1:])
#     decoder_attention_masks.append(question_tokens.attention_mask[i] + answer_tokens.attention_mask[i][1:])
# decoder_input_ids[0]

In [None]:
# fill_input_ids = [1] * summary_length
# fill_attention_masks = [0] * summary_length
# decoder_input_ids_padded = [sublist[:summary_length] + fill_input_ids[len(sublist):] for sublist in decoder_input_ids]
# attention_masks_padded = [sublist[:summary_length] + fill_attention_masks[len(sublist):] for sublist in decoder_attention_masks]

In [None]:
train_data = {}
train_data["input_ids"] = context_tokens.input_ids
train_data["attention_mask"] = context_tokens.attention_mask
train_data["decoder_input_ids"] = qs_and_as_tokens.input_ids
train_data["decoder_attention_mask"] = qs_and_as_tokens.attention_mask
train_data["labels"] = qs_and_as_tokens.input_ids.copy()
#train_data["labels"] = [[-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in train_data["labels"]]

In [None]:
tf_dataset = tf.data.Dataset.from_tensor_slices(train_data).batch(2)

In [None]:
# from transformers import TFBartForConditionalGeneration

# model = TFBartForConditionalGeneration.from_pretrained("facebook/bart-base")
# model.compile(optimizer="adam")
# model.resize_token_embeddings(len(tokenizer))

In [None]:
# tf.config.run_functions_eagerly(True)
# #model.fit(tf_dataset, epochs=20)

In [None]:
# test_string = dataset["context"][0]
# test_tokens = tokenizer(test_string, padding ="max_length", truncation=True, max_length = article_length)
# test_data = {}
# test_data["input_ids"] = [test_tokens.input_ids]
# test_data["attention_mask"] = [test_tokens.attention_mask]
# test_data["decoder_input_ids"] = [[0] + [-100] * (summary_length - 1)]
# test_data["decoder_attention_mask"] = [[1] + [-100] * (summary_length - 1)]
# test_data["labels"] = [[0] + [-100] * (summary_length - 1)]
# test_dataset = tf.data.Dataset.from_tensor_slices(test_data).batch(1)
# y_pred = model.predict(test_dataset).logits

In [None]:
# tf.argmax(y_pred[0], axis=1)

## T5

In [59]:
from transformers import T5TokenizerFast

article_length = 512
summary_length = 64
batch_size     = 4

tokenizer = T5TokenizerFast.from_pretrained('t5-small')
tokenizer.add_tokens("<ans>")

1

In [61]:
answers = []
for answer in dataset["answers"]:
    if answer["text"] != []:
        answers.append(answer["text"][0])
    else:
        answers.append("")

qs_and_as = []
for i in range(len(answers)):
    qs_and_as.append(dataset["question"][i] + " <ans> " + answers[i])

context_tokens = tokenizer(dataset["context"], padding="max_length", truncation=True, max_length = article_length)
#question_tokens = tokenizer(dataset["question"], truncation=True, max_length=32)
#answer_tokens = tokenizer(answers, truncation=True, max_length=32)
qs_and_as_tokens = tokenizer(qs_and_as, padding="max_length", truncation=True, max_length = summary_length)

In [104]:
train_data = {}
train_data["input_ids"] = context_tokens.input_ids
train_data["attention_mask"] = context_tokens.attention_mask
train_data["decoder_input_ids"] = qs_and_as_tokens.input_ids
train_data["decoder_attention_mask"] = qs_and_as_tokens.attention_mask
train_data["labels"] = qs_and_as_tokens.input_ids.copy()
train_data["labels"] = [[-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in train_data["labels"]]
tf_dataset = tf.data.Dataset.from_tensor_slices(train_data).batch(2)

In [107]:
from transformers import TFT5ForConditionalGeneration

model = TFT5ForConditionalGeneration.from_pretrained("t5-small")
model.compile(optimizer="adam")
model.resize_token_embeddings(len(tokenizer))

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


<keras.src.layers.core.embedding.Embedding at 0x7f7d617ecf40>

In [119]:
model.fit(tf_dataset, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20

KeyboardInterrupt: 

In [129]:
test_string = dataset["context"][1]
test_tokens = tokenizer(test_string, padding ="max_length", truncation=True, max_length = article_length)
test_data = {}
test_data["input_ids"] = [test_tokens.input_ids]
test_data["attention_mask"] = [test_tokens.attention_mask]
test_data["decoder_input_ids"] = [[0] + [-100] * 63]
test_data["decoder_attention_mask"] = [[1] * 64]
test_data["labels"] = [[0] + [-100] * 63]
test_dataset = tf.data.Dataset.from_tensor_slices(test_data).batch(1)
y_pred = model.predict(test_dataset).logits



In [130]:
test_tokens = tf.argmax(y_pred[0], axis=1)

In [131]:
token_lookup = dict(sorted({v:k for k, v in tokenizer.vocab.items()}.items()))

In [132]:
tokenizer.decode(test_tokens)

'<pad> best best best best best best best best best best best best best best best best best best best best best best best best best best best best best best best best bestssssssssssssssssssssssssssssss'

In [127]:
decoder = model.get_decoder()

In [128]:
test_dataset.get_single_element()

{'input_ids': <tf.Tensor: shape=(1, 512), dtype=int32, numpy=
 array([[  493,    63,   106,    75,   154,  3156,     7,   693,  8900,
           965,    18,  6936,   449,    41,    87,   115,    23,     2,
           354,     2,    29,     7,    15,     2,    87,    36,    15,
            18,   476,  4170,    18,  8735,    61,    41,  7473,  1600,
          6464, 15465,    61,    19,    46,   797,  7634,     6,     3,
         21101,     6,  1368,  8211,    11, 15676,     5, 12896,    11,
          3279,    16,  8018,     6,  2514,     6,   255,  3032,    16,
           796,  8782,    11, 10410,  2259,     7,    38,     3,     9,
           861,     6,    11,  4659,    12, 10393,    16,     8,  1480,
          5541,     7,    38,   991,  7634,    13,   391,   184,   279,
          3202,    18, 10739, 19344,    63,    31,     7,  9364,     5,
         19607,    26,    57,   160,  2353,     6,  9762,    15,   210,
          8900,   965,     6,     8,   563,  1632,    80,    13,     8,
  

In [84]:
decoder(test_dataset.get_single_element())

TypeError: Exception encountered when calling layer 'decoder' (type TFT5MainLayer).

TFT5MainLayer.call() got an unexpected keyword argument 'decoder_input_ids'

Call arguments received by layer 'decoder' (type TFT5MainLayer):
  • input_ids={'input_ids': 'tf.Tensor(shape=(512,), dtype=int32)', 'attention_mask': 'tf.Tensor(shape=(512,), dtype=int32)', 'decoder_input_ids': 'tf.Tensor(shape=(512,), dtype=int32)', 'decoder_attention_mask': 'tf.Tensor(shape=(512,), dtype=int32)', 'labels': 'tf.Tensor(shape=(512,), dtype=int32)'}
  • attention_mask=None
  • encoder_hidden_states=None
  • encoder_attention_mask=None
  • inputs_embeds=None
  • head_mask=None
  • encoder_head_mask=None
  • past_key_values=None
  • use_cache=None
  • output_attentions=None
  • output_hidden_states=None
  • return_dict=None
  • training=False

In [118]:
from transformers import pipeline

pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
pipe(dataset["context"][0])

[{'generated_text': 'with with with with with with with with with with with with with with with with with with'}]

In [133]:
model.save_weights("t5checkpoint.h5")