# Imports and dataset loading

In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer
import tensorflow as tf

# Fetch the SQuAD v2 training split, then keep only its first 100 rows
# (the slice is indexed by column name in the cells below).
train_split = load_dataset("squad_v2")["train"]
dataset = train_split[:100]

2023-11-28 17:03:37.861940: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-28 17:03:37.861970: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-28 17:03:37.862007: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-28 17:03:37.868735: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Peek at the context passage of the first example.
dataset["context"][0]

'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".'

In [3]:
# Show the answer records; each one carries a "text" list and an "answer_start" list.
dataset["answers"]

[{'text': ['in the late 1990s'], 'answer_start': [269]},
 {'text': ['singing and dancing'], 'answer_start': [207]},
 {'text': ['2003'], 'answer_start': [526]},
 {'text': ['Houston, Texas'], 'answer_start': [166]},
 {'text': ['late 1990s'], 'answer_start': [276]},
 {'text': ["Destiny's Child"], 'answer_start': [320]},
 {'text': ['Dangerously in Love'], 'answer_start': [505]},
 {'text': ['Mathew Knowles'], 'answer_start': [360]},
 {'text': ['late 1990s'], 'answer_start': [276]},
 {'text': ['lead singer'], 'answer_start': [290]},
 {'text': ['Dangerously in Love'], 'answer_start': [505]},
 {'text': ['2003'], 'answer_start': [526]},
 {'text': ['five'], 'answer_start': [590]},
 {'text': ['lead singer'], 'answer_start': [290]},
 {'text': ['Dangerously in Love'], 'answer_start': [505]},
 {'text': ['acting'], 'answer_start': [207]},
 {'text': ['Jay Z'], 'answer_start': [369]},
 {'text': ['six'], 'answer_start': [565]},
 {'text': ['Dreamgirls'], 'answer_start': [260]},
 {'text': ['2010'], 'answe

# Second model - can a (question, answer) target pair be generated from the context?

# BART

In [4]:
from transformers import BartTokenizerFast

In [5]:
# Token-length limits for the encoder input (article) and the padded decoder
# target (summary), plus a batch-size setting.
article_length, summary_length = 512, 64
batch_size = 4

# Fast BART tokenizer loaded from the facebook/bart-base checkpoint.
tokenizer = BartTokenizerFast.from_pretrained('facebook/bart-base')

In [6]:
# SQuAD v2 includes unanswerable questions whose "text" list is empty, so fall
# back to an empty string instead of raising IndexError on v["text"][0].
answers = [v["text"][0] if v["text"] else "" for v in dataset["answers"]]

# Contexts are padded/truncated to a fixed article_length; questions and
# answers are only truncated here (to 32 tokens) and are padded in a later cell.
context_tokens = tokenizer(dataset["context"], padding="max_length", truncation=True, max_length=article_length)
question_tokens = tokenizer(dataset["question"], truncation=True, max_length=32)
answer_tokens = tokenizer(answers, truncation=True, max_length=32)

In [7]:
# sep token: ask the tokenizer which token string id 2 maps to, rather than
# building an inverted copy of the entire vocabulary for a single lookup.
tokenizer.convert_ids_to_tokens(2)

'</s>'

In [8]:
# Attention-mask lengths of the first question and the first answer, one per line.
first_question_mask = question_tokens.attention_mask[0]
first_answer_mask = answer_tokens.attention_mask[0]
print(len(first_question_mask), len(first_answer_mask), sep="\n")

10
7


In [9]:
# Token-id lengths of the first question and the first answer, one per line.
for encoded in (question_tokens, answer_tokens):
    print(len(encoded.input_ids[0]))

10
7


In [10]:
# Build one decoder sequence per example: the question tokens followed by the
# answer tokens with the answer's first token (index 0) dropped before joining.
# The attention masks are concatenated the same way.
# (Padding happens afterwards: input_ids with 1s in BART, attention_mask with 0s.)
decoder_input_ids = [
    question_ids + answer_ids[1:]
    for question_ids, answer_ids in zip(question_tokens.input_ids, answer_tokens.input_ids)
]
decoder_attention_masks = [
    question_mask + answer_mask[1:]
    for question_mask, answer_mask in zip(question_tokens.attention_mask, answer_tokens.attention_mask)
]
decoder_input_ids[0]

[0, 1779, 222, 12674, 1755, 386, 1959, 1406, 116, 2, 179, 5, 628, 4525, 29, 2]

In [11]:
# Templates of summary_length fill values: 1 for token ids, 0 for attention masks.
fill_input_ids = [1] * summary_length
fill_attention_masks = [0] * summary_length


def _fit_to_summary_length(sequence, filler):
    """Cut `sequence` to summary_length items, then top it up from the tail of `filler`."""
    return sequence[:summary_length] + filler[len(sequence):]


decoder_input_ids_padded = [_fit_to_summary_length(ids, fill_input_ids) for ids in decoder_input_ids]
attention_masks_padded = [_fit_to_summary_length(mask, fill_attention_masks) for mask in decoder_attention_masks]

In [12]:
# Teacher forcing: the token fed to the decoder at step t must be the label of
# step t-1, i.e. the decoder inputs are the targets shifted right by one behind
# the decoder start token. Using the very same sequence as decoder input AND
# label would let the model score perfectly by echoing its current input token.
# BART starts decoding from </s> (decoder_start_token_id == eos_token_id for
# facebook/bart-base) — TODO confirm if the checkpoint is ever changed.
decoder_start_token_id = tokenizer.eos_token_id
pad_token_id = tokenizer.pad_token_id

train_data = {
    "input_ids": context_tokens.input_ids,
    "attention_mask": context_tokens.attention_mask,
    # Targets shifted right by one; the final position falls off the end.
    "decoder_input_ids": [[decoder_start_token_id] + target[:-1] for target in decoder_input_ids_padded],
    # The mask is shifted the same way; the prepended start token is attended to.
    "decoder_attention_mask": [[1] + mask[:-1] for mask in attention_masks_padded],
    # Padding positions become -100 so that they are excluded from the loss.
    "labels": [[-100 if token == pad_token_id else token for token in target] for target in decoder_input_ids_padded],
}

In [13]:
# Inspect the label sequences; padding positions show up as -100.
train_data["labels"]

[[0,
  1779,
  222,
  12674,
  1755,
  386,
  1959,
  1406,
  116,
  2,
  179,
  5,
  628,
  4525,
  29,
  2,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100],
 [0,
  2264,
  911,
  222,
  12674,
  1755,
  3511,
  11,
  77,
  79,
  21,
  1197,
  62,
  116,
  2,
  26058,
  154,
  8,
  7950,
  2,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100],
 [0,
  1779,
  2

In [14]:
# Slice the column dict into per-example records, then group them two at a time.
example_stream = tf.data.Dataset.from_tensor_slices(train_data)
tf_dataset = example_stream.batch(2)

2023-11-28 17:03:40.673482: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-11-28 17:03:40.677801: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-11-28 17:03:40.677992: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [15]:
from transformers import TFBartForConditionalGeneration

# Start from the pretrained facebook/bart-base weights.
model = TFBartForConditionalGeneration.from_pretrained("facebook/bart-base")
# No loss is passed to compile(), as in the original call.
# The optimizer string "adam" would use Keras' default learning rate (1e-3),
# which is far too aggressive for fine-tuning a pretrained transformer, so an
# explicit small rate is used instead.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5))

2023-11-28 17:03:41.421937: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
All PyTorch model weights were used when initializing TFBartForConditionalGeneration.

All the weights of TFBartForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.


In [16]:
# Train with tf.function graph execution. Forcing eager execution globally is
# a debugging aid that makes every training step much slower; set this to True
# only temporarily while stepping through the model.
tf.config.run_functions_eagerly(False)
model.fit(tf_dataset, epochs=10)

Epoch 1/10

KeyboardInterrupt: 

In [None]:
# Encode the first context with the same padding/truncation settings as the
# training contexts and wrap it as a dataset holding a single one-example batch.
test_string = dataset["context"][0]
test_tokens = tokenizer(test_string, padding="max_length", truncation=True, max_length=article_length)

# Only the encoder inputs are supplied; decoder inputs and labels are left out.
# NOTE(review): predict() performs a single forward pass rather than step-by-step
# decoding — confirm this is the intended way to obtain generated text
# (model.generate may be what is wanted here).
test_data = {
    "input_ids": [test_tokens.input_ids],
    "attention_mask": [test_tokens.attention_mask],
}
test_dataset = tf.data.Dataset.from_tensor_slices(test_data).batch(1)
y_pred = model.predict(test_dataset)

In [None]:
# Check the dimensions of the predicted logits.
y_pred.logits.shape

In [None]:
# Index of the largest logit along axis 1 for the first prediction in the batch.
first_example_logits = y_pred.logits[0]
tf.argmax(first_example_logits, axis=1)