# Exploring the SQuAD dataset

In [1]:
from datasets import load_dataset

dataset = load_dataset("squad_v2")

explore_df = dataset["train"].to_pandas()

text = explore_df.head(1)["context"][0]
text

'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".'

# Build a `keras` transformer model

## Imports

In [86]:
from keras_nlp.layers import TokenAndPositionEmbedding
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras import layers, models
from tensorflow.keras.utils import to_categorical
import tensorflow as tf

## Building vectorizer

In [11]:
context_list = explore_df["context"].unique()[:100]

In [4]:
# Maximum context length, in words
max_len = max([len(context.split()) for context in context_list])
max_len

326

In [5]:
context_list[0]

'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".'

In [6]:
vectorize_layer = TextVectorization(
    standardize = "lower",
    output_mode="int",
    output_sequence_length=max_len,

)
vectorize_layer.adapt(context_list)

2023-11-27 15:36:27.660001: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: CUDA_ERROR_UNKNOWN: unknown error
2023-11-27 15:36:27.660054: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:168] retrieving CUDA diagnostic information for host: rob-laptop
2023-11-27 15:36:27.660062: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:175] hostname: rob-laptop
2023-11-27 15:36:27.660154: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:199] libcuda reported version is: 535.129.3
2023-11-27 15:36:27.660178: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:203] kernel reported version is: 535.129.3
2023-11-27 15:36:27.660184: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:309] kernel version seems to match DSO: 535.129.3


In [57]:
vocab = vectorize_layer.get_vocabulary()

vocab_size = len(vocab)
vocab_size

4200

In [10]:
index_lookup = dict(zip(range(len(vocab)), vocab))
index_lookup

{0: '',
 1: '[UNK]',
 2: 'the',
 3: 'in',
 4: 'and',
 5: 'of',
 6: 'a',
 7: 'to',
 8: 'her',
 9: 'beyoncé',
 10: 'on',
 11: 'was',
 12: 'for',
 13: 'with',
 14: 'his',
 15: 'at',
 16: 'as',
 17: 'she',
 18: 'he',
 19: 'that',
 20: 'by',
 21: 'from',
 22: 'which',
 23: 'chopin',
 24: 'first',
 25: 'music',
 26: 'their',
 27: 'an',
 28: 'also',
 29: 'had',
 30: 'album',
 31: 'is',
 32: 'has',
 33: 'best',
 34: "beyoncé's",
 35: 'one',
 36: 'after',
 37: 'it',
 38: 'most',
 39: 'i',
 40: 'this',
 41: 'song',
 42: 'two',
 43: 'released',
 44: 'who',
 45: 'million',
 46: 'during',
 47: 'new',
 48: 'its',
 49: 'artist',
 50: 'were',
 51: 'single',
 52: 'other',
 53: 'have',
 54: 'second',
 55: 'billboard',
 56: 'solo',
 57: 'performed',
 58: 'not',
 59: 'jay',
 60: "destiny's",
 61: "chopin's",
 62: 'would',
 63: 'where',
 64: 'when',
 65: 'video',
 66: 'number',
 67: 'piano',
 68: 'made',
 69: 'group',
 70: 'concert',
 71: 'became',
 72: 'be',
 73: 'all',
 74: 'they',
 75: 'songs',
 76: 're

In [14]:
sentence_as_tokens = vectorize_layer(context_list[0])
translated_back = [index_lookup[token] for token in sentence_as_tokens.numpy().tolist()]
translated_back

['beyoncé',
 'giselle',
 'knowles-carter',
 '(/biːˈjɒnseɪ/',
 'bee-yon-say)',
 '(born',
 'september',
 '4,',
 '1981)',
 'is',
 'an',
 'american',
 'singer,',
 'songwriter,',
 'record',
 'producer',
 'and',
 'actress.',
 'born',
 'and',
 'raised',
 'in',
 'houston,',
 'texas,',
 'she',
 'performed',
 'in',
 'various',
 'singing',
 'and',
 'dancing',
 'competitions',
 'as',
 'a',
 'child,',
 'and',
 'rose',
 'to',
 'fame',
 'in',
 'the',
 'late',
 '1990s',
 'as',
 'lead',
 'singer',
 'of',
 'r&b',
 'girl-group',
 "destiny's",
 'child.',
 'managed',
 'by',
 'her',
 'father,',
 'mathew',
 'knowles,',
 'the',
 'group',
 'became',
 'one',
 'of',
 'the',
 "world's",
 'best-selling',
 'girl',
 'groups',
 'of',
 'all',
 'time.',
 'their',
 'hiatus',
 'saw',
 'the',
 'release',
 'of',
 "beyoncé's",
 'debut',
 'album,',
 'dangerously',
 'in',
 'love',
 '(2003),',
 'which',
 'established',
 'her',
 'as',
 'a',
 'solo',
 'artist',
 'worldwide,',
 'earned',
 'five',
 'grammy',
 'awards',
 'and',
 'f

In [16]:
all_tokenized = [vectorize_layer(context) for context in context_list]

## Creating masked tokens & targets

In [30]:
def X_y_creator(sequence_tensor):
    # tile / copy the input sequence up to max_len -1
    tiled_sequence = tf.tile(tf.expand_dims(sequence_tensor, 0), [max_len - 1, 1])
    # use band_part to diagonally mask the sequence of tensors, so that there is a sequence of incremental strings
    X_s = tf.linalg.band_part(tiled_sequence, -1, 0)
    # y is simply the next word in the sequence
    y_s = sequence_tensor[1:]
    return X_s, y_s

In [106]:
Xs = []
ys = []
for sequence in all_tokenized:
    X, y = X_y_creator(sequence)
    Xs.append(X)
    ys.append(y)

In [107]:
Xs[0]

<tf.Tensor: shape=(325, 326), dtype=int64, numpy=
array([[   9,    0,    0, ...,    0,    0,    0],
       [   9, 1115,    0, ...,    0,    0,    0],
       [   9, 1115, 2598, ...,    0,    0,    0],
       ...,
       [   9, 1115, 2598, ...,    0,    0,    0],
       [   9, 1115, 2598, ...,    0,    0,    0],
       [   9, 1115, 2598, ...,    0,    0,    0]])>

In [108]:
# Create a 2d tensor for X - this will have shape ((max_len - 1) * n_sequences, max_len)
X = tf.concat(Xs, axis = 0)
# Create a 1d tensor for y - this will have shape (max_len - 1)
y = tf.concat(ys, axis=0)

In [111]:
X[0]

<tf.Tensor: shape=(326,), dtype=int64, numpy=
array([9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     

In [112]:
# Remove any sequences where the next predicted y is 0 - ie, where the next predicted word is null.
X = X[y != 0]
y = y[y != 0]

In [104]:
y.shape

TensorShape([13546])

In [113]:
X.shape

TensorShape([13546, 326])

In [141]:
embedding_dimensions = 256

inputs = layers.Input(shape=(max_len,), dtype=tf.int32)
x = TokenAndPositionEmbedding(vocab_size, max_len, embedding_dimensions, mask_zero=True)(inputs)
x = layers.Dense(vocab_size, activation="relu")(x)
x = layers.Dropout(0.3)(x)
x = layers.Dense(vocab_size, activation="relu")(x)
x = tf.reduce_mean(x, axis=1)
outputs = layers.Dense(vocab_size, activation="softmax")(x)
model = tf.keras.Model(inputs=inputs, outputs=outputs)

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

In [142]:
model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_19 (InputLayer)       [(None, 326)]             0         
                                                                 
 token_and_position_embeddi  (None, 326, 256)          1158656   
 ng_17 (TokenAndPositionEmb                                      
 edding)                                                         
                                                                 
 dense_43 (Dense)            (None, 326, 4200)         1079400   
                                                                 
 dropout_2 (Dropout)         (None, 326, 4200)         0         
                                                                 
 dense_44 (Dense)            (None, 326, 4200)         17644200  
                                                                 
 tf.math.reduce_mean_2 (TFO  (None, 4200)              0   

In [144]:
history = model.fit(X, y, epochs = 10, workers= 5)

Epoch 1/10
Epoch 2/10

KeyboardInterrupt: 