#IMPORTING AND DEFINING PARAMETERS

In [1]:
import numpy as np
import tensorflow as tf

# This is enabled so that we can use pre-defined functions inside the tensorflow.keras.Model()
# In this case, we use the function "create_look_ahead_mask"
tf.config.run_functions_eagerly(True)

# The values below are suggested in the GPT-2 paper
embed_dim = 768
num_head = 12
seq_len = 5
key_dim = embed_dim//num_head

#TEXT PRE-PROCESSING, VOCABULARY CREATION & TOKENIZATION

In [10]:
with open('SherlockHolmesStory.txt', 'r', encoding='utf-8') as file:
  raw_text = file.read()
text = ''.join(raw_text.split('\n'))

altered_text = ''
for word in text.split():
  altered_word = ''
  for character in word:
    if character not in {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
                         'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
                         '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'}:
      altered_word += ' ' + character + ' '
    else: altered_word += character
  altered_text += altered_word + ' ' if altered_word[-1] != ' ' else altered_word

vocab_size = 0
vocabulary = {'<PAD>': 0}
vocab_size += len(vocabulary)
for word in altered_text.split():
  if word not in vocabulary:
    vocabulary[word] = vocab_size
    vocab_size += 1
print(vocab_size, vocabulary)

tokenized_text = [vocabulary[i] for i in altered_text.split()]
print([i for i in altered_text.split()])
print(tokenized_text)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 9, 21, 22, 23, 4, 24, 25, 26, 7, 27, 28, 16, 29, 9, 30, 31, 32, 33, 34, 35, 18, 36, 37, 38, 39, 40, 41, 42, 9, 43, 44, 45, 25, 33, 46, 47, 45, 48, 49, 38, 22, 50, 45, 51, 52, 53, 54, 55, 9, 56, 31, 45, 10, 57, 58, 45, 7, 59, 60, 61, 25, 62, 63, 33, 7, 64, 65, 66, 45, 52, 67, 68, 69, 34, 70, 11, 71, 72, 73, 68, 74, 75, 9, 56, 76, 77, 28, 7, 78, 79, 45, 80, 81, 68, 82, 25, 68, 83, 9, 84, 48, 85, 86, 40, 7, 87, 88, 88, 89, 40, 90, 7, 91, 92, 93, 94, 95, 96, 25, 97, 9, 98, 40, 7, 99, 100, 38, 101, 102, 103, 104, 22, 105, 106, 25, 107, 108, 109, 31, 38, 110, 68, 111, 112, 113, 114, 115, 68, 116, 117, 118, 22, 119, 120, 9, 121, 73, 68, 122, 123, 45, 124, 68, 125, 73, 46, 28, 22, 105, 126, 88, 127, 128, 45, 70, 32, 129, 130, 131, 132, 68, 133, 36, 73, 68, 134, 102, 67, 22, 9, 135, 136, 137, 31, 52, 46, 8, 38, 14, 45, 25, 33, 8, 31, 7, 138, 41, 42, 45, 28, 139, 25, 140, 141, 9, 10, 142, 66, 143, 28, 3, 144, 9, 145, 146, 14

#DATASET CREATION

In [3]:
broken_sentences = []
X, Y = [], []
for i in range(0, len(tokenized_text)):
  broken_sentences.append(tokenized_text[i:i+seq_len+1])

for item in broken_sentences:
  X.append(item[0:seq_len])
  Y.append([item[-1]])
X_Train = np.array(tf.keras.preprocessing.sequence.pad_sequences(X, maxlen = seq_len, padding = 'post'))
Y_Train = np.array(Y)
X_Train.shape, Y_Train.shape, X_Train, Y_Train

((128739, 5),
 (128739, 1),
 array([[   1,    2,    3,    4,    5],
        [   2,    3,    4,    5,    6],
        [   3,    4,    5,    6,    7],
        ...,
        [2209, 1252,    9,    0,    0],
        [1252,    9,    0,    0,    0],
        [   9,    0,    0,    0,    0]], dtype=int32),
 array([[6],
        [7],
        [8],
        ...,
        [9],
        [9],
        [9]]))

##DEFINING MODEL

In [None]:
# Look ahead mask for restricting the architecture to attend to future tokens
def create_look_ahead_mask(seq_len):
  mask = tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
  return 1-mask
look_ahead_mask = create_look_ahead_mask(seq_len)

# Function to extract the last words' embedding from the matrix of word embeddings in previous layer
def expand_last_timestep(x):
  seq_len = tf.shape(x)[1]
  return tf.expand_dims(x[:, seq_len - 1, :], axis=1)

# Defining the 1-layer Transformer model as said in GPT-2 paper
def MODEL(seq_len, embed_dim, num_head, key_dim, vocab_size):
  # Input
  inp = tf.keras.Input((seq_len,))
  length = seq_len

  # Token/Word Embedding
  TokenEmbedding = tf.keras.layers.Embedding(input_dim = vocab_size, output_dim = embed_dim)(inp)

  # Position Embedding
  positions = tf.range(start=0, limit=length, delta=1)
  PositionEmbedding = tf.keras.layers.Embedding(input_dim = vocab_size, output_dim = embed_dim)(positions)

  # Merged Token & Position Embedding
  inputs = TokenEmbedding + PositionEmbedding

  ##############################################################################==LAYER 1

  # Key X Weight matrix
  key_1 = tf.keras.layers.Dense(embed_dim)(inputs)

  # Query X Weight matrix
  query_1 = tf.keras.layers.Dense(embed_dim)(inputs)

  # Value X Weight matrix
  value_1 = tf.keras.layers.Dense(embed_dim)(inputs)

  # Multi-head attention layer
  multi_head_attention_1 = tf.keras.layers.MultiHeadAttention(num_heads = num_head, key_dim = key_dim)
  multi_head_attention_1 = multi_head_attention_1(
    query=query_1,
    value=value_1,
    key=key_1,
    attention_mask=look_ahead_mask[tf.newaxis, :, :]
  )

  # Add + Layer Normalize
  add_11 = inputs + multi_head_attention_1
  layerNorm_11 = tf.keras.layers.LayerNormalization(epsilon=1e-6)(add_11)

  # Feed forward neural network
  ffn_1 = tf.keras.Sequential(
      [
          tf.keras.layers.Dense(units=4*embed_dim, activation="relu"),
          tf.keras.layers.Dense(embed_dim),
      ]
      )(layerNorm_11)

  # Add + Layer Normalize
  add_12 = layerNorm_11 + ffn_1
  layerNorm_12 = tf.keras.layers.LayerNormalization(epsilon=1e-6)(add_12)

  # Softmax over vocabulary
  out = tf.keras.layers.Dense(units=vocab_size, activation="softmax")(layerNorm_12)

  # Extract the last words' embedding from the matrix of word embeddings in previous layer
  out = tf.keras.layers.Lambda(expand_last_timestep, output_shape=(1, vocab_size))(out)

  # Defining model
  model = tf.keras.Model(inputs=inp, outputs=out)

  # Compiling the model using loss = "sparse_categorical_crossentropy", since, I am not encoding the output (I am using the tokens itself as y_actual)
  model.compile(
      loss=tf.keras.losses.SparseCategoricalCrossentropy(),
      optimizer="adam"
  )
  return model

pre_trained_model = MODEL(seq_len, embed_dim, num_head, key_dim, vocab_size)
pre_trained_model.summary()

##MODEL TRAINING

In [None]:
pre_trained_model.fit(X_Train, Y_Train, epochs=100, batch_size=256)

Epoch 1/20




[1m503/503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m218s[0m 424ms/step - loss: 5.6405
Epoch 2/20
[1m503/503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m215s[0m 428ms/step - loss: 4.2795
Epoch 3/20
[1m503/503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 429ms/step - loss: 3.7702
Epoch 4/20
[1m503/503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m215s[0m 427ms/step - loss: 3.4001
Epoch 5/20
[1m503/503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 427ms/step - loss: 3.0278
Epoch 6/20
[1m503/503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m261s[0m 426ms/step - loss: 2.6506
Epoch 7/20
[1m503/503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 426ms/step - loss: 2.2850
Epoch 8/20
[1m503/503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 426ms/step - loss: 2.0088
Epoch 9/20
[1m503/503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m215s[0m 427ms/step - loss: 1.7658
Epoch 10/20
[1m503/503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m

<keras.src.callbacks.history.History at 0x78c144f6ed50>

##SAVE MODEL

In [None]:
pre_trained_model.save('SHolmes_NWP_PRETRAINED.keras')

##LOAD SAVED MODEL

In [4]:
# To include lambda layer in the loaded model
tf.keras.config.enable_unsafe_deserialization()

# If there is a custom function layer defined in the actual model, then the
# definition of the function is to be redefined and the definition is to be
# mentioned in the custom_objects parameter of the keras.models.load_model()
def expand_last_timestep(x):
  seq_len = tf.shape(x)[1]
  return tf.expand_dims(x[:, seq_len - 1, :], axis=1)

uploaded_model = tf.keras.models.load_model("SHolmes_NWP_PRETRAINED.keras",
                                            custom_objects={'expand_last_timestep': expand_last_timestep})
uploaded_model.summary()

##TRAIN LOADED MODEL

In [5]:
uploaded_model.fit(X_Train, Y_Train, epochs=20, batch_size=256)

Epoch 1/20




[1m503/503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m211s[0m 410ms/step - loss: 0.1906
Epoch 2/20
[1m503/503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m258s[0m 412ms/step - loss: 0.2015
Epoch 3/20
[1m503/503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m261s[0m 411ms/step - loss: 0.2126
Epoch 4/20
[1m503/503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m206s[0m 410ms/step - loss: 0.2108
Epoch 5/20
[1m503/503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m206s[0m 410ms/step - loss: 0.2016
Epoch 6/20
[1m503/503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 411ms/step - loss: 0.1761
Epoch 7/20
[1m503/503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 411ms/step - loss: 0.1952
Epoch 8/20
[1m503/503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m206s[0m 410ms/step - loss: 0.2141
Epoch 9/20
[1m503/503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m206s[0m 410ms/step - loss: 0.1993
Epoch 10/20
[1m503/503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m

<keras.src.callbacks.history.History at 0x79c6488d7510>

In [6]:
uploaded_model.save('SHolmes_NWP_PRETRAINED.keras')

#MODEL TESTING

In [5]:
# Testing the model on a already trained item
test_id = 210
rev_vocab = {val:key for key, val in vocabulary.items()}

predicted_tokens = np.squeeze(uploaded_model.predict(np.expand_dims(X_Train[test_id], axis=0)))
predicted_string = [rev_vocab[np.argmax(i)] for i in predicted_tokens]

print("Input text           :", ' '.join([rev_vocab[i] for i in X_Train[test_id]]))
print("Actual output text   :", ' '.join([rev_vocab[i] for i in Y_Train[test_id]]))
print("Predicted output text:", rev_vocab[np.argmax(predicted_tokens)])



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 436ms/step
Input text           : And yet there was but
Actual output text   : one
Predicted output text: one


##INFERENCE

In [14]:
text = 'I have seldom heard him mention'
n = 20

# Generate next words
for i in range(n):
  tokenized_text = [vocabulary[i] for i in text.split()]
  padded_text = np.array(tf.keras.preprocessing.sequence.pad_sequences([tokenized_text], maxlen = seq_len, padding = 'post'))
  prediction = np.squeeze(uploaded_model.predict(np.expand_dims(padded_text[0], axis=0)))

  string = [rev_vocab[np.argmax(i)] for i in prediction]
  print("Input text           :", ' '.join([rev_vocab[i] for i in tokenized_text]))
  print("Predicted output text:", rev_vocab[np.argmax(prediction)])
  text = text + ' ' + rev_vocab[np.argmax(prediction)]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 92ms/step
Input text           : I have seldom heard him mention
Predicted output text: her
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 107ms/step
Input text           : I have seldom heard him mention her
Predicted output text: under
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step
Input text           : I have seldom heard him mention her under
Predicted output text: any
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
Input text           : I have seldom heard him mention her under any
Predicted output text: other
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
Input text           : I have seldom heard him mention her under any other
Predicted output text: name
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
Input text           : I have seldom heard him mention her under any other name
Predicted output text: .