In [8]:
# Import the libraries
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

In [7]:
import os
# Set TF_CPP_MIN_LOG_LEVEL to '3' in this process's environment.
# NOTE(review): presumably meant to silence TensorFlow's native logging; this
# likely only takes effect when it runs before `import tensorflow` — confirm
# the intended cell order.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [None]:
# Turn off the tensorflow loggings.
# The variable has to be in the environment BEFORE tensorflow is imported;
# setting it after the import (as this cell used to do) does not silence the
# start-up messages.
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Import the libraries
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

# Hyper-parameters used by the cells below
vocab_size: int = 10               # passed to TextVectorization as max_tokens
output_sequence_length: int = 5    # passed to TextVectorization as output_sequence_length
output_length: int = 6             # used further down as the Embedding output_dim


############################
#    TEXT VECTORIZATION    #
############################

# Toy corpus: two samples, each a list holding a single sentence
sentences = [
    ["I am a robot"],
    ["you too robot"],
]

# The corpus as a tf.data.Dataset (used to fit the vectorizer)
sentence_data = tf.data.Dataset.from_tensor_slices(sentences)

# The corpus as a string tensor (used as the vectorizer's input)
word_tensors = tf.convert_to_tensor(sentences, dtype=tf.string)

# Text vectorizer used for preprocessing
vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=output_sequence_length,
)

# Fit the vectorizer on the corpus
vectorize_layer.adapt(sentence_data)

# Turn each sentence into a sequence of token ids
vectorized_words = vectorize_layer(word_tensors)

# Show the fitted vocabulary and the vectorized corpus
print("Vocabulary: ", vectorize_layer.get_vocabulary())
print("Vectorized words: ", vectorized_words)



In [2]:
# Hyper-parameters for the text vectorizer created below
vocab_size: int = 10               # passed as max_tokens
output_sequence_length: int = 5    # passed as output_sequence_length

In [9]:
# Toy corpus: two samples, each a list holding a single sentence
sentences = [
    ["I am a robot"],
    ["you too robot"],
]
sentences

[['I am a robot'], ['you too robot']]

In [10]:
# Slice the corpus into a tf.data.Dataset (one element per sample)
sentence_data = tf.data.Dataset.from_tensor_slices(tensors=sentences)
sentence_data

<_TensorSliceDataset element_spec=TensorSpec(shape=(1,), dtype=tf.string, name=None)>

In [14]:
# Hold the corpus as a single string tensor
word_tensors = tf.convert_to_tensor(value=sentences, dtype=tf.string)
word_tensors

<tf.Tensor: shape=(2, 1), dtype=string, numpy=
array([[b'I am a robot'],
       [b'you too robot']], dtype=object)>

In [15]:
# Text vectorizer used for preprocessing
vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=output_sequence_length,
)

# Fit the vectorizer on the corpus
vectorize_layer.adapt(sentence_data)

# Turn each sentence into a sequence of token ids
vectorized_words = vectorize_layer(word_tensors)

# Show the fitted vocabulary and the vectorized corpus
print("Vocabulary: ", vectorize_layer.get_vocabulary())
print("Vectorized words: ", vectorized_words)

Vocabulary:  ['', '[UNK]', 'robot', 'you', 'too', 'i', 'am', 'a']
Vectorized words:  tf.Tensor(
[[5 6 7 2 0]
 [3 4 2 0 0]], shape=(2, 5), dtype=int64)


In [16]:
##############################
#    WORD EMBEDDING LAYER    #
##############################  

In [17]:
# Width of each embedding vector (passed below as the Embedding output_dim)
output_length: int = 6

In [26]:
# Embedding table for the token ids: vocab_size rows, output_length columns
word_embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=output_length,
)

# Look up an embedding vector for every token id
embedded_words = word_embedding_layer(vectorized_words)

print("Embedded words: ", embedded_words)

Embedded words:  tf.Tensor(
[[[-0.01089402  0.01386965  0.04864142  0.01684561  0.03848663
    0.04736397]
  [ 0.00924904  0.00032661  0.03953311 -0.00486473 -0.04105572
   -0.02126145]
  [ 0.00253134 -0.03019602 -0.01808629 -0.01031756  0.02852184
    0.00721966]
  [-0.0281015  -0.04744308 -0.04805942 -0.03590043  0.00918899
   -0.03965997]
  [-0.04285062 -0.04663532 -0.02162063  0.02399271  0.0155862
   -0.03969042]]

 [[ 0.01689786 -0.01927587  0.00308625 -0.04438348  0.04125218
    0.00348777]
  [ 0.0100126   0.01868899  0.00589956 -0.03106445  0.03364502
    0.01358403]
  [-0.0281015  -0.04744308 -0.04805942 -0.03590043  0.00918899
   -0.03965997]
  [-0.04285062 -0.04663532 -0.02162063  0.02399271  0.0155862
   -0.03969042]
  [-0.04285062 -0.04663532 -0.02162063  0.02399271  0.0155862
   -0.03969042]]], shape=(2, 5, 6), dtype=float32)


In [None]:
##################################
#    POSITION EMBEDDING LAYER    #
##################################

In [27]:
# Embedding table for the positions: output_sequence_length rows, output_length columns
position_embedding_layer = tf.keras.layers.Embedding(
    input_dim=output_sequence_length,
    output_dim=output_length,
)

# Position indices 0 .. output_sequence_length - 1
position_indices = tf.range(start=0, limit=output_sequence_length)

# Look up an embedding vector for every position
embedded_indices = position_embedding_layer(position_indices)

print("Embedded indices: ", embedded_indices)

Embedded indices:  tf.Tensor(
[[ 0.00608952  0.00206887 -0.03670409  0.03553356  0.03877778 -0.04307035]
 [ 0.04962518  0.04153587 -0.00615101 -0.04446935  0.00044483  0.00444321]
 [ 0.04820463  0.01890317  0.03044811 -0.01812079 -0.04739708  0.01327846]
 [-0.0372933  -0.01662916  0.03327545 -0.01628338 -0.0472406  -0.01894217]
 [ 0.02533323  0.04628959  0.03844256  0.02668405  0.03564659 -0.03734051]], shape=(5, 6), dtype=float32)


In [None]:
#########################
#    FINAL EMBEDDING    #
#########################

In [28]:
# Final embedding = word embedding + position embedding.
# The position embeddings carry no batch axis, so the sum relies on broadcasting.
final_output_embedding = (
    embedded_words
    + embedded_indices
)

print("Final output embedding: ", final_output_embedding)

Final output embedding:  tf.Tensor(
[[[-0.0048045   0.01593852  0.01193733  0.05237917  0.07726441
    0.00429362]
  [ 0.05887423  0.04186248  0.0333821  -0.04933408 -0.04061089
   -0.01681825]
  [ 0.05073597 -0.01129285  0.01236182 -0.02843835 -0.01887524
    0.02049812]
  [-0.0653948  -0.06407224 -0.01478396 -0.05218381 -0.03805162
   -0.05860213]
  [-0.0175174  -0.00034573  0.01682193  0.05067676  0.05123279
   -0.07703093]]

 [[ 0.02298738 -0.017207   -0.03361784 -0.00884992  0.08002996
   -0.03958259]
  [ 0.05963779  0.06022485 -0.00025145 -0.07553379  0.03408985
    0.01802723]
  [ 0.02010312 -0.02853991 -0.0176113  -0.05402121 -0.03820809
   -0.02638151]
  [-0.08014393 -0.06326447  0.01165482  0.00770932 -0.0316544
   -0.05863259]
  [-0.0175174  -0.00034573  0.01682193  0.05067676  0.05123279
   -0.07703093]]], shape=(2, 5, 6), dtype=float32)


In [None]:
###############################
#    EMBEDDING LAYER CLASS    #
###############################

In [29]:
# Word + position embedding layer
class PositionEmbeddingLayer(tf.keras.layers.Layer):
    """Adds a position embedding to a word embedding.

    Args:
        seq_length: ``input_dim`` of the position embedding table.
        vocab_size: ``input_dim`` of the word embedding table.
        output_dim: ``output_dim`` of both embedding tables.
        **kwargs: forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, seq_length, vocab_size, output_dim, **kwargs):
        super().__init__(**kwargs)

        # Lookup table for the token ids
        self.word_embedding_layer = tf.keras.layers.Embedding(
            input_dim=vocab_size,
            output_dim=output_dim,
        )

        # Lookup table for the position indices
        self.position_embedding_layer = tf.keras.layers.Embedding(
            input_dim=seq_length,
            output_dim=output_dim,
        )

    def call(self, inputs):
        """Return word embeddings of ``inputs`` plus the embeddings of their positions."""
        # One index per entry along the last axis of `inputs`: 0, 1, ..., n - 1
        num_positions = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=num_positions)

        # Word lookup first, then position lookup; the two results are summed
        return (
            self.word_embedding_layer(inputs)
            + self.position_embedding_layer(positions)
        )

In [31]:
# Build the combined word + position embedding layer
custom_embedding_layer = PositionEmbeddingLayer(
    seq_length=output_sequence_length,
    vocab_size=vocab_size,
    output_dim=output_length,
)

# Embed the vectorized corpus
embedded_layer_output = custom_embedding_layer(vectorized_words)

print("Embedded layer output: ", embedded_layer_output)

Embedded layer output:  tf.Tensor(
[[[-0.0048573   0.01108037 -0.00096008  0.00831679 -0.07041946
   -0.00099148]
  [-0.04668096 -0.04576692  0.04594807 -0.0551206  -0.07384367
    0.00989917]
  [ 0.00748022  0.04786074 -0.07845758  0.0040458  -0.04062707
    0.03330592]
  [-0.01791889  0.03598766 -0.0087663  -0.01792306  0.0187975
    0.03526177]
  [ 0.00701702 -0.06934252  0.03205752 -0.0727486   0.06291012
   -0.00442809]]

 [[-0.007153   -0.02741077 -0.06575882 -0.00222047 -0.02321532
    0.00115896]
  [-0.01011484  0.00387889  0.02005375  0.00350453 -0.0184971
    0.0601273 ]
  [-0.00112214  0.05705871 -0.07803564  0.00512144 -0.05553848
    0.00906985]
  [ 0.04086523 -0.01879575  0.04493554 -0.07944542  0.06024898
   -0.00207628]
  [ 0.00701702 -0.06934252  0.03205752 -0.0727486   0.06291012
   -0.00442809]]], shape=(2, 5, 6), dtype=float32)


In [None]:
#####################################################
#    POSITIONAL EMBEDDING LAYER WITH SINE/COSINE    #
#####################################################

In [1]:
# Turn off the tensorflow loggings
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Import the libraries
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

# Hyper-parameters used by the rest of this section
vocab_size: int = 10               # passed to TextVectorization as max_tokens
output_sequence_length: int = 5    # passed to TextVectorization as output_sequence_length
output_length: int = 6             # used as output_dim when the embedding layer is built


############################
#    TEXT VECTORIZATION    #
############################

# Toy corpus: two samples, each a list holding a single sentence
sentences = [
    ["I am a robot"],
    ["you too robot"],
]

# Dataset view of the corpus, used to fit the vectorizer
sentence_data = tf.data.Dataset.from_tensor_slices(sentences)

# Tensor view of the corpus, used as the vectorizer's input
word_tensors = tf.convert_to_tensor(sentences, dtype=tf.string)

# Text vectorizer used for preprocessing
vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=output_sequence_length,
)

# Fit the vectorizer on the corpus
vectorize_layer.adapt(sentence_data)

# Turn each sentence into a sequence of token ids
vectorized_words = vectorize_layer(word_tensors)

# Show the fitted vocabulary and the vectorized corpus
print("Vocabulary: ", vectorize_layer.get_vocabulary())
print("Vectorized words: ", vectorized_words)



Vocabulary:  ['', '[UNK]', 'robot', 'you', 'too', 'i', 'am', 'a']
Vectorized words:  tf.Tensor(
[[5 6 7 2 0]
 [3 4 2 0 0]], shape=(2, 5), dtype=int64)


In [8]:
###################################
#    POSITIONAL ENCODING LAYER    #
###################################

# Position embedding layer
# NOTE(review): the output recorded under this cell looks like sine/cosine
# values (e.g. 0.14112, 1.9999...), which this randomly initialised layer would
# not produce — presumably the cell was last run with a fixed-weight variant;
# confirm and re-run.
class PositionEmbeddingLayer(tf.keras.layers.Layer):
    """Sums a word-embedding lookup and a position-embedding lookup.

    Args:
        seq_length: ``input_dim`` of the position embedding table.
        vocab_size: ``input_dim`` of the word embedding table.
        output_dim: ``output_dim`` shared by both tables.
        **kwargs: forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, seq_length, vocab_size, output_dim, **kwargs):
        super().__init__(**kwargs)

        # Token-id lookup table
        self.word_embedding_layer = tf.keras.layers.Embedding(
            input_dim=vocab_size, output_dim=output_dim
        )

        # Position-index lookup table
        self.position_embedding_layer = tf.keras.layers.Embedding(
            input_dim=seq_length, output_dim=output_dim
        )

    def call(self, inputs):
        """Embed ``inputs`` and add the embedding of each index along its last axis."""
        # Indices 0 .. n - 1, where n is the size of the last axis of `inputs`
        last_axis_size = tf.shape(inputs)[-1]
        index_range = tf.range(start=0, limit=last_axis_size)

        word_part = self.word_embedding_layer(inputs)
        position_part = self.position_embedding_layer(index_range)

        return word_part + position_part
    
# Build the embedding layer from the hyper-parameters defined above
custom_embedding_layer = PositionEmbeddingLayer(
    seq_length=output_sequence_length,
    vocab_size=vocab_size,
    output_dim=output_length,
)

# Embed the vectorized corpus
embedded_layer_output = custom_embedding_layer(vectorized_words)

print("Embedded layer output: ", embedded_layer_output)

Embedded layer output:  tf.Tensor(
[[[-0.9589243   1.2836622   0.23000172  1.9731903   0.01077196
    1.9999421 ]
  [ 0.56205547  1.5004725   0.3213085   1.9603932   0.01508068
    1.9999142 ]
  [ 1.566284    0.3377554   0.41192317  1.9433732   0.01938933
    1.999877  ]
  [ 1.0504174  -1.4061394   0.2314966   1.9860148   0.01077211
    1.9999698 ]
  [-0.7568025   0.3463564   0.18459873  1.982814    0.00861763
    1.9999628 ]]

 [[ 0.14112     0.0100075   0.1387981   1.9903207   0.00646326
    1.9999791 ]
  [ 0.08466846 -0.11334133  0.23099795  1.9817369   0.01077207
    1.9999605 ]
  [ 1.8185948  -0.8322937   0.185397    1.9913884   0.00861771
    1.9999814 ]
  [ 0.14112     0.0100075   0.1387981   1.9903207   0.00646326
    1.9999791 ]
  [-0.7568025   0.3463564   0.18459873  1.982814    0.00861763
    1.9999628 ]]], shape=(2, 5, 6), dtype=float32)


In [16]:
#######################################
#    SCALED DOT PRODUCT ATTENTION     #
#######################################

In [21]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Import the libraries
import tensorflow as tf
import numpy as np

# Sizes of the random demo tensors created below
d_k: int = 64               # Dimension of the key vectors (the queries share it)
d_v: int = 64               # Dimension of the value vectors
batch_size: int = 64        # Size of the leading (batch) axis of the demo tensors
input_seq_length: int = 5   # Maximum length of the input sequence

# Scaled dot-product attention layer
class ScaledDotProductAttention(tf.keras.layers.Layer):
    """Computes softmax(queries · keysᵀ / sqrt(d_k)) · values, with an optional mask."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, queries, keys, values, d_k, mask=None):
        """Return the attention-weighted sum of ``values``.

        Args:
            queries: query tensor, multiplied with ``keys`` (transposed on its
                last two axes).
            keys: key tensor.
            values: value tensor, multiplied by the attention weights.
            d_k: the scores are divided by the square root of this value.
            mask: optional tensor; it is multiplied by -1e9 and added to the
                scores before the softmax.
        """
        # Scaling factor sqrt(d_k), as float32
        scale = tf.math.sqrt(tf.cast(d_k, tf.float32))

        # Scaled query/key similarity
        logits = tf.matmul(queries, keys, transpose_b=True) / scale

        # Non-zero mask entries push the matching scores to a large negative value
        if mask is not None:
            logits += -1e9 * mask

        # Normalise the scores and mix the value vectors accordingly
        attention_weights = tf.keras.backend.softmax(logits)
        return tf.matmul(attention_weights, values)
    
# Random queries, keys and values of shape (batch_size, input_seq_length, depth)
queries = np.random.random((batch_size, input_seq_length, d_k))
keys = np.random.random((batch_size, input_seq_length, d_k))
values = np.random.random((batch_size, input_seq_length, d_v))

# Build the attention layer
attention_layer = ScaledDotProductAttention()

# Run attention on the random tensors (no mask)
attention_output = attention_layer(queries, keys, values, d_k)

print("Attention output: ", attention_output)

Attention output:  tf.Tensor(
[[[0.42552757 0.4209954  0.44070917 ... 0.44263887 0.71424043 0.6253689 ]
  [0.40977898 0.3985669  0.44573477 ... 0.48359036 0.7332069  0.58461004]
  [0.4088182  0.40428117 0.45158815 ... 0.48357344 0.7292079  0.58147603]
  [0.41378352 0.4027637  0.44322675 ... 0.5019091  0.7352413  0.56602585]
  [0.41841158 0.4188102  0.44571918 ... 0.45981276 0.71746486 0.60514796]]

 [[0.5964128  0.59765047 0.69386137 ... 0.53982836 0.6785065  0.6883336 ]
  [0.6070596  0.6019193  0.6729281  ... 0.528684   0.66200775 0.7016909 ]
  [0.6027177  0.5892477  0.6794846  ... 0.53259146 0.6672454  0.6915069 ]
  [0.59733444 0.59044796 0.6914817  ... 0.5307452  0.6742085  0.6903019 ]
  [0.6056535  0.5855043  0.67923135 ... 0.5180448  0.66661626 0.68405515]]

 [[0.60367906 0.52061844 0.34178936 ... 0.38859078 0.62887084 0.6432541 ]
  [0.58475596 0.5131419  0.34162548 ... 0.38594082 0.6261367  0.6466565 ]
  [0.5915015  0.51155126 0.35172242 ... 0.37740138 0.6168307  0.64545625]
  [0

In [None]:
##############################
#    MULTI-HEAD ATTENTION    #
##############################

In [1]:
# Turn off the tensorflow logging
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Import the libraries
import tensorflow as tf
import numpy as np

# Scaled dot-product attention layer
class ScaledDotProductAttentionLayer(tf.keras.layers.Layer):
    """Scaled dot-product attention with an optional additive mask."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, queries, keys, values, d_k, mask=None):
        """Weight ``values`` by the softmax of the scaled query/key scores.

        Args:
            queries: query tensor.
            keys: key tensor; transposed on its last two axes for the product.
            values: value tensor.
            d_k: the scores are divided by ``sqrt(d_k)``.
            mask: optional tensor; ``-1e9 * mask`` is added to the scores.

        Returns:
            The product of the attention weights and ``values``.
        """
        # Query/key similarity, scaled by sqrt(d_k)
        similarity = tf.matmul(queries, keys, transpose_b=True)
        scaled_scores = similarity / tf.math.sqrt(tf.cast(d_k, tf.float32))

        # Optional mask: non-zero entries receive a large negative offset
        if mask is not None:
            scaled_scores += (-1e9 * mask)

        # Softmax over the scores, then the weighted sum of the values
        return tf.matmul(tf.keras.backend.softmax(scaled_scores), values)
    

# Multi-head attention class
class MultiHeadAttentionLayer(tf.keras.layers.Layer):
    """Multi-head attention built on ``ScaledDotProductAttentionLayer``.

    Queries, keys and values each go through a Dense projection (``d_k``,
    ``d_k`` and ``d_v`` units), are split into ``h`` heads, attended to in
    parallel, merged back and projected to ``d_model`` units.

    Args:
        h: number of attention heads.
        d_k: number of units of the query and key projections.
        d_v: number of units of the value projection.
        d_model: number of units of the output projection.
        **kwargs: forwarded to ``tf.keras.layers.Layer``.

    Raises:
        ValueError: if ``d_k`` or ``d_v`` is not divisible by ``h``.
    """

    # Constructor function
    def __init__(self, h, d_k, d_v, d_model, **kwargs):

        # Inherit the parent's constructor
        super().__init__(**kwargs)

        # The projected tensors are split evenly across the heads, so both
        # widths must be divisible by h; fail here instead of inside tf.reshape.
        if d_k % h != 0 or d_v % h != 0:
            raise ValueError(
                f"d_k ({d_k}) and d_v ({d_v}) must both be divisible by the number of heads ({h})"
            )

        # Initialize the scaled dot product attention layer
        self.attention = ScaledDotProductAttentionLayer()

        # Initialization
        self.heads = h            # Number of attention heads
        self.d_k = d_k            # Dimension of the key vector (and also the query vector)
        self.d_v = d_v            # Dimension of the value vector
        self.d_model = d_model    # Dimension of the model

        # Dense layers acting as projection matrices for queries, keys, values, and output
        self.W_q = tf.keras.layers.Dense(units=d_k)
        self.W_k = tf.keras.layers.Dense(units=d_k)
        self.W_v = tf.keras.layers.Dense(units=d_v)
        self.W_o = tf.keras.layers.Dense(units=d_model)


    # Function for reshaping the tensor
    def reshape_tensor(self, x, heads, flag):
        """Split a projected tensor into heads, or merge the heads back.

        Args:
            x: tensor to reshape.
            heads: number of heads to split into (only used when ``flag`` is true).
            flag: true to split (batch_size, seq_length, width) into
                (batch_size, heads, seq_length, -1); false to merge a
                (batch_size, heads, seq_length, depth) tensor back into
                (batch_size, seq_length, d_v).

        Returns:
            The reshaped tensor.
        """

        # If flag is on
        # Used when receiving the linearly projected queries, keys, or values as input
        # Final shape should be: (batch_size, heads, seq_length, -1)
        if flag:

            # Reshape the tensor
            x = tf.reshape(x, shape=(tf.shape(x)[0], tf.shape(x)[1], heads, -1))

            # Transpose the tensor
            x = tf.transpose(x, perm=(0, 2, 1, 3))

        # If flag is off
        # Used after the data was fed through the attention layer
        # Final shape should be: (batch_size, seq_length, d_v)
        else:

            # Transpose
            x = tf.transpose(x, perm=(0, 2, 1, 3))

            # Reshape. The merged width is that of the value projection (d_v):
            # the attention output is a weighted sum of the value vectors.
            # Using d_k here only worked while d_k == d_v.
            x = tf.reshape(x, shape=(tf.shape(x)[0], tf.shape(x)[1], self.d_v))

        return x


    # Call function
    def call(self, queries, keys, values, mask=None):
        """Apply multi-head attention.

        Args:
            queries: input to the query projection.
            keys: input to the key projection.
            values: input to the value projection.
            mask: optional mask forwarded to the attention layer.

        Returns:
            The merged heads after the ``d_model``-unit output projection.
        """

        # Reshape queries, keys, values to be able to compute all heads in parallel
        queries_reshaped = self.reshape_tensor(self.W_q(queries), self.heads, True)
        keys_reshaped = self.reshape_tensor(self.W_k(keys), self.heads, True)
        values_reshaped = self.reshape_tensor(self.W_v(values), self.heads, True)

        # Compute multi-head attention
        # NOTE(review): the scores are scaled by sqrt(d_k) although each head only
        # sees d_k / heads features — confirm this is intended.
        output_reshaped = self.attention(queries_reshaped, keys_reshaped, values_reshaped, self.d_k, mask)

        # Rearrange the output into concatenated form
        output = self.reshape_tensor(output_reshaped, self.heads, False)

        # Apply the linear projection to the output
        output = self.W_o(output)

        return output
    

# Hyper-parameters of the demo
h = 8                     # Number of self-attention heads
d_k = 64                  # Units of the query and key projections
d_v = 64                  # Units of the value projection
d_model = 512             # Units of the output projection
batch_size = 64           # Size of the leading (batch) axis of the demo tensors
input_seq_length = 64     # Maximum length of the input sequence


# Random queries, keys and values of shape (batch_size, input_seq_length, depth)
queries = np.random.random((batch_size, input_seq_length, d_k))
keys = np.random.random((batch_size, input_seq_length, d_k))
values = np.random.random((batch_size, input_seq_length, d_v))


# Build the multi-head attention layer
multi_head_attention_layer = MultiHeadAttentionLayer(h, d_k, d_v, d_model)

# Run it on the random tensors (no mask)
output_attention = multi_head_attention_layer(queries, keys, values)

print("Output of the multi-head attention: \n", output_attention)

Output of the multi-head attention: 
 tf.Tensor(
[[[-0.08046755 -0.280252   -0.19821672 ... -0.47180074  0.02163792
   -0.10966138]
  [-0.08029709 -0.27922976 -0.19712687 ... -0.47145772  0.02097024
   -0.11003992]
  [-0.07974803 -0.27940008 -0.19841315 ... -0.47166383  0.02262438
   -0.11007026]
  ...
  [-0.08090815 -0.28016812 -0.1984467  ... -0.47133493  0.02119658
   -0.10933197]
  [-0.08028192 -0.27965182 -0.19864747 ... -0.47178677  0.02115291
   -0.11039263]
  [-0.08022219 -0.27961904 -0.19810556 ... -0.47198555  0.02190784
   -0.11036518]]

 [[-0.08683407 -0.2937707  -0.19879058 ... -0.4523206   0.00739739
   -0.08516073]
  [-0.08527576 -0.29362044 -0.19832356 ... -0.45231304  0.00636609
   -0.08442787]
  [-0.08668507 -0.2950291  -0.19909163 ... -0.4523024   0.00647642
   -0.08469182]
  ...
  [-0.0869846  -0.29458934 -0.19853981 ... -0.45199537  0.00694377
   -0.08457109]
  [-0.08712101 -0.29391465 -0.19853598 ... -0.45353624  0.00666998
   -0.08377881]
  [-0.08585763 -0.294817

In [8]:
#############################
#    TRANSFORMER ENCODER    #
#############################

In [3]:
# Turn off the tensorflow logging
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Import the libraries
import numpy as np
import tensorflow as tf
from multihead_attention import MultiHeadAttentionLayer
from positional_encoding import PositionEmbeddingLayer, PositionEmbeddingLayerWithFixedWeights

# Add & Norm layer
class AddNormalization(tf.keras.layers.Layer):
    """Adds a sub-layer's output to its input and layer-normalises the sum."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        # Layer normalisation applied to the sum
        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    def call(self, x, sublayer_x):
        """Return ``layer_norm(x + sublayer_x)``.

        Args:
            x: the sub-layer's input.
            sublayer_x: the sub-layer's output.
        """
        return self.layer_norm(x + sublayer_x)
    
# Feed-forward layer
class FeedForward(tf.keras.layers.Layer):
    """Two Dense layers with a ReLU in between.

    Args:
        d_ff: number of units of the first Dense layer.
        d_model: number of units of the second Dense layer.
        **kwargs: forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, d_ff, d_model, **kwargs):
        super().__init__(**kwargs)

        # Dense(d_ff) -> ReLU -> Dense(d_model)
        self.fully_connected_1 = tf.keras.layers.Dense(units=d_ff)
        self.fully_connected_2 = tf.keras.layers.Dense(units=d_model)
        self.activation = tf.keras.layers.ReLU()

    def call(self, x):
        """Pass ``x`` through the first Dense layer, the ReLU and the second Dense layer."""
        hidden = self.activation(self.fully_connected_1(x))
        return self.fully_connected_2(hidden)
    
# Single Transformer encoder layer
class EncoderLayer(tf.keras.layers.Layer):
    """Self-attention and feed-forward sub-layers, each followed by dropout and Add & Norm.

    Args:
        h, d_k, d_v, d_model: forwarded to ``MultiHeadAttentionLayer``.
        d_ff: forwarded (with ``d_model``) to ``FeedForward``.
        rate: dropout rate of both dropout layers.
        **kwargs: forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, h, d_k, d_v, d_model, d_ff, rate, **kwargs):
        super().__init__(**kwargs)

        # Sub-layers
        self.multihead_attention = MultiHeadAttentionLayer(h, d_k, d_v, d_model)
        self.feed_forward = FeedForward(d_ff, d_model)

        # One Add & Norm per sub-layer
        self.add_norm_1 = AddNormalization()
        self.add_norm_2 = AddNormalization()

        # One dropout per sub-layer
        self.dropout_1 = tf.keras.layers.Dropout(rate)
        self.dropout_2 = tf.keras.layers.Dropout(rate)

    def call(self, x, padding_mask, training):
        """Run one encoder layer.

        Args:
            x: layer input; used as queries, keys and values of the attention.
            padding_mask: mask forwarded to the attention layer.
            training: forwarded to both dropout layers.

        Returns:
            The output of the second Add & Norm.
        """
        # Sub-layer 1: self-attention -> dropout -> Add & Norm with the input
        attended = self.multihead_attention(x, x, x, padding_mask)
        attended = self.dropout_1(attended, training=training)
        normed_attention = self.add_norm_1(x, attended)

        # Sub-layer 2: feed-forward -> dropout -> Add & Norm with sub-layer 1's output
        transformed = self.feed_forward(normed_attention)
        transformed = self.dropout_2(transformed, training=training)

        return self.add_norm_2(normed_attention, transformed)

# Full encoder stack
class Encoder(tf.keras.layers.Layer):
    """Positional encoding, dropout and a stack of ``n`` ``EncoderLayer`` blocks.

    Args:
        vocab_size, sequence_length: forwarded (with ``d_model``) to
            ``PositionEmbeddingLayerWithFixedWeights``.
        h, d_k, d_v, d_model, d_ff: forwarded to every ``EncoderLayer``.
        n: number of encoder layers.
        rate: dropout rate, used here and in every ``EncoderLayer``.
        **kwargs: forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, vocab_size, sequence_length, h, d_k, d_v, d_model, d_ff, n, rate, **kwargs):
        super().__init__(**kwargs)

        # Positional encoding applied to the raw input
        self.positional_encoding = PositionEmbeddingLayerWithFixedWeights(sequence_length, vocab_size, d_model)

        # Dropout applied right after the positional encoding
        self.dropout = tf.keras.layers.Dropout(rate)

        # The n stacked encoder layers
        self.encoder_layer = [
            EncoderLayer(h, d_k, d_v, d_model, d_ff, rate)
            for _ in range(n)
        ]

    def call(self, input_sentence, padding_mask, training):
        """Encode ``input_sentence``.

        Args:
            input_sentence: input passed to the positional encoding layer.
            padding_mask: mask forwarded to every encoder layer.
            training: forwarded to the dropout and to every encoder layer.

        Returns:
            The output of the last encoder layer.
        """
        encoded = self.positional_encoding(input_sentence)
        encoded = self.dropout(encoded, training=training)

        # Feed the result through the layers in order
        for layer in self.encoder_layer:
            encoded = layer(encoded, padding_mask, training)

        return encoded
    
# Initialization
h = 8                     # Number of self-attention heads
d_k = 64                  # Dimension of the key and query vectors
d_v = 64                  # Dimension of the value vectors
d_ff = 2048               # Dimension of the inner feed-forward layer
d_model = 512             # Dimension of the model sub-layers' outputs
n = 6                     # Number of encoder layers
batch_size = 64           # Batch size
dropout_rate = 0.1        # Dropout rate
enc_vocab_size = 8192     # Encoder vocabulary size
input_seq_length = 64     # Maximum length of the input sequence

# Initialize the input sequence with random token ids.
# The ids are drawn as integers in [0, enc_vocab_size); np.random.random would
# yield floats in [0, 1), which are not usable as token ids.
# NOTE(review): assumes PositionEmbeddingLayerWithFixedWeights looks the ids up
# in an embedding table, like PositionEmbeddingLayer does — confirm against
# positional_encoding.
input_seq = np.random.randint(low=0, high=enc_vocab_size, size=(batch_size, input_seq_length))

# Encoder architecture
encoder = Encoder(enc_vocab_size, input_seq_length, h, d_k, d_v, d_model, d_ff, n, dropout_rate)

# Feed the data (no padding mask, training mode on)
output_encoder = encoder(input_seq, None, True)

print("Output of the encoder: \n", output_encoder)

Output of the encoder: 
 tf.Tensor(
[[[ 0.37819418  0.12199633 -0.12122659 ...  1.2220829   0.9045144
   -1.1083595 ]
  [-0.70115393 -0.73868483  1.1814679  ...  1.0412371   0.18032189
    0.37272906]
  [-0.36027443 -1.2345542   0.28285587 ...  1.7093079   0.39469686
   -0.08427565]
  ...
  [-0.7033425  -0.81434023  0.2103533  ...  0.7067392   0.47598606
   -0.34587842]
  [-0.71203476 -0.01993795 -0.04863934 ...  0.8154599   0.00322881
   -0.15144242]
  [-0.22229958 -0.8755953   1.2803488  ...  1.049097    0.12476938
   -0.78157794]]

 [[-0.729063   -0.5781892   0.36787152 ...  1.5686175   0.7096653
    0.02643431]
  [-0.1265513  -0.3757296   0.3155213  ...  1.2185204   0.62619156
   -0.25125715]
  [-0.95256996  0.37783024  0.41807356 ...  1.8436189   1.0409188
    0.01972632]
  ...
  [-0.69148254 -1.6627825   0.14145178 ...  0.13539803  0.67895925
    0.36161864]
  [-0.87217855 -0.09892818  0.24182941 ...  1.5428499  -0.5753602
   -0.15251134]
  [-1.1214936  -0.28015083  0.3957598  ..

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch

# Name of the checkpoint to load
model = "tiiuae/falcon-40b-instruct"

# Tokenizer matching the checkpoint
tokenizer = AutoTokenizer.from_pretrained(model)

# Text-generation pipeline around the checkpoint.
# NOTE(review): trust_remote_code=True allows code shipped with the model
# repository to run locally — make sure the source is trusted.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
)

# Generate one sampled continuation of the prompt
sequences = pipeline(
    "Girafatron is obsessed with giraffes, the most glorious animal on the face of this Earth. "
    "Giraftron believes all other animals are irrelevant when compared to the glorious majesty of the giraffe.\n"
    "Daniel: Hello, Girafatron!\n"
    "Girafatron:",
    max_length=200,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)

# Print every generated sequence
for seq in sequences:
    print(f"Result: {seq['generated_text']}")


  from .autonotebook import tqdm as notebook_tqdm
2023-06-26 14:01:25.838970: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-26 14:01:25.875735: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Downloading shards:   0%|          | 0/9 [00:04<?, ?it/s]


KeyboardInterrupt: 