In [2]:
import tensorflow as tf
import random
import numpy as np
from keras.layers import LayerNormalization, Layer, Dense, ReLU, Dropout,TextVectorization, Embedding
# from keras.backend import softmax 
from tensorflow.keras.activations import softmax
from tensorflow import math, matmul, reshape, shape, transpose, cast, float32

AttributeError: 'NoneType' object has no attribute 'Rlocation'

In [1]:
import subprocess

# Run the conda info --envs command
conda_envs = subprocess.run(['conda', 'info', '--envs'], capture_output=True, text=True)

# Print the output
print(conda_envs.stdout)


# conda environments:
#
base                   /Users/ramnaresh/anaconda3
ComputerVision         /Users/ramnaresh/anaconda3/envs/ComputerVision
Lam-Research           /Users/ramnaresh/anaconda3/envs/Lam-Research
LamResearch          * /Users/ramnaresh/anaconda3/envs/LamResearch
MachineLearning        /Users/ramnaresh/anaconda3/envs/MachineLearning
MetalTensorFlow        /Users/ramnaresh/anaconda3/envs/MetalTensorFlow
MetalTensorflow        /Users/ramnaresh/anaconda3/envs/MetalTensorflow
Transformers           /Users/ramnaresh/anaconda3/envs/Transformers
mlx                    /Users/ramnaresh/anaconda3/envs/mlx




In [2]:
tf.__version__

'2.15.0'

In [3]:
tf.keras.__version__

AttributeError: module 'tensorflow.keras' has no attribute '__version__'

### Positional embedding

In [4]:
class PositionEmbeddingFixedWeights(Layer):
    def __init__(self, seq_length, vocab_size, output_dim, **kwargs):
        super().__init__(**kwargs)
        word_embedding_matrix = self.get_position_encoding(vocab_size, output_dim)
        pos_embedding_matrix = self.get_position_encoding(seq_length, output_dim)
        self.word_embedding_layer = Embedding(input_dim=vocab_size, output_dim=output_dim,weights=[word_embedding_matrix],trainable=False)
        self.position_embedding_layer = Embedding(input_dim=seq_length, output_dim=output_dim,weights=[pos_embedding_matrix],trainable=False)
    
    def get_position_encoding(self, seq_len, d, n=10000):
        P = np.zeros((seq_len, d))
        for k in range(seq_len):
            for i in np.arange(int(d/2)):
                denominator = np.power(n, 2*i/d)
                P[k, 2*i] = np.sin(k/denominator)
                P[k, 2*i+1] = np.cos(k/denominator)
        return P
    
    
    def call(self, inputs):
        position_indices = tf.range(tf.shape(inputs)[-1])
        embedded_words = self.word_embedding_layer(inputs)
        embedded_indices = self.position_embedding_layer(position_indices)
        return embedded_words + embedded_indices

### Single headed Self Attention

In [5]:
class DotProductAttention(Layer):
    def __init__(self,**kwargs):
        super().__init__(**kwargs)
    
    def call(self,queries,keys,values,d_k,mask = None):
        
        # Scoring the queries against the keys after transposing the latter, and scaling
        scores = matmul(queries, keys, transpose_b=True) / math.sqrt(cast(d_k, float32))
        
        # Apply mask to the attention scores
        if mask is not None:
            scores += -1e9 * mask
        
        # Computing the weights by a softmax operation
        weights = softmax(scores)
        
        # Computing the attention by a weighted sum of the value vectors
        return matmul(weights, values)
        
    

### Multi - headed self attention

In [6]:
class MultiHeadAttention(Layer):
    def __init__(self, h, d_k, d_v, d_model, **kwargs):
        super().__init__(**kwargs)
        self.attention = DotProductAttention() # Scaled dot product attention
        self.heads = h # Number of attention heads to use
        self.d_k = d_k # Dimensionality of the linearly projected queries and keys
        self.d_v = d_v # Dimensionality of the linearly projected values
        self.d_model = d_model # Dimensionality of the model
        self.W_q = Dense(d_k) # Learned projection matrix for the queries
        self.W_k = Dense(d_k) # Learned projection matrix for the keys
        self.W_v = Dense(d_v) # Learned projection matrix for the values
        self.W_o = Dense(d_model) # Learned projection matrix for the multi-head output
    def reshape_tensor(self, x, heads, flag):
        if flag:
            # Tensor shape after reshaping and transposing:
            # (batch_size, heads, seq_length, -1)
            x = reshape(x, shape=(shape(x)[0], shape(x)[1], heads, -1))
            x = transpose(x, perm=(0, 2, 1, 3))
        else:
            # Reverting the reshaping and transposing operations:
            # (batch_size, seq_length, d_k)
            x = transpose(x, perm=(0, 2, 1, 3))
            x = reshape(x, shape=(shape(x)[0], shape(x)[1], self.d_k))
        return x
    
    def call(self, queries, keys, values, mask=None):
        # Rearrange the queries to be able to compute all heads in parallel
        q_reshaped = self.reshape_tensor(self.W_q(queries), self.heads, True)
        # Resulting tensor shape: (batch_size, heads, input_seq_length, -1)
        # Rearrange the keys to be able to compute all heads in parallel
        k_reshaped = self.reshape_tensor(self.W_k(keys), self.heads, True)
        # Resulting tensor shape: (batch_size, heads, input_seq_length, -1)
        # Rearrange the values to be able to compute all heads in parallel
        v_reshaped = self.reshape_tensor(self.W_v(values), self.heads, True)
        # Resulting tensor shape: (batch_size, heads, input_seq_length, -1)
        # Compute the multi-head attention output using the reshaped queries,
        # keys, and values
        o_reshaped = self.attention(q_reshaped, k_reshaped, v_reshaped, self.d_k, mask)
        # Resulting tensor shape: (batch_size, heads, input_seq_length, -1)
        # Rearrange back the output into concatenated form
        output = self.reshape_tensor(o_reshaped, self.heads, False)
        # Resulting tensor shape: (batch_size, input_seq_length, d_v)
        # Apply one final linear projection to the output to generate the multi-head
        # attention. Resulting tensor shape: (batch_size, input_seq_length, d_model)
        return self.W_o(output)

### Normalization layer

In [7]:
# Implementing the Add & Norm Layer
class AddNormalization(Layer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.layer_norm = LayerNormalization() # Layer normalization layer
    def call(self, x, sublayer_x):
        # The sublayer input and output need to be of the same shape to be summed
        add = x + sublayer_x
        # Apply layer normalization to the sum
        return self.layer_norm(add)


### Feed forward layer

In [8]:
# Implementing the Feed-Forward Layer
class FeedForward(Layer):
    def __init__(self, d_ff, d_model, **kwargs):
        super().__init__(**kwargs)
        self.fully_connected1 = Dense(d_ff) # First fully connected layer
        self.fully_connected2 = Dense(d_model) # Second fully connected layer
        self.activation = ReLU() # ReLU activation layer
        
    def call(self, x):
        # The input is passed into the two fully-connected layers, with a ReLU in between
        x_fc1 = self.fully_connected1(x)
        return self.fully_connected2(self.activation(x_fc1))

### A single decoder layer

In [9]:
class DecoderLayer(Layer):
    def __init__(self, h, d_k, d_v, d_model, d_ff, rate, **kwargs):
        super().__init__(**kwargs)
        self.multihead_attention1 = MultiHeadAttention(h, d_k, d_v, d_model)
        self.dropout1 = Dropout(rate)
        self.add_norm1 = AddNormalization()
        self.multihead_attention2 = MultiHeadAttention(h, d_k, d_v, d_model)
        self.dropout2 = Dropout(rate)
        self.add_norm2 = AddNormalization()
        self.feed_forward = FeedForward(d_ff, d_model)
        self.dropout3 = Dropout(rate)
        self.add_norm3 = AddNormalization()
    
    
    def call(self, x, encoder_output, lookahead_mask=None, padding_mask=None, training=True):
        # Multi-head attention layer
        multihead_output1 = self.multihead_attention1(x, x, x, lookahead_mask)
         
        # Expected output shape = (batch_size, sequence_length, d_model)
        # Add in a dropout layer
        multihead_output1 = self.dropout1(multihead_output1, training=training)
        # Followed by an Add & Norm layer
        addnorm_output1 = self.add_norm1(x, multihead_output1)
        # Expected output shape = (batch_size, sequence_length, d_model)
        # Followed by another multi-head attention layer
        multihead_output2 = self.multihead_attention2(addnorm_output1, encoder_output,encoder_output, padding_mask)
         
        # Add in another dropout layer
        multihead_output2 = self.dropout2(multihead_output2, training=training)
        
        # Followed by another Add & Norm layer
        addnorm_output2 = self.add_norm1(addnorm_output1, multihead_output2)
        
        # Followed by a fully connected layer
        feedforward_output = self.feed_forward(addnorm_output2)
        # Expected output shape = (batch_size, sequence_length, d_model)
        
        # Add in another dropout layer
        feedforward_output = self.dropout3(feedforward_output, training=training)
        
        # Followed by another Add & Norm layer
        return self.add_norm3(addnorm_output2, feedforward_output)

### Decoder construct

In [10]:
class Decoder(Layer):
    def __init__(self, vocab_size, sequence_length, h, d_k, d_v, d_model, d_ff, n, rate,**kwargs):
        super().__init__(**kwargs)
        self.pos_encoding = PositionEmbeddingFixedWeights(sequence_length, vocab_size,d_model)
        self.dropout = Dropout(rate)
        self.decoder_layer = [DecoderLayer(h, d_k, d_v, d_model, d_ff, rate) for _ in range(n)]
        
    
    def call(self, output_target, encoder_output, lookahead_mask=None, padding_mask=None, training=False):
        # Generate the positional encoding
        pos_encoding_output = self.pos_encoding(output_target)
        # Expected output shape = (number of sentences, sequence_length, d_model)
        # Add in a dropout layer
        x = self.dropout(pos_encoding_output, training=training)
        # Pass on the positional encoded values to each encoder layer
        for i, layer in enumerate(self.decoder_layer):
            x = layer(x, encoder_output, lookahead_mask=lookahead_mask, padding_mask=padding_mask, training=training)
        return x

In [11]:
h = 8 # Number of self-attention heads
d_k = 64 # Dimensionality of the linearly projected queries and keys
d_v = 64 # Dimensionality of the linearly projected values
d_ff = 2048 # Dimensionality of the inner fully connected layer
d_model = 512 # Dimensionality of the model sub-layers' outputs
n = 6 # Number of layers in the encoder stack
batch_size = 64 # Batch size from the training process
dropout_rate = 0.1 # Frequency of dropping the input units in the dropout layers

In [12]:
dec_vocab_size = 20 # Vocabulary size for the decoder
input_seq_length = 5 # Maximum length of the input sequence
input_seq = np.random.rand(batch_size, input_seq_length)
enc_output = np.random.rand(batch_size, input_seq_length, d_model)

In [13]:
input_seq

array([[0.74578872, 0.31021086, 0.36399593, 0.8133169 , 0.27280584],
       [0.17300805, 0.52715101, 0.73753011, 0.7257941 , 0.41685431],
       [0.92498015, 0.19735936, 0.22324765, 0.9273471 , 0.69489655],
       [0.77498968, 0.81098925, 0.21029924, 0.7098165 , 0.39923168],
       [0.34599084, 0.29722432, 0.15995964, 0.06505372, 0.48101539],
       [0.37704796, 0.52873194, 0.01886489, 0.18111276, 0.35642297],
       [0.32913466, 0.77642414, 0.05517162, 0.78733503, 0.89303435],
       [0.17091525, 0.36898292, 0.10248418, 0.20781432, 0.69450595],
       [0.56219092, 0.71912938, 0.7884896 , 0.15433083, 0.26669092],
       [0.13570547, 0.61114383, 0.93642969, 0.94558054, 0.25899686],
       [0.45077013, 0.93549129, 0.21181243, 0.93190043, 0.51392282],
       [0.75222002, 0.2790389 , 0.07615899, 0.42758309, 0.12784809],
       [0.61120744, 0.18254934, 0.26662305, 0.24820033, 0.66732167],
       [0.93728065, 0.27216735, 0.246638  , 0.94225177, 0.57048222],
       [0.38675773, 0.78086603, 0.

In [14]:
enc_output.shape

(64, 5, 512)

In [15]:
print("Input Sequence Shape:", input_seq.shape)
print("Encoder Output Shape:", enc_output.shape)

Input Sequence Shape: (64, 5)
Encoder Output Shape: (64, 5, 512)


In [16]:
decoder = Decoder(dec_vocab_size, input_seq_length, h, d_k, d_v, d_model, d_ff, n, dropout_rate)
res = decoder(input_seq, enc_output, lookahead_mask=None, padding_mask=None, training=True)

2025-01-22 22:27:16.642125: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2 Pro
2025-01-22 22:27:16.642189: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-01-22 22:27:16.642200: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2025-01-22 22:27:16.642228: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-01-22 22:27:16.642248: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


ValueError: Exception encountered when calling MultiHeadAttention.call().

[1mOnly input tensors may be passed as positional arguments. The following argument value should be passed as a keyword argument: 64 (of type <class 'int'>)[0m

Arguments received by MultiHeadAttention.call():
  • queries=tf.Tensor(shape=(64, 5, 512), dtype=float32)
  • keys=tf.Tensor(shape=(64, 5, 512), dtype=float32)
  • values=tf.Tensor(shape=(64, 5, 512), dtype=float32)
  • mask=None

In [None]:
res.shape

In [None]:
res