In [19]:
import tensorflow as tf
import random
import numpy as np
from keras.layers import LayerNormalization, Layer, Dense, ReLU, Dropout,TextVectorization, Embedding
from keras.backend import softmax 
from tensorflow import math, matmul, reshape, shape, transpose, cast, float32

In [20]:
# Display the installed TensorFlow version
tf.__version__

'2.15.0'

In [21]:
# tf.keras.__version__

In [22]:
import keras
# Display the installed Keras version
keras.__version__

'2.15.0'

### Positional embedding

In [23]:
class PositionEmbeddingFixedWeights(Layer):
    """Sum of a token embedding and a position embedding with frozen weights.

    Both lookup tables are filled by ``get_position_encoding`` (sinusoidal
    values) and created with ``trainable=False``.

    Args:
        seq_length: Number of rows in the position table.
        vocab_size: Number of rows in the token table.
        output_dim: Width of every embedding vector.
    """

    def __init__(self, seq_length, vocab_size, output_dim, **kwargs):
        super().__init__(**kwargs)
        # One sinusoidal table with a row per token id, one with a row per position.
        word_table = self.get_position_encoding(vocab_size, output_dim)
        position_table = self.get_position_encoding(seq_length, output_dim)
        self.word_embedding_layer = Embedding(
            input_dim=vocab_size,
            output_dim=output_dim,
            weights=[word_table],
            trainable=False,
        )
        self.position_embedding_layer = Embedding(
            input_dim=seq_length,
            output_dim=output_dim,
            weights=[position_table],
            trainable=False,
        )

    def get_position_encoding(self, seq_len, d, n=10000):
        """Build a ``(seq_len, d)`` table of sinusoidal encodings.

        For row ``k`` and pair index ``i``, column ``2*i`` holds
        ``sin(k / n**(2*i/d))`` and column ``2*i + 1`` holds the matching cosine.
        """
        table = np.zeros((seq_len, d))
        for i in np.arange(int(d/2)):
            # The denominator only depends on the column pair, not on the row.
            denominator = np.power(n, 2*i/d)
            sin_col = 2*i
            cos_col = 2*i+1
            for k in range(seq_len):
                angle = k/denominator
                table[k, sin_col] = np.sin(angle)
                table[k, cos_col] = np.cos(angle)
        return table

    def call(self, inputs):
        """Embed ``inputs`` and add the embedding of each position along the last axis."""
        # Positions 0 .. (size of the last axis of inputs) - 1
        positions = tf.range(tf.shape(inputs)[-1])
        word_part = self.word_embedding_layer(inputs)
        position_part = self.position_embedding_layer(positions)
        return word_part + position_part

### Single-headed self-attention (scaled dot-product)

In [24]:
class DotProductAttention(Layer):
    """Scaled dot-product attention: ``softmax(Q @ K^T / sqrt(d_k)) @ V``."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, queries, keys, values, d_k, mask=None):
        """Return the attention-weighted combination of ``values``.

        Args:
            queries, keys, values: Tensors to attend over.
            d_k: Value whose square root divides the raw scores.
            mask: Optional mask; ``-1e9 * mask`` is added to the scores before
                the softmax, so positions where the mask is 1 get ~zero weight.
        """
        # Raw similarity of every query with every key, scaled down by sqrt(d_k)
        scale = math.sqrt(cast(d_k, float32))
        scores = matmul(queries, keys, transpose_b=True) / scale

        # Push masked positions towards -infinity so the softmax ignores them
        if mask is not None:
            scores += -1e9 * mask

        # Normalise the scores into weights and mix the value vectors accordingly
        return matmul(softmax(scores), values)
        
    

### Multi-headed self-attention

In [25]:
class MultiHeadAttention(Layer):
    """Multi-head attention built on top of ``DotProductAttention``.

    Queries, keys and values are linearly projected, split into ``h`` heads,
    attended to in parallel, merged back together and projected to ``d_model``.

    Args:
        h: Number of attention heads.
        d_k: Total width of the projected queries and keys (split across heads).
        d_v: Total width of the projected values (split across heads).
        d_model: Width of the layer output.
    """

    def __init__(self, h, d_k, d_v, d_model, **kwargs):
        super().__init__(**kwargs)
        self.attention = DotProductAttention() # Scaled dot product attention
        self.heads = h # Number of attention heads to use
        self.d_k = d_k # Dimensionality of the linearly projected queries and keys
        self.d_v = d_v # Dimensionality of the linearly projected values
        self.d_model = d_model # Dimensionality of the model
        self.W_q = Dense(d_k) # Learned projection matrix for the queries
        self.W_k = Dense(d_k) # Learned projection matrix for the keys
        self.W_v = Dense(d_v) # Learned projection matrix for the values
        self.W_o = Dense(d_model) # Learned projection matrix for the multi-head output

    def reshape_tensor(self, x, heads, flag):
        """Split ``x`` into heads (``flag`` truthy) or merge the heads back (``flag`` falsy).

        Splitting turns (batch_size, seq_length, width) into
        (batch_size, heads, seq_length, width / heads). Merging is only applied
        to the attention output, whose concatenated width is ``d_v``.
        """
        if flag:
            # Tensor shape after reshaping and transposing:
            # (batch_size, heads, seq_length, -1)
            x = reshape(x, shape=(shape(x)[0], shape(x)[1], heads, -1))
            x = transpose(x, perm=(0, 2, 1, 3))
        else:
            # Reverting the reshaping and transposing operations:
            # (batch_size, seq_length, d_v)
            x = transpose(x, perm=(0, 2, 1, 3))
            # The merged tensor is built from the value vectors, so its width is
            # d_v; reshaping to d_k only worked when d_k happened to equal d_v.
            x = reshape(x, shape=(shape(x)[0], shape(x)[1], self.d_v))
        return x

    def call(self, queries, keys, values, mask=None):
        """Apply multi-head attention and return a (batch, seq, d_model) tensor.

        Args:
            queries, keys, values: Inputs to the three projection layers.
            mask: Optional mask forwarded unchanged to the attention layer.
        """
        # Rearrange the queries to be able to compute all heads in parallel
        # Resulting tensor shape: (batch_size, heads, input_seq_length, -1)
        q_reshaped = self.reshape_tensor(self.W_q(queries), self.heads, True)
        # Rearrange the keys to be able to compute all heads in parallel
        # Resulting tensor shape: (batch_size, heads, input_seq_length, -1)
        k_reshaped = self.reshape_tensor(self.W_k(keys), self.heads, True)
        # Rearrange the values to be able to compute all heads in parallel
        # Resulting tensor shape: (batch_size, heads, input_seq_length, -1)
        v_reshaped = self.reshape_tensor(self.W_v(values), self.heads, True)
        # Compute the multi-head attention output using the reshaped queries,
        # keys, and values. Note that the scale passed on is the full projected
        # width self.d_k, not the per-head width.
        # Resulting tensor shape: (batch_size, heads, input_seq_length, -1)
        o_reshaped = self.attention(q_reshaped, k_reshaped, v_reshaped, self.d_k, mask)
        # Rearrange back the output into concatenated form
        # Resulting tensor shape: (batch_size, input_seq_length, d_v)
        output = self.reshape_tensor(o_reshaped, self.heads, False)
        # Apply one final linear projection to the output to generate the multi-head
        # attention. Resulting tensor shape: (batch_size, input_seq_length, d_model)
        return self.W_o(output)

### Normalization layer

In [26]:
# Add & Norm layer: residual sum followed by layer normalization
class AddNormalization(Layer):
    """Add a sub-layer's output to its input and layer-normalize the sum."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.layer_norm = LayerNormalization() # Normalizes the residual sum

    def call(self, x, sublayer_x):
        """Return ``layer_norm(x + sublayer_x)``; both inputs must be summable."""
        return self.layer_norm(x + sublayer_x)


### Feed forward layer

In [27]:
# Position-wise feed-forward block: Dense -> ReLU -> Dense
class FeedForward(Layer):
    """Two dense layers with a ReLU in between.

    Args:
        d_ff: Width of the inner (first) dense layer.
        d_model: Width of the output (second) dense layer.
    """

    def __init__(self, d_ff, d_model, **kwargs):
        super().__init__(**kwargs)
        self.fully_connected1 = Dense(d_ff) # Expands to the inner width
        self.fully_connected2 = Dense(d_model) # Projects back to the model width
        self.activation = ReLU() # Non-linearity applied between the two

    def call(self, x):
        """Run ``x`` through the first dense layer, the ReLU, then the second dense layer."""
        hidden = self.activation(self.fully_connected1(x))
        return self.fully_connected2(hidden)

### A single decoder layer

In [28]:
class DecoderLayer(Layer):
    """One decoder block: self-attention, encoder-decoder attention, feed-forward.

    Each of the three sub-layers is followed by dropout and its own
    Add & Norm layer.

    Args:
        h: Number of attention heads.
        d_k: Width of the projected queries and keys.
        d_v: Width of the projected values.
        d_model: Width of every sub-layer output.
        d_ff: Inner width of the feed-forward sub-layer.
        rate: Dropout rate applied after each sub-layer.
    """

    def __init__(self, h, d_k, d_v, d_model, d_ff, rate, **kwargs):
        super().__init__(**kwargs)
        self.multihead_attention1 = MultiHeadAttention(h, d_k, d_v, d_model)
        self.dropout1 = Dropout(rate)
        self.add_norm1 = AddNormalization()
        self.multihead_attention2 = MultiHeadAttention(h, d_k, d_v, d_model)
        self.dropout2 = Dropout(rate)
        self.add_norm2 = AddNormalization()
        self.feed_forward = FeedForward(d_ff, d_model)
        self.dropout3 = Dropout(rate)
        self.add_norm3 = AddNormalization()

    def call(self, x, encoder_output, lookahead_mask, padding_mask, training):
        """Run one decoder block.

        Args:
            x: Decoder input for this block.
            encoder_output: Encoder output used as keys and values in the
                second attention sub-layer.
            lookahead_mask: Mask passed to the self-attention sub-layer (or None).
            padding_mask: Mask passed to the encoder-decoder attention (or None).
            training: Forwarded to the dropout layers.

        Returns:
            The output of the third Add & Norm layer.
        """
        # Self-attention over the decoder input
        # Expected output shape = (batch_size, sequence_length, d_model)
        multihead_output1 = self.multihead_attention1(x, x, x, lookahead_mask)
        multihead_output1 = self.dropout1(multihead_output1, training=training)
        # First Add & Norm layer
        # Expected output shape = (batch_size, sequence_length, d_model)
        addnorm_output1 = self.add_norm1(x, multihead_output1)

        # Encoder-decoder attention: queries from the decoder, keys/values from the encoder
        multihead_output2 = self.multihead_attention2(addnorm_output1, encoder_output, encoder_output, padding_mask)
        multihead_output2 = self.dropout2(multihead_output2, training=training)
        # Second Add & Norm layer. This must be add_norm2: reusing add_norm1 here
        # would share one set of normalization weights between two sub-layers
        # and leave add_norm2 unused.
        addnorm_output2 = self.add_norm2(addnorm_output1, multihead_output2)

        # Feed-forward sub-layer
        # Expected output shape = (batch_size, sequence_length, d_model)
        feedforward_output = self.feed_forward(addnorm_output2)
        feedforward_output = self.dropout3(feedforward_output, training=training)

        # Third Add & Norm layer
        return self.add_norm3(addnorm_output2, feedforward_output)

### Decoder construct

In [29]:
class Decoder(Layer):
    """Stack of ``n`` decoder layers on top of a fixed positional embedding.

    Args:
        vocab_size: Size of the decoder vocabulary.
        sequence_length: Maximum target sequence length.
        h, d_k, d_v, d_model, d_ff: Passed to every ``DecoderLayer``.
        n: Number of decoder layers in the stack.
        rate: Dropout rate.
    """

    def __init__(self, vocab_size, sequence_length, h, d_k, d_v, d_model, d_ff, n, rate,**kwargs):
        super().__init__(**kwargs)
        self.pos_encoding = PositionEmbeddingFixedWeights(sequence_length, vocab_size,d_model)
        self.dropout = Dropout(rate)
        # Build the stack of identically configured decoder layers
        stack = []
        for _ in range(n):
            stack.append(DecoderLayer(h, d_k, d_v, d_model, d_ff, rate))
        self.decoder_layer = stack

    def call(self, output_target, encoder_output, lookahead_mask, padding_mask, training):
        """Embed the target sequence and pass it through every decoder layer in order."""
        # Token + position embedding, then dropout
        # Expected output shape = (number of sentences, sequence_length, d_model)
        x = self.dropout(self.pos_encoding(output_target), training=training)
        # Feed the running result through each decoder layer
        for layer in self.decoder_layer:
            x = layer(x, encoder_output, lookahead_mask, padding_mask, training)
        return x

In [30]:
h = 8 # Number of self-attention heads
d_k = 64 # Dimensionality of the linearly projected queries and keys
d_v = 64 # Dimensionality of the linearly projected values
d_ff = 2048 # Dimensionality of the inner fully connected layer
d_model = 512 # Dimensionality of the model sub-layers' outputs
n = 6 # Number of layers in the decoder stack
batch_size = 64 # Batch size from the training process
dropout_rate = 0.1 # Frequency of dropping the input units in the dropout layers

In [31]:
dec_vocab_size = 20 # Vocabulary size for the decoder
input_seq_length = 5 # Maximum length of the input sequence
# NOTE(review): np.random.rand yields floats in [0, 1), but input_seq is later fed
# to the decoder's Embedding lookups as token ids — presumably every value ends up
# as id 0; consider np.random.randint(0, dec_vocab_size, ...) instead. TODO confirm.
input_seq = np.random.rand(batch_size, input_seq_length)
# Random stand-in for the encoder output consumed by the decoder
enc_output = np.random.rand(batch_size, input_seq_length, d_model)

In [32]:
# Display the random decoder input (this prints the full array)
input_seq

array([[0.23813781, 0.42660844, 0.13588108, 0.69869822, 0.15502572],
       [0.54423014, 0.72538066, 0.96723151, 0.50507157, 0.37427433],
       [0.26311723, 0.4107038 , 0.18986037, 0.04716535, 0.11306036],
       [0.77567514, 0.50454637, 0.86204885, 0.32113565, 0.12107299],
       [0.86997978, 0.6966864 , 0.69174761, 0.42036661, 0.87675345],
       [0.60359937, 0.91711472, 0.7754632 , 0.59363341, 0.28111298],
       [0.6206492 , 0.3163445 , 0.58198378, 0.70812806, 0.54572353],
       [0.49704156, 0.36933219, 0.30878125, 0.436825  , 0.73380636],
       [0.44149429, 0.40475286, 0.1570381 , 0.8534246 , 0.84651119],
       [0.31217915, 0.90187771, 0.72047894, 0.16132092, 0.63965084],
       [0.38533828, 0.47631031, 0.93778978, 0.83679139, 0.45861227],
       [0.03710763, 0.4883284 , 0.42453325, 0.30148008, 0.5892237 ],
       [0.96776412, 0.32441958, 0.42511626, 0.20273241, 0.20243375],
       [0.63239289, 0.8651447 , 0.09875595, 0.02418938, 0.00638857],
       [0.9661999 , 0.85020729, 0.

In [33]:
# Shape of the stand-in encoder output: (batch_size, input_seq_length, d_model)
enc_output.shape

(64, 5, 512)

In [34]:
decoder = Decoder(dec_vocab_size, input_seq_length, h, d_k, d_v, d_model, d_ff, n, dropout_rate)
# Decoder.call takes (output_target, encoder_output, lookahead_mask, padding_mask, training).
# Both masks are passed explicitly as None so that True is bound to `training`;
# with only four arguments it was bound to `padding_mask` instead.
res = decoder(input_seq, enc_output, None, None, training=True)

2025-01-22 22:35:19.292079: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2 Pro
2025-01-22 22:35:19.292155: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-01-22 22:35:19.292186: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2025-01-22 22:35:19.292272: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-01-22 22:35:19.292334: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [35]:
# Shape of the decoder output
res.shape

TensorShape([64, 5, 512])

In [36]:
# Display the decoder output tensor
res

<tf.Tensor: shape=(64, 5, 512), dtype=float32, numpy=
array([[[-0.31399804,  0.3582848 ,  0.9608977 , ...,  0.7796432 ,
         -0.17132258,  0.6318447 ],
        [-0.16476522,  0.26555887,  1.1058702 , ...,  0.7995784 ,
         -0.21809655,  0.60369086],
        [-0.16110751,  0.12419628,  1.1247411 , ...,  0.8273342 ,
         -0.23887831,  0.58821464],
        [-0.33306292,  0.08242546,  0.9983434 , ...,  0.84448594,
         -0.24468735,  0.59599435],
        [-0.5300367 ,  0.1823957 ,  0.80842257, ...,  0.86645854,
         -0.22730008,  0.60720867]],

       [[-0.2937455 , -0.01389704,  0.9137147 , ...,  0.9045331 ,
         -0.21043386,  0.84684753],
        [-0.15168926, -0.09704671,  1.0583394 , ...,  0.9050211 ,
         -0.24323916,  0.8080223 ],
        [-0.13834375, -0.22642714,  1.0986501 , ...,  0.90665746,
         -0.2700992 ,  0.77882814],
        [-0.29970443, -0.28521848,  0.9809275 , ...,  0.9209179 ,
         -0.26641336,  0.7722647 ],
        [-0.47807658, -0.1

In [1]:
import subprocess

# Ask conda for its list of environments, capturing stdout and stderr as text
conda_envs = subprocess.run(
    ["conda", "info", "--envs"],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True,
)

# Show what conda reported
print(conda_envs.stdout)


# conda environments:
#
base                   /Users/ramnaresh/anaconda3
ComputerVision         /Users/ramnaresh/anaconda3/envs/ComputerVision
Lam-Research           /Users/ramnaresh/anaconda3/envs/Lam-Research
LamResearch            /Users/ramnaresh/anaconda3/envs/LamResearch
MachineLearning        /Users/ramnaresh/anaconda3/envs/MachineLearning
MetalTensorFlow        /Users/ramnaresh/anaconda3/envs/MetalTensorFlow
MetalTensorflow        /Users/ramnaresh/anaconda3/envs/MetalTensorflow
Transformers         * /Users/ramnaresh/anaconda3/envs/Transformers
mlx                    /Users/ramnaresh/anaconda3/envs/mlx


