<a href="https://colab.research.google.com/github/sagsarkar/ML_Coding/blob/main/deep_learning_layers_in_numpy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

Dropout layer

In [2]:
class DropoutLayer:
  """Inverted dropout.

  In training mode each element is zeroed with probability ``dropout_rate``
  and the survivors are scaled by ``1 / (1 - dropout_rate)`` so the expected
  activation is unchanged. In inference mode the input passes through as is.

  Args:
    dropout_rate: probability of dropping an element, in ``[0, 1]``.

  Raises:
    ValueError: if ``dropout_rate`` is outside ``[0, 1]``.
  """

  def __init__(self, dropout_rate):
    if not 0.0 <= dropout_rate <= 1.0:
      raise ValueError(
          "dropout_rate must be in [0, 1], got {!r}".format(dropout_rate))
    self.dropout_rate = dropout_rate
    # Scaled keep-mask of the most recent training forward pass, or None when
    # the most recent forward pass was an inference (identity) pass.
    self.mask = None

  def forward(self, x, is_training=True):
    """Apply dropout to ``x``.

    Args:
      x: input array.
      is_training: when False the input is returned unchanged.

    Returns:
      Array of the same shape as ``x``.
    """
    if not is_training:
      # Forget any mask from an earlier training pass: otherwise a later
      # backward() would mask gradients of what was an identity forward pass.
      self.mask = None
      return x

    keep_prob = 1 - self.dropout_rate
    # With dropout_rate == 1 nothing is kept; use a zero scale instead of
    # dividing by zero (which would turn the whole output into NaN).
    scale = 1.0 / keep_prob if keep_prob > 0 else 0.0
    self.mask = (np.random.rand(*x.shape) < keep_prob) * scale
    return self.mask * x

  def backward(self, output_gradient):
    """Propagate ``output_gradient`` through the most recent forward pass.

    Returns:
      The gradient with respect to the forward input: masked and scaled like
      the forward output after a training pass, unchanged otherwise.
    """
    if self.mask is None:
      return output_gradient
    return self.mask * output_gradient

# Demo: run DropoutLayer on a small seeded input.
# The seed is fixed so the printed arrays are reproducible on a fresh kernel.
np.random.seed(42)
x = np.random.rand(5, 5)
dropout_layer = DropoutLayer(dropout_rate=0.3)

print("Input:", x, sep="\n")

# Training mode: a random keep-mask is drawn and applied to the input.
output_train = dropout_layer.forward(x, is_training=True)
print("\nDuring Training:", output_train, sep="\n")

# A random array stands in for the gradient arriving from later layers.
grad_output = np.random.rand(*x.shape)
print("\nGradient w.r.t. output:\n", grad_output)

# The backward pass reuses the mask drawn by the training forward pass above.
grad_input = dropout_layer.backward(grad_output)
print("\nGradient w.r.t. input:\n", grad_input)

# Inference mode: the input is returned unchanged.
output_inference = dropout_layer.forward(x, is_training=False)
print("\nDuring Inference:", output_inference, sep="\n")

Input:
[[0.37454012 0.95071431 0.73199394 0.59865848 0.15601864]
 [0.15599452 0.05808361 0.86617615 0.60111501 0.70807258]
 [0.02058449 0.96990985 0.83244264 0.21233911 0.18182497]
 [0.18340451 0.30424224 0.52475643 0.43194502 0.29122914]
 [0.61185289 0.13949386 0.29214465 0.36636184 0.45606998]]

During Training:
[[0.         1.35816329 1.04570563 0.85522641 0.22288377]
 [0.22284931 0.08297659 1.23739449 0.         0.        ]
 [0.         1.3855855  1.18920377 0.30334159 0.25974995]
 [0.26200644 0.43463178 0.74965205 0.         0.41604163]
 [0.87407556 0.19927694 0.4173495  0.52337406 0.65152855]]

Gradient w.r.t. output:
 [[0.96958463 0.77513282 0.93949894 0.89482735 0.59789998]
 [0.92187424 0.0884925  0.19598286 0.04522729 0.32533033]
 [0.38867729 0.27134903 0.82873751 0.35675333 0.28093451]
 [0.54269608 0.14092422 0.80219698 0.07455064 0.98688694]
 [0.77224477 0.19871568 0.00552212 0.81546143 0.70685734]]

Gradient w.r.t. input:
 [[0.         1.1073326  1.34214135 1.27832479 0.854

Multi-head attention

In [3]:
class MultiHeadAttentionLayer:
    """Multi-head scaled dot-product attention (forward pass only).

    The projection matrices are sampled once at construction and stored on the
    instance (``W_q``, ``W_k``, ``W_v``, ``W_o``), so repeated forward calls use
    the same parameters.

    Args:
        hid_dim: size of the model (last) dimension; must be divisible by
            ``n_heads``.
        n_heads: number of attention heads.
        dropout: probability of dropping an attention weight, in ``[0, 1]``.

    Raises:
        AssertionError: if ``hid_dim`` is not divisible by ``n_heads``.
        ValueError: if ``dropout`` is outside ``[0, 1]``.
    """

    def __init__(self, hid_dim, n_heads, dropout):
        assert hid_dim % n_heads == 0, "hidden_dim should be divisible by num_heads"
        if not 0.0 <= dropout <= 1.0:
            raise ValueError("dropout must be in [0, 1], got {!r}".format(dropout))

        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dim = hid_dim // n_heads
        self.dropout_rate = dropout

        # Projection weights are created once here. Sampling them inside the
        # projection callables would give every call a different random matrix.
        self.W_q = np.random.randn(self.hid_dim, self.hid_dim)
        self.W_k = np.random.randn(self.hid_dim, self.hid_dim)
        self.W_v = np.random.randn(self.hid_dim, self.hid_dim)
        self.W_o = np.random.randn(self.hid_dim, self.hid_dim)

        # Fully connected layers for query, key, value, and output
        # (kept as callables; they read the stored weights at call time).
        self.fc_q = lambda x: np.dot(x, self.W_q)
        self.fc_k = lambda x: np.dot(x, self.W_k)
        self.fc_v = lambda x: np.dot(x, self.W_v)
        self.fc_o = lambda x: np.dot(x, self.W_o)

        # Dropout layer (callable, applied to the attention weights)
        self.dropout = self._apply_dropout

        self.scale = np.sqrt(self.head_dim)

    def _apply_dropout(self, x):
        """Inverted dropout: zero entries with probability ``dropout_rate`` and
        rescale the survivors by ``1 / (1 - dropout_rate)``."""
        keep_prob = 1.0 - self.dropout_rate
        if keep_prob >= 1.0:
            return x
        # A zero scale avoids dividing by zero when everything is dropped.
        scale = 1.0 / keep_prob if keep_prob > 0 else 0.0
        keep = np.random.rand(*x.shape) < keep_prob
        return x * (keep * scale)

    def forward(self, query, key, value, src_mask=None, is_training=True):
        """Compute multi-head attention.

        Args:
            query: array of shape [batch_size, query_len, hid_dim].
            key: array of shape [batch_size, key_len, hid_dim].
            value: array of shape [batch_size, value_len, hid_dim].
            src_mask: optional array broadcastable to
                [batch_size, n_heads, query_len, key_len]; positions where it
                equals 0 are excluded from attention.
            is_training: when False, dropout on the attention weights is
                skipped so the output is deterministic.

        Returns:
            Tuple ``(x, attention)``: the output of shape
            [batch_size, query_len, hid_dim] and the (pre-dropout) attention
            weights of shape [batch_size, n_heads, query_len, key_len].
        """
        batch_size = query.shape[0]

        # Linear transformations for query, key, and value
        Q = self.fc_q(query)  # [batch_size, query_len, hid_dim]
        K = self.fc_k(key)    # [batch_size, key_len, hid_dim]
        V = self.fc_v(value)  # [batch_size, value_len, hid_dim]

        # Split the model dimension into heads and move the head axis forward
        Q = Q.reshape(batch_size, -1, self.n_heads, self.head_dim).transpose(0, 2, 1, 3)  # [batch_size, n_heads, query_len, head_dim]
        K = K.reshape(batch_size, -1, self.n_heads, self.head_dim).transpose(0, 2, 1, 3)  # [batch_size, n_heads, key_len, head_dim]
        V = V.reshape(batch_size, -1, self.n_heads, self.head_dim).transpose(0, 2, 1, 3)  # [batch_size, n_heads, value_len, head_dim]

        # Scaled dot-product attention energy
        energy = np.matmul(Q, K.transpose(0, 1, 3, 2)) / self.scale  # [batch_size, n_heads, query_len, key_len]

        # Masked positions get a very negative energy so softmax sends them to ~0
        if src_mask is not None:
            energy = np.where(np.asarray(src_mask) == 0, -1e10, energy)

        # Softmax over the key axis; subtracting the row max keeps exp() stable
        attention = np.exp(energy - np.max(energy, axis=-1, keepdims=True))  # [batch_size, n_heads, query_len, key_len]
        attention /= np.sum(attention, axis=-1, keepdims=True)

        # Dropout on the attention weights is a training-time regulariser only
        weights = self.dropout(attention) if is_training else attention
        x = np.matmul(weights, V)  # [batch_size, n_heads, query_len, head_dim]

        # Merge the heads back into the model dimension
        x = x.transpose(0, 2, 1, 3).reshape(batch_size, -1, self.hid_dim)  # [batch_size, query_len, hid_dim]

        # Linear transformation for the final output
        x = self.fc_o(x)  # [batch_size, query_len, hid_dim]

        return x, attention

# Demo: self-attention over a random batch with MultiHeadAttentionLayer.
hid_dim, n_heads, dropout = 64, 8, 0.1
batch_size, sentence_len = 4, 10

# Random "sentence" batch used as query, key and value at once.
sentence = np.random.randn(batch_size, sentence_len, hid_dim)

# Example source mask: each key position is masked out (0) with probability 0.1.
# The singleton axes broadcast over heads and query positions.
src_mask = np.random.choice(
    [0, 1],
    size=(batch_size, 1, 1, sentence_len),
    p=[0.1, 0.9],
)

attention_layer = MultiHeadAttentionLayer(hid_dim, n_heads, dropout)
output, attention = attention_layer.forward(
    sentence, sentence, sentence, src_mask
)

print("Output shape:", output.shape)
print("Attention shape:", attention.shape)


Output shape: (4, 10, 64)
Attention shape: (4, 8, 10, 10)


Dense layer with skip connection

In [4]:
class DenseLayerWithSkip:
    """Affine layer with an identity skip connection: ``y = x @ W + b + x``.

    Because the input is added to the affine output unchanged, the input and
    output widths must be equal.

    Args:
        input_size: size of the last axis of the input.
        output_size: size of the last axis of the output; must equal
            ``input_size``.

    Raises:
        ValueError: if ``input_size != output_size``.
    """

    def __init__(self, input_size, output_size):
        # Without this check a width-1 input would be silently broadcast by
        # ``self.z + x`` and produce a wrong result instead of an error.
        if input_size != output_size:
            raise ValueError(
                "identity skip connection requires input_size == output_size, "
                "got {} and {}".format(input_size, output_size)
            )
        self.weights = np.random.randn(input_size, output_size) * 0.01
        self.bias = np.zeros((1, output_size))
        self.x = None  # input cached by forward() for use in backward()

    def forward(self, x):
        """Return ``x @ W + b + x`` for ``x`` of shape ``(..., input_size)``."""
        self.x = x
        self.z = np.dot(x, self.weights) + self.bias
        return self.z + x

    def backward(self, dz):
        """Back-propagate ``dz`` (gradient w.r.t. the layer output).

        Stores the parameter gradients in ``self.dW`` (same shape as
        ``weights``) and ``self.db`` (shape ``(output_size,)``, which
        broadcasts against ``bias``).

        Returns:
            Gradient w.r.t. the input, same shape as the forward input.

        Raises:
            RuntimeError: if called before ``forward``.
        """
        if self.x is None:
            raise RuntimeError("backward() called before forward()")

        # Collapse any leading axes so inputs of shape (..., features) are
        # handled like a plain 2-D batch; for 2-D input this is a no-op.
        x_2d = self.x.reshape(-1, self.x.shape[-1])
        dz_2d = dz.reshape(-1, dz.shape[-1])

        self.db = np.sum(dz_2d, axis=0)
        self.dW = np.dot(x_2d.T, dz_2d)

        dx_skip = dz                           # identity path
        dx_dense = np.dot(dz, self.weights.T)  # affine path

        dx_input = dx_skip + dx_dense  # Overall input gradient

        return dx_input

# Demo: one forward and one backward pass through DenseLayerWithSkip.
# Input and output widths are equal so the input can be added to the output.
input_size = output_size = 5

layer = DenseLayerWithSkip(input_size, output_size)

# Forward pass on a random batch of 10 rows.
x = np.random.randn(10, input_size)
output = layer.forward(x)
print("Output shape:", output.shape)

# Backward pass with a random stand-in for the upstream gradient.
dz = np.random.randn(10, output_size)
dx_input = layer.backward(dz)
print("Overall input gradient shape:", dx_input.shape)

Output shape: (10, 5)
Overall input gradient shape: (10, 5)


Layer normalization

In [5]:
class LayerNormalization:
    """Layer normalization over the last axis, followed by an affine map.

    ``y = gamma * (x - mean) / sqrt(var + epsilon) + beta`` where ``mean`` and
    ``var`` are computed per position over the last axis of ``x``.

    Args:
        epsilon: small constant added to the variance for numerical stability.
        normalized_shape: optional size of the last (feature) axis. When
            given, ``gamma`` and ``beta`` hold one value per feature; when
            omitted they are single shared scalars of shape ``(1,)``.
    """

    def __init__(self, epsilon=1e-5, normalized_shape=None):
        self.epsilon = epsilon
        param_shape = 1 if normalized_shape is None else normalized_shape
        self.gamma = np.ones(param_shape)   # scale, initialised to ones
        self.beta = np.zeros(param_shape)   # shift, initialised to zeros
        # Values cached by forward() for use in backward()
        self.mean = None
        self.variance = None
        self.normalized_input = None
        # Parameter gradients filled in by backward()
        self.dgamma = None
        self.dbeta = None

    def forward(self, x):
        """Normalize ``x`` over its last axis and apply ``gamma`` / ``beta``.

        Returns:
            Array of the same shape as ``x``.
        """
        self.mean = np.mean(x, axis=-1, keepdims=True)
        self.variance = np.var(x, axis=-1, keepdims=True)
        self.normalized_input = (x - self.mean) / np.sqrt(self.variance + self.epsilon)

        y = self.gamma * self.normalized_input + self.beta
        return y

    @staticmethod
    def _sum_to_shape(grad, shape):
        """Sum ``grad`` over the axes that were broadcast to reach its shape
        from a parameter of shape ``shape``."""
        extra_axes = grad.ndim - len(shape)
        if extra_axes > 0:
            grad = grad.sum(axis=tuple(range(extra_axes)))
        shared = tuple(
            i for i, size in enumerate(shape) if size == 1 and grad.shape[i] != 1
        )
        if shared:
            grad = grad.sum(axis=shared, keepdims=True)
        return grad

    def backward(self, output_gradient):
        """Back-propagate through the most recent forward pass.

        Stores the parameter gradients in ``self.dgamma`` and ``self.dbeta``
        (same shapes as ``gamma`` and ``beta``).

        Args:
            output_gradient: gradient w.r.t. the forward output, same shape as
                the forward input.

        Returns:
            Gradient w.r.t. the forward input.

        Raises:
            RuntimeError: if called before ``forward``.
        """
        if self.normalized_input is None:
            raise RuntimeError("backward() called before forward()")

        x_hat = self.normalized_input
        inv_std = 1.0 / np.sqrt(self.variance + self.epsilon)

        self.dgamma = self._sum_to_shape(output_gradient * x_hat, self.gamma.shape)
        self.dbeta = self._sum_to_shape(output_gradient * np.ones_like(x_hat), self.beta.shape)

        # Gradient w.r.t. the normalized input, then through the normalization:
        # dx = inv_std * (g - mean(g) - x_hat * mean(g * x_hat)), means taken
        # over the normalized (last) axis.
        dx_hat = output_gradient * self.gamma
        mean_dx_hat = np.mean(dx_hat, axis=-1, keepdims=True)
        mean_dx_hat_x_hat = np.mean(dx_hat * x_hat, axis=-1, keepdims=True)
        return inv_std * (dx_hat - mean_dx_hat - x_hat * mean_dx_hat_x_hat)

# Demo: normalize a random (batch, sentence, hidden) tensor with LayerNormalization.
batch_size, sentence_length, hidden_dimension = 32, 10, 64

# Random input tensor
input_tensor = np.random.randn(batch_size, sentence_length, hidden_dimension)

# Layer with default settings
layer_norm = LayerNormalization()

# Forward pass: normalization runs over the last axis, so the shape is unchanged.
output_tensor = layer_norm.forward(input_tensor)

print("Input shape:", input_tensor.shape)
print("Output shape:", output_tensor.shape)

Input shape: (32, 10, 64)
Output shape: (32, 10, 64)
