In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

# 1. ReLU Grad

In [2]:
class ReluGrad(nn.Module):
    """Backward pass of ReLU, packaged as a module."""

    def __init__(self):
        super().__init__()

    def forward(self, dY, X):
        """
        Route an upstream gradient back through ReLU.

        Args:
            dY (Tensor): Gradient flowing back from the ReLU output.
            X (Tensor): The tensor that was fed into ReLU in the forward pass.

        Returns:
            Tensor: dY wherever X is strictly positive, zero everywhere else.
        """
        # ReLU has slope 1 where X > 0 and slope 0 otherwise
        passes_gradient = X > 0
        blocked = torch.zeros_like(dY)
        return torch.where(passes_gradient, dY, blocked)

In [3]:
# Demo: random upstream gradient and random ReLU input
dY = torch.randn(2, 3)  # stands in for the gradient w.r.t. the ReLU output
X = torch.randn(2, 3)   # stands in for the tensor that went into ReLU

# Push the gradient back through ReLU
relu_grad = ReluGrad()
dX = relu_grad(dY, X)

# Show the inputs next to the result
for label, tensor in (("dY:", dY), ("X:", X), ("dX:", dX)):
    print(label, tensor)

dY: tensor([[-1.5239, -0.5434, -1.0209],
        [-0.8136, -1.0139,  0.9791]])
X: tensor([[-0.5237,  2.0639, -0.1120],
        [-0.1539,  0.6541, -0.0548]])
dX: tensor([[ 0.0000, -0.5434,  0.0000],
        [ 0.0000, -1.0139,  0.0000]])


# 2. QuickGELU Grad

In [4]:
class QuickGELU(nn.Module):
    """QuickGELU activation: x * sigmoid(1.702 * x)."""

    def forward(self, x):
        # The sigmoid acts as a smooth gate on the input
        gate = torch.sigmoid(1.702 * x)
        return x * gate

### Torch gradient

In [5]:
# Autograd reference: derivative of QuickGELU at five evenly spaced points in [-5, 5]
x = torch.linspace(-5, 5, 5, requires_grad=True)
act = QuickGELU()
out = act(x)
# out has five elements, so backward() needs an explicit upstream gradient;
# all ones yields the elementwise derivative d out_i / d x_i.
out.backward(torch.ones_like(x))
x_grad = x.grad
# Round with torch instead of np.round: the value is a torch.Tensor, and this
# avoids depending on NumPy forwarding the call to a Tensor method.
print(torch.round(x_grad, decimals=3))

tensor([-0.0020, -0.0450,  0.5000,  1.0450,  1.0020])


### Manual gradient

In [6]:
def sigmoid(x):
    """Logistic function 1 / (1 + e^(-x)), evaluated with NumPy."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def quickgelu(x):
    """QuickGELU activation: the input gated by sigmoid(1.702 * x)."""
    gate = sigmoid(1.702*x)
    return x * gate

def quickgelu_grad(x):
    """Analytic derivative of QuickGELU, x * s(1.702 x), with respect to x."""
    gate = sigmoid(1.702*x)
    # Product rule: d/dx [x * s] = s + x * s', where s' = 1.702 * s * (1 - s)
    slope_term = x * gate * (1 - gate)*1.702
    return gate + slope_term

# Evaluate the hand-derived gradient at five evenly spaced points in [-5, 5]
x = np.linspace(-5, 5, num=5)
grad = quickgelu_grad(x)
print("Gradient of QuickGELU function:\n", np.round(grad, decimals=3))

Gradient of QuickGELU function:
 [-0.002 -0.045  0.5    1.045  1.002]


# 3. Softmax Grad

In [7]:
class Softmax(nn.Module):
    """Numerically stable softmax.

    Args:
        dim (int, optional): Dimension to normalise over. With the default
            ``None`` the whole tensor is treated as a single distribution
            (max and sum are taken over every element), which suits a 1-D
            input. Pass e.g. ``dim=-1`` for batched inputs so that every
            slice along that dimension sums to 1 on its own.
    """

    def __init__(self, dim=None):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        if self.dim is None:
            # Global reduction; subtracting the max keeps exp() from overflowing
            shiftx = x - torch.max(x)
            exps = torch.exp(shiftx)
            return exps / torch.sum(exps)

        # Reduction along self.dim; keepdim=True lets the statistics broadcast back onto x
        shiftx = x - torch.max(x, dim=self.dim, keepdim=True).values
        exps = torch.exp(shiftx)
        return exps / torch.sum(exps, dim=self.dim, keepdim=True)

### Torch gradient

In [8]:
def softmax(z):
  """Row-wise softmax of a 2-D tensor (normalises along dim 1).

  The per-row maximum is subtracted before exponentiating. Mathematically the
  result is unchanged, but exp() can no longer overflow to inf (which would
  turn the ratio into nan) for large logits.
  """
  shifted = z - z.max(dim=1, keepdim=True).values
  exps = shifted.exp()  # computed once, reused for numerator and denominator
  return exps / exps.sum(dim=1, keepdim=True)

In [9]:
# Single-sample logits, tracked by autograd so gradients can be read from z.grad
z = torch.tensor([[4.0, 2.0]]).requires_grad_()

In [10]:
# Start from a clean slate: backward() accumulates into z.grad, so a gradient
# left over from a previous run of this cell would corrupt the first row.
z.grad = None
torch_sm = F.softmax(z, dim=1)

# Row 1 of the Jacobian: back-propagate the one-hot vector [[1., 0.]].
# retain_graph=True because backward() is called again below for row 2.
torch_sm.backward(torch.tensor([[1., 0.]]), retain_graph=True)
r1 = z.grad.clone()  # copy, so r1 does not alias whatever z.grad becomes next
z.grad = None

# Row 2 of the Jacobian: back-propagate the one-hot vector [[0., 1.]].
torch_sm.backward(torch.tensor([[0., 1.]]))
r2 = z.grad.clone()
torch_sm_p = torch.cat((r1, r2))

print(torch_sm_p)

tensor([[ 0.1050, -0.1050],
        [-0.1050,  0.1050]])


### Manual gradient

In [11]:
def softmax_grad(z):
  """Jacobian of softmax for a single sample, from the closed-form expression.

  Args:
    z (Tensor): Logits of shape (1, n), i.e. a single row, as accepted by
      ``softmax``.

  Returns:
    Tensor: (n, n) matrix J with J[i, j] = d softmax_i / d z_j. The result is
    detached from the autograd graph.

  Raises:
    ValueError: If z holds more than one row.
  """
  # squeeze(0) rather than squeeze(): with a single class (n == 1) the vector
  # must stay 1-D instead of collapsing to a 0-d scalar.
  sm = softmax(z).detach().squeeze(0)
  if sm.dim() != 1:
    raise ValueError("softmax_grad expects a single row of logits, shape (1, n)")
  # i == j: s_i * (1 - s_i)   (a neuron w.r.t. itself)
  # i != j: -s_i * s_j        (a neuron w.r.t. another neuron)
  # Both cases at once: diag(s) - s s^T
  return torch.diag(sm) - torch.outer(sm, sm)

In [12]:
sm_p = softmax_grad(z)
print(sm_p)

# Check the hand-derived Jacobian against the autograd one programmatically
# instead of comparing the two printouts by eye.
assert torch.allclose(sm_p, torch_sm_p, atol=1e-6)

tensor([[ 0.1050, -0.1050],
        [-0.1050,  0.1050]])


# 4. RMS Norm (Simplified Layer Norm) Grad

In [13]:
# Fixed seed so the random input is reproducible
torch.manual_seed(0)
x = torch.randn(2, 2, 3, dtype=torch.float32).requires_grad_()

In [14]:
def rms_norm(x, eps=1e-6):
    """RMS-normalise x along its last dimension.

    Args:
        x (Tensor): Input; statistics are taken over the last dimension.
        eps (float): Added to the mean square before the reciprocal square
            root so that an all-zero row does not divide by zero.

    Returns:
        tuple: (y, norm_factor, mean_sq) where
            mean_sq     = mean(x**2) over the last dim (kept as size 1),
            norm_factor = 1 / sqrt(mean_sq + eps),
            y           = x * norm_factor.
    """
    mean_sq = x.pow(2).mean(dim=-1, keepdim=True)
    norm_factor = torch.rsqrt(mean_sq + eps)

    # Normalize the input
    y = x * norm_factor
    return y, norm_factor, mean_sq

### Torch gradient

In [15]:
# backward() accumulates into x.grad, so clear anything left from a previous
# run of this cell; otherwise re-running would silently double the gradient.
x.grad = None
y, norm_factor, mean_sq = rms_norm(x)
y.sum().backward()  # Backward pass to compute gradients
autograd_grad = x.grad.clone()  # snapshot that later backward() calls cannot change
print(autograd_grad)

tensor([[[0.7736, 0.6207, 0.4635],
         [1.2253, 0.3718, 0.2097]],

        [[1.2487, 1.0069, 1.8734],
         [0.9376, 0.2695, 2.9609]]])


### Manual gradient

In [16]:
def rms_norm_grad(x, dy=None):
    """Hand-derived gradient of RMS norm with respect to its input.

    With r = norm_factor = (mean(x**2) + eps) ** -0.5 and y = x * r, the
    chain rule gives, with means taken over the last dimension,

        dx_i = dy_i * r  -  x_i * r**3 * mean_j(dy_j * x_j)

    Args:
        x (Tensor): Input that is passed to ``rms_norm``.
        dy (Tensor, optional): Upstream gradient dL/dy, same shape as x.
            Defaults to all ones, i.e. the gradient of ``y.sum()``.

    Returns:
        Tensor: dL/dx, same shape as x.
    """
    # Forward pass; only r is needed because it already contains mean_sq + eps
    _, norm_factor, _ = rms_norm(x)

    if dy is None:
        dy = torch.ones_like(x)

    # Path through r: d r / d x_i = -r**3 * x_i / n. r is shared by every
    # element of the row, so the upstream gradients enter as sum_j(dy_j * x_j);
    # that sum and the 1/n combine into a mean.
    grad_via_norm = -norm_factor.pow(3) * (dy * x).mean(dim=-1, keepdim=True)

    # Direct path (dy * r) plus the path through r
    grad_x = dy * norm_factor + x * grad_via_norm

    return grad_x

In [17]:
manual_grad = rms_norm_grad(x)
print(manual_grad)

# Fail loudly if the hand-derived gradient disagrees with the autograd result
assert torch.allclose(manual_grad, autograd_grad, atol=1e-6)

tensor([[[0.7736, 0.6207, 0.4635],
         [1.2253, 0.3718, 0.2097]],

        [[1.2487, 1.0069, 1.8734],
         [0.9376, 0.2695, 2.9609]]], grad_fn=<AddBackward0>)


# 5. Layer Norm Grad

In [18]:
# Reproducible toy batch of shape (2, 3, d)
torch.manual_seed(0)
d = 4
x = torch.randn((2, 3, d), requires_grad=True)
dout = torch.rand((2, 3, d))  # weights applied to the output when building the scalar loss

In [19]:
# Reference implementation: PyTorch's built-in LayerNorm, constructed with its
# default arguments (only the normalized size d is passed), applied to x.
layer_norm = nn.LayerNorm(d)
y = layer_norm(x)

### Torch gradient

In [20]:
# Scalar surrogate loss: weighting y by dout makes d(loss)/dy equal to dout
fakeloss = torch.sum(y * dout)
fakeloss.backward()
print(x.grad)


 x.grad: tensor([[[ 0.4349, -0.2657,  0.6004, -0.7697],
         [-0.1073, -0.0484,  0.2523, -0.0966],
         [-0.6142, -0.1115,  0.0653,  0.6604]],

        [[ 0.2990, -0.0633,  0.6517, -0.8875],
         [ 0.4394, -0.2104, -0.0678, -0.1612],
         [ 0.1001,  0.1124, -0.3442,  0.1317]]])


### Manual gradient

In [21]:
# Same epsilon as the nn.LayerNorm default (1e-5), so this manual version is
# numerically comparable with the PyTorch reference it is checked against.
eps = 1e-5

class LayerNorm:
    """Layer normalisation over the last dimension with a hand-written backward pass."""

    @staticmethod
    def forward(x, w, b):
        """Normalise x over its last dimension, then scale by w and shift by b.

        Args:
            x (Tensor): Input activations, shape (..., C), e.g. (B, T, C).
            w (Tensor): Weights (scale), shape (C,).
            b (Tensor): Biases (shift), shape (C,).

        Returns:
            tuple: (out, cache). out has the shape of x; cache is
            (x, w, mean, rstd), the values ``backward`` needs.
        """
        C = x.size(-1)
        # calculate the mean
        mean = x.sum(-1, keepdim=True) / C # ...,1
        # calculate the (biased) variance
        xshift = x - mean # ...,C
        var = (xshift**2).sum(-1, keepdim=True) / C # ...,1
        # calculate the inverse standard deviation: **0.5 is sqrt, **-0.5 is 1/sqrt
        rstd = (var + eps) ** -0.5 # ...,1
        # normalize the input activations
        norm = xshift * rstd # ...,C
        # scale and shift the normalized activations at the end
        out = norm * w + b # ...,C

        # return the output and the cache, of variables needed later during the backward pass
        cache = (x, w, mean, rstd)
        return out, cache

    @staticmethod
    def backward(dout, cache):
        """Gradients of the layer norm given the upstream gradient dout.

        Args:
            dout (Tensor): Gradient w.r.t. the output, same shape as x.
            cache (tuple): The (x, w, mean, rstd) tuple returned by ``forward``.

        Returns:
            tuple: (dx, dw, db) with dx shaped like x and dw, db shaped (C,).
        """
        x, w, mean, rstd = cache
        C = dout.size(-1)
        # recompute the norm (save memory at the cost of compute)
        norm = (x - mean) * rstd
        # gradients for weights, bias: w and b are shared by every position,
        # so sum over all leading dimensions (flattened into one)
        db = dout.reshape(-1, C).sum(0)
        dw = (dout * norm).reshape(-1, C).sum(0)
        # gradients for input
        dnorm = dout * w
        # the two subtracted terms account for the dependence of mean and rstd on x
        dx = dnorm - dnorm.mean(-1, keepdim=True) - norm * (dnorm * norm).mean(-1, keepdim=True)
        dx *= rstd
        return dx, dw, db

In [22]:
# Rebuild the toy inputs from the same seed and run the manual forward pass
torch.manual_seed(0)
B = 2 # some toy numbers here
T = 3
C = 4
x = torch.randn(B, T, C, requires_grad=True)
# Size the affine parameters from C rather than repeating the literal 4,
# so changing C above cannot leave w and b with a mismatched shape.
w = torch.ones(C, requires_grad=True)
b = torch.zeros(C, requires_grad=True)
dout = torch.rand(B, T, C)
out, cache = LayerNorm.forward(x, w, b)

In [23]:
# Manual backward pass: dx is the gradient for the input x; dw and db are the
# gradients for the weights w and biases b. Only dx is printed.
dx, dw, db = LayerNorm.backward(dout, cache)
print(dx)

tensor([[[ 0.4349, -0.2657,  0.6004, -0.7697],
         [-0.1073, -0.0484,  0.2523, -0.0966],
         [-0.6142, -0.1115,  0.0653,  0.6604]],

        [[ 0.2990, -0.0633,  0.6518, -0.8875],
         [ 0.4394, -0.2104, -0.0678, -0.1612],
         [ 0.1001,  0.1124, -0.3442,  0.1317]]], grad_fn=<MulBackward0>)


# 6. SoftmaxCrossEntropyLoss Grad

In [24]:
# Toy batch: 3 samples x 5 classes of raw scores, tracked by autograd
logits = torch.tensor(
    [
        [2.0, 1.0, 0.1, 0.5, 1.5],   # Sample 1
        [0.2, 2.1, 0.4, 1.0, -0.5],  # Sample 2
        [1.0, 0.0, 2.5, 0.5, 1.2],   # Sample 3
    ]
).requires_grad_()

# True class index for each of the 3 samples
class_indices = torch.tensor([2, 0, 3])

In [25]:
# Row-wise softmax with the max-subtraction trick
def softmax(x):
    """Softmax along dim 1; each row's maximum is subtracted before exp()."""
    row_max, _ = torch.max(x, dim=1, keepdim=True)
    exps = torch.exp(x - row_max)
    normaliser = exps.sum(dim=1, keepdim=True)
    return exps / normaliser

# Forward pass: turn the logits into per-row class probabilities.
# logits was created with requires_grad=True, so this result stays connected
# to it in the autograd graph for the backward() call on the loss.
probabilities = softmax(logits)

# Compute the cross-entropy loss manually
def cross_entropy_loss(probabilities, class_indices):
    """Mean negative log-likelihood of the true classes.

    Args:
        probabilities (Tensor): Shape (N, num_classes); one distribution per row.
        class_indices (Tensor): Shape (N,); index of the true class for each row.

    Returns:
        Tensor: Scalar loss, -mean_i(log probabilities[i, class_indices[i]]).
    """
    log_probs = torch.log(probabilities)
    # Take the batch size from the argument itself rather than from the
    # notebook-level `logits` variable, so the function works for any batch.
    num_samples = probabilities.size(0)
    loss = -log_probs[range(num_samples), class_indices].mean()
    return loss

### Torch gradient

In [26]:
# Autograd reference: back-propagate the scalar loss and read d(loss)/d(logits)
# from logits.grad (logits is the tensor created with requires_grad=True).
loss = cross_entropy_loss(probabilities, class_indices)
loss.backward()
print(logits.grad)

tensor([[ 0.1420,  0.0522, -0.3121,  0.0317,  0.0861],
        [-0.3047,  0.1916,  0.0350,  0.0638,  0.0142],
        [ 0.0434,  0.0160,  0.1946, -0.3070,  0.0530]])


### Manual gradient

In [27]:
def compute_gradients(logits, probabilities, class_indices):
    """
    Closed-form gradient of the mean softmax cross-entropy loss w.r.t. the logits.

    Only the batch size is read from `logits`; the values come from
    `probabilities` and `class_indices`.
    """
    batch_size = logits.size(0)

    # One-hot targets: a 1 at each sample's true class, 0 everywhere else
    targets = torch.zeros_like(probabilities)
    targets[range(batch_size), class_indices] = 1

    # (p - y), divided by the batch size to match the .mean() in the loss
    return (probabilities - targets) / batch_size

In [28]:
manual_grad_logits = compute_gradients(logits, probabilities, class_indices)
print(manual_grad_logits)

# Verify the closed-form gradient against the autograd result held in logits.grad
assert torch.allclose(manual_grad_logits, logits.grad, atol=1e-6)

tensor([[ 0.1420,  0.0522, -0.3121,  0.0317,  0.0861],
        [-0.3047,  0.1916,  0.0350,  0.0638,  0.0142],
        [ 0.0434,  0.0160,  0.1946, -0.3070,  0.0530]], grad_fn=<DivBackward0>)
