In [None]:
import numpy as np
# The Variable class is complete. You do not need to make any changes here.
class Variable(object):
    """
    Node of the computation graph wrapping a 2D float64 matrix.

    Attributes set by the constructor:
        value:    the data as a 2D np.float64 array (a scalar becomes shape [1,1]).
        fanout:   counter of registered uses; incremented by register_operation
                  and decremented by propagate_gradients.
        gradient: accumulated gradient, 0 until a consumer adds to it.

    The arithmetic operators and exp/log/sum wrap non-Variable operands,
    register the use of every operand and return the matching operation node.
    """
    def __init__(self, matrix):
        self.value = np.array(matrix, dtype=np.float64)
        rank = self.value.ndim
        if rank not in (0, 2):
            raise Exception("Only 2D matrices or scalars supported.")
        if rank == 0:
            # Promote a scalar to a 1x1 matrix so every value is 2D.
            self.value = self.value.reshape([1, 1])

        self.fanout = 0
        self.gradient = 0

    @staticmethod
    def _wrap(operand):
        # Turn raw numbers / nested lists into Variables; pass Variables through.
        return operand if isinstance(operand, Variable) else Variable(operand)

    def __add__(self, other):
        rhs = Variable._wrap(other)
        register_operation(self, rhs)
        return MatrixAddition(self, rhs)

    def __truediv__(self, other):
        rhs = Variable._wrap(other)
        register_operation(self, rhs)
        return MatrixDivision(self, rhs)

    def __mul__(self, other):
        rhs = Variable._wrap(other)
        register_operation(self, rhs)
        return ElementwiseMultiplication(self, rhs)

    def __matmul__(self, other):
        rhs = Variable._wrap(other)
        register_operation(self, rhs)
        return MatrixMultiplication(self, rhs)

    def exp(self):
        register_operation(self)
        return Exp(self)

    def log(self):
        register_operation(self)
        return Log(self)

    def sum(self, axis):
        register_operation(self)
        return Sum(self, axis=axis)

    def reset(self):
        # Clear the backpropagation bookkeeping; the value is left untouched.
        self.gradient, self.fanout = 0, 0

## HELPER FUNCTIONS
def propagate_gradients(*inputs):
    """
    Tell each input that one of its consumers has finished backpropagating.

    Every call decrements the input's fanout by one. When the counter reaches
    exactly 0, every registered consumer has already added its contribution to
    the input's gradient, so the gradient is complete and the input's own
    backward() is invoked to continue the chain.

    Inputs without a backward attribute (plain Variables, i.e. leaves of the
    graph) only have their counter decremented.
    """
    for variable in inputs:
        variable.fanout -= 1
        # hasattr is the direct attribute test; unlike `in dir(...)` it does not
        # build the full attribute list and it also sees dynamic attributes.
        if variable.fanout == 0 and hasattr(variable, "backward"):
            variable.backward()

def register_operation(*inputs):
    """
    Record one more use of every given variable.

    Each argument's fanout counter is increased by one; an argument passed
    several times is counted once per occurrence. propagate_gradients later
    decrements the same counter.
    """
    for operand in inputs:
        operand.fanout += 1


def broadcast_gradients(gradient, variable):
    """
    Reduce a gradient to the shape of a variable that was broadcast.

    In some cases, the variable gets broadcasted during an operation.
    Ex: In adding a [2,2] Matrix with [1,2] Vector, the Vector gets broadcasted.
    Each entry of the broadcast variable then contributed to several entries of
    the result, so its gradient is the sum of the incoming gradient over every
    axis along which it was stretched.

    Args:
        gradient: array-like gradient with the shape of the operation's result.
        variable: object whose `value` attribute is the array the gradient is for.

    Returns:
        `gradient` itself when the shapes already match; otherwise the gradient
        summed (keeping dimensions) over the axes where the variable has size 1
        and the gradient does not.
    """
    # asarray is a no-op for ndarrays and lets an unset (scalar 0) gradient through.
    gradient = np.asarray(gradient)
    target_shape = variable.value.shape
    if gradient.shape == target_shape:
        return gradient

    stretched_axes = tuple(
        axis
        for axis, (grad_dim, var_dim) in enumerate(zip(gradient.shape, target_shape))
        if var_dim == 1 and grad_dim != 1
    )
    # keepdims=True keeps the summed axes as size 1, matching the variable.
    return np.sum(gradient, axis=stretched_axes, keepdims=True)

"""
Replace the 0s in the constructor and backward function with the correct values.
Those classes which are already done for you have a note.

The classes are arranged in increasing order of difficulty.
"""


'\nReplace the 0s in the constructor and backward function with the correct values. \nThose classes which are already done for you have a note. \n\nThe classes are arranged in increasing order of difficulty. \n'

In [None]:
class Log(Variable):
    """
    Elementwise natural logarithm of a Variable.

    Usage:
        v = Variable([[1,2,3]])
        log_v = v.log()
    """
    def __init__(self, v):
        log_values = np.log(v.value)
        super().__init__(log_values)
        self.v = v

    def backward(self):
        # d/dv log(v) = 1/v, applied to the incoming gradient.
        operand = self.v
        operand.gradient += self.gradient / operand.value

        propagate_gradients(operand)

In [None]:
# Forward-pass check for Log on a 1x3 row vector (backward is not exercised here).
v = Variable([[1,2,3]])
log_v = v.log()

In [None]:
log_v.value

array([[0.        , 0.69314718, 1.09861229]])

In [None]:
import math
math.log(3)

1.0986122886681098

In [None]:
class Exp(Variable):
    """
    Elementwise exponential of a Variable.

    Usage:
        v = Variable([[1,2,3]])
        exp_v = v.exp()
    """
    def __init__(self, v):
        super().__init__(np.exp(v.value))  # e^v
        self.v = v

    def backward(self):
        # d/dv e^v = e^v, which is exactly the forward result stored in
        # self.value, so it is reused instead of calling np.exp a second time
        # (same approach as Sigmoid and Tanh).
        self.v.gradient += self.gradient * self.value

        propagate_gradients(self.v)

In [None]:
# Forward-pass check for Exp: compare against math.exp on the same inputs.
v = Variable([[1,2,3]])
exp_v = v.exp()
print(exp_v.value)
print(math.exp(1), math.exp(2), math.exp(3))

[[ 2.71828183  7.3890561  20.08553692]]
2.718281828459045 7.38905609893065 20.085536923187668


In [None]:
class Sigmoid(Variable):
    """
    Elementwise logistic sigmoid, sig(x) = 1 / (1 + e^-x).

    Usage:
        v1 = Variable([[1,2],
                       [3,4]])
        v1_act = Sigmoid(v1)
    """
    def __init__(self, v):
        negative_exp = np.exp(-v.value)
        super().__init__(1 / (1 + negative_exp))
        self.v = v
        # Constructed directly rather than through a Variable method, so the
        # use of v is counted here.
        self.v.fanout += 1

    def backward(self):
        # d/dx sig(x) = sig(x) * (1 - sig(x)); self.value already holds sig(x).
        output = self.value
        self.v.gradient += self.gradient * output * (1 - output)

        propagate_gradients(self.v)

In [None]:
# Forward-pass check for Sigmoid against the scalar reference test_sigmoid.
# NOTE: test_sigmoid is defined in a later cell; that cell must be run first.
v1 = Variable([[1,2],
                [3,4]])
v1_act = Sigmoid(v1)
print(v1_act.value)
print(test_sigmoid(1), test_sigmoid(2), test_sigmoid(3), test_sigmoid(4))

[[0.73105858 0.88079708]
 [0.95257413 0.98201379]]
0.7310585786300049 0.8807970779778823 0.9525741268224334 0.9820137900379085


In [None]:
import math

def test_sigmoid(x):
  return 1 / (1 + math.exp(-x))

In [None]:
class ReLU(Variable):
    """
    Elementwise rectified linear unit, relu(x) = max(0, x).

    Usage:
        v1 = Variable([[1,2],
                       [3,4]])
        v1_act = ReLU(v1)
    """
    def __init__(self, v):
        super().__init__(np.maximum(0, v.value))
        self.v = v
        # Constructed directly rather than through a Variable method, so the
        # use of v is counted here.
        self.v.fanout += 1

    def backward(self):
        # The derivative is 1 where the output is positive and 0 elsewhere,
        # so the boolean mask gates the incoming gradient.
        active = self.value > 0
        self.v.gradient += self.gradient * active

        propagate_gradients(self.v)

In [None]:
# Forward-pass check for ReLU with negative, zero and positive entries.
v1 = Variable([[-1,7],
                [0,-3]])
v1_act = ReLU(v1)
v1_act.value

array([[0., 7.],
       [0., 0.]])

In [None]:
class Tanh(Variable):
    """
    Elementwise hyperbolic tangent.

    Usage:
        v1 = Variable([[1,2],
                       [3,4]])
        v1_act = Tanh(v1)
    """
    def __init__(self, v):
        # np.tanh is used instead of (e^x - e^-x) / (e^x + e^-x): the explicit
        # formula overflows to inf / inf = nan once |x| is large enough for
        # np.exp to overflow, whereas np.tanh saturates cleanly at +/-1.
        super().__init__(np.tanh(v.value))
        self.v = v
        # Constructed directly rather than through a Variable method, so the
        # use of v is counted here.
        self.v.fanout += 1

    def backward(self):
        # d/dx tanh(x) = 1 - tanh(x)^2; self.value already holds tanh(x).
        self.v.gradient += self.gradient * (1 - self.value**2)

        propagate_gradients(self.v)

In [None]:
# Forward-pass check for Tanh on a 2x2 matrix.
v1 = Variable([[1,2],
                [3,4]])
v1_act = Tanh(v1)
v1_act.value

array([[0.76159416, 0.96402758],
       [0.99505475, 0.9993293 ]])

In [None]:
v1_act.value

array([[0.76159416, 0.96402758],
       [0.99505475, 0.9993293 ]])

In [None]:
np.tanh(4)

0.999329299739067

In [None]:
# Medium Difficulty.
class MatrixMultiplication(Variable):
    """
    Matrix product of two Variables.

    Usage:
        v1 = Variable([[1,2],
                       [3,4]])
        v2 = Variable([[6],
                       [7]])
        v1_v2 = v1 @ v2
    """
    def __init__(self, v1, v2):
        product = np.matmul(v1.value, v2.value)
        super().__init__(product)
        self.v1 = v1
        self.v2 = v2

    def backward(self):
        # For A = B @ C with incoming gradient G (same shape as A):
        #   gradient for B is G @ C.T, gradient for C is B.T @ G.
        left, right = self.v1, self.v2
        grad_left = np.matmul(self.gradient, right.value.T)
        grad_right = np.matmul(left.value.T, self.gradient)
        left.gradient += grad_left
        right.gradient += grad_right

        propagate_gradients(left, right)

In [None]:
# Forward-pass check for MatrixMultiplication: v2 is the 2x2 matrix that swaps
# the two columns of v1.
v1 = Variable([[1,2],
                [3,4]])
v2 = Variable([[0, 1],
             [1, 0]])
v1_v2 = v1 @ v2

In [None]:
v1_v2.value

array([[2., 1.],
       [4., 3.]])

In [None]:
class MatrixAddition(Variable):
    """
    Elementwise sum of two Variables; NumPy broadcasting applies to the values.

    Usage:
        v1 = Variable([[1,2],
                       [3,4]])
        v2 = Variable([[6],
                       [7]])
        v1_v2 = v1 + v2
    This has been done for you.
    """
    def __init__(self, v1, v2):
        total = v1.value + v2.value
        super().__init__(total)

        self.v1 = v1
        self.v2 = v2

    def backward(self):
        # Addition passes the incoming gradient through unchanged; each operand
        # receives it after broadcast_gradients adapts it to that operand.
        lhs, rhs = self.v1, self.v2
        lhs.gradient += broadcast_gradients(self.gradient, lhs)
        rhs.gradient += broadcast_gradients(self.gradient, rhs)

        propagate_gradients(lhs, rhs)

In [None]:
# Forward-pass check for MatrixAddition with broadcasting: v2 is a 2x1 column
# that NumPy stretches across the two columns of v1.
v1 = Variable([[1,2],
                [3,4]])
v2 = Variable([[6],
                [7]])
v1_v2 = v1 + v2

In [None]:
v1_v2.value

array([[ 7.,  8.],
       [10., 11.]])

In [None]:
class ElementwiseMultiplication(Variable):
    """
    Elementwise (Hadamard) product of two Variables.

    Usage:
        v1 = Variable([[1,2,3]])
        v2 = Variable([[4,5,6]])
        v1_v2 = v1 * v2
    """
    def __init__(self, v1, v2):
        product = v1.value * v2.value
        super().__init__(product)
        self.v1 = v1
        self.v2 = v2

    def backward(self):
        # Each operand's local derivative is the value of the other operand.
        lhs, rhs = self.v1, self.v2
        grad_lhs = self.gradient * rhs.value
        grad_rhs = self.gradient * lhs.value
        lhs.gradient += broadcast_gradients(grad_lhs, lhs)
        rhs.gradient += broadcast_gradients(grad_rhs, rhs)

        propagate_gradients(lhs, rhs)

In [None]:
# Forward-pass check for ElementwiseMultiplication on two 1x3 row vectors.
v1 = Variable([[1,2,3]])
v2 = Variable([[0,3,1]])
v1_v2 = v1 * v2
v1_v2.value

array([[0., 6., 3.]])

In [None]:
class MatrixDivision(Variable):
    """
    Elementwise quotient of two Variables.

    Usage:
        v1 = Variable([[2,4,6]])
        v2 = Variable([[2]])
        v1_v2 = v1 / v2
    """
    def __init__(self, v1, v2):
        quotient = v1.value / v2.value
        super().__init__(quotient)
        self.v1 = v1
        self.v2 = v2

    def backward(self):
        # For A = B / C: dA/dB = 1 / C and dA/dC = -B / C^2.
        numerator, denominator = self.v1, self.v2
        grad_numerator = self.gradient / denominator.value
        grad_denominator = self.gradient * (-numerator.value / (denominator.value**2))
        numerator.gradient += broadcast_gradients(grad_numerator, numerator)
        denominator.gradient += broadcast_gradients(grad_denominator, denominator)

        propagate_gradients(numerator, denominator)

In [None]:
# Forward-pass check for MatrixDivision with broadcasting: the 1x1 divisor is
# applied to every entry of the 1x3 numerator.
v1 = Variable([[2,4,6]])
v2 = Variable([[2]])
v1_v2 = v1 / v2
v1_v2.value

array([[1., 2., 3.]])

Explanation:

The function takes two parameters: gradient and variable. gradient is the incoming gradient from the subsequent operation, and variable is the variable with respect to which the gradient is being calculated.

The function checks if the shape of the gradient matches the shape of the variable.value. If they match, it means no broadcasting is needed, and the original gradient is returned.

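If the shapes do not match, the variable was broadcast (stretched) during the forward pass, so its gradient has to be reduced back to the variable's shape. The axis_to_sum list is meant to hold the axes along which this reduction happens: the axes where the variable has size 1 but the gradient does not, because along those axes a single entry of the variable was reused for several entries of the result.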

np.sum(gradient, axis=tuple(axis_to_sum), keepdims=True) performs the reduction. It sums the gradient along the specified axes, and keepdims=True keeps each summed axis as a size-1 dimension, so the result stays two-dimensional and can match the shape of the variable.

The result is the gradient reduced to the variable's shape, which can be added to the variable's accumulated gradient during backpropagation. This ensures that each gradient has the same dimensions as the variable it belongs to. For example, a [2,2] gradient for a [1,2] variable is summed over axis 0, giving a [1,2] gradient.

In [None]:
def broadcast_gradients(gradient, variable):
    """
    Reduce a gradient to the shape of a variable that was broadcast.

    In some cases, the variable gets broadcasted during an operation.
    Ex: In adding a [2,2] Matrix with [1,2] Vector, the Vector gets broadcasted.
    Each entry of the broadcast variable then contributed to several entries of
    the result, so its gradient is the sum of the incoming gradient over every
    axis along which it was stretched.

    Args:
        gradient: array-like gradient with the shape of the operation's result.
        variable: object whose `value` attribute is the array the gradient is for.

    Returns:
        `gradient` itself when the shapes already match; otherwise the gradient
        summed (keeping dimensions) over the axes where the variable has size 1
        and the gradient does not.
    """
    # asarray is a no-op for ndarrays and lets an unset (scalar 0) gradient through.
    gradient = np.asarray(gradient)
    target_shape = variable.value.shape
    if gradient.shape == target_shape:
        return gradient

    # The axes to reduce are decided by the VARIABLE's shape: it was stretched
    # exactly where it has size 1 and the result (hence the gradient) does not.
    stretched_axes = tuple(
        axis
        for axis, (grad_dim, var_dim) in enumerate(zip(gradient.shape, target_shape))
        if var_dim == 1 and grad_dim != 1
    )
    # keepdims=True keeps the summed axes as size 1, matching the variable.
    return np.sum(gradient, axis=stretched_axes, keepdims=True)
