# In [1]:
import numpy as np

class Tensor:
    """A value that remembers how it was produced so gradients can flow back.

    Attributes:
        data: The wrapped payload (normally a NumPy array); stored as given.
        requires_grad: Whether ``backward`` should accumulate a gradient here.
        grad: ``None`` until ``backward`` reaches this tensor; afterwards a
            ``Tensor`` holding the accumulated gradient.
        creators: The tensors this one was computed from, or ``None``.
        creation_op: Name of the producing operation ("add", "mul", "dot").
        children: Maps ``id(child)`` to the number of gradients that child
            still owes this tensor.

    Results of ``+``, ``*`` and ``dot`` register their inputs as creators, so
    ``backward`` can walk the graph. Code outside the class may instead attach
    a zero-argument callable as ``tensor._backward``; it is invoked once the
    tensor's gradient is ready and is expected to call ``backward`` on the
    inputs it captured.

    Limitation: a tensor forwards its gradient upstream only after every
    registered child has reported. A child that never takes part in the
    backward pass therefore holds back propagation past its creators (their
    own ``grad`` is still accumulated).
    """

    # Makes ``ndarray <op> Tensor`` return NotImplemented on the NumPy side so
    # Python falls through to ``__radd__`` / ``__rmul__`` below instead of
    # NumPy treating the Tensor as an opaque object-array element.
    __array_ufunc__ = None

    def __init__(self, data, requires_grad=False, creators=None, creation_op=None):
        self.data = data
        self.requires_grad = requires_grad
        self.grad = None
        self.creators = creators  # Tensors that led to this tensor being created
        self.creation_op = creation_op  # The operation that created this tensor
        self.children = {}
        self._backward = None  # Optional hook attached by ops defined outside the class
        self._pending = None  # Gradient received but not yet forwarded to the creators

        # Each creator expects one gradient per use of it in this tensor.
        if creators is not None:
            for creator in creators:
                creator.children[id(self)] = creator.children.get(id(self), 0) + 1

    @staticmethod
    def _raw(value):
        """Return ``value.data`` for a Tensor and *value* itself otherwise."""
        return value.data if isinstance(value, Tensor) else value

    @staticmethod
    def _wrap(value):
        """Return *value* as a Tensor, wrapping raw data without gradient tracking."""
        return value if isinstance(value, Tensor) else Tensor(value)

    @staticmethod
    def _from_op(data, creators, creation_op):
        """Build an op result, recording the graph only when a gradient is needed."""
        if any(creator.requires_grad for creator in creators):
            return Tensor(data, requires_grad=True, creators=list(creators),
                          creation_op=creation_op)
        return Tensor(data, requires_grad=False)

    def all_children_grads_accounted_for(self):
        """Return True when no registered child still owes this tensor a gradient."""
        return all(count == 0 for count in self.children.values())

    def backward(self, grad=None, grad_origin=None):
        """Accumulate *grad* into ``self.grad`` and forward it to the creators.

        :param grad: Gradient with respect to this tensor, as a Tensor or raw
            array. ``None`` seeds the pass with ones shaped like ``self.data``.
        :param grad_origin: The child tensor delivering *grad*, or ``None``
            when the call does not come from a registered child (a root call
            or a ``_backward`` hook).

        Does nothing when ``requires_grad`` is false.
        """
        if not self.requires_grad:
            return

        if grad is None:
            incoming = np.ones_like(self.data)
        else:
            incoming = np.asarray(self._raw(grad))

        # Tick off the child that delivered this gradient.
        if grad_origin is not None:
            key = id(grad_origin)
            if self.children.get(key, 0) > 0:
                self.children[key] -= 1

        if self.grad is None:
            self.grad = Tensor(incoming)
        else:
            self.grad = Tensor(self._raw(self.grad) + incoming)

        if self._pending is None:
            self._pending = incoming
        else:
            self._pending = self._pending + incoming

        # Only proceed once every registered child has reported, unless the
        # call did not originate from a registered child.
        if grad_origin is None or self.all_children_grads_accounted_for():
            pending, self._pending = self._pending, None
            self._propagate(pending)

    def _propagate(self, pending):
        """Forward *pending* (gradient not yet passed on) to creators and hook.

        Only the not-yet-forwarded part is sent, so a tensor that propagates
        more than once never hands the same contribution upstream twice.
        """
        creators = self.creators
        if creators is not None:
            if self.creation_op == "add":
                creators[0].backward(Tensor(pending), self)
                creators[1].backward(Tensor(pending), self)
            elif self.creation_op == "mul":
                left, right = creators[0], creators[1]
                if left.requires_grad:
                    left.backward(Tensor(pending * right.data), self)
                if right.requires_grad:
                    right.backward(Tensor(pending * left.data), self)
            elif self.creation_op == "dot":
                left, right = creators[0], creators[1]
                if left.requires_grad:
                    left.backward(
                        Tensor(np.dot(pending, np.transpose(right.data))), self)
                if right.requires_grad:
                    right.backward(
                        Tensor(np.dot(np.transpose(left.data), pending)), self)

        hook = self._backward
        if hook is not None:
            # Hooks take no arguments and read ``self.grad``; show them just
            # the part being forwarded, then restore the accumulated total.
            total = self.grad
            self.grad = Tensor(pending)
            try:
                hook()
            finally:
                self.grad = total

    def __add__(self, other):
        """Element-wise sum; *other* may be a Tensor or raw data."""
        other = self._wrap(other)
        return self._from_op(self.data + other.data, (self, other), "add")

    def __radd__(self, other):
        """Reflected sum so that ``raw + tensor`` yields a Tensor."""
        return self._wrap(other) + self

    def __mul__(self, other):
        """Element-wise product; *other* may be a Tensor or raw data."""
        other = self._wrap(other)
        return self._from_op(self.data * other.data, (self, other), "mul")

    def __rmul__(self, other):
        """Reflected product so that ``raw * tensor`` yields a Tensor."""
        return self._wrap(other) * self

    def dot(self, other):
        """Dot product of the two payloads via ``np.dot``."""
        other = self._wrap(other)
        return self._from_op(np.dot(self.data, other.data), (self, other), "dot")

    def __repr__(self):
        return f"Tensor(data={self.data}, requires_grad={self.requires_grad})"



class Operation:
    """
    Common parent for every operation node of the computational graph.

    Subclasses provide ``forward`` and ``backward``; the base versions only
    raise ``NotImplementedError``. Calling an instance runs ``forward``.
    """

    def __init__(self):
        # Slots a subclass may fill with what it received and what it produced.
        self.inputs = self.output = None
        # Gradients with respect to this operation's inputs.
        self.grads = dict()

    def forward(self, *inputs):
        """
        Compute the operation's output from ``inputs``.

        The base class has no computation of its own and always raises
        ``NotImplementedError``; subclasses override this method.
        """
        message = "Forward pass not implemented."
        raise NotImplementedError(message)

    def backward(self, grad_output):
        """
        Turn the gradient at the output into gradients at the inputs.

        The base class always raises ``NotImplementedError``; subclasses
        override this method.

        :param grad_output: Gradient of the loss with respect to this operation's output.
        :return: Gradients of the loss with respect to this operation's inputs.
        """
        message = "Backward pass not implemented."
        raise NotImplementedError(message)

    def __call__(self, *inputs):
        """
        Let an instance be used like a function: ``op(*inputs)`` forwards the
        arguments to ``forward`` and hands back its result.
        """
        result = self.forward(*inputs)
        return result
    


def relu(tensor):
    """Apply ``max(0, x)`` element-wise and return the result as a new Tensor.

    The result carries ``tensor.requires_grad`` and gets a zero-argument
    closure stored as ``out._backward``. When that closure runs it reads
    ``out.grad``, masks it to the positions where the input was positive, and
    passes the product to ``tensor.backward``.

    :param tensor: Input Tensor; ``tensor.data`` is read as an array.
    :return: Tensor holding the rectified values.
    """
    out = Tensor(np.maximum(0, tensor.data), requires_grad=tensor.requires_grad)

    def backward():
        # ``out.grad`` is a Tensor when the pass was seeded by
        # ``Tensor.backward()`` and may be a raw array otherwise; unwrap it so
        # the product below is plain array arithmetic either way.
        upstream = out.grad.data if isinstance(out.grad, Tensor) else out.grad
        mask = (np.asarray(tensor.data) > 0).astype(float)
        tensor.backward(mask * upstream)

    out._backward = backward
    return out

def sigmoid(tensor):
    """Apply the logistic function ``1 / (1 + exp(-x))`` element-wise.

    The value is computed from ``exp(-|x|)`` so the exponential never
    overflows for large-magnitude inputs. The result carries
    ``tensor.requires_grad`` and gets a zero-argument closure stored as
    ``out._backward``; when run, it multiplies ``out.grad`` by
    ``s * (1 - s)`` and passes the product to ``tensor.backward``.

    :param tensor: Input Tensor; ``tensor.data`` is read as an array.
    :return: Tensor holding the sigmoid of each element.
    """
    values = np.asarray(tensor.data)
    decay = np.exp(-np.abs(values))  # always in (0, 1], so no overflow
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) — same function, safe form.
    out_data = np.where(values >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    out = Tensor(out_data, requires_grad=tensor.requires_grad)

    def backward():
        # ``out.grad`` is a Tensor when the pass was seeded by
        # ``Tensor.backward()`` and may be a raw array otherwise; unwrap it so
        # the product below is plain array arithmetic either way.
        upstream = out.grad.data if isinstance(out.grad, Tensor) else out.grad
        grad = out_data * (1 - out_data) * upstream
        tensor.backward(grad)

    out._backward = backward
    return out

def mse_loss(input_tensor, target):
    """Return the mean squared error between *input_tensor* and *target*.

    Tensor defines neither subtraction nor ``mean``, so the arithmetic is done
    on the underlying arrays and the scalar result is wrapped in a new Tensor.
    Like ``relu`` and ``sigmoid`` in this file, the result gets a
    zero-argument closure stored as ``out._backward``; when run, it reads
    ``out.grad`` and calls ``backward`` on each argument that requires a
    gradient with ``2 * (input - target) / N`` times the upstream gradient
    (negated for *target*).

    :param input_tensor: Tensor of predictions.
    :param target: Tensor or raw array of expected values, broadcastable
        against ``input_tensor.data``.
    :return: Tensor whose ``data`` is the mean of the squared differences.
    """
    target_is_tensor = isinstance(target, Tensor)
    target_data = target.data if target_is_tensor else target
    diff = np.asarray(input_tensor.data) - np.asarray(target_data)
    count = max(diff.size, 1)  # guards the division for empty input

    needs_grad = bool(
        input_tensor.requires_grad or (target_is_tensor and target.requires_grad)
    )
    out = Tensor(np.mean(diff * diff), requires_grad=needs_grad)

    def backward():
        # ``out.grad`` is a Tensor when the pass was seeded by
        # ``Tensor.backward()`` and may be a raw array otherwise.
        upstream = out.grad.data if isinstance(out.grad, Tensor) else out.grad
        grad = (2.0 / count) * diff * upstream
        if input_tensor.requires_grad:
            input_tensor.backward(grad)
        if target_is_tensor and target.requires_grad:
            target.backward(-grad)

    out._backward = backward
    return out