In [617]:
class ActivationFunctions:
    @staticmethod
    def tanh(x):
        result = []
        for row in x.data:
            tanh_row = [math.tanh(_x) for _x in row]
            result.append(tanh_row)
        output = Tensor(result)
        output._prev = [x]

        def _backward():
            if x.grad is None:
                x.grad = [[0 for _ in range(len(x.data[0]))] for _ in range(len(x.data))]
            for i in range(len(x.data)):
                for j in range(len(x.data[0])):
                    x.grad[i][j] += (1 - math.tanh(x.data[i][j])**2) * output.grad[i][j]

        output._backward = _backward
        return output
    
    @staticmethod
    def relu(x):
        result = []
        for row in x.data:
            relu_row = [max(0, _x) for _x in row]
            result.append(relu_row)
        output = Tensor(result)
        output._prev = [x]

        def _backward():
            if x.grad is None:
                x.grad = [[0 for _ in range(len(x.data[0]))] for _ in range(len(x.data))]
            for i in range(len(x.data)):
                for j in range(len(x.data[0])):
                    x.grad[i][j] += (1 if x.data[i][j] > 0 else 0) * output.grad[i][j]

        output._backward = _backward
        return output
    
    @staticmethod
    def sigmoid(x):
        result = []
        for row in x.data:
            sigmoid_row = [1 / (1 + math.exp(-_x)) for _x in row]
            result.append(sigmoid_row)
        output = Tensor(result)
        output._prev = [x]

        def _backward():
            if x.grad is None:
                x.grad = [[0 for _ in range(len(x.data[0]))] for _ in range(len(x.data))]
            for i in range(len(x.data)):
                for j in range(len(x.data[0])):
                    sig = 1 / (1 + math.exp(-x.data[i][j]))
                    x.grad[i][j] += sig * (1 - sig) * output.grad[i][j]

        output._backward = _backward
        return output
        
    @staticmethod
    def softmax(x):
        max_val = [[max(row)] for row in x.data]
        exps = [[math.exp(i - max_row[0]) for i in row] for row, max_row in zip(x.data, max_val)]
        
        sums = [[sum(row)] for row in exps]
        softmax_result = [[exps[i][j] / sums[i][0] for j in range(len(exps[i]))] for i in range(len(exps))]
        
        result = Tensor(softmax_result)
        
        result._prev = [x]
        
        def _backward():
            if x.grad is None:
                x.grad = [[0 for _ in range(len(x.data[0]))] for _ in range(len(x.data))]

            for i in range(len(result.data)):
                for j in range(len(result.data[0])):
                    for k in range(len(result.data[0])):
                        grad_val = result.data[i][j] * ((1 if j == k else 0) - result.data[i][k])
                        x.grad[i][k] += result.grad[i][j] * grad_val
        
        result._backward = _backward
        return result


activation_functions = {
    "tanh": ActivationFunctions.tanh,
    "relu": ActivationFunctions.relu,
    "sigmoid": ActivationFunctions.sigmoid,
    "softmax": ActivationFunctions.softmax,
    
}


In [618]:
class ErrorFunction:
    @staticmethod
    def mse(output, target):
        squared_diffs = [[(o - t) ** 2 for o, t in zip(out_row, target_row)] 
                          for out_row, target_row in zip(output.data, target.data)]
        
        loss_value = sum(sum(row) for row in squared_diffs) / (len(output.data) * len(output.data[0]))

        loss_tensor = Tensor([[loss_value]])
        loss_tensor._prev = [output] 

        def _backward():
            if output.grad is None:
                output.grad = [[0.0 for _ in range(len(output.data[0]))] for _ in range(len(output.data))]
            for i in range(len(output.data)):
                for j in range(len(output.data[0])):
                    output.grad[i][j] += (2 * (output.data[i][j] - target.data[i][j])) / (len(output.data) * len(output.data[0]))

        loss_tensor._backward = _backward  

        return loss_tensor

err_functions = {
    "mse": ErrorFunction.mse,
}


In [619]:
class Tensor:
    def __init__(self, data):
        self.set_data(data)
        self.grad = None
        self._backward = lambda: None
        self._prev = []

    def __repr__(self):
        return f"Tensor(data={self.data}, grad={self.grad}, prev={self._prev} shape={self.shape})"

    def set_data(self,data):
        self.data = data
        self.determine_shape()

    def determine_shape(self):
        data = self.data
        if isinstance(data[0], list):
            rows = len(data)
            cols = len(data[0]) if rows > 0 else 0
            self.shape = (rows, cols)
        else:
            self.shape = (len(data), 1)

    def dot(self, other):
        v1 = self.data  
        v2 = other.data 
        
        m = len(v1)      
        n = len(v2[0])  
        p = len(v2)    
    
        if len(v1[0]) != len(v2):
            raise ValueError("Incompatible dimensions for matrix multiplication.")
        
        result = [[0 for _ in range(n)] for _ in range(m)]
    
        for i in range(m):
            for j in range(n):
                for k in range(p):
                    result[i][j] += v1[i][k] * v2[k][j]        
        output = Tensor(result)
        
        def _backward():
            
            if self.grad is None:
                self.grad = [[0 for _ in range(len(self.data[0]))] for _ in range(len(self.data))]
            if other.grad is None:
                other.grad = [[0 for _ in range(len(other.data[0]))] for _ in range(len(other.data))]
            
            for i in range(m):
                for k in range(p):
                    for j in range(n):
                        self.grad[i][k] += output.grad[i][j] * other.data[k][j]
            
            self_T = self.transpose()
            for k in range(p):
                for j in range(n):
                    for i in range(m):
                        other.grad[k][j] += output.grad[i][j] * self_T.data[k][i]
        
        output._backward = _backward
        output._prev = [self, other]
        
        return output

    # FIXME: broadcast value replacement causes error
    def broadcast(self, m):
        result = []
        for _ in range(m):
            result.append(self.data)
        self.set_data(result)

    def add(self, other):
        if self.shape != other.shape or not isinstance(other.data[0], list):
            other.broadcast(self.shape[0])
        
        result = []
        for i in range(len(self.data)):
            row = []
            for j in range(len(self.data[0])):
                row.append(self.data[i][j] + other.data[i][j])
            result.append(row)
        output = Tensor(result)

        def _backward():
            if self.grad is None:
                self.grad = [[0 for _ in range(len(self.data[0]))] for _ in range(len(self.data))]
            for i in range(len(self.data)):
                for j in range(len(self.data[0])):
                    self.grad[i][j] += output.grad[i][j]
    
            if other.grad is None:
                other.grad = [[0 for _ in range(len(other.data[0]))] for _ in range(len(other.data))]
            for i in range(len(other.data)):
                for j in range(len(other.data[0])):
                    other.grad[i][j] += output.grad[i][j]
    
        output._backward = _backward
        output._prev = [self, other]
        
        return output

    def step(self, lr):
        for i in range(len(self.data)):
            row = []
            for j in range(len(self.data[0])):
                self.data[i][j] += self.grad[i][j] * -lr


    def transpose(self):
        transposed_data = [[self.data[j][i] for j in range(self.shape[0])]
                           for i in range(self.shape[1])]
        return Tensor(transposed_data)

    def backward(self):
        if self.grad is None:
            self.grad = [[1.0 for _ in range(len(self.data[0]))] for _ in range(len(self.data))]
        self._backward()
        for prev_tensor in self._prev:
            prev_tensor.backward()



In [620]:
import random
import math

class DenseLayer:
    def __init__(self,nin,nout,act_func = "tanh"):
        self.weights = Tensor([[i + 1 for i in range(nout)] for _ in range(nin)])
        self.biases = Tensor([1 for _ in range(nout)])
        
    def set_parameters(self, weights, biases):
        self.weights = Tensor(weights)
        self.biases = Tensor(biases)
        
    def set_biases(self, biases):
        self.biases = Tensor(biases)

    def get_parameters(self):
        return self.weights, self.biases

    def forward(self, inputs):
        result = inputs.dot(self.weights).add(self.biases)
        return result
        
class MLP:
    def __init__(self, layer_sizes, activations = []):
        self.layers = []
        self.activations = []
        for i in range(len(layer_sizes) - 1):
            self.layers.append(DenseLayer(layer_sizes[i], layer_sizes[i + 1]))
            if(i < len(activations)):
                self.activations.append(None if activations[i] == None else activations[i])
            else:
                self.activations.append(None)
        
            
    def set_parameters(self,weights, biases):
        for layer,weight, bias in zip(self.layers,weights, biases):
            layer.set_parameters(weight, bias)
            

    def forward(self, inputs):
        for idx, layer in enumerate(self.layers):
            inputs = layer.forward(inputs)
            if self.activations[idx]:
                inputs = activation_functions[self.activations[idx]](inputs)
            
        return inputs

    def get_parameters(self):
        all_weights = []
        all_biases = []
        for layer in self.layers:
            weights, biases = layer.get_parameters()
            all_weights.append(weights)
            all_biases.append(biases)
            
        return all_weights, all_biases

    def step(self, lr):
        weights, biases = self.get_parameters()
        for param in weights + biases:
            param.step(lr)

    def zero_grad(self):
        weights, biases = self.get_parameters()
        for param in weights + biases:
            param.grad = None
            


In [623]:
import torch
import torch.nn as nn
import random

class MLP_TORCH(nn.Module):
    def __init__(self, layer_sizes, activations = []):
        super(MLP_TORCH, self).__init__()
        layers = []
        self.activations = []
        for i in range(len(layer_sizes) - 1):
            layers.append(nn.Linear(layer_sizes[i], layer_sizes[i + 1]))
            if(i < len(activations)):
                self.activations.append(None if activations[i] == None else activations[i])
            else:
                self.activations.append(None)
        self.network = nn.ModuleList(layers)
        
    def get_act_func(self, name):
        if(name == "tanh"):
            return nn.Tanh
        if(name == "relu"):
            return nn.ReLU
        if(name == "sigmoid"):
            return nn.Sigmoid
        if(name == "softmax"):
            return nn.Softmax

    def forward(self, x):
        for idx, layer in enumerate(self.network):
            x = layer(x)
            if self.activations[idx]:
                act_func = self.get_act_func(self.activations[idx])()
                x = act_func(x)
        return x

    def set_parameters(self, weights, biases):
        with torch.no_grad():
            for idx, layer in enumerate(self.network):
                layer.weight = nn.Parameter(torch.tensor(weights[idx], dtype=torch.float32).T)
                layer.bias = nn.Parameter(torch.tensor(biases[idx], dtype=torch.float32))
                
    def get_parameters(self):
        weights = []
        biases = []
        for idx, layer in enumerate(self.network):
            weights.append(layer.weight)
            biases.append(layer.bias)
        return weights, biases

def get_err_func(name):
    if(name == "mse"):
        return nn.MSELoss

def compare_weights_and_biases_values(mlp_torch, mlp):
    e_weights, e_biases = mlp_torch.get_parameters()
    weights, biases = mlp.get_parameters()

    for i in range(len(weights)):
        expected = e_weights[i].T
        current = torch.tensor(weights[i].data)
        assert torch.allclose(expected, current, rtol=1e-4, atol=1e-4), \
            f"Weights mismatch in layer {i}:\nExpected: {expected}\nProvided: {current}"
    for i in range(len(biases)):
        expected = e_biases[i].flatten()
        current = torch.tensor(biases[i].data).flatten()
        assert torch.allclose(expected, current, rtol=1e-4, atol=1e-4), \
            f"Weights mismatch in layer {i}:\nExpected: {expected}\nProvided: {current}"

    #print("Parameter values Match")
        
def compare_weights_and_biases_grads(mlp_torch, mlp):
    e_weights, e_biases = mlp_torch.get_parameters()
    weights, biases = mlp.get_parameters()

    for i in range(len(weights)):
        expected = e_weights[i].grad.T
        current = torch.tensor(weights[i].grad)
        assert torch.allclose(expected, current, rtol=1e-4, atol=1e-4), \
            f"Weights grads mismatch in layer {i}:\nExpected: {expected}\nProvided: {current}"

    for i in range(len(biases)):
        expected = e_biases[i].grad.flatten()
        current = torch.tensor(biases[i].grad).flatten()
        assert torch.allclose(expected, current, rtol=1e-4, atol=1e-4), \
            f"Weights mismatch in layer {i}:\nExpected: {expected}\nProvided: {current}"

    # print("Parameter grads Match")

def compare_outputs(o1,o2,i):
    if o1.dtype != torch.float32:
        o1 = o1.float()
    if o2.dtype != torch.float32:
        o2 = o2.float()
    assert torch.allclose(o1, o2, rtol=1e-4, atol=1e-4), \
        f"Output mismatch at batch starting index {i}:\nMLP Output: {o1}\nTorch Output: {o2}"
    #print(f"Outputs match for this batch {i}")

def compare_mlp(layer_sizes, x, y, batch_size=2, learning_rate=0.001, epochs=1000):
    weights = []
    biases = []
    activations = ["tanh", "relu", "softmax"]
    err_func = "mse"
    for l in range(len(layer_sizes) - 1):
        weights.append([[random.uniform(-1, 1) for _ in range(layer_sizes[l + 1])] for _ in range(layer_sizes[l])])
        biases.append([random.uniform(-1, 1) for _ in range(layer_sizes[l + 1])])
    mlp = MLP(layer_sizes, activations)
    mlp.set_parameters(weights, biases)

    mlp_torch = MLP_TORCH(layer_sizes, activations)
    mlp_torch.set_parameters(weights, biases)

    compare_weights_and_biases_values(mlp_torch, mlp)

    optimizer = torch.optim.SGD(mlp_torch.parameters(), lr=learning_rate)
    
    for epoch in range(100):
        for i in range(0, len(x), batch_size):
            x_batch = x[i:i + batch_size]
            y_batch = y[i:i + batch_size]

            mlp_output = mlp.forward(Tensor(x_batch))
            
            torch_output = mlp_torch.forward(torch.tensor(x_batch))
            
            compare_outputs(torch.tensor(mlp_output.data), torch_output.data, i)

            torch_loss_func = get_err_func(err_func)()
            torch_loss = torch_loss_func(torch_output, torch.tensor(y_batch))
            loss = err_functions[err_func](mlp_output, Tensor(y_batch))
            
            loss.backward()
            
            torch_loss.backward()
            
            mlp.step(learning_rate)
            optimizer.step()
            compare_weights_and_biases_grads(mlp_torch, mlp)

            optimizer.zero_grad()
            mlp.zero_grad()
            print(f"Epoch {epoch + 1}/{epochs} Test Passed")
        
        
    


compare_mlp([3,3,2],[
  [2.0, 3.0, -1.0],
  [3.0, -1.0, 0.5],
  [0.5, 1.0, 1.0],
  [1.0, 1.0, -1.0],
],[[1.0, 3.0], [-1.0, -2.0], [-1.0, 1.0], [1.0, -1.0]])
    
    

RuntimeError: The size of tensor a (3) must match the size of tensor b (6) at non-singleton dimension 0