In [3]:
import pandas as pd
import numpy as np
import torch

# Task 1

Loading the synthetic dataset.

In [4]:
# Adjust the path below if the CSV lives somewhere else on your machine.
data = pd.read_csv("data/a4_synthetic.csv")

# Split the frame into the target vector (column "y") and the feature
# matrix (every remaining column), both as NumPy arrays.
Y = data["y"].to_numpy()
X = data.drop(columns=["y"]).to_numpy()

Training a linear regression model for this synthetic dataset.

In [3]:
np.random.seed(1)

# Initial weights and bias, drawn from the seeded NumPy generator
# (weights first, then bias).
w_init = np.random.normal(size=(2, 1))
b_init = np.random.normal(size=(1, 1))

# The parameters are plain tensors with gradient tracking; nn.Linear is
# deliberately not used.
w = torch.tensor(w_init, requires_grad=True)
b = torch.tensor(b_init, requires_grad=True)

eta = 1e-2
opt = torch.optim.SGD([w, b], lr=eta)

for i in range(10):
    # Squared error of every example seen in this epoch.
    epoch_errors = []

    # One SGD update per training example.
    for row in range(X.shape[0]):
        x = torch.tensor(X[row:row + 1, :])
        y = torch.tensor(Y[row:row + 1])

        # Clear the previous gradients, then run the forward pass.
        opt.zero_grad()
        y_pred = x @ w + b
        err = (y_pred - y) ** 2

        # Backward pass followed by the parameter update.
        err.backward()
        opt.step()

        # For statistics.
        epoch_errors.append(err.item())

    mse = sum(epoch_errors) / X.shape[0]
    print(f"Epoch {i+1}: MSE =", mse)

Epoch 1: MSE = 0.7999661130823178
Epoch 2: MSE = 0.017392390107906875
Epoch 3: MSE = 0.009377418010839892
Epoch 4: MSE = 0.009355326971438456
Epoch 5: MSE = 0.009365440968904256
Epoch 6: MSE = 0.009366989180952533
Epoch 7: MSE = 0.009367207398577986
Epoch 8: MSE = 0.009367238983974489
Epoch 9: MSE = 0.009367243704122532
Epoch 10: MSE = 0.009367244427185763


# Task 2

In [108]:
class Tensor:
    """A minimal autograd tensor wrapping a NumPy array.

    Every arithmetic operation returns a new ``Tensor`` whose ``grad_fn``
    is a node object remembering the operands, so that ``backward()`` can
    walk the computation back to the leaf tensors.

    Attributes are documented next to their assignment in ``__init__``.
    """

    # Constructor. Just store the input values.
    def __init__(self, data, requires_grad=False, grad_fn=None):
        # The wrapped NumPy array (or NumPy scalar) holding the values.
        self.data = data
        # Shape of the wrapped data, mirrored for convenience.
        self.shape = data.shape
        # Node that produced this tensor, or None for a leaf tensor.
        self.grad_fn = grad_fn
        # Whether a gradient should be stored for this tensor.
        self.requires_grad = requires_grad
        # Gradient filled in during the backward pass.
        self.grad = None

    # So that we can print the object or show it in a notebook cell.
    def __repr__(self):
        dstr = repr(self.data)
        if self.requires_grad:
            gstr = ", requires_grad=True"
        elif self.grad_fn is not None:
            gstr = f", grad_fn={self.grad_fn}"
        else:
            gstr = ""
        return f"Tensor({dstr}{gstr})"

    # Extract one numerical value from this tensor.
    def item(self):
        return self.data.item()

    @staticmethod
    def _as_tensor(value):
        """Return ``value`` unchanged if it is a Tensor, else wrap it.

        Plain numbers and NumPy arrays become constant tensors (no gradient
        required) so that they can take part in the computational graph.
        """
        if isinstance(value, Tensor):
            return value
        return Tensor(np.asarray(value))

    # Operator +
    def __add__(self, right):
        """Element-wise addition; ``right`` may be a Tensor, number or array."""
        right = self._as_tensor(right)
        return Tensor(
            self.data + right.data,
            grad_fn=AdditionNode(self, right),
            requires_grad=self.requires_grad or right.requires_grad,
        )

    # Operator -
    def __sub__(self, right):
        """Element-wise subtraction; ``right`` may be a Tensor, number or array."""
        right = self._as_tensor(right)
        return Tensor(
            self.data - right.data,
            grad_fn=SubtractionNode(self, right),
            requires_grad=self.requires_grad or right.requires_grad,
        )

    # Operator @
    def __matmul__(self, right):
        """Matrix product of this tensor with ``right``."""
        right = self._as_tensor(right)
        return Tensor(
            self.data @ right.data,
            grad_fn=MatMulNode(self, right),
            requires_grad=self.requires_grad or right.requires_grad,
        )

    # Operator **
    def __pow__(self, right):
        """Raise every element to the integer power ``right`` (must be >= 2).

        Raises:
            Exception: if ``right`` is not an ``int`` or is smaller than 2.
        """
        # NOTE! We are assuming that right is an integer here, not a Tensor!
        if not isinstance(right, int):
            raise Exception("only integers allowed")
        if right < 2:
            raise Exception("power must be >= 2")
        return Tensor(
            self.data**right,
            grad_fn=PowNode(self, right),
            requires_grad=self.requires_grad,
        )

    # Operator /
    def __truediv__(self, right):
        """Element-wise division; ``right`` may be a Tensor, number or array."""
        right = self._as_tensor(right)
        return Tensor(
            self.data / right.data,
            grad_fn=DivisionNode(self, right),
            requires_grad=self.requires_grad or right.requires_grad,
        )

    # Unary -
    def __neg__(self):
        """Negation, expressed as ``0 - self`` so the result stays in the graph."""
        return Tensor(np.zeros_like(self.data)) - self

    # Reflected +, used for ``number + tensor``.
    def __radd__(self, right):
        # Wrapping the left operand keeps the result connected to the graph
        # and avoids re-dispatching to __radd__ for non-int scalars.
        return self._as_tensor(right) + self

    # Reflected -, used for ``number - tensor``.
    def __rsub__(self, right):
        # Same reasoning as __radd__: the gradient must flow back to ``self``.
        return self._as_tensor(right) - self

    # Operator *
    def __mul__(self, right):
        """Element-wise multiplication; ``right`` may be a Tensor, number or array."""
        right = self._as_tensor(right)
        return Tensor(
            self.data * right.data,
            grad_fn=MultiplicationNode(self, right),
            requires_grad=self.requires_grad or right.requires_grad,
        )

    def log(self):
        """Element-wise natural logarithm."""
        return Tensor(
            np.log(self.data),
            grad_fn=LogNode(self),
            requires_grad=self.requires_grad,
        )

    def tanh(self):
        """Element-wise hyperbolic tangent, (e^x - e^-x) / (e^x + e^-x)."""
        return Tensor(
            np.tanh(self.data),
            grad_fn=TanhNode(self),
            requires_grad=self.requires_grad,
        )

    def sum(self):
        """Sum of all elements."""
        return Tensor(
            np.sum(self.data),
            grad_fn=SumNode(self),
            requires_grad=self.requires_grad,
        )

    # Backward computations. Will be implemented in Task 4.
    def backward(self, grad_output=None):
        """Propagate ``grad_output`` backwards from this tensor.

        Args:
            grad_output: gradient of the final result with respect to this
                tensor. Leave it as None when calling on the loss itself;
                the backward pass is then seeded with ones.

        NOTE: a leaf's gradient is overwritten, not accumulated, so a leaf
        that is reached more than once only keeps the last contribution.
        """
        if self.grad_fn is not None:
            # This tensor was computed by an operation: hand the incoming
            # gradient to the node that produced it.
            if grad_output is None:
                # Starting point of the backward computation, typically the
                # loss tensor. Seed with ones shaped like the output.
                grad_output = np.ones_like(self.data)
            self.grad_fn.backward(grad_output)
        else:
            # No grad_fn: an endpoint of the graph, i.e. a learnable
            # parameter or input data.
            if self.requires_grad:
                # Learnable parameter: remember the gradient.
                self.grad = grad_output
            else:
                # Input data: no gradient is kept.
                self.grad = None


# A small utility where we simply create a Tensor object. We use this to
# mimic torch.tensor.
def tensor(data, requires_grad=False):
    """Build a ``Tensor`` from ``data``, mirroring the ``torch.tensor`` call style."""
    return Tensor(data, requires_grad=requires_grad)


# We define helper functions to implement the various arithmetic operations.

# This function takes two tensors as input, and returns a new tensor holding
# the result of an element-wise addition on the two input tensors.

Some sanity checks.

In [5]:
# Two tensors holding row vectors.
# Two row-vector tensors.
x1 = tensor(np.array([[2.0, 3.0]]))
x2 = tensor(np.array([[1.0, 4.0]]))
# One column-vector tensor.
w = tensor(np.array([[-1.0], [1.2]]))

# Exercise each arithmetic operator once, plus one composite expression.
test_plus = x1 + x2
test_minus = x1 - x2
test_power = x2**2
test_matmul = x1 @ w
test_combination = (x1**2 - x2 @ w) ** 3

print(f"Test of addition: {x1.data} + {x2.data} = {test_plus.data}")
print(f"Test of subtraction: {x1.data} - {x2.data} = {test_minus.data}")
print(f"Test of power: {x2.data} ** 2 = {test_power.data}")
print(f"Test of matrix multiplication: {x1.data} @ {w.data} = {test_matmul.data}")


# Compare every result with its hand-computed value; the first mismatch
# stops the cell with an AssertionError.
expected_values = [
    (test_plus, [[3.0, 7.0]]),
    (test_minus, [[1.0, -1.0]]),
    (test_power, [[1.0, 16.0]]),
    (test_matmul, [[1.6]]),
    (test_combination, [[8.00000e-03, 1.40608e02]]),
]
for result, expected in expected_values:
    assert np.allclose(result.data, np.array(expected))

Test of addition: [[2. 3.]] + [[1. 4.]] = [[3. 7.]]
Test of subtraction: [[2. 3.]] - [[1. 4.]] = [[ 1. -1.]]
Test of power: [[1. 4.]] ** 2 = [[ 1. 16.]]
Test of matrix multiplication: [[2. 3.]] @ [[-1. ]
 [ 1.2]] = [[1.6]]


# Tasks 3 and 4

In [97]:
class Node:
    """Base class for the nodes of the computational graph.

    Subclasses override ``backward`` with the gradient rule of one operation.
    """

    def __init__(self):
        pass

    def __repr__(self):
        # Shown as e.g. "<class '__main__.AdditionNode'>".
        return f"{type(self)}"

    def backward(self, grad_output):
        """Propagate ``grad_output`` to the operands; must be overridden."""
        raise NotImplementedError("Unimplemented")


class AdditionNode(Node):
    """Graph node recording ``left + right``."""

    def __init__(self, left, right):
        self.left, self.right = left, right

    def backward(self, grad_output):
        """Hand the incoming gradient unchanged to both operands."""
        # d(left + right)/d(left) = d(left + right)/d(right) = 1.
        for operand in (self.left, self.right):
            operand.grad = grad_output
        # Continue down the graph: right operand first, then left.
        for operand in (self.right, self.left):
            operand.backward(operand.grad)


class SubtractionNode(Node):
    """Graph node recording ``left - right``."""

    def __init__(self, left, right):
        self.left, self.right = left, right

    def backward(self, grad_output):
        """Pass the gradient to ``left`` as is and to ``right`` negated."""
        # d(left - right)/d(left) = 1 and d(left - right)/d(right) = -1.
        self.left.grad = grad_output
        self.right.grad = -grad_output
        # Continue down the graph: right operand first, then left.
        for operand in (self.right, self.left):
            operand.backward(operand.grad)


class MatMulNode(Node):
    """Graph node recording the matrix product ``left @ right``.

    The gradient rule below is written for 2-D operands.
    """

    def __init__(self, left, right):
        self.left = left
        self.right = right

    def backward(self, grad_output):
        """Propagate the gradient of ``left @ right`` to both operands.

        For ``Z = A @ B`` with upstream gradient ``G``:
        ``dL/dA = G @ B.T`` and ``dL/dB = A.T @ G``.
        """
        grad_output = np.asarray(grad_output)
        if grad_output.ndim == 0:
            # A scalar seed (e.g. the plain 1 used to start the backward
            # pass) cannot be matrix-multiplied; expand it to the shape of
            # this node's output first.
            grad_output = grad_output * np.ones_like(self.left.data @ self.right.data)

        self.left.grad = grad_output @ self.right.data.T
        self.right.grad = self.left.data.T @ grad_output
        self.right.backward(self.right.grad)
        self.left.backward(self.left.grad)


class PowNode(Node):
    """Graph node recording ``left ** right`` for an integer exponent ``right``."""

    def __init__(self, left, right):
        self.left, self.right = left, right

    def backward(self, grad_output):
        """Apply the power rule: d(x**n)/dx = n * x**(n - 1)."""
        exponent = self.right
        local_grad = exponent * self.left.data ** (exponent - 1)
        self.left.grad = local_grad * grad_output
        self.left.backward(self.left.grad)
        
        
class DivisionNode(Node):
    """Graph node recording the element-wise division ``left / right``."""

    def __init__(self, left, right):
        self.left = left
        self.right = right

    def backward(self, grad_output):
        """Propagate the gradient of ``left / right`` to both operands.

        d(a / b)/da = 1 / b and d(a / b)/db = -a / b**2.
        """
        self.left.grad = grad_output / self.right.data
        self.right.grad = -grad_output * self.left.data / (self.right.data**2)
        # Keep walking the graph; without these calls nothing below a
        # division would ever receive a gradient.
        self.right.backward(self.right.grad)
        self.left.backward(self.left.grad)
        
class LogNode(Node):
    """Graph node recording the element-wise natural logarithm of ``left``."""

    def __init__(self, left):
        self.left = left

    def backward(self, grad_output):
        """Scale the incoming gradient by d(log x)/dx = 1 / x."""
        local_grad = 1 / self.left.data
        self.left.grad = grad_output * local_grad
        self.left.backward(self.left.grad)
        
class TanhNode(Node):
    """Graph node recording the element-wise ``tanh`` of ``left``."""

    def __init__(self, left):
        self.left = left

    def backward(self, grad_output):
        """Scale the incoming gradient by d(tanh x)/dx = 1 - tanh(x)**2."""
        # self.left.data is the *input* x, so tanh has to be applied before
        # squaring.
        activation = np.tanh(self.left.data)
        self.left.grad = grad_output * (1 - activation**2)
        self.left.backward(self.left.grad)
        
class MultiplicationNode(Node):
    """Graph node recording the element-wise product ``left * right``."""

    def __init__(self, left, right):
        self.left, self.right = left, right

    def backward(self, grad_output):
        """Each operand receives the gradient scaled by the other operand."""
        # d(a * b)/da = b and d(a * b)/db = a.
        self.left.grad = grad_output * self.right.data
        self.right.grad = grad_output * self.left.data
        # Continue down the graph: right operand first, then left.
        for operand in (self.right, self.left):
            operand.backward(operand.grad)
        
        
# Node that accounts for summing all elements of a tensor.
class SumNode(Node):
    """Graph node recording ``left.sum()``."""

    def __init__(self, left):
        self.left = left

    def backward(self, grad_output):
        """Spread the incoming gradient over every element of the operand."""
        # Every element contributes to the sum with weight 1.
        ones = np.ones_like(self.left.data)
        self.left.grad = grad_output * ones
        self.left.backward(self.left.grad)

Sanity check for Task 3.

In [6]:
# One data tensor and two tensors that ask for gradients.
x = tensor(np.array([[2.0, 3.0]]))
w1 = tensor(np.array([[1.0, 4.0]]), requires_grad=True)
w2 = tensor(np.array([[3.0, -1.0]]), requires_grad=True)

test_graph = x + w1 + w2

print("Computational graph top node after x + w1 + w2:", test_graph.grad_fn)

# (x + w1) + w2: the top node is an addition whose right operand is w2 ...
top_node = test_graph.grad_fn
assert isinstance(top_node, AdditionNode)
assert top_node.right is w2
# ... and whose left operand was itself produced by adding x and w1.
inner_node = top_node.left.grad_fn
assert inner_node.left is x
assert inner_node.right is w1

Computational graph top node after x + w1 + w2: <class '__main__.AdditionNode'>


Sanity check for Task 4.

In [30]:
x = tensor(np.array([[2.0, 3.0]]))
w = tensor(np.array([[-1.0], [1.2]]), requires_grad=True)
y = tensor(np.array([[0.2]]))

# Equivalent to loss = (x @ w - y)**2, spelled out step by step so that
# intermediate values can be inspected while debugging.
model_out = x @ w
diff = model_out - y
loss = diff**2

loss.backward()

print("Gradient of loss w.r.t. w =\n", w.grad)

# Only the parameter keeps a gradient; the data tensors must not.
expected_w_grad = np.array([[5.6], [8.4]])
assert np.allclose(w.grad, expected_w_grad)
for data_tensor in (x, y):
    assert data_tensor.grad is None

Gradient of loss w.r.t. w =
 [[5.6]
 [8.4]]


An equivalent cell using PyTorch code. Your implementation should give the same result for `w.grad`.

In [9]:
pt_x = torch.tensor(np.array([[2.0, 3.0]]))
pt_w = torch.tensor(np.array([[-1.0], [1.2]]), requires_grad=True)
pt_y = torch.tensor(np.array([[0.2]]))

# Same computation as in the previous cell, one step at a time.
pt_model_out = pt_x @ pt_w
pt_diff = pt_model_out - pt_y
pt_loss = pt_diff**2

# Keep the gradients of the intermediate tensors for debugging.
for intermediate in (pt_model_out, pt_diff, pt_loss):
    intermediate.retain_grad()

pt_loss.backward()
pt_w.grad

tensor([[5.6000],
        [8.4000]], dtype=torch.float64)

# Task 5

In [10]:
class Optimizer:
    """Base class for optimizers: owns the parameters and resets their gradients."""

    def __init__(self, params):
        """Store the parameters to optimize.

        Args:
            params: iterable of parameter tensors (objects exposing ``data``
                and ``grad``).
        """
        # Materialise the iterable: it is walked again on every zero_grad()
        # and step(), which a generator would only survive once.
        self.params = list(params)

    def zero_grad(self):
        """Reset every parameter's gradient to zeros shaped like its data."""
        for p in self.params:
            p.grad = np.zeros_like(p.data)

    def step(self):
        """Apply one update to the parameters; implemented by subclasses."""
        raise NotImplementedError("Unimplemented")


class SGD(Optimizer):
    """Plain stochastic gradient descent: ``p <- p - lr * grad``."""

    def __init__(self, params, lr):
        """
        Args:
            params: iterable of parameter tensors to update.
            lr: learning rate (step size).
        """
        super().__init__(params)
        self.lr = lr

    def step(self):
        """Update, in place, every trainable parameter that has a gradient."""
        for p in self.params:
            # A parameter without a gradient (never reached by backward())
            # is skipped instead of failing on ``lr * None``.
            if not p.requires_grad or p.grad is None:
                continue
            p.data -= self.lr * p.grad

In [13]:
np.random.seed(1)

# Initial weights and bias, drawn from the seeded NumPy generator
# (weights first, then bias).
w_init = np.random.normal(size=(2, 1))
b_init = np.random.normal(size=(1, 1))

# The parameters are plain tensors with gradient tracking.
w = tensor(w_init, requires_grad=True)
b = tensor(b_init, requires_grad=True)

eta = 1e-2
opt = SGD([w, b], lr=eta)

for i in range(10):
    # Squared error of every example seen in this epoch.
    epoch_errors = []

    # One SGD update per training example.
    for row in range(X.shape[0]):
        x = tensor(X[row:row + 1, :])
        y = tensor(Y[row:row + 1])

        # Clear the previous gradients, then run the forward pass.
        opt.zero_grad()
        y_pred = x @ w + b
        err = (y_pred - y) ** 2

        # Backward pass followed by the parameter update.
        err.backward()
        opt.step()

        # For statistics.
        epoch_errors.append(err.item())

    mse = sum(epoch_errors) / X.shape[0]
    print(f"Epoch {i+1}: MSE =", mse)

Epoch 1: MSE = 0.7999661130823178
Epoch 2: MSE = 0.017392390107906875
Epoch 3: MSE = 0.009377418010839892
Epoch 4: MSE = 0.009355326971438456
Epoch 5: MSE = 0.009365440968904256
Epoch 6: MSE = 0.009366989180952533
Epoch 7: MSE = 0.009367207398577986
Epoch 8: MSE = 0.009367238983974489
Epoch 9: MSE = 0.009367243704122532
Epoch 10: MSE = 0.009367244427185763


# Task 6

In [None]:
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split

# Adjust the path below if the CSV lives somewhere else on your machine.
a4data = pd.read_csv("data/raisins.csv")

# Standardised features, and a 0/1 target that is 1.0 for the "Besni" class.
features = a4data.drop(columns=["Class"])
is_besni = (a4data["Class"] == "Besni").to_numpy()
X = scale(features)
Y = is_besni.astype(float)

# Shuffle the rows with a fixed seed before splitting.
np.random.seed(0)
shuffle = np.random.permutation(len(Y))
X, Y = X[shuffle], Y[shuffle]

Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, random_state=0, test_size=0.2)

In [46]:
# we will create a layer class to create a layer of neurons

class Module:
    """Base class for objects that own learnable parameters."""

    def parameters(self):
        """Return the learnable tensors; must be overridden by subclasses."""
        raise NotImplementedError("Method for getting parameters is not implemented")

    def zero_grad(self):
        """Reset the gradient of every parameter to zeros shaped like its data."""
        for param in self.parameters():
            param.grad = np.zeros_like(param.data)

class Layer(Module):
    """Fully connected layer followed by a tanh activation."""

    def __init__(self, in_features, out_features):
        weight_shape = (in_features, out_features)
        bias_shape = (1, out_features)
        # Both parameters start from standard-normal draws; the weights are
        # drawn before the bias.
        self.w = tensor(np.random.normal(size=weight_shape), requires_grad=True)
        self.b = tensor(np.random.normal(size=bias_shape), requires_grad=True)

    def __call__(self, x):
        """Return ``tanh(x @ w + b)``."""
        pre_activation = x @ self.w + self.b
        return pre_activation.tanh()

    def parameters(self):
        """Return the layer's learnable tensors: weights, then bias."""
        return [self.w, self.b]

class MLP(Module):
    """Two-layer perceptron: a hidden ``Layer`` followed by an output ``Layer``."""

    def __init__(self, in_features, hidden_features, out_features):
        self.hidden = Layer(in_features, hidden_features)
        self.out = Layer(hidden_features, out_features)

    def __call__(self, x):
        """Run ``x`` through the hidden layer and then the output layer."""
        # NOTE(review): Layer applies tanh to its result, so the model output
        # is a tanh value, not a probability - confirm this is intended
        # before feeding it to a cross-entropy loss.
        return self.out(self.hidden(x))

    def parameters(self):
        """Return the learnable tensors of both layers (hidden first).

        Needed by the inherited ``zero_grad()`` and for building an optimizer.
        """
        return self.hidden.parameters() + self.out.parameters()
    

def cross_entropy(y_pred, y_true):
    """Element-wise binary cross-entropy between predictions and targets.

    Computes ``-y_true * log(y_pred) - (1 - y_true) * log(1 - y_pred)``
    without any reduction.
    """
    log_pred = np.log(y_pred)
    log_not_pred = np.log(1 - y_pred)
    positive_term = -y_true * log_pred
    negative_term = (1 - y_true) * log_not_pred
    return positive_term - negative_term

In [47]:

# One hidden layer with 10 units and a single output unit.
model = MLP(Xtrain.shape[1], 10, 1)
# The optimizer iterates over its parameters, so it needs a list of tensors
# rather than a single Tensor: pass the weights and biases of both layers.
opt = SGD(model.hidden.parameters() + model.out.parameters(), lr=1e-2)


Tensor(array([[-1.        ],
       [ 3.28549244]]), requires_grad=True)

In [110]:
# Toy inputs for trying out the loss: predictions that ask for a gradient,
# and targets that do not.
y_pred = tensor(np.array([[0.5], [0.8]]), requires_grad=True)
y_true = tensor(np.array([[0.6], [0.3]]))

# What should the loss be?


def cross_entropy(y_pred, y_true):
    """Binary cross-entropy summed over all entries, divided by the row count.

    Built only from tensor operations (``log``, ``sum``, arithmetic
    operators), so the result stays connected to the computational graph.
    """
    positive_term = -y_true * y_pred.log()
    negative_term = (1 - y_true) * (1 - y_pred).log()
    per_example = positive_term - negative_term
    n_rows = y_true.shape[0]
    return per_example.sum() / n_rows

# Evaluate the loss on the toy tensors; the bare name on the last line shows
# the resulting Tensor as the cell output.
loss = cross_entropy(y_pred, y_true)
loss


Tensor(0.9433483923290391, requires_grad=True)

In [85]:
loss.data.shape

# Turn the loss into a scalar: it is of shape (2, 1), and I want to sum it
# with a vector of ones.
loss.data.sum()


-0.4700036292457358