<a href="https://colab.research.google.com/github/pranay8297/fastaip2/blob/main/ml_coding_interview_prep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Micrograd

In [228]:
import numpy as np

In [244]:
# Lets start with Micro grad

class Value:

    def __init__(self, val, parents = (), op = None, label = None):
        self.val = val
        self.parents = parents
        self.op = op
        self.grad = 0
        self.label = label

    def __repr__(self):
        return f'Value: {self.val:.4f}' + (f'  :: Label: {self.label}' if self.label != None else '')

    def __sub__(self, other):
        return self + (-other)

    def __neg__(self):
        return self * -1

    def __radd__(self, other):
        return self + other

    def __rmul__(self, other):
        return self * other

    def __truediv__(self, other):
        return self * (other**-1)

    def __add__(self, other):

        if not isinstance(other, Value): other = Value(other, label = str(other))

        out = Value(self.val + other.val, (self, other), label = f'{self.label}+{other.label}')

        def _backward():
            self.grad += 1 * out.grad
            other.grad += 1 * out.grad
        out._backward = _backward
        return out

    def __mul__(self, other):
        if not isinstance(other, Value): other = Value(other, label = str(other))
        out = Value(self.val * other.val, (self, other), label = f'{self.label}*{other.label}')

        def _backward():
            self.grad += other.val * out.grad
            other.grad += self.val * out.grad

        out._backward = _backward
        return out

    def __pow__(self, pow):
        out = Value(self.val ** pow, (self,), label = f'{self.label}**{pow}')
        def _backward():
            self.grad += pow*self.val**(pow-1) * out.grad
        out._backward = _backward
        return out

    def exp(self):
        exp = np.exp(self.val)
        out = Value(exp, (self,), label = f'exp({self.label})')
        def _backward():
            self.grad += exp * out.grad
        out._backward = _backward
        return out

    def relu(self):
        out = Value(self.val if self.val > 0 else 0, label = f'relu({self.label})')

        def _backward():
            self.grad += 0 if out.val else out.grad

        out._backward = _backward
        return out

    def backward(self):
        self.grad = 1.
        visited = set()
        dfs = []
        # breakpoint()
        def _dfs(node):

            if node in visited: return

            visited.add(node)

            for i in node.parents:
                _dfs(i)

            dfs.append(node)

        _dfs(self)
        topsort = reversed(dfs)

        for i in topsort:
            if not hasattr(i, '_backward'): continue
            i._backward()


In [230]:
x1, x2 = Value(-1.5, label = 'x1'), Value(1.3, label = 'x2')
w1, w2 = Value(4, label = 'w1'), Value(5, label = 'w2')
b = Value(0, label = 'b')
z = w1*x1 + w2*x2 + b
z.label = 'z'

# z.backward()
# w1.grad, w2.grad, b.grad
print(z)
yhat = ((2*z).exp() - 1)/((2*z).exp() + 1)
# yhat.label = 'yhat'
print(yhat)

yhat.backward()

yhat.grad, z.grad, w1.grad, w2.grad, b.grad

Value: 0.5000  :: Label: z
Value: 0.4621  :: Label: exp(z*2)+-1*exp(z*2)+1**-1


(1.0,
 0.7864477329659272,
 -1.1796715994488909,
 1.0223820528557053,
 0.7864477329659272)

In [286]:
import random

class Module:
    def __call__(self, *args, **kwargs):
        return self.forward(*args, **kwargs)

class Neuron(Module):
    def __init__(self, n_in):
        self.ws = [Value(random.uniform(-1, 1)) for _ in range(n_in)]
        self.b = Value(0)

    def forward(self, x): # dot product + b: wx + b
        # breakpoint()
        return sum((a * b for a, b in zip(self.ws, x))) + self.b

    def __repr__(self):
        return f"Neuron containing: {self.ws}"

class Layer(Module):
    def __init__(self, n_in, n_out):
        # n_in: number of inputs
        # n_out: number of outputs desired in a layer
        self.neurons = [Neuron(n_in) for _ in range(n_out)]

    def forward(self, x):
        return [n(x) for n in self.neurons]

    def __repr__(self): return f"Layer containing weights of shape: {len(self.neurons)}x{len(self.neurons[0].ws)}"


class MLP(Module):

    def __init__(self, lc = [], act = 'relu'):
        # assume lc - layer_config has n_in preppended
        self.layers = [Layer(lc[i-1], lc[i]) for i in range(1, len(lc))]

    def forward(self, x):

        for layer in self.layers:
            x = layer(x)
        return x

    def __repr__(self):
        st = ''
        for i in self.layers:
            st += i.__repr__() + ' \n'
        return st

In [288]:
x = [Value(1, label = 'x1'), Value(1.5, label = 'x2'), Value(1.2, label = 'x3')]
neu = Neuron(3)
neu(x)

Value: -1.1044  :: Label: None*x1+0+None*x2+None*x3+None

In [289]:
x = [Value(1, label = 'x1'), Value(1.5, label = 'x2'), Value(1.2, label = 'x3')]
layer = Layer(3, 2)
inter = layer(x)
inter

[Value: 1.2700  :: Label: None*x1+0+None*x2+None*x3+None,
 Value: 1.3024  :: Label: None*x1+0+None*x2+None*x3+None]

In [290]:
x = [Value(1, label = 'x1'), Value(1.5, label = 'x2'), Value(1.2, label = 'x3')]
mlp = MLP((3, 2, 1))
yhat = mlp(x)

In [293]:
yhat[0].backward()
yhat[0].val

-0.13525551427592739

In [294]:
for layer in mlp.layers:
    for neu in layer.neurons:
        print([i.grad for i in neu.ws])

[0.29952624858971144, 0.44928937288456716, 0.3594314983076537]
[-0.5134056657947423, -0.7701084986921135, -0.6160867989536908]
[-0.5655506771842844, -0.06650054863652788]


In [331]:
# batch norm
import torch
from torch import nn
from torch.nn import functional as F

class BN(nn.Module):

    def __init__(self, n_filters):
        super().__init__()
        self.running_mean = None
        self.running_std = None

        self.scale = nn.Parameter(torch.tensor(1.))
        self.shift = nn.Parameter(torch.tensor(0.))

    def forward(self, x):
        # x.shape - bs, ch, h, w
        mean = x.mean((0, 1)).squeeze() #bs, h, w
        std = x.std((0, 1)).squeeze() #bs, h, w
        return self.scale*((x - mean)/std) + self.shift

In [305]:
bn = BN(4)
x = torch.randn(16, 4, 32, 32)
out = bn(x)
out.shape, out.mean(), out.std()

(torch.Size([16, 4, 32, 32]),
 tensor(-5.0204e-10, grad_fn=<MeanBackward0>),
 tensor(0.9922, grad_fn=<StdBackward0>))

In [306]:
np.eye(3)

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [332]:
class Softmax():
    def __init__(self): pass

    def __call__(self, x):
        self.x = x
        x_exp = np.exp(x)
        x_exp_sum = x_exp.sum()

        self.es = x_exp/x_exp_sum
        return self.es

    def _backward(self):
        lsp, rsp = self.es, (np.eye(len(self.x)) - self.es).T
        return np.dot(lsp, rsp)

In [333]:
x = [0.14, 0.5, -0.8]
soft = Softmax()
es = soft(x)

In [334]:
es, es.sum()

(array([0.35411301, 0.50756059, 0.1383264 ]), 1.0)

In [335]:
x_grad = soft._backward()

In [336]:
x_grad

array([-0.04803496,  0.10541262, -0.26382157])

torch.Size([1, 3])

In [348]:
xx = torch.tensor(x, requires_grad = True)
y = F.softmax(xx)

y.sum().backward()

  y = F.softmax(xx)


In [350]:
xx, xx.grad

(tensor([ 0.1400,  0.5000, -0.8000], requires_grad=True),
 tensor([2.1107e-08, 3.0253e-08, 8.2449e-09]))

# Perceptron and othe blocks from scratch (Dealing with matrices)

In [77]:
import numpy as np
import torch

from torch.nn import functional as F

import matplotlib.pyplot as plt

In [352]:
# create a dataset of 20 examples
# 3xs and 1ys
# create a ws and b
# sigmoid
# mse

In [127]:
def mse(yhat, y):
    return np.mean((yhat-y)**2)

def mse_grad(yhat, y):
    return (2/len(yhat))*(yhat - y)

def sigmoid(z):
    return 1/(1 + np.exp(-z))

def sigmoid_grad(act):
    return act*(1 - act)

def relu(z):
    return z * (z > 0)

def relu_grad(z):
    return (z > 0)

def softmax(z):
    exp_sum = np.exp(z).sum(-1)
    return np.exp(z)/np.expand_dims(exp_sum, axis=1)

def softmax_grad(acts):
    return acts * (1. - acts)

def cross_entropy(yhat, y):
    return y * -1 * np.log(yhat)

def cross_entropy_grad(yhat, y):
    return y * (-1/yhat)

In [423]:
ys = np.arange(0, 1, 1/20)

ys = np.random.uniform(low=0, high=1., size=(20,2))
xs = np.random.uniform(low= -1, high = 2, size = (20, 3))

# 3x2 matrix for 3 inputs and 2 outputs
ws = np.random.uniform(low = -1, high = 1, size = (3, 2))
bs = np.zeros((2))
# xs.shape, ys.shape, ws.shape, bs.shape

zs = xs.dot(ws) + bs
yhat = sigmoid(zs)
print(yhat.shape)

loss = mse(yhat, y)
print("Loss: ", loss)

dl_dyhat = mse_grad(yhat, y)
dyhat_dz = sigmoid_grad(yhat)
# dyhat_dz.shape
dl_dz = dyhat_dz * dl_dyhat
dl_dw = (1/xs.shape[0])*np.dot(xs.T, dl_dz)

lr = 0.5
ws += -lr*dl_dw
bs += -lr*dl_dz.mean(0)

(20, 2)
Loss:  0.243156526893116


In [15]:
def train(xs, ys, n_epochs = 100):

    lr = 0.7
    # 3x2 matrix for 3 inputs and 2 outputs
    ws = np.random.uniform(low = -1, high = 1, size = (xs.shape[1], ys.shape[1]))
    bs = np.zeros((ys.shape[1]))
    print(xs.shape, ys.shape, ws.shape, bs.shape)

    for i in range(n_epochs):

        # Forward Pass
        zs = xs.dot(ws) + bs
        yhat = sigmoid(zs)

        # Loss Calculation
        loss = mse(yhat, ys)
        if (i)%10 == 0:
            print(f"Epoch: {i+1}, Loss: {loss}")

        # Calculating Gradients
        dl_dyhat = mse_grad(yhat, ys)
        dyhat_dz = sigmoid_grad(yhat)
        dl_dz = dyhat_dz * dl_dyhat
        dl_dw = (1/xs.shape[0])*np.dot(xs.T, dl_dz) # dl_dz*dz_dw
        dl_db = dl_dz.mean(0) # 1

        # Backward Pass
        ws += -lr * dl_dw
        bs += -lr * dl_db

    return ws, bs

In [21]:
ys = np.random.uniform(low=0, high=1., size=(20,2))
xs = np.random.uniform(low= -4, high = 4, size = (20, 3))
trained_ws, trained_bs = train(xs, ys, 100)

(20, 3) (20, 2) (3, 2) (2,)
Epoch: 1, Loss: 0.17480770537172163
Epoch: 11, Loss: 0.14914855268778654
Epoch: 21, Loss: 0.12973814316934065
Epoch: 31, Loss: 0.11517636329801513
Epoch: 41, Loss: 0.10321831826716292
Epoch: 51, Loss: 0.09272226704806837
Epoch: 61, Loss: 0.08349640143061773
Epoch: 71, Loss: 0.07582077763916781
Epoch: 81, Loss: 0.0699662471142186
Epoch: 91, Loss: 0.06588078714386562


In [74]:
# 3, 4, 2
ys = np.random.uniform(low=0, high=1., size=(20,2))
xs = np.random.uniform(low= -10, high = 10, size = (20, 3))

w1 = np.random.uniform(low = -1, high = 1, size = (xs.shape[1], 4))
b1 = np.zeros((4))

w2 = np.random.uniform(low = -1, high = 1, size = (4, ys.shape[1]))
b2 = np.zeros((ys.shape[1]))

z1 = np.dot(xs, w1) + b1
a1 = sigmoid(z1)

z2 = np.dot(a1, w2) + b2
yhat = sigmoid(z2)
loss = mse(yhat, ys)

print("First Loss: ", loss)

dl_dyhat = mse_grad(yhat, ys)
dl_dyhat.shape

dyhat_dz2 = sigmoid_grad(yhat)
dyhat_dz2.shape

dl_dz2 = dl_dyhat * dyhat_dz2
dl_dz2.shape

# dl_dw2 = dl_dz2 * dz2_dw2
# dl_db2 = dl_dz2

dl_dw2 = (1/a1.shape[0]) * np.dot(a1.T, dl_dz2) # Done
dl_db2 = dl_dz2.mean(0) # Done

dl_dw2.shape, dl_db2.shape

# Now we have to calculate dz2_da1
# then dl_da1 = dl_dz2 * dz2_da1
w2.shape, dl_dz2.shape

# dz2_dz1 = w2
dl_da1 = np.dot(dl_dz2, w2.T)
dl_da1.shape

# dl_dz1 = dl_da1 * da1_dz1
da1_dz1 = sigmoid_grad(a1)
dl_dz1 = dl_da1 * da1_dz1
dl_dz1.shape

dl_dw1 = (1/z1.shape[0]) * np.dot(xs.T, dl_dz1) # Done
dl_db1 = dl_dz1.mean(0) # Done

dl_dw1.shape, dl_db1.shape

lr = 0.89

w1 += -lr * dl_dw1
b1 += -lr * dl_db1

w2 += -lr * dl_dw2
b2 += -lr * dl_db2

z1 = np.dot(xs, w1) + b1
a1 = sigmoid(z1)

z2 = np.dot(a1, w2) + b2
yhat = sigmoid(z2)
loss = mse(yhat, ys)

loss

First Loss:  0.08896005403578382


0.08859228406828024

In [71]:
def mlp_train(xs, ys, n_epochs = 100):

    lr = 0.7
    # 3x2 matrix for 3 inputs and 2 outputs
    w1 = np.random.uniform(low = -1, high = 1, size = (xs.shape[1], 4))
    b1 = np.zeros((4))

    w2 = np.random.uniform(low = -1, high = 1, size = (4, ys.shape[1]))
    b2 = np.zeros((ys.shape[1]))

    # print(xs.shape, ys.shape, ws.shape, bs.shape)

    for i in range(n_epochs):

        # Forward Pass
        z1 = np.dot(xs, w1) + b1
        a1 = relu(z1)

        z2 = np.dot(a1, w2) + b2
        yhat = sigmoid(z2)

        # Loss Calculation
        loss = mse(yhat, ys)
        if (i)%10 == 0:
            print(f"Epoch: {i+1}, Loss: {loss}")

        # Calculating Gradients
        dl_dyhat = mse_grad(yhat, ys)
        dyhat_dz2 = sigmoid_grad(yhat)

        dl_dz2 = dl_dyhat * dyhat_dz2

        # Activation to weight
        dl_dw2 = (1/a1.shape[0]) * np.dot(a1.T, dl_dz2) # Done
        dl_db2 = dl_dz2.mean(0) # Done

        # Activation to Activation
        dl_da1 = np.dot(dl_dz2, w2.T)
        da1_dz1 = relu_grad(z1)

        dl_dz1 = dl_da1 * da1_dz1

        dl_dw1 = (1/z1.shape[0]) * np.dot(xs.T, dl_dz1) # Done
        dl_db1 = dl_dz1.mean(0) # Done

        # Backward Pass
        w1 += -lr * dl_dw1
        b1 += -lr * dl_db1

        w2 += -lr * dl_dw2
        b2 += -lr * dl_db2

    return w1, b1, w2, b2

In [72]:
ys = np.random.uniform(low=0, high=1., size=(20,2))
xs = np.random.uniform(low= -4, high = 4, size = (20, 3))
mlp_parameters = mlp_train(xs, ys, 200)

Epoch: 1, Loss: 0.20880276427291705
Epoch: 11, Loss: 0.19286537214136576
Epoch: 21, Loss: 0.17876130479920377
Epoch: 31, Loss: 0.1684445768804895
Epoch: 41, Loss: 0.16049987459064366
Epoch: 51, Loss: 0.15302545114742697
Epoch: 61, Loss: 0.1460114353910063
Epoch: 71, Loss: 0.13947239740666803
Epoch: 81, Loss: 0.13341236039021864
Epoch: 91, Loss: 0.12781272182953737
Epoch: 101, Loss: 0.12263763136945331
Epoch: 111, Loss: 0.11781626826874629
Epoch: 121, Loss: 0.11331716968269956
Epoch: 131, Loss: 0.10911714064164271
Epoch: 141, Loss: 0.1051894469078983
Epoch: 151, Loss: 0.10149579563690976
Epoch: 161, Loss: 0.09801162402526392
Epoch: 171, Loss: 0.09472338810090418
Epoch: 181, Loss: 0.09159211982221711
Epoch: 191, Loss: 0.08874235951211558


In [147]:
zs = torch.randn([2, 3], requires_grad = True)
ys = torch.tensor([[0, 1, 0], [1, 0, 0]], dtype = torch.float32)

In [148]:
yhat = F.softmax(zs, dim = 1)
yhat, yhat.shape

(tensor([[0.3702, 0.3669, 0.2629],
         [0.0279, 0.6735, 0.2986]], grad_fn=<SoftmaxBackward0>),
 torch.Size([2, 3]))

In [149]:
loss = F.cross_entropy(yhat, ys)
loss

tensor(1.2528, grad_fn=<DivBackward1>)

In [150]:
loss.backward()

In [122]:
zs.grad

tensor([[ 0.0683, -0.0734,  0.0051],
        [-0.1198,  0.1011,  0.0187]])

In [151]:
inp = zs.clone()

In [152]:
zs = inp.detach().numpy()
zs

array([[ 1.2684945 ,  1.2596434 ,  0.92624325],
       [-1.1967338 ,  1.9883913 ,  1.1748823 ]], dtype=float32)

In [153]:
yhat = softmax(zs)
yhat

array([[0.3701842 , 0.36692217, 0.26289365],
       [0.0278668 , 0.67354906, 0.29858416]], dtype=float32)

In [154]:
loss = cross_entropy(yhat, ys)

In [157]:
loss

tensor([[0.0000, 1.0026, 0.0000],
        [3.5803, 0.0000, 0.0000]])

In [158]:
np.log(yhat)

array([[-0.9937545 , -1.0026056 , -1.3360057 ],
       [-3.5803194 , -0.39519444, -1.2087034 ]], dtype=float32)

In [145]:
np.log(yhat), ys

(array([[-0.43588704, -1.8801389 , -1.6057433 ],
        [-0.96458936, -0.7107091 , -2.0591323 ]], dtype=float32),
 tensor([[0., 1., 0.],
         [1., 0., 0.]]))

In [None]:
def cross_entropy(yhat, y):
    return y * -1 * np.log(yhat)