In [1]:
import torch
import torch.nn.functional as F

# Layers

In [150]:
class simple_sum:
    """Sum-reduction layer with a hand-written local gradient.

    Calling the layer sums ``x`` along ``dim`` (the reduced dimension is kept
    with size 1) and caches both the input and the output on the instance.
    """

    def __init__(self, dim=0):
        # Dimension that __call__ reduces over.
        self.dim = dim

    def __call__(self, x):
        """Return ``x`` summed over ``self.dim``, keeping the reduced dimension."""
        self.x = x
        self.out = torch.sum(x, dim=self.dim, keepdim=True)
        return self.out

    def back(self):
        """Return the local gradient of the sum: ones shaped like the cached input.

        Must be called after ``__call__``, which stores the input in ``self.x``.
        The result is also stored in ``self.grad``.
        """
        with torch.no_grad():
            local_grad = torch.ones_like(self.x)
        self.grad = local_grad
        return self.grad

class simple_mul:
    """Scale the input by one learnable scalar, with hand-written gradients."""

    def __init__(self):
        # One-element tensor drawn from torch.rand and tracked by autograd.
        self.scalar = torch.rand(1, requires_grad=True)

    def __call__(self, x):
        """Return ``x * self.scalar`` and cache the input and output."""
        scaled = x * self.scalar
        self.x, self.out = x, scaled
        return self.out

    def back(self):
        """Compute local gradients for the cached input and return the input one.

        ``self.scalar_grad`` is the sum of all input elements and ``self.grad``
        is the scalar broadcast to the input's shape. Both correspond to an
        upstream gradient of all ones.
        """
        with torch.no_grad():
            self.scalar_grad = self.x.sum()
            self.grad = self.scalar * torch.ones_like(self.x)
        return self.grad

class simple_relu:
    """Element-wise ReLU with a hand-written local gradient."""

    def __call__(self, x):
        """Return ``max(x, 0)`` element-wise and cache the input and output."""
        floor = torch.zeros_like(x)
        self.x = x
        self.out = torch.maximum(x, floor)
        return self.out

    def back(self):
        """Return the local gradient: 1.0 where the cached input is > 0, else 0.0."""
        with torch.no_grad():
            # Masking on self.out > 0 would give the same result, because the
            # output is positive exactly where the input is positive.
            positive = self.x > 0
            self.grad = positive * 1.0
        return self.grad

class simple_linear:
    """Bias-free linear layer ``out = x @ w`` with hand-written gradients.

    Args:
        fan_in: size of the last dimension of the input.
        fan_out: size of the last dimension of the output.
    """

    def __init__(self, fan_in, fan_out):
        # Weight matrix of shape (fan_in, fan_out), tracked by autograd.
        self.w = torch.randn(fan_in, fan_out, requires_grad=True)

    def __call__(self, x):
        """Return ``x @ self.w`` and cache the input and output for ``back``."""
        self.x = x
        self.out = x @ self.w
        return self.out

    def back(self, grad_out=None):
        """Back-propagate through the matmul and return the input gradient.

        Args:
            grad_out: upstream gradient with the same shape as ``self.out``.
                When omitted it defaults to all ones, i.e. the gradient that
                ``self.out.sum().backward()`` would feed in. This matches the
                convention used by the other layers in this notebook.

        Returns:
            ``self.grad``, the gradient with respect to the cached input; it
            has the same shape as ``self.x``.

        Side effects:
            Stores ``self.w_grad`` (same shape as ``self.w``) and ``self.grad``.
        """
        with torch.no_grad():
            if grad_out is None:
                grad_out = torch.ones_like(self.out)
            fan_in, fan_out = self.w.shape
            # Collapse any leading (batch) dimensions so 1-D and N-D inputs
            # both reduce to a single 2-D matrix product for the weight grad.
            x_2d = self.x.reshape(-1, fan_in)
            g_2d = grad_out.reshape(-1, fan_out)
            # d(out)/d(w): x^T @ grad_out -> shape (fan_in, fan_out).
            self.w_grad = x_2d.T @ g_2d
            # d(out)/d(x): grad_out @ w^T -> same shape as x.
            self.grad = grad_out @ self.w.T
        return self.grad


In [158]:
# Sanity-check simple_sum against autograd on a 1-D input.
simp=simple_sum()
x=torch.randn(2)
x.requires_grad=True
print(x)
out=simp(x)
print('out',out)
# No backward pass has run yet, so x.grad is still None at this point.
print('x.grad',x.grad)
out.backward()

# `out` is a non-leaf tensor and retain_grad() was not called on it, so
# out.grad prints None (PyTorch warns about reading .grad on a non-leaf).
print('out.grad',out.grad)
print('x.grad',x.grad)

# Hand-written local gradient next to the autograd result, for comparison.
simp.back(),simp.x.grad

tensor([ 0.1688, -1.5060], requires_grad=True)
out tensor([-1.3372], grad_fn=<SumBackward1>)
x.grad None
out.grad None
x.grad tensor([1., 1.])


  print('out.grad',out.grad)


(tensor([1., 1.]), tensor([1., 1.]))

In [56]:
# Sanity-check simple_mul against autograd on a 1-D input.
simpm=simple_mul()
y=torch.randn(2)
y.requires_grad=True
print(y)
outm=simpm(y)
print('outm',outm)
# No backward pass has run yet, so y.grad is still None at this point.
print('y.grad',y.grad)
# Reduce to a single value so backward() needs no explicit gradient argument.
outm_sum=outm.sum(0)
outm_sum.backward()
# print('outm.grad',outm.grad)
print('y.grad',y.grad)

tensor([0.4099, 1.5743], requires_grad=True)
outm tensor([0.3180, 1.2213], grad_fn=<MulBackward0>)
y.grad None
y.grad tensor([0.7758, 0.7758])


In [57]:
# Manual results (back(), scalar_grad) side by side with autograd's (x.grad, scalar.grad).
simpm.scalar, simpm.back(),simpm.x.grad, simpm.scalar.grad, simpm.scalar_grad,

(tensor([0.7758], requires_grad=True),
 tensor([0.7758, 0.7758]),
 tensor([0.7758, 0.7758]),
 tensor([1.9842]),
 tensor(1.9842))

In [121]:
# Chain sum1 -> mul1 -> torch.sum and run autograd through the whole graph.
sum1=simple_sum(0)
mul1=simple_mul()
j=torch.randn(3,3, requires_grad=True)
j.retain_grad()
k=sum1(j)
# retain_grad() on each intermediate so its .grad can be inspected after backward().
k.retain_grad()
l=mul1(k)
l.retain_grad()
m=torch.sum(l)
m.retain_grad()
m.backward()
print('j:',j, '\nk:', k , '\nl:',l,'\nm:', m,'\nmul1.scalar:', mul1.scalar)


j: tensor([[-0.0265, -1.7565, -0.5568],
        [ 1.8134,  0.1513, -0.2631],
        [-0.7829, -0.9893, -2.1119]], requires_grad=True) 
k: tensor([[ 1.0040, -2.5946, -2.9318]], grad_fn=<SumBackward1>) 
l: tensor([[ 0.4635, -1.1978, -1.3535]], grad_fn=<MulBackward0>) 
m: tensor(-2.0877, grad_fn=<SumBackward0>) 
mul1.scalar: tensor([0.4616], requires_grad=True)


In [130]:
# Chain rule for m = sum(mul1(sum1(j))):
# dm_dj = dm_dl*dl_dk*dk_dj
# Worked example below uses illustrative values (2 columns, scalar 0.5502);
# mul1.scalar is random per run and j is created as a 3x3 tensor.
#dm_dl -> sum ->  [1, 1]
#dl_dk -> mul1 -> [0.5502, 0.5502]
#dk_dj -> sum1 -> [[1., 1.],
#                  [1., 1.]]
#dm_dj = [[0.5502, 0.5502],
#         [0.5502, 0.5502]]

# First show each layer's manual gradient and the gradients autograd stored.
print('mul1.back()',mul1.back(), '\nmul1.scalar_grad', mul1.scalar_grad, '\nmul1.scalar.grad', mul1.scalar.grad)
print('sum1.back()',sum1.back())
print('j.grad', j.grad, j.shape, '\nk.grad',k.grad,k.shape,'\nl.grad',l.grad,l.shape,'\nm.grad',m.grad, m.shape)
print('*************** calculated with .back() in each layer********************')
# Rebuild dm_dj by multiplying the per-layer local gradients, last layer first.
mul1_back=mul1.back()
sum1_back=sum1.back()
print('dm_dl*dl_dk = dm_dl * mul1.back() ')
# dm_dl is all ones (gradient of the final torch.sum), shaped like l.
back = torch.ones_like(l.grad) * mul1_back 
print('back',back, 'back.shape',back.shape)
# Element-wise product; `back` is broadcast against sum1's local gradient,
# which has the shape of j.
back = back *sum1_back
print('dm_dj = dm_dl*dl_dk*dk_dj = dm_dl * sum1.back() * mul1.back()')
print('back',back, 'back.shape',back.shape)


mul1.back() tensor([[0.4616, 0.4616, 0.4616]]) 
mul1.scalar_grad tensor(-4.5224) 
mul1.scalar.grad tensor([-4.5224])
sum1.back() tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])
j.grad tensor([[0.4616, 0.4616, 0.4616],
        [0.4616, 0.4616, 0.4616],
        [0.4616, 0.4616, 0.4616]]) torch.Size([3, 3]) 
k.grad tensor([[0.4616, 0.4616, 0.4616]]) torch.Size([1, 3]) 
l.grad tensor([[1., 1., 1.]]) torch.Size([1, 3]) 
m.grad tensor(1.) torch.Size([])
*************** calculated with .back() in each layer********************
dm_dl*dl_dk = dm_dl * mul1.back() 
back tensor([[0.4616, 0.4616, 0.4616]]) back.shape torch.Size([1, 3])
dm_dj = dm_dl*dl_dk*dk_dj = dm_dl * sum1.back() * mul1.back()
back tensor([[0.4616, 0.4616, 0.4616],
        [0.4616, 0.4616, 0.4616],
        [0.4616, 0.4616, 0.4616]]) back.shape torch.Size([3, 3])


In [3]:
# Forward-difference estimates of the partial derivatives of A = B*C + D.
B, C, D = 5, 4, 6
h = 0.001  # finite-difference step

A = B * C + D
# Re-evaluate A with exactly one input nudged by h at a time.
Adb, Adc, Add = (B + h) * C + D, B * (C + h) + D, B * C + (D + h)

# (perturbed - base) / h for each input, in the order B, C, D.
da_db, da_dc, da_dd = ((nudged - A) / h for nudged in (Adb, Adc, Add))

print(da_db, da_dc, da_dd)

4.000000000001336 5.000000000002558 1.0000000000012221


In [7]:
# Same experiment as A = B*C + D, but C is now itself a product C = E*F,
# so dA/dE and dA/dF can be checked against the chain rule.
B, D = 5, 6
E, F = 2, 3
C = E * F

h = 0.001  # finite-difference step


def _forward_diff(nudged, base):
    """Forward-difference quotient (nudged - base) / h."""
    return (nudged - base) / h


# Base value, then one re-evaluation per nudged input.
A = B * C + D
Adb = (B + h) * C + D
Adc = B * (C + h) + D
Add = B * C + (D + h)
Ade = B * ((E + h) * F) + D
Adf = B * (E * (F + h)) + D
Cde = (E + h) * F
Cdf = E * (F + h)

# Lower-case d*_d*: numerical estimate. Upper-case D*_d*: analytic derivative.
da_db, Da_db = _forward_diff(Adb, A), C
da_dc, Da_dc = _forward_diff(Adc, A), B
da_dd, Da_dd = _forward_diff(Add, A), 1

da_de = _forward_diff(Ade, A)
da_df = _forward_diff(Adf, A)

dc_de, Dc_de = _forward_diff(Cde, C), F
dc_df, Dc_df = _forward_diff(Cdf, C), E


print(' da_db ',da_db,'\n da_dc', da_dc, '\n da_dd', da_dd,'\n dc_de', dc_de, '\n dc_df', dc_df, '\n da_de', da_de, '\n da_df', da_df)
print(f' da_df = da_dc x dc_df -> {da_df} = {da_dc} x {dc_df} \n da_de = da_dc x dc_de -> {da_de} = {da_dc} x {dc_de}')
print(f'COMPOSE FROM VARS\n da_df =  Da_dc x Dc_df -> {Da_dc*Dc_df} = {Da_dc} x {Dc_df} \n da_de = Da_dc x Dc_de -> {Da_dc*Dc_de} = {Da_dc} x {Dc_de}')

 da_db  6.000000000000227 
 da_dc 5.000000000002558 
 da_dd 0.9999999999976694 
 dc_de 3.0000000000001137 
 dc_df 1.9999999999997797 
 da_de 15.000000000000568 
 da_df 9.99999999999801
 da_df = da_dc x dc_df -> 9.99999999999801 = 5.000000000002558 x 1.9999999999997797 
 da_de = da_dc x dc_de -> 15.000000000000568 = 5.000000000002558 x 3.0000000000001137
COMPOSE FROM VARS
 da_df =  Da_dc x Dc_df -> 10 = 5 x 2 
 da_de = Da_dc x Dc_de -> 15 = 5 x 3


In [143]:
# Let autograd differentiate sum(relu(r)) and build the matching boolean mask by hand.
r = torch.randn(2, 2, requires_grad=True)
s = r.relu()
mask = r.gt(0)
t = s.sum()
t.backward()
r, s, mask, t

(tensor([[-2.1073,  0.7434],
         [-2.1842, -0.2667]], requires_grad=True),
 tensor([[0.0000, 0.7434],
         [0.0000, 0.0000]], grad_fn=<ReluBackward0>),
 tensor([[False,  True],
         [False, False]]),
 tensor(0.7434, grad_fn=<SumBackward0>))

In [144]:
# Gradient autograd stored on r after t.backward().
r.grad

tensor([[0., 1.],
        [0., 0.]])

In [145]:
# The boolean mask as floats, for comparison with r.grad.
mask*1.0

tensor([[0., 1.],
        [0., 0.]])