In [2]:
import torch
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn import functional as F
import numpy as np

In [2]:
torch.set_printoptions(precision=2, sci_mode=False)

In [3]:
x = torch.randint(-10, 10, (128,3)).float()
y = (x[:,0]*(np.cos(x[:,1]) + np.exp(x[:,1]/3)) + x[:,2]) / 10

In [4]:
batch_size = 4
data = [(xi,yi) for xi,yi in zip(x, y)]
loader = DataLoader(data, batch_size=batch_size, shuffle=True)

In [5]:
for x,y in loader:
    print(x)
    print(y)
    break

tensor([[ 3., -1.,  0.],
        [ 4., -6.,  8.],
        [-2.,  5.,  4.],
        [ 0., -5.,  6.]])
tensor([ 0.38,  1.24, -0.72,  0.60])


In [6]:
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.fc1 = nn.Linear(3,2)
        self.fc2 = nn.Linear(2,2)
        self.fc3 = nn.Linear(2,1)
        self.relu = nn.ReLU()
        
        self.fc1.register_forward_hook(self._forward_hook)
        self.fc1.register_full_backward_hook(self._backward_hook)
        self.fc2.register_forward_hook(self._forward_hook)
        self.fc2.register_full_backward_hook(self._backward_hook)
        self.fc3.register_forward_hook(self._forward_hook)
        self.fc3.register_full_backward_hook(self._backward_hook)
    
    def forward(self, x):
        y = self.fc1(x)
        y = self.relu(y)
        y = self.fc2(y)
        y = self.relu(y)
        return self.fc3(y)

    def _forward_hook(self, module, inp, output):
        print(type(inp))
        print(len(inp))
        print(type(output))
        print(inp)
        print(output)
        print()

    def _backward_hook(self, module, grad_input, grad_output):
        print(type(grad_output))
        print(len(grad_output))
        print(type(grad_input))
        print(len(grad_input))
        print(grad_output)
        print(grad_input)
        print()


In [7]:
model = Model()

In [8]:
x

tensor([[ 3., -1.,  0.],
        [ 4., -6.,  8.],
        [-2.,  5.,  4.],
        [ 0., -5.,  6.]])

In [9]:
y_hat = model(x)

<class 'tuple'>
1
<class 'torch.Tensor'>
(tensor([[ 3., -1.,  0.],
        [ 4., -6.,  8.],
        [-2.,  5.,  4.],
        [ 0., -5.,  6.]]),)
tensor([[-0.10,  1.31],
        [-0.57, -1.47],
        [ 0.74, -0.91],
        [-0.37, -2.63]], grad_fn=<AddmmBackward0>)

<class 'tuple'>
1
<class 'torch.Tensor'>
(tensor([[0.00, 1.31],
        [0.00, 0.00],
        [0.74, 0.00],
        [0.00, 0.00]], grad_fn=<BackwardHookFunctionBackward>),)
tensor([[-0.63, -0.48],
        [-0.69,  0.43],
        [-0.44,  0.46],
        [-0.69,  0.43]], grad_fn=<AddmmBackward0>)

<class 'tuple'>
1
<class 'torch.Tensor'>
(tensor([[0.00, 0.00],
        [0.00, 0.43],
        [0.00, 0.46],
        [0.00, 0.43]], grad_fn=<BackwardHookFunctionBackward>),)
tensor([[-0.29],
        [-0.01],
        [ 0.00],
        [-0.01]], grad_fn=<AddmmBackward0>)



In [10]:
y_hat.squeeze(), y

(tensor([-0.29, -0.01,  0.00, -0.01], grad_fn=<SqueezeBackward0>),
 tensor([ 0.38,  1.24, -0.72,  0.60]))

In [11]:
torch.mean((y - y_hat.squeeze())**2)

tensor(0.73, grad_fn=<MeanBackward0>)

In [12]:
mse = nn.MSELoss()
loss = mse(y_hat, y.unsqueeze(1))
loss

tensor(0.73, grad_fn=<MseLossBackward0>)

In [13]:
loss.backward()

<class 'tuple'>
1
<class 'tuple'>
1
(tensor([[-0.33],
        [-0.63],
        [ 0.36],
        [-0.31]]),)
(tensor([[-0.23, -0.22],
        [-0.43, -0.41],
        [ 0.25,  0.23],
        [-0.21, -0.20]]),)

<class 'tuple'>
1
<class 'tuple'>
1
(tensor([[ 0.00,  0.00],
        [ 0.00, -0.41],
        [ 0.00,  0.23],
        [ 0.00, -0.20]]),)
(tensor([[ 0.00,  0.00],
        [-0.02,  0.28],
        [ 0.01, -0.16],
        [-0.01,  0.14]]),)

<class 'tuple'>
1
<class 'tuple'>
1
(tensor([[0.00, 0.00],
        [0.00, 0.00],
        [0.01, 0.00],
        [0.00, 0.00]]),)
(None,)



In [14]:
model.fc1.weight, model.fc2.weight, model.fc3.weight

(Parameter containing:
 tensor([[-0.03,  0.11,  0.01],
         [ 0.48,  0.21, -0.27]], requires_grad=True),
 Parameter containing:
 tensor([[ 0.34,  0.05],
         [ 0.04, -0.69]], requires_grad=True),
 Parameter containing:
 tensor([[0.68, 0.65]], requires_grad=True))

In [15]:
model.fc1.weight.grad, model.fc2.weight.grad, model.fc3.weight.grad

(tensor([[-0.02,  0.04,  0.03],
         [ 0.00,  0.00,  0.00]]),
 tensor([[0.00, 0.00],
         [0.17, 0.00]]),
 tensor([[ 0.00, -0.24]]))

# Forward

In [16]:
torch.matmul(x, model.fc1.weight.T)

tensor([[-0.19,  1.23],
        [-0.67, -1.55],
        [ 0.64, -0.99],
        [-0.46, -2.71]], grad_fn=<MmBackward0>)

In [17]:
z1 = torch.matmul(x, model.fc1.weight.T) + model.fc1.bias
z1

tensor([[-0.10,  1.31],
        [-0.57, -1.47],
        [ 0.74, -0.91],
        [-0.37, -2.63]], grad_fn=<AddBackward0>)

In [18]:
R1 = torch.ones_like(z1)
R1[z1<0] = 0
R1

tensor([[0., 1.],
        [0., 0.],
        [1., 0.],
        [0., 0.]])

In [19]:
a1 = F.relu(z1)
a1

tensor([[0.00, 1.31],
        [0.00, 0.00],
        [0.74, 0.00],
        [0.00, 0.00]], grad_fn=<ReluBackward0>)

In [20]:
torch.matmul(a1, model.fc2.weight.T)

tensor([[ 0.06, -0.91],
        [ 0.00,  0.00],
        [ 0.25,  0.03],
        [ 0.00,  0.00]], grad_fn=<MmBackward0>)

In [21]:
z2 = torch.matmul(a1, model.fc2.weight.T) + model.fc2.bias
z2

tensor([[-0.63, -0.48],
        [-0.69,  0.43],
        [-0.44,  0.46],
        [-0.69,  0.43]], grad_fn=<AddBackward0>)

In [22]:
R2 = torch.ones_like(z2)
R2[z2<0] = 0
R2

tensor([[0., 0.],
        [0., 1.],
        [0., 1.],
        [0., 1.]])

In [23]:
a2 = F.relu(z2)
a2

tensor([[0.00, 0.00],
        [0.00, 0.43],
        [0.00, 0.46],
        [0.00, 0.43]], grad_fn=<ReluBackward0>)

In [24]:
z3 = torch.matmul(a2, model.fc3.weight.T) + model.fc3.bias
z3

tensor([[-0.29],
        [-0.01],
        [ 0.00],
        [-0.01]], grad_fn=<AddBackward0>)

# Backward

In [25]:
c = 2 / batch_size
d3 = (z3 - y.unsqueeze(1))*c
d3

tensor([[-0.33],
        [-0.63],
        [ 0.36],
        [-0.31]], grad_fn=<MulBackward0>)

In [26]:
torch.sum(d3*a2, dim=0)

tensor([ 0.00, -0.24], grad_fn=<SumBackward1>)

In [27]:
torch.matmul(d3, model.fc3.weight)

tensor([[-0.23, -0.22],
        [-0.43, -0.41],
        [ 0.25,  0.23],
        [-0.21, -0.20]], grad_fn=<MmBackward0>)

In [28]:
d2 = torch.matmul(d3, model.fc3.weight)*R2
d2

tensor([[-0.00, -0.00],
        [-0.00, -0.41],
        [ 0.00,  0.23],
        [-0.00, -0.20]], grad_fn=<MulBackward0>)

In [29]:
torch.matmul(d2.T, a1)

tensor([[0.00, 0.00],
        [0.17, 0.00]], grad_fn=<MmBackward0>)

In [30]:
torch.matmul(d2, model.fc2.weight)

tensor([[ 0.00,  0.00],
        [-0.02,  0.28],
        [ 0.01, -0.16],
        [-0.01,  0.14]], grad_fn=<MmBackward0>)

In [31]:
d1 = torch.matmul(d2, model.fc2.weight)*R1
d1

tensor([[0.00, 0.00],
        [-0.00, 0.00],
        [0.01, -0.00],
        [-0.00, 0.00]], grad_fn=<MulBackward0>)

In [32]:
torch.matmul(d1.T, x)

tensor([[-0.02,  0.04,  0.03],
        [ 0.00,  0.00,  0.00]], grad_fn=<MmBackward0>)

# Abstraction Experiment

In [75]:
np.set_printoptions(precision=4, suppress=True)

In [322]:
b = 128
d_in = 512
d_out = 128

In [323]:
G = np.random.rand(b,d_in)
W = np.random.rand(d_in,d_in/2)
R = np.random.randint(2, size=(b, d_in/2))
W2 = np.random.rand(d_in/2, d_in)

In [325]:
Gp = G@W*R@W2
Gp.shape

(128, 128)

In [326]:
delta = 1e-9
#M1 = np.linalg.inv(G.T@G+delta*np.eye(d))@G.T@Gp
M2 = np.linalg.solve(G.T@G+delta*np.eye(d_in), G.T@Gp)
#G@M1
G@M2

array([[1786.1721, 1955.2175, 1816.2568, ..., 1796.5105, 1894.5028,
        1978.8702],
       [1842.8868, 2095.1533, 2065.6891, ..., 2150.0277, 2133.2831,
        2143.3209],
       [1852.1479, 2166.2636, 2067.5268, ..., 1956.8618, 2020.4607,
        1997.3477],
       ...,
       [1709.8471, 1959.5511, 1865.9934, ..., 1833.1756, 1798.5266,
        2060.535 ],
       [1770.6676, 1794.436 , 1751.9749, ..., 1741.3164, 1972.1624,
        1878.5704],
       [2089.3307, 2368.257 , 2154.0787, ..., 2414.3493, 2346.2531,
        2413.0108]])

In [327]:
#np.linalg.norm(Gp - G@M1)
np.linalg.norm(Gp - G@M2)

0.006558552686109813

In [328]:
M2

array([[ -561.48  ,  -754.9675,  -544.6181, ...,  -784.4288, -1207.049 ,
         -626.1764],
       [ -424.4914,  -278.2212,  -267.1357, ...,  -394.6129,  -376.3098,
         -437.4163],
       [ -929.9578, -1709.0888,  -981.3751, ..., -1082.0538, -1363.0619,
        -1410.1297],
       ...,
       [ -481.8205, -1032.0656,  -562.0955, ...,  -460.2439,  -335.0367,
         -814.9755],
       [ -854.3841,  -892.7037,  -608.2119, ...,  -956.6669, -1291.7438,
         -970.9711],
       [-1417.286 , -1943.2019, -1398.9774, ..., -1743.5616, -2019.2089,
        -1928.0901]])

In [343]:
W2 * R.mean(axis=0).reshape(-1, 1)

array([[0.2465, 0.1876, 0.0378, ..., 0.3044, 0.0731, 0.4655],
       [0.3083, 0.3683, 0.0309, ..., 0.2417, 0.1843, 0.3492],
       [0.2663, 0.1967, 0.3579, ..., 0.4696, 0.4908, 0.4162],
       ...,
       [0.3628, 0.1362, 0.1674, ..., 0.342 , 0.5531, 0.3067],
       [0.2082, 0.3143, 0.2691, ..., 0.3634, 0.2865, 0.3306],
       [0.3245, 0.5116, 0.2695, ..., 0.3301, 0.3837, 0.3764]])

In [336]:
M3 = W@(W2 * R.mean(axis=0).reshape(-1, 1))

In [340]:
M3

array([[28.409 , 32.1285, 29.8569, ..., 30.78  , 32.1528, 31.7196],
       [31.5173, 34.1343, 31.9446, ..., 34.0527, 33.7582, 33.29  ],
       [29.2228, 32.6438, 31.1576, ..., 32.001 , 33.1844, 32.7538],
       ...,
       [31.3355, 33.8502, 30.1046, ..., 33.1273, 32.0806, 33.9928],
       [29.3544, 34.0876, 31.9171, ..., 33.363 , 32.4551, 33.4492],
       [29.7985, 33.897 , 31.4695, ..., 33.0704, 33.4874, 33.2684]])

In [338]:
np.linalg.norm(Gp - G@M3)

19416.35243953515

In [339]:
np.linalg.norm(Gp - G@W)

ValueError: operands could not be broadcast together with shapes (128,128) (128,256) 

In [316]:
G@M2

array([[8.2206, 7.4171, 9.3154, ..., 7.6118, 7.7293, 5.8584],
       [9.3547, 8.2342, 9.0923, ..., 7.273 , 9.2764, 6.5277],
       [8.353 , 6.6033, 6.2257, ..., 7.5358, 7.1936, 6.3307],
       ...,
       [8.7655, 7.0064, 7.0619, ..., 4.5887, 6.3798, 6.6948],
       [6.3136, 5.1219, 5.7426, ..., 4.7886, 5.4348, 4.5458],
       [8.7906, 7.8088, 8.469 , ..., 6.3153, 6.6372, 6.1693]])

In [317]:
G@M3

array([[10.0282,  8.553 ,  9.6917, ...,  7.834 ,  9.0316,  8.1951],
       [ 8.8574,  7.7185,  8.357 , ...,  6.3843,  7.5922,  6.9048],
       [ 8.5076,  7.5449,  7.9306, ...,  6.8513,  7.7407,  7.0388],
       ...,
       [10.4831,  9.7266,  9.741 , ...,  7.6903,  8.9856,  8.9292],
       [ 8.5683,  7.8353,  7.2812, ...,  5.9574,  7.7971,  6.8838],
       [ 8.5757,  7.2657,  7.9985, ...,  6.6236,  7.4361,  6.8287]])

In [219]:
Gp

array([[4.4923, 4.8916, 4.1447, ..., 4.3604, 4.5179, 4.3836],
       [3.9708, 2.8232, 5.5823, ..., 3.8476, 3.7759, 4.7226],
       [4.63  , 3.42  , 5.058 , ..., 5.2431, 4.5323, 4.9604],
       ...,
       [4.3227, 3.2854, 4.2373, ..., 3.9835, 4.8602, 4.214 ],
       [4.6283, 3.7563, 4.3044, ..., 4.1336, 3.571 , 3.5449],
       [3.0164, 3.2921, 4.9042, ..., 3.172 , 4.2198, 4.4461]])

In [9]:
import copy

In [6]:
W = np.array([[1, 1], [-1, -1]])
X = np.array([[1, 2],[-1, 2], [2, -1], [1, -2]]).T

In [7]:
W, X

(array([[ 1,  1],
        [-1, -1]]),
 array([[ 1, -1,  2,  1],
        [ 2,  2, -1, -2]]))

In [13]:
RX = copy.deepcopy(X)
RX[X<0] = 0
RX

array([[1, 0, 2, 1],
       [2, 2, 0, 0]])

In [15]:
Y = W@RX
Y

array([[ 3,  2,  2,  1],
       [-3, -2, -2, -1]])

In [16]:
M = Y@X.T@np.linalg.inv(X@X.T)

In [17]:
M

array([[ 1.36,  0.88],
       [-1.36, -0.88]])

In [20]:
np.linalg.norm(M@X-Y)

3.0199337741082997

In [22]:
M@X, Y

(array([[ 3.12,  0.4 ,  1.84, -0.4 ],
        [-3.12, -0.4 , -1.84,  0.4 ]]),
 array([[ 3,  2,  2,  1],
        [-3, -2, -2, -1]]))