In [1]:
import initialize
from mistify import _functional as F
import torch

In [2]:
def create_input_dataset1(shape, dropout_p: bool=0.5, exp: float=1.0, require_grad: bool=False):

    data = torch.rand(*shape) ** exp
    data = data * (torch.rand(shape) >= dropout_p)
    if require_grad:
        data.requires_grad_()
        data.retain_grad()
    return data



This notebook is to check if the straight through estimators succeed in optimizing.

In [3]:
import typing

def optim_op(f, x1: torch.Tensor, t, n=400, p: typing.List[torch.Tensor]=None, **kwargs):
    x1.requires_grad_()
    if p is None:
        optim = torch.optim.Adam([x1], lr=1e-2)
    else:
        print('setting p')
        optim = torch.optim.Adam(p, lr=1e-2)
    for i in range(n):
        optim.zero_grad()
        y = f(x1, **kwargs)
        loss = (y - t).pow(2).mean()
        loss.backward()
        # print('Grad: ', p[0].grad.abs().sum().item(), p[1].grad.abs().sum().item(), loss.item())
        optim.step()
    print('Looped : ', i, ' times')

In [4]:

# Test union
torch.manual_seed(1)
x1_b = torch.rand(4, 4)
x2 = torch.rand(4, 4)
t = F.union(x1_b, x2).detach()

x1 = torch.rand(4, 4, requires_grad=True)
optim_op(F.union, x1, t, n=500, x2=x2, g=F.ClipG(0.1))
torch.sqrt((x1 - x1_b).pow(2).mean()).item()

print(F.union(x1, x2)) 
print(F.union(x1_b, x2))

# print(x1_b.detach(), x1.detach(), x2.detach())


HERE
HERE
Looped :  1  times
tensor([[0.8101, 0.4980, 0.9371, 0.6556],
        [0.9191, 0.4367, 0.6940, 0.2876],
        [0.9706, 0.5239, 0.8550, 0.7718],
        [0.2566, 0.8100, 0.6397, 0.9743]], grad_fn=<MaximumBackward0>)
tensor([[0.7576, 0.4980, 0.9371, 0.7347],
        [0.3138, 0.7999, 0.4162, 0.7544],
        [0.5695, 0.5239, 0.7981, 0.7718],
        [0.6826, 0.8100, 0.6397, 0.9743]])


In [5]:
# Test union 2 (both trainable)
torch.manual_seed(1)
x1_b = torch.rand(8, 4)
x2_b = torch.rand(8, 4)
t = F.union(x1_b, x2_b).detach()

x1 = create_input_dataset1([8, 4], 0.75, require_grad=True)
x2 = create_input_dataset1([8, 4], 0.75, require_grad=True)

optim_op(F.union, x1, t, n=500, x2=x2, p=[x1, x2], g=F.MulG(0.1))
torch.sqrt((x1 - x1_b).pow(2).mean()).item()

print(F.union(x1, x2)) 
print(F.union(x1_b, x2_b))


setting p
Looped :  499  times
tensor([[0.8300, 0.2793, 0.4031, 0.7347],
        [0.9391, 0.7999, 0.7140, 0.7544],
        [0.9906, 0.4388, 0.8750, 0.5247],
        [0.6826, 0.7570, 0.4635, 0.6471],
        [0.5725, 0.4980, 0.9371, 0.6556],
        [0.7713, 0.3785, 0.9980, 0.9008],
        [0.4766, 0.5239, 0.8045, 0.7718],
        [0.1768, 0.8248, 0.8036, 0.9743]], grad_fn=<MaximumBackward0>)
tensor([[0.8300, 0.2793, 0.4031, 0.7347],
        [0.9391, 0.7999, 0.7140, 0.7544],
        [0.9906, 0.4388, 0.8750, 0.5247],
        [0.6826, 0.7570, 0.4635, 0.6471],
        [0.5725, 0.4980, 0.9371, 0.6556],
        [0.7713, 0.3785, 0.9980, 0.9008],
        [0.4766, 0.5239, 0.8045, 0.7718],
        [0.1768, 0.8248, 0.8036, 0.9743]])


In [6]:
# Test union
torch.manual_seed(1)
x1_b = torch.rand(4, 4)
x2 = torch.rand(4, 4)
t = F.inter(x1_b, x2).detach()

x1 = torch.rand(4, 4, requires_grad=True)
optim_op(F.inter, x1, t, x2=x2, g=F.ClipG(0.1))
torch.sqrt((x1 - x1_b).pow(2).mean()).item()


print(F.inter(x1, x2)) 
print(F.inter(x1_b, x2))

Looped :  399  times
tensor([[0.5725, 0.2793, 0.4031, 0.6556],
        [0.0293, 0.1980, 0.3971, 0.2843],
        [0.3398, 0.4388, 0.6387, 0.5247],
        [0.0112, 0.3051, 0.4635, 0.4550]], grad_fn=<MinimumBackward0>)
tensor([[0.5725, 0.2793, 0.4031, 0.6556],
        [0.0293, 0.1980, 0.3971, 0.2843],
        [0.3398, 0.4388, 0.6387, 0.5247],
        [0.0112, 0.3051, 0.4635, 0.4550]])


In [7]:
# Test inter on
torch.manual_seed(1)
x1_b = torch.rand(32, 4)
# x2 = torch.rand(4, 4)
t = F.inter_on(x1_b, -1, False).detach()

x1 = torch.rand(32, 4, requires_grad=True)
optim_op(F.inter_on, x1, t, dim=-1, keepdim=False, g=F.MulG(0.01))
torch.sqrt((x1 - x1_b).pow(2).mean()).item()
print(F.inter_on(x1, dim=-1)) 
print(F.inter_on(x1_b, dim=-1))

Looped :  399  times
tensor([0.2793, 0.0293, 0.4388, 0.3051, 0.4980, 0.1980, 0.3398, 0.0112, 0.0246,
        0.2676, 0.2885, 0.2346, 0.0193, 0.3785, 0.1663, 0.1768, 0.2197, 0.1205,
        0.5285, 0.0369, 0.1227, 0.0555, 0.1251, 0.1652, 0.2873, 0.4075, 0.0748,
        0.6030, 0.1682, 0.1381, 0.1842, 0.0299], grad_fn=<MinBackward0>)
tensor([0.2793, 0.0293, 0.4388, 0.3051, 0.4980, 0.1980, 0.3398, 0.0112, 0.0246,
        0.2676, 0.2885, 0.2346, 0.0193, 0.3785, 0.1663, 0.1768, 0.2197, 0.1205,
        0.5285, 0.0369, 0.1227, 0.0555, 0.1251, 0.1652, 0.2873, 0.4075, 0.0748,
        0.6036, 0.1682, 0.1381, 0.1842, 0.0299])


In [76]:
# Test 1 layer

# This shows if i can optimize a single layer

torch.manual_seed(1)

g = F.MulG(0.1)

def f(x1, x2):
    return F.inter_on(F.union(x1, x2), dim=-1)

x1_b = create_input_dataset1([32, 4, 1], 0.75, require_grad=True)
x2_b = create_input_dataset1([1, 4, 8], 0.75, require_grad=True)
t = f(x1_b, x2_b).detach()

x1 = create_input_dataset1([32, 4, 1], 0.75, require_grad=True)
x2 = create_input_dataset1([1, 4, 8], 0.75, require_grad=True)

print((f(x1, x2) - f(x1_b, x2_b)).pow(2).mean())
optim_op(f, x1, t, n=1, x2=x2, p=[x1, x2])
torch.sqrt((x1 - x1_b).pow(2).mean()).item()

print((f(x1, x2) - f(x1_b, x2_b)).pow(2).mean()) 
# print()

tensor(0.1168, grad_fn=<MeanBackward0>)
Grad:  0.2834576666355133 0.09538086503744125 0.11684583872556686
Looped :  0  times
tensor(0.1131, grad_fn=<MeanBackward0>)


In [5]:
# Test 2 layers

# This shows if i can optimize a single layer

import torch.nn as nn
torch.manual_seed(1)

g = F.MulG(0.0001)

def layer(x1, w, g=None):

    return F.inter_on(F.union(x1, w, g=g), dim=-2, g=g)

def net(x1, w1, w2):

    y = layer(x1, w1)
    y = y.unsqueeze(-1)
    return layer(y, w2)

x1_b = torch.rand(32, 32) ** 2

t = nn.Linear(32, 6)(x1_b).detach()
x1_b = x1_b.unsqueeze(-1)

# x1_b = create_input_dataset1([32, 4, 1], 0, require_grad=False)
# w1_b = create_input_dataset1([1, 32, 8], 0.5, require_grad=False)
# w2_b = create_input_dataset1([1, 8, 6], 0.5, require_grad=False)

# t = net(x1_b, w1_b, w2_b).detach()

# x1 = create_input_dataset1([32, 4, 1], 0.75, require_grad=False)
w1 = create_input_dataset1([1, 32, 8], 0.5, require_grad=True)
w2 = create_input_dataset1([1, 8, 6], 0.5, require_grad=True)

# print((w1 - w1_b).pow(2).mean())
# print(t)

print((net(x1_b, w1, w2) - t).pow(2).mean()) 
w1_clone = torch.clone(w1).detach()

optim_op(net, x1_b, t, n=2, w1=w1, w2=w2, p=[w1, w2])
# torch.sqrt((x1 - x1_b).pow(2).mean()).item()
print((w1 - w1_clone).pow(2).mean())

print((net(x1_b, w1, w2) - t).pow(2).mean()) 
# print()

tensor(0.0640, grad_fn=<MeanBackward0>)
setting p
Grad:  0.0 0.0 0.06397570669651031
Grad:  0.0 0.0 0.06397570669651031
Looped :  1  times
tensor(0., grad_fn=<MeanBackward0>)
tensor(0.0640, grad_fn=<MeanBackward0>)


In [7]:
# Test union on
torch.manual_seed(1)
x1_b = torch.rand(16, 4)
t = F.union_on(x1_b, -1, False).detach()

x1 = torch.rand(16, 4, requires_grad=True)
optim_op(F.union_on, x1, t, dim=-1, n=400, keepdim=False, g=F.MulG(0.01))
# torch.sqrt((x1 - x1_b).pow(2).mean()).item()


print(F.union_on(x1, dim=-1)) 
print(F.union_on(x1_b, dim=-1))

tensor([0.7576, 0.7999, 0.6387, 0.6826, 0.9371, 0.4162, 0.7981, 0.9743, 0.8300,
        0.9391, 0.9906, 0.7570, 0.4452, 0.9980, 0.8045, 0.9434],
       grad_fn=<MaxBackward0>)
tensor([0.7576, 0.7999, 0.6387, 0.6826, 0.9371, 0.4162, 0.7981, 0.9743, 0.8300,
        0.9391, 0.9906, 0.7570, 0.4452, 0.9980, 0.8045, 0.9434])


In [7]:
# Test binary
torch.manual_seed(2)
x1_b = torch.rand(4, 4)
# x2 = torch.rand(4, 4)
t = F.binarize(x1_b).detach()

x1 = torch.rand(4, 4, requires_grad=True)
optim_op(F.binarize, x1, t, n=1000, g=True, clip=0.1)
print(torch.sqrt((x1 - x1_b).pow(2).mean()).item())
print(F.binarize(x1), F.binarize(x1_b))

0.1590016782283783
tensor([[1., 0., 1., 0.],
        [1., 1., 0., 0.],
        [1., 0., 1., 1.],
        [0., 1., 1., 1.]], grad_fn=<BinaryGBackward>) tensor([[1., 0., 1., 0.],
        [1., 1., 0., 0.],
        [1., 0., 1., 1.],
        [0., 1., 1., 1.]])


In [8]:
# Test binary
torch.manual_seed(2)
x1_b = torch.randn(4, 4)
# x2 = torch.rand(4, 4)
t = F.signify(x1_b).detach()

x1 = torch.rand(4, 4, requires_grad=True)
optim_op(F.signify, x1, t, n=1000, g=True, clip=0.1)
print(torch.sqrt((x1 - x1_b).pow(2).mean()).item())
print(F.signify(x1)) 
print(F.signify(x1_b))

0.7361576557159424
tensor([[-1.,  1., -1., -1.],
        [-1.,  1., -1., -1.],
        [-1.,  1., -1., -1.],
        [ 1., -1., -1., -1.]], grad_fn=<SignGBackward>)
tensor([[-1.,  1., -1., -1.],
        [-1.,  1., -1., -1.],
        [-1.,  1., -1., -1.],
        [ 1., -1., -1., -1.]])


In [9]:
# Test clamp
torch.manual_seed(2)
x1_b = torch.rand(4, 4) * 3 - 1
# x2 = torch.rand(4, 4)
t = F.clamp(x1_b).detach()

x1 = torch.rand(4, 4, requires_grad=True)
optim_op(F.clamp, x1, t, n=2000, g=True, clip=0.1)
print(torch.sqrt((x1 - x1_b).pow(2).mean()).item())
print(F.clamp(x1)) 
print(F.clamp(x1_b))

0.3310493528842926
tensor([[0.8441, 0.1430, 0.9113, 0.4234],
        [1.0000, 0.8571, 0.3276, 0.0000],
        [0.8425, 0.0000, 0.6971, 0.5997],
        [0.1702, 1.0000, 0.6001, 1.0000]], grad_fn=<ClampGBackward>)
tensor([[0.8441, 0.1430, 0.9113, 0.4234],
        [1.0000, 0.8571, 0.3276, 0.0000],
        [0.8425, 0.0000, 0.6971, 0.5997],
        [0.1702, 1.0000, 0.6001, 1.0000]])


In [7]:
# Test bounded union on
torch.manual_seed(1)
x1_b = torch.rand(4, 4) ** 4
# x2 = torch.rand(4, 4)
t = F.bounded_union_on(x1_b, -1, False).detach()

x1 = torch.rand(4, 4, requires_grad=True)
optim_op(F.bounded_union_on, x1, t, dim=-1, keepdim=False, g=True, clip=0.1)
torch.sqrt((x1 - x1_b).pow(2).mean()).item()
print(F.bounded_union_on(x1, dim=-1, keepdim=False)) 
print(F.bounded_union_on(x1_b, dim=-1, keepdim=False))

tensor([0.6533, 0.7580, 0.3844, 0.3148], grad_fn=<ClampGBackward>)
tensor([0.6533, 0.7580, 0.3844, 0.3148])


In [6]:
# Test bounded inter on
torch.manual_seed(1)
x1_b = torch.rand(4, 4) ** 0.25
# x2 = torch.rand(4, 4)
t = F.bounded_inter_on(x1_b, -1, False).detach()

x1 = torch.rand(4, 4, requires_grad=True)
optim_op(F.bounded_inter_on, x1, t, dim=-1, keepdim=False, g=True, clip=0.1)
torch.sqrt((x1 - x1_b).pow(2).mean()).item()
print(F.bounded_inter_on(x1, dim=-1, keepdim=False)) 
print(F.bounded_inter_on(x1_b, dim=-1, keepdim=False))

tensor([0.3826, 0.0852, 0.4276, 0.2986], grad_fn=<ReluBackward0>)
tensor([0.3826, 0.0852, 0.4276, 0.2986])


In [4]:

# Test bounded union
torch.manual_seed(1)
x1_b = torch.rand(4, 4) ** 4
x2 = torch.rand(4, 4)
t = F.bounded_union(x1_b, x2).detach()

x1 = torch.rand(4, 4, requires_grad=True)
optim_op(F.bounded_union, x1, t, x2=x2, g=True)
torch.sqrt((x1 - x1_b).pow(2).mean()).item()
print(F.bounded_union(x1, x2)) 
print(F.bounded_union(x1_b, x2))

tensor([[0.9020, 0.5041, 0.9635, 0.9469],
        [0.3138, 0.6073, 0.4411, 0.6082],
        [0.4450, 0.5610, 0.9645, 0.8475],
        [0.2283, 0.8186, 0.6859, 1.0000]], grad_fn=<ClampBackward1>)
tensor([[0.9020, 0.5041, 0.9635, 0.9469],
        [0.3138, 0.6073, 0.4411, 0.6082],
        [0.4450, 0.5610, 0.9645, 0.8475],
        [0.2283, 0.8186, 0.6859, 1.0000]])


In [5]:

# Test bounded inter
torch.manual_seed(1)
x1_b = torch.rand(4, 4) ** 0.5
x2 = torch.rand(4, 4) ** 0.5
t = F.bounded_inter(x1_b, x2).detach()

x1 = torch.rand(4, 4, requires_grad=True)
optim_op(F.bounded_inter, x1, t, x2=x2, g=True)
torch.sqrt((x1 - x1_b).pow(2).mean()).item()
print(F.bounded_inter(x1, x2)) 
print(F.bounded_inter(x1_b, x2))

tensor([[0.6270, 0.2342, 0.6029, 0.6668],
        [0.0000, 0.3393, 0.2753, 0.4018],
        [0.3376, 0.3862, 0.6925, 0.6028],
        [0.0000, 0.4524, 0.4806, 0.6616]], grad_fn=<ReluBackward0>)
tensor([[0.6270, 0.2342, 0.6029, 0.6668],
        [0.0000, 0.3393, 0.2753, 0.4018],
        [0.3376, 0.3862, 0.6925, 0.6028],
        [0.0000, 0.4524, 0.4806, 0.6616]])
