In [1]:
import sys
# Prepend the parent directory to the module search path.
# Presumably this makes the local `vardl` checkout importable — confirm
# against the repository layout.
sys.path.insert(0, '..')

In [2]:
%load_ext autoreload
%autoreload 2
import vardl
import torch
import torch.nn as nn
import sklearn.datasets
import numpy as np

  from ._conv import register_converters as _register_converters


In [3]:
# Two Bayesian linear layers sized 5 -> 2 and 2 -> 1. Both share the same
# settings: local_reparameterization=True, approx='factorized',
# nmc_test=1 and nmc_train=1.
layer0 = vardl.layers.BayesianLinear(
    in_features=5,
    out_features=2,
    local_reparameterization=True,
    approx='factorized',
    nmc_test=1,
    nmc_train=1,
)
layer1 = vardl.layers.BayesianLinear(
    in_features=2,
    out_features=1,
    local_reparameterization=True,
    approx='factorized',
    nmc_test=1,
    nmc_train=1,
)

In [4]:
# Display the first layer's repr to check how it was configured.
layer0

BayesianLinear(
  in_features=5, out_features=2, bias=False, local_repr=True
  (prior_W): MatrixGaussianDistribution(approx=factorized)
  (q_posterior_W): MatrixGaussianDistribution(approx=factorized)
)

In [5]:
# Chain the two layers so that layer0's output is fed into layer1.
arch = nn.Sequential(layer0, layer1)

In [6]:
# Display the assembled sequential architecture.
arch

Sequential(
  (0): BayesianLinear(
    in_features=5, out_features=2, bias=False, local_repr=True
    (prior_W): MatrixGaussianDistribution(approx=factorized)
    (q_posterior_W): MatrixGaussianDistribution(approx=factorized)
  )
  (1): BayesianLinear(
    in_features=2, out_features=1, bias=False, local_repr=True
    (prior_W): MatrixGaussianDistribution(approx=factorized)
    (q_posterior_W): MatrixGaussianDistribution(approx=factorized)
  )
)

In [7]:
# Wrap the sequential architecture in a regression model using float32.
# NOTE(review): the keyword is spelled `architecure`. The cell executed, so
# this presumably matches the library's signature — confirm before "fixing".
model = vardl.models.RegrBayesianNet(architecure=arch, 
                                     dtype=torch.float32)

In [8]:
# Display the full model (architecture plus likelihood).
model

RegrBayesianNet(
  (architecture): Sequential(
    (0): BayesianLinear(
      in_features=5, out_features=2, bias=False, local_repr=True
      (prior_W): MatrixGaussianDistribution(approx=factorized)
      (q_posterior_W): MatrixGaussianDistribution(approx=factorized)
    )
    (1): BayesianLinear(
      in_features=2, out_features=1, bias=False, local_repr=True
      (prior_W): MatrixGaussianDistribution(approx=factorized)
      (q_posterior_W): MatrixGaussianDistribution(approx=factorized)
    )
  )
  (likelihood): Gaussian()
)

In [9]:
# Print each named parameter together with its requires_grad flag.
for name, par in model.named_parameters():
    print(f"{name} {par.requires_grad}")

architecture.0.prior_W._mean False
architecture.0.prior_W._logvars False
architecture.0.q_posterior_W._mean True
architecture.0.q_posterior_W._logvars True
architecture.1.prior_W._mean False
architecture.1.prior_W._logvars False
architecture.1.q_posterior_W._mean True
architecture.1.q_posterior_W._logvars True
likelihood.log_noise_var True


In [10]:
# Synthetic regression problem: 100000 samples, 5 features (all informative),
# one target, noise=np.exp(0) (i.e. 1.0) and a fixed random_state.
# With coef=True the call also returns the generating coefficients as `W`.
X, Y, W = sklearn.datasets.make_regression(
    n_samples=100000,
    n_features=5,  #100
    n_informative=5,
    n_targets=1,
    bias=0,
    effective_rank=None,
    noise=np.exp(0),
    shuffle=False,
    coef=True,
    random_state=0,
)

# Convert both arrays to float32 tensors; the targets are reshaped into a
# single column first.
X = torch.from_numpy(X).to(torch.float32)
Y = torch.from_numpy(Y.reshape(-1, 1)).to(torch.float32)

In [11]:
from torch.utils.data import DataLoader, TensorDataset

# Pair inputs with targets, then serve them in shuffled mini-batches of 256.
# The last partial batch is kept and loading happens in the main process.
dataset = TensorDataset(X, Y)
dataloader = DataLoader(
    dataset,
    batch_size=256,
    shuffle=True,
    drop_last=False,
    num_workers=0,
)

In [12]:
# Create the logger that is handed to the trainer; it is given '../work' as
# its target (presumably the log directory — confirm against vardl.logger).
tb_logger = vardl.logger.TensorboardLogger('../work')

In [13]:
# Build the trainer: Adam with lr=0.1 on CPU, using tb_logger and seed 0.
# NOTE(review): the same dataloader is passed for both training and testing,
# so any test metric would be computed on the training data.
trainer = vardl.trainer.TrainerRegressor(model=model, 
                                         train_dataloader=dataloader, 
                                         test_dataloader=dataloader, 
                                         optimizer='Adam', 
                                         optimizer_config={'lr':0.1}, 
                                         device='cpu', 
                                         logger=tb_logger,
                                         seed=0)
# Logging options forwarded to the train_per_iterations calls below.
trainer_logging_config = {'train_verbose': True, 'train_log_interval':100}


# Phase 1: turn off gradients for the likelihood's log noise variance and
# train for 4000 iterations.
trainer.model.likelihood.log_noise_var.requires_grad = False
trainer.train_per_iterations(4000, **trainer_logging_config)
# Phase 2: turn its gradient back on and train for 1000 more iterations.
trainer.model.likelihood.log_noise_var.requires_grad = True
trainer.train_per_iterations(1000, **trainer_logging_config)

In [14]:
# Turn off gradients for the log noise variance again, then run 100 more
# iterations, logging every iteration.
trainer.model.likelihood.log_noise_var.requires_grad = False

trainer.train_per_iterations(100, train_log_interval=1, train_verbose=True)

# Alternative entry point, currently disabled:
#trainer.fit(iterations=1000,test_interval=100, **trainer_logging_config)

torch.Size([256, 5])
torch.Size([1, 256, 2])
[1m[34mTrain[0m || iter=    1   loss=3687969024  dkl=       0  error=99.91  log_theta_noise_var=-2.00
torch.Size([256, 5])
torch.Size([1, 256, 2])
[1m[34mTrain[0m || iter=    2   loss=4564534784  dkl=       0  error=111.15  log_theta_noise_var=-2.00
torch.Size([256, 5])
torch.Size([1, 256, 2])
[1m[34mTrain[0m || iter=    3   loss=4424243200  dkl=       1  error=109.43  log_theta_noise_var=-2.00
torch.Size([256, 5])
torch.Size([1, 256, 2])
[1m[34mTrain[0m || iter=    4   loss=4690185728  dkl=       2  error=112.67  log_theta_noise_var=-2.00
torch.Size([256, 5])
torch.Size([1, 256, 2])
[1m[34mTrain[0m || iter=    5   loss=4212494848  dkl=       3  error=106.78  log_theta_noise_var=-2.00
torch.Size([256, 5])
torch.Size([1, 256, 2])
[1m[34mTrain[0m || iter=    6   loss=4915682816  dkl=       4  error=115.35  log_theta_noise_var=-2.00
torch.Size([256, 5])
torch.Size([1, 256, 2])
[1m[34mTrain[0m || iter=    7   loss=4275648768

In [20]:
# Set up the LSUV initializer for the model, fed by the training dataloader,
# with a tolerance of 0.001 and at most 1000 iterations.
# NOTE(review): the keyword is spelled `tollerance`. The later
# init.initialize() call ran, so this presumably matches the library's
# signature — confirm before renaming.
init = vardl.initializer.LSUVInitializer(model, train_dataloader=dataloader, tollerance=0.001, max_iter=1000)

In [21]:
# Inspect the first layer's posterior mean before running the initializer.
# NOTE(review): the recorded output is a 5x1 tensor, while layer0 is declared
# above with out_features=2, and the execution counter jumps from 14 to 20.
# The notebook was probably run against stale kernel state — re-run from a
# fresh kernel to confirm.
model.architecture[0].q_posterior_W.mean

Parameter containing:
tensor([[-0.1835],
        [-0.6763],
        [ 0.7989],
        [-0.4343],
        [ 0.1013]], requires_grad=True)

In [22]:
# Inspect the first layer's posterior log-variances before initialization.
model.architecture[0].q_posterior_W.logvars

Parameter containing:
tensor([[-0.9163],
        [-0.9163],
        [-0.9163],
        [-0.9163],
        [-0.9163]], requires_grad=True)

In [23]:
# Run the initializer; it logs the variance reached at each processed layer.
init.initialize()

INFO - Variance at layer 0 (iter #34): 1.001


In [24]:
# Inspect the posterior mean again, to compare with the values shown before
# init.initialize() was called.
model.architecture[0].q_posterior_W.mean

Parameter containing:
tensor([[-0.4114],
        [ 0.1680],
        [ 0.5070],
        [-0.3756],
        [ 0.6798]], requires_grad=True)

In [25]:
# Inspect the posterior log-variances again, to compare with the values
# shown before init.initialize() was called.
model.architecture[0].q_posterior_W.logvars

Parameter containing:
tensor([[-0.9163],
        [-0.9163],
        [-0.9163],
        [-0.9163],
        [-0.9163]], requires_grad=True)