# Metropolis-adjusted Langevin algorithm Implementations for nonlinear regression with neural network

The Metropolis-adjusted Langevin algorithm (MALA) is a Markov Chain Monte Carlo (MCMC) method for obtaining random samples from a probability distribution for which direct sampling is difficult. MALA uses a combination of two mechanisms to generate the states of a random walk
- New states are proposed using Langevin dynamics: use evaluations of the gradient of the target probability density function
- Proposals are accepted or rejected using Metropolis-Hasting algorithm

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

In [97]:
# Select device which you are going to use for training
device = torch.device("cpu")

## Helper Functions
Helper functions imported from helper files

In [50]:
class Simulator:
    
    def __init__(self, w, b, sigma, N, design_range=(-10,10)):
        self.w = w
        self.b = b
        self.theta = np.expand_dims(np.concatenate([w, [b]], axis=0), axis=1)
        self.sigma = sigma
        self.N = N
        self.design_range = design_range
        self.X = None
        self.y = None
        self.y_mean = None
        
    def run(self):
        designs = np.random.uniform(self.design_range[0], self.design_range[1], size=(self.N, self.w.size))
        self.X = np.concatenate([designs, np.ones((self.N, 1))], axis=1)
        self.y_mean = (self.X @ self.theta).squeeze()
        self.y = np.random.multivariate_normal(mean=self.y_mean, cov=np.diag([self.sigma**2] * self.N))
    
    def plot(self):
        x = self.X[:, 0]
        plt.scatter(x, self.y, label="data")
        x_dense = np.linspace(self.design_range[0], self.design_range[1], 100)
        y_dense = x_dense * self.w[0] + self.b
        plt.plot(x_dense, y_dense, label="y mean")
        plt.xlabel("x")
        plt.ylabel("y")
        plt.legend()
        plt.title("Simulated data, N="+str(self.N))
        plt.show()

In [49]:
# Helper function for collecting nn gradient into a vector
def collect_grads(model):
    return torch.cat([p.grad.data.view(1, -1) for p in model.parameters()], dim=-1)

# Helper function for computing sizes of all nn parameters
def get_param_sizes(model):
    return [p.reshape(-1).size()[0] for p in model.parameters()]

# Helper function for writing the updated weights
def update_params(new_params, model, param_sizes):
    start_index = 0
    for i, p in enumerate(model.parameters()):
        end_index = start_index + param_sizes[i]
        source_tensor = new_params[:, start_index:end_index].reshape(p.shape)
        p.data = source_tensor
        start_index = end_index

In [102]:
# True weight(s)
w = np.array([1.5, -1.0, 0.7])

# Input dimensionality
d = w.size

# True intercept
b = 0.5

# True standard deviation
sigma = 0.5

# Number of data points
N = 100

# Defines range of inputs x
design_range = (-1.0, 1.0)

# Simulate
simulator = Simulator(w, b, sigma, N, design_range)
simulator.run()

X = simulator.X
y = simulator.y

### Step 1: Implement function that constructs MLP neural network
It would be good to not hardcode the amount of hidden layers, layer dimensions or activation functions but instead make it so that these can be given as arguments to the function.

In [7]:
# True weight(s)
w = np.array([1.5, -1.0, 0.7])

# Input dimensionality
d = w.size

In [8]:
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        
        self.layer_1 = nn.Linear(d + 1, 16)
        self.layer_2 = nn.Linear(16, 32)
        self.output_layer = nn.Linear(16, 1)
    
    def forward(self, x):
        x = torch.sigmoid(self.layer_1(x))
        x = torch.sigmoid(self.layer_2(x))
        return self.output_layer(x)

In [9]:
# Start with one hidden layer: 2 inputs and 1 output
# Replace log into L2_loss

### Step 2: Evaluate and Sample 
Evaluate log 
$$
log_p(\theta | \alpha^2, D) \propto \frac{-1}{2\alpha^2} (y- f_\theta(x))^T (y - f_\theta(x)) - \frac{-1}{2\alpha_0^2} \theta^T\theta $$

where:
- $log_p(\theta | \alpha^2, D)$ is L2 loss
- $f_\theta(x)$ is the neural network
- $\theta$ is a vector that contains all parameters of the neural network

In [83]:
mlp = MLP()

# Get all the nn parameters and store in theta
theta = torch.cat([x.reshape(-1) for x in mlp.parameters()])

# L2 Loss calculation
X_tensor = torch.tensor(X, dtype=torch.float32, requires_grad=False)
y_tensor = torch.tensor(y, dtype=torch.float32, requires_grad=False).view(1, -1)
#l2_loss = ((y_tensor - mlp(X_tensor)) ** 2).sum()
l2_loss = 203.51

# Define alpha 0
alpha_0 = 0.0001

# Define f0(x)
f0_x = mlp(X_tensor)

In [87]:
# WIP: Implement MALA and get value y
def sample_MALA():
    y = 0
    return y

#### Weight-decay regulization

In [93]:
def compute_loss(model, x1, x2, y):
    model.eval()
    with torch.no_grad():
        outputs = model.forward(x1,x2)
        loss = F.mse_loss(outputs, y)
        return loss.cpu().numpy()

In [103]:
# Create an Adam optimizer with learning rate 0.01 and weight decay parameter 0.001
optimizer = torch.optim.Adam(mlp.parameters(), lr=0.01, weight_decay = 0.001)

# Train network with 
n_epochs = 4000
mlp.zero_grad()
train_errors = []
val_errors = []

# Convert x and Y from numpy to tensor
x_train, y_train = torch.from_numpy(X), torch.from_numpy(y)
x_train, y_train = x_train.type(torch.FloatTensor),y_train.type(torch.FloatTensor)

x = x_train.to(device)
y = y_train.to(device)

for epoch in range(n_epochs):
    optimizer.zero_grad()
    outputs = mlp.forward(x_train)
    loss = F.mse_loss(outputs, y_train)
    loss.backward()
    optimizer.step()

    if (epoch+1) % 100 == 0:
        train_errors.append(compute_loss(mlp, x_train, y_train))
        val_errors.append(compute_loss(mlp, x_test, y_test))
        print_progress(epoch, train_errors[-1], val_errors[-1])

RuntimeError: mat1 and mat2 shapes cannot be multiplied (100x32 and 16x1)

In [86]:
T = 200
N = 1000

def evaluate(f0_x, theta, alpha_0, y):
    for i in range(T):
        alpha = alpha_0 + N/2
        
        # Calculate right handside
        b = (y - f0_x)
        A = -1/(2*alpha**2)* b.T @ b - (-1/(2*alpha_0**2) * theta.T @ theta)
        
    return A
        
# evaluate(f0_x, theta, alpha_0, y = sample_MALA())

In [None]:
# Plot right handside and left handside value

4