In [None]:
'''
 * Copyright (c) 2004 Radhamadhab Dalai
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
'''

In [1]:
import random
import math
from typing import List

# Gaussian distribution helper functions
def gaussian_sample(mean: float, stddev: float) -> float:
    """Draw a single value from the normal distribution N(mean, stddev**2).

    The draw is delegated to ``random.gauss``, so it follows the state (and
    seed) of the module-level ``random`` generator.
    """
    draw = random.gauss(mean, stddev)
    return draw

def gaussian_pdf(x: float, mean: float, stddev: float) -> float:
    """Return the density of the normal distribution N(mean, stddev**2) at ``x``.

    Args:
        x: Point at which the density is evaluated.
        mean: Mean of the distribution.
        stddev: Standard deviation of the distribution; must be positive.

    Returns:
        The (non-negative) probability density at ``x``.

    Raises:
        ValueError: If ``stddev`` is not a positive number.
    """
    # ``not stddev > 0`` also rejects NaN, which would otherwise propagate silently.
    if not stddev > 0:
        raise ValueError(f"stddev must be positive, got {stddev!r}")
    z = (x - mean) / stddev
    # ``z * z`` saturates to inf far in the tails (density 0.0), whereas
    # ``z ** 2`` raises OverflowError for very large |z|.
    return math.exp(-0.5 * z * z) / (stddev * math.sqrt(2 * math.pi))

# Diffusion process functions
def forward_diffusion(x: List[float], beta_t: float) -> List[float]:
    """Perturb every entry of ``x`` with Gaussian noise of variance ``beta_t``.

    Each output element is one draw from N(x_i, beta_t), obtained through
    ``gaussian_sample``; the input list is left untouched.
    """
    perturbed: List[float] = []
    for value in x:
        perturbed.append(gaussian_sample(value, math.sqrt(beta_t)))
    return perturbed

def backward_diffusion(z_t: List[float], beta_t: float, model_params: List[float]) -> List[float]:
    """Shift each noisy value back by ``model_params[i] * sqrt(beta_t)``.

    ``model_params`` is indexed by position in ``z_t``, so it needs at least
    as many entries as ``z_t``; extra entries are ignored.
    """
    denoised: List[float] = []
    for position, noisy_value in enumerate(z_t):
        correction = model_params[position] * math.sqrt(beta_t)
        denoised.append(noisy_value - correction)
    return denoised

# Variational posteriors for forward and backward diffusion
def q_phi(z_t_given_z_t_minus_1: List[float], beta_t: float) -> List[float]:
    """Sample from the forward variational posterior q_phi(z_t | z_{t-1}).

    Every previous-step value is used as the mean of a Gaussian with
    variance ``beta_t`` and one draw is taken per entry.
    """
    draws: List[float] = []
    for previous_value in z_t_given_z_t_minus_1:
        draws.append(gaussian_sample(previous_value, math.sqrt(beta_t)))
    return draws

# Kullback-Leibler divergence for Gaussian distributions
def kl_divergence(mu1: float, std1: float, mu2: float, std2: float) -> float:
    """Return KL(N(mu1, std1**2) || N(mu2, std2**2)) for univariate Gaussians.

    Args:
        mu1: Mean of the first distribution.
        std1: Standard deviation of the first distribution; must be positive.
        mu2: Mean of the second distribution.
        std2: Standard deviation of the second distribution; must be positive.

    Returns:
        The divergence in nats (0.0 when both distributions coincide).

    Raises:
        ValueError: If either standard deviation is not a positive number.
    """
    # Validate up front: otherwise a zero std surfaces as ZeroDivisionError or a
    # cryptic "math domain error", and two negative stds yield a plausible-looking value.
    if not std1 > 0 or not std2 > 0:
        raise ValueError(
            f"standard deviations must be positive, got std1={std1!r}, std2={std2!r}"
        )
    return math.log(std2 / std1) + (std1 ** 2 + (mu1 - mu2) ** 2) / (2 * std2 ** 2) - 0.5

# ELBO calculation (simplified version)
def elbo(x: List[float], z_t: List[float], model_params: List[float], beta_t: float) -> float:
    """Compute a simplified evidence lower bound for the toy diffusion model.

    The value is the negated sum of two parts:
    * the squared (L2) error between ``x`` and ``z_t``, paired with ``zip``;
    * for every ``z_t[i]``, the KL divergence between N(z_t[i], 1) and
      N(model_params[i], beta_t).
    """
    squared_errors = [(target - latent) ** 2 for target, latent in zip(x, z_t)]
    kl_terms = [
        kl_divergence(latent, 1.0, model_params[position], math.sqrt(beta_t))
        for position, latent in enumerate(z_t)
    ]
    return -sum(squared_errors) - sum(kl_terms)

# Example usage
if __name__ == "__main__":
    # Toy input vector (stands in for an image or feature vector).
    x = [2.0, 3.0, 1.5, 4.0]
    T = 10  # Number of diffusion steps (declared here but not used below)
    beta_t = 0.1  # Noise variance applied at the single step shown

    # One random "model parameter" per input entry; a trained model would supply these.
    model_params = [random.uniform(0, 1) for _ in x]

    # Noise the data, then apply the parameterised correction.
    noisy_x = forward_diffusion(x, beta_t)
    cleaned_x = backward_diffusion(noisy_x, beta_t, model_params)

    # Simplified ELBO of the noisy sample with respect to the data.
    elbo_value = elbo(x, noisy_x, model_params, beta_t)

    # Report the ELBO first, then the three vectors.
    print(f"ELBO Value: {elbo_value:.4f}")
    print(f"Original x: {x}")
    print(f"Noisy x (after forward diffusion): {noisy_x}")
    print(f"Cleaned x (after backward diffusion): {cleaned_x}")


ELBO Value: -153.7712
Original x: [2.0, 3.0, 1.5, 4.0]
Noisy x (after forward diffusion): [1.908332929757319, 2.4953427413903957, 1.4653547302261312, 4.76221706352085]
Cleaned x (after backward diffusion): [1.6470191610098692, 2.4406025147198394, 1.2681118824335853, 4.69274003685799]


## Hybrid Modeling

###  Introduction

In Chapter 1, I tried to convince you that learning the conditional distribution $ p(y|x) $ is not enough, and instead, we should focus on the joint distribution $ p(x, y) $ factorized as follows:

$$
p(x, y) = p(y|x)p(x)
$$

Why? Let me remind you of my reasoning. The conditional $ p(y|x) $ does not allow us to say anything about $ x $, but, instead, it will do its best to provide a decision. As a result, I can provide an object that has never been observed so far, and $ p(y|x) $ could still be pretty certain about its decision (i.e., assigning high probability to one class). On the other hand, once we have trained $ p(x) $, we should be able to, at least in theory, access the probability of the given object. And, eventually, determine whether our decision is reliable or not.

In the previous chapters, we completely focused on answering the question of how to learn $ p(x) $ alone. Since we had in mind the necessity of using it for evaluating the probability, we discussed only the likelihood-based models, namely, the autoregressive models (ARMs), the flow-based models (flows), and the variational autoencoders (VAEs).

Now, the naturally arising question is how to use a deep generative model together with a classifier (or a regressor). Let us focus on a classification task for simplicity and think of possible approaches.

##  Approach 1: Let’s Be Naive!

Let us start with some easy, naive, almost trivial approaches. In the most straightforward way, we can train $ p(y|x) $ and $ p(x) $ separately. And that is it: we have a classifier and a marginal distribution over objects. This approach is schematically presented in Fig.1, where we use different colors (purple and blue) to highlight that we use two different neural networks to parameterize the two distributions.
![image.png](attachment:image.png)

*Fig.1: A naive approach to learning the joint distribution by considering both distributions separately.*

Taking the logarithm of the joint distribution yields:

$$
\ln p(x, y) = \ln p_\alpha (y|x) + \ln p_\beta (x)
$$

where $ \alpha $ and $ \beta $ denote parameterizations of both distributions (i.e., neural networks). Once we start training and calculate gradients with respect to $ \alpha $ and $ \beta $, we clearly see that we get:

$$
\nabla_\alpha \ln p(x, y) = \nabla_\alpha \ln p_\alpha (y|x) + \nabla_\alpha \ln p_\beta (x)
$$

and

$$
\nabla_\beta \ln p(x, y) = \nabla_\beta \ln p_\alpha (y|x) + \nabla_\beta \ln p_\beta (x)
$$

Now, let's break these down:

- $ \nabla_\alpha \ln p_\beta (x) = 0 $ because $ \ln p_\beta (x) $ is not dependent on $ \alpha $,
- $ \nabla_\beta \ln p_\alpha (y|x) = 0 $ because $ \ln p_\alpha (y|x) $ does not depend on $ \beta $.

In other words, we can simply first train $ p_\alpha (y|x) $ using all data with labels and then train $ p_\beta (x) $ using all available data.

### What is a potential pitfall with this approach?

Intuitively, we can say that there is no guarantee that both distributions treat $ x $ in the same manner, and this could introduce some errors. Moreover, since the two models are trained separately (and stochastically), there is no information flow between the random variables $ x $ and $ y $, and as a result, the neural networks seek their own (local) minima. To use a metaphor, they are like two wings of a bird that move in total separation, completely asynchronously.

Moreover, training both models separately is also inefficient. We must use two different neural networks, with no weight sharing. Since training is stochastic, we really could worry about potential bad local optima, and our worries are even doubled now.

### Would such an approach fail?

Well, there is no simple answer to this question. It could probably even work pretty well, but it might also lead to models that are far from optimal. Either way, who likes being unclear about training models? At least not me.


##  Approach 2: Shared Parameterization!

Alright, so since I whine about sharing the parameterization, it is obvious that the second approach uses (drums here) a shared parameterization! To be more precise, a partially shared parameterization assumes that there is a neural network that processes $ x $ and then its output is fed to two neural networks: one for the classifier and one for the marginal distribution over $ x $'s. An example of this approach is depicted in Fig.2 (the shared neural network is shown in purple).

Now, taking the logarithm of the joint distribution gives:

$$
\ln p(x, y) = \ln p_{\alpha, \gamma}(y|x) + \ln p_{\beta, \gamma}(x)
$$

where it is worth highlighting that both distributions partially share the parameterization $ \gamma $ (i.e., the purple neural network in Fig.2). As a result, during training, there is a piece of obvious information sharing between $ x $ and $ y $! Intuitively, both distributions operate on a processed $ x $ in the same manner, and then this representation is specialized to give probabilities for classes and objects. 

Again, one might ask: what is all this fuss about? Well, first of all, now, the two distributions are tightly connected. Like in the metaphor of a bird used before, both wings can move together, in a synchronized fashion. Second, from the optimization perspective, the gradients flow through the $ \gamma $ network, and thus, it contains information about both $ x $ and $ y $. This may greatly help in finding a better solution.

![image-3.png](attachment:image-3.png)

*Fig.2: An approach to learning the joint distribution by using a partially shared parameterization.*

### Hybrid Modeling

At first glance, there is nothing wrong with learning using the training objective expressed as:

$$
\ln p(x, y) = \ln p_{\alpha, \gamma}(y|x) + \ln p_{\beta, \gamma}(x)
$$

However, let us think about the dimensionalities of \( y \) and \( x \). For instance, if \( y \) is binary, then we have one single bit representing a class label. For a binary vector of \( x \), we have \( D \) bits. Hence, there is a clear discrepancy in scales! Let us take a look at the gradient with respect to \( \gamma \) first, namely:

$$
\nabla_\gamma \ln p(x, y) = \nabla_\gamma \ln p_{\alpha, \gamma}(y|x) + \nabla_\gamma \ln p_{\beta, \gamma}(x)
$$

If we think about it, during training, the \( \gamma \) network obtains a much stronger signal from \( \ln p_{\beta, \gamma}(x) \). Following our example of binary variables, let us assume that our neural nets return all probabilities equal to 0.5. So, for the independent Bernoulli variables, we get:

$$
\ln \text{Bern}(y|0.5) = y \ln 0.5 + (1 - y) \ln 0.5 = -\ln 2
$$

where we use the property of the logarithm (\( \ln 0.5 = \ln 2^{-1} = -\ln 2 \)) and it does not matter what the value of \( y \) is because the neural network returns \( 0.5 \) for \( y = 0 \) and \( y = 1 \). Similarly, for \( x \), we get:

$$
\ln \prod_{d=1}^D \text{Bern}(x_d|0.5) = \sum_{d=1}^D \ln \text{Bern}(x_d|0.5) = -D \ln 2
$$

Therefore, we see that the \( \ln p_{\beta, \gamma}(x) \) part is \( D \)-times stronger than the \( \ln p_{\alpha, \gamma}(y|x) \) part! How does it influence the final gradients during training? 

Try to visualize a bar of height \( \ln 2 \) and the other that is \( D \)-times higher. Now, imagine these bars "flow" through \( \gamma \). Do you see it? Yes, the \( \gamma \) neural network will obtain more information from the marginal distribution, and this information could cripple the classification part. In other words, our final model will always be biased toward the marginal part.

### Can we do something about it?

Fortunately, yes! In [1], it was proposed to consider the convex combination of \( \ln p(y|x) \) and \( \ln p(x) \) as the objective function, namely:

$$
L(x, y; \lambda) = (1 - \lambda) \ln p(y|x) + \lambda \ln p(x)
$$

where \( \lambda \in [0, 1] \). Unfortunately, this weighting scheme is not derived from a well-defined distribution, and it breaks the elegance of the likelihood-based approach. However, if you do not mind being inelegant, then this approach should work well!

A different approach is proposed in [2] where only \( \ln p(x) \) is weighted:

$$
\ell(x, y; \lambda) = \ln p(y|x) + \lambda \ln p(x)
$$

where \( \lambda \geq 0 \). This kind of weighting was proposed in various forms before (e.g., see [3, 4]). Still, the fudge factor \( \lambda \) is not derived from a probabilistic perspective. However, [2] argues that we can interpret \( \lambda \) as a way of encouraging robustness to input variations. They also mention that scaling \( \ln p(x) \) can be seen as a Jacobian-based regularization penalty. It is still not a valid distribution (because it is equivalent to \( p(x)^\lambda \)), but at least we can provide some interpretations.

In [2], the hybrid modeling idea has been pursued with $ p(x) $ being modeled by flows (in the paper, they used GLOW [5]), and then, the resulting latents $ z $ were used as the input to the classifier. In other words, a flow-based model is used for $ p(x) $, and the invertible neural network (e.g., consisting of coupling layers) is shared with the classifier. Then, the final layers on top of the invertible neural network are used to make a decision $ y $. The objective function is $ \ell(x, y; \lambda) = \ln p(y|x) + \lambda \ln p(x) $, i.e., the variant introduced above in which only $ \ln p(x) $ is weighted. The approach is schematically presented in Fig.3.

There are a couple of interesting properties of this approach. First, we can use the invertible neural network for both generative and discriminative parts of the model. Hence, the flow-based model is well-informed about the label. Second, the weighting $ \lambda $ allows controlling whether the model is more discriminative or more generative. Third, we can use any flow-based model! GLOW was used in [2]; however, [6] used residual flows and [7] applied invertible DenseNets. Fourth, as presented by [2], we can use any classifier (or regressor), e.g., Bayesian classifiers.

A potential drawback of this approach lies in the necessity of determining $ \lambda $. This is an extra hyperparameter that requires tuning. Moreover, as noticed in previous papers [2, 6, 7], the value of $ \lambda $ drastically changes the performance of the model from discriminative to generative. That is an open question of how to deal with that.
![image-2.png](attachment:image-2.png)
  
*Fig.3: Hybrid modeling using invertible neural networks and flow-based models.*


## Let’s Implement It!

Now, it is time to be more specific and formulate a hybrid model. Let us start with the classifier and consider a fully connected neural network to model the conditional distribution $ p(y|x) $, namely:

$$
z \rightarrow \text{Linear}(D, M) \rightarrow \text{ReLU} \rightarrow \text{Linear}(M, M) \rightarrow \text{ReLU} \rightarrow \text{Linear}(M, K) \rightarrow \text{Softmax}
$$

where $ D $ is the dimensionality of $ x $ and $ K $ is the number of classes. The softmax gives us probabilities for each class. Remember that $ z = f^{-1}(x) $, where $ f $ is an invertible neural network.

In our example, we use the classifier, so we should take the categorical distribution for the conditional $ p(y|x) $:

$$
p(y|x) = \prod_{k=1}^{K} \theta_k(x)^{[y = k]}
$$

where $ \theta_k(x) $ is the softmax value for the $ k $-th class and $ [y = k] $ is the Iverson bracket (i.e., $ [y = k] = 1 $ if $ y $ equals $ k $ and 0 otherwise).

Next, we focus on modeling $ p(x) $. We can use any marginal model, e.g., we can apply flows and the change of variable formula, namely:

$$
p(x) = \pi(z = f^{-1}(x)) |J_f(x)|^{-1}
$$

where $ J_f(x) $ denotes the Jacobian of the transformation (i.e., neural network) $ f $ evaluated at $ x $. In the case of the flow, we typically use $ \pi(z) = N(z | 0, 1) $, i.e., the standard Gaussian distribution.

Plugging these all distributions into the objective of the hybrid modeling $ \ell(x, y; \lambda) $, we get:

$$
\ell(x, y; \lambda) = \sum_{k=1}^{K} [y = k] \ln \theta_{k,g,f}(x) + \lambda \left( \ln N(z = f^{-1}(x) | 0, 1) - \ln |J_f(x)| \right)
$$

where we additionally highlight that $ \theta_{k,g,f} $ is parameterized by two neural networks: $ f $ from the flow and $ g $ for the final classification.

Now, if we would like to follow [2], we could pick coupling layers as the components of $ f $, and eventually, we would model $ p(x) $ using RealNVP or GLOW, for instance. However, we want to be fancier, and we will utilize integer discrete flows (IDFs) [8, 9]. Why? Because we simply can, and also IDFs do not require calculating the Jacobian. Besides, we can practice a bit of formulating various hybrid models.

Let us quickly recall IDFs. First, they operate on $ \mathbb{Z}^D $, i.e., integers. Second, we need to pick an appropriate $ \pi(z) $ that in this case could be the discretized logistic (DL), $ \text{DL}(z | \mu, \nu) $ with mean $ \mu $ and scale $ \nu $. Since the change of variable formula for discrete random variables does not require calculating the Jacobian (remember no change of volume here!), we can rewrite the hybrid modeling objective as follows:

$$
\ell(x, y; \lambda) = \sum_{k=1}^{K} [y = k] \ln \theta_{k,g,f}(x) + \lambda \ln \text{DL}(z = f^{-1}(x) | \mu, \nu)
$$

That’s it! Congratulations, if you have followed all these steps, you have arrived at a new hybrid model that uses IDFs to model the distribution of $ x $. Notice that the classifier takes integers as inputs.


## Code

We have all the components to implement our own hybrid integer discrete flow (HybridIDF)! Below, there is a code with a lot of comments that should help to understand every single line of it.

```python
class HybridIDF(nn.Module):
    """Hybrid integer discrete flow (IDF).

    An invertible transformation f (coupling + permutation layers) is shared by a density
    model over x and by a classifier (`classnet`) that operates on z = f(x).
    `forward` returns the hybrid loss: classification NLL + alpha * (negative log-prior of z).
    """
    def __init__(self, netts, classnet, num_flows, alpha=1., D=2):
        super(HybridIDF, self).__init__()

        print('HybridIDF by JT.')

        # Here we use the two options discussed previously: a coupling layer or a generalized invertible transformation
        # These formulate the transformation f. `netts` holds zero-argument factories; each one is called
        # num_flows times so that every flow gets its own translation network.
        # NOTE: Please pay attention to a new variable here, namely, beta. This is the rezero trick used in (van den Berg et al., 2020).
        # beta starts at zero (one entry per flow), so every coupling layer is initially the identity map.
        if len(netts) == 1:
            self.t = torch.nn.ModuleList([netts[0]() for _ in range(num_flows)])
            self.idf_git = 1
            self.beta = nn.Parameter(torch.zeros(len(self.t)))
        elif len(netts) == 4:
            self.t_a = torch.nn.ModuleList([netts[0]() for _ in range(num_flows)])
            self.t_b = torch.nn.ModuleList([netts[1]() for _ in range(num_flows)])
            self.t_c = torch.nn.ModuleList([netts[2]() for _ in range(num_flows)])
            self.t_d = torch.nn.ModuleList([netts[3]() for _ in range(num_flows)])
            self.idf_git = 4
            self.beta = nn.Parameter(torch.zeros(len(self.t_a)))
        else:
            raise ValueError('You can provide either 1 or 4 translation nets.')

        # This contains extra layers for classification on top of z.
        self.classnet = classnet

        # The number of flows (i.e., f’s).
        self.num_flows = num_flows

        # The rounding operator.
        # NOTE: RoundStraightThrough is not defined in this listing; it must be available in the enclosing scope.
        self.round = RoundStraightThrough.apply

        # The mean and log-scale for the base distribution pi (both learnable, shape (1, D)).
        self.mean = nn.Parameter(torch.zeros(1, D))
        self.logscale = nn.Parameter(torch.ones(1, D))

        # The dimensionality of the input.
        self.D = D

        # Since using "lambda" is confusing for Python, we will use alpha in the code for lambda in previous equations (not confusing at all, right?!)
        self.alpha = alpha

        # We use the built-in PyTorch loss function. It is for educational purposes! Otherwise, we could use the log-categorical.
        # reduction='none' keeps one loss value per example.
        self.nll = nn.NLLLoss(reduction='none')  # it expects log-probabilities as input!

    # The coupling layer as introduced before.
    # NOTE: We use the rezero trick!
    # `index` selects the flow (its translation net(s) and beta entry); `forward=False` applies the inverse.
    def coupling(self, x, index, forward=True):
        if self.idf_git == 1:
            # Split the features (dim 1) in two halves; xa passes through unchanged and conditions the shift of xb.
            (xa, xb) = torch.chunk(x, 2, 1)

            if forward:
                yb = xb + self.beta[index] * self.round(self.t[index](xa))
            else:
                yb = xb - self.beta[index] * self.round(self.t[index](xa))

            return torch.cat((xa, yb), 1)

        elif self.idf_git == 4:
            (xa, xb, xc, xd) = torch.chunk(x, 4, 1)

            if forward:
                # Each part is shifted using the already-updated parts before it and the untouched parts after it.
                ya = xa + self.beta[index] * self.round(self.t_a[index](torch.cat((xb, xc, xd), 1)))
                yb = xb + self.beta[index] * self.round(self.t_b[index](torch.cat((ya, xc, xd), 1)))
                yc = xc + self.beta[index] * self.round(self.t_c[index](torch.cat((ya, yb, xd), 1)))
                yd = xd + self.beta[index] * self.round(self.t_d[index](torch.cat((ya, yb, yc), 1)))
            else:
                # The inverse undoes the shifts in the opposite order (d, c, b, a).
                yd = xd - self.beta[index] * self.round(self.t_d[index](torch.cat((xa, xb, xc), 1)))
                yc = xc - self.beta[index] * self.round(self.t_c[index](torch.cat((xa, xb, yd), 1)))
                yb = xb - self.beta[index] * self.round(self.t_b[index](torch.cat((xa, yc, yd), 1)))
                ya = xa - self.beta[index] * self.round(self.t_a[index](torch.cat((yb, yc, yd), 1)))

            return torch.cat((ya, yb, yc, yd), 1)

    # The permutation layer: reverses the order of the features (dim 1); applying it twice is the identity.
    def permute(self, x):
        return x.flip(1)

    # The flow transformation: forward pass (coupling followed by permutation, for each flow) ...
    def f(self, x):
        z = x
        for i in range(self.num_flows):
            z = self.coupling(z, i, forward=True)
            z = self.permute(z)
        return z

    # ... and the inverse pass (flows visited in reverse order, permutation undone before the coupling).
    def f_inv(self, z):
        x = z
        for i in reversed(range(self.num_flows)):
            x = self.permute(x)
            x = self.coupling(x, i, forward=False)
        return x

    # A new function: This is used for classification. First, we predict probabilities, and then pick the most probable value.
    def classify(self, x):
        z = self.f(x)
        y_pred = self.classnet(z)  # output: probabilities (i.e., softmax)
        return torch.argmax(y_pred, dim=1)

    # An auxiliary function: We use it for calculating the classification loss, namely, the negative log-likelihood for p(y|x).
    # NOTE: We first apply the invertible transformation f.
    def class_loss(self, x, y):
        z = self.f(x)
        y_pred = self.classnet(z)  # output: probabilities (i.e., softmax)
        return self.nll(torch.log(y_pred), y)

    # Generate batchSize samples: draw z from the base distribution, map it through f^-1,
    # and return a tensor viewed as (batchSize, 1, D).
    def sample(self, batchSize):
        # sample z:
        z = self.prior_sample(batchSize=batchSize, D=self.D)
        # x = f^-1(z)
        x = self.f_inv(z)
        return x.view(batchSize, 1, self.D)

    # The log-probability of the base distribution (a.k.a. prior), summed over the feature dimension.
    # NOTE: log_integer_probability is not defined in this listing; it must be available in the enclosing scope.
    def log_prior(self, x):
        log_p = log_integer_probability(x, self.mean, self.logscale)
        return log_p.sum(1)

    # Sampling from the base distribution.
    # NOTE: the D argument is not used; the tensor is always created with self.D columns.
    def prior_sample(self, batchSize, D=2):
        # Sample from logistic (inverse-CDF transform of uniform noise)
        y = torch.rand(batchSize, self.D)
        x = torch.exp(self.logscale) * torch.log(y / (1. - y)) + self.mean
        # And then round it to an integer.
        return torch.round(x)

    # The forward pass: Now, we use the hybrid model objective!
    # Returns the summed loss when reduction == 'sum'; any other value yields the mean over the batch.
    def forward(self, x, y, reduction='avg'):
        z = self.f(x)
        y_pred = self.classnet(z)  # output: probabilities (i.e., softmax)

        idf_loss = -self.log_prior(z)
        class_loss = self.nll(torch.log(y_pred), y)  # remember to use logarithm on top of softmax!

        if reduction == 'sum':
            return (class_loss + self.alpha * idf_loss).sum()
        else:
            return (class_loss + self.alpha * idf_loss).mean()
# The number of invertible transformations (each one is a coupling layer followed by a permutation).
num_flows = 2

# Here, we present only option 1 of the IDF (a single translation net per flow).
# NOTE: D, M, K and alpha are read below but are not assigned in this listing; they must be defined beforehand.
# `nett` is a factory: every call builds a fresh network mapping D // 2 inputs to D // 2 outputs.
nett = lambda: nn.Sequential(nn.Linear(D // 2, M), nn.LeakyReLU(),
                              nn.Linear(M, M), nn.LeakyReLU(),
                              nn.Linear(M, D // 2))
netts = [nett]

# And a three-layered classifier; the final Softmax outputs class probabilities over K classes.
classnet = nn.Sequential(nn.Linear(D, M), nn.LeakyReLU(),
                          nn.Linear(M, M), nn.LeakyReLU(),
                          nn.Linear(M, K), nn.Softmax(dim=1))

# Init HybridIDF
model = HybridIDF(netts, classnet, num_flows, D=D, alpha=alpha)


![image.png](attachment:image.png)

*Fig.4: An example of outcomes after the training: (a) Randomly selected real images. (b) Unconditional generations from the HybridIDF. (c) An example of a validation curve for the classification error. (d) An example of a validation curve for the negative log-likelihood, i.e., $ -\ln p(x) $.*

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Rounding operator with a straight-through gradient, used by the coupling layers.
class RoundStraightThrough(torch.autograd.Function):
    """Round to the nearest integer in the forward pass; act as the identity in the backward pass.

    Rounding has a zero derivative almost everywhere, which would block all learning
    signal to the translation networks. The straight-through estimator therefore passes
    the incoming gradient through unchanged for every element, including elements
    whose input already was an integer.
    """

    @staticmethod
    def forward(ctx, input):
        """Return ``input`` rounded element-wise to the nearest integer."""
        # Nothing is saved on ctx: the backward pass does not depend on the input.
        return torch.round(input)

    @staticmethod
    def backward(ctx, grad_output):
        """Return the upstream gradient unchanged (identity surrogate for rounding)."""
        # clone() so that later in-place edits by autograd consumers cannot alias grad_output.
        return grad_output.clone()


# Define the HybridIDF class
class HybridIDF(nn.Module):
    """Hybrid integer discrete flow (IDF).

    An invertible transformation ``f`` (coupling + permutation layers) is shared by a
    density model over ``x`` and by a classifier that operates on ``z = f(x)``.
    ``forward`` returns the hybrid loss ``-ln p(y|x) + alpha * (-ln pi(z))``.

    Args:
        netts: List with either 1 or 4 zero-argument factories. Each factory is called
            ``num_flows`` times to build one translation network per flow.
        classnet: Callable applied to ``z``; its output is treated as class
            probabilities (e.g. a network ending in a softmax).
        num_flows: Number of coupling + permutation steps in ``f``.
        alpha: Weight of the density term (the lambda of the hybrid objective).
        D: Dimensionality of the input.

    Raises:
        ValueError: If ``netts`` contains neither 1 nor 4 entries.
    """

    def __init__(self, netts, classnet, num_flows, alpha=1., D=2):
        super().__init__()

        print('HybridIDF by JT.')

        # One translation net per flow (option 1) or four per flow (option 4).
        # beta implements the rezero trick: it starts at zero, so every coupling
        # layer is the identity map at initialisation.
        if len(netts) == 1:
            self.t = torch.nn.ModuleList([netts[0]() for _ in range(num_flows)])
            self.idf_git = 1
            self.beta = nn.Parameter(torch.zeros(len(self.t)))
        elif len(netts) == 4:
            self.t_a = torch.nn.ModuleList([netts[0]() for _ in range(num_flows)])
            self.t_b = torch.nn.ModuleList([netts[1]() for _ in range(num_flows)])
            self.t_c = torch.nn.ModuleList([netts[2]() for _ in range(num_flows)])
            self.t_d = torch.nn.ModuleList([netts[3]() for _ in range(num_flows)])
            self.idf_git = 4
            self.beta = nn.Parameter(torch.zeros(len(self.t_a)))
        else:
            raise ValueError('You can provide either 1 or 4 translation nets.')

        # Extra layers for classification on top of z.
        self.classnet = classnet

        # Number of flows (i.e., coupling + permutation steps).
        self.num_flows = num_flows

        # The rounding operator (rounds in the forward pass, custom gradient in the backward pass).
        self.round = RoundStraightThrough.apply

        # Learnable mean and log-scale of the base distribution pi, shape (1, D).
        self.mean = nn.Parameter(torch.zeros(1, D))
        self.logscale = nn.Parameter(torch.ones(1, D))

        # Dimensionality of the input.
        self.D = D

        # "lambda" is a Python keyword, so the weight is called alpha in the code.
        self.alpha = alpha

        # Per-example negative log-likelihood; expects log-probabilities as input.
        self.nll = nn.NLLLoss(reduction='none')

    def coupling(self, x, index, forward=True):
        """Apply coupling layer ``index`` to ``x`` (or its inverse when ``forward`` is False).

        The features (dim 1) are split into 2 or 4 chunks; chunks are shifted by the
        rounded output of the translation nets, scaled by ``beta[index]``.
        """
        if self.idf_git == 1:
            # xa passes through unchanged and conditions the shift applied to xb.
            (xa, xb) = torch.chunk(x, 2, 1)

            if forward:
                yb = xb + self.beta[index] * self.round(self.t[index](xa))
            else:
                yb = xb - self.beta[index] * self.round(self.t[index](xa))

            return torch.cat((xa, yb), 1)

        elif self.idf_git == 4:
            (xa, xb, xc, xd) = torch.chunk(x, 4, 1)

            if forward:
                # Each chunk is shifted using the already-updated chunks before it
                # and the still-untouched chunks after it.
                ya = xa + self.beta[index] * self.round(self.t_a[index](torch.cat((xb, xc, xd), 1)))
                yb = xb + self.beta[index] * self.round(self.t_b[index](torch.cat((ya, xc, xd), 1)))
                yc = xc + self.beta[index] * self.round(self.t_c[index](torch.cat((ya, yb, xd), 1)))
                yd = xd + self.beta[index] * self.round(self.t_d[index](torch.cat((ya, yb, yc), 1)))
            else:
                # The inverse undoes the shifts in the opposite order (d, c, b, a).
                yd = xd - self.beta[index] * self.round(self.t_d[index](torch.cat((xa, xb, xc), 1)))
                yc = xc - self.beta[index] * self.round(self.t_c[index](torch.cat((xa, xb, yd), 1)))
                yb = xb - self.beta[index] * self.round(self.t_b[index](torch.cat((xa, yc, yd), 1)))
                ya = xa - self.beta[index] * self.round(self.t_a[index](torch.cat((yb, yc, yd), 1)))

            return torch.cat((ya, yb, yc, yd), 1)

    def permute(self, x):
        """Reverse the feature order (dim 1); applying it twice is the identity."""
        return x.flip(1)

    def f(self, x):
        """Map ``x`` to ``z``: for each flow, a coupling layer followed by a permutation."""
        z = x
        for i in range(self.num_flows):
            z = self.coupling(z, i, forward=True)
            z = self.permute(z)
        return z

    def f_inv(self, z):
        """Map ``z`` back to ``x`` by undoing the flows in reverse order."""
        x = z
        for i in reversed(range(self.num_flows)):
            x = self.permute(x)
            x = self.coupling(x, i, forward=False)
        return x

    def _log_class_probs(self, z):
        """Return the element-wise log of the class probabilities ``classnet(z)``."""
        probs = self.classnet(z)
        # A probability that underflowed to exactly 0 would give log(0) = -inf and an
        # infinite/NaN loss; clamping to the smallest positive normal number keeps the
        # loss finite and leaves every other value untouched.
        return torch.log(probs.clamp_min(torch.finfo(probs.dtype).tiny))

    def classify(self, x):
        """Return the index of the most probable class for every row of ``x``."""
        z = self.f(x)
        y_pred = self.classnet(z)  # output: probabilities (i.e., softmax)
        return torch.argmax(y_pred, dim=1)

    def class_loss(self, x, y):
        """Return the per-example negative log-likelihood of labels ``y`` under p(y|x)."""
        z = self.f(x)
        return self.nll(self._log_class_probs(z), y)

    def sample(self, batchSize):
        """Draw ``batchSize`` samples of x, returned as a tensor viewed as (batchSize, 1, D)."""
        z = self.prior_sample(batchSize=batchSize, D=self.D)
        # x = f^-1(z)
        x = self.f_inv(z)
        return x.view(batchSize, 1, self.D)

    def log_prior(self, x):
        """Return the log-probability of ``x`` under the base distribution, summed over dim 1."""
        # NOTE: log_integer_probability is not defined in this class; it must be
        # provided by the surrounding module/notebook.
        log_p = log_integer_probability(x, self.mean, self.logscale)
        return log_p.sum(1)

    def prior_sample(self, batchSize, D=2):
        """Sample ``batchSize`` integer-valued vectors from the base distribution.

        ``D`` is accepted for backward compatibility but not used: the sample always
        has ``self.D`` columns, matching the shape of ``mean`` and ``logscale``.
        """
        # Create the noise on the parameters' device/dtype so sampling also works
        # after the model has been moved off the CPU.
        u = torch.rand(batchSize, self.D, device=self.mean.device, dtype=self.mean.dtype)
        # torch.rand may return exactly 0, for which the logit below would be -inf.
        u = u.clamp_min(torch.finfo(u.dtype).tiny)
        # Inverse-CDF sampling from a logistic distribution ...
        x = torch.exp(self.logscale) * torch.log(u / (1. - u)) + self.mean
        # ... followed by rounding to an integer.
        return torch.round(x)

    def forward(self, x, y, reduction='avg'):
        """Return the hybrid loss for inputs ``x`` and labels ``y``.

        The per-example loss is ``class NLL + alpha * (-log_prior(f(x)))``. It is summed
        over the batch when ``reduction == 'sum'``; any other value yields the mean.
        """
        z = self.f(x)

        idf_loss = -self.log_prior(z)
        class_loss = self.nll(self._log_class_probs(z), y)

        if reduction == 'sum':
            return (class_loss + self.alpha * idf_loss).sum()
        else:
            return (class_loss + self.alpha * idf_loss).mean()
# Hyperparameters read by the network definitions below. They are assigned here so
# that this cell runs on its own instead of failing with a NameError.
D = 2  # Dimensionality of the input data
M = 64  # Size of hidden layers
K = 10  # Number of classes for classification
alpha = 1.0  # Weight of the density term in the hybrid loss

# Number of invertible transformations (flows)
num_flows = 2

# Define the IDF net (this one uses 1 translation net).
# `nett` is a factory: every call builds a fresh network mapping D // 2 inputs to D // 2 outputs.
nett = lambda: nn.Sequential(
    nn.Linear(D // 2, M),
    nn.LeakyReLU(),
    nn.Linear(M, M),
    nn.LeakyReLU(),
    nn.Linear(M, D // 2)
)

netts = [nett]

# Define a classifier with 3 linear layers; the final Softmax outputs class probabilities.
classnet = nn.Sequential(
    nn.Linear(D, M),
    nn.LeakyReLU(),
    nn.Linear(M, M),
    nn.LeakyReLU(),
    nn.Linear(M, K),
    nn.Softmax(dim=1)
)

# Initialize the HybridIDF model
model = HybridIDF(netts, classnet, num_flows, D=D, alpha=alpha)


In [None]:
# Hyperparameters: input dimensionality, hidden width, number of classes, loss weight.
D, M, K = 2, 64, 10
alpha = 1.0

# How many invertible transformations (flows) make up f.
num_flows = 2


def nett():
    """Build one translation network mapping D // 2 inputs to D // 2 outputs."""
    layers = [
        nn.Linear(D // 2, M),
        nn.LeakyReLU(),
        nn.Linear(M, M),
        nn.LeakyReLU(),
        nn.Linear(M, D // 2),
    ]
    return nn.Sequential(*layers)


# A single factory selects the one-translation-net variant of the IDF.
netts = [nett]

# Classifier on top of z: three linear layers with a softmax over the K classes.
classifier_layers = [
    nn.Linear(D, M),
    nn.LeakyReLU(),
    nn.Linear(M, M),
    nn.LeakyReLU(),
    nn.Linear(M, K),
    nn.Softmax(dim=1),
]
classnet = nn.Sequential(*classifier_layers)

# Build the hybrid model from the pieces above.
model = HybridIDF(netts, classnet, num_flows, alpha=alpha, D=D)


In [3]:
import math

class SimpleNN:
    """A minimal two-layer perceptron (ReLU hidden layer, softmax output) in pure Python.

    Args:
        input_size: Number of input features.
        hidden_size: Number of hidden units.
        output_size: Number of output classes.
    """

    def __init__(self, input_size, hidden_size, output_size):
        # Every weight and bias starts at the same constant 0.01 (deterministic, not random).
        # weightsN[j][i] connects input i of the layer to its unit j.
        self.weights1 = [[0.01] * input_size for _ in range(hidden_size)]
        self.weights2 = [[0.01] * hidden_size for _ in range(output_size)]
        self.bias1 = [0.01] * hidden_size
        self.bias2 = [0.01] * output_size

    def relu(self, x):
        """ReLU activation function: return ``x`` if it is positive, otherwise 0."""
        return max(0, x)

    def softmax(self, x):
        """Convert a sequence of scores into probabilities that sum to 1.

        Returns an empty list for empty input.
        """
        scores = list(x)
        if not scores:
            return []
        # Subtracting the maximum leaves the result mathematically unchanged but keeps
        # math.exp from overflowing on large scores.
        peak = max(scores)
        exp_values = [math.exp(score - peak) for score in scores]
        total = sum(exp_values)
        return [value / total for value in exp_values]

    def _affine(self, inputs, weights, biases):
        """Return ``weights @ inputs + biases`` as a list (one entry per row of ``weights``)."""
        return [
            sum([value * row[i] for i, value in enumerate(inputs)]) + biases[j]
            for j, row in enumerate(weights)
        ]

    def forward(self, x):
        """Run a forward pass and return the list of class probabilities for input ``x``."""
        # First layer (input to hidden) followed by ReLU.
        hidden = [self.relu(h) for h in self._affine(x, self.weights1, self.bias1)]

        # Second layer (hidden to output) followed by softmax.
        logits = self._affine(hidden, self.weights2, self.bias2)
        return self.softmax(logits)

# Example usage:
if __name__ == "__main__":
    # Tiny demo network: 2 inputs -> 3 hidden units -> 2 output classes.
    model = SimpleNN(2, 3, 2)

    # One example input vector.
    x = [1.0, 2.0]

    # Forward pass, then show the resulting softmax probabilities.
    output = model.forward(x)
    print("Output probabilities:", output)


Output probabilities: [0.5, 0.5]
