<div style="width: 100%">
    <img style="width: 100%" src="https://storage.googleapis.com/kaggle-datasets-images/681739/1196904/5c9764c44d37ca06ae29daeaa405e3a3/dataset-cover.jpg"/>
</div>

In [None]:
import numpy as np
import pandas as pd

from sklearn.utils import shuffle
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

from pylab import *
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_probability as tfp

import warnings
warnings.filterwarnings('ignore')

<h1 id="dataset" style="color:blue; background:white; border:0.5px dotted cyan;"> 
    <center>Dataset
        <a class="anchor-link" href="#dataset" target="_self">¶</a>
    </center>
</h1>

In [None]:
# Location of the Kaggle water-potability CSV.
path = '../input/water-potability/water_potability.csv'

# Read the raw table, then replace every missing value with the mean of its
# column so that the downstream scaler and PCA see no NaNs.
df = pd.read_csv(path)
df = df.fillna(df.mean())

df.head()

## Standardize

In [None]:
# Target column, kept apart from the model inputs.
labels = df['Potability']

# Every remaining column is an input feature.
features = df.drop(columns='Potability')

# Rescale each feature to zero mean / unit variance, then store the result as
# float32.
scaler = StandardScaler()
stand_features = scaler.fit_transform(features).astype(np.float32)

## Dimension reduction

In [None]:
# Fit a 2-component PCA on the standardized features and project them.
pca = PCA(n_components=2).fit(stand_features)
X = pca.transform(stand_features)

# Row indices of each class, used to colour the scatter plots.
# df still carries the default integer index from read_csv, so these index
# labels are also valid row positions into X.
idx_one = df.loc[df['Potability'] == 1].index.tolist()
idx_zero = df.loc[df['Potability'] == 0].index.tolist()

## Plot the 2D graph

In [None]:
# Scatter the two PCA components, one colour per class. The legend, title and
# axis labels make the figure readable on its own.
plt.figure(figsize=(14,8))
plt.plot(X[idx_one, 0], X[idx_one, 1], 'bo', label='Potability = 1')
plt.plot(X[idx_zero, 0], X[idx_zero, 1], 'ro', label='Potability = 0')
plt.title('Standardized features projected onto two principal components')
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.legend();  # trailing ';' hides the Legend repr in the cell output

<h1 id="real" style="color:blue; background:white; border:0.5px dotted cyan;"> 
    <center>Real NVP
        <a class="anchor-link" href="#real" target="_self">¶</a>
    </center>
</h1>

In [None]:
# Short aliases for the TensorFlow Probability submodules used by the model below.
tfb = tfp.bijectors      # invertible transforms: RealNVP, Permute, Chain
tfd = tfp.distributions  # distributions: MultivariateNormalDiag, TransformedDistribution

## Density Estimation using Real NVP

The model used below was introduced in *Density estimation using Real NVP* (Dinh, Sohl-Dickstein and Bengio, ICLR 2017). The paper's abstract reads:

> Unsupervised learning of probabilistic models is a central yet challenging problem in machine learning. Specifically, designing models with tractable learning, sampling, inference and evaluation is crucial in solving this task. We extend the space of such models using real-valued non-volume preserving (real NVP) transformations, a set of powerful invertible and learnable transformations, resulting in an unsupervised learning algorithm with exact log-likelihood computation, exact sampling, exact inference of latent variables, and an interpretable latent space. We demonstrate its ability to model natural images on four datasets through sampling, log-likelihood evaluation and latent variable manipulations.

## Model

In [None]:
class RealNVP(tf.keras.models.Model):
    """Normalizing flow made of stacked Real NVP coupling layers.

    The flow is exposed as ``self.flow``, a ``tfd.TransformedDistribution``
    whose base distribution is a standard multivariate normal of dimension
    ``output_dim``. Use ``self.flow.log_prob(x)`` for density evaluation and
    call the model itself to push base-space samples through the flow.

    Args:
        output_dim: Number of dimensions of the modelled data.
        num_masked: Passed to ``tfb.RealNVP``; the number of leading
            dimensions each coupling layer conditions on.
        num_blocks: Number of coupling layers (default 5).
        hidden_units: Width of each of the two hidden layers in every
            shift-and-log-scale network (default 32).
        **kwargs: Forwarded to ``tf.keras.models.Model``.
    """

    def __init__(self, *, output_dim, num_masked,
                 num_blocks=5, hidden_units=32, **kwargs):
        super().__init__(**kwargs)
        self.output_dim = output_dim
        # Keep a reference to every shift-and-log-scale network.
        self.nets = []

        # Reverse the feature order between coupling layers so the dimensions
        # that one layer conditions on are moved to the other end for the
        # next. For output_dim == 2 this is the swap [1, 0].
        permutation = list(range(output_dim))[::-1]

        bijectors = []
        for _ in range(num_blocks):
            net = tfb.real_nvp_default_template([hidden_units, hidden_units])
            bijectors.append(
                tfb.RealNVP(shift_and_log_scale_fn=net,
                            num_masked=num_masked))
            bijectors.append(tfb.Permute(permutation))
            self.nets.append(net)

        # Drop the trailing permutation (nothing follows it) and reverse the
        # list before handing it to Chain.
        bijector = tfb.Chain(list(reversed(bijectors[:-1])))

        # Base distribution: zero-mean normal with one entry per data
        # dimension, so its size follows output_dim.
        self.flow = tfd.TransformedDistribution(
            distribution=tfd.MultivariateNormalDiag(loc=[0.] * output_dim),
            bijector=bijector)

    def call(self, *inputs):
        """Apply the flow's forward bijector to ``inputs`` and return the result."""
        return self.flow.bijector.forward(*inputs)

In [None]:
model = RealNVP(output_dim=2, num_masked=1)

# Run one forward pass before asking for the summary; the result itself is
# discarded.
_ = model(X)

# summary() prints the table itself and returns None, so wrapping it in
# print() would add a stray "None" line to the output.
model.summary()

## Training

In [None]:
from time import time

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

# Evaluate the loss once eagerly before train_step is traced; per the original
# author's note, @tf.function has problems if this call is skipped.
-tf.reduce_mean(model.flow.log_prob(X))


@tf.function
def train_step(X):
    """Do one Adam update on the whole batch ``X`` and return the loss.

    The loss is the negative mean log-likelihood of ``X`` under ``model.flow``.
    """
    with tf.GradientTape() as tape:
        mean_log_prob = tf.reduce_mean(model.flow.log_prob(X))
        loss = -mean_log_prob
    weights = model.trainable_variables
    gradients = tape.gradient(loss, weights)
    optimizer.apply_gradients(zip(gradients, weights))
    return loss


# One full-batch step per iteration; the step count equals the number of rows
# in df. Every 100th step, report the loss and the seconds since the last report.
start = time()
for i in range(len(df)):
    loss = train_step(X)
    if i % 100:
        continue
    elapsed = time() - start
    print("i:{:4d}, loss:{:1.3f}, time:{:1.3f}".format(i, loss.numpy(), elapsed))
    start = time()

<h1 id="analysis" style="color:blue; background:white; border:0.5px dotted cyan;"> 
    <center>Analysis
        <a class="anchor-link" href="#analysis" target="_self">¶</a>
    </center>
</h1>

In [None]:
# Draw samples from a standard normal, the flow's base distribution.
Z = np.random.normal(0,1,(5000,2))

plt.figure(figsize=(14,6))

# Left panel: the raw base-space samples.
plt.subplot(1,2,1)
plt.plot(Z[:,0], Z[:,1], 'bo')
# Raw string: '\s' is not a valid escape sequence in a normal string literal.
plt.title(r'$Z \sim N(0,1)$')
plt.xlabel('$z_1$')
plt.ylabel('$z_2$')

# Push the base samples through the trained flow.
Xs = model(Z).numpy()

# Right panel: the transformed samples. They are generated points with no
# Potability label, so they are drawn in one colour. Indexing them with
# idx_one / idx_zero (row indices of the training labels) would colour them
# arbitrarily and drop every sample whose position is beyond the dataset length.
plt.subplot(1,2,2)
plt.title('Transformed distribution')
plt.xlabel('$x_1$')
plt.ylabel('$x_2$')
plt.plot(Xs[:,0], Xs[:,1], 'bo')
plt.xlim(-5.,5.)
plt.ylim(-4.,4.)