## MODEL LIKE

In [None]:
# Load libraries and functions.
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
import tensorflow_probability as tfp


In [None]:
# Short aliases for Keras and for the TFP distributions module; Keras is
# switched to float64 as its default float type.
tfk = tf.keras
tfd = tfp.distributions
tfk.backend.set_floatx("float64")

In [None]:
# Show the GPU devices TensorFlow can see (an empty list means none were found).
tf.config.list_physical_devices("GPU")

In [None]:
# Define helper objects and functions.
scaler = StandardScaler()  # column-wise standardization
detector = IsolationForest(n_estimators=1000, random_state=42)  # outlier detector


def neg_log_likelihood(x, rv_x):
    """Return the negative log-likelihood of ``x`` under the distribution ``rv_x``.

    Passed to ``model.compile`` as the loss: ``x`` is the first (target)
    argument and ``rv_x`` is the object the model outputs, which must expose
    a ``log_prob`` method.
    """
    return -rv_x.log_prob(x)

In [None]:
# Read the spreadsheet and keep only rows dated on or before 2004-09-10
# (the first six months), because of drift in the later data.
data = pd.read_excel("data/AirQualityUCI.xlsx")
is_early = data["Date"] <= "2004-09-10"
data = data.loc[is_early]
data.head()

In [None]:
# Select columns and remove rows with missing values.
columns = [
    "PT08.S1(CO)", "PT08.S3(NOx)", "PT08.S4(NO2)", "PT08.S5(O3)", "T", "AH",
    "CO(GT)", "C6H6(GT)", "NOx(GT)", "NO2(GT)",
]
# The UCI Air Quality data tags missing readings with the sentinel value -200
# instead of NaN. Convert the sentinel first; otherwise dropna() would keep
# those rows and they would distort the scaling and the model fit.
data = data[columns].replace(-200, np.nan).dropna(axis=0)
data.head()

In [None]:
# Scale data to zero mean and unit variance.
# NOTE(review): the scaler is fitted on every row, including the rows that are
# later held out as test data.
X_t = scaler.fit_transform(data)

In [None]:
# Fit the outlier detector on the scaled data and keep only the rows it
# labels with a positive value (the inliers).
is_inlier = detector.fit_predict(X_t)
inlier_mask = is_inlier > 0
X_t = X_t[inlier_mask]

In [None]:
# Wrap the filtered array in a DataFrame again, re-attaching the column labels.
dataset = pd.DataFrame(data=X_t, columns=columns)
dataset.head()

In [None]:
# Column labels used as model inputs and as model outputs.
inputs = [
    "PT08.S1(CO)",
    "PT08.S3(NOx)",
    "PT08.S4(NO2)",
    "PT08.S5(O3)",
    "T",
    "AH",
]
outputs = [
    "CO(GT)",
    "C6H6(GT)",
    "NOx(GT)",
    "NO2(GT)",
]

In [None]:
# Define some hyperparameters.
n_epochs = 50
n_batches = 10
n_samples = dataset.shape[0]
buffer_size = n_samples
# Integer floor division keeps batch_size a plain int (np.floor() returned a
# NumPy float, which is not a valid type for a batch size). The lower bound of
# 1 guards against a dataset with fewer rows than n_batches.
batch_size = max(1, n_samples // n_batches)

In [None]:
# Number of samples reserved for training: 70 % of all rows, rounded down.
train_fraction = 0.7
n_train = int(train_fraction * len(dataset))
n_train

In [None]:
# Define dataset instance: one (inputs, outputs) pair per row.
data = tf.data.Dataset.from_tensor_slices((dataset[inputs].values, dataset[outputs].values))
# Shuffle once, with a fixed seed and without reshuffling. With
# reshuffle_each_iteration=True every new iteration over the dataset draws a
# fresh permutation, so a later take()/skip() split would hand partly the same
# samples to the training and the test pipeline.
data = data.shuffle(n_samples, seed=42, reshuffle_each_iteration=False)

In [None]:
# Split into a training and a test pipeline: the first n_train elements are
# batched and repeated for training; the remaining elements are served one
# sample at a time for testing.
# NOTE(review): the training data is repeated n_epochs times here, and
# model.fit() is also called with epochs=n_epochs — confirm the resulting
# number of passes over the data is intended.
train_part = data.take(n_train)
data_train = train_part.batch(batch_size).repeat(n_epochs)
test_part = data.skip(n_train)
data_test = test_part.batch(1)

In [None]:
# Prior used for regularization: independent normal components with zero
# mean and unit scale, one per output.
prior_loc = tf.zeros(len(outputs), dtype=tf.float64)
prior = tfd.Independent(
    tfd.Normal(loc=prior_loc, scale=1.0),
    reinterpreted_batch_ndims=1,
)

In [None]:
# Define model instance.
n_outputs = len(outputs)

# Number of values the last dense layer has to emit so that the
# MultivariateNormalTriL layer can build its distribution from them.
n_dist_params = tfp.layers.MultivariateNormalTriL.params_size(n_outputs)

# KL-divergence penalty between the predicted distribution and the prior,
# scaled by 1/n_batches; attached to the output layer as activity regularizer.
kl_regularizer = tfp.layers.KLDivergenceRegularizer(prior, weight=1/n_batches)

model = tfk.Sequential(
    [
        # One input per feature column.
        tfk.layers.InputLayer(input_shape=(len(inputs),), name="input"),
        # Hidden layer.
        tfk.layers.Dense(10, activation="relu", name="dense_1"),
        # Linear layer producing the parameters of the output distribution.
        tfk.layers.Dense(n_dist_params, activation=None, name="distribution_weights"),
        # Output: a multivariate normal distribution over the outputs.
        tfp.layers.MultivariateNormalTriL(
            n_outputs, activity_regularizer=kl_regularizer, name="output"
        ),
    ],
    name="model",
)


In [None]:
# Compile model: Adam optimizer, with neg_log_likelihood (negative log-probability
# of the targets under the model's output distribution) as the loss.
model.compile(optimizer="adam", loss=neg_log_likelihood)

In [None]:
# Run training session; verbose=False suppresses the per-epoch progress output.
# NOTE(review): the original note here read "MIN 4" — presumably the run takes
# about four minutes; confirm.
model.fit(data_train, epochs=n_epochs, validation_data=data_test, verbose=False)

In [None]:
# Describe model: print its layer-by-layer summary.
model.summary()

To account for uncertainty in the model weights, the dense layers have to be replaced with one of the following:

- Flipout layers (``DenseFlipout``)
- Variational layers (``DenseVariational``)

Such a model has more parameters, since every weight is parametrized by a normal distribution with its own (non-shared) mean and standard deviation. \
The weights are resampled for each prediction.

In [None]:
# Example: a Flipout layer created with the same arguments as the "dense_1"
# Dense layer of the model (10 units, ReLU activation).
tfp.layers.DenseFlipout(10, activation="relu", name="dense_1")

The default prior distribution over the weights is `tfd.Normal(loc=0., scale=1.)`; it can be changed with the ``kernel_prior_fn`` argument.

In [None]:
# Prediction setup: how many test samples to draw, how many repeated
# predictions to make per sample, an iterator over the test pipeline, and
# the uninitialized buffers for inputs, targets and predictions.
samples = 500
iterations = 10
test_iterator = tf.compat.v1.data.make_one_shot_iterator(data_test)
X_true = np.empty(shape=(samples, len(inputs)))
Y_true = np.empty(shape=(samples, len(outputs)))
Y_pred = np.empty(shape=(samples, len(outputs), iterations))

In [None]:
# For each test sample: store its inputs and targets, then record
# `iterations` repeated model predictions for it.
for row in range(samples):
    features, labels = test_iterator.get_next()
    X_true[row, :] = features
    Y_true[row, :] = labels.numpy()
    for draw in range(iterations):
        Y_pred[row, :, draw] = model.predict(features)

In [None]:
# Mean and standard deviation over the last axis of Y_pred, i.e. across the
# repeated predictions made for each sample and output.
Y_pred_m = Y_pred.mean(axis=-1)
Y_pred_s = Y_pred.std(axis=-1)
Y_pred_m, Y_pred_s

## DATA

In [None]:
import pandas as pd
# Load the energy and the weather tables from CSV files; the paths are
# relative to the current working directory.
df_en = pd.read_csv('energy_dataset.csv')
df_we = pd.read_csv('weather_features.csv')

In [None]:
# Columns of the energy table whose names do not start with 'generation'.
[c for c in df_en.columns if not c.startswith('generation')]

In [None]:
# Columns of the energy table whose names start with 'generation'.
[c for c in df_en.columns if c.startswith('generation')]

In [None]:
# Summary statistics of the weather table.
df_we.describe()

In [None]:
# Summary statistics of the energy table.
df_en.describe()