# Example 5.1 Modeling CSV data with Multilayer Perceptron Networks
The first example from the Deep Learning book is modeling CSV data with a multilayer perceptron network (Patterson and Gibson, 175). This is intended to be a gentle introduction to the DL4J API using a simple model. My plan was to implement this exact model in TensorFlow using modern toolsets like Pandas for loading data and Keras for creating, training, and testing the model. I thought this would be the simplest model to translate into TensorFlow and Keras, but I was wrong.

The largest stumbling block in this transformation was the use of the Negative Log-Likelihood as the loss function. The log-likelihood is a function that is used in traditional pattern recognition to estimate parameters.

The likelihood is the product of the probabilities of all of the data points given the model parameters; i.e.,

$$L = \prod_{k=1}^{N} p(x_k | \Theta) $$

Applying the negative log to the likelihood, we get

$$NLL = \sum_{k=1}^{N} -\ln p(x_k | \Theta) $$

where, $$p(x_k | \Theta)$$ is the Gaussian probability of $$x_k$$ given the model parameters $$\Theta$$

The equation for the Gaussian probability is $$ p(x) = \frac{1}{\sqrt{2\pi\sigma^2}} e^{-1/2 ((x - \mu)^2/\sigma^2)}$$

Substituting the Gaussian probability into the negative log-likelihood function and applying the natural logarithm, we have

$$ NLL = \sum_{k=1}^{N} \left[ \frac{\ln(2\pi\sigma^2)}{2} + \frac{(x_k - \mu)^2}{2\sigma^2} \right] $$

If we assume that the observed values are samples from a Gaussian distribution with a predicted mean and variance, we can minimize the loss using the negative log-likelihood criterion in place of the mean-squared error, with the following loss function, where $$y_k$$ is the true value and $$x_k$$ is the predicted value:

$$ NLL = \sum_{k=1}^{N} \left[ \frac{\ln(2\pi\sigma^2)}{2} + \frac{(y_k - x_k)^2}{2\sigma^2} \right] $$

## Configure imports

In [1]:
import tensorflow.python.platform
import tensorflow as tf
import numpy as np

import os
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense

import wget  # pip3 install wget

import importlib


# Probe for matplotlib without importing it, so that plotting stays optional.
# importlib.util.find_spec is used instead of importlib.find_loader, which was
# deprecated in Python 3.4 and removed in Python 3.12.
import importlib.util

_matplotlib_spec = importlib.util.find_spec("matplotlib")
# Kept under its original name: the loader of the matplotlib package, or None
# when matplotlib is not installed.
matplotlib_loader = _matplotlib_spec.loader if _matplotlib_spec is not None else None
PLT_FOUND = matplotlib_loader is not None
if PLT_FOUND:
    # figure(), scatter(), plot(), title(), legend() and show() are provided by
    # the matplotlib.pyplot submodule, not by the top-level matplotlib package.
    import matplotlib.pyplot as pyplot


Example 5.1 with TensorFlow version: 2.3.0
Eager execution: True


The data used in this example is artificial: two-feature data points, each carrying one of two labels.

We are going to read a few lines from one of the data files to determine how the data is organized.

In [2]:

path_prefix = os.path.join("data", "example1")
filenameTrain = "saturn_data_train.csv"
filenameTest = "saturn_data_eval.csv"

localFilenameTrain = os.path.join(path_prefix, filenameTrain)
localFilenameTest = os.path.join(path_prefix, filenameTest)

# Data by Dr. Jason Baldridge (http://www.jasonbaldridge.com) to test neural network frameworks.
# The files are listed at "https://github.com/jasonbaldridge/try-tf/tree/master/simdata";
# the raw copies are fetched from URL below and stored in data/example1.
URL = "https://raw.githubusercontent.com/jasonbaldridge/try-tf/master/simdata/"

# Only fetch what is actually absent, so an existing file is never downloaded
# a second time.
missing_files = [
    (remote_name, local_name)
    for remote_name, local_name in (
        (filenameTrain, localFilenameTrain),
        (filenameTest, localFilenameTest),
    )
    if not os.path.exists(local_name)
]

if missing_files:
    print("Missing Saturn simulation data!")
    print("Downloading from", URL)
    # makedirs creates the intermediate "data" directory as well, and
    # exist_ok=True tolerates a directory left behind by a partial download.
    os.makedirs(path_prefix, exist_ok=True)
    for remote_name, local_name in missing_files:
        # URL already ends with "/", so the file name is appended directly.
        wget.download(URL + remote_name, local_name)

print("\n\nExample 5.1 with TensorFlow version: {}".format(tf.__version__))
print("Eager execution: {}".format(tf.executing_eagerly()))
print("\nThe first five lines from the training data file:")
# Echo the head of the training file verbatim. print(..., end="") replaces
# sys.stdout.write because the sys module is not imported here; each line read
# from the file already carries its own newline. The with-statement closes the
# file even if reading fails.
with open(localFilenameTrain) as fd:
    for _ in range(5):
        print(fd.readline(), end="")


1,-7.1239700674365,-5.05175898010314

0,1.80771566423302,0.770505522143023

1,8.43184823707231,-4.2287794074931

0,0.451276074541732,0.669574142606103

0,1.52519959303934,-0.953055551414968



Here, we can see that the file is arranged into three columns. The first column is the label of the two different groups of data (group 0 and group 1). The second and third columns are the two features. We will assume that these two features are simply the coordinates of the point that is in the labeled group; i.e., x and y.


In [3]:
# Number of distinct class labels; used as the depth of the one-hot encoding.
NUM_LABELS = 2


def pack_features_vector(features, labels):
    """Stack the per-column feature tensors into one tensor along axis 1.

    Args:
        features: A mapping whose values are the individual feature tensors.
        labels: The labels belonging to those features; passed through as is.

    Returns:
        tuple: (stacked feature tensor, labels).
    """
    feature_columns = list(features.values())
    packed = tf.stack(feature_columns, axis=1)
    return packed, labels

def _plot_labeled_points(dataset):
    """Scatter-plot the first two features of every element, split by label.

    Expects batches of (features, integer labels), i.e. the dataset as it is
    before one-hot encoding. Does nothing when the dataset yields no batches.
    """
    # Read everything in a single pass so that features and labels stay
    # aligned even if the dataset yields a different order on each iteration.
    batches = list(dataset.as_numpy_iterator())
    if not batches:
        return
    feat = np.concatenate([batch_feat for batch_feat, _ in batches], axis=0)
    labels = np.concatenate([batch_labels for _, batch_labels in batches], axis=0)

    pyplot.figure()
    # There are only two labels in this dataset: 0 or 1.
    idx = labels > 0.5
    pyplot.scatter(feat[idx, 0], feat[idx, 1], marker="+", c="#ff0000")
    idx = labels <= 0.5
    pyplot.scatter(feat[idx, 0], feat[idx, 1], marker="o", c="#00ff00")
    pyplot.show()


def get_dataset(file_path, **kwargs):
    """Load a CSV file of the form label, feat[0], feat[1], ... as a tf.data.Dataset.

    When matplotlib is available (PLT_FOUND), the loaded points are also shown
    in a scatter plot, colored by label.

    Args:
        file_path (string): The path to one or more CSV files to load.
        **kwargs: Forwarded unchanged to
            tf.data.experimental.make_csv_dataset (batch_size, column_names,
            label_name, shuffle, ...).

    Returns:
        tf.data.Dataset: An object that holds the (features, labels) data from
        the CSV file in batches, with the labels one-hot encoded to a depth of
        NUM_LABELS.
    """
    # Use the 'experimental' make_csv_dataset to load the input data from the CSV file.
    dataset = tf.data.experimental.make_csv_dataset(file_path, num_epochs=1, **kwargs)

    # Pack the features from a map of tensors into a single feature vector.
    dataset = dataset.map(pack_features_vector)

    # Plot before one-hot encoding, while the labels are still plain 0/1
    # values that can be compared against 0.5.
    if PLT_FOUND:
        _plot_labeled_points(dataset)

    # Convert the integer labels in the dataset to one-hot encoded values.
    dataset = dataset.map(lambda x, y: (x, tf.one_hot(y, depth=NUM_LABELS)))

    return dataset


In [None]:
BATCH_SIZE = 50
NUM_EPOCHS = 40  # Number of epochs, full passes of the data
NUM_INPUTS = 2
NUM_OUTPUTS = 2
NUM_HIDDEN_NODES = 20
MY_SEED = 123

# Constants that specify the data to load from the .csv files.
COLUMN_NAMES = ["label", "x", "y"]
LABEL_NAME = COLUMN_NAMES[0]
LABELS = [0, 1]


# Load the training data set and test data set into batches and shuffle the input data before use.
#
# header=False: the Saturn CSV files start directly with data (see the first
# five lines printed above). make_csv_dataset defaults to header=True, which
# would treat the first record of each file as a header row and drop it.
training_batches = get_dataset(
    localFilenameTrain,
    batch_size=BATCH_SIZE,
    column_names=COLUMN_NAMES,
    label_name=LABEL_NAME,
    header=False,
    shuffle=True,
    shuffle_seed=MY_SEED,
)

print("\nDataset element defintion:\n\t", training_batches.element_spec)

testing_batches = get_dataset(
    localFilenameTest,
    batch_size=BATCH_SIZE,
    column_names=COLUMN_NAMES,
    label_name=LABEL_NAME,
    header=False,
    shuffle=True,
    shuffle_seed=MY_SEED,
)


## Models
Next, make the model to predict the label. For this example, the model has two layers. The input layer is a multilayer perceptron layer with a ReLU activation function, and the output layer uses a softmax activation function with a negative log likelihood loss function.

The weight initializer from the Deep Learning book is Xavier.


## Loss functions
Let's examine the negative log likelihood function again. 

$$ NLL = \sum_{k=1}^{N} \left[ \frac{\ln(2\pi\sigma^2)}{2} + \frac{(x_k - \mu)^2}{2\sigma^2} \right] $$

If we assume that the variance is 1.0, the negative log likelihood function simplifies to

$$ NLL = \sum_{k=1}^{N} \left[ \frac{\ln(2\pi)}{2} + \frac{(x_k - \mu)^2}{2} \right] $$

$$ NLL = \frac{N \ln(2\pi)}{2} + \sum_{k=1}^{N} \frac{(x_k - \mu)^2}{2} $$



In [None]:
class MeanSquaredError(tf.keras.losses.Loss):
    """Custom loss: mean-squared error between the true and the predicted output."""

    def call(self, y_true, y_pred):
        """Return the squared error averaged over the last axis.

        y_true is first cast to the dtype of y_pred so the two can be subtracted.
        """
        targets = tf.cast(y_true, y_pred.dtype)
        squared_error = tf.square(y_pred - targets)
        return tf.reduce_mean(squared_error, axis=-1)

class NegativeLogLikelihood(tf.keras.losses.Loss):
    """Custom loss: unit-variance Gaussian negative log likelihood.

    Implements the simplified formula from the "Loss functions" section,
    with the prediction taking the role of the mean:

        NLL = sum_k [ ln(2*pi) / 2 + (y_pred_k - y_true_k)^2 / 2 ]

    The previous body was a copy of the mean-squared error loss. This loss
    differs from it by the constant ln(2*pi)/2 term, the factor 1/2, and by
    summing (rather than averaging) over the last axis.
    """

    def call(self, y_true, y_pred):
        """Return the NLL summed over the last axis.

        y_true is first cast to the dtype of y_pred so the two can be subtracted.
        """
        y_true = tf.cast(y_true, y_pred.dtype)
        # A plain Python float, so it adopts the dtype of the tensor it is
        # added to instead of forcing float64.
        half_log_two_pi = 0.5 * float(np.log(2.0 * np.pi))
        per_output_nll = half_log_two_pi + 0.5 * tf.square(y_pred - y_true)
        return tf.reduce_sum(per_output_nll, axis=-1)


In [None]:
# Build the model. For this example, the model is a stack of two Dense layers:
# the first has NUM_HIDDEN_NODES units with a ReLU activation function, and the
# second has NUM_OUTPUTS units with a softmax activation function.
#
# NOTE: the Deep Learning book pairs the softmax output with a negative log
# likelihood loss; the model below is compiled with MeanSquaredError instead.
#
# The weight initializer in the Deep Learning book is Xavier (GlorotNormal in
# Keras) and it is seeded with MY_SEED (123). The same initializer instance is
# passed to both layers.
initializer = tf.keras.initializers.GlorotNormal(seed=MY_SEED)

model = Sequential(
    [
        tf.keras.layers.Dense(
            NUM_HIDDEN_NODES, activation="relu", kernel_initializer=initializer
        ),
        tf.keras.layers.Dense(
            NUM_OUTPUTS, activation="softmax", kernel_initializer=initializer
        ),
    ]
)

# Optimizer is Adam (default settings), loss function is mean squared error,
# and accuracy is tracked as an additional metric.
model.compile(
    loss=MeanSquaredError(),
    optimizer=tf.optimizers.Adam(),
    metrics=["accuracy"],
)



# Train on the training batches for NUM_EPOCHS passes, then describe the model.
print("\n\nFit the training data.")
history = model.fit(training_batches, epochs=NUM_EPOCHS, verbose=1)
model.summary()

if PLT_FOUND:
    # One curve per recorded training metric, in the order loss, accuracy.
    for metric_name in ("loss", "accuracy"):
        pyplot.plot(history.history[metric_name], label=metric_name)
    pyplot.title("Training loss and accuracy (MSE loss)")
    pyplot.legend()
    pyplot.show()

# Final evaluation of the model against the test set. evaluate() returns the
# loss followed by the compiled metrics, so index 1 is the accuracy.
scores = model.evaluate(testing_batches, verbose=0)
test_accuracy_percent = scores[1] * 100
print("Test set analysis accuracy: %.2f%%" % test_accuracy_percent)
