In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
import seaborn as sns
import tensorflow as tf

from utils.utils import make_writer
%matplotlib inline

# Most common bugs I

## Resources

- [Chapter 4 of Deep learning book. Numerical computation](https://www.deeplearningbook.org/contents/numerical.html)

## Incorrect tensor shapes

### Most common reasons:

- Flipped dimensions when using tf.reshape.
- Sum, avg, softmax over wrong dimension.
- Forgot to flatten after conv layers.
- Forgot to get rid of extra "1" dimensions, e.g. if shape is (None, 1, 1, 4).

- In TF2, as well as in other libraries, you can accidentally broadcast tensors and then it can fail silently or just output wrong results.

In [2]:
# Two flat six-element vectors, plus a copy of the target that carries an extra
# trailing axis of length 1 (shape (6, 1)).
y_true = np.array([0.1, 0.7, 0.02, 0.08, 0.05, 0.05])
y_pred = np.array([0.1, 0.6, 0.05, 0.05, 0.1, 0.1])
y_true_extra_dim = y_true[:, np.newaxis]

In [None]:
# Show each array followed by its shape. Every message ends in " \n", which
# leaves a blank line between the printed entries.
print(
    f'y_true: {y_true} \n',
    f'Shape of y_true: {y_true.shape} \n',
    f'y_true_extra_dim: {y_true_extra_dim} \n',
    f'Shape of y_true_extra_dim: {y_true_extra_dim.shape} \n',
    sep='\n',
)

In [None]:
# Display the predicted values: a flat 1-D array with six entries.
y_pred

In [None]:
# y_pred is 1-D like y_true; only y_true_extra_dim carries the additional
# trailing axis of length 1.
y_pred.shape

Say we want to divide y_true by y_pred. What shapes do we expect to get?

In [None]:
# Both operands are 1-D arrays of the same length, so the division is
# element-wise and the result keeps that 1-D shape.
(y_true / y_pred).shape

In [None]:
# y_true_extra_dim has shape (6, 1) while y_pred has shape (6,). NumPy
# broadcasts the two, so every y_true entry is divided by every y_pred entry
# instead of only by its matching one -- and no error is raised.
(y_true_extra_dim / y_pred)

In [None]:
# Broadcasting (6, 1) against (6,) produces a 2-D (6, 6) result rather than
# the 1-D result of the same-shape division.
(y_true_extra_dim / y_pred).shape

#### KL-divergence

KL-divergence is used in some models like VAEs or Bayesian models.

In [None]:
kl = tf.keras.losses.KLDivergence()

# Same predictions, two label layouts: the flat vector first, then the copy
# with the extra trailing axis. Compare the two printed values.
kld_flat = kl(y_true, y_pred).numpy()
print(f'KLD for y_true: {kld_flat} \n')

kld_extra_dim = kl(y_true_extra_dim, y_pred).numpy()
print(f'KLD for y_true_extra_dim: {kld_extra_dim}')

## Pre-processing inputs incorrectly

- Forgot to standardize/scale.
    -  It makes the resulting model dependent on the choice of units used in the input.
- Too much augmentation.


### Regression example with Auto MPG data

#### Load the data and create a pandas DataFrame

In [9]:
# Download the Auto MPG data file; `get_file` returns the path of the local copy.
AUTO_MPG_URL = "http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
dataset_path = tf.keras.utils.get_file(fname="auto-mpg.data", origin=AUTO_MPG_URL)

In [None]:
# Column names are passed explicitly through `names=` when parsing the file.
column_names = [
    'MPG', 'Cylinders', 'Displacement', 'Horsepower',
    'Weight', 'Acceleration', 'Model Year', 'Origin',
]

dataset = pd.read_csv(
    dataset_path,
    names=column_names,
    na_values="?",           # "?" entries are parsed as missing values
    comment='\t',            # the rest of a line after a tab is ignored
    sep=" ",
    skipinitialspace=True,   # skip the spaces that follow each delimiter
)

dataset.head()

- Drop NAs
- Drop categorical data for simplicity

In [None]:
# Remove rows with missing values, then the categorical 'Origin' column.
dataset = dataset.dropna().drop(columns='Origin')
dataset.head()

### Plot the data

In [None]:
# Pairwise scatter plots for a subset of the columns, with KDE curves on the
# diagonal.
pairplot_columns = ["MPG", "Cylinders", "Displacement", "Weight"]
sns.pairplot(dataset[pairplot_columns], diag_kind="kde")
plt.show()

We can also display some summary statistics of the dataset.

In [None]:
# Summary statistics, transposed so that each dataset column becomes one row.
stats = dataset.describe().T
stats

### Create labels and train set

In [14]:
# `pop` removes the 'MPG' column from `dataset` and returns it; the values are
# stored as a float32 array.
labels = np.asarray(dataset.pop('MPG'), dtype='float32')

Let's make the difference in scales for some features even more pronounced.

In [15]:
# Push two features onto very different scales: 'Horsepower' is multiplied by
# 1000 and 'Displacement' is divided by 1000. The rescaled frame stays bound
# to `dataset`.
dataset = dataset.assign(
    Horsepower=dataset['Horsepower'] * 1000,
    Displacement=dataset['Displacement'] / 1000,
)
train_set = dataset.to_numpy().astype('float32')

Let's display the statistics again.

In [None]:
# Recompute the per-column summary on the current `dataset`, one row per column.
stats = dataset.describe().T
stats

### Model

In [17]:
class RegressorNet(tf.keras.Model):
    """Small fully connected regressor that carries its own optimizer.

    The network is a `tf.keras.Sequential` of two 64-unit ReLU layers followed
    by a single linear output unit.

    Args:
        input_shape: Shape of one input sample without the batch axis. Either
            a bare integer (number of features) or a tuple/list of ints.
        optimizer: Optimizer whose `apply_gradients` is called in `grad_step`.
    """

    def __init__(self, input_shape, optimizer):
        super(RegressorNet, self).__init__()

        # Callers pass a bare int (e.g. `train_set.shape[1]`). Wrap it into a
        # 1-tuple, because not every Keras release accepts a scalar for
        # `Input(shape=...)`.
        if isinstance(input_shape, (int, np.integer)):
            input_shape = (int(input_shape),)

        self.optimizer = optimizer
        self.regressor = tf.keras.Sequential([
            tf.keras.layers.Input(shape=input_shape),
            tf.keras.layers.Dense(64, activation='relu', name='dense_1'),
            tf.keras.layers.Dense(64, activation='relu', name='dense_2'),
            tf.keras.layers.Dense(1, activation='linear', name='dense_out')
        ])

    def summary(self, *args, **kwargs):
        """Print the summary of the wrapped Sequential network.

        Positional and keyword arguments are forwarded to the Sequential's
        `summary`, so calls written against the base `Model.summary`
        signature keep working.
        """
        self.regressor.summary(*args, **kwargs)

    def call(self, X):
        """Return the network output for the batch `X`."""
        return self.regressor(X)

    def get_loss(self, X, y_true):
        """Return the squared error between the predictions for `X` and `y_true`.

        Args:
            X: Batch of inputs.
            y_true: Targets holding exactly one value per prediction. They are
                reshaped to the shape of the predictions, so a flat vector of
                targets is accepted as well as a column.

        Returns:
            The result of `tf.keras.losses.mean_squared_error`, which averages
            over the last axis only (it is not reduced over the batch).

        Raises:
            An error from `tf.reshape` if `y_true` does not contain the same
            number of elements as the predictions.
        """
        y_pred = self(X)
        # The last layer is Dense(1), so predictions end in an axis of length
        # 1. Flat labels would otherwise be broadcast against that axis and
        # yield a wrong loss without any error.
        y_true = tf.reshape(y_true, tf.shape(y_pred))
        l2_loss = tf.keras.losses.mean_squared_error(y_true, y_pred)
        return l2_loss

    def grad_step(self, X, y_true):
        """Apply one optimizer update computed on (`X`, `y_true`).

        Returns:
            A tuple `(loss, gradients)`: the value returned by `get_loss`, and
            the gradients of that loss with respect to
            `self.trainable_variables`, in the same order.
        """
        with tf.GradientTape() as tape:
            loss = self.get_loss(X, y_true)
        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        return loss, gradients

In [None]:
# The regressor takes one input per column of the training matrix.
num_samples, num_features = train_set.shape
optimizer = tf.keras.optimizers.Adam()
model = RegressorNet(input_shape=num_features, optimizer=optimizer)
model.summary()

### Train and output to Tensorboard

In [19]:
def train(model, epochs, X, y, save_dir):
    """Run full-batch gradient steps and log loss and gradients to TensorBoard.

    Args:
        model: Object exposing `grad_step(X, y)`, returning
            `(loss, gradients)`, and a `trainable_variables` sequence.
        epochs: Index of the last step; steps `0..epochs` inclusive are run.
        X: Inputs, handed unchanged to `model.grad_step` on every step.
        y: Array of targets; reshaped into a single column for every step.
        save_dir: Passed to `make_writer` together with the 'summaries' root.
    """
    writer = make_writer('summaries', save_dir)

    for epoch in range(epochs + 1):
        report = epoch % 100 == 0

        if report:
            print('Epoch {} is running...'.format(epoch))

        # Gradient update step; the loss is averaged over its first axis
        # before it is printed and logged.
        step_loss, gradients = model.grad_step(X, y.reshape(-1, 1))
        loss = tf.math.reduce_mean(step_loss, axis=0)

        if report:
            print(f'{loss}')

        # Tensorboard: one scalar for the loss, one histogram per variable
        # for its gradient.
        with writer.as_default():
            tf.summary.scalar('Train loss', loss, step=epoch)

            for variable, gradient in zip(model.trainable_variables, gradients):
                tf.summary.histogram(variable.name, gradient, step=epoch, buckets=1)
       

In [None]:
# Run on the features as they are (no standardization applied); summaries go
# to their own directory so later runs can be compared against this one.
train(
    model,
    epochs=1000,
    X=train_set,
    y=labels,
    save_dir='scaling/regression_not_standard',
)

### Exercise

- Write a function to standardize the data and apply it.
- Train with the new data for 1000 epochs and send the Tensorboard output to a new directory.
- Why does the training depend so much on the scaling?


## Incorrect input to the loss / incorrect loss

- Softmaxed outputs to a loss that expects logits or vice-versa.
- One-hot encoded labels to a sparse categorical cross-entropy loss.
- ReLU in the last layer for regression problems.
- Using the wrong loss for the task, e.g. MSE loss when a categorical loss is expected.

### MNIST example

In [35]:
mnist = tf.keras.datasets.mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Only a small subset of the training images is used. Slice *before* scaling
# so that just these images are converted to floating point, instead of
# materializing a float copy of the whole training set and discarding most of
# it. The resulting values are identical.
NUM_TRAIN_IMAGES = 1000
x_train = (x_train[:NUM_TRAIN_IMAGES] / 255.0).astype('float32')
# Add a channels dim
x_train = tf.expand_dims(x_train, axis=-1)
y_train = y_train[:NUM_TRAIN_IMAGES]

In [36]:
# Pair each image with its label, shuffle with a 1000-element buffer and
# group the samples into batches of 32.
train_ds = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .shuffle(buffer_size=1000)
    .batch(batch_size=32)
)

In [37]:
class MyModel(tf.keras.Model):
    """Small convolutional network with a 10-unit output layer.

    Layers, in order: a 3x3 convolution with 32 filters (ReLU), a flatten, a
    128-unit ReLU dense layer and a 10-unit dense layer. The last layer
    (`dense_out`) is created without an `activation` argument.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = tf.keras.layers.Conv2D(
            filters=32, kernel_size=3, activation='relu', name='conv_1')
        self.flatten = tf.keras.layers.Flatten(name='flatten')
        self.d1 = tf.keras.layers.Dense(units=128, activation='relu', name='dense_1')
        self.d2 = tf.keras.layers.Dense(units=10, name='dense_out')

    def call(self, x):
        """Run `x` through the convolution, the flatten and both dense layers."""
        features = self.flatten(self.conv1(x))
        hidden = self.d1(features)
        return self.d2(hidden)

# Create an instance of the model.
# NOTE(review): this rebinds the notebook-level name `model`, which earlier
# referred to the RegressorNet instance; cells below operate on this MyModel.
model = MyModel()

In [38]:
# Optimizer and loss used by `train_step`; both are constructed with their
# default arguments. The loss takes integer class labels.
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy()

In [39]:
# Running metrics updated on every batch by `train_step`: accuracy against
# integer labels, and the mean of the recorded loss values.
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
train_loss = tf.keras.metrics.Mean(name='train_loss')

In [40]:
def train_step(images, labels):
    """Apply one optimizer update on a batch and record the running metrics.

    Relies on the notebook-level `model`, `loss_object`, `optimizer`,
    `train_loss` and `train_accuracy`.

    Args:
        images: Batch of inputs passed to `model`.
        labels: Targets for the batch, passed to `loss_object` and
            `train_accuracy`.

    Returns:
        The gradients of the batch loss with respect to
        `model.trainable_variables`, in the same order.
    """
    with tf.GradientTape() as tape:
        outputs = model(images)
        batch_loss = loss_object(labels, outputs)

    variables = model.trainable_variables
    grads = tape.gradient(batch_loss, variables)
    optimizer.apply_gradients(zip(grads, variables))

    # Fold this batch into the running metrics.
    train_loss(batch_loss)
    train_accuracy(labels, outputs)
    return grads

In [None]:
EPOCHS = 5
writer = make_writer(os.path.join('summaries'), 'loss_bug/logits_false')

for epoch in range(EPOCHS):

    # Start every epoch with fresh metric state. Newer Keras releases expose
    # `reset_state`, older ones only `reset_states`; use whichever exists so
    # the cell runs on both.
    for metric in (train_loss, train_accuracy):
        reset_metric = getattr(metric, 'reset_state', None) or metric.reset_states
        reset_metric()

    # `gradients` keeps the value from the last batch of the epoch; only
    # those gradients are written to the histograms below.
    for images, labels in train_ds:
        gradients = train_step(images, labels)

    epoch_loss = train_loss.result()
    epoch_accuracy = train_accuracy.result() * 100

    # Tensorboard
    with writer.as_default():
        tf.summary.scalar('Train loss', epoch_loss, step=epoch)
        tf.summary.scalar('Train Accuracy', epoch_accuracy, step=epoch)

        for variable, gradient in zip(model.trainable_variables, gradients):
            # Drop the leading scope component of the variable name. If the
            # name has no '/', nothing is left after the split, so fall back
            # to the full name rather than logging under an empty tag.
            tag = '/'.join(variable.name.split('/')[1:]) or variable.name
            tf.summary.histogram(tag, gradient, step=epoch, buckets=1)

    message = f'Epoch: {epoch + 1}, Loss: {epoch_loss}, Accuracy: {epoch_accuracy}'
    print(message)

### Exercise

- Fix the loss above
- Reinitialize the model and retrain.
- Output to a new Tensorboard directory to compare the results.