# SGD with a Line #

In [1]:
# Hyperparameters for the line-fitting SGD demo in this cell.
NUM_EXAMPLES = 256  # number of synthetic data points generated below
BATCH_SIZE = 8  # examples per mini-batch
EPOCHS = 50  # bounds the animation frames; each frame trains on ONE batch, so these are steps, not epochs
LR = 0.1  # learning rate passed to train()


import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import animation, rc
rc('animation', html='html5')
plt.style.use('seaborn-whitegrid')


# Define model
class Model(object):
    """Linear model ``y = W * x + b`` with trainable ``W`` and ``b``."""

    def __init__(self, w_init=-1.0, b_init=-1.0):
        # Both parameters are TensorFlow variables so they can be
        # differentiated against and updated in place.
        self.W, self.b = tf.Variable(w_init), tf.Variable(b_init)

    def __call__(self, x):
        """Return the model's prediction for input ``x``."""
        scaled = self.W * x
        return scaled + self.b

def loss(target_y, predicted_y):
    """Return the mean squared difference between targets and predictions."""
    residual = target_y - predicted_y
    return tf.reduce_mean(tf.square(residual))

def train(model, inputs, outputs, learning_rate):
    """Apply one gradient-descent step to ``model.W`` and ``model.b`` in place."""
    params = [model.W, model.b]
    # Only the forward pass and the loss are recorded on the tape.
    with tf.GradientTape() as tape:
        batch_loss = loss(outputs, model(inputs))
    grads = tape.gradient(batch_loss, params)
    for param, grad in zip(params, grads):
        param.assign_sub(learning_rate * grad)

# Synthetic data: points scattered around the line y = TRUE_W * x + TRUE_b.
TRUE_W, TRUE_b = 3.0, 2.0
SEED = 3141

inputs = tf.random.normal(shape=[NUM_EXAMPLES], seed=SEED)
noise = tf.random.normal(shape=[NUM_EXAMPLES], seed=SEED + 1)
outputs = inputs * TRUE_W + TRUE_b + noise

# Endless stream of shuffled mini-batches. `ds` ends up bound to the iterator
# itself so that a batch can be pulled with next(ds).
ds = tf.data.Dataset.from_tensor_slices((inputs, outputs))
ds = ds.shuffle(1000, seed=SEED)
ds = ds.batch(BATCH_SIZE)
ds = iter(ds.repeat())

# Model to be trained, created with its default initial parameters.
model = Model()



# Per-frame history filled in by update(): parameter values (Ws, bs), batch
# inputs (xs), batch predictions (ys) and batch losses (ls).
Ws, bs, xs, ys, ls = [], [], [], [], []

# One wide figure holding three side-by-side panels.
fig = plt.figure(dpi=100, figsize=(8, 3))

# Panel 1 - Regression Line: full dataset, current batch, current fitted line.
ax1 = fig.add_subplot(131)
ax1.set_title("Fitted Line")
ax1.set_xlabel("x")
ax1.set_ylabel("y")
ax1.set_xlim(-3, 2.5)
ax1.set_ylim(-8, 11)
p10, = ax1.plot(inputs, outputs, 'r.', alpha=0.1) # full dataset
p11, = ax1.plot([], [], 'C3.') # batch, color Red
p12, = ax1.plot([], [], 'k') # fitted line, color Black

# Panel 2 - Loss: batch loss against the number of batches seen.
ax2 = fig.add_subplot(132)
ax2.set_title("Training Loss")
ax2.set_xlabel("Batches Seen")
ax2.set_xlim(0, EPOCHS)
ax2.set_ylim(0, 40)
p20, = ax2.plot([], [], 'C0') # color Blue

# Panel 3 - Weights: W and b over time, with dashed lines at the true values.
ax3 = fig.add_subplot(133)
ax3.set_title("Weights")
ax3.set_xlabel("Batches Seen")
ax3.set_xlim(0, EPOCHS)
ax3.set_ylim(-2, 4)
ax3.plot(range(EPOCHS), [TRUE_W for _ in range(EPOCHS)], 'C5--')
ax3.plot(range(EPOCHS), [TRUE_b for _ in range(EPOCHS)], 'C8--')
p30, = ax3.plot([], [], 'C5') # W color Brown
p30.set_label('W')
p31, = ax3.plot([], [], 'C8') # b color Green
p31.set_label('b')
ax3.legend()

fig.tight_layout()

def init():
    """Return the artists for the animation's initial frame (the full-data scatter)."""
    initial_artists = [p10]
    return initial_artists

def update(epoch):
    """Animation callback: record the current state, redraw, then take one SGD step.

    Pulls the next mini-batch from ``ds``, appends the current parameters,
    batch, predictions and batch loss to the history lists, refreshes the
    artists of all three panels, and finally applies one gradient-descent
    step with learning rate ``LR``.

    Args:
        epoch: Frame number supplied by ``FuncAnimation``. It is accepted for
            the callback interface but not used: the x-axis of the history
            plots is derived from the history length, so x and y data always
            match even if the animation is rendered more than once.

    Returns:
        Every artist modified by this call, as required when blitting.
    """
    x, y = next(ds)
    y_pred = model(x)
    current_loss = loss(y, y_pred)

    # History is recorded BEFORE the training step, so each frame shows the
    # model that produced the plotted loss.
    Ws.append(model.W.numpy())
    bs.append(model.b.numpy())
    xs.append(x.numpy())
    ys.append(y_pred.numpy())
    ls.append(current_loss.numpy())

    p11.set_data(x.numpy(), y.numpy())
    # A local name is used for the line's x-coordinates so the dataset
    # variable `inputs` is not shadowed.
    line_x = tf.linspace(-3.0, 2.5, 30)
    p12.set_data(line_x, Ws[-1] * line_x + bs[-1])

    steps_seen = range(len(ls))
    p20.set_data(steps_seen, ls)
    p30.set_data(steps_seen, Ws)
    p31.set_data(steps_seen, bs)

    train(model, x, y, learning_rate=LR)

    # With blit=True only returned artists are redrawn, so the weight traces
    # must be included alongside the batch, the line and the loss.
    return p11, p12, p20, p30, p31

# Build the animation: init() supplies the initial frame and update() is
# called once for each frame number in 1 .. EPOCHS-1, 100 ms apart.
ani = animation.FuncAnimation(fig, update, frames=range(1, EPOCHS), init_func=init, blit=True, interval=100)
plt.close()  # presumably so the static figure is not shown in addition to the animation
ani  # last expression of the cell, so the notebook renders the animation
# To convert a rendered mp4 to a gif:
# ffmpeg -i temp.mp4 -filter_complex "[0:v] fps=12,scale=480:-1,split [a][b];[a] palettegen [p];[b][p] paletteuse" temp.gif



RuntimeError: Requested MovieWriter (ffmpeg) not available

<matplotlib.animation.FuncAnimation at 0x7f27fc070d00>

In [2]:
from tensorflow import keras
import tensorflow as tf
from tensorflow.keras import layers

import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import animation, rc
rc('animation', html='html5')
plt.style.use('seaborn-whitegrid')

# NUM_EXAMPLES = 256
# BATCH_SIZE = 8
# STEPS = 50 # actually steps
# LR = 0.1


def animate_sgd(num_examples, batch_size, steps, learning_rate,
                true_w=3.0, true_b=2.0, seed=0):
    """Build an animation of mini-batch SGD fitting a line to noisy data.

    Data is generated as ``y = true_w * x + true_b + noise`` with ``x`` and
    ``noise`` drawn from standard normal distributions. Each animation frame
    pulls one mini-batch, records the current parameters and batch loss,
    redraws the three panels (fitted line, training loss, weights) and then
    applies one gradient-descent step.

    Args:
        num_examples: Number of synthetic data points.
        batch_size: Number of examples per gradient step.
        steps: Upper bound of the frame range. Frames run over
            ``range(1, steps)``, so ``steps - 1`` updates happen per rendering.
        learning_rate: Step size of the gradient-descent update.
        true_w: Slope of the generating line.
        true_b: Intercept of the generating line.
        seed: Operation-level seed used for data generation and shuffling.

    Returns:
        The ``matplotlib.animation.FuncAnimation`` object. ``plt.close()`` is
        called before returning.
    """
    # Define model
    class Model(object):
        """Linear model ``W * x + b`` with trainable ``W`` and ``b``."""

        def __init__(self, w_init=-1.0, b_init=-1.0):
            self.W = tf.Variable(w_init)
            self.b = tf.Variable(b_init)

        def __call__(self, x):
            return self.W * x + self.b

    def loss(target_y, predicted_y):
        """Mean squared error."""
        return tf.reduce_mean(tf.square(target_y - predicted_y))

    def train(model, inputs, outputs, learning_rate):
        """Apply one gradient-descent step to the model's parameters."""
        with tf.GradientTape() as t:
            current_loss = loss(outputs, model(inputs))
        # The gradient and the updates run outside the tape context so they
        # are not themselves recorded on the tape.
        dW, db = t.gradient(current_loss, [model.W, model.b])
        model.W.assign_sub(learning_rate * dW)
        model.b.assign_sub(learning_rate * db)

    # Data
    inputs = tf.random.normal(shape=[num_examples], seed=seed)
    noise = tf.random.normal(shape=[num_examples], seed=seed+1)
    outputs = inputs * true_w + true_b + noise
    ds = (tf.data.Dataset
          .from_tensor_slices((inputs, outputs))
          .shuffle(1000, seed=seed)
          .batch(batch_size)
          .repeat())
    ds = iter(ds)
    model = Model()
    # Per-frame history: parameters, batch inputs, predictions and losses.
    Ws, bs, xs, ys, ls = [], [], [], [], []
    # Construct plot
    fig = plt.figure(dpi=100, figsize=(8, 3))

    # Regression Line
    ax1 = fig.add_subplot(131)
    ax1.set_title("Fitted Line")
    ax1.set_xlabel("x")
    ax1.set_ylabel("y")
    ax1.set_xlim(-3, 2.5)
    ax1.set_ylim(-8, 11)
    p10, = ax1.plot(inputs, outputs, 'r.', alpha=0.1)  # full dataset
    p11, = ax1.plot([], [], 'C3.')  # current batch
    p12, = ax1.plot([], [], 'k')  # fitted line

    # Loss
    ax2 = fig.add_subplot(132)
    ax2.set_title("Training Loss")
    ax2.set_xlabel("Batches Seen")
    ax2.set_xlim(0, steps)
    ax2.set_ylim(0, 40)
    p20, = ax2.plot([], [], 'C0')

    # Weights, with dashed reference lines at the true values
    ax3 = fig.add_subplot(133)
    ax3.set_title("Weights")
    ax3.set_xlabel("Batches Seen")
    ax3.set_xlim(0, steps)
    ax3.set_ylim(-2, 4)
    ax3.plot(range(steps), [true_w] * steps, 'C5--')
    ax3.plot(range(steps), [true_b] * steps, 'C8--')
    p30, = ax3.plot([], [], 'C5')
    p30.set_label('W')
    p31, = ax3.plot([], [], 'C8')
    p31.set_label('b')
    ax3.legend()

    fig.tight_layout()

    def init():
        return [p10]

    def update(epoch):
        x, y = next(ds)
        y_pred = model(x)
        current_loss = loss(y, y_pred)

        # Record the state before the training step of this frame.
        Ws.append(model.W.numpy())
        bs.append(model.b.numpy())
        xs.append(x.numpy())
        ys.append(y_pred.numpy())
        ls.append(current_loss.numpy())
        p11.set_data(x.numpy(), y.numpy())
        line_x = tf.linspace(-3.0, 2.5, 30)
        p12.set_data(line_x, Ws[-1] * line_x + bs[-1])
        # The x-axis follows the history length rather than the frame number,
        # so x and y stay the same length if the animation is rendered again.
        steps_seen = range(len(ls))
        p20.set_data(steps_seen, ls)
        p30.set_data(steps_seen, Ws)
        p31.set_data(steps_seen, bs)

        train(model, x, y, learning_rate=learning_rate)

        # With blit=True only the returned artists are redrawn, so the weight
        # traces have to be returned as well.
        return p11, p12, p20, p30, p31

    ani = animation.FuncAnimation(
        fig,
        update,
        frames=range(1, steps),
        init_func=init,
        blit=True,
        interval=100,
    )
    plt.close()
    return ani
    
# Convert mp4 to gif:
# ffmpeg -i temp.mp4 -filter_complex "[0:v] fps=12,scale=480:-1,split [a][b];[a] palettegen [p];[b][p] paletteuse" temp.gif

In [3]:
# Large-batch run: every batch holds half of the dataset (4096 of 8192
# examples) and the learning rate is 0.99.
animate_sgd(
    num_examples=8192,
    batch_size=4096,
    steps=50,
    learning_rate=0.99,
)

# SGD with a Quadratic #

In [3]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import numpy as np

import matplotlib.pyplot as plt
from matplotlib import animation, rc
rc('animation', html='html5')
plt.style.use('seaborn-whitegrid')

# Parameters
TRUE_W0 = 3.0  # coefficient of x**2 in the target
TRUE_W1 = 2.0  # linear coefficient; not used by the active target definition
TRUE_b = 1.0  # intercept; not used by the active target definition
NUM_EXAMPLES = 1024  # number of synthetic data points
BATCH_SIZE = 64  # examples per mini-batch
EPOCHS = 150  # bounds the animation frames; each frame trains on one batch (steps, not epochs)
LR = 0.005  # learning rate of the SGD optimizer
SEED = 3141  # operation-level seed for data generation and shuffling

# Small fully connected network: three 8-unit ReLU layers followed by a
# single linear output, trained with plain SGD on mean squared error.
model = keras.Sequential([
    *(layers.Dense(8, activation='relu') for _ in range(3)),
    layers.Dense(1),
])
model.compile(loss='mse', optimizer=keras.optimizers.SGD(LR))


# Data: noisy samples of y = TRUE_W0 * x**2, with x as a single feature column.
# (A full quadratic, TRUE_W0 * x**2 + TRUE_W1 * x + TRUE_b + noise, is not
# used here; only the squared term is active.)
inputs = tf.random.normal(shape=(NUM_EXAMPLES, 1), seed=SEED)
noise = tf.random.normal(shape=(NUM_EXAMPLES, 1), seed=SEED + 1)
outputs = tf.squeeze(TRUE_W0 * inputs ** 2 + noise)

# Endless stream of shuffled mini-batches; `ds` is rebound to its iterator.
ds = tf.data.Dataset.from_tensor_slices((inputs, outputs))
ds = ds.repeat().shuffle(1000, seed=SEED).batch(BATCH_SIZE)
ds = iter(ds)


# History containers filled in by update(). Only xs (batch inputs), ys (batch
# predictions) and ls (batch losses) are appended to in this cell; Ws and bs
# are created but stay empty.
Ws, bs, xs, ys, ls = [], [], [], [], []

# Create Figure: one wide figure with two side-by-side panels.
fig = plt.figure(dpi=150, figsize=(8, 3))

# Regression Curve: full dataset, current batch and current fitted curve.
ax1 = fig.add_subplot(121)
ax1.set_title("Fitted Curve")
ax1.set_xlabel("x")
ax1.set_ylabel("y")
ax1.set_xlim(-3, 3)
ax1.set_ylim(-5, 20)
p10, = ax1.plot(inputs, outputs, 'r.', alpha=0.1) # full dataset
p11, = ax1.plot([], [], 'C3.') # batch
p12, = ax1.plot([], [], 'k') # fitted line

# Loss: batch loss against the number of batches seen.
ax2 = fig.add_subplot(122)
ax2.set_title("Training Loss")
ax2.set_xlabel("Batches Seen")
ax2.set_xlim(0, EPOCHS)
ax2.set_ylim(0, 40)
p20, = ax2.plot([], [], 'C0')


fig.tight_layout()

def init():
    """Return the artists for the animation's initial frame (the full-data scatter)."""
    initial_artists = [p10]
    return initial_artists

def update(epoch):
    """Animation callback: record the batch loss, redraw, then train on one batch.

    Args:
        epoch: Frame number supplied by ``FuncAnimation``. Unused: the loss
            curve's x-axis is derived from the history length so that x and y
            data always have matching lengths.

    Returns:
        The artists modified by this call (batch points, fitted curve, loss).
    """
    x, y = next(ds)
    y_pred = model(x)
    # verbose=0: without it Keras prints a progress bar for every frame.
    current_loss = model.evaluate(x, y, verbose=0)

    # Flattened copies are used for plotting only. The model keeps receiving
    # the (batch, 1) tensor it was first called with.
    x_flat = tf.reshape(x, [-1]).numpy()
    y_flat = tf.reshape(y, [-1]).numpy()

    xs.append(x_flat)
    ys.append(tf.reshape(y_pred, [-1]).numpy())
    ls.append(current_loss)
    p11.set_data(x_flat, y_flat)

    # Evaluate the current curve on a fixed grid, fed as a feature column.
    curve_x = tf.linspace(-3.0, 3.0, 30)
    curve_y = model.predict(tf.reshape(curve_x, (-1, 1)), verbose=0)
    p12.set_data(curve_x.numpy(), curve_y.reshape(-1))
    p20.set_data(range(len(ls)), ls)

    model.train_on_batch(x, y)

    return p11, p12, p20

# Build the animation: init() supplies the initial frame and update() is
# called once for each frame number in 1 .. EPOCHS-1, 100 ms apart.
ani = animation.FuncAnimation(fig, update, frames=range(1, EPOCHS), init_func=init, blit=True, interval=100)
plt.close()  # presumably so the static figure is not shown in addition to the animation
ani  # last expression of the cell, so the notebook renders the animation
# Optional export as a gif via ImageMagick (machine-specific path, kept disabled):
# ani.save('/home/ronen/Downloads/anim-anim.gif', dpi=80, writer='imagemagick')

# ffmpeg -i temp.mp4 -filter_complex "[0:v],split [a][b];[a] palettegen [p];[b][p] paletteuse" temp.gif

RuntimeError: Requested MovieWriter (ffmpeg) not available

<matplotlib.animation.FuncAnimation at 0x7f27e01277c0>

## With Function ##

In [4]:
import numpy as np

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import matplotlib.pyplot as plt
from matplotlib import animation, rc
rc('animation', html='html5')
plt.style.use('seaborn-whitegrid')


def animate_curve_fitting(model,
                             X, y,
                             batch_size=64,
                             epochs=16,
                             lr=0.005,
                             shuffle_buffer=5000,
                             seed=0,
                             verbose=1):
    """Train ``model`` on ``(X, y)`` and animate the fitted curve per batch.

    The model is trained with ``model.fit``. A callback fires at the start of
    every training batch and stores the model's current prediction over an
    evenly spaced grid spanning the range of ``X``, plus one mini-batch to
    highlight. The stored snapshots are then replayed as an animation.

    Args:
        model: A compiled Keras model.
        X: Inputs; must provide ``.shape``, ``.min()`` and ``.max()``
            (e.g. a NumPy array).
        y: Targets; must provide ``.min()`` and ``.max()``.
        batch_size: Examples per training batch.
        epochs: Number of epochs passed to ``model.fit``.
        lr: Unused. The learning rate is whatever ``model`` was compiled
            with; the parameter is kept for backward compatibility.
        shuffle_buffer: Buffer size of the dataset shuffle.
        seed: Seed of the dataset shuffle.
        verbose: Verbosity passed to ``model.fit``.

    Returns:
        A ``matplotlib.animation.FuncAnimation`` with one frame per recorded
        training batch. ``plt.close()`` is called before returning.
    """
    num_examples = X.shape[0]
    steps_per_epoch = num_examples // batch_size

    # No .cache() here: the data already lives in memory, and caching AFTER
    # .repeat() would try to cache an endless dataset.
    ds = (tf.data.Dataset
          .from_tensor_slices((X, y))
          .repeat()
          .shuffle(shuffle_buffer, seed=seed)
          .batch(batch_size))
    # NOTE(review): this is a separate iterator from the one model.fit creates
    # internally, so the highlighted batches are not guaranteed to be the
    # batches actually trained on - confirm whether that matters.
    ds_iter = ds.as_numpy_iterator()

    x_min = X.min()
    x_max = X.max()
    X_pop = np.linspace(x_min, x_max, 1000)
    y_min = y.min()
    y_max = y.max()

    # Snapshots recorded during training, one entry per training batch.
    xs = []
    ys = []
    curves = []

    # Callback to save the highlighted batch and the current fitted curve
    def save_params(batch, logs):
        batch_x, batch_y = next(ds_iter)
        xs.append(batch_x.squeeze())
        ys.append(batch_y.squeeze())
        # verbose=0: otherwise a progress bar is printed for every batch,
        # even when the caller asked for a quiet fit.
        curves.append(model.predict(X_pop, verbose=0))

    save_params_cb = keras.callbacks.LambdaCallback(
        on_batch_begin=save_params,
    )

    # Train model to collect snapshots
    model.fit(
        ds,
        epochs=epochs,
        callbacks=[save_params_cb],
        steps_per_epoch=steps_per_epoch,
        verbose=verbose,
    )

    # Create Figure
    fig = plt.figure(dpi=150, figsize=(4, 3))
    # Regression Curve
    ax1 = fig.add_subplot(111)
    ax1.set_title("Fitted Curve")
    ax1.set_xlabel("x")
    ax1.set_ylabel("y")
    ax1.set_xlim(x_min, x_max)
    ax1.set_ylim(y_min, y_max)
    p10, = ax1.plot(X, y, 'r.', alpha=0.1)  # full dataset
    p11, = ax1.plot([], [], 'C3.')  # batch
    p12, = ax1.plot([], [], 'k')  # fitted line
    # Complete Figure
    fig.tight_layout()

    def init():
        return [p10]

    def update(frame):
        p11.set_data(xs[frame], ys[frame])
        p12.set_data(X_pop, curves[frame])
        return p11, p12

    # Start at frame 0 so the snapshot taken before the first weight update
    # is shown too, and size the range by what was actually recorded.
    ani = animation.FuncAnimation(
        fig,
        update,
        frames=range(len(curves)),
        init_func=init,
        blit=True,
        interval=100,
    )
    plt.close()

    return ani

In [5]:
# Experiment: ReLU network trained with Adam on an exponentially decaying
# learning rate. Data are noisy samples of y = 2 * x**2.
X = np.random.normal(0.0, 1.0, size=256)
err = np.random.normal(0.0, 1.0, size=256)
y = 2 * np.square(X) + err

model = keras.Sequential(
    [layer
     for width in (8, 16, 8)
     for layer in (layers.Dense(width), layers.Activation('relu'))]
    + [layers.Dense(1)]
)
initial_learning_rate = 0.1
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate, decay_steps=2, decay_rate=0.96, staircase=False)
model.compile(loss='mse', optimizer=keras.optimizers.Adam(lr_schedule))

ani = animate_curve_fitting(model, X, y, batch_size=32, epochs=32, verbose=0)
plt.close()
ani.save('/kaggle/working/adam-exp-lr.mp4')
ani

# Convert mp4 to gif:
# ffmpeg -i temp.mp4 -filter_complex "[0:v],split [a][b];[a] palettegen [p];[b][p] paletteuse" temp.gif

MovieWriter ffmpeg unavailable; using Pillow instead.


ValueError: unknown file extension: .mp4

In [None]:
# Experiment: ReLU network trained with Adam at a constant learning rate of
# 0.01. Data are noisy samples of y = 2 * x**2.
X = np.random.normal(0.0, 1.0, size=256)
err = np.random.normal(0.0, 1.0, size=256)
y = 2 * np.square(X) + err

model = keras.Sequential(
    [layer
     for width in (8, 16, 8)
     for layer in (layers.Dense(width), layers.Activation('relu'))]
    + [layers.Dense(1)]
)
model.compile(loss='mse', optimizer=keras.optimizers.Adam(0.01))

ani = animate_curve_fitting(model, X, y, batch_size=32, epochs=32, verbose=0)
plt.close()
ani.save('/kaggle/working/adam.mp4')
ani

# Convert mp4 to gif:
# ffmpeg -i temp.mp4 -filter_complex "[0:v],split [a][b];[a] palettegen [p];[b][p] paletteuse" temp.gif

In [None]:
# Experiment: ReLU network trained with the default 'sgd' optimizer.
# Data are noisy samples of y = 2 * x**2.
X = np.random.normal(0.0, 1.0, size=256)
err = np.random.normal(0.0, 1.0, size=256)
y = 2 * np.square(X) + err

model = keras.Sequential(
    [layer
     for width in (8, 16, 8)
     for layer in (layers.Dense(width), layers.Activation('relu'))]
    + [layers.Dense(1)]
)
model.compile(loss='mse', optimizer='sgd')

ani = animate_curve_fitting(model, X, y, batch_size=32, epochs=32, verbose=0)
plt.close()
ani.save('/kaggle/working/sgd.mp4')
ani

# Convert mp4 to gif:
# ffmpeg -i temp.mp4 -filter_complex "[0:v],split [a][b];[a] palettegen [p];[b][p] paletteuse" temp.gif

In [None]:
# Experiment: ELU network trained with Adam on an exponentially decaying
# learning rate. Data are noisy samples of y = 2 * x**2.
X = np.random.normal(0.0, 1.0, size=256)
err = np.random.normal(0.0, 1.0, size=256)
y = 2 * np.square(X) + err

model = keras.Sequential(
    [layer
     for width in (8, 16, 8)
     for layer in (layers.Dense(width), layers.Activation('elu'))]
    + [layers.Dense(1)]
)
initial_learning_rate = 0.1
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate, decay_steps=2, decay_rate=0.96, staircase=False)
model.compile(loss='mse', optimizer=keras.optimizers.Adam(lr_schedule))

ani = animate_curve_fitting(model, X, y, batch_size=32, epochs=32, verbose=0)
plt.close()
ani.save('/kaggle/working/adam-elu.mp4')
ani

# Convert mp4 to gif:
# ffmpeg -i temp.mp4 -filter_complex "[0:v],split [a][b];[a] palettegen [p];[b][p] paletteuse" temp.gif

In [None]:
# Experiment: swish network trained with Adam on an exponentially decaying
# learning rate. Data are noisy samples of y = 2 * x**2.
X = np.random.normal(0.0, 1.0, size=256)
err = np.random.normal(0.0, 1.0, size=256)
y = 2 * np.square(X) + err

model = keras.Sequential(
    [layer
     for width in (8, 16, 8)
     for layer in (layers.Dense(width), layers.Activation('swish'))]
    + [layers.Dense(1)]
)
initial_learning_rate = 0.1
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate, decay_steps=2, decay_rate=0.96, staircase=False)
model.compile(loss='mse', optimizer=keras.optimizers.Adam(lr_schedule))

ani = animate_curve_fitting(model, X, y, batch_size=32, epochs=32, verbose=0)
plt.close()
ani.save('/kaggle/working/adam-swish.mp4')
ani

# Convert mp4 to gif:
# ffmpeg -i temp.mp4 -filter_complex "[0:v],split [a][b];[a] palettegen [p];[b][p] paletteuse" temp.gif

In [None]:
# Experiment: tanh network trained with Adam on an exponentially decaying
# learning rate. Data are noisy samples of y = 2 * x**2.
X = np.random.normal(0.0, 1.0, size=256)
err = np.random.normal(0.0, 1.0, size=256)
y = 2 * np.square(X) + err

model = keras.Sequential(
    [layer
     for width in (8, 16, 8)
     for layer in (layers.Dense(width), layers.Activation('tanh'))]
    + [layers.Dense(1)]
)
initial_learning_rate = 0.1
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate, decay_steps=2, decay_rate=0.96, staircase=False)
model.compile(loss='mse', optimizer=keras.optimizers.Adam(lr_schedule))

ani = animate_curve_fitting(model, X, y, batch_size=32, epochs=32, verbose=0)
plt.close()
ani.save('/kaggle/working/adam-tanh.mp4')
ani

# Convert mp4 to gif:
# ffmpeg -i temp.mp4 -filter_complex "[0:v],split [a][b];[a] palettegen [p];[b][p] paletteuse" temp.gif

In [None]:
# Experiment: ReLU network trained with RMSprop on an exponentially decaying
# learning rate. Data are noisy samples of y = 2 * x**2.
X = np.random.normal(0.0, 1.0, size=256)
err = np.random.normal(0.0, 1.0, size=256)
y = 2 * np.square(X) + err

model = keras.Sequential(
    [layer
     for width in (8, 16, 8)
     for layer in (layers.Dense(width), layers.Activation('relu'))]
    + [layers.Dense(1)]
)
initial_learning_rate = 0.1
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate, decay_steps=2, decay_rate=0.96, staircase=False)
model.compile(loss='mse', optimizer=keras.optimizers.RMSprop(lr_schedule))

ani = animate_curve_fitting(model, X, y, batch_size=32, epochs=32, verbose=0)
plt.close()
ani.save('/kaggle/working/rmsprop-exp-lr.mp4')
ani

# Convert mp4 to gif:
# ffmpeg -i temp.mp4 -filter_complex "[0:v],split [a][b];[a] palettegen [p];[b][p] paletteuse" temp.gif

In [None]:
# Experiment: a much wider ReLU network (three 1024-unit layers) trained with
# Adam at a constant learning rate of 0.01. Data are noisy samples of
# y = 2 * x**2.
X = np.random.normal(loc=0.0, scale=1.0, size=256)
err = np.random.normal(loc=0.0, scale=1.0, size=256)
y = 2 * np.square(X) + err

model = keras.Sequential([
    layers.Dense(1024),
    layers.Activation('relu'),
    layers.Dense(1024),
    layers.Activation('relu'),
    layers.Dense(1024),
    layers.Activation('relu'),
    layers.Dense(1)
])
# The optimizer uses a fixed learning rate, so no decay schedule is built
# here (the previously constructed ExponentialDecay object was never used).
model.compile(
    optimizer=keras.optimizers.Adam(0.01),
    loss='mse',
)

ani = animate_curve_fitting(model, X, y, batch_size=32, epochs=32, verbose=0)
plt.close()
ani.save('/kaggle/working/adam-big.mp4')
ani

# Convert mp4 to gif:
# ffmpeg -i temp.mp4 -filter_complex "[0:v],split [a][b];[a] palettegen [p];[b][p] paletteuse" temp.gif

In [None]:
# Experiment: wide ReLU network with 50% dropout after every hidden layer,
# trained with Adam on an exponentially decaying learning rate. Data are
# noisy samples of y = 2 * x**2.
X = np.random.normal(0.0, 1.0, size=256)
err = np.random.normal(0.0, 1.0, size=256)
y = 2 * np.square(X) + err

model = keras.Sequential(
    [layer
     for width in (256, 512, 256)
     for layer in (layers.Dense(width),
                   layers.Activation('relu'),
                   layers.Dropout(0.5))]
    + [layers.Dense(1)]
)
initial_learning_rate = 0.1
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate, decay_steps=2, decay_rate=0.96, staircase=False)
model.compile(loss='mse', optimizer=keras.optimizers.Adam(lr_schedule))

ani = animate_curve_fitting(model, X, y, batch_size=32, epochs=32, verbose=0)
plt.close()
ani.save('/kaggle/working/adam-dropout.mp4')
ani

# Convert mp4 to gif:
# ffmpeg -i temp.mp4 -filter_complex "[0:v],split [a][b];[a] palettegen [p];[b][p] paletteuse" temp.gif

# Animated Loss Surface #

In [3]:
def true_model(W, b):
    """Return a function that samples noisy outputs of the line ``W * x + b``.

    The returned function draws fresh standard-normal noise on every call,
    adds it to ``inputs * W + b`` and squeezes the result.
    """
    def outputs(inputs):
        noise = tf.random.normal(shape=inputs.shape)
        clean = inputs * W + b
        return tf.squeeze(clean + noise)
    return outputs

def loss_grid(X, y, w_space, b_space):
    """Evaluate the mean squared error of ``w * x + b`` over a (w, b) grid.

    Args:
        X: Inputs of shape ``(batch, 1)``. The single feature column is
            implied by the two-row parameter stack it is contracted with.
        y: Targets with ``batch`` elements.
        w_space: 1-D grid of weight values.
        b_space: 1-D grid of bias values.

    Returns:
        Tensor of shape ``(len(b_space), len(w_space))`` whose entry
        ``[i, j]`` is the loss at ``w = w_space[j]``, ``b = b_space[i]``.
    """
    def bias_augment(X):
        """Augment matrix X with bias column."""
        # Append one extra row [0, 1] and a column that is zero everywhere
        # except in that row. Contracting with the (w, b) stack then gives
        # w * x for the data rows and b for the extra row.
        aug = tf.constant([0]*X.shape[0] + [1],
                          shape=(X.shape[0]+1, 1),
                          dtype=tf.float32)
        X = tf.concat([X, tf.zeros(shape=(1, X.shape[1]))],
                      axis=0)
        X = tf.concat([X, aug], axis=1)
        return X

    # Shape (2, 1, len(b_space), len(w_space)): the weight grid and bias grid.
    W = tf.expand_dims(
        tf.stack(tf.meshgrid(w_space, b_space)),
        axis=1)
    Xb = bias_augment(X)
    WX = tf.einsum("mnij,bm->bnij", W, Xb)
    b = WX[-1]
    # Remove only the singleton axis introduced by expand_dims. A blanket
    # squeeze would also drop a length-1 batch or grid axis and break the
    # broadcasting below.
    WX = tf.squeeze(WX[:-1], axis=1)
    y_pred = WX + b

    batch_size = X.shape[0]
    loss = tf.reduce_mean(
        tf.square(tf.reshape(y, (batch_size, 1, 1)) - y_pred),
        axis=0)

    return loss

def plot_loss_surface(w_space, b_space, loss, levels=tf.square(range(0, 8)), dpi=150):
    """Draw a labelled contour plot of a loss grid over the (weight, bias) plane.

    Args:
        w_space: 1-D weight values; plotted along the x-axis.
        b_space: 1-D bias values; plotted along the y-axis.
        loss: Loss grid of shape ``(len(b_space), len(w_space))``.
        levels: Contour levels. Defaults to the squares 0, 1, 4, ..., 49.
        dpi: Resolution of the created figure.

    Returns:
        The artists from ``CS.legend_elements()`` for the drawn contour set.

    The axis limits come from the module-level ``W_MIN``/``W_MAX`` and
    ``b_MIN``/``b_MAX`` constants, which must be defined before calling.
    """
    fig = plt.figure(dpi=dpi)
    ax = fig.add_subplot(111)
    # contour(w_space, b_space, ...) puts the weight on the x-axis and the
    # bias on the y-axis, so the limits and labels follow that orientation.
    ax.set_xlim(W_MIN, W_MAX)
    ax.set_ylim(b_MIN, b_MAX)
    ax.set_aspect('equal')
    ax.set_xlabel('Weight')
    ax.set_ylabel('Bias')
    ax.set_title('The Loss Surface')
    CS = ax.contour(w_space, b_space, loss,
                    levels=levels,
                    cmap='gray')
    ax.clabel(CS, inline=True, fontsize=8, fmt="%1d")
    artists, _ = CS.legend_elements()
    return artists



# ## Define Linear Model ##

# Define model
class Model(object):
    """Linear model ``W * x + b`` whose parameters are shape-(1,) float32 variables."""

    def __init__(self, w_init=0.0, b_init=0.0):
        def as_param(value):
            # Reshape the initial value to a one-element vector first.
            return tf.Variable(tf.reshape(value, shape=(1,)), dtype=tf.float32)

        self.W = as_param(w_init)
        self.b = as_param(b_init)

    def __call__(self, x):
        """Return the model's prediction for input ``x``."""
        scaled = self.W * x
        return scaled + self.b

def loss_fn(target_y, predicted_y):
    """Return the mean squared difference between targets and predictions."""
    residual = target_y - predicted_y
    return tf.reduce_mean(tf.square(residual))

def train(model, inputs, outputs, learning_rate):
    """Apply one gradient-descent step to ``model.W`` and ``model.b`` in place.

    Args:
        model: Object exposing variables ``W`` and ``b`` and callable on inputs.
        inputs: Batch of inputs.
        outputs: Batch of targets.
        learning_rate: Step size of the update.
    """
    with tf.GradientTape() as t:
        current_loss = loss_fn(outputs, model(inputs))
    # Only the forward pass belongs on the tape. The gradient and the
    # variable updates run after the context has closed.
    dW, db = t.gradient(current_loss, [model.W, model.b])
    model.W.assign_sub(learning_rate * dW)
    model.b.assign_sub(learning_rate * db)


# ## Data ##
# Constants of the generating line, dataset and batch sizes, learning rate.
TRUE_W = 3.0
TRUE_b = 2.0
NUM_EXAMPLES = 32
BATCH_SIZE = 4
LEARNING_RATE = 0.05

# Extent of the (weight, bias) region over which the loss is evaluated.
W_MIN = -3.0
W_MAX = 8.0
b_MIN = -3.0
b_MAX = 8.0

# One column of standard-normal inputs and noisy targets from true_model.
X = tf.random.normal(shape=[NUM_EXAMPLES, 1])
y = tf.squeeze(true_model(TRUE_W, TRUE_b)(X))
# 128 evenly spaced grid values along each parameter axis.
w_space = tf.linspace(W_MIN, W_MAX, 128)
b_space = tf.linspace(b_MIN, b_MAX, 128)

# Loss at every grid point. Note that this rebinds the module-level name
# `loss` to a tensor.
loss = loss_grid(X, y, w_space, b_space)

# ## Data Pipeline ##
inputs = tf.squeeze(X)
# Train on the same targets the loss surface was computed from. Calling
# true_model again would draw fresh noise, so the optimizer would descend a
# different surface from the one being plotted.
outputs = y
# Endless stream of shuffled mini-batches; `ds` is rebound to its iterator.
ds = (tf.data.Dataset
      .from_tensor_slices((inputs, outputs))
      .shuffle(1000)
      .batch(BATCH_SIZE)
      .repeat())
ds = iter(ds)

model = Model(w_init=-1.0, b_init=-1.0)

# Per-frame history filled in by update(): parameters, batch inputs,
# predictions and batch losses.
Ws, bs, xs, ys, ls = [], [], [], [], []

fig = plt.figure(dpi=150)
ax = fig.add_subplot(111)
# contour(w_space, b_space, ...) and the (W, b) points plotted below put the
# weight on the x-axis and the bias on the y-axis, so the limits and labels
# follow that orientation.
ax.set_xlim(W_MIN, W_MAX)
ax.set_ylim(b_MIN, b_MAX)
ax.set_aspect('equal')
ax.set_xlabel('Weight')
ax.set_ylabel('Bias')
ax.set_title('The Loss Surface')
levels = tf.square(range(0, 8))
CS = ax.contour(w_space, b_space, loss,
                levels=levels,
                cmap='bone')
ax.clabel(CS, inline=True, fontsize=8, fmt="%1d")

p1, = ax.plot([TRUE_W], [TRUE_b], 'kx')  # true parameters
p2, = ax.plot([], [], color='red', alpha=0.5)  # trajectory so far
p3, = ax.plot([], [], 'r.')  # current parameters

def init():
    """Return the artists for the animation's initial frame (the true-parameter marker)."""
    initial_artists = (p1,)
    return initial_artists

def update(epoch):
    """Animation callback: record the current parameters, redraw, take one SGD step.

    Args:
        epoch: Frame number supplied by ``FuncAnimation`` (unused).

    Returns:
        The artists modified by this call: the trajectory line and the
        current-position marker.
    """
    x, y = next(ds)
    y_pred = model(x)
    current_loss = loss_fn(y, y_pred)

    # Record the state before this frame's training step.
    Ws.append(model.W.numpy())
    bs.append(model.b.numpy())
    xs.append(x.numpy())
    ys.append(y_pred.numpy())
    ls.append(current_loss.numpy())
    p2.set_data(Ws, bs)
    p3.set_data(Ws[-1], bs[-1])

    # Use the configured constant rather than a duplicated literal.
    train(model, x, y, learning_rate=LEARNING_RATE)

    # With blit=True only returned artists are redrawn, so the trajectory
    # line must be returned together with the marker.
    return p2, p3

# Build the animation: init() supplies the initial frame and update() is
# called once for each frame number in 1 .. 127, 100 ms apart.
ani = animation.FuncAnimation(fig, update, frames=range(1, 128), interval=100, init_func=init, blit=True)
plt.close()  # presumably so the static figure is not shown in addition to the animation
ani  # last expression of the cell, so the notebook renders the animation

# Random Slice from Loss Surface #

In [None]:
def make_random_vector(vc):
    """Build a 2-D random slice through weight space centred on ``vc``.

    Two random directions are drawn, each a list of standard-normal tensors
    with the same shapes as the tensors in ``vc``.

    Args:
        vc: List of weight tensors/arrays marking the centre of the slice.

    Returns:
        A function ``coord_vec(x, y)`` returning the weight list
        ``x * v0 + y * v1 + vc``.
    """
    # NOTE(review): an earlier version computed the standard deviation of
    # every tensor in `vc` but never used it (the draws always used
    # stddev=1.0). Scaling each direction by its tensor's spread may have
    # been intended - confirm before changing the distribution.
    v0 = [tf.random.normal(shape=w.shape, stddev=1.0) for w in vc]
    v1 = [tf.random.normal(shape=w.shape, stddev=1.0) for w in vc]

    def coord_vec(x, y):
        return [x * w0 + y * w1 + wc
                for w0, w1, wc in zip(v0, v1, vc)]
    return coord_vec


def make_random_loss_grid(x_coord, y_coord, model, inputs, outputs):
    """Evaluate the model's loss on a random 2-D slice through weight space.

    Args:
        x_coord: Coordinates along the first random direction.
        y_coord: Coordinates along the second random direction.
        model: Compiled Keras model; its weights are temporarily replaced.
        inputs: Inputs of the batch the loss is evaluated on.
        outputs: Targets of that batch.

    Returns:
        NumPy array of shape ``(len(x_coord), len(y_coord))`` where entry
        ``[i, j]`` is the loss at ``(x_coord[i], y_coord[j])``. Rows follow
        ``x_coord``, so transpose it before handing it to a routine that
        expects rows to follow the y-axis.
    """
    vc = model.get_weights()
    coord_vec = make_random_vector(vc)

    grid = np.empty([len(x_coord), len(y_coord)])
    try:
        for i, x in enumerate(x_coord):
            for j, y in enumerate(y_coord):
                model.set_weights(coord_vec(x, y))
                grid[i, j] = model.test_on_batch(
                    inputs, outputs, return_dict=True)['loss']
    finally:
        # Always put the original weights back, even if evaluation fails
        # part-way through the grid.
        model.set_weights(vc)

    return grid


# # Data #
# Lightly noisy samples of the line y = TRUE_W * x + TRUE_b (noise stddev 0.1).
TRUE_W, TRUE_b = 3.0, 2.0
NUM_EXAMPLES, BATCH_SIZE = 1024, 64
x = tf.random.normal(shape=(NUM_EXAMPLES, 1))
y = tf.squeeze(
    TRUE_W * x + TRUE_b + tf.random.normal(shape=x.shape, stddev=0.1))
ds = tf.data.Dataset.from_tensor_slices((x, y))
ds = ds.repeat().shuffle(1000).batch(BATCH_SIZE)

# Model: two 32-unit ReLU layers and a linear output, trained with the
# default 'sgd' optimizer for 100 epochs of NUM_EXAMPLES // BATCH_SIZE steps.
model = keras.Sequential(
    [layers.Dense(32, activation='relu') for _ in range(2)]
    + [layers.Dense(1)]
)
model.compile(optimizer='sgd', loss='mse')
model.fit(
    ds,
    epochs=100,
    steps_per_epoch=NUM_EXAMPLES // BATCH_SIZE,
    verbose=0,
)


# # Loss Grid #

# Create Loss Grid
# Re-batch the stream into one batch of SIZE examples to evaluate the loss on.
SIZE = NUM_EXAMPLES
ds_iter = iter(ds.unbatch().batch(SIZE))
inputs, outputs = next(ds_iter)

ZOOM = 0.25
POINTS = 24  # not used by the geometric grid below
NUM_LEVELS = 20

# Geometrically spaced coordinates, dense near the trained weights (the
# origin of the slice) and mirrored to both sides: 20 negative values
# followed by 20 positive ones, in increasing order. The cubic-spaced
# tf.linspace grid that used to be assigned first was always overwritten
# by this one, so it has been removed.
pts = 1.05 ** np.arange(0, -100, -5)
xs = np.concatenate((-1*pts, np.sort(pts))) * ZOOM
ys = xs
zs = make_random_loss_grid(xs, ys, model, inputs, outputs)

fig = plt.figure(dpi=150, figsize=(5, 5))
ax = fig.add_subplot(111)
ax.set_title('The Loss Surface')
# Set Levels: NUM_LEVELS values, log-spaced between the smallest and largest
# loss found on the grid.
min_loss = zs.min()
max_loss = zs.max()
levels = tf.exp(
  tf.linspace(tf.math.log(min_loss),
              tf.math.log(max_loss),
              num=NUM_LEVELS))
# Create Contour Plot
# zs[i, j] is the loss at (xs[i], ys[j]), whereas contour expects rows to
# follow y and columns to follow x, hence the transpose.
CS = ax.contour(
  xs, ys, zs.T,
  levels=levels,
  cmap='magma',
  linewidths=0.75,
  norm=mpl.colors.LogNorm(vmin=min_loss, vmax=max_loss*2.0)
)
ax.clabel(CS, inline=True, fontsize=8, fmt="%1.2f");