In [5]:
import matplotlib.pyplot as plt
import numpy as np

%matplotlib widget
%matplotlib notebook


X_COEF=1
Y_COEF=40

def cost_func(x, y):
    Z = X_COEF*(X+2)**2 + Y_COEF*(Y+2)**2
    return Z

x = np.linspace((-5.5-2), (5.5-2), 100)
y = np.linspace(-5.5-2, 5.5-2, 100)
X, Y = np.meshgrid(x, y)
levels = [0.1,1,2,4,9, 16, 25, 36, 49, 64, 81, 100]
Z = cost_func(X, Y)
c = plt.contour(X, Y, Z, levels, colors='g')
pass



Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

# SGD and momentum together - no animation #

In [36]:
# Gradeint Descent with large step size with animation

import matplotlib.pyplot as plt
import numpy as np
import matplotlib.animation as animation

%matplotlib widget
%matplotlib notebook

X_COEF=1
Y_COEF=20
alpha = 0.02517
alpha = 0.1

x_offset = 0
y_offset = 0



def cost_func(x, y):
    Z = X_COEF*(x+x_offset)**2 + Y_COEF*(y+y_offset)**2
    return Z

def f2(x):
    return X_COEF*(x[0]+x_offset)**2 + Y_COEF*(x[1]+y_offset)**2

def grad2(x):
    return np.array([2*X_COEF*(x[0]), 2*Y_COEF*(x[1])])



def gd2(x, grad, alpha, max_iter=10):
    xs = np.zeros((1 + max_iter, x.shape[0]))
    xs[0,:] = x
    for i in range(max_iter):
        x = x - alpha * grad(x)
        xs[i+1,:] = x
    return xs


def gd2_momentum(x, grad, alpha, beta=0.9, max_iter=10):
    xs = np.zeros((1 + max_iter, x.shape[0]))
    xs[0, :] = x
    v = 0
    for i in range(max_iter):
        v = beta*v + (1-beta)*grad(x)
        vc = v/(1+beta**(i+1))
        x = x - alpha * vc
        xs[i+1, :] = x
    return xs




def plot_contour():
    x = np.linspace(-5.5-x_offset, 5.5-y_offset, 100)
    y = np.linspace(-5.5-x_offset, 5.5-y_offset, 100)
    X, Y = np.meshgrid(x, y)
    levels = [0.1,1,2,4,9, 16, 25, 36, 49, 64, 81, 100, 121, 144,169, 196, 225, 256, 289]
    Z = cost_func(X,Y)
    fig, ax = plt.subplots()
    # ax.set(xlim=(-5.5, 5.5), ylim=(-1.5, 1.5))
    ax.contour(X, Y, Z, levels, colors='black')
    return fig,ax

fig,ax = plot_contour()
x0 = np.array([5-x_offset,5-y_offset])
xs = gd2(x0, grad2, alpha, max_iter=100)
c=ax.plot(xs[:, 0], xs[:, 1], 'o-')

fig,ax = plot_contour()
x0 = np.array([5-x_offset,5-y_offset])
xs = gd2_momentum(x0, grad2, alpha,beta=0.9, max_iter=100)
c=ax.plot(xs[:, 0], xs[:, 1], 'o-')






Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

# Static 1D plots: SGD, Momentum, Adam, Adamax, with comparison plot of Adam and Adamax denominator 

In [25]:
# Gradeint Descent with large step size with animation

import matplotlib.pyplot as plt
import numpy as np
import matplotlib.animation as animation

%matplotlib widget
%matplotlib notebook

X_COEF=1
Y_COEF=20

alpha = 0.1
x_offset = 0# 2
y_offset = 0# 1.5


def cost_func_x(x):
    Z = X_COEF*(x+x_offset)**2
    return Z

def cost_func_y(y):
    Z = 0.5*Y_COEF*(y+y_offset)**2
    return Z


def f2(x):
    return X_COEF*(x[0]+x_offset)**2 + Y_COEF*(x[1]+y_offset)**2

def grad2(x):
    return np.array([X_COEF*(x[0]+x_offset), Y_COEF*(x[1]+y_offset)])


def gd2(x, grad, alpha, max_iter):
    xs = np.zeros((1 + max_iter, x.shape[0]))
    xs[0,:] = x
    for i in range(max_iter):
        x = x - alpha * grad(x)
        xs[i+1,:] = x
    return xs


def gd2_momentum(x, grad, alpha, max_iter, beta=0.9):
    xs = np.zeros((1 + max_iter, x.shape[0]))
    xs[0, :] = x
    v = 0

    for i in range(max_iter):
        v = beta*v + (1-beta)*grad(x)
        vc = v/(1+beta**(i+1))
        x = x - alpha * vc
        xs[i+1, :] = x
    return xs


def gd2_adam(x, grad, alpha, beta_1=0.9, beta_2=0.999, max_iter=10):
    xs = np.zeros((1 + max_iter, x.shape[0]))
    xs[0, :] = x
    v = 0
    m = 0
    v_list=[]
    epsilon=10^(-7)
    for i in range(max_iter):
        m = beta_1 * m + (1-beta_1) * grad(x)
        v = beta_2* v + (1-beta_2) * pow(grad(x), 2)
        m_hat = m/(1-pow(beta_1,(i+1)))
        v_hat = v/(1-pow(beta_2,(i+1)))


        delta = alpha *m_hat/(np.sqrt(v_hat)+epsilon)
        x = x - alpha * delta
        v_list.append((np.sqrt(v_hat)[1]+epsilon))

    plt.plot([j for j in range(len(v_list))], v_list, label=r'$Adam:\;\sqrt{\hat{v}_t}+\epsilon$', c="red", lw=2)
    plt.legend()
    plt.show()
    return xs

def gd2_adamax(x, grad, alpha, beta_1=0.9, beta_2=0.999, max_iter=10):
    xs = np.zeros((1 + max_iter, x.shape[0]))
    xs[0, :] = x
    u = [0,0]
    m = 0
 
    epsilon=10^(-7)
    u_list=[]
    for i in range(max_iter):
        m = beta_1 * m + (1-beta_1) * grad(x)
        u[0] = max(beta_2 * u[0], abs(grad(x)[0]))
   
        u[1] = max(beta_2 * u[1], abs(grad(x)[1]))

        delta = alpha *m/(1-pow(beta_1, (i+1) )) / u
        x = x - alpha * delta
        u_list.append(u[1])

        xs[i+1, :] = x
#     ax = plt.gca()
    plt.plot([j for j in range(len(u_list))], u_list, label=r'$Adamax:\; max(\beta_2 \cdot  u_{t-1}, \left | g_t \right |)$', c="blue", lw=2)
    plt.legend()
    plt.show()
    plt.xticks([])
    plt.yticks([])
    plt.xlabel("t")
    plt.ylabel(r"$Denominator\;of\;\Delta w_t$")

  

    return xs

def plot_loss_x_contour():
    x = np.linspace(-5.5-x_offset, 5.5-y_offset, 100)
    fig, ax = plt.subplots()
    z = cost_func_x(x)
    ax.plot(x, z)
#     plt,show()
    return fig,ax

def plot_loss_y_contour():
    y = np.linspace(-5.5-x_offset, 5.5-y_offset, 100)
    fig, ax = plt.subplots()
    z = cost_func_y(y)
    ax.plot(y, z)
#     plt,show()
    return fig,ax

max_iter = 100
fig,ax = plot_loss_x_contour()
x0 = np.array([-(5+x_offset),-(2+y_offset)])
xs = gd2(x0, grad2, alpha, max_iter)
c=ax.plot(xs[:, 0], cost_func_x(xs[:, 0]),'.r-')


fig,ax = plot_loss_x_contour()
x0 = np.array([-(5+x_offset),-(2+y_offset)])
xs = gd2_momentum(x0, grad2, alpha, max_iter=100,beta=0.9)

c=ax.plot(xs[:, 0], cost_func_x(xs[:, 0]), '.r-')


fig,ax = plot_loss_y_contour()
x0 = np.array([-(5+x_offset),-(2+y_offset)])
xs = gd2(x0, grad2, alpha, max_iter)
c=ax.plot(xs[:, 1], cost_func_y(xs[:, 1]), '.r-')

fig,ax = plot_loss_y_contour()
x0 = np.array([-(5+x_offset),-(2+y_offset)])
xs = gd2_momentum(x0, grad2, alpha, max_iter=100,beta=0.9)
c=ax.plot(xs[:, 1], cost_func_y(xs[:, 1]), '.r-')


fig,ax =plot_loss_y_contour()
x0 = np.array([5, 5])
x0 = np.array([(5+x_offset),(5+y_offset)])

xs = gd2_adam(x0, grad2, alpha,beta_1=0.9, beta_2=0.999, max_iter=500)
c=ax.plot(xs[:, 1], cost_func_y(xs[:, 1]), '.r-')

fig,ax =plot_loss_y_contour()
x0 = np.array([5, 5])
x0 = np.array([(5+x_offset),(5+y_offset)])

xs = gd2_adamax(x0, grad2, alpha,beta_1=0.9, beta_2=0.999, max_iter=500)
c=ax.plot(xs[:, 1], cost_func_y(xs[:, 1]), '.r-')




Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

# ADAM from https://towardsdatascience.com/how-to-implement-an-adam-optimizer-from-scratch-76e7b217f1cc

In [47]:
import numpy as np

import matplotlib.pyplot as plt
import numpy as np
import matplotlib.animation as animation


import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import animation, rc
%matplotlib widget
%matplotlib notebook




class AdamOptim():
    def __init__(self, eta=0.01, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.m_dw, self.v_dw = 0, 0
        self.m_db, self.v_db = 0, 0
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.eta = eta
    def update(self, t, w, b, dw, db):
        ## dw, db are from current minibatch
        ## momentum beta 1
        # *** weights *** #
        self.m_dw = self.beta1*self.m_dw + (1-self.beta1)*dw
        # *** biases *** #
        self.m_db = self.beta1*self.m_db + (1-self.beta1)*db

        ## rms beta 2
        # *** weights *** #
        self.v_dw = self.beta2*self.v_dw + (1-self.beta2)*(dw**2)
        # *** biases *** #
        self.v_db = self.beta2*self.v_db + (1-self.beta2)*(db)

        ## bias correction
        m_dw_corr = self.m_dw/(1-self.beta1**t)
        m_db_corr = self.m_db/(1-self.beta1**t)
        v_dw_corr = self.v_dw/(1-self.beta2**t)
        v_db_corr = self.v_db/(1-self.beta2**t)

        ## update weights and biases
        w = w - self.eta*(m_dw_corr/(np.sqrt(v_dw_corr)+self.epsilon))
        b = b - self.eta*(m_db_corr/(np.sqrt(v_db_corr)+self.epsilon))
        return w, b
    
def loss_function(m):
    return m**2-2*m+1

def cost_func_y(m):
    return 0.5*20* m**2
## take derivative
def grad_function(m):
    return 20*m
def check_convergence(w0, w1):
    return (w0 == w1)

def plot_loss_y_contour():
    y = np.linspace(-5.5-x_offset, 5.5-y_offset, 100)
    fig, ax = plt.subplots()
    z = cost_func_y(y)
    ax.plot(y, z)
#     plt,show()
    return fig,ax

w_0 = 5
b_0 = 5
adam = AdamOptim()
t = 1 
converged = False

# Keras declarations:
wt = []
xt =[]
yt=[]

w_k = tf.Variable(5.0)
b_k = tf.Variable(5.0)

xt_k=[]
X_COEF=1*0.5
Y_COEF=20*0.5
loss = lambda: 0.5* Y_COEF*(w_k)**2         # d(loss)/d(var1) = var1
opt = tf.keras.optimizers.Adam(learning_rate=alpha, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False,name='Adam')


for j in range(5):
    dw = grad_function(w_0)
    db = grad_function(b_0)
    w_0_old = w_0
    w_0, b_0 = adam.update(t,w=w_0, b=b_0, dw=dw, db=db)
    xt.append(w_0)
    yt.append(cost_func_y(w_0))
   

    step_count = opt.minimize(loss, var_list=[ w_k]).numpy()
    xt_k.append(w_k.numpy())
    print(w_0, w_k.numpy())


# print(wt)
fig, ax = plot_loss_y_contour()
ax.plot( xt, yt,'.r-')

lossf = lambda yy:  Y_COEF*(yy)**2         # d(loss)/d(var1) = var1

fig, ax = plot_loss_y_contour()
loss_y = [lossf(xt) for xt in xt_k]

ax.plot( xt_k, loss_y,'.r-')

4.990000000001 4.9
4.9765623191371215 4.8000584
4.960910889659414 4.7002134
4.943708959923487 4.600508
4.925387855436054 4.5009837


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

[<matplotlib.lines.Line2D at 0x7f22ec4fc910>]



Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

# local minima - sgd and nomenutm - static graphs # 

In [1]:
# Gradeint Descent with large step size with animation

import matplotlib.pyplot as plt
import numpy as np
import matplotlib.animation as animation

%matplotlib widget
%matplotlib notebook

X_COEF=1
Y_COEF=40
alpha = 0.02517
alpha = 0.0067
x_offset = 0
y_offset = 0
max_iter= 150

def cost_func(x, y):
    Z = (x +x_offset)**4 - 10 * (x +x_offset)** 2 - 3 * (x +x_offset)+ Y_COEF*(y+y_offset)**2
#     Z = X_COEF*(X+x_offset)**2 + Y_COEF*(Y+y_offset)**2
    return Z

def f2(x):
    return X_COEF*(x[0]+x_offset)**2 + Y_COEF*(x[1]+y_offset)**2

def grad2(x):
    return np.array([4 * x[0]**3 - 20 * x[0] - 3, 2*Y_COEF*(x[1]+y_offset)])
    #return np.array([4 * x**3 - 10 * x - 3, 2*Y_COEF*(x[1]+y_offset)])





def gd2(x, grad, alpha, max_iter):
    xs = np.zeros((1 + max_iter, x.shape[0]))
    xs[0,:] = x
    for i in range(max_iter):
        x = x - alpha * grad(x)
        xs[i+1,:] = x
    return xs


def gd2_momentum(x, grad, alpha, max_iter, beta=0.9):
    xs = np.zeros((1 + max_iter, x.shape[0]))
    xs[0, :] = x
    v = 0
    for i in range(max_iter):
        v = beta*v + (1-beta)*grad(x)
        vc = v/(1+beta**(i+1))
        x = x - alpha * vc
        xs[i+1, :] = x
    return xs


def plot_contour():
    x = np.linspace(-5.5-x_offset, 5.5-y_offset, 100)
    y = np.linspace(-5.5-x_offset, 5.5-y_offset, 100)
    X, Y = np.meshgrid(x, y)
    levels = [0.1,1,2,4,9, 16, 25, 36, 49, 64, 81, 100, 121, 144,169, 196, 225, 256, 289]
    Z = cost_func(X,Y)
    fig, ax = plt.subplots()
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    # ax.set(xlim=(-5.5, 5.5), ylim=(-1.5, 1.5))
    ax.contour(X, Y, Z, levels, colors='black')
    return fig,ax

fig,ax = plot_contour()
x0 = np.array([-(5+x_offset),-(2+y_offset)])
xs = gd2(x0, grad2, alpha, max_iter)
c=ax.plot(xs[:, 0], xs[:, 1], 'o-')
fig,ax = plot_contour()
x0 = np.array([-(5+x_offset),-(2+y_offset)])
xs = gd2_momentum(x0, grad2, alpha, max_iter=100,beta=0.9)
c=ax.plot(xs[:, 0], xs[:, 1], 'o-')






Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

# Local Minima - static 1D presentation

In [29]:
# Local Minima - static 1D presentation

import matplotlib.pyplot as plt
import numpy as np
import matplotlib.animation as animation

%matplotlib widget
%matplotlib notebook

X_COEF=1
Y_COEF=40
alpha = 0.02517
alpha = 0.0067
x_offset = 0
y_offset = 0
max_iter= 150

def cost_func(x, y):
    Z = (x +x_offset)**4 - 10 * (x +x_offset)** 2 - 3 * (x +x_offset)+ Y_COEF*(y+y_offset)**2
#     Z = X_COEF*(X+x_offset)**2 + Y_COEF*(Y+y_offset)**2
    return Z

def cost_func_x(x):
    Z = (x +x_offset)**4 - 10 * (x +x_offset)** 2 - 3 * (x +x_offset)
#     Z = X_COEF*(X+x_offset)**2 + Y_COEF*(Y+y_offset)**2
    return Z

def f2(x):
    return X_COEF*(x[0]+x_offset)**2 + Y_COEF*(x[1]+y_offset)**2

def grad2(x):
    return np.array([4 * x[0]**3 - 20 * x[0] - 3, 2*Y_COEF*(x[1]+y_offset)])
    #return np.array([4 * x**3 - 10 * x - 3, 2*Y_COEF*(x[1]+y_offset)])





def gd2(x, grad, alpha, max_iter):
    xs = np.zeros((1 + max_iter, x.shape[0]))
    xs[0,:] = x
    for i in range(max_iter):
        x = x - alpha * grad(x)
        xs[i+1,:] = x
    return xs


def gd2_momentum(x, grad, alpha, max_iter, beta=0.9):
    xs = np.zeros((1 + max_iter, x.shape[0]))
    xs[0, :] = x
    v = 0
    for i in range(max_iter):
        v = beta*v + (1-beta)*grad(x)
        vc = v/(1+beta**(i+1))
        x = x - alpha * vc
        xs[i+1, :] = x
    return xs


def plot_contour():
    x = np.linspace(-5.5-x_offset, 5.5-y_offset, 100)
    y = np.linspace(-5.5-x_offset, 5.5-y_offset, 100)
    X, Y = np.meshgrid(x, y)
    levels = [0.1,1,2,4,9, 16, 25, 36, 49, 64, 81, 100, 121, 144,169, 196, 225, 256, 289]
#     Z = cost_func(X,Y)
    fig, ax = plt.subplots()
#     ax.set_yticklabels([])
#     ax.set_xticklabels([])
    # ax.set(xlim=(-5.5, 5.5), ylim=(-1.5, 1.5))
#     ax.contour(X, Y, Z, levels, colors='black')
    z = cost_func_x(x)
    ax.plot(x, z)
#     plt,show()
    return fig,ax


fig,ax = plot_contour()
x0 = np.array([-(5+x_offset),-(2+y_offset)])
xs = gd2(x0, grad2, alpha, max_iter)
c=ax.plot(xs[:, 0], cost_func_x(xs[:, 0]), '.r-')

fig,ax = plot_contour()
x0 = np.array([-(5+x_offset),-(2+y_offset)])
xs = gd2_momentum(x0, grad2, alpha, max_iter=100,beta=0.9)
c=ax.plot(xs[:, 0], cost_func_x(xs[:, 0]), '.r-')

# fig,ax = plot_contour()
# x0 = np.array([-(5+x_offset),-(2+y_offset)])
# xs = gd2_momentum(x0, grad2, alpha, max_iter=100,beta=0.9)
# c=ax.plot(xs[:, 0], xs[:, 1], 'o-')






Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

# scratch not completed - Sadle point 

In [34]:
## scratch not completed - Sadle point 
# https://stackoverflow.com/questions/63587633/why-does-keras-sgd-optimizer-minimize-not-reach-global-minimum-in-this-examp

import matplotlib.pyplot as plt
import numpy as np
import matplotlib.animation as animation

%matplotlib widget
%matplotlib notebook

X_COEF=1
Y_COEF=40
alpha = 0.02517
alpha = 0.0067
x_offset = 0
y_offset = 0
max_iter= 150
def cost_func_x(x):
    import math
    for xx in x:
        return 4.0*math.cos(xx-1)+np.divide(math.cos(2.0*math.pi*xx),xx)

def cost_func(x, y):
    Z = (x +x_offset)**4 - 10 * (x +x_offset)** 2 - 3 * (x +x_offset)+ Y_COEF*(y+y_offset)**2
#     Z = X_COEF*(X+x_offset)**2 + Y_COEF*(Y+y_offset)**2
    return Z

def cost_func_x(x):
    Z = (x +x_offset)**4 - 10 * (x +x_offset)** 2 - 3 * (x +x_offset)
#     Z = X_COEF*(X+x_offset)**2 + Y_COEF*(Y+y_offset)**2
    return Z

def f2(x):
    return X_COEF*(x[0]+x_offset)**2 + Y_COEF*(x[1]+y_offset)**2

def grad2(x):
    return np.array([4 * x[0]**3 - 20 * x[0] - 3, 2*Y_COEF*(x[1]+y_offset)])
    #return np.array([4 * x**3 - 10 * x - 3, 2*Y_COEF*(x[1]+y_offset)])





def gd2(x, grad, alpha, max_iter):
    xs = np.zeros((1 + max_iter, x.shape[0]))
    xs[0,:] = x
    for i in range(max_iter):
        x = x - alpha * grad(x)
        xs[i+1,:] = x
    return xs


def gd2_momentum(x, grad, alpha, max_iter, beta=0.9):
    xs = np.zeros((1 + max_iter, x.shape[0]))
    xs[0, :] = x
    v = 0
    for i in range(max_iter):
        v = beta*v + (1-beta)*grad(x)
        vc = v/(1+beta**(i+1))
        x = x - alpha * vc
        xs[i+1, :] = x
    return xs


def plot_contour():
    x = np.linspace(-5.5-x_offset, 5.5-y_offset, 100)
    y = np.linspace(-5.5-x_offset, 5.5-y_offset, 100)
    X, Y = np.meshgrid(x, y)
    levels = [0.1,1,2,4,9, 16, 25, 36, 49, 64, 81, 100, 121, 144,169, 196, 225, 256, 289]
#     Z = cost_func(X,Y)
    fig, ax = plt.subplots()
#     ax.set_yticklabels([])
#     ax.set_xticklabels([])
    # ax.set(xlim=(-5.5, 5.5), ylim=(-1.5, 1.5))
#     ax.contour(X, Y, Z, levels, colors='black')
    z = cost_func_x(x)
    ax.plot(x, z)
#     plt,show()
    return fig,ax


fig,ax = plot_contour()
x0 = np.array([-(5+x_offset),-(2+y_offset)])
xs = gd2(x0, grad2, alpha, max_iter)
c=ax.plot(xs[:, 0], cost_func_x(xs[:, 0]), '.r-')

fig,ax = plot_contour()
x0 = np.array([-(5+x_offset),-(2+y_offset)])
xs = gd2_momentum(x0, grad2, alpha, max_iter=100,beta=0.9)
c=ax.plot(xs[:, 0], cost_func_x(xs[:, 0]), '.r-')

# fig,ax = plot_contour()
# x0 = np.array([-(5+x_offset),-(2+y_offset)])
# xs = gd2_momentum(x0, grad2, alpha, max_iter=100,beta=0.9)
# c=ax.plot(xs[:, 0], xs[:, 1], 'o-')






Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

# 2d Section of local minima

In [4]:
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.animation as animation

%matplotlib widget
%matplotlib notebook

x = np.linspace(-9,5, 100)
x_offset =2
x0 = (x + x_offset)

y = (x0**4 - 10 * x0** 2 - 3 * x0)
fig, ax = plt.subplots()
# ax.set_yticklabels([])
# ax.set_xticklabels([])
ax.plot(x,y)

        





Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

[<matplotlib.lines.Line2D at 0x7f08ad3636d0>]

# SGD and Momentum 2D Animation #

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import animation, rc

%matplotlib widget
%matplotlib notebook

rc('animation', html='html5')
plt.style.use('seaborn-whitegrid')

X_COEF=1 #20
Y_COEF=40
alpha = 0.02517
x_offset = 2
y_offset = 1.5


def cost_func(x, y):
    Z = X_COEF*(x+x_offset)**2 + Y_COEF*(y+y_offset)**2
    return Z

def grad2(x):
    return np.array([2*X_COEF*(x[0]+x_offset), 2*Y_COEF*(x[1]+y_offset)])

def gd2(x, alpha, grad=grad2):
    x = x - alpha * grad(x)
    return x

v = 0

def gd2_momentum_1(x, frame, alpha, grad=grad2, beta=0.9):
    global v
    v = beta*v + (1-beta)*grad(x)
    vc = v/(1+beta**(frame+1))
    x = x - alpha * vc
    return x



def plot_contour():
    x = np.linspace(-5.5-2, 5.5-2, 100)
    y = np.linspace(-5.5-2, 5.5-2, 100)
    X, Y = np.meshgrid(x, y)
    levels = [0.1,1,2,4,9, 16, 25, 36, 49, 64, 81, 100, 121, 144,169, 196, 225]
    Z = cost_func(X,Y)
    fig, ax = plt.subplots()
#     ax.set_yticklabels([])
#     ax.set_xticklabels([])
    # ax.set(xlim=(-5.5, 5.5), ylim=(-1.5, 1.5))
    
    ax.contour(X, Y, Z, levels, colors='blue')
    return fig,ax

def animate_sgd(frame):
    global xs
    global x
    
    global alpha
    x = gd2(x, alpha, grad2)
    xs[frame,:] = x
    p3.set_data(xs[frame,0], xs[frame,1])
    p2.set_data(xs[1:frame-1,0], xs[1:frame-1,1])
    return p2,

def animate_momentum(frame):
    global xs
    global x
    global alpha

    
    beta=0.9
    x = gd2_momentum_1(x, frame, alpha, grad2, beta)   
    xs[frame,:] = x
    p3.set_data(xs[frame,0], xs[frame,1])
    p2.set_data(xs[1:frame-1,0], xs[1:frame-1,1])
    return p2,

max_iter=100
fig, ax = plot_contour()
x = np.array([-3-x_offset,-2-y_offset])
xs = np.zeros((1 + max_iter, x.shape[0]))
xs[0,:] = x

p2, = plt.plot([], [], color='red', alpha=0.6)
p3, = plt.plot([], [], 'r.')
anim = animation.FuncAnimation(fig, animate_sgd, frames=range(1, max_iter), interval=10)
anim.save('/home/ronen/Downloads/sgd-2d-non-convergance-animation.gif', dpi=80, writer='imagemagick', fps=5)

x = np.array([-3-x_offset,-2-y_offset])
xs = np.zeros((1 + max_iter, x.shape[0]))
xs[0,:] = x
fig, ax = plot_contour()
p2, = plt.plot([], [], color='red', alpha=0.6)
p3, = plt.plot([], [], 'r.')
anim = animation.FuncAnimation(fig, animate_momentum, frames=range(1, max_iter), interval=10)
anim.save('/home/ronen/Downloads/sgd-momentum-2d-animation.gif', dpi=80, writer='imagemagick', fps=5)

anim

ModuleNotFoundError: No module named 'ipympl'

# local minima, sgd and momenut - 2d animation 

In [8]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import animation, rc

%matplotlib widget
%matplotlib notebook

rc('animation', html='html5')
plt.style.use('seaborn-whitegrid')

X_COEF=1 #20
Y_COEF=40

alpha = 0.0067
x_offset = 0
y_offset = 0
max_iter= 150

def cost_func(x, y):
    Z = (x +x_offset)**4 - 10 * (x +x_offset)** 2 - 3 * (x +x_offset)+ Y_COEF*(y+y_offset)**2
#     Z = X_COEF*(X+x_offset)**2 + Y_COEF*(Y+y_offset)**2
    return Z

def f2(x):
    return X_COEF*(x[0]+x_offset)**2 + Y_COEF*(x[1]+y_offset)**2

def grad2(x):
    return np.array([4 * x[0]**3 - 20 * x[0] - 3, 2*Y_COEF*(x[1]+y_offset)])
    #return np.array([4 * x**3 - 10 * x - 3, 2*Y_COEF*(x[1]+y_offset)])

    
def gd2(x, alpha, grad=grad2):
    x = x - alpha * grad(x)
    return x

v = 0

def gd2_momentum_1(x, frame, alpha, grad=grad2, beta=0.9):
    global v
    v = beta*v + (1-beta)*grad(x)
    vc = v/(1+beta**(frame+1))
    x = x - alpha * vc
    return x



def plot_contour():
    x = np.linspace(-5.5-x_offset, 5.5-x_offset, 100)
    y = np.linspace(-5.5-x_offset, 5.5-x_offset, 100)
    X, Y = np.meshgrid(x, y)
    levels = [0.1,1,2,4,9, 16, 25, 36, 49, 64, 81, 100, 121, 144,169, 196, 225, 256, 289]
    Z = cost_func(X,Y)
    fig, ax = plt.subplots()
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    # ax.set(xlim=(-5.5, 5.5), ylim=(-1.5, 1.5))
    
    ax.contour(X, Y, Z, levels, colors='blue')
    return fig,ax

def animate_sgd(frame):
    global xs
    global x
    
    global alpha
    if frame > 0:
        x = gd2(x, alpha, grad2)
    xs[frame,:] = x
    p3.set_data(xs[frame,0], xs[frame,1])
    if frame > 0:
        p2.set_data(xs[0:frame-1,0], xs[0:frame-1,1])
    return p2,

def animate_momentum(frame):
    global xs
    global x
    global alpha

    
    beta=0.9
    if frame > 0:
        x = gd2_momentum_1(x, frame, alpha, grad2, beta)   
    xs[frame,:] = x
    p3.set_data(xs[frame,0], xs[frame,1])
    if frame > 0:
        p2.set_data(xs[0:frame-1,0], xs[:frame-1,1])
    return p2,

max_iter=100
fig, ax = plot_contour()
x = np.array([-3-x_offset,-2-y_offset])
x = np.array([-(5+x_offset),-(5+y_offset)])

xs = np.zeros((1 + max_iter, x.shape[0]))
xs[0,:] = x

p2, = plt.plot([], [], color='red', alpha=0.6)
p3, = plt.plot([], [], 'r.')

anim = animation.FuncAnimation(fig, animate_sgd, frames=range(0, max_iter), interval=10)
anim.save('/home/ronen/Downloads/sgd-2d-non-convergance-animation.gif', dpi=80, writer='imagemagick', fps=5)

x = np.array([-3-x_offset,-2-y_offset])
x = np.array([-(5+x_offset),-(5+y_offset)])

xs = np.zeros((1 + max_iter, x.shape[0]))
xs[0,:] = x
fig, ax = plot_contour()
p2, = plt.plot([], [], color='red', alpha=0.6)
p3, = plt.plot([], [], 'r.')
anim = animation.FuncAnimation(fig, animate_momentum, frames=range(0, max_iter), interval=10)
anim.save('/home/ronen/Downloads/sgd-momentum-2d-animation.gif', dpi=80, writer='imagemagick', fps=5)

# anim



Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

NameError: name 'plot' is not defined

  return np.array([4 * x[0]**3 - 20 * x[0] - 3, 2*Y_COEF*(x[1]+y_offset)])
  return np.array([4 * x[0]**3 - 20 * x[0] - 3, 2*Y_COEF*(x[1]+y_offset)])


## 3D Animation: SGD and Momentum

In [5]:
# # #####3D:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np


from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
import matplotlib.animation as animation
from matplotlib import animation, rc


max_iter=100

X_COEF=1 #20
Y_COEF=40
alpha = 0.02517

x_offset = 2
y_offset = 1.5
max_iter = 70


def cost_func(x, y):
    Z = X_COEF*(x+x_offset)**2 + Y_COEF*(y+y_offset)**2
    return Z


def grad2(x):
    return np.array([2*X_COEF*(x[0]+x_offset), 2*Y_COEF*(x[1]+y_offset)])


def gd2(x, alpha, grad=grad2):
    x = x - alpha * grad(x)
    return x

v = 0

def gd2_momentum_1(x, frame, alpha, grad=grad2, beta=0.9):
    global v
    v = beta*v + (1-beta)*grad(x)
    vc = v/(1+beta**(frame+1))
    x = x - alpha * vc
    return x



rc('animation', html='html5')
plt.style.use('seaborn-whitegrid')



def plot_loss_func_3d():
    xx = np.linspace(-5.5-x_offset, 5.5-y_offset, 50)
    yy = np.linspace(-5.5-x_offset, 5.5-y_offset, 50)
    X, Y = np.meshgrid(xx, yy)
    Z = cost_func(X, Y)
    fig1 = plt.figure(figsize=(16, 6))
    ax1 = fig1.gca(projection='3d')

    surf = ax1.plot_surface(X, Y, Z, rstride=1, cstride=1, alpha=0.5, 
                           linewidth=0, antialiased=False)

    ax1.set_xlabel('b', fontsize=20)
    ax1.set_ylabel('w1', fontsize=20)
    ax1.set_zlabel('J(b, w)', fontsize=20)
    return fig1, ax1

# 

def rotate(angle):
    ax1.view_init(azim=angle)

    
def sgd_scatter_point(angle):
    global x
    global alpha
    x = gd2(x, alpha, grad2)   
    z=cost_func(x[0], x[1])
    ax1.scatter(x[0], x[1], z, color='red', marker='o')


def sgd_momentum_scatter_point(angle):
    global x
    global gd_algorithm
    global alpha

    beta=0.9
    x = gd2_momentum_1(x, angle, alpha, grad2, beta)   
    z=cost_func(x[0], x[1])
    ax1.scatter(x[0], x[1], z, color='red', marker='o')
    
    
plt.show()
fig1, ax1 = plot_loss_func_3d()

rot_animation = animation.FuncAnimation(fig1, rotate, frames=np.arange(0, max_iter), interval=50)
rot_animation.save('loss_function_3d_animation.gif', dpi=80, writer='imagemagick')

fig1, ax1 = plot_loss_func_3d()
ax1.view_init(48, 0)
x = np.array([-3-x_offset,-2-y_offset])
rot_animation = animation.FuncAnimation(fig1, sgd_momentum_scatter_point, frames=np.arange(0, max_iter), interval=50)
rot_animation.save('gradient_descent_momentum_3d_animation.gif', dpi=80, writer='imagemagick')


fig1, ax1 = plot_loss_func_3d()

ax1.view_init(30, 0)
x = np.array([-3-x_offset,-2-y_offset])
rot_animation = animation.FuncAnimation(fig1, sgd_scatter_point, frames=np.arange(0, max_iter), interval=50)
rot_animation.save('gradient_descent_non_converging_3d_animation.gif', dpi=80, writer='imagemagick')

rot_animation

# local minima sgd and moment - 3d animation

In [None]:
# # #####3D:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np


from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
import matplotlib.animation as animation
from matplotlib import animation, rc

%matplotlib widget
%matplotlib notebook

max_iter=100

X_COEF=1 #20
Y_COEF=40
alpha = 0.02517

x_offset = 0
y_offset = 0
max_iter = 70

alpha = 0.0067
x_offset = 0
y_offset = 0
max_iter= 150

def cost_func(x, y):
    Z = (x +x_offset)**4 - 10 * (x +x_offset)** 2 - 3 * (x +x_offset)+ Y_COEF*(y+y_offset)**2
#     Z = X_COEF*(X+x_offset)**2 + Y_COEF*(Y+y_offset)**2
    return Z

def f2(x):
    return X_COEF*(x[0]+x_offset)**2 + Y_COEF*(x[1]+y_offset)**2

def grad2(x):
    return np.array([4 * x[0]**3 - 20 * x[0] - 3, 2*Y_COEF*(x[1]+y_offset)])
    #return np.array([4 * x**3 - 10 * x - 3, 2*Y_COEF*(x[1]+y_offset)])




v = 0

def gd2_momentum_1(x, frame, alpha, grad=grad2, beta=0.9):
    global v
    v = beta*v + (1-beta)*grad(x)
    vc = v/(1+beta**(frame+1))
    x = x - alpha * vc
    return x



rc('animation', html='html5')
plt.style.use('seaborn-whitegrid')



def plot_loss_func_3d():
    xx = np.linspace(-5.5-x_offset, 5.5-y_offset, 50)
    yy = np.linspace(-5.5-x_offset, 5.5-y_offset, 50)
    X, Y = np.meshgrid(xx, yy)
    Z = cost_func(X, Y)
    fig1 = plt.figure(figsize=(16, 6))
    ax1 = fig1.gca(projection='3d')

    surf = ax1.plot_surface(X, Y, Z, rstride=1, cstride=1, alpha=0.5, 
                           linewidth=0, antialiased=False)

    ax1.set_xlabel('b', fontsize=20)
    ax1.set_ylabel('w1', fontsize=20)
    ax1.set_zlabel('J(b, w)', fontsize=20)
    ax1.xaxis.set_ticklabels([])
    ax1.yaxis.set_ticklabels([])
    ax1.zaxis.set_ticklabels([])
    plt.show()
    return fig1, ax1

# 

def rotate(angle):
    ax1.view_init(azim=angle)

    
def sgd_scatter_point(angle):
    global x
    global alpha
    x = gd2(x, alpha, grad2)   
    z=cost_func(x[0], x[1])
    ax1.scatter(x[0], x[1], z, color='red', marker='o')


def sgd_momentum_scatter_point(angle):
    global x
    global gd_algorithm
    global alpha

    beta=0.9
    x = gd2_momentum_1(x, angle, alpha, grad2, beta)   
    z=cost_func(x[0], x[1])
    ax1.scatter(x[0], x[1], z, color='red', marker='o')
    
 

fig1, ax1 = plot_loss_func_3d()
plt.show()


rot_animation = animation.FuncAnimation(fig1, rotate, frames=np.arange(0, 360, 10), interval=50)
rot_animation.save('loss_function_3d_animation.gif', dpi=80, writer='imagemagick')

fig1, ax1 = plot_loss_func_3d()
ax1.view_init(48, 0)
x = np.array([-3-x_offset,-2-y_offset])
x = np.array([-(5+x_offset),-(2+y_offset)])

rot_animation = animation.FuncAnimation(fig1, sgd_momentum_scatter_point, frames=np.arange(0, max_iter), interval=50)
rot_animation.save('gradient_descent_momentum_3d_animation.gif', dpi=80, writer='imagemagick')


fig1, ax1 = plot_loss_func_3d()

ax1.view_init(30, 0)
x = np.array([-3-x_offset,-2-y_offset])
x = np.array([-(5+x_offset),-(2+y_offset)])

rot_animation = animation.FuncAnimation(fig1, sgd_scatter_point, frames=np.arange(0, max_iter), interval=50)
rot_animation.save('gradient_descent_non_converging_3d_animation.gif', dpi=80, writer='imagemagick')

# rot_animation


saddle 3d test

# Animated Loss Surface #

In [1]:
# Animated Loss Surface #

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import animation, rc

rc('animation', html='html5')
plt.style.use('seaborn-whitegrid')


def true_model(W, b):
    def outputs(inputs):
        err = tf.random.normal(shape=inputs.shape)
#         err = 0.0
        return tf.squeeze(inputs * W + b + err)
    return outputs

def loss_grid(X, y, w_space, b_space):
  def bias_augment(X):
    """Augment matrix X with bias column."""
    aug = tf.constant([0]*X.shape[0] + [1],
                      shape=(X.shape[0]+1, 1),
                      dtype=tf.float32)
    X = tf.concat([X, tf.zeros(shape=(1, X.shape[1]))],
                  axis=0)
    X = tf.concat([X, aug], axis=1)
    return X

  W = \
    tf.expand_dims(
      tf.stack(
        tf.meshgrid(w_space, b_space)),
      axis=1)
  Xb = bias_augment(X)
  WX = tf.einsum("mnij,bm->bnij", W, Xb)
  b = WX[-1]
  WX = tf.squeeze(WX[:-1])
  y_pred = WX + b

  batch_size = X.shape[0]
  loss = \
    tf.reduce_mean(tf.square(
      tf.reshape(y, (batch_size, 1, 1)) - y_pred
    ), axis=0)

  return loss

def plot_loss_surface(w_space, b_space, loss, levels=tf.square(range(0, 8)), dpi=150):
  fig = plt.figure(dpi=150)
  ax = fig.add_subplot(111)
  ax.set_xlim(b_MIN, b_MAX)
  ax.set_ylim(W_MIN, W_MAX)
  ax.set_aspect('equal')
  ax.set_xlabel('Bias')
  ax.set_ylabel('Weight')
  ax.set_title('The Loss Surface')
  levels = tf.square(range(0, 8))
  CS = ax.contour(w_space, b_space, loss,
                  levels=levels,
                  cmap='gray')
  ax.clabel(CS, inline=True, fontsize=8, fmt="%1d")
  artists, _ = CS.legend_elements()
  return artists



# ## Define Linear Model ##

# Define model
class Model(object):
  def __init__(self, w_init=0.0, b_init=0.0):
    self.W = tf.Variable(tf.reshape(w_init, shape=(1,)), dtype=tf.float32)
    self.b = tf.Variable(tf.reshape(b_init, shape=(1,)), dtype=tf.float32)

  def __call__(self, x):
    return self.W * x + self.b

def loss_fn(target_y, predicted_y):
    return tf.reduce_mean(tf.square(target_y - predicted_y))

def train(model, inputs, outputs, learning_rate):
  with tf.GradientTape() as t:
    current_loss = loss_fn(outputs, model(inputs))
    dW, db = t.gradient(current_loss, [model.W, model.b])
    model.W.assign_sub(learning_rate * dW)
    model.b.assign_sub(learning_rate * db)


# ## Data ##
# +
TRUE_W = 3.0
TRUE_b = 2.0
NUM_EXAMPLES = 32
BATCH_SIZE = 4
LEARNING_RATE = 0.05

W_MIN = -3.0
W_MAX = 8.0
b_MIN = -3.0
b_MAX = 8.0

X = tf.random.normal(shape=[NUM_EXAMPLES, 1])
y = tf.squeeze(true_model(TRUE_W, TRUE_b)(X))
w_space = tf.linspace(W_MIN, W_MAX, 128)
b_space = tf.linspace(b_MIN, b_MAX, 128)

loss = loss_grid(X, y, w_space, b_space)

# ## Data Pipeline ##
inputs = tf.squeeze(X)
outputs = true_model(TRUE_W, TRUE_b)(inputs)
ds = (tf.data.Dataset
      .from_tensor_slices((inputs, outputs))
      .shuffle(1000)
      .batch(BATCH_SIZE)
      .repeat())
ds = iter(ds)

model = Model(w_init=-1.0, b_init=-1.0)

# Empty containers for values to save
Ws, bs, xs, ys, ls = [], [], [], [], []

fig = plt.figure(dpi=150)
ax = fig.add_subplot(111)
ax.set_xlim(b_MIN, b_MAX)
ax.set_ylim(W_MIN, W_MAX)
ax.set_aspect('equal')
ax.set_xlabel('Bias')
ax.set_ylabel('Weight')
ax.set_title('The Loss Surface')
levels = tf.square(range(0, 8))
CS = ax.contour(w_space, b_space, loss,
                levels=levels,
                cmap='bone')
ax.clabel(CS, inline=True, fontsize=8, fmt="%1d")

p1, = plt.plot([TRUE_W], [TRUE_b], 'kx')
p2, = plt.plot([], [], color='red', alpha=0.5)
p3, = plt.plot([], [], 'r.')

def init():
    return p1,

def update(epoch):
  x, y = next(ds)
  y_pred = model(x)
  current_loss = loss_fn(y, y_pred)

  Ws.append(model.W.numpy())
  bs.append(model.b.numpy())
  xs.append(x.numpy())
  ys.append(y_pred.numpy())
  ls.append(current_loss.numpy())
  p2.set_data(Ws, bs)
  p3.set_data(Ws[-1], bs[-1])
  train(model, x, y, learning_rate=0.05)


  return p2,

ani = animation.FuncAnimation(fig, update, frames=range(1, 128), interval=100, init_func=init, blit=True)
# plt.close()
ani