In [34]:
import numpy as np
import matplotlib.pyplot as plt

In [35]:
def f1(x):
    x,y,z = x
    return (x - z)**2 + (2*y + z)**2 + (4*x - 2*y + z)**2 + x + y

def grad_f1(x):
    x,y,z = x
    dx = 2*(x - z) + 8*(4*x - 2*y + z) + 1
    dy = 4*(2*y + z) - 4*(4*x - 2*y + z) + 1
    dz = -2*(x - z) + 2*(2*y + z) + 2*(4*x - 2*y + z)
    
    return np.array([dx, dy, dz])

def hess_f1(x):
    dxx = 34 # 2 + 32
    dxy = -16
    dxz = 6 #-2 + 8
    dyy = 0
    dyz = 0
    dzz = 2
    
    return np.array([[dxx, dxy, dxz], [dxy, dyy, dyz], [dxz, dyz, dzz]])

In [36]:
f1_x1 = np.array([0,0,0]) 
f1(f1_x1), grad_f1(f1_x1), hess_f1(f1_x1)

(np.int64(0),
 array([1, 1, 0]),
 array([[ 34, -16,   6],
        [-16,   0,   0],
        [  6,   0,   2]]))

In [37]:
def f2(x):
    x,y,z = x
    return (x - 1)**2 + (y - 1)**2 + 100*(y-x**2)**2 + 100*(z-y**2)**2

def grad_f2(x):
    x,y,z = x
    dx = 2*x - 2 - 400 * x * y + 400 * x**3
    dy = 2*y - 2 + 200 * (y - x**2) - 400*z*y + 400 * y**3
    dz = 200 * (z - y**2)
        
    return np.array([dx, dy, dz])

def hess_f2(x):
    x,y,z = x
    
    dxx = 2 - 400 * y + 1200 * x**2
    dxy = -400 * x
    dxz = 0
    dyy = 2 + 200 - 400 * z + 1200 * y**2
    dyz = -400 * y
    dzz = 200
    return np.array([[dxx, dxy, dxz], [dxy, dyy, dyz], [dxz, dyz, dzz]])

In [38]:
f2_x1 = np.array([1.2, 1.2, 1.2])
f2(f2_x1), grad_f2(f2_x1), hess_f2(f2_x1)

(np.float64(11.6),
 array([115.6,  67.6, -48. ]),
 array([[1250., -480.,    0.],
        [-480., 1450., -480.],
        [   0., -480.,  200.]]))

In [39]:
def f3(x):
    x,y = x
    return (1.5 - x + x*y)**2 + (2.25 - x + x*y**2)**2 + (2.625 - x + x*y**3)**2

def grad_f3(x):
    x,y = x
    
    dx = -12.75 * 6*x + 3*y - 4*x*y - 2*x*y**2 + 4.5*y**2 + 2*x*y**4 - 4*x*y**3 + 5.25 * y**3 + 2*x*y**6
    dy = 3*x + 9*y - 4*x*y - 2*x**2 + 15.75*y**2 + 2*x**2 * y - 6 * x * y**2 + 4*x*y**3 + 6*x*y**5
    
    return np.array([dx, dy])

def hess_f3(x):
    x,y = x
    
    dxx = 6 - 4*y - 2*y**2 + 2*y**4 - 4*y**3 + 2*y**6
    dxy = 3 - 4*x - 4*x*y + 9*y + 8*x*y**3 - 12*x*y**2 + 15.75*y**2 + 12*x*y**5
    dyy = 9 - 4*x + 31.5*y + 2*x**2 - 12*x*y + 12*x*y**2 + 30*x*y**4
    return np.array([[dxx, dxy], [dxy, dyy]])

In [40]:
f3_x1 = np.array([1,1])
f3(f3_x1), grad_f3(f3_x1), hess_f3(f3_x1)

(np.float64(14.203125),
 array([-69.75,  27.75]),
 array([[ 0.  , 27.75],
        [27.75, 68.5 ]]))

In [41]:
def learning_rate():
    return 0.01

In [42]:
def gd(x, grad_f, lr_fn, n_steps):
    for i in range(n_steps):
        x = x - lr_fn() * grad_f(x)
        
    return x 
    

In [52]:
def polyak(x: np.ndarray, grad_f, lr_fn, mu, n_steps):
    x_prev = x
        
    for i in range(n_steps):
        dx = grad_f(x)
        x = x - lr_fn() * dx  + mu * (x - x_prev)
        
        x_prev = x
        
    return x

In [44]:
def nesterov(x, f, grad_f, lr_fn, mu, n_steps):
    x_prev = x
        
    for i in range(n_steps):
        dx = grad_f(x + mu * (x - x_prev))
        x = x - lr_fn() * dx + mu * (x_prev - x)
        
        x_prev = x
        
    return x

In [74]:
def adagrad(x, grad_f, lr_fn, n_steps, eps=1e-8):
    x = np.array(x, dtype=float)
    n = len(x)
    
    grad_sq_sum = np.zeros(n)
    
    for k in range(n_steps):
        grad = np.array(grad_f(x))
        
        # Accumulate squared gradients
        grad_sq_sum += grad ** 2
        
        # Compute D_k^{-1/2}
        D_inv_sqrt = np.diag(1.0 / (np.sqrt(grad_sq_sum) + eps))
        
        # Adaptive step
        step = lr_fn() * D_inv_sqrt @ grad
        x_new = x - step
        
        x = x_new

    return x


In [76]:
f1(adagrad(f1_x1, grad_f1, learning_rate, 100))

np.float64(-0.17994617813818667)

Newton and BFGS

In [56]:
def newton(x0, grad_f, hess_f, n_steps=100):
    x = np.array(x0, dtype=float)
    
    for k in range(n_steps):
        grad = np.array(grad_f(x))
        hess = np.array(hess_f(x))
        
        hess_inv = np.linalg.inv(hess)
        
        
        # Compute Newton step
        x = x - hess_inv @ grad
        
    return x

## TODO: BFGS

# Testing

In [57]:
def minimum2(f):
    """Finds approximate minimum of f(x)"""
    xs = np.linspace(-1, 1, 1000)
    ys = np.linspace(-1, 1, 1000)
    X, Y = np.meshgrid(xs, ys)  
    Z = f(np.array([X, Y]))  
    
    min_idx = np.unravel_index(np.argmin(Z), Z.shape)  
    bx, by, bz = X[min_idx], Y[min_idx], Z[min_idx]
    return np.array([bx, by, bz])

In [58]:
def minimum3(f):
    """Finds approximate minimum of f(x)"""
    xs = np.linspace(-1, 1, 100)
    ys = np.linspace(-1, 1, 100)
    zs = np.linspace(-1, 1, 100)
    
    X, Y, Z = np.meshgrid(xs, ys, zs)  
    W = f(np.array([X, Y, Z]))  
    
    min_idx = np.unravel_index(np.argmin(W), W.shape)  
    bx, by, bz, bw = X[min_idx], Y[min_idx], Z[min_idx], W[min_idx]
    return np.array([bx, by, bz, bw])

In [59]:
f1_xopt = minimum3(f1)
f2_xopt = minimum3(f2)
f3_xopt = minimum2(f3)
print("f1 minimum:", f1_xopt)
print("f2 minimum:", f2_xopt)
print("f3 minimum:", f3_xopt)

f1 minimum: [-0.17171717 -0.23232323  0.17171717 -0.19773493]
f2 minimum: [1. 1. 1. 0.]
f3 minimum: [ 1.         -0.18718719  4.36852892]


In [61]:
### f1:
f1x0 = np.array([0, 0, 0])
f1x1 = np.array([1, 1, 0])

steps = [2, 5, 10, 100]
f = f1
grad_f = grad_f1
hess_f = hess_f1

for step in steps:
    print("Steps, ", step)
    print("GD: ", f(gd(f1x0, grad_f, learning_rate, step)))
    print("Polyak: ", f(polyak(f1x0, grad_f, learning_rate, 0.9, step)))
    print("Nesterov: ", f(nesterov(f1x0, f, grad_f, learning_rate, 0.9, step)))
    print("Adagrad: ", f(adagrad(f1x0, grad_f, learning_rate, 0.9, step)))
    print("Newton: ", f(newton(f1x0, grad_f, hess_f, step)))
    print()

Steps,  2
GD:  -0.03525736
Polyak:  -0.03525736
Nesterov:  -0.03525736
Adagrad:  -0.03168485025852985
Newton:  1.119140625

Steps,  5
GD:  -0.07265853373031797
Polyak:  -0.07265853373031797
Nesterov:  -0.07265853373031797
Adagrad:  -0.05633449574487685
Newton:  42.050811767578125

Steps,  10
GD:  -0.11272088687798694
Polyak:  -0.11272088687798694
Nesterov:  -0.11272088687798694
Adagrad:  -0.08148910484708378
Newton:  300225.61898127175

Steps,  100
GD:  -0.1975283975531976
Polyak:  -0.1975283975531976
Nesterov:  -0.1975283975531976
Adagrad:  -0.17994617813818667
Newton:  6.410613048651009e+80

