In [305]:
import numpy as np
import matplotlib.pyplot as plt
import time

In [306]:
def f1(x):
    x,y,z = x
    return (x - z)**2 + (2*y + z)**2 + (4*x - 2*y + z)**2 + x + y

def grad_f1(x):
    x,y,z = x
    dx = 2*(x - z) + 8*(4*x - 2*y + z) + 1
    dy = 4*(2*y + z) - 4*(4*x - 2*y + z) + 1
    dz = -2*(x - z) + 2*(2*y + z) + 2*(4*x - 2*y + z)
    
    return np.array([dx, dy, dz])

def hess_f1(x):
    dxx = 34 # 2 + 32
    dxy = -16
    dxz = 6 #-2 + 8
    dyy = 0
    dyz = 8
    dzz = 6
    
    return np.array([[dxx, dxy, dxz], [dxy, dyy, dyz], [dxz, dyz, dzz]])

In [307]:
f1_x1 = np.array([0,0,0]) 
f1(f1_x1), grad_f1(f1_x1), hess_f1(f1_x1)

(np.int64(0),
 array([1, 1, 0]),
 array([[ 34, -16,   6],
        [-16,   0,   8],
        [  6,   8,   6]]))

In [308]:
def f2(x):
    x,y,z = x
    return (x - 1)**2 + (y - 1)**2 + 100*(y-x**2)**2 + 100*(z-y**2)**2

def grad_f2(x):
    x,y,z = x
    dx = 2*x - 2 - 400 * x * y + 400 * x**3
    dy = 2*y - 2 + 200 * (y - x**2) - 400*z*y + 400 * y**3
    dz = 200 * (z - y**2)
        
    return np.array([dx, dy, dz])

def hess_f2(x):
    x,y,z = x
    
    dxx = 2 - 400 * y + 1200 * x**2
    dxy = -400 * x
    dxz = 0
    dyy = 2 + 200 - 400 * z + 1200 * y**2
    dyz = -400 * y
    dzz = 200
    return np.array([[dxx, dxy, dxz], [dxy, dyy, dyz], [dxz, dyz, dzz]])

In [309]:
f2_x1 = np.array([1.2, 1.2, 1.2])
f2(f2_x1), grad_f2(f2_x1), hess_f2(f2_x1)

(np.float64(11.6),
 array([115.6,  67.6, -48. ]),
 array([[1250., -480.,    0.],
        [-480., 1450., -480.],
        [   0., -480.,  200.]]))

In [328]:
def f3(x):
    x,y = x
    return (1.5 - x + x*y)**2 + (2.25 - x + x*y**2)**2 + (2.625 - x + x*y**3)**2

def grad_f3(x):
    x,y = x
    
    dx = -12.75 * 6*x + 3*y - 4*x*y - 2*x*y**2 + 4.5*y**2 + 2*x*y**4 - 4*x*y**3 + 5.25 * y**3 + 2*x*y**6
    dy = 3*x + 9*y*x - 4*x**2*y - 2*x**2 + 15.75*y**2 *x + 2*x**2 * y - 6 * x**2 * y**2 + 4*x**3*y**3 + 6*x**2*y**5
    
    return np.array([dx, dy])

def hess_f3(x):
    x,y = x
    
    dxx = 6 - 4*y - 2*y**2 + 2*y**4 - 4*y**3 + 2*y**6
    dxy = 3 - 4*x - 4*x*y + 9*y + 8*x*y**3 - 12*x*y**2 + 15.75*y**2 + 12*x*y**5
    dyy = 9*x - 4*x**2 + 31.5*y*x + 2*x**2 - 12*x**2*y + 12*x**2*y**2 + 30*x**2*y**4
    return np.array([[dxx, dxy], [dxy, dyy]])

In [311]:
f3_x1 = np.array([1,1])
f3(f3_x1), grad_f3(f3_x1), hess_f3(f3_x1)

(np.float64(14.203125),
 array([-69.75,  27.75]),
 array([[ 0.  , 27.75],
        [27.75, 68.5 ]]))

In [312]:
def learning_rate(i):
    return 0.01 * 0.99**i

In [313]:
def gd(x, grad_f, lr_fn, n_steps, t=None):
    
    if t is not None:
        i = 0
        start = time.time()
        while time.time() - start < t:
            x = x - lr_fn(i) * grad_f(x)
            i += 1
    else:
        for i in range(n_steps):
            x = x - lr_fn(i) * grad_f(x)
        
    return x 
    

In [314]:
def polyak(x: np.ndarray, grad_f, lr_fn, mu, n_steps, t=None):
    x_prev = x
    
    if t is not None:
        start = time.time()
        i = 0
        while time.time() - start < t:
            dx = grad_f(x)
            x = x - lr_fn(i) * dx  + mu * (x - x_prev)
        
            x_prev = x
            i+=1
    else:
        for i in range(n_steps):
            dx = grad_f(x)
            x = x - lr_fn(i) * dx  + mu * (x - x_prev)
            
            x_prev = x
        
    return x

In [315]:
def nesterov(x, f, grad_f, lr_fn, mu, n_steps, t=None):
    x_prev = x
    
    if t is not None:
        start = time.time()
        i = 0
        while time.time() - start < t:
            dx = grad_f(x + mu * (x - x_prev))
            x = x - lr_fn(i) * dx + mu * (x_prev - x)
        
            x_prev = x
            i+=1
    else:
        for i in range(n_steps):
            dx = grad_f(x + mu * (x - x_prev))
            x = x - lr_fn(i) * dx + mu * (x_prev - x)
            
            x_prev = x
        
    return x

In [316]:
def adagrad(x, grad_f, lr_fn, n_steps, eps=1e-8, t=None):
    x = np.array(x, dtype=float)
    n = len(x)
    
    grad_sq_sum = np.zeros(n)
    
     
    if t is not None:
        start = time.time()
        i = 0
        while time.time() - start < t:
            grad = np.array(grad_f(x))
        
            # Accumulate squared gradients
            grad_sq_sum += grad ** 2
            
            # Compute D_k^{-1/2}
            D_inv_sqrt = np.diag(1.0 / (np.sqrt(grad_sq_sum) + eps))
            
            # Adaptive step
            step = lr_fn(i) * D_inv_sqrt @ grad
            x_new = x - step
            
            x = x_new
            i+= 1
    else:
        for i in range(n_steps):
            grad = np.array(grad_f(x))
            
            # Accumulate squared gradients
            grad_sq_sum += grad ** 2
            
            # Compute D_k^{-1/2}
            D_inv_sqrt = np.diag(1.0 / (np.sqrt(grad_sq_sum) + eps))
            
            # Adaptive step
            step = lr_fn(i) * D_inv_sqrt @ grad
            x_new = x - step
            
            x = x_new

    return x


In [317]:
f1(adagrad(f1_x1, grad_f1, learning_rate, 100))

np.float64(-0.15969334544827538)

Newton and BFGS

In [318]:
import numpy as np

def is_positive_definite(H):
    try:
        np.linalg.cholesky(H)
        return True
    except np.linalg.LinAlgError:
        return False


In [319]:
def newton(x0, grad_f, hess_f, n_steps=100, t=None):
    x = np.array(x0, dtype=float)
    
    if t is not None:
        start = time.time()
        i = 0
        while time.time() - start < t:
            grad = np.array(grad_f(x))
            hess = np.array(hess_f(x))
            
            hess_inv = np.linalg.inv(hess)
            
            # Compute Newton step
            x = x - learning_rate(i) * hess_inv @ grad
            i += 1

    else:
        for i in range(n_steps):
            grad = np.array(grad_f(x))
            grad = np.clip(grad, -10, 10)

            hess = np.array(hess_f(x)) + 1e-4 * np.eye(len(x))
            
            hess_inv = np.linalg.inv(hess)
            
            
            # Compute Newton step
            x = x - learning_rate(i) * np.linalg.solve(hess, grad)

        
    return x

In [320]:
print("Newton: ", f1(newton(np.array([0,0,0]), grad_f1, hess_f1, 100)))


Newton:  0.07391146130764983


## TODO: BFGS

# Testing

In [321]:
def minimum2(f):
    """Finds approximate minimum of f(x)"""
    xs = np.linspace(-1, 1, 1000)
    ys = np.linspace(-1, 1, 1000)
    X, Y = np.meshgrid(xs, ys)  
    Z = f(np.array([X, Y]))  
    
    min_idx = np.unravel_index(np.argmin(Z), Z.shape)  
    bx, by, bz = X[min_idx], Y[min_idx], Z[min_idx]
    return np.array([bx, by, bz])

In [322]:
def minimum3(f):
    """Finds approximate minimum of f(x)"""
    xs = np.linspace(-1, 1, 100)
    ys = np.linspace(-1, 1, 100)
    zs = np.linspace(-1, 1, 100)
    
    X, Y, Z = np.meshgrid(xs, ys, zs)  
    W = f(np.array([X, Y, Z]))  
    
    min_idx = np.unravel_index(np.argmin(W), W.shape)  
    bx, by, bz, bw = X[min_idx], Y[min_idx], Z[min_idx], W[min_idx]
    return np.array([bx, by, bz, bw])

In [323]:
f1_xopt = minimum3(f1)
f2_xopt = minimum3(f2)
f3_xopt = minimum2(f3)
print("f1 minimum:", f1_xopt)
print("f2 minimum:", f2_xopt)
print("f3 minimum:", f3_xopt)

f1 minimum: [-0.17171717 -0.23232323  0.17171717 -0.19773493]
f2 minimum: [1. 1. 1. 0.]
f3 minimum: [ 1.         -0.18718719  4.36852892]


In [324]:
### f1:
f1x0 = np.array([0, 0, 0])
f1x1 = np.array([1, 1, 0])

steps = [2, 5, 10, 100]
f = f1
grad_f = grad_f1
hess_f = hess_f1

print("True minimum", f1_xopt[-1])
for step in steps:
    print("Steps, ", step)
    print("GD: ", f(gd(f1x0, grad_f, learning_rate, step)))
    print("Polyak: ", f(polyak(f1x0, grad_f, learning_rate, 0.9, step)))
    print("Nesterov: ", f(nesterov(f1x0, f, grad_f, learning_rate, 0.9, step)))
    print("Adagrad: ", f(adagrad(f1x0, grad_f, learning_rate, step)))
    print("Newton: ", f(newton(f1x0, grad_f, hess_f, step)))
    print()

True minimum -0.19773492500765227
Steps,  2
GD:  -0.035101752536
Polyak:  -0.035101752536
Nesterov:  -0.035101752536
Adagrad:  -0.03156182988530955
Newton:  0.0015077275337369443

Steps,  5
GD:  -0.07159791032768308
Polyak:  -0.07159791032768308
Nesterov:  -0.07159791032768308
Adagrad:  -0.055560190582874064
Newton:  0.00381477728519076

Steps,  10
GD:  -0.10989895086006889
Polyak:  -0.10989895086006889
Nesterov:  -0.10989895086006889
Adagrad:  -0.07927804062158532
Newton:  0.0077552790699727935

Steps,  100
GD:  -0.1945330755830976
Polyak:  -0.1945330755830976
Nesterov:  -0.1945330755830976
Adagrad:  -0.15969334544827538
Newton:  0.07391146130764983



In [327]:
def test_time(f, grad_f, hess_f, x0, step, times):
    print("True minimum", f1_xopt[-1])
    for _time in times:
        print("Time, ", _time)
        print("GD: ", f(gd(x0, grad_f, learning_rate, step, t=_time)))
        print("Polyak: ", f(polyak(x0, grad_f, learning_rate, 0.9, step, t=_time)))
        print("Nesterov: ", f(nesterov(x0, f, grad_f, learning_rate, 0.9, step, t=_time)))
        print("Adagrad: ", f(adagrad(x0, grad_f, learning_rate, 0.9, step, t=_time)))
        print("Newton: ", f(newton(x0, grad_f, hess_f, step, t=_time)))
        print()

In [None]:
test_time()

In [None]:
import time 

f1x0 = np.array([0, 0, 0])
f1x1 = np.array([1, 1, 0])
f2x0 = np.array([1.2, 1.2, 1.2])
f2x1 = np.array([-1, 1.2, 1.2])
f2x0 = np.array([1,1])
f2x1 = np.array([4.5, 4.5])

times = [.1, 1, 2]
f = f1
grad_f = grad_f1
hess_f = hess_f1
step = 1000


True minimum -0.19773492500765227
Time,  0.1
GD:  -0.1975114672774948
Polyak:  -0.1975114672774948
Nesterov:  -0.1975114672774948
Adagrad:  -0.0019650811794469543
Newton:  0.140621296199624

Time,  1
GD:  -0.1975114672774948
Polyak:  -0.1975114672774948
Nesterov:  -0.1975114672774948
Adagrad:  -0.0019650811794469543
Newton:  0.140621296199624

Time,  2
GD:  -0.1975114672774948
Polyak:  -0.1975114672774948
Nesterov:  -0.1975114672774948
Adagrad:  -0.0019650811794469543
Newton:  0.140621296199624

