In [None]:
import itertools

import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

Optimisation of a scalar function
===

A naive example
---

We are interested in a method that can find a local minimum of a continuous function when all we are able to evaluate is its gradient. In that case, we can simply follow the negative gradient in small steps.

In [None]:
def f(x):
    """Objective function: the parabola x^2."""
    return x ** 2


def df(x):
    """First derivative of the objective: 2x."""
    return 2 * x

We need to set a few values to do the optimisation. For one, we need a starting point: ``wrt``. Then we need to determine how big each step is, that is the step rate. At every step, the gradient is multiplied by the step rate before it is subtracted from the current point.

In [None]:
# Settings for the first experiment.
wrt = -10        # starting point of the search
n_steps = 10     # number of updates to perform
step_rate = 0.2  # factor the gradient is multiplied with at each update

Now we just iterate.

In [None]:
def gradient_descent(wrt, f, df, step_rate, n_steps):
    """Run plain gradient descent and record every visited point.

    Parameters
    ----------
    wrt : scalar or numpy array
        Starting point. It is never modified by this function.
    f : callable
        Objective function. It is not evaluated here; it is only kept in
        the signature for callers.
    df : callable
        Gradient of the objective, called as ``df(point)``.
    step_rate : float
        Factor the gradient is multiplied with for each update.
    n_steps : int
        Number of updates to perform.

    Returns
    -------
    list
        The starting point followed by the point reached after each update,
        i.e. ``n_steps + 1`` entries.
    """
    path = [wrt]
    for _ in range(n_steps):
        # Rebind instead of using ``-=``: an in-place update would modify a
        # numpy array passed in by the caller and make every entry of
        # ``path`` refer to the same (final) array.
        wrt = wrt - step_rate * df(wrt)
        path.append(wrt)
    return path

Let us produce several runs and save the solution path during the optimisation.
We will then inspect the different routes afterwards.

In [None]:
# One run per step rate (.1, .01 and 1), all from the same starting point.
gd_path_x2_0, gd_path_x2_1, gd_path_x2_2 = (
    gradient_descent(wrt, f, df, rate, n_steps)
    for rate in (.1, .01, 1)
)

In [None]:
def opt_info1d(minx, maxx, f, paths, ax1, ax2, path_labels=None,
               f_label='$x^2$'):
    """Visualise optimisation runs on a one-dimensional function.

    On ``ax1`` the function is drawn over ``[minx, maxx]`` together with the
    points visited by each path. On ``ax2`` the cost of every path is drawn
    against its iteration index.

    Parameters
    ----------
    minx, maxx : float
        Interval on which the function curve is drawn (100 samples).
    f : callable
        Objective function; called with a numpy array for the curve and
        with the individual points of each path.
    paths : sequence of sequences
        Visited points of each run.
    ax1, ax2 : matplotlib axes
        Axes receiving the function/path plot and the cost plot.
    path_labels : sequence of str, optional
        Legend label for each path. Defaults to the path's index.
    f_label : str, optional
        Legend label of the function curve. Defaults to ``'$x^2$'``.
    """
    xs = np.linspace(minx, maxx, 100)
    ax1.plot(xs, f(xs), label=f_label)

    if path_labels is None:
        path_labels = [str(i) for i in range(len(paths))]

    colors = 'krgbmc'
    for i, (p, l) in enumerate(zip(paths, path_labels)):
        # Cycle through the palette so more than six paths can be drawn.
        fmt = '%so-' % colors[i % len(colors)]
        path_costs = [f(j) for j in p]
        ax1.plot(p, path_costs, fmt, label=l)
        ax2.plot(path_costs, fmt, label=l)

    # The axis labels do not depend on the paths; set them once.
    ax2.set_xlabel('#iteration')
    ax2.set_ylabel('cost')

    ax1.legend()

In [None]:
# Left panel: the function with the visited points; right panel: cost per iteration.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(12, 6), squeeze=False)

opt_info1d(
    -2, 10, f,
    paths=[gd_path_x2_0, gd_path_x2_1, gd_path_x2_2],
    ax1=axs[0, 0],
    ax2=axs[0, 1],
)

Finding a better learning rate
---

In practice, setting the step rate is difficult and requires manual tuning. For quadratic functions, there is an optimal step rate that is the inverse of the curvature. Let's try that.

In [None]:
def ddf(x):
    """Second derivative of x^2: the constant curvature 2."""
    return 2

In [None]:
def curvature_descent(wrt, f, df, ddf, n_steps):
    """Gradient descent whose step rate is the inverse absolute curvature.

    At every iteration the step rate is ``abs(1 / ddf(wrt))`` evaluated at
    the current point.

    Parameters
    ----------
    wrt : scalar or numpy array
        Starting point. It is never modified by this function.
    f : callable
        Objective function. It is not evaluated here; it is only kept in
        the signature for callers.
    df : callable
        First derivative of the objective.
    ddf : callable
        Second derivative (curvature) of the objective. A curvature of zero
        is not handled: a plain Python ``0`` raises ``ZeroDivisionError``.
    n_steps : int
        Number of updates to perform.

    Returns
    -------
    list
        The starting point followed by the point reached after each update,
        i.e. ``n_steps + 1`` entries.
    """
    path = [wrt]
    for _ in range(n_steps):
        step_rate = abs(1. / ddf(wrt))
        # Rebind instead of using ``-=``: an in-place update would modify a
        # numpy array passed in by the caller and make every entry of
        # ``path`` refer to the same (final) array.
        wrt = wrt - step_rate * df(wrt)
        path.append(wrt)
    return path

In [None]:
# Same starting point and number of steps as the plain gradient descent runs.
cd_path_x2 = curvature_descent(
    wrt=wrt, f=f, df=df, ddf=ddf, n_steps=n_steps,
)

In [None]:
# Compare the slow fixed-rate run (.01) with the curvature-scaled run.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(12, 6), squeeze=False)

opt_info1d(
    -2, 10, f,
    paths=[gd_path_x2_1, cd_path_x2],
    ax1=axs[0, 0],
    ax2=axs[0, 1],
)

A more difficult problem
---

In [None]:
def f(x):
    """Objective function: the quartic x^4."""
    return x ** 4


def df(x):
    """First derivative of the objective: 4x^3."""
    return 4 * x ** 3


def ddf(x):
    """Second derivative of the objective: 12x^2."""
    return 12 * x ** 2

In [None]:
# Both optimisers start from the same point.
# NOTE: the ``_sin`` suffix of these names is misleading at this point of
# the notebook -- the objective defined in the cell above is x ** 4.
wrt = 1
gd_path_sin_0 = gradient_descent(wrt, f, df, step_rate=.1, n_steps=n_steps)
cd_path_sin = curvature_descent(wrt, f, df, ddf=ddf, n_steps=n_steps)

In [None]:
# Fixed-rate run versus curvature-scaled run on the current objective.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(12, 6), squeeze=False)

opt_info1d(
    -.5, 1.5, f,
    paths=[gd_path_sin_0, cd_path_sin],
    ax1=axs[0, 0],
    ax2=axs[0, 1],
)

An even more difficult problem
---

In [None]:
def f(x):
    """Objective function: sin(x)."""
    return np.sin(x)


def df(x):
    """First derivative of the objective: cos(x)."""
    return np.cos(x)


def ddf(x):
    """Second derivative of the objective: -sin(x)."""
    return -np.sin(x)

In [None]:
# Both optimisers start from the same point on the sine curve.
wrt = 1
gd_path_sin_0 = gradient_descent(wrt, f, df, step_rate=.1, n_steps=n_steps)
cd_path_sin = curvature_descent(wrt, f, df, ddf=ddf, n_steps=n_steps)

In [None]:
# Fixed-rate run versus curvature-scaled run on the sine objective.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(12, 6), squeeze=False)

opt_info1d(
    -5, 10, f,
    paths=[gd_path_sin_0, cd_path_sin],
    ax1=axs[0, 0],
    ax2=axs[0, 1],
)