# XOR-5

TODO: work through [Working efficiently with jupyter lab](https://florianwilhelm.info/2018/11/working_efficiently_with_jupyter_lab/)

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib widget
#%matplotlib inline

In [None]:
import numpy as np
import matplotlib.pyplot as plt
#import dill

In [None]:
from nn_v3 import Network, Layer, IdentityLayer, AffineLayer, MapLayer
from nnbench_v2 import NNBench

___

In [None]:
# Build the XOR network layer by layer: affine -> tanh -> affine -> tanh.
# AffineLayer(2,2) then AffineLayer(2,1): presumably (inputs, outputs) -- confirm in nn_v3.
net = Network()
net.extend(AffineLayer(2,2))
# Disabled alternative hidden activations (leaky-ReLU-style and ReLU-style maps), kept for experiments:
#leak = 0
#net.extend(MapLayer(lambda x: (x*(1+leak/2)+abs(x)*(1-leak/2))/2, lambda d: [leak,1][1 if d>0 else 0]))
#net.extend(MapLayer(lambda x: max(0, np.sign(x)) * x, lambda d: max(0, np.sign(d))))
# MapLayer is given the activation and a second callable; here that is tanh and 1 - tanh^2 (the derivative of tanh).
net.extend(MapLayer(np.tanh, lambda d: 1.0 - np.tanh(d)**2))
net.extend(AffineLayer(2,1))
net.extend(MapLayer(np.tanh, lambda d: 1.0 - np.tanh(d)**2))
def sigmoid(x):
    """Return the logistic sigmoid 1 / (1 + exp(-x)) of *x*, element-wise.

    The result rises from 0 (x -> -inf) through 0.5 (x = 0) to 1 (x -> +inf),
    so its derivative is sigmoid(x) * (1 - sigmoid(x)).
    """
    # The sign of the exponent matters: 1/(1+exp(x)) would be the mirrored,
    # decreasing curve sigmoid(-x).
    return 1 / (1 + np.exp(-x))
#net.extend(MapLayer(sigmoid, lambda d: sigmoid(d)*(1-sigmoid(d))))
#net.extend(MapLayer(lambda x: max(0, np.sign(x)) * x, lambda d: max(0, np.sign(d))))

In [None]:
# Display the layers that were added to the network above.
net.layers

In [None]:
# Wrap the network in an NNBench; the rest of the notebook trains and plots through `bench`.
bench = NNBench(net)

In [None]:
# XOR truth table in bipolar (+/-1) encoding: (input pair, target).
dat = [
    (np.array([-1, -1]), np.array([-1])),
    (np.array([-1, 1]), np.array([1])),
    (np.array([1, 1]), np.array([-1])),
    (np.array([1, -1]), np.array([1])),
]
dc = 0   # offset added to every input and target value
amp = 1  # scale factor; every value is multiplied by amp/2
temp = [(inputs * amp / 2 + dc, target * amp / 2 + dc) for inputs, target in dat]

# A single batch: a 1-tuple holding (stacked inputs, stacked targets).
batch_inputs = np.array([inputs for inputs, _ in temp])
batch_targets = np.array([target for _, target in temp])
bench.training_data = ((batch_inputs, batch_targets),)
bench.training_data

In [None]:
# Use the bench's fixed-data generator as its training-data source
# (presumably this replays bench.training_data -- confirm in nnbench_v2).
bench.training_data_gen = bench.training_data_gen_fixed

In [None]:
#list(bench.training_data_gen(2))

In [None]:
# Manual toggle: the disabled branch restores a net saved to disk, the active
# branch starts from a fresh random net and checkpoints it.
if False: # it would read in an old nn_v2 net
    with open('slow_xor_1.net', 'rb') as f:
        bench.net_checkpoint = f.read()
    bench.rollback_net()
else:
    bench.randomize_net()
    bench.checkpoint_net()

In [None]:
# Train and plot; the argument is presumably the number of learning batches -- confirm in nnbench_v2.
bench.plot_learning(100)

In [None]:
# Return the net to the last checkpoint so later cells start from the same state.
bench.rollback_net()

In [None]:
# Development space for plotting (disabled; flip the condition to run it):
if False:
    bench.rollback_net()
    losses = bench.learn(200)
    fig, ax = plt.subplots()  # Create a figure and an axes.
    # Raw f-string: "\e" is not a valid escape sequence, and the backslash
    # must reach matplotlib's mathtext unchanged.
    ax.plot(losses, label=rf"$\eta={bench.net.eta}$")  # Plot some data on the axes.
    ax.set_xlabel('learnings')  # Add an x-label to the axes.
    ax.set_ylabel('loss')  # Add a y-label to the axes.
    ax.set_title("Losses")  # Add a title to the axes.
    ax.set_yscale('log')
    ax.legend()  # Add a legend.
 

In [None]:
# Display the net's current state as a single vector.
bench.net.state_vector()

In [None]:
# Presumably an interactive ("knobs") variant of plot_learning -- confirm in nnbench_v2.
bench.knobs_plot_learning(100)

In [None]:
# Undo the training done above by returning to the last checkpoint.
bench.rollback_net()

In [None]:
# A saved 9-element state vector, kept because learning from it is
# "interesting" and "stubborn"; `isv` is a short alias for the same array.
interesting_stubborn_sv = np.array([
    -4.16013824, -4.37023452, -0.83547458,
    0.92877501, 1.48893334, 1.5066594,
    1.10828375, -0.71174959, 0.16778073,
])
isv = interesting_stubborn_sv

In [None]:
# Load the saved "stubborn" state vector into the net.
bench.net.set_state_from_vector(interesting_stubborn_sv)

In [None]:
# Evaluate the net on the stacked training inputs (first element of the single training batch).
net(bench.training_data[0][0])

In [None]:
# Call bench.learn(100) (presumably 100 learning batches; return value presumably the losses),
# then show the net's outputs on the training inputs.
learnings = bench.learn(100)
net(bench.training_data[0][0])

In [None]:
# Repeat bench.learn(100) and re-check the outputs to see whether the net is moving.
learnings = bench.learn(100)
net(bench.training_data[0][0])

In [None]:
# Reset the net to the saved "stubborn" state vector again.
bench.net.set_state_from_vector(interesting_stubborn_sv)

## Plot the loss surface with `plotly`

In [None]:
# Disabled: learn over 10 learning rates spaced logarithmically from 1e-2 to 1
# and plot the result.
if False:
    rates = np.logspace(-2, 0, num=10)
    cube = bench.learn_loss_cube(10000, rates)
    bench.plot_loss_cube()

## Plot the loss surface with `matplotlib`

In [None]:
#bench.mpl_plot_loss_cube()

## Tracks
Examine the trajectory the net follows through state space during learning, along with the loss function.
Each learning iteration changes the net state. We can examine those deltas.
Questions:
1. Are there regimes of direction-of-change (DoC) in state space, or does the DoC wander chaotically?
1. What are the spectral characteristics of the DoC? Length characteristics?
1. How do the DoC characteristics relate to the loss function, and its first difference?
1. How do these trajectories vary with learning rate? Are there clues in these to adapt the learning rate?
1. How do the trajectory characteristics vary across different starting nets?
1. How do these measures vary with the objective function of the learning process, that is, what you're trying to teach the net?
1. How do the different layers with learning state evolve? Do they settle at different times? How does a change in an upstream layer, as a consequence of learning, affect downstream layers? Do downstream changes affect upstream layers?

### Analysis setup

`bench.learn_track(n)` does n batches of learning, recording the state vector of the network after each step.

In [None]:
# Start the tracking experiment from a fresh random net and checkpoint it,
# so the run below can be repeated from the same starting point.
bench.randomize_net()
bench.checkpoint_net()

In [None]:
# Restore the checkpoint, set eta (presumably the learning rate), and record a 2000-step track.
bench.rollback_net()
bench.net.eta = 0.05
lt = bench.learn_track(2000)
# Show the first and last recorded entries.
lt[0], lt[-1]

Wrangle the state-space trajectory and the losses into form.

In [None]:
# Each track record is indexed as [0] -> state vector, [1] -> loss;
# stack each column into its own array, one row per record.
trajectory = np.vstack([record[0] for record in lt])
losses = np.vstack([record[1] for record in lt])

Take first differences, which represent the changes at each step

In [None]:
# Row-to-row differences along axis 0: one fewer row than the inputs.
traj_steps = np.diff(trajectory, axis=0)
loss_steps = np.diff(losses, axis=0)

In [None]:
# Peek at the first five state-space steps.
traj_steps[:5]

Find the L2 norm of the trajectory steps $\lVert traj \rVert$:

In [None]:
# Dot each step with itself along the last axis, then take the root to get its length.
step_sq_norms = np.einsum('...i,...i', traj_steps, traj_steps)
traj_L2 = np.sqrt(step_sq_norms)

In [None]:
# Number of steps, plus the first and last five step lengths.
len(traj_L2), traj_L2[:5], traj_L2[-5:]

Find the angles between trajectory steps, from
$$\mathbf {a} \cdot \mathbf {b} = \left\|\mathbf {a} \right\|\left\|\mathbf {b} \right\|\cos \theta \\
\cos \theta = \frac{\mathbf {a} \cdot \mathbf {b}}{\left\|\mathbf {a} \right\|\left\|\mathbf {b} \right\|} \\
$$
where $\mathbf {a}$ and $\mathbf {b}$ are a state-space trajectory step and the succeeding step respectively

Find $\mathbf {a} \cdot \mathbf {b}$:

In [None]:
# Pair every step with its successor and take their dot product along the last axis.
prev_steps = traj_steps[:-1]
next_steps = traj_steps[1:]
trajn_dot_nplus1 = np.einsum('...i,...i', prev_steps, next_steps)
# A negative dot product means the step pointed more than 90 degrees away from the previous one.
trajn_dot_nplus1[:5], (trajn_dot_nplus1 < 0).any()

Find $\left\|\mathbf {a} \right\|\left\|\mathbf {b} \right\|$:

In [None]:
# Product of the lengths of each step and the step that follows it.
traj_cos_denom = traj_L2[:-1] * traj_L2[1:]

This will be the divisor. Some entries may be zero, so we must guard against division by zero.

In [None]:
# How many steps have exactly zero length?
np.count_nonzero(traj_L2 == 0)

In [None]:
# Boolean mask marking the zero-length steps.
traj_L2 == 0

Find $\cos \theta$ by dividing, excluding division by zero:

In [None]:
# cos(theta) between consecutive steps. With `where=` but no `out=`, np.divide
# leaves the masked-out entries uninitialized, so pre-fill the output with NaN:
# an angle involving a zero-length step is undefined, and NaN says so explicitly
# (matplotlib also leaves gaps for NaN instead of plotting garbage).
traj_cos = np.divide(
    trajn_dot_nplus1,
    traj_cos_denom,
    out=np.full_like(traj_cos_denom, np.nan, dtype=float),
    where=traj_cos_denom != 0.0,
)
# nanmin/nanmax skip the undefined entries; the builtin min/max give
# order-dependent results when NaN is present.
traj_cos[:5], traj_cos[-5:], np.nanmin(traj_cos), np.nanmax(traj_cos)

In [None]:
#traj_theta = np.arccos(traj_cos)
#traj_theta[:5], traj_theta[-5:]

In [None]:
# Development space for plotting:
# Step length goes on a log-scaled left axis; loss change and step-to-step
# cosine share a linear right axis. Mathtext labels are raw strings so the
# backslashes reach matplotlib unchanged ("\D" and "\e" are not valid escapes).
if True:
    fig, ax = plt.subplots()  # Create a figure and an axes.
    traj_color = 'xkcd:red'
    loss_color = 'xkcd:blue'
    cos_color = 'xkcd:green'
    ax.set_xlabel('$n$')  # Add an x-label to the axes.
    ax.set_ylabel(r'$|\Delta state|$', color=traj_color)
    ax.tick_params(axis='y', labelcolor=traj_color)
    ax.set_title(rf"$\eta={bench.net.eta}$")  # Add a title to the axes.
    ax.set_yscale('log')
    tnl, = ax.plot(traj_L2, label="traj norm", color=traj_color)  # Plot some data on the axes.
    ax2 = ax.twinx()  # instantiate a second axes that shares the same x-axis
    ax2.tick_params(axis='y', labelcolor=loss_color)
    dll, = ax2.plot(loss_steps, label=r"$\Delta loss$", color=loss_color)  # Plot some data on the axes.
    cosl, = ax2.plot(traj_cos, label=r"$\Delta state cosine$", color=cos_color)
    # One combined legend on the first axes, covering the lines from both axes.
    ax.legend([tnl, dll, cosl], [r"$\|\Delta state \|$", r"$\Delta loss$", r"$cos(\theta)\Delta$"])  # Add a legend.
    #ax2.legend()  # Add a legend.
    fig.tight_layout()  # otherwise the right y-label is slightly clipped
    plt.show()
 

## Interesting structures in the loss surface

In [None]:
# Set eta (presumably the learning rate) to a hand-picked value and plot a longer run.
bench.net.eta = 0.1226
bench.plot_learning(1000)

In [None]:
# 100 rates forming a geometric sequence (ratio exp(0.0075)), in ascending
# order and ending at 0.175.
decay = np.exp(np.arange(100) * 0.0075)
rates = (0.175 / decay)[::-1]
rates

In [None]:
# Run learn_loss_cube over the rates built above; 500 is presumably the number of learning steps per rate -- confirm in nnbench_v2.
cube = bench.learn_loss_cube(500, rates)

In [None]:
# Plot the loss cube (presumably the one computed by the preceding learn_loss_cube call).
bench.plot_loss_cube()

---

# Scratch