# Neural Nets t2

In [1]:
%matplotlib widget
#%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# import Importing_Notebooks
import numpy as np
from scipy import ndimage
import matplotlib.pyplot as plt
import dill

A network built of components which:
1. accept an ordered set of reals (we'll use `numpy.array`, and  call them vectors) at the input port and produce another at the output port - this is forward propagation. ${\displaystyle f\colon \mathbf {R} ^{n}\to \mathbf {R} ^{m}}$
1. accept an ordered set of reals at the output port, representing the gradient of the loss function at the output, and produce the gradient of the loss function at the input port - this is back propagation, aka backprop. ${\displaystyle b\colon \mathbf {R} ^{m}\to \mathbf {R} ^{n}}$
1. from the gradient of the loss function at the output, calculate the partial of the loss function w.r.t the internal parameters ${\displaystyle \frac{\partial E}{\partial w} }$
1. accept a scalar $\alpha$ to control the adjustment of internal parameters. _Or is this effected by scaling the loss gradient before passing??_
1. update internal parameters ${\displaystyle w \leftarrow w - \alpha \frac{\partial E}{\partial w} }$


In [3]:
class Layer:
    def __init__(self):
        pass
    
    def __call__(self, x):
        """Computes response to input"""
        raise NotImplementedError
        
    def backprop(self, output_delE):
        """Uses output error gradient to adjust internal parameters, and returns gradient of error at input"""
        raise NotImplementedError

A network built of a cascade of layers:

In [4]:
class Network:
    def __init__(self):
        self.layers = []
        self.alpha = 0.1 #FIXME
        
    def extend(self, net):
        self.layers.append(net)
        
    def __call__(self, input):
        v = input
        for net in self.layers:
            v = net(v)
        return v
    
    def learn(self, facts):
        for (x, expected) in facts:
            y = self(x)
            e = y - expected
            loss = e.dot(e)/2.0
            agrad = e * self.alpha
            for net in reversed(self.layers):
                agrad = net.backprop(agrad)
        return loss

## Useful Layers

### Identify

In [5]:
class IdentityLayer(Layer):
    def __call__(self, x):
        return x
    
    def backprop(self, output_delE):
        return output_delE

### Affine
A layer that does an [affine transformation](https://mathworld.wolfram.com/AffineTransformation.html) aka affinity, which is the classic fully-connected layer with output offsets.

$$ \mathbf{M} \mathbf{x} + \mathbf{b} = \mathbf{y} $$
where
$$
\mathbf{x} = \sum_{j=1}^{n} x_j \mathbf{\hat{x}}_j \\
\mathbf{b} = \sum_{i=1}^{m} b_i \mathbf{\hat{y}}_i \\
\mathbf{y} = \sum_{i=1}^{m} y_i \mathbf{\hat{y}}_i
$$
and $\mathbf{M}$ can be written
$$
\begin{bmatrix}
    m_{1,1} & \dots & m_{1,n} \\
    \vdots & \ddots & \vdots \\
    m_{m,1} & \dots & m_{m,n}
\end{bmatrix} \\
$$

#### Error gradient back-propagation
$$ 
\begin{align}
 \frac{\partial loss}{\partial\mathbf{x}}
  = \frac{\partial loss}{\partial\mathbf{y}} \frac{\partial\mathbf{y}}{\partial\mathbf{x}}
  = \mathbf{M}\frac{\partial loss}{\partial\mathbf{y}}
\end{align}
$$
_SOLVE: Left-multiply or right-multiply?_

#### Parameter adjustment
$$
 \frac{\partial loss}{\partial\mathbf{M}}
 = \frac{\partial loss}{\partial\mathbf{y}} \frac{\partial\mathbf{y}}{\partial\mathbf{M}}
 = \frac{\partial loss}{\partial\mathbf{y}} \mathbf{x} \\
 \frac{\partial loss}{\partial\mathbf{b}}
 = \frac{\partial loss}{\partial\mathbf{y}} \frac{\partial\mathbf{y}}{\partial\mathbf{b}}
 = \frac{\partial loss}{\partial\mathbf{y}}
$$

In [6]:
class AffinityLayer(Layer):
    """An affine transformation, which is the classic fully-connected layer with offsets"""
    def __init__(self, n, m):
        self.M = np.empty((m, n))
        self.b = np.empty(m)
        self.randomize()
        
    def randomize(self):
        self.M[:] = np.random.randn(*self.M.shape)
        self.b[:] = np.random.randn(*self.b.shape)
        
    def __call__(self, x):
        self.input = x
        self.output = self.M @ x + self.b
        return self.output
    
    def backprop(self, output_delE):
        input_delE = self.M @ output_delE
        self.M -= np.einsum('i,j', output_delE, self.input) # use np.outer?
        self.b -= output_delE
        return input_delE

### Map
Maps a scalar function on the inputs, for e.g. activation layers.

In [7]:
class MapLayer(Layer):
    """Map a scalar function on the input taken element-wise"""
    def __init__(self, fun, dfundx):
        self.vfun = np.vectorize(fun)
        self.vdfundx = np.vectorize(dfundx)

        
    def __call__(self, x):
        self.input = x
        return self.vfun(x)
    
    def backprop(self, output_delE):
        input_delE = self.vdfundx(self.input) * output_delE
        return input_delE

___

## Tests

### One identity layer
See if the wheels turn:

In [8]:
net = Network()
net.extend(IdentityLayer())
all(net(np.arange(3)) == np.arange(3))

True

It does not learn, as expected:

In [9]:
facts = [(np.arange(2*n, 2*n+2), np.arange(2*n+1, 2*n-1, -1)) for n in range(3)]
net.learn(facts)

1.0

In [10]:
net(np.arange(2,4))

array([2, 3])

### One map layer

In [11]:
net = Network()
net.extend(MapLayer(lambda x: x+1, lambda d: 1))
all(net(np.arange(3)) == np.arange(3)+1)

True

It does not learn, as expected:

In [12]:
net.learn(facts), all(net(np.arange(5)) == np.arange(5)+1), net(np.arange(2,4))

(2.0, True, array([3, 4]))

### One affine layer

In [13]:
net = Network()
net.extend(AffinityLayer(2,2))

In [14]:
t = net.layers[0]
t.M, t.b

(array([[-0.21533062,  0.28402263],
        [-0.28379346, -1.35249073]]),
 array([ 0.61141609, -1.15769416]))

Can it learn the identity transformation?

In [15]:
# from nnbench import NNBench
from matplotlib.widgets import Slider, Button, RadioButtons

In [64]:
class NNBench:
    def __init__(self, net, ideal=lambda x:x):
        self.net = net
        self.ideal = ideal
        self.gc_protect = []
        self.seed = 3
    
    def checkpoint_net(self):
        self.net_checkpoint = dill.dumps(self.net)
        
    def rollback_net(self):
        self.net = dill.loads(self.net_checkpoint)
        
    def training_data_gen(self, n):
        """Generate n instances of labelled training data"""
        np.random.seed(self.seed)
        for i in range(n):
            v = np.random.randn(2)
            yield (v, self.ideal(v))
            
    def learn(self, n=100):
        return [self.net.learn([fact]) for fact in self.training_data_gen(n)]
            
    def learning_potential(self, n=100, alpha=None):
        stash = dill.dumps(self.net)
        if alpha is not None: # only change the net's alpha if a value was passed to us
            self.net.alpha = alpha
        loss = self.net.learn(fact for fact in self.training_data_gen(n))
        self.net = dill.loads(stash)
        return -np.log(loss)
        
    def plot_learning(self, n):
        from matplotlib import pyplot as plt
        # self.losses = losses = [self.net.learn(fact for fact in self.training_data_gen(n))]
        losses = self.learn(n)
        plt.yscale('log')
        plt.plot(range(len(losses)),losses)
        plt.show(block=0)
        
    def knobs_plot_learning(self, n):
        # from matplotlib import pyplot as plt
        fig, ax = plt.subplots()
        plt.subplots_adjust(left=0.25, bottom=0.25)
        a0 = 5
        f0 = 3
        
        ###
        losses = [self.net.learn([fact]) for fact in self.training_data_gen(n)]
        l, = plt.plot(range(len(losses)), losses, lw=2)
        ax.margins(x=0)
        plt.yscale('log')

        axcolor = 'lightgoldenrodyellow'
        axfreq = plt.axes([0.25, 0.1, 0.65, 0.03], facecolor=axcolor)
        axamp = plt.axes([0.25, 0.15, 0.65, 0.03], facecolor=axcolor)

        sfreq = Slider(axfreq, '⍺', 0, 1, valinit=self.net.alpha)
        samp = Slider(axamp, 'Num', 1, 1000, valinit=100, valstep=1)
        
        filtfunc = [lambda x:x]
        
        
        big = max(losses)
        ax.set_title(f"maxloss:{big}")
    
        iax = plt.axes([0.025, 0.7, 0.15, 0.15])
        def make_iax_image():
            return np.concatenate([np.concatenate((l.M,np.array([l.b])),axis=0)
                                   for l in self.net.layers
                                  if hasattr(l, 'M')],axis=1)
        def update_iax(img=[iax.imshow(make_iax_image())]):
            img[0].remove()
            img[0] = iax.imshow(make_iax_image())

        
        
        def update(val,ax=ax,loc=[l]):
            n = int(samp.val)
            self.rollback_net()
            sfunc = lambda x: 2**(-1.005/(x+.005))
            self.net.alpha = sfunc(sfreq.val)
            #sfreq.set_label("2.4e"%(self.net.alpha,))
            losses = filtfunc[0]([self.net.learn([fact]) for fact in self.training_data_gen(n)])
            big = max(losses)
            ax.set_title(f"⍺={self.net.alpha},max loss:{big}")
            loc[0].remove()
            loc[0], = ax.plot(range(len(losses)), losses, lw=2,color='xkcd:blue')
            ax.set_xlim((0,len(losses)))
            ax.set_ylim((min(losses),big))
            update_iax()
            fig.canvas.draw_idle()

        sfreq.on_changed(update)
        samp.on_changed(update)

        resetax = plt.axes([0.8, 0.025, 0.1, 0.04])
        button = Button(resetax, 'Reset', color=axcolor, hovercolor='0.975')

    
        def reset(event):
            self.seed += 1
            update()
        button.on_clicked(reset)

        rax = plt.axes([0.025, 0.5, 0.15, 0.15], facecolor=axcolor)
        radio = RadioButtons(rax, ('raw', 'low pass', 'green'), active=0)

        
        def colorfunc(label):
            if label == "raw":
                filtfunc[0] = lambda x:x
            elif label == "low pass":
                filtfunc[0] = lambda x:ndimage.gaussian_filter(np.array(x),3)
            #l.set_color(label)
            #fig.canvas.draw_idle()
            update()
        radio.on_clicked(colorfunc)

        plt.show()
        #return 'gc protect:', update, reset, colorfunc,sfreq,samp, radio, button
        self.gc_protect.append((update, reset, colorfunc,sfreq,samp, radio, button))

In [65]:
bench = NNBench(net)
bench.checkpoint_net()
bench.learning_potential()

nan

In [66]:
bench.plot_learning(100)

In [67]:
bench.ideal = lambda v: np.array([v[1], v[0]])
bench.knobs_plot_learning(100)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### Learn thru a map layer

This layer doubles its input:

In [68]:
net = Network()
net.extend(AffinityLayer(2,2))

def dtanh(x):
    v = np.tanh(x)
    return (1+v)*(1-v)

net.extend(MapLayer(lambda x:x*x/2.0, lambda d:d))
#net.extend(MapLayer(np.tanh, dtanh))
bench = NNBench(net)
bench.checkpoint_net()

In [69]:
net.layers[0].M, net.layers[0].b

(array([[-0.32158469,  0.15113037],
        [-0.01862772,  0.48352879]]),
 array([0.76896516, 1.36624284]))

In [70]:
bench.ideal = lambda v: [(v[0]-v[1])**2,0]
#bench.ideal = lambda v: [(v[0]>0)*2-1,(v[0]>v[1])*2-1]
bench.learning_potential()
#bench.knobs_plot_learning(100)

  del sys.path[0]


nan

In [71]:
bench.knobs_plot_learning(100)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

  del sys.path[0]


Look into it:

['ALLOW_THREADS',
 'AxisError',
 'BUFSIZE',
 'CLIP',
 'DataSource',
 'ERR_CALL',
 'ERR_DEFAULT',
 'ERR_IGNORE',
 'ERR_LOG',
 'ERR_PRINT',
 'ERR_RAISE',
 'ERR_WARN',
 'FLOATING_POINT_SUPPORT',
 'FPE_DIVIDEBYZERO',
 'FPE_INVALID',
 'FPE_OVERFLOW',
 'FPE_UNDERFLOW',
 'False_',
 'Inf',
 'Infinity',
 'LowLevelCallable',
 'MAXDIMS',
 'MAY_SHARE_BOUNDS',
 'MAY_SHARE_EXACT',
 'MachAr',
 'NAN',
 'NINF',
 'NZERO',
 'NaN',
 'PINF',
 'PZERO',
 'RAISE',
 'SHIFT_DIVIDEBYZERO',
 'SHIFT_INVALID',
 'SHIFT_OVERFLOW',
 'SHIFT_UNDERFLOW',
 'ScalarType',
 'TooHardError',
 'True_',
 'UFUNC_BUFSIZE_DEFAULT',
 'UFUNC_PYVALS_NAME',
 'WRAP',
 '_UFUNC_API',
 '__SCIPY_SETUP__',
 '__all__',
 '__builtins__',
 '__cached__',
 '__config__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__numpy_version__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 '_add_newdoc_ufunc',
 '_dep_fft',
 '_deprecated',
 '_distributor_init',
 '_fun',
 '_key',
 '_lib',
 '_msg',
 '_sci',
 'absolute',
 'absolute_import'

In [24]:
net.layers[0].randomize()
net([3, 5])

array([3.2617429 , 6.04950289])

In [25]:
net.layers[0].M, net.layers[0].b

(array([[-1.41364637,  1.16071821],
        [ 0.66063039,  0.42396354]]),
 array([ 0.06821949, -1.07695745]))

Make the affine layer the identity transform:

In [26]:
net.layers[0].M = np.array([[1,0],[0,1]])
net.layers[0].b = np.array([0,0])
net([3,5])

array([ 6, 10])

In [27]:
bench.learning_potential()

-30.508647928518727

In [28]:
bench.knobs_plot_learning(100)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [29]:
net([7,11])

array([14, 22])

In [30]:
net.layers[0].M, net.layers[0].b

(array([[1, 0],
        [0, 1]]),
 array([0, 0]))

What is learning doing to it?

In [31]:
bench.learn(10)

[4.092601131822876e-32,
 3.947771200004956e-30,
 4.125582682120508e-31,
 3.777904178910002e-30,
 1.5037661005775538e-30,
 1.633188592840376e-30,
 1.3620176566706532e-30,
 5.516826325697237e-31,
 7.703719777548943e-33,
 7.888609052210118e-31]

In [32]:
net([7,11])

array([14, 22])

In [33]:
net.layers[0].M, net.layers[0].b

(array([[1, 0],
        [0, 1]]),
 array([0, 0]))

If we take the map layer off again, how does it do?

In [34]:
bench.rollback_net()
bench.net.layers = bench.net.layers[:1]
bench.checkpoint_net()

In [35]:
bench.ideal = lambda v: v
bench.learning_potential()
#bench.knobs_plot_learning(100)

17.53014188241033

It learns just fine, as expected. So we definitely have a problem.

### add a RELU

In [36]:
bench.net.layers = bench.net.layers[:1]
leak = 0
bench.net.extend(MapLayer(lambda x: (x*(1+leak/2)+abs(x)*(1-leak/2))/2, lambda d: [leak,1][1 if d>0 else 0]))
bench.net.layers

[<__main__.AffinityLayer at 0x7f0d293189d0>,
 <__main__.MapLayer at 0x7f0d292a86d0>]

In [37]:
bench.net.layers[0].randomize()
bench.checkpoint_net()
bench.ideal = lambda v: np.array([1,1])
bench.knobs_plot_learning(100)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [38]:
%debug

ERROR:root:No traceback has been produced, nothing to debug.


### XOR

In [39]:
net = Network()
net.extend(AffinityLayer(2,2))

In [40]:
t = net.layers[0]
t.M, t.b

(array([[ 1.18691118,  0.10949354],
        [ 1.40113726, -0.73322905]]),
 array([-0.96443749, -0.11239461]))