In [9]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.metrics import confusion_matrix

# solution 1 Creating a one-dimensional convolutional layer class that limits the number of channels to one

Create a 1D convolutional layer class SimpleConv1d with the number of channels limited to 1. The basic structure will be the same as that of the FC class for the full convolutional layer created in the previous Sprint. The initial values of Xavier are the same as for the full convolution layer.

Padding is not considered here, and the stride is fixed to 1. You don't need to think about processing multiple data at the same time, and the batch size should be 1 only. The extension of this part will be an advanced topic.

The formula for forward propagation is as follows.
$$\alpha_i = \sum_{s=0}^{F-1}x_{(i+s)}W_s + b$$.

$a_i$ : i-th value of the output array

$F$ : Size of the filter

$x_{(i+s})$ : (i+s)th value of the input array

$w_s$ : s-th value of the array of weights

$b$ : Bias term

All are scalars.

Next is the update formula. This is the same as the full coupling layer in that it can be replaced by AdaGrad, etc.
$$w'_s = w_s -\alpha\frac{\partial L}{\partial w_s}$$$$b' = b -\alpha\frac{\partial L}{\partial b}$$

$\alpha$ : Learning rate

$\frac{\partial L}{\partial w_s}$ : Gradient of loss $L$ for w_s

$\frac{\partial L}{\partial b}$ : Gradient of loss $L$ for $b$.

Here are the backpropagation formulas for finding the gradient $\frac{\partial L}{\partial w_s}$ and $\frac{\partial L}{\partial b}$.

$\frac{\partial L}{\partial w_s} = \sum_{i=0}^{N_{out}-1}\frac{\partial L}{\partial a_i}x_{(i+s}$$$$\frac{\partial L}{\partial b} = \sum_{i=0}^{N_{out}-1}\frac{\partial L}{\partial a_i}$$
$\frac{\partial L}{\partial a_i}$: i-th value of the gradient array

$N_{out} : Size of the output

The formula for the error to be passed to the previous layer is as follows.　　 $$\frac{\partial L}{\partial x_j} = \sum_{s=0}^{F-1}\frac{\partial L}{\partial a_{j-s}}w_s$$

$\frac{\partial L}{\partial x_j}$ : j-th value of the array of errors to be passed to the previous layer.

However, when $j-s&lt;0$ or $j-s&gt;N_{out}-1$, $\frac{\partial L}{\partial a_{j-s}}=0$.

The main difference from the full join layer is that the weights are shared for multiple features. In this case, the gradient is obtained by adding all the errors of the shared ones. The branching in the computational graph can be done by adding up the errors during backpropagation.

#  Output size calculation after one-dimensional convolution

The number of features changes when convolution is performed. How it changes can be obtained from the following formula. Padding and stride are also included. Create a function to do this calculation.
$$N_{out} = \frac{N_{in} + 2P - F}{S} + 1$$.

$$N_{out}$$ : Size of the output (number of features)

$N_{in}$ : size of the input (number of features)

$P$ : Number of paddings in a direction

$F$ : Size of the filter

$S$ : Size of stride

In [10]:
# Sample data
x = np.array([1,2,3,4])
w = np.array([3, 5, 7])
b = np.array([1])
delta_a = np.array([10, 20])

In [11]:
class FC:
    """
    All coupling layers from n_nodes1 to n_nodes2
    Parameters
    ----------
    n_nodes1 : int
      Number of nodes in previous layer
    n_nodes2 : int
      Number of nodes in the next layer
    initializer : instance of initialization method
    optimizer : instance of optimisation method
    """
    def __init__(self, n_nodes1, n_nodes2, initializer, optimizer):
        # Initialization
        # Use the initializer method to initialize self.W and self.B
        self.n_nodes1 = n_nodes1
        self.n_nodes2 = n_nodes2
        self.optimizer = optimizer
        #self.W = initializer.W(self.n_nodes1, self.n_nodes2)
        #self.B = initializer.B(self.n_nodes2)
        self.W = np.array([3,5,7]) 
        self.B = np.array([1])
        self.P = 0
        self.Str = 1
        self.s = len(self.W)
        self.a = np.array([])
        self.dW = np.array([])
        self.dX = np.array([])

    def forward(self, X):
        """
        Forward
        Parameters
        ----------
        X : ndarray of the following form, shape (batch_size, n_nodes_bf)
            Input
        Returns
        ----------
        A : ndarray of the following form, shape (batch_size, n_nodes_af)
            Outputs
        """
        #self.X = X
        self.X = X
        self.Xsize = len(self.X)
        self._output_size()
        
        self.a = np.append(self.a, np.array([(self.X[i:i+self.s] @ self.W.T + self.B) for i in range(self.Nout)]))
        return self.a
    
    def _output_size(self):
        self.Nout = int((len(self.X) + 2*self.P - self.s) / self.Str + 1)       
    
    def backward(self, dA):
        """
        Backward
        Parameters
        ----------
        dA : ndarray of the following form, shape (batch_size, n_nodes2)
            The gradient flowed from behind
        Returns
        ----------
        dZ : ndarray of the next shape, shape (batch_size, n_nodes1)
            Gradient flowing forward
        """
        self.dB = np.sum(dA, axis=0)
        self.dW = np.append(self.dW, np.array([(dA @ self.X[i:i+self.Nout].T) for i in range(self.s)]))

        for j in range(len(self.X)):
            if j-self.Nout < 0:
                #display(j)
                #display(np.flip(self.W[ : self.s-(self.Nout-j) ]))
                #display(dA[:j+1])
                self.dX = np.append(self.dX, (dA[:j+1] @ np.flip(self.W[ : self.s-(self.Nout-j) ]).T))
                
            elif j > len(self.X) - self.Nout:
                #display(j)
                #display(np.flip(dA    [-(j-(self.Xsize - self.Nout)):]))
                #display(np.flip(self.W[-(j-(self.Xsize - self.Nout)):]))
                self.dX = np.append(self.dX, (np.flip(dA    [-(j-(self.Xsize - self.Nout)):])) @ 
                                              np.flip(self.W[-(j-(self.Xsize - self.Nout)):]).T)
            else:
                #display(j)
                #display(np.flip(self.W[ j-self.Nout+1 : j+1 ]))
                self.dX = np.append(self.dX, (dA @ np.flip(self.W[ j-self.Nout+1 : j+1 ]).T))

        # Updates
        #self.W = self.optimizer.update(dW, self.W)
        #self.B = self.optimizer.update(dB, self.B)

        return self.dX

#  Experiment of one-dimensional convolutional layer with small array

Check that the forward and back propagation is correct in the small array shown below.

Let the input $x$, the weight $w$ and the bias $b$ be as follows

x = np.array([1,2,3,4])
w = np.array([3, 5, 7])
b = np.array([1])
With forward propagation, the output looks like this

a = np.array([35, 50])
Now consider backpropagation. Suppose the error is as follows

delta_a = np.array([10, 20])
If we backpropagate, we get the following value

delta_b = np.array([30])
delta_w = np.array([50, 80, 110])
delta_x = np.array([30, 110, 170, 140])

Implementation considerations
To implement convolution, you can start with a series of for statements. However, we want to make the computation as efficient as possible, so we will consider a way to compute the following expression at once.
$$a_i = \sum_{s=0}^{F-1}x_{i+s}w_s + b$$.

The bias term is a simple addition, so we look at the weight part. $$\sum_{s=0}^{F-1}x_{i+s}w_s$$.

This is the inner product of an array of w with a portion of x taken out of it. Given a concrete situation, we can calculate it with the following code. In this example, in order to make the flow easier to understand, the adamant product is calculated between each element and then the sum is calculated. This results in the same as the inner product.

x = np.array([1, 2, 3, 4])
w = np.array([3, 5, 7])

a = np.empty((2, 3))

indexes0 = np.array([0, 1, 2]).astype(np.int)
indexes1 = np.array([1, 2, 3]).astype(np.int)

a[0] = x[indexes0]w # x[indexes0]は([1, 2, 3])である a[1] = x[indexes1]w # x[indexes1]is ([2, 3, 4])

a = a.sum(axis=1)
The ndarray method makes use of the fact that arrays can be indexed.

You can also use a two-dimensional array to get a two-dimensional array from a one-dimensional array.

x = np.array([1, 2, 3, 4])
indexes = np.array([[0, 1, 2], [1, 2, 3]]).astype(np.int)

print(x[indexes]) # ([[1, 2, 3], [2, 3, 4]])
With a good combination of this and a broadcast or similar, it is possible to compute them all at once.

There is no right answer to the calculation method of convolution, so please make it efficient in your own way.

Reference
The part of Integer array indexing in the following page is a description of this method.

Indexing - NumPy v1.17 Manual

In [12]:
x = np.array([1,2,3,4])
w = np.array([3, 5, 7])
b = np.array([1])
test = FC(1,2,3,4)

In [13]:
test.forward(x)

array([35., 50.])

In [14]:
delta_a = np.array([10, 20])
test.backward(delta_a)

array([ 30., 110., 170., 140.])

In [15]:
display(test.dB)
display(test.dW)

30

array([ 50.,  80., 110.])

In [19]:
class Conv1d:
    """
    All join layers from number of nodes n_nodes1 to n_nodes2
    Parameters
    ----------
    n_nodes1 : int
      Number of nodes in previous layer
    n_nodes2 : int
      Number of nodes in the next layer
    initializer : instance of initialization method
    optimizer : instance of the optimisation method
    """
    def __init__(self, n_nodes1, n_nodes2, initializer, optimizer):
        # Initialization.
        # Use the methods of initializer to initialize self.W and self.B
        self.n_nodes1 = n_nodes1
        self.n_nodes2 = n_nodes2
        self.optimizer = optimizer
        #self.W = initializer.W(self.n_nodes1, self.n_nodes2)
        #self.B = initializer.B(self.n_nodes2)
        self.W = np.array([3,5,7]) 
        self.B = np.array([1])
        self.P = 0
        self.Str = 1
        self.s = len(self.W)
        self.a = np.array([])
        self.dW = np.array([])
        self.dX = np.array([])

    def forward(self, X):
         """
        Forward
        Parameters
        ----------
        X : ndarray of the following form, shape (batch_size, n_nodes_bf)
            Input
        Returns
        ----------
        A : ndarray of the following form, shape (batch_size, n_nodes_af)
            Outputs
        """
        #self.X = X
        self.X = X
        self.Xsize = len(self.X)
        self._output_size()
        
        self.a = np.append(self.a, np.array([(self.X[i:i+self.s] @ self.W.T + self.B) for i in range(self.Nout)]))
        return self.a
    
    def _output_size(self):
        self.Nout = int((len(self.X) + 2*self.P - self.s) / self.Str + 1)       
            
    def backward(self, dA):
        """
        Backwards
        Parameters
        ----------
        dA : ndarray of the following form, shape (batch_size, n_nodes2)
            Gradient flowing from behind
        Returns
        ----------
        dZ : ndarray of the following form, shape (batch_size, n_nodes1)
            Forward flow gradient
        """
        self.dB = np.sum(dA, axis=0)
        self.dW = np.append(self.dW, np.array([(dA @ self.X[i:i+self.Nout].T) for i in range(self.s)]))

        for j in range(len(self.X)):
            if j-self.Nout < 0:
                #display(j)
                #display(np.flip(self.W[ : self.s-(self.Nout-j) ]))
                #display(dA[:j+1])
                self.dX = np.append(self.dX, (dA[:j+1] @ np.flip(self.W[ : self.s-(self.Nout-j) ]).T))
                
            elif j > len(self.X) - self.Nout:
                #display(j)
                #display(np.flip(dA    [-(j-(self.Xsize - self.Nout)):]))
                #display(np.flip(self.W[-(j-(self.Xsize - self.Nout)):]))
                self.dX = np.append(self.dX, (np.flip(dA    [-(j-(self.Xsize - self.Nout)):])) @ 
                                              np.flip(self.W[-(j-(self.Xsize - self.Nout)):]).T)
            else:
                #display(j)
                #display(np.flip(self.W[ j-self.Nout+1 : j+1 ]))
                self.dX = np.append(self.dX, (dA @ np.flip(self.W[ j-self.Nout+1 : j+1 ]).T))
                
        # Updates
        #self.W = self.optimizer.update(dW, self.W)
        #self.B = self.optimizer.update(dB, self.B)

        return self.dX

IndentationError: ignored

#  Creating a one-dimensional convolutional layer class that does not limit the number of channels

Create a class Conv1d for a 1D convolutional layer that does not limit the number of channels to 1.

For example, if you have x, w, b as follows

x = np.array([[1, 2, 3, 4], [2, 3, 4, 5]]) # shape(2, 4), with (number of input channels, number of features).
w = np.ones((3, 2, 3)) # All set to 1 for simplicity of the example. (number of output channels, number of input channels, filter size).
b = np.array([1, 2, 3]) # (number of output channels)
The output will look like this

a = np.array([[16, 22], [17, 23], [18, 24]]) # shape(3, 2), which is (number of output channels, number of features).
This is an example with 2 input channels and 3 output channels. After drawing the computational graph, let's also consider backpropagation by hand calculation. Since only sums and products appear in the computational graph, there is no need to think about differentiation anew.

Supplemental
When adding the number of channels, there is the problem of what order to put the arrays in. The most common order is (batch size, number of channels, number of features) or (batch size, number of features, number of channels), and the order varies depending on the library. The order differs depending on the library, and some can be switched.

In this example, the batch size is the number of channels. In the above example, the batch size is not considered, but (number of channels, number of features).

In [None]:
import itertools
import pprint

In [None]:
class Conv1d:
    """
    All coupling layers from n_nodes1 to n_nodes2
    Parameters
    ----------
    n_nodes1 : int
      Number of nodes in previous layer
    n_nodes2 : int
      Number of nodes in the next layer
    initializer : instance of initialization method
    optimizer : instance of optimization method
    """
    def __init__(self):
        self.P = 0
        self.Str = 1
        self.a = np.array([])
        self.dW = np.array([])
        self.dX = np.array([])
        #self.s=None
    def forward(self, X, W ,B):
        """
        Forward
        Parameters
        ----------
        X : ndarray of the following form, shape (batch_size, n_nodes_bf)
            Input
        Returns
        ----------
        A : ndarray of the following form, shape (batch_size, n_nodes_af)
            Outputs
        """
        self.X = X
        self.Xsize = self.X.shape[1]
        self.W = W 
        self.B = B
        self.s = self.W.shape[2]
        self._output_size()
        self.a = np.append(self.a, np.array([np.sum(self.X[:, i:i+self.s] * self.W[i0,:,:]) + self.B[i0] 
                                             for i0,i in itertools.product(range(self.W.shape[0]), range(self.Nout))]))
        self.a = self.a.reshape(self.W.shape[0], self.Nout)
        return self.a
    def _output_size(self):
        self.Nout = int((self.Xsize + 2*self.P - self.s) / self.Str + 1)       
    def backward(self, dA):
        """
        Backward
        Parameters
        ----------
        dA : ndarray of the following form, shape (batch_size, n_nodes2)
            Gradient flowed from behind
        Returns
        ----------
        dZ : ndarray of the next shape, shape (batch_size, n_nodes1)
            Gradient flowing forward
        """
        self.dB = np.sum(dA, axis=1)   
        for i0 in range(self.W.shape[0]):
            dX_ary = np.zeros((self.W.shape[1], self.W.shape[2]))
            for i in range(self.Nout):
                dX_ary += dA[i0,i] * self.X[:, i:i+self.s]
            self.dW = np.append(self.dW, dX_ary)
        self.dW = self.dW.reshape(3,2,3)
        """
        Under consideration
        self.Xp=np.array([self.X]*self.W.shape[0])
        self.
        self.dW= np.zeros((self.W.shape))
        self.dW = [self.dW + (dA * self.Xp[:,:,i:i+self.s]) for i in range(self.Nout)]
        """
        #self.dW = np.append(self.dW, [dX_ary + (np.ones((self.W[1].shape, self.W[2].shape))*dA[i0,i]) * self.X[:, i:i+self.Nout] 
        #                              for i in range(self.Nout)] for i0 in range(self.W.shape[0]))
        
        #for j in range(len(self.X)):
        #    if j-self.Nout < 0:
        #        self.dX = np.append(self.dX, (dA[:j+1] @ np.flip(self.W[ : self.s-(self.Nout-j) ]).T))
        #        
        #    elif j > len(self.X) - self.Nout:
        #        self.dX = np.append(self.dX, (np.flip(dA    [-(j-(self.Xsize - self.Nout)):])) @ 
        #                                      np.flip(self.W[-(j-(self.Xsize - self.Nout)):]).T)
        #    else:
        #        self.dX = np.append(self.dX, (dA @ np.flip(self.W[ j-self.Nout+1 : j+1 ]).T))

        return self.dW
    
    #def _naiseki_plus(self,dA):
    #    dX_ary = np.zeros((self.W[1].shape, self.W[2].shape))
    #    dX_ary = [dX_ary + (np.ones((self.W[1].shape, self.W[2].shape))*dA[i0,i]) * self.X[:, i:i+self.Nout] for i in range(self.Nout)]
    #    return dX_ary

In [None]:
x = np.array([[1, 2, 3, 4], [2, 3, 4, 5]]) # shape(2, 4), (number of input channels, number of features).
w = np.ones((3, 2, 3)) # all 1 for simplicity of the example. (number of output channels, number of input channels, filter size).
b = np.array([1, 2, 3]) # (number of output channels)

In [None]:
test2dim = Conv1d()
test2dim.forward(x,w,b)

In [None]:
x = np.array([[2, 3, 4, 5], [1, 2, 3, 4]]) # shape(2, 4), (number of input channels, number of features).
#w = np.ones((3, 2, 3)) # all 1 for simplicity of the example. (number of output channels, number of input channels, filter size).
w = np.array([[[1,1,1],
            [1,1,1]],
            [[1,1,1],
            [2,1,1]],
            [[2,1,1]],
            [1,1,2]])
b = np.array([3, 2, 1]) # (number of output channels)

In [None]:
test2dim = Conv1d()
test2dim.forward(x,w,b)

In [None]:
dA = np.array([[52,56],
            [32,35],
            [9,11]])
#dA = np.array([9,11])

In [None]:
dA3=np.array([dA]*3)
dA3

In [None]:
dA3=dA3.transpose(1,2,0)
dA3

In [None]:
test2dim.backward(dA)

#  Learning and estimation

Train and estimate MNIST by replacing some of the full coupling layers of the neural network you have been using with Conv1d and calculate Accuracy. Only the output layer should use the full coupling layer. However, if you have multiple channels, you cannot input to all coupled layer. At that stage, the channels should be set to 1 or they should be smoothed. Since 1D convolution of an image is not practical, accuracy is not required.

In [None]:
from keras.datasets import mnist
from sklearn.preprocessing import OneHotEncoder
(X_train, t_train), (X_test, t_test) = mnist.load_data()

In [None]:
X_train  = X_train.reshape(-1, 784)
X_test = X_test.reshape(-1, 784)
X_train = X_train.astype(np.float)
X_test = X_test.astype(np.float)
X_train /= 255
X_test /= 255
enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
t_train_one_hot = enc.fit_transform(t_train[:, np.newaxis])
t_test_one_hot = enc.fit_transform(t_test[:,  np.newaxis])

In [None]:
X_train = X_train.reshape(60000, 1, 784)
X_test = X_test.reshape(-1, 1, 784)
w = np.ones((3, 1, 3))
b = np.array([1, 2, 3])

In [None]:
class CNN_mnist_FC:
    """
    Convolutional layer
    Parameters
    ----------
    w:convolutional layer weights w.shape (output channel, input channel, filter size)
    b:Bias of the convolutional layer b.shape (output channel, )
    stride:Number of strides
    padding:Number of paddings
    optimizer : instance of the optimization method
    """
    def __init__(self, w, b , optimizer,stride, padding):
        self.optimizer = optimizer
        self.W = w
        self.B = b
        self.stride = stride
        self.padding = padding
    
    

    
    def forward(self, X):
        """
        Forward
        Parameters
        ----------
        X : ndarray of the following form, shape (batch_size, n_nodes1)
            Input
        Returns
        ----------
        A : ndarray of the following form, shape (batch_size, n_nodes2)
            Outputs
        """
        self.A = X
        output_size, chanel_size, filter_size = self.W.shape
        feature_size = self.A.shape[2]
        sample_size = self.A.shape[0]

        a = np.zeros([sample_size, output_size, feature_size-2])
        for samples in range(sample_size):
            for output in range(output_size):
                for j in range(filter_size - 1):
                    sig = 0
                    for chanel in range(chanel_size):
                        for i in range(filter_size):
                            sig += X[samples, chanel, i+j] * self.W[output, chanel, j]
                    a[samples, output, j] = sig + b[output]
        
        return a

    
    def backward(self, dA):
        """
        backward
        Parameters
        ----------
        dA : ndarray of the following form, shape (batch_size, n_nodes2)
            The gradient flowed from backward
        Return value
        ----------
        dZ : ndarray of the following form, shape (batch_size, n_nodes1)
            Gradient flowing forward
        """
        self.n_out = N_OUT(self.stride, self.padding, self.W, self.A)
        
        output_size, chanel_size, filter_size = self.W.shape
        feature_size = self.A.shape[2]
        sample_size = self.A.shape[0]
        
        #LB calculation
        self.LB = dA.sum(axis=0)
        self.LB = self.LB.sum(axis=1)
        
        #Calculation of LW
        self.LW = np.zeros_like(self.W)
        for samples in range(sample_size):
            for output in range(output_size):
                for chanel in range(chanel_size):
                    for i in range(filter_size):
                        for j in range(filter_size -1):
                            self.LW[output, chanel, i] += dA[samples, output, j]*self.A[samples, chanel, j+i]
                        
                        

                    
                    
        Calculation of #dZ
        dZ = np.zeros_like(self.A)
        for samples in range(sample_size):
            for output in range(output_size):
                for chanel in range(chanel_size):
                    for j in range(feature_size):
                        sigma=0
                        for s in range(filter_size):
                            if j - s < 0 or j - s > self.n_out -1:
                                pass
                            else:
                                sigma += dA[samples, output,  j-s] * self.W[output, chanel, s]
                        dZ[samples, chanel, j] += sigma


        
        
        # Updates
        self = self.optimizer.update(self)
        return dZ

In [None]:
class Relu:
    def forward(self, X):
        self.A = X
        return np.maximum(0, X)
    
    def backward(self, Z):
        
        return Z * np.maximum(np.sign(self.A), 0)

In [None]:
class FC:
    """
    All join layers from n_nodes1 to n_nodes2
    Parameters
    ----------
    n_nodes1 : int
      Number of nodes in previous layer
    n_nodes2 : int
      Number of nodes in the next layer
    initializer : instance of initialization method
    optimizer : instance of the optimisation method
    """
    def __init__(self, n_nodes1, n_nodes2, initializer, optimizer):
        self.optimizer = optimizer
        # Initialize.
        # Use the methods of initializer to initialize self.W and self.B
        init = initializer
        self.n_nodes1 = n_nodes1
        self.W = init.W(n_nodes1, n_nodes2)
        self.B = init.B(n_nodes2)
    

    
    def forward(self, X):
        """
        Forward
        Parameters
        ----------
        X : ndarray of the following form, shape (batch_size, n_nodes1)
            Input
        Returns
        ----------
        A : ndarray of the following form, shape (batch_size, n_nodes2)
            Outputs
        """
        self.z = X
        self.a = X@self.W + self.B
        
        return self.a

    
    def backward(self, dA):
        """
        Backward
        Parameters
        ----------
        dA : ndarray of the following form, shape (batch_size, n_nodes2)
            The gradient flowed from behind
        Returns
        ----------
        dZ : ndarray of the next shape, shape (batch_size, n_nodes1)
            Gradient flowing forward
        """
        dZ = dA @ self.W.T
        self.LW = self.z.T @ dA
        self.LB = np.sum(dA, axis=0)
        
        
        # Update
        self = self.optimizer.update(self)
        return dZ

In [None]:
class SimpleInitializer:
    """
    ガウス分布によるシンプルな初期化
    Parameters
    ----------
    sigma : float
      ガウス分布の標準偏差
    """
    def __init__(self, sigma):
        self.sigma = sigma
    def W(self, n_nodes1, n_nodes2):
        """
        重みの初期化
        Parameters
        ----------
        n_nodes1 : int
          前の層のノード数
        n_nodes2 : int
          後の層のノード数

        Returns
        ----------
        W :
        """
        W = self.sigma * np.random.randn(n_nodes1, n_nodes2)
        
        return W
    
    def B(self, n_nodes2):
        """
        バイアスの初期化
        Parameters
        ----------
        n_nodes2 : int
          後の層のノード数

        Returns
        ----------
        B :
        """
        B  = self.sigma * np.random.randn(n_nodes2)
        return B

In [None]:
class Softmax:
    def forward(self, A):
        exp_a = np.exp(A)
        softmax_result = np.empty((A.shape[0], A.shape[1]))
        exp_sum = np.sum(exp_a, axis=1)
        for i in range(A.shape[0]):
            softmax_result[i] = exp_a[i] / exp_sum[i]
            
        return softmax_result
    
    def backward(self, Z, Y):
        
        L_A = Z - Y
        self.cross_entropy = -np.average(np.sum(Y*np.log(Z), axis=1))
        
        
        return L_A

In [None]:
class SGD:
    """
    Stochastic gradient descent method
    Parameters
    ----------
    lr : Learning rate
    """
    def __init__(self, lr):
        self.lr = lr
    def update(self, layer):
        """
        Update the weights and bias of a layer
        Parameters
        ----------
        layer : the instance of the layer before the update

        Returns
        ----------
        layer : the instance of the layer after the update
        """

        
        layer.W = layer.W - self.lr * layer.LW
        
        layer.B = layer.B - self.lr*layer.LB

        
        
        
        return layer

In [None]:
def N_OUT(stride, padding, X,  W):
    if X.ndim == 1:
        return int((X.shape[0] + (2*padding) - len(W) / stride) + 1)
    elif X.ndim == 3:
        return int((X.shape[2] + (2*padding) - len(W) / stride) + 1 )

In [None]:
cnn_mnist = CNN_mnist_FC(w, b, SGD(0.1), 1, 0)
A = cnn_mnist.forward(X_train)
relu = Relu()
A_relu = relu.forward(A)
A_flat = A_relu.reshape(A_relu.shape[0], -1)
FC_1 = FC(2346, 10, SimpleInitializer(0.1), SGD(0.1))
A_FC_1 = FC_1.forward(A_flat)
softmax = Softmax()
A_soft = softmax.forward(A_FC_1)
A_delta = softmax.backward(A_soft, t_train_one_hot)
delta_Z = FC_1.backward(A_delta)
delta_Z_reshape = delta_Z.reshape(A_relu.shape)
delta_Z_relu = relu.backward(delta_Z_reshape)
dZ = cnn_mnist.backward(delta_Z_relu)

In [None]:
X_test = X_test.reshape(-1, 1, 784)
t_A = cnn_mnist.forward(X_test)
t_A = relu.forward(t_A)
t_A  = t_A.reshape(t_A.shape[0], -1)
t_A = FC_1.forward(t_A)
C = np.max(t_A, axis=1)
for i in range(t_A.shape[0]):
    t_A[i] = np.exp(t_A[i] - C[i])
t_A = softmax.forward(t_A)
y = np.argmax(t_A, axis=1)
from sklearn.metrics import accuracy_score
print(accuracy_score(t_test, y))