In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**I am a student at HUST, [Huazhong University of Science and Technology](https://en.wikipedia.org/wiki/Huazhong_University_of_Science_and_Technology). This notebook is my homework for the course "AI Introduction", taught by Associate Professor Yan Jin.**

# Source code

In [None]:
!pwd

In [None]:
cd ../input/mnist-dataset

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import struct
import os
import math

In [None]:
# 读取数据

# 参考博客：https://blog.csdn.net/u013597931/article/details/80099243
def _load_idx_pair(path, kind):
    """Read one MNIST label/image file pair stored in the IDX binary format.

    The files are expected at ``<path><kind>-labels-idx1-ubyte/<kind>-labels-idx1-ubyte``
    and ``<path><kind>-images-idx3-ubyte/<kind>-images-idx3-ubyte``; ``path`` is
    concatenated as a plain string, so it must end with a path separator.

    Args:
        path: directory prefix containing the dataset folders.
        kind: file-name prefix of the split ('train' or 't10k').
    Returns:
        (images, labels): uint8 arrays of shape (num, rows * cols) and (num,).
    Raises:
        ValueError: if a magic number is wrong or the declared counts disagree.
    """
    labels_path = os.path.join(path + '%s-labels-idx1-ubyte/' % kind, '%s-labels-idx1-ubyte' % kind)
    images_path = os.path.join(path + '%s-images-idx3-ubyte/' % kind, '%s-images-idx3-ubyte' % kind)

    with open(labels_path, 'rb') as lbpath:
        # Header: big-endian magic number and item count.
        magic, n = struct.unpack('>II', lbpath.read(8))
        if magic != 2049:
            raise ValueError('%s is not an IDX1 label file (magic=%d)' % (labels_path, magic))
        labels = np.fromfile(lbpath, dtype=np.uint8, count=n)
    if labels.size != n:
        raise ValueError('%s is truncated: expected %d labels, got %d' % (labels_path, n, labels.size))

    with open(images_path, 'rb') as imgpath:
        # Header: magic number, image count, rows and columns per image.
        magic, num, rows, cols = struct.unpack('>IIII', imgpath.read(16))
        if magic != 2051:
            raise ValueError('%s is not an IDX3 image file (magic=%d)' % (images_path, magic))
        if num != n:
            raise ValueError('image count %d does not match label count %d' % (num, n))
        # Use the dimensions declared in the header instead of a hard-coded 784.
        images = np.fromfile(imgpath, dtype=np.uint8, count=num * rows * cols).reshape(num, rows * cols)
    return images, labels


# Reference blog: https://blog.csdn.net/u013597931/article/details/80099243
def load_mnist_train(path, kind='train'):
    """Load the MNIST training split; returns (images, labels) as uint8 arrays."""
    return _load_idx_pair(path, kind)


def load_mnist_test(path, kind='t10k'):
    """Load the MNIST test split; returns (images, labels) as uint8 arrays."""
    return _load_idx_pair(path, kind)

# Load both splits from the Kaggle dataset directory.
path = '/kaggle/input/mnist-dataset/'
train_images, train_labels = load_mnist_train(path)    # training data
test_images, test_labels = load_mnist_test(path)       # test data

# Preview the first 30 training digits on a 6x5 grid, each tagged with its label.
fig = plt.figure(figsize=(8, 8))
fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
for i in range(30):
    ax = fig.add_subplot(6, 5, i + 1, xticks=[], yticks=[])
    images = train_images[i].reshape(28, 28)
    ax.imshow(images, cmap=plt.cm.binary, interpolation='nearest')
    ax.text(0, 7, str(train_labels[i]))
plt.show()

In [None]:
# Dataset sizes: image arrays first, then label arrays.
for split in (train_images, test_images):
    print(split.shape)

for split in (train_labels, test_labels):
    print(split.shape)

In [None]:
# Normalise pixel intensities to [0, 1].
# The loaders return uint8 arrays; guarding on that dtype keeps this cell
# idempotent, so re-running it does not divide the data by 255 a second time.
if train_images.dtype == np.uint8:
    train_images = train_images / 255.0
if test_images.dtype == np.uint8:
    test_images = test_images / 255.0

In [None]:
# Draw a small random subset of each split.
# replace=False samples without replacement, so no image appears twice in a subset
# (np.random.choice samples with replacement by default).
Batch_size = 10000
batch_train = np.random.choice(train_images.shape[0], Batch_size, replace=False)
batch_test = np.random.choice(test_images.shape[0], 2000, replace=False)
train_images0 = train_images[batch_train]
train_labels0 = train_labels[batch_train]
test_images0 = test_images[batch_test]
test_labels0 = test_labels[batch_test]
print(train_images0.shape)

In [None]:
 np.seterr(divide='ignore', invalid='ignore')  # silence NumPy's divide-by-zero and invalid-value floating-point warnings

In [None]:
# Fully connected (affine) layer
class FullyConnected():
    '''
    Affine layer computing ``np.dot(x, W) + b`` with a hand-written backward pass.

    Parameter:
    W: weight matrix of shape (N, M); N input units, M output units
    b: bias vector of shape (M,)
    '''

    def __init__(self, W, b):
        self.W, self.b = W, b

        # Input of the latest forward pass, kept for the backward pass.
        self.x = None

        # Gradients filled in by backward(); used by the gradient-descent update.
        self.dW = self.db = None

    def forward(self, x):
        '''
        Forward pass.

        input:
        x: input activations of shape (B, N); B is the batch size
        output:
        y: output activations of shape (B, M)
        '''
        self.x = x  # remembered so backward() can form the weight gradient
        return np.dot(x, self.W) + self.b

    def backward(self, dout):
        '''
        Backward pass for Y = XW + b.

        input:
        dout: gradient of the loss w.r.t. this layer's output, shape (B, M)
        output:
        dx: gradient w.r.t. the layer input, shape (B, N)

        Side effects: stores the weight gradient in self.dW (N, M) and the
        bias gradient in self.db (M,).
        '''
        grad_input = np.dot(dout, self.W.T)
        self.dW = np.dot(self.x.T, dout)
        # The bias is shared by every sample, so its gradient sums over the batch axis.
        self.db = np.sum(dout, axis=0)
        return grad_input

In [None]:
# 2-D convolution layer
class Conv2d():
    '''
    2-D convolution computed as a single matrix product over im2col patches.

    Parameter:
        in_channels: C_in from expected input shape (B, C_in, H_in, W_in).
        channels: C_out from output shape (B, C_out, H_out, W_out).
        kernel_size: default 3.
        stride: default 1.
        padding: default 0.
        bias: accepted for signature compatibility; no bias term is implemented.
    '''

    def __init__(self, in_channels: int, channels: int, kernel_size: int=3,
                 stride: int=1, padding: int=0, bias: bool=False):
        """
        Create the layer with standard-normal weights.

        Attributes:
        - W: numpy.array, (C_out, C_in, K_h, K_w)
        - dW: gradient of W, same shape, set by backward()
        - stride: int
        - pad: int
        """
        # Plain NumPy weights: the `tensor` wrapper used previously is not defined
        # or imported in the visible notebook, so construction raised NameError.
        self.W = np.random.randn(channels, in_channels, kernel_size, kernel_size)
        # Gradient lives on the layer (like FullyConnected.dW) because an ndarray
        # cannot carry a `.grad` attribute.
        self.dW = None
        self.stride = stride
        self.pad = padding
        self.kernel_size = kernel_size
        self.x = None       # input of the last forward pass
        self.col = None     # im2col patches of the last input
        self.col_W = None   # weights flattened to (C_in*K_h*K_w, C_out)

    def _output_hw(self, H, W):
        """Return (out_h, out_w) for an input of spatial size (H, W)."""
        FH, FW = self.W.shape[2:]
        out_h = (H + 2 * self.pad - FH) // self.stride + 1
        out_w = (W + 2 * self.pad - FW) // self.stride + 1
        return out_h, out_w

    def _im2col(self, x):
        """Unfold x (B, C, H, W) into a (B*out_h*out_w, C*K_h*K_w) patch matrix."""
        N, C, H, W = x.shape
        FH, FW = self.W.shape[2:]
        out_h, out_w = self._output_hw(H, W)
        stride, pad = self.stride, self.pad

        img = np.pad(x, [(0, 0), (0, 0), (pad, pad), (pad, pad)], 'constant')
        col = np.zeros((N, C, FH, FW, out_h, out_w))
        for i in range(FH):
            i_max = i + stride * out_h
            for j in range(FW):
                j_max = j + stride * out_w
                col[:, :, i, j, :, :] = img[:, :, i:i_max:stride, j:j_max:stride]
        return col.transpose(0, 4, 5, 1, 2, 3).reshape(N * out_h * out_w, -1)

    def _col2im(self, col, input_shape):
        """Fold a patch-gradient matrix back to input_shape, summing overlapping patches."""
        N, C, H, W = input_shape
        FH, FW = self.W.shape[2:]
        out_h, out_w = self._output_hw(H, W)
        stride, pad = self.stride, self.pad

        col = col.reshape(N, out_h, out_w, C, FH, FW).transpose(0, 3, 4, 5, 1, 2)
        # The extra (stride - 1) rows/cols keep the strided slices below in bounds.
        img = np.zeros((N, C, H + 2 * pad + stride - 1, W + 2 * pad + stride - 1))
        for i in range(FH):
            i_max = i + stride * out_h
            for j in range(FW):
                j_max = j + stride * out_w
                img[:, :, i:i_max:stride, j:j_max:stride] += col[:, :, i, j, :, :]
        return img[:, :, pad:H + pad, pad:W + pad]

    def forward(self, x):
        """
        input:
            x: input of shape (B, C_in, H_in, W_in).
        output:
            out: output of shape (B, C_out, H_out, W_out).
        """
        FN = self.W.shape[0]
        N, C, H, W = x.shape
        out_h, out_w = self._output_hw(H, W)

        col = self._im2col(x)
        col_W = self.W.reshape(FN, -1).T

        out = np.dot(col, col_W)
        out = out.reshape(N, out_h, out_w, -1).transpose(0, 3, 1, 2)

        # Cache what backward() needs.
        self.x = x
        self.col = col
        self.col_W = col_W
        return out

    def backward(self, dy):
        """
        input:
            dy: output delta of shape (B, C_out, H_out, W_out).
        output:
            dx: input delta of shape (B, C_in, H_in, W_in).

        Side effect: stores the weight gradient in self.dW.
        """
        FN, C, FH, FW = self.W.shape
        dout = dy.transpose(0, 2, 3, 1).reshape(-1, FN)
        self.dW = np.dot(self.col.T, dout).transpose(1, 0).reshape(FN, C, FH, FW)
        dcol = np.dot(dout, self.col_W.T)
        return self._col2im(dcol, self.x.shape)



class Conv2d_im2col(Conv2d):
    '''
    im2col transform (the idea is credited to Yangqing Jia): unfold the input
    of a convolution into one large 2-D matrix. The matching col2im transform,
    which folds such a matrix back into image layout, is used for the backward pass.
    '''

    def forward(self, x):
        """Unfold x of shape (B, C, H, W) into a (B*out_h*out_w, C*k*k) patch matrix."""
        k = self.kernel_size
        step, pad = self.stride, self.pad
        N, C, H, W = x.shape
        out_h = (H + 2 * pad - k) // step + 1
        out_w = (W + 2 * pad - k) // step + 1

        # Zero-pad the two spatial axes only.
        padded = np.pad(x, [(0, 0), (0, 0), (pad, pad), (pad, pad)], 'constant')
        col = np.zeros((N, C, k, k, out_h, out_w))

        # For each kernel offset, copy the strided view of every window at that offset.
        for row in range(k):
            row_end = row + step * out_h
            for column in range(k):
                column_end = column + step * out_w
                col[:, :, row, column, :, :] = padded[:, :, row:row_end:step, column:column_end:step]

        return col.transpose(0, 4, 5, 1, 2, 3).reshape(N * out_h * out_w, -1)

In [None]:
# Max-pooling layer
class MaxPool:
    """Max-pooling layer.

       Parameters:
            kernel_size: default 2.
            stride: default 2.
            padding: default 0.
        """
    def __init__(self, kernel_size: int=2,
                 stride: int=2, padding: int=0):
        '''
        input:
            kernel_size: default 2.
            stride: default 2.
            padding: default 0.
        '''
        self.pool_h = kernel_size
        self.pool_w = kernel_size
        self.stride = stride
        self.pad = padding
        # Cached by forward() for backward(): the input and, for every pooling
        # window, the flat index of its maximum.
        self.x = None
        self.arg_max = None

    @staticmethod
    def _im2col(input_data, filter_h, filter_w, stride=1, pad=0):
        """Unfold (N, C, H, W) into a (N*out_h*out_w, C*filter_h*filter_w) patch matrix."""
        N, C, H, W = input_data.shape
        out_h = (H + 2 * pad - filter_h) // stride + 1
        out_w = (W + 2 * pad - filter_w) // stride + 1

        img = np.pad(input_data, [(0, 0), (0, 0), (pad, pad), (pad, pad)], 'constant')
        col = np.zeros((N, C, filter_h, filter_w, out_h, out_w))

        for y in range(filter_h):
            y_max = y + stride * out_h
            for x in range(filter_w):
                x_max = x + stride * out_w
                col[:, :, y, x, :, :] = img[:, :, y:y_max:stride, x:x_max:stride]

        return col.transpose(0, 4, 5, 1, 2, 3).reshape(N * out_h * out_w, -1)

    @staticmethod
    def _col2im(col, input_shape, filter_h, filter_w, stride=1, pad=0):
        """Fold a patch matrix back to input_shape, summing overlapping windows."""
        N, C, H, W = input_shape
        out_h = (H + 2 * pad - filter_h) // stride + 1
        out_w = (W + 2 * pad - filter_w) // stride + 1
        col = col.reshape(N, out_h, out_w, C, filter_h, filter_w).transpose(0, 3, 4, 5, 1, 2)

        # The extra (stride - 1) rows/cols keep the strided slices below in bounds.
        img = np.zeros((N, C, H + 2 * pad + stride - 1, W + 2 * pad + stride - 1))
        for y in range(filter_h):
            y_max = y + stride * out_h
            for x in range(filter_w):
                x_max = x + stride * out_w
                img[:, :, y:y_max:stride, x:x_max:stride] += col[:, :, y, x, :, :]

        return img[:, :, pad:H + pad, pad:W + pad]

    def forward(self, x):
        """
        input:
            x: input of shape (B, C, H_in, W_in).
        output:
            out: output of shape (B, C, H_out, W_out).
        """
        N, C, H, W = x.shape
        out_h = (H + 2 * self.pad - self.pool_h) // self.stride + 1
        out_w = (W + 2 * self.pad - self.pool_w) // self.stride + 1

        # One row per (sample, output position, channel) holding that window's values.
        col = self._im2col(x, self.pool_h, self.pool_w, self.stride, self.pad)
        col = col.reshape(-1, self.pool_h * self.pool_w)

        # backward() needs both of these; they were never stored before, so it
        # failed with AttributeError.
        self.x = x
        self.arg_max = np.argmax(col, axis=1)

        out = np.max(col, axis=1)
        return out.reshape(N, out_h, out_w, C).transpose(0, 3, 1, 2)

    def backward(self, dy):
        """
        input:
            dy: output delta of shape (B, C, H_out, W_out).
        output:
            out: input delta of shape (B, C, H_in, W_in).
        """
        dout = dy.transpose(0, 2, 3, 1)
        pool_size = self.pool_h * self.pool_w
        # Route each output gradient to the position that held the window's maximum.
        dmax = np.zeros((dout.size, pool_size))
        dmax[np.arange(self.arg_max.size), self.arg_max.flatten()] = dout.flatten()
        dmax = dmax.reshape(dout.shape + (pool_size,))
        dcol = dmax.reshape(dmax.shape[0] * dmax.shape[1] * dmax.shape[2], -1)
        return self._col2im(dcol, self.x.shape, self.pool_h, self.pool_w, self.stride, self.pad)




In [None]:
class BatchNorm1d:
    '''
    Batch normalisation over the batch axis of an (N, L) input.

    The backward pass follows the step-by-step computational graph from
    https://kratzert.github.io/2016/02/12/understanding-the-gradient-flow-through-the-batch-normalization-layer.html
    '''
    def __init__(self, length: int, momentum: float=0.9):
        """Module which applies batch normalization to input.

        Args:
            length: L from expected input shape (N, L).
            momentum: default 0.9.
        """
        L = length
        # Plain NumPy parameters: the `tensor` factory used previously is not
        # defined or imported in the visible notebook.
        self.gamma = np.ones(L)    # per-feature scale
        self.beta = np.zeros(L)    # per-feature shift
        self.dgamma = None         # gradients, set by backward()
        self.dbeta = None
        self.momentum = momentum
        self.eps = 1e-5
        self.bn_param = {}
        self.bn_param['running_mean'] = np.zeros(L)
        self.bn_param['running_var'] = np.ones(L)
        # Intermediates cached by forward() for backward().
        self.x_hat = None    # normalised input
        self.xmu = None      # centred input
        self.var = None      # batch variance
        self.avg = None      # batch mean
        self.vareps = None   # sqrt(var + eps)

    def forward(self, x):
        """Forward propagation of batch norm module.

        Normalises with the statistics of the current batch and updates the
        exponential running mean/variance kept in ``self.bn_param``.

        Args:
            x: input of shape (N, L).
        Returns:
            out: output of shape (N, L).
        """
        # https://blog.csdn.net/weixin_39228381/article/details/107896863
        # https://zhuanlan.zhihu.com/p/196277511
        self.avg = np.mean(x, axis=0)                 # per-feature mean over the batch
        self.xmu = x - self.avg
        self.var = np.mean(self.xmu ** 2, axis=0)     # per-feature (biased) variance
        self.vareps = np.sqrt(self.var + self.eps)
        self.x_hat = self.xmu / self.vareps

        out = self.gamma * self.x_hat + self.beta

        m = self.momentum
        self.bn_param['running_mean'] = m * self.bn_param['running_mean'] + (1 - m) * self.avg
        self.bn_param['running_var'] = m * self.bn_param['running_var'] + (1 - m) * self.var

        return out

    def backward(self, dy):
        """Backward propagation of batch norm module.

        Stores the parameter gradients in ``self.dgamma`` and ``self.dbeta``.

        Args:
            dy: output delta of shape (N, L).
        Returns:
            dx: input delta of shape (N, L).
        """
        dout = dy
        xhat, gamma, xmu = self.x_hat, self.gamma, self.xmu
        sqrtvar = self.vareps        # sqrt(var + eps)
        ivar = 1. / sqrtvar          # inverse standard deviation

        # get the dimensions of the input/output
        N, D = dout.shape

        # step9: out = gammax + beta
        self.dbeta = np.sum(dout, axis=0)
        dgammax = dout

        # step8: gammax = gamma * xhat
        self.dgamma = np.sum(dgammax * xhat, axis=0)
        dxhat = dgammax * gamma

        # step7: xhat = xmu * ivar
        divar = np.sum(dxhat * xmu, axis=0)
        dxmu1 = dxhat * ivar

        # step6: ivar = 1 / sqrtvar
        dsqrtvar = -1. / (sqrtvar ** 2) * divar

        # step5: sqrtvar = sqrt(var + eps)
        dvar = 0.5 * 1. / sqrtvar * dsqrtvar

        # step4: var = mean(sq)
        dsq = 1. / N * np.ones((N, D)) * dvar

        # step3: sq = xmu ** 2
        dxmu2 = 2 * xmu * dsq

        # step2: xmu = x - mu
        dx1 = dxmu1 + dxmu2
        dmu = -1 * np.sum(dx1, axis=0)

        # step1: mu = mean(x)
        dx2 = 1. / N * np.ones((N, D)) * dmu

        # step0: x feeds both branches
        return dx1 + dx2

In [None]:
# Activation-function layers
class Sigmoid():
    '''
    Logistic sigmoid activation.

    Parameter:
    z: output of the latest forward pass, cached for the backward pass
    '''
    def __init__(self):
        self.z = None

    def forward(self, y):
        '''
        Forward pass of the sigmoid activation.

        input:
        y: pre-activation matrix, shape (B, N)
        output:
        z: activated matrix, shape (B, N)
        '''
        y = np.asarray(y)
        if not np.issubdtype(y.dtype, np.floating):
            y = y.astype(float)
        # Evaluate each half with the form whose exponent is non-positive:
        # exp(y) / (1 + exp(y)) alone overflows to inf/inf = NaN for large y.
        z = np.empty_like(y)
        nonneg = y >= 0
        z[nonneg] = 1.0 / (1.0 + np.exp(-y[nonneg]))
        exp_y = np.exp(y[~nonneg])
        z[~nonneg] = exp_y / (1.0 + exp_y)
        self.z = z
        return z

    def backward(self, dout):
        '''
        Backward pass of the sigmoid activation.

        input:
        dout: gradient of the loss w.r.t. the sigmoid output
        output:
        dz: gradient w.r.t. the pre-activation y, i.e. dout * z * (1 - z)
        '''
        return dout * self.z * (1.0 - self.z)

class Relu:
    def __init__(self):
        '''
        ReLU activation.

        Parameter:
        mask: boolean matrix marking the inputs that were <= 0 in the latest forward pass
        '''
        self.mask = None

    def forward(self, y):
        '''
        Forward pass of the ReLU activation.

        input:
        y: pre-activation matrix, shape (B, N)
        output:
        z: activated matrix, shape (B, N)
        '''
        self.mask = (y <= 0)   # True where the unit is switched off
        z = y.copy()           # leave the caller's array untouched
        z[self.mask] = 0
        return z

    def backward(self, dout):
        '''
        Backward pass of the ReLU activation.

        input:
        dout: gradient of the loss w.r.t. the ReLU output
        output:
        dz: gradient w.r.t. the pre-activation y (zero where y <= 0)
        '''
        # Work on a copy: zeroing dout in place would corrupt the caller's gradient.
        dz = dout.copy()
        dz[self.mask] = 0
        return dz
    

class Cos:
    def __init__(self):
        '''
        Cosine activation.

        Parameter:
        z: input of the latest forward pass, cached for the backward pass
        '''
        self.z = None

    def forward(self, y):
        '''
        input:
        y: pre-activation matrix, shape (B, N)
        output:
        cos(y), shape (B, N)
        '''
        activated = np.cos(y)
        self.z = y
        return activated

    def backward(self, dout):
        '''
        input:
        dout: gradient of the loss w.r.t. the cosine output
        output:
        gradient w.r.t. y, i.e. dout * sin(-y)
        '''
        local_grad = np.sin(-self.z)   # d cos(y) / dy
        return dout * local_grad
    
class Tanh:

    def __init__(self):
        '''
        Hyperbolic-tangent activation.

        Parameter:
         x: input of the latest forward pass, cached for the backward pass
        '''
        self.x = None

    def forward(self, x):
        '''
        input:
        x: pre-activation matrix, shape (B, N)
        output:
        tanh(x), shape (B, N)
        '''
        activated = np.tanh(x)
        self.x = x
        return activated

    def backward(self, dout):
        '''
        input:
        dout: gradient of the loss w.r.t. the tanh output
        output:
        gradient w.r.t. x, i.e. dout * (1 - tanh(x)^2)
        '''
        activated = np.tanh(self.x)
        return dout * (1 - activated ** 2)

In [None]:
# Loss definition (cross entropy + softmax)
# Softmax activation
def softmax(y):
    '''
    Row-wise softmax.

    input:
    y: matrix of raw prediction scores, one row per sample
    output:
    matrix of the same shape whose rows are non-negative and sum to 1
    '''
    # Subtracting each row's maximum guards exp() against overflow and leaves
    # the result unchanged.
    shifted = y - np.max(y, axis=1, keepdims=True)
    row_totals = np.sum(np.exp(shifted), axis=1, keepdims=True)
    return np.exp(shifted) / row_totals

# Loss layer
class SoftmaxWithLoss():
    '''
    Softmax followed by mean cross-entropy loss.

    Parameter:
    loss : loss value of the latest forward pass
    z : softmax probabilities of the latest forward pass, shape (B, C)
    label: integer class labels of the latest forward pass, flattened to (B,)
    '''

    def __init__(self):
        self.loss = None
        self.z = None
        self.label = None

    def forward(self, y, label):
        '''
        Forward pass of softmax + cross entropy.

        input:
        y : raw prediction scores, shape (B, C)
        label: integer class labels, shape (B,) or (B, 1)
        output:
        loss: mean cross-entropy loss over the batch
        '''
        # Flatten so (B, 1) labels index one entry per row instead of broadcasting.
        label = np.asarray(label).reshape(-1)
        # Work in log space: log(softmax(y)) = shifted - log(sum(exp(shifted))).
        # Taking log() of an underflowed probability would give an infinite loss.
        shifted = y - np.max(y, axis=1, keepdims=True)
        log_z = shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
        batch_size = log_z.shape[0]
        # One-hot targets pick out exactly one log-probability per sample.
        loss = -np.sum(log_z[np.arange(batch_size), label]) / batch_size
        self.loss = loss
        self.z = np.exp(log_z)
        self.label = label
        return loss

    def backward(self):
        '''
        Backward pass of softmax + cross entropy.

        output:
        gradient of the mean loss w.r.t. the raw scores, shape (B, C)
        '''
        batch_size = self.z.shape[0]
        dz = np.copy(self.z)
        # With one-hot targets the gradient is softmax(y) minus 1 at the true class.
        dz[np.arange(batch_size), self.label] -= 1
        dz /= batch_size   # the loss is a batch mean
        return dz

In [None]:
# Neural network implementation
class Network:
    '''
    Three-layer fully connected classifier: affine -> activation -> affine ->
    activation -> affine -> softmax cross-entropy loss.

    The activation layers are exposed as ``sigmoid_1`` / ``sigmoid_2`` and may
    be replaced after construction by any object offering forward/backward.
    '''

    def __init__(self, input_size, hidden_size1, hidden_size2, output_size=10, lr=0.1):
        '''
        Parameters:
        input_size, hidden_size1, hidden_size2, output_size:
        number of units in the input layer, the two hidden layers and the
        output layer (10 by default, one per digit)
        lr: learning rate (0.1 by default)
        output:None
        '''
        # Standard-normal initialisation of all weights and biases.
        W1 = np.random.randn(input_size, hidden_size1)
        W2 = np.random.randn(hidden_size1, hidden_size2)
        W3 = np.random.randn(hidden_size2, output_size)
        b1 = np.random.randn(hidden_size1)
        b2 = np.random.randn(hidden_size2)
        b3 = np.random.randn(output_size)

        self.lr = lr
        self.layer_1 = FullyConnected(W1, b1)
        self.sigmoid_1 = Sigmoid()
        self.layer_2 = FullyConnected(W2, b2)
        self.sigmoid_2 = Sigmoid()
        self.layer_last = FullyConnected(W3, b3)
        self.loss = SoftmaxWithLoss()

    def forward(self, x, label):
        '''
        Forward pass including the loss.

        input:
        x: raw input data, shape (B, N); B is the batch size
        label: class label of each of the B samples
        output:
        the raw output scores and the loss
        '''
        # Share the layer chain with inference_ so the two cannot drift apart.
        y3 = self.inference_(x)
        loss = self.loss.forward(y3, label)
        return y3, loss

    def backward(self):
        '''
        Backward pass; must follow a call to forward().

        input:None
        output:
        the gradients of every layer's W and b (returned for inspection only;
        refresh() reads them from the layers)
        '''
        # Walk the layers in exactly the reverse order of the forward pass.
        d = self.loss.backward()
        d = self.layer_last.backward(d)
        d = self.sigmoid_2.backward(d)
        d = self.layer_2.backward(d)
        d = self.sigmoid_1.backward(d)
        d = self.layer_1.backward(d)

        return self.layer_1.dW, self.layer_1.db, self.layer_2.dW, self.layer_2.db, self.layer_last.dW, self.layer_last.db

    def refresh(self):
        '''Apply one gradient-descent step to every affine layer, in place.'''
        lr = self.lr
        for layer in (self.layer_1, self.layer_2, self.layer_last):
            layer.W -= lr * layer.dW
            layer.b -= lr * layer.db

    @staticmethod
    def _record(losses, accuracy, loss_value, accuracy_value):
        '''Append one evaluation result to whichever history lists were supplied.'''
        if losses is not None:
            losses.append(loss_value)
        if accuracy is not None:
            accuracy.append(accuracy_value)

    def fit_pred(self, train_images, train_labels, test_images, test_labels, Epochs=5, batch_size=100, losses=None, accuracy=None):
        '''
        Train with mini-batch gradient descent, evaluating on the test set
        before training and after every epoch.

        input:
        train_images, train_labels: training set
        test_images, test_labels: test set
        Epochs: number of passes over the training set
        batch_size: samples per update; must be positive
        losses, accuracy: optional lists that receive the test loss / accuracy
        of every evaluation
        output:
        the (losses, accuracy) arguments as passed in
        '''
        if batch_size <= 0:
            # A non-positive step would never advance through the training set.
            raise ValueError("batch_size must be a positive integer")

        samples_num = train_images.shape[0]
        pred, pred_loss, acc = self.predict(test_images, test_labels)
        # Evaluate once before any training to show the starting point.
        print("Initial Test -- Average loss:{:.4f}, Accuracy:{:.3f}\n".format(pred_loss, acc))
        self._record(losses, accuracy, pred_loss, acc)

        for epoch in range(1, Epochs + 1):
            for start in range(0, samples_num, batch_size):
                stop = start + batch_size
                self.forward(train_images[start:stop], train_labels[start:stop])
                self.backward()
                self.refresh()
#                 print("Train Epoch: {}\t batch_size_index:{} Loss:{:.6f}".format(epoch, start+1, self.loss.loss))

#             self.lr = (0.95 ** epoch) * self.lr     # optional learning-rate decay, in case a large rate stalls descent
            # self.loss.loss holds the loss of the last mini-batch of this epoch.
            print("Train Epoch: {}\t Loss:{:.6f}".format(epoch, self.loss.loss))
            pred, pred_loss, acc = self.predict(test_images, test_labels)
            print("Test -- Average loss:{:.4f}, Accuracy:{:.3f}\n".format(pred_loss, acc))
            self._record(losses, accuracy, pred_loss, acc)
        return losses, accuracy

    def predict(self, test_images, test_labels):
        '''
        Evaluate on labelled data.

        output:
        predicted labels, the loss, and the accuracy
        '''
        pred, loss = self.forward(test_images, test_labels)
        pred = np.argmax(pred, axis=1)   # most likely class per sample
        return pred, loss, right_rate(pred, test_labels)

    def inference_(self, inference_images):
        '''Return the raw output scores for unlabelled inputs (no loss computed).'''
        x = inference_images
        y1 = self.layer_1.forward(x)
        z1 = self.sigmoid_1.forward(y1)
        y2 = self.layer_2.forward(z1)
        z2 = self.sigmoid_2.forward(y2)
        y3 = self.layer_last.forward(z2)
        return y3

In [None]:
# Accuracy of a set of predictions
def right_rate(pred_label, label):
    '''
    input:
    pred_label: array of predicted classes
    label: array of true classes
    output:
    rate: fraction of positions where the two arrays agree
    '''
    deltas = pred_label - label
    # A zero difference marks a correct prediction.
    hits = sum(1 for delta in deltas.tolist() if delta == 0)
    return hits / len(deltas)

In [None]:
# Baseline network: 784 inputs, hidden layers of 200 and 60 units, learning rate 0.3 (output size defaults to 10).
model = Network(784, 200, 60, lr=0.3)

In [None]:
# model.fit_pred(train_images,train_labels, test_images, test_labels, Epochs=30)
# model.lr = 0.03
# model.fit_pred(train_images,train_labels, test_images, test_labels, Epochs=10)
# model.lr = 0.01
# model.fit_pred(test_images,test_labels, test_images, test_labels, Epochs=20)

In [None]:
# df_train = pd.read_csv('/kaggle/input/digit-recognizer/train.csv')
# train_img = df_train.values[:, 1:] / 255.0
# train_lab = df_train.values[:, 0]

In [None]:
# train_lab.shape, train_img.shape

In [None]:
# model.lr = 0.05
# model.fit_pred(train_img,train_lab, test_images, test_labels, Epochs=30)

In [None]:
# df_test = pd.read_csv('/kaggle/input/digit-recognizer/test.csv')

In [None]:
# inference_images = df_test.values / 255.0

In [None]:
# inference_images.shape

In [None]:
# pred = model.inference_(inference_images)
# pred = np.argmax(pred, axis=1)   # 求出预测标签

In [None]:
# from pandas import Series,DataFrame
# data = {'ImageId':Series(list(range(1, 28001))),
#        'Label':Series(pred)}
# submit = DataFrame(data)
# print(submit)

In [None]:
# 转化为csv文件
# submission = pd.concat([submit['ImageId'], submit['Label']], axis=1)
# submission.to_csv('/kaggle/working/submission_2.csv', index=False)

# Models for each activation function
- 每个模型不同激活函数的loss曲线
    * sigmoid
    * relu
    * Cos
    * tanh
- 每个模型不同的学习率的曲线（限定20个epoch）
- SGD和Adam算法

### 每个模型不同激活函数

In [None]:
# One (test-loss history, test-accuracy history) pair of lists per activation function.
sig_loss, sig_acc = [], []
relu_loss, relu_acc = [], []
cos_loss, cos_acc = [], []
tanh_loss, tanh_acc = [], []

In [None]:
# Result tables keyed by activation name: loss histories and accuracy histories.
all_loss, accuracy = dict(), dict()

### sigmoid

In [None]:
# Sigmoid activations (the Network default): train for 30 epochs and append the
# test loss/accuracy of every evaluation to sig_loss / sig_acc.
model = Network(784, 200, 60, lr=0.3)
model.fit_pred(train_images,train_labels, test_images, test_labels, Epochs=30, losses=sig_loss, accuracy=sig_acc)

In [None]:
# Register the sigmoid curves under their name for the plots further down.
all_loss['sigmoid'], accuracy['sigmoid'] = sig_loss, sig_acc

### relu

In [None]:
# ReLU activations: swap both activation layers, then train for 30 epochs,
# appending the test loss/accuracy of every evaluation to relu_loss / relu_acc.
model = Network(784, 200, 60, lr=0.1)  # author's note: with lr=0.3 the ReLU model did not converge, and even lower rates sometimes fail; attributed to ReLU's large gradients
model.sigmoid_1 = Relu()
model.sigmoid_2 = Relu()
model.fit_pred(train_images,train_labels, test_images, test_labels, Epochs=30, losses=relu_loss, accuracy=relu_acc)

In [None]:
# Register the ReLU curves under their name for the plots further down.
all_loss['relu'], accuracy['relu'] = relu_loss, relu_acc

### Cos function

In [None]:
# Cosine activations: swap both activation layers, then train for 30 epochs,
# appending the test loss/accuracy of every evaluation to cos_loss / cos_acc.
model = Network(784, 200, 60, lr=0.5)  # author's note: no learning rate tried so far worked; the activation code may be at fault
model.sigmoid_1 = Cos()
model.sigmoid_2 = Cos()
model.fit_pred(train_images,train_labels, test_images, test_labels, Epochs=30, losses=cos_loss, accuracy=cos_acc)
# model.fit_pred(train_images,train_labels, test_images, test_labels, Epochs=30)

In [None]:
# Register the cosine curves under their name for the plots further down.
all_loss['cos'], accuracy['cos'] = cos_loss, cos_acc

### tanh

In [None]:
# Tanh activations: swap both activation layers, then train for 30 epochs,
# appending the test loss/accuracy of every evaluation to tanh_loss / tanh_acc.
model = Network(784, 200, 60, lr=0.2) 
model.sigmoid_1 = Tanh()
model.sigmoid_2 = Tanh()
model.fit_pred(train_images,train_labels, test_images, test_labels, Epochs=30, losses=tanh_loss, accuracy=tanh_acc)
# model.fit_pred(train_images,train_labels, test_images, test_labels, Epochs=30)

In [None]:
# Register the tanh curves under their name for the plots further down.
all_loss['tanh'], accuracy['tanh'] = tanh_loss, tanh_acc

In [None]:
# Display both result dictionaries as the cell output.
all_loss, accuracy

# Visualization

## loss图

In [None]:
# Assign one line colour per trained model, following the insertion order of all_loss.
colors = {
    model_name: color
    for model_name, color in zip(all_loss, ['blue', 'yellow', 'green', 'red'])
}
print(colors)

In [None]:
# Test loss of every model on one figure, with each point annotated by its value.
plt.figure(figsize=(10, 7))
plt.title('The loss of Every model with epoch')
for name, loss in all_loss.items():
    # Derive the x axis from the history length (initial evaluation + one point
    # per epoch) rather than hard-coding 31 points, so the plot still works when
    # Epochs changes or a training cell is run again.
    epoch = np.arange(1, len(loss) + 1)
    plt.plot(epoch, loss, color=colors[name], label=name, linewidth=1.5)

    for x, y in zip(epoch, loss):
        plt.text(x, y, '%3s'%round(y, 1), ha='center', va='bottom')


plt.legend(loc='best')
plt.savefig('/kaggle/working/all_loss.png')
plt.show()

In [None]:
# Test loss of each model in its own panel of a 2x2 grid.
plt.figure(figsize=(29, 20))
for index, data in enumerate(all_loss.items(), 1):
    name, loss = data
    # x axis follows the history length instead of a hard-coded 31 points.
    epoch = np.arange(1, len(loss) + 1)
    plt.subplot(2, 2, index)
    plt.title(f'The loss of model {name} with epoch')
    plt.plot(epoch, loss, color=colors[name], label=name, linewidth=1.5)
    plt.legend(loc='best')
    
plt.savefig('/kaggle/working/The loss of every model with epoch')
plt.show()

## accuracy

In [None]:
# Test accuracy of every model on one figure, with each point annotated by its value.
plt.figure(figsize=(10, 7))
plt.title('The Accuracy of Every model with epoch')
for name, rate in accuracy.items():
    print(name, colors[name])
    # x axis follows the history length instead of a hard-coded 31 points.
    epoch = np.arange(1, len(rate) + 1)
    plt.plot(epoch, rate, color=colors[name], label=name, linewidth=1.5)

    for x, y in zip(epoch, rate):
        plt.text(x, y, '%3s'%round(y, 2), ha='center', va='bottom')


plt.legend(loc='best', borderpad=3)
plt.savefig('/kaggle/working/all_accuracy.png')
plt.show()

In [None]:
# One separate accuracy figure per model, each saved under the model's name.
for name, rate in accuracy.items():
    # x axis follows the history length instead of a hard-coded 31 points.
    epoch = np.arange(1, len(rate) + 1)
    plt.figure()
    plt.title(f'The Accuracy of model {name} with epoch')
    plt.plot(epoch, rate, color=colors[name], label=name)
    plt.legend(loc='best')
    plt.savefig('/kaggle/working/%s_accuracy.png'%name)
    plt.show()

**That's all! Wish you good luck!**