# 导入数据，展开为一维数组

In [47]:
import numpy as np
import sys,os
sys.path.append(os.pardir)# 为了导入父目录中的文件而进行的设定
from mnist import load_mnist
(x_train,t_train),(x_test,t_test) = load_mnist(flatten=True,normalize=False,one_hot_label=True) #将图像展开为一维数组
#以图像、标签形式导入

In [48]:
print(x_train.shape)
print(t_train.shape)
print(x_test.shape)
print(t_test.shape)

(60000, 784)
(60000, 10)
(10000, 784)
(10000, 10)


# 参数学习

- minibatch
- 计算梯度
- 更新参数
- repeat

In [49]:
#交叉熵损失函数 ???
def cross_entropy_error(y,t):
    if y.ndim==1: #当y维度为1，需要改变数据的形状    ？？？
        t=t.reshape(1,t.size)
        y=y.reshape(1,y.size)
    batch_size=y.shape[0]
    delta=1e-7
    return -np.sum(t*np.log(y+delta))/batch_size 
    #delta是作为保护，防止出现np.log(0)，这一-inf会使后续计算无法进行

In [50]:
def softmax(a):
    c=np.max(a)
    exp_a=np.exp(a-c) #防溢出：易证，本函数同加减某个数不会改变最终结果
    sum_exp_a=np.sum(exp_a)
    y=exp_a/sum_exp_a
    return y
class Relu:
    def __init__(self):
        self.mask=None
    def forward(self,x):
        self.mask=(x<=0) #True、False的数组
        out=x.copy()  #大于0处保持不变
        out[self.mask]=0 #输入值小于等于0处设为0
        return out
    def backward(self,dout):
        dout[self.mask]=0 #小于0处导数改为0
        dx=dout #其他部分保持不变
        return dx

class Affine:
    def __init__(self,W,b):
        self.W=W
        self.b=b
        self.x=None
        self.dW=None
        self.db=None
    def forward(self,x):
        self.x=x
        out=x @ self.W+self.b
        print(out)
        return out
    def backward(self,dout):
        dx=dout @ self.W.T #传给前一层
        self.dW=self.x.T @ dout  # 本层的对W的导数（是长这样的吗？）
        self.db=np.sum(dout,axis=0)
        return dx

class SoftmaxWithLoss:
    def __init__(self):
        self.loss=None
        self.y=None
        self.t=None
    def forward(self,x,t):
        self.t=t
        self.y=softmax(x)
        self.loss=cross_entropy_error(self.y,self.t)
        print(self.loss)
        return self.loss
    def backward(self,dout=1): #这里设了最后一层的loss为1，但是事实上貌似不完全是？
        batch_size=self.t.shape[0]  #啥意思
        dx=(self.y-self.t)/batch_size
        # print(dx)
        return dx

In [51]:
from collections import OrderedDict
class FiveLayerNet:
    def __init__(self,input_size,hsize1,hsize2,hsize3,hsize4,output_size,weight_init_std=0.01):
        #初始化权重
        self.params={}
        self.params["W1"]=weight_init_std*np.random.randn(input_size,hsize1)
        self.params["b1"]=np.zeros(hsize1)
        self.params["W2"]=weight_init_std*np.random.randn(hsize1,hsize2)
        self.params["b2"]=np.zeros(hsize2)
        self.params["W3"]=weight_init_std*np.random.randn(hsize2,hsize3)
        self.params["b3"]=np.zeros(hsize3)
        self.params["W4"]=weight_init_std*np.random.randn(hsize3,hsize4)
        self.params["b4"]=np.zeros(hsize4)
        self.params["W5"]=weight_init_std*np.random.randn(hsize4,output_size)
        self.params["b5"]=np.zeros(output_size)

        #生成层
        self.layers=OrderedDict()  #有序字典
        self.layers["Affine1"]=Affine(self.params["W1"],self.params["b1"])
        self.layers['Relu1']=Relu()
        self.layers["Affine2"]=Affine(self.params["W2"],self.params["b2"])
        self.layers['Relu2']=Relu()
        self.layers["Affine3"]=Affine(self.params["W3"],self.params["b3"])
        self.layers['Relu3']=Relu()
        self.layers["Affine4"]=Affine(self.params["W4"],self.params["b4"])
        self.layers['Relu4']=Relu()
        self.layers["Affine5"]=Affine(self.params["W5"],self.params["b5"])
        self.lastLayer=SoftmaxWithLoss()

    def predict(self,x):
        for layer in self.layers.values():
            x=layer.forward(x)
        return x
    
    #x：输入数据，t：监督数据
    def loss(self,x,t):
        y=self.predict(x)
        return self.lastLayer.forward(y,t)
    def accuracy(self,x,t):
        y=self.predict(x)
        y=np.argmax(y,axis=1)
        if t.ndim != 1:
            t=np.argmax(t,axis=1)
        accuracy=np.sum(y==t)/float(x.shape[0])
        return accuracy
        
    def gradient(self,x,t):
        #forward
        self.loss(x,t)

        #backward
        dout=1
        dout=self.lastLayer.backward(dout)
        layers=list(self.layers.values())  #是一个类对象的列表
        layers.reverse() #把层里面的值都反转过来
        for layer in layers:
            dout=layer.backward(dout)  # 更新各个类对象里面的dW和db
        
        grads={}
        grads['W1']=self.layers['Affine1'].dW
        grads["b1"]=self.layers['Affine1'].db
        grads['W2']=self.layers['Affine2'].dW
        grads["b2"]=self.layers['Affine2'].db
        grads['W3']=self.layers['Affine3'].dW
        grads["b3"]=self.layers['Affine3'].db
        grads['W4']=self.layers['Affine4'].dW
        grads["b4"]=self.layers['Affine4'].db
        grads['W5']=self.layers['Affine5'].dW
        grads["b5"]=self.layers['Affine5'].db
        return grads

In [52]:
network=FiveLayerNet(784,40,42,38,40,10)
print("forward finished")
iters_num=400
train_size=x_train.shape[0]
batch_size=1000
learning_rate=0.1
train_loss_list=[]
train_acc_list=[]
test_acc_list=[]

iter_per_epoch=max(train_size/batch_size,1)

for i in range(iters_num):
    batch_mask=np.random.choice(train_size, batch_size) #挑选minibatch的index了
    x_batch=x_train[batch_mask]
    t_batch=t_train[batch_mask]

    grad=network.gradient(x_batch, t_batch)
    # print("backward for "+str(i))

    for key in ('W1','b1','W2','b2','W3','b3','W4','b4','W5','b5'):
        network.params[key] -= learning_rate*grad[key]
        
    loss=network.loss(x_batch,t_batch)
    train_loss_list.append(loss)

    if i%iter_per_epoch==0:
        train_acc=network.accuracy(x_train,t_train)
        test_acc=network.accuracy(x_test,t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print(train_acc,test_acc)

forward finished
[[ -7.04735642 -18.78566179 -90.01704249 ...  30.95484001  11.17370973
  -10.13635517]
 [-18.75154253  27.43131058 -56.88884953 ...  38.13373094 -31.59845366
   31.46672652]
 [  3.80044999  20.38289215 -45.27270459 ...  56.84959876   2.93312075
    1.02761781]
 ...
 [  7.19829468  -0.77677637 -16.69372366 ...  12.92104462 -11.79240951
   -2.02702008]
 [ 21.61674028  -9.5273505  -18.64453518 ...   9.85732144 -33.36088716
    1.80562706]
 [ 10.72264742   1.22405581 -49.59096843 ...  46.68363303 -29.37619647
  -15.97541962]]
[[-0.78248384 -2.90680703 -1.35505226 ...  0.89150216  1.65036047
   0.25877304]
 [ 1.16650847 -2.37095284 -1.01861597 ... -0.39573549  1.19980084
   0.24758141]
 [-1.26027597 -2.38210306 -0.54420624 ...  0.71559499  0.68687084
  -0.23618614]
 ...
 [-0.81757964 -2.49937208 -0.88719531 ...  0.097887    0.40453541
  -0.04193634]
 [ 0.86236231 -1.59099417 -0.87115489 ... -1.91187645  0.84664157
   0.88201168]
 [ 0.09751573 -0.85430056 -0.87926322 ... -0.

  out=x @ self.W+self.b
  exp_a=np.exp(a-c) #防溢出：易证，本函数同加减某个数不会改变最终结果


[[inf inf inf ... inf inf inf]
 [inf inf inf ... inf inf inf]
 [inf inf inf ... inf inf inf]
 ...
 [inf inf inf ... inf inf inf]
 [inf inf inf ... inf inf inf]
 [inf inf inf ... inf inf inf]]
nan
[[ 2.88961579e+70 -3.20641443e+02  3.22753173e+67 ...  3.70898962e+70
   1.69921729e+69  2.87337044e+69]
 [ 5.57277011e+70 -7.51235281e+02  6.22445809e+67 ...  7.15297397e+70
   3.27702643e+69  5.54144013e+69]
 [ 2.74689455e+70 -3.22029978e+02  3.06812045e+67 ...  3.52579862e+70
   1.61529111e+69  2.73145157e+69]
 ...
 [ 4.22847192e+70 -5.74890683e+02  4.72295567e+67 ...  5.42748919e+70
   2.48652177e+69  4.20469955e+69]
 [ 2.64327390e+70 -3.58514096e+02  2.95238226e+67 ...  3.39279551e+70
   1.55435775e+69  2.62841348e+69]
 [ 1.35587430e+70 -1.58297450e+02  1.51443225e+67 ...  1.74034338e+70
   7.97311898e+68  1.34825161e+69]]
[[ 1.76333486e+137 -1.61363098e+069 -9.15355869e+068 ... -3.34114104e+068
   2.13572906e+138  1.92147936e+138]
 [ 3.40068040e+137 -3.11196891e+069 -1.76531006e+069 ... 

KeyboardInterrupt: 

In [None]:
train_loss_list

[9.209178825049108,
 9.209087958914726,
 9.209179725124185,
 9.20893075922167,
 9.20922217794285,
 9.20875981574438,
 9.208249599226411,
 9.208568049361878,
 9.209117198451347,
 9.208806612062435,
 9.208868582949629,
 9.208913157496589,
 9.20834593832746,
 9.208898019337521,
 9.20834678087008,
 9.208923967701953,
 9.20811816608838,
 9.208637886018169,
 9.208991046207181,
 9.207456719273376,
 9.208042274498942,
 9.208297025505283,
 9.208112084667452,
 9.208370577896273,
 9.208086116090875,
 9.209173932969799,
 9.207994306765618,
 9.208816069479928,
 9.208141581290311,
 9.209122554770463,
 9.209582555598438,
 9.20801598777419,
 9.207526177434081,
 9.208018409838587,
 9.20845648093074,
 9.208184138624889,
 9.20802888720133,
 9.206556259394645,
 9.209516907673997,
 9.20932934256949,
 9.208397194561153,
 9.206076659564967,
 9.207816671477804,
 9.206766602658178,
 9.208756479339335,
 9.208137607585641,
 9.207834045558684,
 9.2067159490148,
 9.208027597087128,
 9.209152364669032,
 9.207494209

In [None]:
train_acc_list

[0.10441666666666667,
 0.11236666666666667,
 0.09871666666666666,
 0.09871666666666666,
 0.09871666666666666,
 0.09871666666666666,
 0.09871666666666666]

In [None]:
test=FiveLayerNet(784,10,12,8,10,10)

In [None]:
test.predict(x_train)[0]

array([ 1.30386244e-05, -1.46211230e-05, -5.32366726e-06, -6.61053326e-06,
       -5.16376909e-06,  1.07851303e-05, -2.27714953e-05,  8.39069057e-06,
       -3.01397829e-07, -5.97682714e-06])

In [None]:
grad

{'W1': array([[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        ...,
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]]),
 'b1': array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan,  0., nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan]),
 'W2': array([[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        ...,
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]]),
 'b2': array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan

In [None]:
import numpy as np
t1=np.array([[1,2,3]]).T
t2=np.array([[2,3,4]])
t1@t2

array([[ 2,  3,  4],
       [ 4,  6,  8],
       [ 6,  9, 12]])

In [1]:
def softmax(x):
    if x.ndim == 2:
        x = x.T
        x = x - np.max(x, axis=0)
        y = np.exp(x) / np.sum(np.exp(x), axis=0)
        return y.T 
    #若不加上面的if这一段，则必然会溢出，导致模型表现极差
    x = x - np.max(x) # 溢出对策
    return np.exp(x) / np.sum(np.exp(x))

In [7]:
import numpy as np
softmax(np.array([[1,2],[2,1]]))

array([[0.26894142, 0.73105858],
       [0.73105858, 0.26894142]])

In [10]:
def bsoftmax(x):
    # x = x - np.max(x) # 溢出对策
    return np.exp(x) / np.sum(np.exp(x))

In [11]:
bsoftmax(np.array([[1,2],[2,1]]))

array([[0.13447071, 0.36552929],
       [0.36552929, 0.13447071]])

In [13]:
np.sum(np.array([[1,2],[2,1]]), axis=0)

array([3, 3])

In [14]:
np.array([[1,2],[2,1]]).sum()

6

In [15]:
def zsoftmax(X):
    X_exp=np.exp(X)
    partition=X_exp.sum()
    return X_exp/partition 

In [16]:
zsoftmax(np.array([[1,2],[2,1]]))

array([[0.13447071, 0.36552929],
       [0.36552929, 0.13447071]])

In [21]:
test=np.array([[1,2,3],[2,1,3]])

In [24]:
test.reshape(test.shape[0],-1)

array([[1, 2, 3],
       [2, 1, 3]])

In [27]:
(1,1).count

<function tuple.count(value, /)>

In [30]:
if 1:
    print("y")

y


In [31]:
#副本
class Affine:
    def __init__(self, W, b):
        self.W =W
        self.b = b
        
        self.x = None
        self.original_x_shape = None
        # 权重和偏置参数的导数
        self.dW = None
        self.db = None

    def forward(self, x):
        # 对应张量
        self.original_x_shape = x.shape
        x = x.reshape(x.shape[0], -1)
        self.x = x

        out = np.dot(self.x, self.W) + self.b

        return out

    def backward(self, dout):
        dx = np.dot(dout, self.W.T)
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        
        dx = dx.reshape(*self.original_x_shape)  # 还原输入数据的形状（对应张量）
        return dx
