In [65]:
import csv
from matplotlib import pyplot as plt
import numpy as np
import scipy.special
import time
# %matplotlib inline



数据集:
$$
D{_{m \times 1}}
=
\begin{bmatrix}
 d{_1} \\ d{_1}  \\ \cdots \\ d{_m}
\end{bmatrix}
$$

其中:
$$
d{_i}
=
\begin{bmatrix}
 y{^{(i)}} \\ x{^{(i)}_1} \\ x{^{(i)}_2} \\ \cdots \\ x{^{(i)}_n}
\end{bmatrix}
$$

设
$$
X{^{(i)}_{n+1 \times 1}}
=
\begin{bmatrix}
 1 \\ x{^{(i)}_1} \\ x{^{(i)}_2} \\ \cdots \\ x{^{(i)}_n}
\end{bmatrix} 
$$

$$
W{_{n+1 \times 1}}
=
\begin{bmatrix}
 b \\ w{_1} \\ w{_2}\\ \cdots \\ w{_n}
\end{bmatrix}
$$

$$
Z = W{^T}X{^{(i)}}
$$

假设:
$$
h(Z) = Sigmoid(Z)
$$
由sigmoid函数性质,可以认为
$$
\hat{y}{^{(i)}} = h(Z)=  P(y{^{(i)}}=1|X{^{(i)}})
$$

$$
即   \hat{y}{^{(i)}}    表示输出为1的概率 
$$

$$ p(y|x)= 
\begin{cases} 
\hat{y}{^{(i)}}, & \text {if $y=1$} \\ 
1-\hat{y}{^{(i)}}, & \text{if $y=0$} 
\end{cases} 
$$

可得

$$
p(y|x)
=
\hat{y}{^y}(1-\hat{y}){^{1-y}}
$$

则定义loss function
$$
l(\hat{y}{^{(i)}},y{^{(i)}})
=
log(p(y|x))
=-[y{^{(i)}}\log{(\hat{y}{^{(i)}})} + (1-y)\log{(1-\hat{y}{^{(i)}})}]
$$

cost function : 给定的训练集数据的平均loss function
$$
C(W,b) 
= 
\ frac {1}{m}\sum_{i=1}^m l(\hat{y}{^{(i)}},y{^{(i)}})
$$

----
## 梯度下降
对于一条数据

$$
dy{^{(i)}}=
\frac{{\rm d}l(\hat{y}{^{(i)}},y{^{(i)}})}{{\rm d}\hat{y}{^{(i)}}}
=
\frac {y{^{(i)}}}{\hat{y}{^{(i)}}} + 
\frac {1-y{^{(i)}}}{1-\hat{y}{^{(i)}}}
$$

$$
\frac{{\rm d}\hat{y}{^{(i)}}}{{\rm d}Z}
=
\hat{y}{^{(i)}}(1+\hat{y}{^{(i)}})
$$

$$
dz{^{(i)}}=
\frac{{\rm d}l(\hat{y}{^{(i)}},y{^{(i)}})}{{\rm d}Z}
=
\frac{{\rm d}l(\hat{y}{^{(i)}},y{^{(i)}})}{{\rm d}\hat{y}{^{(i)}}} \frac{{\rm d}\hat{y}{^{(i)}}}{{\rm d}Z}
=
\hat{y{^{(i)}}} - y{^{(i)}}
$$

$$
\frac{{\rm d}Z}{{\rm d}w{_j}}
=x{^{(i)}_j}
$$
$$
\frac{{\rm d}Z}{{\rm d}b}
=1
$$
所以有
$$
dw{^{(i)}_j} =
\frac{{\rm d}l(\hat{y}{^{(i)}},y{^{(i)}})}{{\rm d}w{_i}}
=
\frac{{\rm d}l(\hat{y}{^{(i)}},y{^{(i)}})}{{\rm d}\hat{y}{^{(i)}}}
*
\frac{{\rm d}\hat{y}{^{(i)}}}{{\rm d}Z}
*
\frac{{\rm d}Z}{{\rm d}w{_i}}
=
x{^{(i)}_j}(\hat{y}{^{(i)}}-y{^{(i)}})
$$
$$
db=
\frac{{\rm d}l(\hat{y}{^{(i)}},y{^{(i)}})}{{\rm d}b}
=
\frac{{\rm d}l(\hat{y}{^{(i)}},y{^{(i)}})}{{\rm d}\hat{y}{^{(i)}}}
*
\frac{{\rm d}\hat{y}{^{(i)}}}{{\rm d}Z}
=
\hat{y}{^{(i)}}-y{^{(i)}}
$$

遍历整个训练集,进行累加有
$$
dw{_j} = 
\frac{1}{m}\sum_{i=1}^{m}dw{^{(i)}_j}
=
\frac{1}{m}\sum_{i=1}^{m}x{^{(i)}_j}(\hat{y}{^{(i)}}-y{^{(i)}})
$$

----

## 向量化
由于每条数据含有n个特征,对应n个参数,需使用for遍历n个w计算dw,向量化可简化for循环

$$
dw{^{(i)}}
=
\begin{bmatrix}
 \hat{y}{^{(i)}}-y{^{(i)}} \\ 
 x{^{(i)}_1}(\hat{y}{^{(i)}}-y{^{(i)}}) \\ 
  x{^{(i)}_2}(\hat{y}{^{(i)}}-y{^{(i)}}) \\ 
 \vdots \\ 
 x{^{(i)}_n}(\hat{y}{^{(i)}}-y{^{(i)}}) \\ \end{bmatrix}
$$

$$
dw
=
\frac{1}{m}
\begin{bmatrix}
 \sum_{i=1}^{m}\hat{y}{^{(i)}}-y{^{(i)}} \\ 
 \sum_{i=1}^{m}x{^{(i)}_1}(\hat{y}{^{(i)}}-y{^{(i)}}) \\ 
  \sum_{i=1}^{m}x{^{(i)}_2}(\hat{y}{^{(i)}}-y{^{(i)}}) \\ 
 \vdots \\ 
 \sum_{i=1}^{m}x{^{(i)}_n}(\hat{y}{^{(i)}}-y{^{(i)}}) \\ 
 \end{bmatrix}
$$
至此消除了一条数据内部循坏,但对于dw仍需遍历整个数据集

$$
X
=
\begin{bmatrix}
1 & 1 & \cdots & 1 \\
x{^{(1)}_1} & x{^{(2)}_1} & \cdots & x{^{(m)}_1}\\
x{^{(1)}_2} & x{^{(2)}_2} & \cdots & x{^{(m)}_2}\\
\vdots & \vdots & \ddots & \vdots\\
x{^{(1)}_n} & x{^{(2)}_n} & \cdots & x{^{(m)}_n}\\
\end{bmatrix}
$$

$$
dZ = \begin{bmatrix}
dz{^{(1)}}\\
dz{^{(2)}}\\
\vdots\\
dz{^{(m)}}\\
\end{bmatrix}
=
\begin{bmatrix}
\hat{y}{^{(1)}}-y{^{(1)}}\\
\hat{y}{^{(2)}}-y{^{(2)}}\\
\vdots\\
\hat{y}{^{(m)}}-y{^{(m)}}\\ 
\end{bmatrix}
$$

所以有

$$
dw=
\frac{1}{m}XdZ{^T}=
\begin{bmatrix}
 \sum_{i=1}^{m}\hat{y}{^{(i)}}-y{^{(i)}} &
 \sum_{i=1}^{m}x{^{(i)}_1}(\hat{y}{^{(i)}}-y{^{(i)}}) &
 \sum_{i=1}^{m}x{^{(i)}_2}(\hat{y}{^{(i)}}-y{^{(i)}}) & 
 \cdots &
 \sum_{i=1}^{m}x{^{(i)}_n}(\hat{y}{^{(i)}}-y{^{(i)}}) &
\end{bmatrix}
$$





In [68]:
#获取数据
def get_data():
    with open('data/mnist_train.csv','r') as f:
        data = [x.strip().split(',') for x in f]
        return data
def get_test_data():
    with open('data/mnist_test.csv','r') as f:
        data = [x.strip().split(',') for x in f]
        return data

In [36]:
# 预处理数据
data = np.asarray(get_data(),dtype='float').T
# 由于是二分类,这里只判断图片是否等于0,当图片是0 表记 y=1 其余数字y=0
y = np.array([[1 if x==0 else 0 for x in data[0]]])
print(y)
x = data
x = (x/255.0*0.99)+0.01
x[0] = np.zeros(len(y[0]))+1
number_of_features = len(x)-1
print('number_of_features',number_of_features)


[[0 1 0 ... 0 0 0]]
number_of_features 784


In [None]:
#权重矩阵
w = np.array([[1]*(number_of_features+1)]).T
print(w)
# 学习率
learning_rate = 0.1

In [59]:
output_y = np.dot(w.T,x)
output_y = scipy.special.expit(output_y)
print(y)
dz = output_y - y
print(dz)
print(output_y)

[[0 1 0 ... 0 0 0]]
[[1. 0. 1. ... 1. 1. 1.]]
[[1. 1. 1. ... 1. 1. 1.]]


In [None]:
dw = np.dot(x,dz.T)/number_of_features
#更新系数w
w = w - learning_rate*dw

In [100]:
# 综合起来
start_time = time.time()
iterations = 100
learning_rate = 0.5
number_of_features = 784
# 预处理数据
data = np.asarray(get_data(),dtype='float').T
# 由于是二分类,这里只判断图片是否等于0,当图片是0 表记 y=1 其余数字y=0
y = np.array([[1 if x==0 else 0 for x in data[0]]])
x = data
x = (x/255.0*0.99)+0.01
x[0] = np.zeros(len(y[0]))+1

w = np.array([[1]*(number_of_features+1)]).T
for i in range(iterations):
    output_y = np.dot(w.T,x)
    # 模型计算出的y
    output_y = scipy.special.expit(output_y)
    # 求导
    dz = output_y - y
    dw = np.dot(x,dz.T)/number_of_features
    # 更新模型
    w = w - learning_rate*dw
print(f'duration:{time.time()-start_time}')

duration:17.616844177246094


In [101]:
test_set = get_test_data()
total_number = len(test_set)
right = 0
for data in test_set:
    label = data[0]
    data[0]=1
    data = np.asarray([data],dtype='float')
    res = scipy.special.expit(np.dot(data,w))
    if label == '0' and res[0][0] == 1:
        right += 1
    if label != '0' and res[0][0] == 0:
        right += 1
print(right/total_number*100)
    

98.19
