In [1]:
from mxnet import ndarray as nd
from mxnet import autograd as ag

In [2]:
num_inputs = 2
num_examples = 1000

true_w = [2, -3.4]
true_b = 4.2

X = nd.random_normal(shape=(num_examples, num_inputs))
y = true_w[0] * X[:,0] + true_w[1] * X[:, 1] + true_b
y += 0.01 * nd.random_normal(shape=y.shape)

In [3]:
print(X[0], y[0])


[ 1.16307867  0.48380461]
<NDArray 2 @cpu(0)> 
[ 4.87962484]
<NDArray 1 @cpu(0)>


In [4]:
import matplotlib.pyplot as plt
plt.scatter(X[:, 1].asnumpy(), y.asnumpy())
plt.show()

<matplotlib.figure.Figure at 0x10fc67550>

In [5]:
# 通过 Python 的 yeild 构造一个 iterator
import random
batch_size = 32
def data_iter():
    idx = list(range(num_examples))
    random.shuffle(idx)
    for i in range(0, num_examples, batch_size):
        j = nd.array(idx[i:min(i+batch_size, num_examples)])
        yield nd.take(X, j), nd.take(y, j)

In [6]:
for data, label in data_iter():
    print(data, label)
    break


[[-0.72485566  1.11196363]
 [ 1.60673869  0.18408279]
 [ 0.22842942  0.62486178]
 [-0.93251252  2.05497503]
 [-1.50477183 -0.05162206]
 [-1.46828771  0.56517828]
 [ 1.17471194  1.36774385]
 [ 0.43260059  0.94352221]
 [-0.2103665  -2.23908734]
 [ 1.4021405  -2.4169414 ]
 [ 0.44712451  0.28236011]
 [-2.77867079  0.01066511]
 [-0.36942869  0.54585946]
 [-1.05003417  1.45409334]
 [ 1.17235053  1.52714729]
 [ 0.57125562 -1.57108414]
 [ 1.7005018   0.25498316]
 [ 0.11150214 -0.22487849]
 [-1.05367899 -0.26470137]
 [-0.25803244  0.02452744]
 [ 1.45263755  2.13133287]
 [-0.66490102  0.50258273]
 [-0.69040912  0.09003334]
 [ 0.300295    0.73225945]
 [-0.48599643 -1.13515449]
 [-0.29419237 -0.239079  ]
 [ 0.425275   -0.37855875]
 [-0.68288445 -0.25153375]
 [ 0.46851122  0.81799328]
 [ 0.00585518 -0.32527655]
 [ 0.25081477 -0.30159083]
 [ 0.80678338 -0.55400944]]
<NDArray 32x2 @cpu(0)> 
[ -1.02687657   6.79803848   2.51352644  -4.65387917   1.35681796
  -0.63820291   1.92082524   1.8577776   11.

In [7]:
w = nd.random_normal(shape=(num_inputs, 1))
b = nd.zeros((1,))
params = [w, b]

In [8]:
for param in params:
    param.attach_grad()

In [15]:
def net(X):
    return nd.dot(X, w) + b

In [32]:
def square_loss(y_hat, y):
    return (y_hat-y)**2

In [33]:
def SGD(params, lr):
    for param in params:
        param[:] = param - lr * param.grad

In [34]:
# 模型函数
def real_fn(X):
    return 2 * X[:, 0] - 3.4 * X[:, 1] + 4.2

In [35]:
# 绘制损失随训练次数降低的折线图，以及预测值和真实值的散点图
def plot(losses, X, sample_size=100):
    xs = list(range(len(losses)))
    f, (fg1, fg2) = plt.subplots(1, 2)
    fg1.set_title('Loss during training')
    fg1.plot(xs, losses, '-r')
    fg2.set_title('Estimated vs real function')
    fg2.plot(X[:sample_size, 1].asnumpy(),
             net(X[:sample_size, :]).asnumpy(), 'or', label='Estimated')
    fg2.plot(X[:sample_size, 1].asnumpy(),
             real_fn(X[:sample_size, :]).asnumpy(), '*g', label='Real')
    fg2.legend()
    plt.show()

In [None]:
epochs = 5
learning_rate = .001
niter = 0
losses = []
moving_loss = 0.
smoothing_constant = .01

# 训练
for e in range(epochs):
    total_loss = 0.

    for data, label in data_iter():
        with ag.record():
            output = net(data)
            loss = square_loss(output, label)
        loss.backward()
        SGD(params, learning_rate)
        total_loss += nd.sum(loss).asscalar()

        # 记录每读取一个数据点后，损失的移动平均值的变化；
        niter +=1
        curr_loss = nd.mean(loss).asscalar()
        moving_loss = (1 - smoothing_constant) * moving_loss + (smoothing_constant) * curr_loss

        # correct the bias from the moving averages
        est_loss = moving_loss/(1-(1-smoothing_constant)**niter)

        if (niter + 1) % 100 == 0:
            losses.append(est_loss)
            print("Epoch %s, batch %s. Moving avg of loss: %s. Average loss: %f" % (e, niter, est_loss, total_loss/num_examples))
            plot(losses, X)

In [27]:
print(true_w, w)

[2, -3.4] 
[[  1.00215664e+16]
 [ -1.38553229e+14]]
<NDArray 2x1 @cpu(0)>


In [28]:
print(true_b, b)

4.2 
[ -8.28094928e+15]
<NDArray 1 @cpu(0)>
