In [2]:
import numpy as np
from torch import nn, optim
import torch
import random
import time

🤔思考：如何加载数据？

In [3]:
with open("labels.txt", "r") as f:
    raw = f.readlines() #读取所有行

tags = []
data = []
for l in raw:
    tags.append(int(l[0]))#每行的第一个字符是标签
    d = l[1:-1]#去掉标签和换行符
    d = map(float, tuple(d)) #将字符串转换为tuple，数字转换为float，方便后续转为tensor
    #tuple相对于list更省内存，因为tuple是不可变的，对象所含method更少
    data.append(tuple(d))

#将标签和数据转为tensor，方便后续切分训练集和测试集
data = torch.tensor(data)
tags = torch.tensor(tags)

#划分训练集和测试集
train_test_ratio = 0.8 #训练集占总数据集的比例

train_size = int(train_test_ratio * len(data)) #训练集大小
test_size = len(data) - train_size #测试集大小

# 切分训练集和测试集
data_train = data[:train_size]
data_test  = data[train_size:]
tags_train = tags[:train_size]
tags_test  = tags[train_size:]

In [4]:
# 直接套用d2l网站上的代码，没有改动
def data_iter(batch_size, features, labels):
    num_examples = len(features)
    indices = list(range(num_examples))
    # 这些样本是随机读取的，没有特定的顺序
    random.shuffle(indices)
    for i in range(0, num_examples, batch_size):
        batch_indices = torch.tensor(indices[i: min(i + batch_size, num_examples)])
        yield features[batch_indices], labels[batch_indices]


batch_size = 10

w = torch.normal(0, 0.01, size=(len(data[0]), 1), requires_grad=True, dtype=torch.float32) #对每个像素都有一个权重
b = torch.zeros(1, requires_grad=True)

👇方案1：Linear Regression

In [5]:
# 训练模型
def linreg(X, w, b):
    return torch.matmul(X, w) + b

# 损失函数
def mse_loss(y_pred, y_true):
    return torch.mean((y_pred - y_true)**2)

# 优化器
def sgd(params, lr, batch_size):
    """小批量随机梯度下降"""
    with torch.no_grad():
        for param in params:    
            param -= lr * param.grad / batch_size
            param.grad.zero_()

sgd优化器数学原理：
- 计算梯度：$\mathbf{g} \leftarrow \partial_{(\mathbf{w},b)} \frac{1}{|\mathcal{B}|} \sum_{i \in \mathcal{B}} l(\mathbf{x}^{(i)}, y^{(i)}, \mathbf{w}, b)$
  - 其中$l$是损失函数，$\mathcal{B}$是训练集，$\mathbf{x}^{(i)}$是第$i$个样本的输入，$y^{(i)}$是第$i$个样本的标签，$\mathbf{w}$和$b$是模型参数
- 更新参数：$\mathbf{w} \leftarrow \mathbf{w} - \eta \cdot \mathbf{g}$，
- $\eta$ 是学习率，控制更新步长


In [7]:
lr = 0.02
num_epochs = 16
net = linreg
loss = mse_loss

for epoch in range(num_epochs):
    for X, y in data_iter(batch_size, data_train, tags_train):
        l = loss(net(X, w, b), y)  # X和y的小批量损失
        # 因为l形状是(batch_size,1)，而不是一个标量。l中的所有元素被加到一起，
        # 并以此计算关于[w,b]的梯度
        l.sum().backward() # 反向传播
        # 使用小批量随机梯度下降更新参数
        sgd([w, b], lr, batch_size) 
    with torch.no_grad():
        tags_pred = torch.round(net(data_train, w, b)).reshape(tags_train.shape)
        acc_num = (tags_pred == tags_train).sum().item()  # 计算预测正确的样本数
        print(f'epoch {epoch + 1:02d}, correct rate on training set: {acc_num/len(data_train)*100:.02f}%')

epoch 01, correct rate on training set: 0.88%
epoch 02, correct rate on training set: 10.56%
epoch 03, correct rate on training set: 21.41%
epoch 04, correct rate on training set: 10.56%
epoch 05, correct rate on training set: 0.00%
epoch 06, correct rate on training set: 11.14%
epoch 07, correct rate on training set: 10.26%
epoch 08, correct rate on training set: 12.02%
epoch 09, correct rate on training set: 9.97%
epoch 10, correct rate on training set: 10.26%
epoch 11, correct rate on training set: 12.02%
epoch 12, correct rate on training set: 10.26%
epoch 13, correct rate on training set: 0.59%
epoch 14, correct rate on training set: 9.38%
epoch 15, correct rate on training set: 12.90%
epoch 16, correct rate on training set: 21.99%


In [8]:
with torch.no_grad():
    train_l = torch.round(net(data_test, w, b)).reshape(tags_test.shape) == tags_test  # 训练集上的损失
    print(f'correct rate on test set: {train_l.sum()/len(tags_test)*100:.02f}%')

correct rate on test set: 23.26%


👆可以看到，正确率很低上不去，说明可能不存在线性可分的情况，需要更换其他回归模型。例如，决策树、随机森林、支持向量机等。