### 1. 原始数据读取
- 并不是把所有图像全部读进内存！
- 而是把所有图像的`路径`和`类别`归纳和梳理出来！
- img_path
- img_label

In [1]:
"""
    尝试读取 train 
"""
import os

# 当前目录
current_dir = os.getcwd()

# 获取上一级目录
parent_dir = os.path.abspath(os.path.join(current_dir, "..", ".."))

train_root = os.path.join(parent_dir, "day08", "codes", "gesture", "train")
train_paths = []
train_labels = []
print(train_root)

for label in os.listdir(train_root):
    label_root = os.path.join(train_root, label)
    for file in os.listdir(label_root):
        file_path = os.path.join(label_root, file)
        train_paths.append(file_path)
        train_labels.append(label)

D:\workspaces\ai_study\day08\codes\gesture\train


In [2]:
"""
    尝试读取 test 
"""
import os
test_root = os.path.join(parent_dir, "day08", "codes", "gesture", "test")
test_paths = []
test_labels = []

for label in os.listdir(test_root):
    label_root = os.path.join(test_root, label)
    for file in os.listdir(label_root):
        file_path = os.path.join(label_root, file)
        test_paths.append(file_path)
        test_labels.append(label)

In [3]:
# 构建 标签字典 label dict
labels = ["zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
label2idx = {label: idx for idx, label in enumerate(labels)}
idx2label = {idx: label for label, idx in label2idx.items()}

### 2. 批量化打包
- 继承 `Dataset`，自定义一个数据集
- 实例化 `DataLoader`

In [4]:
# 引入必要的工具类
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from PIL import Image
from torchvision import transforms
import torch

In [5]:
class GestureDataset(Dataset):
    """
        自定义手势识别数据集
    """
    def __init__(self, X, y):
        """
            初始化
        """
        self.X = X
        self.y = y

    def __getitem__(self, idx):
        """
            实现：
                - 按下标来索引一个样本
        """
        # 获取图像路径
        img_path = self.X[idx]
        # 读取图像
        img = Image.open(fp=img_path)
        # 统一大小
        img = img.resize((32, 32))
        # 转张量 [C, H, W]
        # [0, 1]
        img = transforms.ToTensor()(img)
        # [-1, 1]
        img = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])(img)

        # 读取标签
        img_label = self.y[idx]
        # 标签转 id
        img_idx = label2idx.get(img_label)
        # 转张量
        label = torch.tensor(data=img_idx, dtype=torch.long)

        return img, label

    
    def __len__(self):
        """
            返回该数据集的样本个数
        """
        return len(self.X)

In [6]:
# 训练集加载器
train_dataset = GestureDataset(X=train_paths, y=train_labels)
train_dataloader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
# 测试集加载器
test_dataset = GestureDataset(X=test_paths, y=test_labels)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

In [7]:
# 测试
for X, y in test_dataloader:
    print(X.shape)
    print(y.shape)
    break

torch.Size([32, 3, 32, 32])
torch.Size([32])


### 3. 搭建模型

In [8]:
import torch
from torch import nn

In [9]:
class Model(nn.Module):
    """
        自定义一个神经网络
    """
    def __init__(self, in_channels=3, n_classes=10):
        """
            初始化
        """
        super(Model, self).__init__()

        # 1. 特征抽取
        self.feature_extractor = nn.Sequential(
            nn.Conv2d(in_channels=in_channels, 
                               out_channels=6, 
                               kernel_size=5,
                               stride=1,
                               padding=0),
            nn.MaxPool2d(kernel_size=2, 
                                stride=2,
                                padding=0),
            nn.Conv2d(in_channels=6,
                              out_channels=16,
                              kernel_size=5,
                              stride=1,
                              padding=0),
            nn.MaxPool2d(kernel_size=2, 
                               stride=2,
                               padding=0)
        )

        # 2. 分类输出
        self.classifier = nn.Sequential(
            nn.Flatten(start_dim=1, end_dim=-1),
            nn.Linear(in_features=400, out_features=120),
            nn.Linear(in_features=120, out_features=84),
            nn.Linear(in_features=84, out_features=n_classes)
        )

    def forward(self, x):
        """
            前向传播
        """
        # 1. 先做特征抽取
        x = self.feature_extractor(x)
        # 2. 再做分类回归
        x = self.classifier(x)
        return x

### 4. 训练过程

In [12]:
def train():
    """
        定义训练函数
    """
    for epoch in range(epochs):
        for batch_X, batch_y in train_dataloader:
            # 1. 数据搬家
            batch_X = batch_X.to(device=device)
            batch_y = batch_y.to(device=device)
            # 2. 正向传播
            y_pred = model(batch_X)
            # 3. 计算损失
            loss = loss_fn(y_pred, batch_y)
            # 4. 反向传播
            loss.backward()
            # 5. 优化一步
            optimizer.step()
            # 6. 清空梯度
            optimizer.zero_grad()
        # 7. 打印损失
        print(loss.item())           

In [13]:
# 设置训练轮次
epochs = 50
# 设备
device = "cuda" if torch.cuda.is_available() else "cpu"
# 实例化模型
model = Model()
model.to(device=device)
# 优化器
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
# 损失函数
loss_fn = nn.CrossEntropyLoss()

In [14]:
train()

0.968528687953949
0.5537124872207642
0.11572491377592087
0.20999424159526825
0.35636427998542786
0.04770141467452049
0.40265074372291565
0.019015749916434288
0.03408932313323021
0.018282009288668633
0.013586784712970257
0.030104253441095352
0.09641117602586746
0.01463738176971674
0.19347600638866425
0.09610701352357864
0.006743032485246658
0.02750261314213276
0.0005216533900238574
0.00012698538193944842
9.357242379337549e-05
5.9960399084957317e-05
0.00034650228917598724
0.0005969497142359614
0.0028842701576650143
0.0012410080526024103
0.0007446525269187987
2.1252277292660438e-05
0.00049805041635409
5.107428660267033e-05
0.00012534546840470284
0.0004467817780096084
0.000269952550297603
3.882762939610984e-06
0.0008299491018988192
0.00018438413098920137
0.00012798899842891842
2.0885026970063336e-05
0.00026031082961708307
0.00012170946138212457
1.3555200894188602e-05
3.564019061741419e-05
8.418938523391262e-05
5.451918696053326e-05
6.67059575789608e-05
6.946310895727947e-05
0.0002275795268

In [15]:
"""
    1. 过程监控（准确率accuracy）
    2. 可视化loss和accuracy曲线
    3. 早停设置（在测试集上，如果连续N=3轮没有性能提升，则停止训练）
    4. 模型的best.pt和last.pt保存
    5. 加载预训练模型 last.pt
    6. 模型加载、推理流程
"""

'\n    1. 过程监控（准确率accuracy）\n    2. 可视化loss和accuracy曲线\n    3. 早停设置（在测试集上，如果连续N=3轮没有性能提升，则停止训练）\n    4. 模型的best.pt和last.pt保存\n    5. 加载预训练模型 last.pt\n    6. 模型加载、推理流程\n'