In [21]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from tqdm import tqdm
import torchvision
import torchvision.transforms as transforms
import pickle

In [22]:
# 数据准备
transform = transforms.Compose([
    transforms.RandomRotation(degrees=20),            # 随机旋转图像，范围是 [-20度, 20度]
    transforms.RandomHorizontalFlip(p=0.5),           # 50%的概率水平翻转图像
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),  # 随机平移，最大平移10%
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))              # 标准化
])

train_dataset = datasets.MNIST(root='/public/group_data_2023/luohh/Class/01.OmicsAndAI/01.Materials/homework/data', train=True, download=True, transform=transform)
test_dataset = datasets.MNIST(root='/public/group_data_2023/luohh/Class/01.OmicsAndAI/01.Materials/homework/data', train=False, download=True, transform=transform)


In [27]:
# 定义Transformer模型
class TransformerModel(nn.Module):
    def __init__(self, dim=28, seq_length=28, num_classes=10, depth=2, heads=7, mlp_dim=256, dropout=0.1):
        super(TransformerModel, self).__init__()

        self.seq_length = seq_length  # 序列长度为 28（即 28 行）

        # 位置编码，不位置编码会怎么样，请大家尝试
        self.pos_embedding = nn.Parameter(torch.randn(1, seq_length, dim))

        # Transformer 编码器
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=dim,
            nhead=heads,
            dim_feedforward=mlp_dim,
            dropout=dropout
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=depth)

        # 最后的分类层
        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, x):
        # 将 28x28 的图片视为长度为 28 的序列
        x = x.view(x.size(0), 28, 28)  # [batch_size, 28, 28]
        
        # 加入位置编码（是否进行位置编码会对结果有什么影响？）
        x = x + self.pos_embedding

        # 通过 Transformer 进行序列建模
        x = self.transformer(x)  # [batch_size, 28, dim]

        # 将每个序列（即 28 个嵌入）在 dim=1 上取均值，得到了一个表示整个序列的全局特征向量。
        x = x.mean(dim=1)  

        # 分类
        x = self.mlp_head(x)  # [batch_size, num_classes]
        return x

In [24]:
# 训练函数
def train(model, loader, optimizer, criterion, device):
    model.train()
    running_loss = 0.0
    for batch_idx, (images, labels) in enumerate(loader):
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    return running_loss / len(loader)

# 测试函数
def test(model, loader, criterion, device):
    model.eval()
    correct = 0
    total = 0
    running_loss = 0.0
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)
            running_loss += loss.item()

            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    return running_loss / len(loader), accuracy

In [35]:
# 生成超参数组合并测试
# position_encoding = [True, False]
position_encoding = [True]
# n_heads_list = [2, 4, 7]
n_heads_list = [7]
# n_layers_list = [1, 2, 3]
n_layers_list = [3]
# lr_list = [0.01, 0.0001, 0.00001]
lr_list = [0.0001]
# epochs_list = list(range(1, 11))
epochs_list = [9]
batch_size_list = [16, 32, 64, 128]
# batch_size_list = [64]

In [36]:
for pos_enc in position_encoding:
    for n_heads in n_heads_list:
        for n_layers in n_layers_list:
            for lr in lr_list:
                for epochs in epochs_list:
                    for batch_size in batch_size_list:
                        print("### position_encoding = "+str(pos_enc)+", n_heads = "+str(n_heads)+", n_layers = "+str(n_layers)+
                              ", learning_rate = "+str(lr)+", epochs = "+str(epochs)+", batch_size = "+str(batch_size))
                        train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
                        test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
                        
                        # 初始化模型、损失函数和优化器
                        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
                        model = TransformerModel(dim=28, seq_length=28, num_classes=10, depth=n_layers, heads=n_heads, mlp_dim=256, dropout=0.1).to(device)
                        criterion = nn.CrossEntropyLoss()
                        optimizer = optim.Adam(model.parameters(), lr=lr)
                        
                        # 训练和测试模型
                        for epoch in range(epochs):
                            train_loss = train(model, train_loader, optimizer, criterion, device)
                            test_loss, accuracy = test(model, test_loader, criterion, device)
                        print(f'Epoch {epoch + 1}/{epochs} - Train Loss: {train_loss:.4f} - Test Loss: {test_loss:.4f} - Accuracy: {accuracy:.2f}%')

### position_encoding = True, n_heads = 7, n_layers = 3, learning_rate = 0.0001, epochs = 9, batch_size = 16
Epoch 9/9 - Train Loss: 0.7692 - Test Loss: 0.6986 - Accuracy: 75.97%
### position_encoding = True, n_heads = 7, n_layers = 3, learning_rate = 0.0001, epochs = 9, batch_size = 32
Epoch 9/9 - Train Loss: 0.8173 - Test Loss: 0.7053 - Accuracy: 75.57%
### position_encoding = True, n_heads = 7, n_layers = 3, learning_rate = 0.0001, epochs = 9, batch_size = 64
Epoch 9/9 - Train Loss: 0.9211 - Test Loss: 0.8111 - Accuracy: 72.43%
### position_encoding = True, n_heads = 7, n_layers = 3, learning_rate = 0.0001, epochs = 9, batch_size = 128
Epoch 9/9 - Train Loss: 0.9903 - Test Loss: 0.8748 - Accuracy: 70.75%
