In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, DistributedSampler
from torchvision import datasets, transforms
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
import os
import argparse

# 1. 初始化分布式环境
def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'  # 主节点地址（服务器本地）
    os.environ['MASTER_PORT'] = '12355'      # 端口（确保未被占用）
    dist.init_process_group("nccl", rank=rank, world_size=world_size)  # 初始化进程组

# 2. 清理分布式环境
def cleanup():
    """Tear down the default process group if one is currently active.

    Calling this when no group was ever initialised is a no-op, so it can be
    used unconditionally on error paths.
    """
    if dist.is_available() and dist.is_initialized():
        dist.destroy_process_group()

# 3. 定义模型（与之前一致，适合Fashion-MNIST或其他图像任务）
class Net(nn.Module):
    """LeNet-style CNN: two conv/avg-pool stages followed by three linear layers.

    The first convolution accepts one input channel and the final linear
    layer produces 10 values per sample. Both convolutions preserve spatial
    size (kernel 5, padding 2) and each pooling halves it, so the
    ``16*7*7`` flattened width corresponds to 28x28 inputs.
    """

    def __init__(self):
        super().__init__()
        # Feature extractor: conv -> sigmoid -> 2x2 average pooling, twice.
        conv_stages = [
            nn.Conv2d(1, 6, kernel_size=5, padding=2),
            nn.Sigmoid(),
            nn.AvgPool2d(kernel_size=2, stride=2),
            nn.Conv2d(6, 16, kernel_size=5, padding=2),
            nn.Sigmoid(),
            nn.AvgPool2d(kernel_size=2, stride=2),
        ]
        # Classifier head operating on the flattened feature maps.
        classifier_head = [
            nn.Flatten(),
            nn.Linear(16 * 7 * 7, 120),
            nn.Sigmoid(),
            nn.Linear(120, 84),
            nn.Sigmoid(),
            nn.Linear(84, 10),
        ]
        # One flat Sequential, so layer indices run 0..11 in listing order.
        self.seq = nn.Sequential(*conv_stages, *classifier_head)

    def forward(self, x):
        """Pass ``x`` through the whole layer stack and return the result."""
        return self.seq(x)

# 4. 训练函数（每个GPU进程独立执行）
def train(rank, world_size, args):
    """Per-process training entry point; each process drives one GPU.

    Args:
        rank: Index of this process. It is also used as the CUDA device index.
        world_size: Total number of participating processes.
        args: Namespace providing ``data_root``, ``batch_size``,
            ``num_workers`` and ``epochs``.

    The process group created by ``setup`` is always torn down through
    ``cleanup``, including when training raises.
    """
    setup(rank, world_size)
    try:
        # Bind this process to its own GPU.
        torch.cuda.set_device(rank)
        device = torch.device(f"cuda:{rank}")

        transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.5,), (0.5,))
        ])

        def build_dataset():
            # Swap in your own (larger) dataset here, e.g. ImageFolder or a
            # custom Dataset.
            return datasets.FashionMNIST(
                root=args.data_root,
                train=True,
                download=True,
                transform=transform
            )

        # Every rank asks for download=True on the same directory. Let rank 0
        # go first and hold the others at a barrier, so concurrent processes
        # never write the same files at the same time.
        if rank == 0:
            dataset = build_dataset()
        dist.barrier()
        if rank != 0:
            dataset = build_dataset()

        # The sampler hands each process its own shard of the dataset.
        sampler = DistributedSampler(dataset, shuffle=True)
        dataloader = DataLoader(
            dataset,
            batch_size=args.batch_size,  # per-process batch size
            sampler=sampler,
            num_workers=args.num_workers,
            pin_memory=True  # pinned host memory pairs with non_blocking copies below
        )

        # Build the model on this GPU and wrap it for gradient synchronisation.
        model = Net().to(device)
        ddp_model = DDP(model, device_ids=[rank])

        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(ddp_model.parameters(), lr=1e-3)

        ddp_model.train()
        for epoch in range(args.epochs):
            # Re-seed the sampler so each epoch sees a different shuffle.
            sampler.set_epoch(epoch)
            for i, (images, labels) in enumerate(dataloader):
                images = images.to(device, non_blocking=True)
                labels = labels.to(device, non_blocking=True)

                # Forward pass.
                outputs = ddp_model(images)
                loss = criterion(outputs, labels)

                # Backward pass and parameter update.
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # Log from rank 0 only to avoid duplicated output; the value
                # is this process's loss on its own batch.
                if rank == 0 and i % 100 == 0:
                    print(f"Epoch {epoch}, Batch {i}, Loss: {loss.item():.4f}")
    finally:
        # Release the process group even if something above failed.
        cleanup()

# 5. 主函数：启动多进程（每个GPU一个进程）
def main():
    """Parse command-line options and launch one training process per GPU.

    Raises:
        RuntimeError: If fewer CUDA devices are visible than ``--world_size``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_root', type=str, default='/path/to/large_dataset')
    parser.add_argument('--batch_size', type=int, default=128)  # per-GPU batch size
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--num_workers', type=int, default=8)  # DataLoader worker processes
    parser.add_argument('--world_size', type=int, default=2)  # number of GPU processes
    # parse_known_args tolerates unrelated argv entries. When this runs inside
    # a notebook kernel, sys.argv carries the kernel's own options, which
    # parse_args() would reject by exiting.
    args, _unknown = parser.parse_known_args()

    world_size = args.world_size
    visible_gpus = torch.cuda.device_count()
    if visible_gpus < world_size:
        # Fail here with a clear message rather than inside each child process.
        raise RuntimeError(
            f"world_size={world_size} but only {visible_gpus} CUDA device(s) are visible"
        )

    # One spawned process per GPU; each receives its rank as the first argument.
    mp.spawn(train, args=(world_size, args), nprocs=world_size, join=True)

if __name__ == '__main__':
    # Default to GPUs 0 and 1, but keep any CUDA_VISIBLE_DEVICES value that was
    # already exported (e.g. `export CUDA_VISIBLE_DEVICES=2,3`) instead of
    # silently overwriting it.
    os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0,1")
    main()


**chapter7 现代卷积神经网络**

In [None]:
import torch
import torch.nn as nn
def _conv_relu(in_channels, out_channels, **conv_kwargs):
    """Return a (Conv2d, ReLU) pair to be splatted into a Sequential."""
    return nn.Conv2d(in_channels, out_channels, **conv_kwargs), nn.ReLU()


def _fc_relu_dropout(in_features, out_features):
    """Return a (Linear, ReLU, Dropout(0.5)) triple for the classifier part."""
    return nn.Linear(in_features, out_features), nn.ReLU(), nn.Dropout(p=0.5)


# AlexNet-style stack for single-channel input and 10 outputs. It stays one
# flat Sequential so every layer can be stepped through individually below.
AlexNet = nn.Sequential(
    *_conv_relu(1, 96, kernel_size=11, stride=4, padding=1),
    nn.MaxPool2d(kernel_size=3, stride=2),
    *_conv_relu(96, 256, kernel_size=5, padding=2),
    nn.MaxPool2d(kernel_size=3, stride=2),
    *_conv_relu(256, 384, kernel_size=3, padding=1),
    *_conv_relu(384, 384, kernel_size=3, padding=1),
    *_conv_relu(384, 256, kernel_size=3, padding=1),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nn.Flatten(),
    *_fc_relu_dropout(9216, 4096),
    *_fc_relu_dropout(4096, 4096),
    nn.Linear(4096, 10),
)

# Shape probe: feed one random 244x244 image through the layers one at a time
# and report the tensor shape after each of them.
x = torch.randn((1, 1, 244, 244))
print(x.shape)
for layer in AlexNet.children():
    x = layer(x)
    print(type(layer).__name__, "output shape:\t", x.shape)
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# Directory holding the already-downloaded FashionMNIST files (download=False
# below, so nothing is fetched).
DATA_ROOT = "/home/pumengyu/2025_9python/download"
# AlexNet's first Linear layer expects 9216 = 256*6*6 features, which is what
# the 244x244 shape probe produces. Raw FashionMNIST images are 28x28 and are
# too small for the pooling stack, so resize every image to the probe size.
IMAGE_SIZE = 244
BATCH_SIZE = 64

transform = transforms.Compose([
    transforms.Resize(IMAGE_SIZE),
    transforms.ToTensor(),
])
train_dataset = datasets.FashionMNIST(root=DATA_ROOT,
                                      train=True,
                                      download=False,
                                      transform=transform)
test_dataset = datasets.FashionMNIST(root=DATA_ROOT,
                                     train=False,
                                     download=False,
                                     transform=transform)
train_iter = DataLoader(dataset=train_dataset, shuffle=True, batch_size=BATCH_SIZE)
# Accuracy does not depend on sample order, so the test set is not shuffled.
test_iter = DataLoader(dataset=test_dataset, shuffle=False, batch_size=BATCH_SIZE)

# Use the GPU when one is available. The model is moved before the optimizer
# is built, so the optimizer tracks the parameters that are actually trained.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
AlexNet.to(device)

loss = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(AlexNet.parameters(), lr=1e-3)
epoch = 1  # number of passes over the training set

AlexNet.train()
for i in range(epoch):
    for j, (images, labels) in enumerate(train_iter):
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = AlexNet(images)
        loss1 = loss(outputs, labels)
        loss1.backward()
        optimizer.step()

AlexNet.eval()
accuracy = 0  # number of correctly classified test samples
index = 0     # number of test samples seen
with torch.no_grad():  # no gradients needed for evaluation
    for j, (images, labels) in enumerate(test_iter):
        images, labels = images.to(device), labels.to(device)
        outputs = AlexNet(images).argmax(dim=1)
        index += labels.size(0)
        # .item() keeps the counter a plain Python int, so the ratio below
        # prints as a number rather than a tensor.
        accuracy += (outputs == labels).sum().item()
print("最后的准确率", accuracy / index)


torch.Size([1, 1, 244, 244])
Conv2d output shape:	 torch.Size([1, 96, 59, 59])
ReLU output shape:	 torch.Size([1, 96, 59, 59])
MaxPool2d output shape:	 torch.Size([1, 96, 29, 29])
Conv2d output shape:	 torch.Size([1, 256, 29, 29])
ReLU output shape:	 torch.Size([1, 256, 29, 29])
MaxPool2d output shape:	 torch.Size([1, 256, 14, 14])
Conv2d output shape:	 torch.Size([1, 384, 14, 14])
ReLU output shape:	 torch.Size([1, 384, 14, 14])
Conv2d output shape:	 torch.Size([1, 384, 14, 14])
ReLU output shape:	 torch.Size([1, 384, 14, 14])
Conv2d output shape:	 torch.Size([1, 256, 14, 14])
ReLU output shape:	 torch.Size([1, 256, 14, 14])
MaxPool2d output shape:	 torch.Size([1, 256, 6, 6])
Flatten output shape:	 torch.Size([1, 9216])
Linear output shape:	 torch.Size([1, 4096])
ReLU output shape:	 torch.Size([1, 4096])
Dropout output shape:	 torch.Size([1, 4096])
Linear output shape:	 torch.Size([1, 4096])
ReLU output shape:	 torch.Size([1, 4096])
Dropout output shape:	 torch.Size([1, 4096])
Linear 

In [24]:
def vgg_block(in_channels=1,out_channels=3,num_convs=2):
    """Build a VGG-style block: ``num_convs`` 3x3 conv+ReLU pairs, then 2x2 max-pool.

    Args:
        in_channels: Channels of the tensor entering the block.
        out_channels: Channels produced by every convolution in the block.
        num_convs: How many conv+ReLU pairs precede the pooling layer.

    Returns:
        An ``nn.Sequential`` holding the layers in order.
    """
    # Only the first convolution sees `in_channels`; every later one maps
    # out_channels -> out_channels.
    channel_pairs = [
        (in_channels if position == 0 else out_channels, out_channels)
        for position in range(num_convs)
    ]
    stages = []
    for conv_in, conv_out in channel_pairs:
        stages += [
            nn.Conv2d(in_channels=conv_in, out_channels=conv_out, kernel_size=3, padding=1),
            nn.ReLU(),
        ]
    return nn.Sequential(*stages, nn.MaxPool2d(kernel_size=2, stride=2))


print(vgg_block(in_channels=1, out_channels=3, num_convs=2))


Sequential(
  (0): Conv2d(1, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): ReLU()
  (2): Conv2d(3, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (3): ReLU()
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
)
