In [12]:
import torch
from torch import nn
from torch.nn import functional as F
from dltool import train,dataprocess,test,plot,nlp

In [13]:
# Initialize the model parameters: weights are small Gaussian noise
# (standard normal samples multiplied by `scale`), biases start at zero.
scale = 0.01
# W1 and W2 are passed to F.conv2d as `weight` inside lenet.
W1 = scale * torch.randn(20, 1, 3, 3)
W2 = scale * torch.randn(50, 20, 5, 5)
# W3 and W4 are right-hand operands of torch.mm inside lenet, so their
# shapes are (in_features, out_features).
W3 = scale * torch.randn(800, 128)
W4 = scale * torch.randn(128, 10)
# One bias vector per layer, sized to that layer's output width.
b1, b2, b3, b4 = (torch.zeros(width) for width in (20, 50, 128, 10))
# lenet indexes this list positionally: weight, bias, weight, bias, ...
params = [W1, b1, W2, b2, W3, b3, W4, b4]

# Define the model
def lenet(X, params):
    """Forward pass of a LeNet-style network written with functional ops.

    Args:
        X: input batch handed to ``F.conv2d``.
        params: indexable sequence read at positions 0-7 as
            ``[W1, b1, W2, b2, W3, b3, W4, b4]``: two convolution
            weight/bias pairs followed by two dense weight/bias pairs.

    Returns:
        The output of the final dense layer (no softmax is applied), one row
        per example in ``X``.
    """
    def conv_relu_pool(inputs, weight, bias):
        # Convolution -> ReLU -> 2x2 average pooling with stride 2.
        activated = F.relu(F.conv2d(inputs, weight, bias))
        return F.avg_pool2d(activated, kernel_size=2, stride=2)

    features = conv_relu_pool(X, params[0], params[1])
    features = conv_relu_pool(features, params[2], params[3])
    # Keep the batch dimension and flatten everything else for the dense layers.
    flat = features.reshape(features.size(0), -1)
    hidden = F.relu(torch.mm(flat, params[4]) + params[5])
    return torch.mm(hidden, params[6]) + params[7]

# Cross-entropy loss. reduction="none" keeps one loss value per example, so
# the caller decides how to aggregate them (train_batch sums them per shard).
loss = nn.CrossEntropyLoss(reduction="none")

In [14]:
# Distribute the parameters to a device and attach gradients
def get_params(params, device):
    """Return an independent, gradient-tracking copy of ``params`` on ``device``.

    Args:
        params: iterable of tensors to replicate.
        device: device the copies are placed on.

    Returns:
        A list with one new tensor per input tensor. Every returned tensor is
        a leaf with ``requires_grad=True`` and owns its own storage; the input
        tensors are left unmodified.
    """
    # Tensor.to(device) hands back the very same tensor when it already lives
    # on `device`. Without a forced copy the "replica" would alias its source:
    # requires_grad_() would flip the flag on the caller's tensor, and two
    # replicas created for the same device would share one storage.
    # detach() makes the copy a leaf so that backward() populates its .grad.
    return [p.detach().to(device, copy=True).requires_grad_() for p in params]

In [15]:
# Copy the parameters to the device returned by train.try_gpu(1), then print
# the copied b1 (index 1 of the list) and its gradient. No backward pass has
# run at this point, so .grad is still unset.
new_params = get_params(params, device=train.try_gpu(1))
print('b1 权重:', new_params[1])
print('b1 梯度:', new_params[1].grad)

b1 权重: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       device='cuda:1', requires_grad=True)
b1 梯度: None


In [16]:
# Sum tensors across devices.
# Each entry of `data` may live on a different device; the function adds them
# all up and then writes the total back into every entry.
def allreduce(data):
    """Replace every tensor in ``data`` with the element-wise sum of all of them.

    The update is done in place: the tensor objects in ``data`` keep their
    identity and device, only their contents change. Nothing is returned.

    Args:
        data: sequence of same-shaped tensors, possibly on different devices.
    """
    if len(data) < 2:
        # With zero or one tensor there is nothing to combine.
        return
    root, others = data[0], data[1:]
    # Reduce: accumulate every other tensor into the first one, on its device.
    for tensor in others:
        root[:] += tensor.to(root.device)
    # Broadcast: copy the total back to each tensor's own device.
    for tensor in others:
        tensor[:] = root.to(tensor.device)

In [17]:
# Build one (1, 2) tensor per device obtained from train.try_gpu(i), filled
# with the value i + 1, and print them before and after the all-reduce.
data = [(i + 1) * torch.ones((1, 2), device=train.try_gpu(i)) for i in (1, 2)]
print('allreduce之前：\n', data[0], '\n', data[1])
allreduce(data)
print('allreduce之后：\n', data[0], '\n', data[1])

allreduce之前：
 tensor([[2., 2.]], device='cuda:1') 
 tensor([[3., 3.]], device='cuda:2')
allreduce之后：
 tensor([[5., 5.]], device='cuda:1') 
 tensor([[5., 5.]], device='cuda:2')


# 数据分发
将小批量数据均分到多个GPU

In [18]:
# Scatter demo: hand a 4x5 integer batch to nn.parallel.scatter together with
# two CUDA devices and print the input, the target devices and the result.
data = torch.arange(20).reshape(4, 5)
devices = [torch.device('cuda', index) for index in (1, 2)]
split = nn.parallel.scatter(data, devices)
print("input: ", data)
print("load into: ", devices)
print("output: ", split)

input:  tensor([[ 0,  1,  2,  3,  4],
        [ 5,  6,  7,  8,  9],
        [10, 11, 12, 13, 14],
        [15, 16, 17, 18, 19]])
load into:  [device(type='cuda', index=1), device(type='cuda', index=2)]
output:  (tensor([[0, 1, 2, 3, 4],
        [5, 6, 7, 8, 9]], device='cuda:1'), tensor([[10, 11, 12, 13, 14],
        [15, 16, 17, 18, 19]], device='cuda:2'))


In [19]:
def split_batch(X, y, devices):
    """Split ``X`` and ``y`` across multiple devices.

    Args:
        X: batch of examples; its first dimension is the batch size.
        y: labels, with the same first-dimension size as ``X``.
        devices: target devices handed to ``nn.parallel.scatter``.

    Returns:
        A pair ``(X_shards, y_shards)`` holding the results of scattering
        ``X`` and ``y`` respectively.

    Raises:
        AssertionError: if ``X`` and ``y`` disagree on the batch size.
    """
    assert X.shape[0] == y.shape[0]
    # The submodule is `nn.parallel`; the previous spelling `nn.paraller`
    # raised AttributeError on every call.
    return (nn.parallel.scatter(X, devices),
            nn.parallel.scatter(y, devices))

In [None]:
def _sgd_step(params, lr, batch_size):
    """Apply one in-place minibatch SGD update, then reset the gradients.

    Each parameter is moved by ``lr * grad / batch_size``. Its gradient is
    zeroed afterwards so the next ``backward()`` does not accumulate on top of
    stale values.
    """
    with torch.no_grad():
        for param in params:
            param -= lr * param.grad / batch_size
            param.grad.zero_()


def train_batch(X, y, device_params, devices, lr):
    """Run one data-parallel training step on a single minibatch.

    Args:
        X: minibatch of examples; ``X.shape[0]`` is used as the batch size.
        y: labels matching ``X``.
        device_params: one parameter list per device, in the same order as
            ``devices``.
        devices: devices the minibatch is split across.
        lr: learning rate.

    The function updates ``device_params`` in place and returns nothing.
    """
    X_shards, y_shards = split_batch(X, y, devices)
    # Compute the loss separately on each device: every shard is evaluated
    # with that device's own parameter copy and summed over the shard.
    ls = [loss(lenet(X_shard, device_W), y_shard).sum()
          for X_shard, y_shard, device_W in zip(
              X_shards, y_shards, device_params)]
    for l in ls:  # Backpropagation runs separately on each device
        l.backward()
    # Sum the gradients of each parameter over all devices and broadcast the
    # total back, so every replica sees the same gradient.
    with torch.no_grad():
        for i in range(len(device_params[0])):
            allreduce(
                [device_params[c][i].grad for c in range(len(devices))])
    # Update the parameters separately on each device. The gradients now cover
    # the whole minibatch, so normalise by the full batch size, not the shard size.
    # The update is implemented locally: the notebook never imports `d2l`, so
    # the previous `d2l.sgd(...)` call raised NameError.
    for param in device_params:
        _sgd_step(param, lr, X.shape[0])