In [2]:
import torch
from torch import nn
torch.xpu.is_available()

True

In [3]:
torch.device('cpu'), torch.device('xpu'), torch.device('xpu:1')

(device(type='cpu'), device(type='xpu'), device(type='xpu', index=1))

In [None]:
torch.xpu.device_count()   # 查询可用GPU的数量

1

In [5]:
def try_gpu(i=0):
    """如果存在,则返回gpu(i),否则返回cpu()"""
    if torch.xpu.device_count() >= i+1:
        return torch.device(f'xpu:{i}')
    return torch.device('cpu')

def try_all_gpus():
    """返回所有可用的GPU,如果没有GPU,则返回[cpu(),]"""
    devices = [torch.device(f'xpu:{i}') for i in range(torch.xpu.device_count())]
    return devices if devices else [torch.device('cpu')]

try_gpu(), try_gpu(10), try_all_gpus()

(device(type='xpu', index=0),
 device(type='cpu'),
 [device(type='xpu', index=0)])

In [None]:
x = torch.tensor([1, 2, 3])   # 默认张量是在cpu上创建的
x.device

device(type='cpu')

In [7]:
X = torch.ones(2, 3, device=try_gpu())
X

tensor([[1., 1., 1.],
        [1., 1., 1.]], device='xpu:0')

In [8]:
net = nn.Sequential(nn.Linear(3, 1))
net = net.to(device=try_gpu())

In [9]:
net(X)

tensor([[-0.2054],
        [-0.2054]], device='xpu:0', grad_fn=<AddmmBackward0>)

In [10]:
net[0].weight.data.device

device(type='xpu', index=0)

练习

进行大矩阵的乘法，查看CPU和GPU的速度差异。再尝试计算量很小的任务。

In [13]:
import time
# 计算量较大的任务
X = torch.randn((10000, 10000))
Y = X.xpu(0)
time_start = time.time()
Z = torch.mm(X, X)
time_end = time.time()
print(f'cpu time cost: {round((time_end-time_start)*1000, 2)} ms')

time_start = time.time()
Z = torch.mm(Y, Y)
time_end = time.time()
print(f'gpu time cost: {round((time_end-time_start)*1000, 2)} ms')

# 计算量较小的任务
X = torch.randn((100, 100))
Y = X.xpu(0)
time_start = time.time()
Z = torch.mm(X, X)
time_end = time.time()
print(f'cpu time cost: {round((time_end-time_start)*1000, 2)} ms')

time_start = time.time()
Z = torch.mm(Y, Y)
time_end = time.time()
print(f'gpu time cost: {round((time_end-time_start)*1000, 2)} ms')

cpu time cost: 2931.16 ms
gpu time cost: 41.17 ms
cpu time cost: 0.0 ms
gpu time cost: 0.0 ms


计算1000个100*100矩阵的矩阵乘法，并输出矩阵的弗罗贝尼乌斯范数 \
对比每次矩阵乘法，都输出范数结果的耗时 与 1000次矩阵乘法之后，只输出最终范数结果的耗时


In [21]:
time_start = time.time()
for i in range(1000):
    Y = torch.mm(Y, Y)
    Z = torch.norm(Y)
time_end = time.time()
print(f'gpu time cost: {round((time_end-time_start)*1000)} ms')

time_start = time.time()
for i in range(1000):
    Y = torch.mm(Y, Y)
Z = torch.norm(Y)
time_end = time.time()
print(f'gpu time cost: {round((time_end-time_start)*1000)} ms')


gpu time cost: 93 ms
gpu time cost: 34 ms
