In [1]:
import torch
from torch import nn
from d2l import torch as d2l

In [2]:
def cpu():  # @save
    """Get the CPU device."""
    return torch.device('cpu')


def gpu(i=0):  # @save
    """Get a GPU device."""
    return torch.device(f'cuda:{i}')


cpu(), gpu(), gpu(1)

(device(type='cpu'),
 device(type='cuda', index=0),
 device(type='cuda', index=1))

In [3]:
def num_gpus():  # @save
    """Get the number of available GPUs."""
    return torch.cuda.device_count()


num_gpus()

1

In [4]:
def try_gpu(i=0):  # @save
    """Return gpu(i) if exists, otherwise return cpu()."""
    if num_gpus() >= i + 1:
        return gpu(i)
    return cpu()


def try_all_gpus():  # @save
    """Return all available GPUs, or [cpu(),] if no GPU exists."""
    return [gpu(i) for i in range(num_gpus())]


try_gpu(), try_gpu(10), try_all_gpus()

(device(type='cuda', index=0),
 device(type='cpu'),
 [device(type='cuda', index=0)])

In [5]:
x = torch.tensor([1, 2, 3])
x.device

device(type='cpu')

In [6]:
X = torch.ones(2, 3, device=try_gpu())
X

tensor([[1., 1., 1.],
        [1., 1., 1.]], device='cuda:0')

In [11]:
Y = torch.rand(2, 3, device=try_gpu(0)) # we only have one GPU
Y

tensor([[0.7000, 0.2187, 0.8960],
        [0.2203, 0.9702, 0.0400]], device='cuda:0')

In [12]:
Z = X.cuda(0)
print(X)
print(Z)

tensor([[1., 1., 1.],
        [1., 1., 1.]], device='cuda:0')
tensor([[1., 1., 1.],
        [1., 1., 1.]], device='cuda:0')


In [13]:
Y + Z

tensor([[1.7000, 1.2187, 1.8960],
        [1.2203, 1.9702, 1.0400]], device='cuda:0')

In [14]:
Z.cuda(0) is Z

True

In [15]:
net = nn.Sequential(nn.LazyLinear(1))
net = net.to(device=try_gpu())



In [16]:
net(X)

tensor([[0.9593],
        [0.9593]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [17]:
net[0].weight.data.device

device(type='cuda', index=0)

In [18]:
@d2l.add_to_class(d2l.Trainer)  # @save
def __init__(self, max_epochs, num_gpus=0, gradient_clip_val=0):
    self.save_hyperparameters()
    self.gpus = [d2l.gpu(i) for i in range(min(num_gpus, d2l.num_gpus()))]


@d2l.add_to_class(d2l.Trainer)  # @save
def prepare_batch(self, batch):
    if self.gpus:
        batch = [a.to(self.gpus[0]) for a in batch]
    return batch


@d2l.add_to_class(d2l.Trainer)  # @save
def prepare_model(self, model):
    model.trainer = self
    model.board.xlim = [0, self.max_epochs]
    if self.gpus:
        model.to(self.gpus[0])
    self.model = model

In [19]:
# Here's an example to demonstrate the difference in speed between CPU and GPU for a large matrix multiplication task and a small matrix multiplication task:
import torch
import time


def measure_time(matrix_size, device):
    A = torch.randn(matrix_size, matrix_size, device=device)
    B = torch.randn(matrix_size, matrix_size, device=device)

    start_time = time.time()
    C = torch.matmul(A, B)
    torch.cuda.synchronize()  # Ensure synchronization for GPU
    end_time = time.time()

    return end_time - start_time


large_matrix_size = 5000
small_matrix_size = 50

cpu_device = torch.device("cpu")
gpu_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

print("Large matrix multiplication:")
print("CPU:", measure_time(large_matrix_size, cpu_device), "seconds")
print("GPU:", measure_time(large_matrix_size, gpu_device), "seconds")

print("Small matrix multiplication:")
print("CPU:", measure_time(small_matrix_size, cpu_device), "seconds")
print("GPU:", measure_time(small_matrix_size, gpu_device), "seconds")

Large matrix multiplication:
CPU: 1.1213064193725586 seconds
GPU: 0.14332032203674316 seconds
Small matrix multiplication:
CPU: 0.00016021728515625 seconds
GPU: 0.00010704994201660156 seconds


Here's an example to demonstrate the difference in speed between CPU and GPU for a large matrix multiplication task and a small matrix multiplication task:

```python
import torch
import time

def measure_time(matrix_size, device):
    A = torch.randn(matrix_size, matrix_size, device=device)
    B = torch.randn(matrix_size, matrix_size, device=device)

    start_time = time.time()
    C = torch.matmul(A, B)
    torch.cuda.synchronize()  # Ensure synchronization for GPU
    end_time = time.time()

    return end_time - start_time

large_matrix_size = 5000
small_matrix_size = 50

cpu_device = torch.device("cpu")
gpu_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

print("Large matrix multiplication:")
print("CPU:", measure_time(large_matrix_size, cpu_device), "seconds")
print("GPU:", measure_time(large_matrix_size, gpu_device), "seconds")

print("Small matrix multiplication:")
print("CPU:", measure_time(small_matrix_size, cpu_device), "seconds")
print("GPU:", measure_time(small_matrix_size, gpu_device), "seconds")
```

For a large matrix multiplication task, you should observe a significant speedup on the GPU compared to the CPU. However, for a small matrix multiplication task, the difference in speed might be less noticeable or even slower on the GPU due to the overhead of GPU computation and data transfer.

To read and write model parameters on the GPU, you can use the `to()` method to move the model and its parameters to the GPU, and the `state_dict()` and `load_state_dict()` methods to save and load the parameters. Here's an example:

```python
# Move the model to the GPU
model = model.to(gpu_device)

# Save the model parameters
torch.save(model.state_dict(), "model_parameters.pt")

# Load the model parameters
loaded_parameters = torch.load("model_parameters.pt", map_location=gpu_device)
model.load_state_dict(loaded_parameters)
```

Now, let's measure the time it takes to compute 1000 matrix–matrix multiplications and log the Frobenius norm of the output matrix one result at a time, compared to keeping a log on the GPU and transferring only the final result:

```python
num_matrices = 1000
matrix_size = 500

A = torch.randn(num_matrices, matrix_size, matrix_size, device=gpu_device)
B = torch.randn(num_matrices, matrix_size, matrix_size, device=gpu_device)
C = torch.empty(num_matrices, matrix_size, matrix_size, device=gpu_device)

# Compute and log the Frobenius norm one result at a time
start_time = time.time()
for i in range(num_matrices):
    C[i] = torch.matmul(A[i], B[i])
    frobenius_norm = torch.norm(C[i], p="fro").cpu().item()
    # Log the Frobenius norm (here we just print it)
    print(frobenius_norm)
end_time = time.time()
print("One result at a time:", end_time - start_time, "seconds")

# Compute and log the Frobenius norm only for the final result
start_time = time.time()
C = torch.matmul(A, B)
frobenius_norms = torch.norm(C, p="fro", dim=(1, 2)).cpu()
end_time = time.time()
print("Final result only:", end_time - start_time, "seconds")
```

You should observe that keeping a log on the GPU and transferring only the final result is faster than logging the Frobenius norm one result at a time.

Finally, let's measure how much time it takes to perform two matrix–matrix multiplications on two GPUs at the same time, compared to computing them in sequence on one GPU:

```python
if torch.cuda.device_count() >= 2:
    # Two GPUs available
    gpu1_device = torch.device("cuda:0")
    gpu2_device = torch.device("cuda:1")

    A1 = torch.randn(matrix_size, matrix_size, device=gpu1_device)
    B1 = torch.randn(matrix_size, matrix_size, device=gpu1_device)
    A2 = torch.randn(matrix_size, matrix_size, device=gpu2_device)
    B2 = torch.randn(matrix_size, matrix_size, device=gpu2_device)

    start_time = time.time()
    C1 = torch.matmul(A1, B1)
    C2 = torch.matmul(A2, B2)
    torch.cuda.synchronize()  # Ensure synchronization for both GPUs
    end_time = time.time()
    print("Two GPUs:", end_time - start_time, "seconds")

A1 = A1.to(gpu_device)
B1 = B1.to(gpu_device)
A2 = A2.to(g

抱歉，我的回答被意外截断。请允许我继续上面的答案。

接下来，我们将测量在两个GPU上同时执行两个矩阵-矩阵乘法所需的时间，并将其与在一个GPU上顺序计算它们所需的时间进行比较：

```python
if torch.cuda.device_count() >= 2:
    # Two GPUs available
    gpu1_device = torch.device("cuda:0")
    gpu2_device = torch.device("cuda:1")

    A1 = torch.randn(matrix_size, matrix_size, device=gpu1_device)
    B1 = torch.randn(matrix_size, matrix_size, device=gpu1_device)
    A2 = torch.randn(matrix_size, matrix_size, device=gpu2_device)
    B2 = torch.randn(matrix_size, matrix_size, device=gpu2_device)

    start_time = time.time()
    C1 = torch.matmul(A1, B1)
    C2 = torch.matmul(A2, B2)
    torch.cuda.synchronize()  # Ensure synchronization for both GPUs
    end_time = time.time()
    print("Two GPUs:", end_time - start_time, "seconds")

# Move all tensors to the same GPU device
A1 = A1.to(gpu_device)
B1 = B1.to(gpu_device)
A2 = A2.to(gpu_device)
B2 = B2.to(gpu_device)

start_time = time.time()
C1 = torch.matmul(A1, B1)
C2 = torch.matmul(A2, B2)
torch.cuda.synchronize()  # Ensure synchronization for the single GPU
end_time = time.time()
print("One GPU:", end_time - start_time, "seconds")
```

当在两个GPU上同时执行矩阵-矩阵乘法时，您应该看到几乎线性的扩展，即所需时间大约是在一个GPU上顺序计算所需时间的一半。这是因为两个GPU可以并行处理计算任务，从而显著减少总体计算时间。

In [20]:
# Here's an example to demonstrate the difference in speed between CPU and GPU for a large matrix multiplication task and a small matrix multiplication task:
import torch
import time

num_matrices = 1000
matrix_size = 500

A = torch.randn(num_matrices, matrix_size, matrix_size, device=gpu_device)
B = torch.randn(num_matrices, matrix_size, matrix_size, device=gpu_device)
C = torch.empty(num_matrices, matrix_size, matrix_size, device=gpu_device)

# Compute and log the Frobenius norm one result at a time
start_time = time.time()
for i in range(num_matrices):
    C[i] = torch.matmul(A[i], B[i])
    frobenius_norm = torch.norm(C[i], p="fro").cpu().item()
    # Log the Frobenius norm (here we just print it)
    print(frobenius_norm)
end_time = time.time()
print("One result at a time:", end_time - start_time, "seconds")

# Compute and log the Frobenius norm only for the final result
start_time = time.time()
C = torch.matmul(A, B)
frobenius_norms = torch.norm(C, p="fro", dim=(1, 2)).cpu()
end_time = time.time()
print("Final result only:", end_time - start_time, "seconds")

11237.4482421875
11218.3046875
11169.251953125
11191.328125
11170.3046875
11165.5322265625
11158.1552734375
11169.115234375
11207.3603515625
11192.583984375
11187.5126953125
11166.400390625
11162.6376953125
11148.673828125
11171.458984375
11156.2685546875
11155.4248046875
11148.712890625
11142.8369140625
11206.29296875
11170.2490234375
11186.9609375
11188.642578125
11172.740234375
11222.0302734375
11143.40234375
11186.3515625
11173.91015625
11210.4951171875
11179.443359375
11212.7236328125
11178.3515625
11164.1923828125
11174.9697265625
11164.837890625
11177.880859375
11176.1591796875
11175.2705078125
11179.2724609375
11175.5673828125
11149.845703125
11113.1220703125
11207.4951171875
11161.19921875
11228.517578125
11171.904296875
11185.544921875
11211.798828125
11190.5634765625
11184.025390625
11177.912109375
11165.890625
11152.3896484375
11179.6689453125
11204.3876953125
11187.2958984375
11210.3984375
11253.6767578125
11182.8818359375
11182.7451171875
11199.24609375
11191.4814453125
1