In [1]:
import torch
import time

# Check for CUDA

In [2]:
# Report whether PyTorch can see a CUDA device; when it can, also print
# the number of visible devices and the name of device 0.
if not torch.cuda.is_available():
    print("CUDA is not available.")
else:
    print("CUDA is available!")
    print("Number of GPUs available:", torch.cuda.device_count())
    print("GPU:", torch.cuda.get_device_name(0))

CUDA is available!
Number of GPUs available: 1
GPU: NVIDIA A100-PCIE-40GB


In [3]:
# Choose the compute device: CUDA when PyTorch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"using device: {device}")

using device: cuda


In [4]:
# Draw two 1000x1000 random-normal tensors directly on the selected device
tensor1 = torch.randn((1000, 1000), device=device)
tensor2 = torch.randn((1000, 1000), device=device)

# Element-wise sum; it executes on whichever device holds the operands
result = torch.add(tensor1, tensor2)

print(result)

tensor([[-2.3412, -2.9746,  1.6327,  ...,  1.7762,  1.1176, -0.6660],
        [-0.7098, -1.0529,  0.7931,  ...,  1.2098, -0.3568, -0.5596],
        [ 3.3406, -1.2258, -2.7458,  ..., -0.6209, -0.3127,  1.0155],
        ...,
        [-0.8189,  2.3436,  2.2477,  ...,  1.2224, -1.5397,  0.0579],
        [-0.2297, -2.2453, -0.7732,  ..., -0.3020,  1.9078,  1.2613],
        [-0.6079,  0.5231, -0.5131,  ..., -0.5740,  3.3171,  1.2996]],
       device='cuda:0')


# Compare speed of CPU vs GPU

In [5]:
# Build the two square random matrices, kept on the CPU, that serve as the
# operands for the CPU-vs-GPU timing comparison.
matrix_size = 10000
x, y = (
    torch.randn(matrix_size, matrix_size, device="cpu")
    for _ in range(2)
)

In [6]:
%%timeit -n 10
# CPU baseline: time the element-wise division of the two CPU matrices.
# `-n 10` makes timeit execute this body 10 times per timing run.
result = torch.div(x,y)

145 ms ± 757 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [7]:
# Copy both benchmark operands to the CUDA device, then wait for all queued
# CUDA work to finish before the next cell runs.
x_gpu, y_gpu = x.to("cuda"), y.to("cuda")
torch.cuda.synchronize()

In [8]:
%%timeit -n 10
result_gpu = torch.div(x_gpu,y_gpu)

The slowest run took 30.48 times longer than the fastest. This could mean that an intermediate result is being cached.
53.4 µs ± 101 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


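### The GPU appears to be 3 orders of magnitude faster on this operation — but treat that number with caution

CUDA kernels are launched asynchronously, so unless the timed cell calls `torch.cuda.synchronize()`, `%%timeit` mostly measures how long it takes to *queue* the division, not to compute it. The 53.4 µs figure above (together with the warning that the slowest run took ~30× longer than the fastest) is consistent with that, so the real speedup is likely smaller than the raw timings suggest.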

![image](./time-scale.png)