In [1]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device being used:", device)
print("CUDA version:", torch.version.cuda)
print("GPU Name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")
print("Number of CUDA cores:", torch.cuda.get_device_properties(0).multi_processor_count if torch.cuda.is_available() else "N/A")


Device being used: cuda
CUDA version: 12.1
GPU Name: NVIDIA GeForce RTX 3050 6GB Laptop GPU
Number of CUDA cores: 20


In [2]:
import torch
import time

# Tensor size for testing
size = 5000

# CPU Test
a_cpu = torch.rand(size, size)
b_cpu = torch.rand(size, size)

start = time.time()
c_cpu = torch.matmul(a_cpu, b_cpu)
end = time.time()
print(f"CPU time: {end - start:.4f} seconds")

# GPU Test
if torch.cuda.is_available():
    a_gpu = a_cpu.to('cuda')
    b_gpu = b_cpu.to('cuda')
    
    torch.cuda.synchronize()  # Make sure GPU is ready
    start = time.time()
    c_gpu = torch.matmul(a_gpu, b_gpu)
    torch.cuda.synchronize()  # Wait for completion
    end = time.time()
    
    print(f"GPU time: {end - start:.4f} seconds")


CPU time: 0.5550 seconds
GPU time: 0.6867 seconds


In [3]:
if torch.cuda.is_available():
    gpu_properties = torch.cuda.get_device_properties(0)
    print("Total GPU memory (GB):", round(gpu_properties.total_memory / 1e9, 2))
    print("Current allocated memory (MB):", round(torch.cuda.memory_allocated(0)/1e6, 2))
    print("Current cached memory (MB):", round(torch.cuda.memory_reserved(0)/1e6, 2))


Total GPU memory (GB): 6.44
Current allocated memory (MB): 310.51
Current cached memory (MB): 322.96


In [4]:
import torchvision.models as models

model = models.resnet18().to(device)
data = torch.randn(64, 3, 224, 224).to(device)  # batch size 64
labels = torch.randint(0, 1000, (64,)).to(device)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# Timing one forward + backward pass
torch.cuda.synchronize()
start = time.time()

outputs = model(data)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()

torch.cuda.synchronize()
end = time.time()
print(f"One training step time: {end - start:.4f} seconds")


One training step time: 5.7226 seconds


In [5]:
for i in range(5):
    torch.cuda.synchronize()
    start = time.time()
    c_gpu = torch.matmul(a_gpu, b_gpu)
    torch.cuda.synchronize()
    end = time.time()
    print(f"GPU run {i+1}: {end - start:.4f} s")


GPU run 1: 0.5321 s
GPU run 2: 0.5317 s
GPU run 3: 0.5324 s
GPU run 4: 0.5308 s
GPU run 5: 0.5309 s


In [6]:
import torch, time

device = "cuda"

model = models.resnet18().to(device)
data = torch.randn(64,3,224,224).to(device)
labels = torch.randint(0,1000,(64,)).to(device)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# Warm-up
for _ in range(5):
    outputs = model(data)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

# Actual benchmark
torch.cuda.synchronize()
start = time.time()
for _ in range(10):
    outputs = model(data)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
torch.cuda.synchronize()
end = time.time()

print(f"Average training step time: {(end-start)/10:.4f} seconds")


Average training step time: 3.3249 seconds


In [7]:
import torch
import time
import torchvision.models as models

# ✅ Choose device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
print("GPU Name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU only")

# ------------------------------
# 1️⃣ Matrix Multiplication Test
# ------------------------------
size = 10000  # large enough for GPU to shine
a_cpu = torch.rand(size, size)
b_cpu = torch.rand(size, size)

# CPU
start = time.time()
c_cpu = torch.matmul(a_cpu, b_cpu)
end = time.time()
print(f"CPU matrix multiplication: {end-start:.4f} s")

# GPU
if torch.cuda.is_available():
    a_gpu = a_cpu.to('cuda')
    b_gpu = b_cpu.to('cuda')
    
    # Warm-up GPU (important!)
    for _ in range(3):
        torch.matmul(a_gpu, b_gpu)
    
    # Timed GPU
    torch.cuda.synchronize()
    start = time.time()
    for _ in range(5):
        c_gpu = torch.matmul(a_gpu, b_gpu)
    torch.cuda.synchronize()
    end = time.time()
    print(f"GPU matrix multiplication (avg of 5 runs): {(end-start)/5:.4f} s")

# ------------------------------
# 2️⃣ Mini Training Step Test
# ------------------------------
batch_size = 64
model = models.resnet18().to(device)
data = torch.randn(batch_size, 3, 224, 224).to(device)
labels = torch.randint(0, 1000, (batch_size,)).to(device)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# Warm-up GPU
for _ in range(5):
    optimizer.zero_grad()
    outputs = model(data)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

# Timed training steps
torch.cuda.synchronize()
start = time.time()
steps = 10
for _ in range(steps):
    optimizer.zero_grad()
    outputs = model(data)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
torch.cuda.synchronize()
end = time.time()

print(f"Average training step time on GPU: {(end-start)/steps:.4f} s")


Using device: cuda
GPU Name: NVIDIA GeForce RTX 3050 6GB Laptop GPU
CPU matrix multiplication: 12.6980 s
GPU matrix multiplication (avg of 5 runs): 5.0898 s
Average training step time on GPU: 3.3299 s
