In [1]:
import torch
from torchvision import models, transforms
from PIL import Image
import requests
import time
from io import BytesIO

In [2]:
# URL of the image
url = "https://upload.wikimedia.org/wikipedia/commons/9/9a/Pug_600.jpg"

# Add headers to mimic a browser request
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

# Send the request with headers
response = requests.get(url, headers=headers, stream=True)

# Check for a successful response
if response.status_code == 200:
    # Load the image into PIL directly without saving to disk
    img = Image.open(BytesIO(response.content)).convert("RGB")
else:
    print(f"Failed to download image. Status code: {response.status_code}")
    
# Define a transform for the input image
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

In [3]:
# Load a pretrained ResNet50 model
model = models.resnet50(weights='ResNet50_Weights.DEFAULT').cuda()
input_tensor = preprocess(img).unsqueeze(0).cuda()

In [4]:
model.eval()
# Perform a single prediction
with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,
    ],
    on_trace_ready=torch.profiler.tensorboard_trace_handler('./log'),
    record_shapes=True,
    with_stack=True
) as prof:
    output = model(input_tensor)
    _, predicted_class = output.max(1)

# Load the class labels
LABELS_URL = "https://raw.githubusercontent.com/anishathalye/imagenet-simple-labels/master/imagenet-simple-labels.json"
labels = requests.get(LABELS_URL).json()

# Get the class label
predicted_label = labels[predicted_class.item()]
print(f"Predicted class label: {predicted_label}")
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))

Predicted class label: pug
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                           aten::conv2d         0.15%     241.807us        68.81%     111.482ms       2.103ms       0.000us         0.00%       4.091ms      77.190us            53  
                                      aten::convolution         0.40%     643.756us        68.66%     111.240ms       2.099ms       0.000us         0.00%       4.091ms      77.190u

In [7]:
# Measure inference time
start_time = time.time()
trials = 100
for _ in range(trials):
    label = model(input_tensor)
    assert label.argmax() == predicted_class
end_time = time.time()

avg_time_inference_gpu = (end_time - start_time) / trials
print(f"Average inference time on GPU: {avg_time_inference_gpu:.4f} seconds")

Average inference time on GPU: 0.0090 seconds


In [8]:
# Set model to train mode
model.train()

# Create a dummy label (for ImageNet, valid class indices are 0 to 999)
dummy_label = torch.tensor([0], dtype=torch.long, device='cuda')

# Define a loss function and an optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

# Profile a single training iteration
with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,
    ],
    on_trace_ready=torch.profiler.tensorboard_trace_handler('./log_train'),
    record_shapes=True,
    with_stack=True
) as prof:
    # Forward pass
    output = model(input_tensor)
    loss = criterion(output, dummy_label)

    # Backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print("Single training iteration completed.")
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))


Single training iteration completed.
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                       cudaLaunchKernel        57.57%      73.468ms        57.57%      73.468ms     128.666us       0.000us         0.00%       0.000us       0.000us           571  
autograd::engine::evaluate_function: ConvolutionBack...         0.73%     932.273us        21.59%      27.555ms     519.914us       0.000us         0.00%       8.765ms   

In [9]:
start_time_gpu = time.time()
trials = 100
for _ in range(trials):
    output = model(input_tensor)
    loss = criterion(output, dummy_label)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

end_time_gpu = time.time()
avg_time_train_gpu = (end_time_gpu - start_time_gpu) / trials
print(f"Average training time on GPU: {avg_time_train_gpu:.4f} seconds")

Average training time on GPU: 0.0470 seconds


In [11]:
# Move model and tensor to CPU
model_cpu = models.resnet50(weights='ResNet50_Weights.DEFAULT').cpu()
model_cpu.eval()
input_tensor_cpu = preprocess(img).unsqueeze(0).cpu()

with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,
    ],
    on_trace_ready=torch.profiler.tensorboard_trace_handler('./log'),
    record_shapes=True,
    with_stack=True
) as prof:
    output_cpu = model_cpu(input_tensor_cpu)
    _, predicted_class_cpu = output_cpu.max(1)

predicted_label_cpu = labels[predicted_class_cpu.item()]
print(f"Predicted class label on CPU: {predicted_label_cpu}")
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))

Predicted class label on CPU: pug
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                     aten::conv2d         0.36%     307.513us        83.41%      71.167ms       1.343ms            53  
                aten::convolution         0.96%     819.516us        83.05%      70.860ms       1.337ms            53  
               aten::_convolution         0.65%     551.267us        82.09%      70.040ms       1.322ms            53  
         aten::mkldnn_convolution        80.51%      68.691ms        81.44%      69.489ms       1.311ms            53  
                 aten::batch_norm         0.25%     211.849us         7.76%       6.617ms     124.852us            53  
     a

In [12]:
# Measure inference time on CPU
start_time_cpu = time.time()
trials = 100
for _ in range(trials):
    label = model_cpu(input_tensor_cpu)
    assert label.argmax() == predicted_class  # Ensure same result as GPU
end_time_cpu = time.time()
avg_time_inference_cpu = (end_time_cpu - start_time_cpu) / trials
print(f"Average inference time on CPU: {avg_time_inference_cpu:.4f} seconds")


Average inference time on CPU: 0.0842 seconds


In [14]:
# Set model to train mode
model_cpu.train()

# Create a dummy label (for ImageNet, valid class indices are 0 to 999)
dummy_label = torch.tensor([0], dtype=torch.long, device='cpu')

# Define a loss function and an optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model_cpu.parameters(), lr=1e-3)

# Profile a single training iteration
with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,
    ],
    on_trace_ready=torch.profiler.tensorboard_trace_handler('./log_train'),
    record_shapes=True,
    with_stack=True
) as prof:
    # Forward pass
    output = model_cpu(input_tensor_cpu)
    loss = criterion(output, dummy_label)

    # Backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print("Single training iteration completed.")
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))


Single training iteration completed.
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
autograd::engine::evaluate_function: ConvolutionBack...         0.38%       1.738ms        54.03%     246.707ms       4.655ms            53  
                                   ConvolutionBackward0         0.16%     742.912us        52.80%     241.095ms       4.549ms            53  
                             aten::convolution_backward        52.21%     238.395ms        52.64%     240.352ms       4.535ms            53  
                                           aten::conv2d         0.11%     497.033us        28.72%     131.143ms

In [15]:
start_time_cpu = time.time()
trials = 100
for _ in range(trials):
    output = model_cpu(input_tensor_cpu)
    loss = criterion(output, dummy_label)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

end_time_cpu = time.time()
avg_time_train_cpu = (end_time_cpu - start_time_cpu) / trials
print(f"Average training time on CPU: {avg_time_train_cpu:.4f} seconds")

Average training time on CPU: 0.2967 seconds


In [16]:
# Speedup
speedup = avg_time_train_cpu / avg_time_train_gpu
print(f"Train Speedup: {speedup:.2f}x")
speedup = avg_time_inference_cpu / avg_time_inference_gpu
print(f"Inference Speedup: {speedup:.2f}x")

Train Speedup: 6.32x
Inference Speedup: 9.37x
