In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision.models import alexnet
from torch.profiler import profile, record_function, ProfilerActivity

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),  
    transforms.Grayscale(3),        
    transforms.ToTensor(),
    transforms.Normalize((0.1307,) * 3, (0.3081,) * 3) 
])

In [4]:
train_dataset = torchvision.datasets.MNIST(root='./data', train=True, download=True, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=4)

In [5]:
model = alexnet(weights=None) 
model.classifier[6] = nn.Linear(model.classifier[6].in_features, 10)  # Modify last layer for 10 classes (MNIST)
model = model.to(device)

In [6]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

In [7]:
def train_batch(inputs, labels):
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    return loss.item()

In [10]:
def run_profiler():
    activities = [
        ProfilerActivity.CPU,
        ProfilerActivity.CUDA,
    ]
    
    # Warm-up
    for i, (inputs, labels) in enumerate(train_loader):
        if i >= 10: 
            break
        inputs, labels = inputs.to(device), labels.to(device)
        train_batch(inputs, labels)
    
    with profile(
        activities=activities,
        record_shapes=True,
        profile_memory=True,
        with_flops=True,
        on_trace_ready=torch.profiler.tensorboard_trace_handler('./log/alexnet_t4_lightning_profile'),
    ) as prof:
        for i, (inputs, labels) in enumerate(train_loader):
            if i >= 100:  
                break
            
            inputs, labels = inputs.to(device), labels.to(device)
            with record_function("train_batch"):
                loss = train_batch(inputs, labels)
            prof.step()
    
    # Print profiler results
    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20))
    
    # Get FLOPS information
    flops_data = prof.key_averages()
    total_flops = 0
    total_time = 0
    
    for item in flops_data:
        if hasattr(item, 'flops') and item.flops > 0:
            total_flops += item.flops
            if hasattr(item, 'cuda_time_total'):
                total_time += item.cuda_time_total
            elif hasattr(item, 'cpu_time_total'):
                total_time += item.cpu_time_total
    
    # Calculate FLOPS/s (divide by 1e9 to get GFLOPS/s)
    if total_time > 0: 
        flops_per_second = total_flops / (total_time * 1e-6)  # Convert time from microseconds to seconds
        print(f"\nTotal FLOPS: {total_flops}")
        print(f"Total CUDA time: {total_time * 1e-6:.6f} seconds")
        print(f"Performance: {flops_per_second / 1e9:.2f} GFLOPS/s")
    
    # Calculate arithmetic intensity (FLOPs/byte)
    total_memory_bytes = 0
    for item in flops_data:
        if hasattr(item, 'self_memory_usage'):
            total_memory_bytes += item.self_memory_usage
    
    if total_memory_bytes > 0: 
        arithmetic_intensity = total_flops / total_memory_bytes
        print(f"Total memory usage: {total_memory_bytes / (1024 * 1024):.2f} MB")
        print(f"Arithmetic Intensity: {arithmetic_intensity:.2f} FLOPS/byte")
    
    return prof

In [11]:
profiler_results = run_profiler()

print("\nTop operators by FLOPS:")
print(profiler_results.key_averages().table(sort_by="flops", row_limit=10))

profiler_results.export_chrome_trace("alexnet_t4_lightning.json")
print("\nTrace file exported to: alexnet_t4_lightning.json")

STAGE:2025-04-02 01:35:51 1318:1318 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2025-04-02 01:35:58 1318:1318 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2025-04-02 01:35:58 1318:1318 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  Total GFLOPs  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            train_batch        11.01%     713.460ms        65.82%        4.266s      42.665ms       0.000us         0.00%        2.024s      20.239ms           0 

RuntimeError: Trace is already saved.