In [1]:
import torch

a = torch.tensor([1., 2., 3.])

print(torch.square(a))
print(a ** 2)
print(a * a)

def time_pytorch_function(func, input):
    # CUDA IS ASYNC so can't use python time module
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    # Warmup
    for _ in range(5):
        func(input)

    start.record()
    func(input)
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end)

b = torch.randn(10000, 10000).cuda()

def square_2(a):
    return a * a

def square_3(a):
    return a ** 2

time_pytorch_function(torch.square, b)
time_pytorch_function(square_2, b)
time_pytorch_function(square_3, b)

print("=============")
print("Profiling torch.square")
print("=============")

# Now profile each function using pytorch profiler
with torch.profiler.profile() as prof:
    torch.square(b)

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))

print("=============")
print("Profiling a * a")
print("=============")

with torch.profiler.profile() as prof:
    square_2(b)

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))

print("=============")
print("Profiling a ** 2")
print("=============")

with torch.profiler.profile() as prof:
    square_3(b)

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))

tensor([1., 4., 9.])
tensor([1., 4., 9.])
tensor([1., 4., 9.])
Profiling torch.square
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                           aten::square         0.00%      18.000us        99.97%        2.502s        2.502s       0.000us         0.00%     975.000us     975.000us             1  
                                              aten::pow         0.03%     654.000us        99.97%        2.502s        2.

<module 'torch.cuda' from '/home/usd.local/robin.ranabhat/myenv/lib64/python3.6/site-packages/torch/cuda/__init__.py'>

##  Extras : Memory and Streams
https://docs.pytorch.org/docs/stable/notes/cuda.html#memory-management

As you run the program below, observe `watch -n 0.1 nvidia-smi`. 
if benchmark_sequential allocates around X MB of memory, while benchmark_parallel allocates around twice of this.
This is because, `benchmark_with_streams` has two independent  allocation of `A` and `B` Tensor at same time.
In First program, `benchmark_sequential` first allocates memory for `A`, runs some computation. And while allocationg memory for `B`, utilizes the caching feature of pytorch.

In [None]:
import torch
import torch.nn as nn
import torchvision.models as models

# setup
device = 'cuda:0'
model = models.resnet18().to(device)
data = torch.randn(64, 3, 224, 224, device=device)
target = torch.randint(0, 1000, (64,), device=device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

nb_iters = 20
warmup_iters = 10
for i in range(nb_iters):
    optimizer.zero_grad()

    # start profiling after 10 warmup iterations
    if i == warmup_iters: torch.cuda.cudart().cudaProfilerStart()

    # push range for current iteration
    if i >= warmup_iters: torch.cuda.nvtx.range_push("iteration{}".format(i))

    # push range for forward
    if i >= warmup_iters: torch.cuda.nvtx.range_push("forward")
    output = model(data)
    if i >= warmup_iters: torch.cuda.nvtx.range_pop()

    loss = criterion(output, target)

    if i >= warmup_iters: torch.cuda.nvtx.range_push("backward")
    loss.backward()
    if i >= warmup_iters: torch.cuda.nvtx.range_pop()

    if i >= warmup_iters: torch.cuda.nvtx.range_push("opt.step()")
    optimizer.step()
    if i >= warmup_iters: torch.cuda.nvtx.range_pop()

    # pop iteration range
    if i >= warmup_iters: torch.cuda.nvtx.range_pop()

torch.cuda.cudart().cudaProfilerStop()


In [1]:
import torch

def gpu_mem():
    """
    Returns the current GPU memory usage in MB
    ( Equivalent to usage shown in nvidia-smi )
    """
    mem = torch.cuda.mem_get_info()[1] / (1024 ** 2) - (torch.cuda.mem_get_info()[0] / 1024 ** 2)
    return round(mem, 2)

# --- 2. Define Benchmark Parameters ---
n_warmup = 5
n_runs = 10

M = 9000

# --- 3. Scenario 1: Without Streams (Sequential) ---
def benchmark_sequential():


    # Warm-up runs
    for _ in range(n_warmup):
        A = torch.randn(M, M, device='cuda')
        B = torch.randn(M, M, device='cuda')
        C = torch.mm(A, A)
        C = torch.softmax(C, dim=1)
        C = torch.tanh(C)
        D = torch.mm(B, B)
        D = torch.softmax(D, dim=1)
        D = torch.tanh(D)
        # C.to('cpu')
        # D.to('cpu')
    
    # Clear the cache to measure correct memory usage
    del A, B, C, D
    torch.cuda.empty_cache()
    # Wait for the GPU to finish all queued work
    torch.cuda.synchronize()
    
    print("Starting GPU Usage : ", gpu_mem())
    # Timing
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)

    # Timing for CPU to GPU transfer
    start_cpu_gpu = torch.cuda.Event(enable_timing=True)
    end_cpu_gpu = torch.cuda.Event(enable_timing=True)
    total_elapsed_times = 0
    
    start_event.record()
    for _ in range(n_runs):
        start_cpu_gpu.record()
        A = torch.randn(M, M, device='cuda')
        end_cpu_gpu.record()
        total_elapsed_times += start_cpu_gpu.elapsed_time(end_cpu_gpu)
        B = torch.randn(M, M, device='cuda')
        
        C = torch.mm(A, A)
        C = torch.softmax(C, dim=1)
        C = torch.tanh(C)
        D = torch.mm(B, B)
        D = torch.softmax(D, dim=1)
        D = torch.tanh(D)
        # C.to('cpu')
        # D.to('cpu')
        
        
    end_event.record()
    
    torch.cuda.empty_cache()
    # Wait for the GPU to finish all queued work
    torch.cuda.synchronize()
    # Calculate elapsed time
    elapsed_time_ms = start_event.elapsed_time(end_event)

    print("ELAPSED TIME FOR DATA TRANSFER", total_elapsed_times / n_runs)
    print("Total GPU Usage : ", gpu_mem())
    return elapsed_time_ms / n_runs

# --- 4. Scenario 2: With Streams (Concurrent) ---
def benchmark_with_streams():

    s1 = torch.cuda.Stream()
    s2 = torch.cuda.Stream()

    # Warm-up runs
    for _ in range(n_warmup):
        with torch.cuda.stream(s1):
            A = torch.randn(M, M, device='cuda')
            C = torch.mm(A, A)
            C = torch.softmax(C, dim=1)
            C = torch.tanh(C)
            # C.to('cpu')
        with torch.cuda.stream(s2):
            B = torch.randn(M, M, device='cuda')
            D = torch.mm(B, B)
            D = torch.softmax(D, dim=1)
            D = torch.tanh(D)
            # D.to('cpu')

    # Clear the cache to measure correct memory usage
    del A, B, C, D
    torch.cuda.empty_cache()
    # Wait for the GPU to finish all queued work
    torch.cuda.synchronize()
    

    print("Starting GPU Usage : ", gpu_mem())
    # Timing
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)

    # Timing for CPU to GPU transfer
    start_cpu_gpu = torch.cuda.Event(enable_timing=True)
    end_cpu_gpu = torch.cuda.Event(enable_timing=True)
    total_elapsed_times = 0

    
    start_event.record()
    print("Starting stream benchmark...")
    for i in range(n_runs):
        with torch.cuda.stream(s1):
            start_cpu_gpu.record()
            A = torch.randn(M, M, device='cuda')
            end_cpu_gpu.record()
            total_elapsed_times += start_cpu_gpu.elapsed_time(end_cpu_gpu)
            C = torch.mm(A, A)
            C = torch.softmax(C, dim=1)
            C = torch.tanh(C)
            # C.to('cpu')
        with torch.cuda.stream(s2):
            B = torch.randn(M, M, device='cuda')
            D = torch.mm(B, B)
            D = torch.softmax(D, dim=1)
            D = torch.tanh(D)
            # D.to('cpu')
    end_event.record()
    
    
    torch.cuda.empty_cache()
    # Wait for the GPU to finish all queued work
    torch.cuda.synchronize()
    # Calculate elapsed time
    elapsed_time_ms = start_event.elapsed_time(end_event)
    

    print("ELAPSED TIME FOR DATA TRANSFER", total_elapsed_times / n_runs)
    
    print("Final GPU Usage : ", gpu_mem())
    return elapsed_time_ms / n_runs


# --- 5. Run and Compare ---

print("BEFORE SEQUENCIAL BENCHMARKING : The GPU USAGE : ", gpu_mem())
sequential_time = benchmark_sequential()
print(f"Scenario 1 (Sequential): {sequential_time:.3f} ms per run")


torch.cuda.empty_cache()   
print("BEFORE STREAM BENCHMARKING : The GPU USAGE : ", gpu_mem()) 
streams_time = benchmark_with_streams()
print(f"Scenario 2 (With Streams): {streams_time:.3f} ms per run")

BEFORE SEQUENCIAL BENCHMARKING : The GPU USAGE :  157.81
Starting GPU Usage :  211.81


RuntimeError: CUDA error: device not ready
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
BEFORE STREAM OPS  : Cache cleared. The GPU USAGE :  271.81
Stream Warm-up complete. GPU USAGE :  1655.81
Starting Fresh Timing with cache cleared GPU USAGE :  1079.81
Starting stream benchmark...
END Fresh Timing with current GPU USAGE :  1655.81
Scenario 2 (With Streams): 0.770 ms per run


In [3]:
## still in cache, so remove them and observe memory usage in nvidia-smi` 
torch.cuda.empty_cache()

In [18]:
gpu_mem()

9709.81

In [11]:
torch.cuda.get_device_properties(0).total_memory / 1024 ** 2

10822.9375

## Profiling Default Stream in CUDA

In [1]:
## Profiling with CUDA Streams
### https://docs.pytorch.org/tutorials/recipes/recipes/profiler_recipe.html
### Personal :: https://gemini.google.com/app/061daa91e4502050
import torch
import torch.profiler

ITERS = 5
            
# --- Profiling Setup ---
size = 7000
log_dir = "./log"


A = torch.ones(size, size, device="cpu", pin_memory=True)
collected_data = [torch.empty(size, size, device="cpu", pin_memory=True) for _ in range(ITERS)]

def sequential_data_processing(size):
    for i in range(ITERS):
        A_gpu = A.to("cuda")
        C_gpu = torch.mm(A_gpu, A_gpu)
        for _ in range(4):
            C_gpu += torch.mm(A_gpu, A_gpu)
        C_gpu = C_gpu.to("cpu")
        collected_data[i].copy_(C_gpu, non_blocking=False)


# --- Profile the Sequential Case ---
print("Profiling sequential execution...")
with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,
    ],
    schedule=torch.profiler.schedule(wait=2, warmup=4, active=4, repeat=1),
    on_trace_ready=torch.profiler.tensorboard_trace_handler(f'{log_dir}/sequential'),
    record_shapes=True,
    with_stack=True
) as prof_sequential:
    for _ in range(10): # 1 wait, 1 warmup, 2 active
        sequential_data_processing(size)
        prof_sequential.step() # Mark the end of an iteration

print(prof_sequential.key_averages().table(sort_by="cpu_time_total", row_limit=10))

Profiling sequential execution...
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                          ProfilerStep*         0.27%      19.492ms        99.99%        7.342s        1.835s       0.000us         0.00%        6.675s        1.669s             4  
                                            aten::copy_         8.87%     651.076ms        99.58%        7.312s     121.867ms        1.656s        24.81%        1.656s      

## Profiling Multiple Streams in CUDA

In [1]:
import torch
import torch.profiler

# --- Profiling Setup ---
print("Profiling execution with streams...")
size = 7000
log_dir = "./log"
# --- Profile the Streams Case ---
ITERS = 5
collected_data = [torch.empty(size, size, device="cpu", pin_memory=True) for _ in range(ITERS)]

A = torch.ones(size, size, device="cpu", pin_memory=True) # Use pinned memory for faster async copies
## SETUP STREAMS
streams = []
for i in range(ITERS):
    streams.append(torch.cuda.Stream())


def streamed_data_processing(size):
    # 1. QUEUE all operations without waiting
    for i in range(ITERS):
        with torch.cuda.stream(streams[i]):
            A_gpu = A.to("cuda", non_blocking=True)
            C_gpu = torch.mm(A_gpu, A_gpu)
            for _ in range(4):
                C_gpu += torch.mm(A_gpu, A_gpu)
            collected_data[i].copy_(C_gpu, non_blocking=True)



with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,
    ],
    schedule=torch.profiler.schedule(wait=2, warmup=4, active=4, repeat=1),
    on_trace_ready=torch.profiler.tensorboard_trace_handler(f'{log_dir}/with_streams'),
    record_shapes=True,
    with_stack=True
) as prof_stream:
    for _ in range(10): # 1 wait, 1 warmup, 2 active
        streamed_data_processing(size)
        prof_stream.step() # Mark the end of an iteration

print(f"Profiling complete. Traces saved in '{log_dir}' directory.")
print(prof_stream.key_averages().table(sort_by="cpu_time_total", row_limit=10))

Profiling execution with streams...
Profiling complete. Traces saved in './log' directory.
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                  cudaDeviceSynchronize        99.82%       10.008s        99.82%       10.008s       10.008s       0.000us         0.00%       0.000us       0.000us             1  
                                          ProfilerStep*         0.11%      11.252ms         0.18%      17.723ms     

In [None]:
import torch
import torch.profiler

# --- Profiling Setup ---
print("Profiling execution with streams...")
size = 7000
log_dir = "./log"
# --- Profile the Streams Case ---
ITERS = 5
collected_data = [torch.empty(size, size, device="cpu", pin_memory=True) for _ in range(ITERS)]

A = torch.ones(size, size, device="cpu", pin_memory=True) # Use pinned memory for faster async copies
## SETUP STREAMS
streams = []
for i in range(ITERS):
    streams.append(torch.cuda.Stream())


def streamed_data_processing(size):

    # Allocate input tensors
    N = 1000000000  # 3 billion elements (# H100 94 GB)
    A1 = torch.randn(N, device="cuda")
    B1 = torch.randn(N, device="cuda")
    C1 = torch.empty_like(A1)

    A2 = torch.randn(N, device="cuda")
    B2 = torch.randn(N, device="cuda")
    C2 = torch.empty_like(A2)

    # Create two CUDA streams
    stream1 = torch.cuda.Stream()
    stream2 = torch.cuda.Stream()

    # Launch vector addition in stream1
    with torch.cuda.stream(stream1):
        C1.copy_(A1 + B1)  # PyTorch kernel for element-wise addition

    # Launch vector addition in stream2
    with torch.cuda.stream(stream2):
        C2.copy_(A2 + B2)

# Wait for both kernels to finish
torch.cuda.synchronize()

print("Both vector additions ran on separate streams!")




with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,
    ],
    schedule=torch.profiler.schedule(wait=2, warmup=4, active=4, repeat=1),
    on_trace_ready=torch.profiler.tensorboard_trace_handler(f'{log_dir}/with_streams'),
    record_shapes=True,
    with_stack=True
) as prof_stream:
    for _ in range(10): # 1 wait, 1 warmup, 2 active
        streamed_data_processing(size)
        prof_stream.step() # Mark the end of an iteration

print(f"Profiling complete. Traces saved in '{log_dir}' directory.")
print(prof_stream.key_averages().table(sort_by="cpu_time_total", row_limit=10))

In [2]:
# import torch
# import torch.profiler

# # --- Profiling Setup ---
# print("Profiling execution with streams...")
# size = 1000
# log_dir = "./log"
# # --- Profile the Streams Case ---
# ITERS = 5
# collected_data = [torch.empty(size, size, device="cpu", pin_memory=True) for _ in range(ITERS)]

# A = torch.ones(size, size, device="cpu", pin_memory=True) # Use pinned memory for faster async copies
# ## SETUP STREAMS
# streams = []
# for i in range(ITERS):
#     streams.append(torch.cuda.Stream())


# def streamed_data_processing(size):
#     # 1. QUEUE all operations without waiting
#     for i in range(ITERS):
#         with torch.cuda.stream(streams[i]):
#             A_gpu = A.to("cuda", non_blocking=True)
#             C_gpu = torch.mm(A_gpu, A_gpu)
#             for _ in range(10):
#                 C_gpu += torch.mm(A_gpu, A_gpu)
#             collected_data[i].copy_(C_gpu, non_blocking=True)



# with torch.profiler.profile(
#     activities=[
#         torch.profiler.ProfilerActivity.CPU,
#         torch.profiler.ProfilerActivity.CUDA,
#     ],
#     schedule=torch.profiler.schedule(wait=2, warmup=4, active=4, repeat=1),
#     # on_trace_ready=torch.profiler.tensorboard_trace_handler(f'{log_dir}/with_streams'),
#     record_shapes=True,
#     with_stack=True
# ) as prof_stream:
#     for _ in range(10): # 1 wait, 1 warmup, 2 active
#         streamed_data_processing(size)
#         prof_stream.step() # Mark the end of an iteration

# print(f"Profiling complete. Traces saved in '{log_dir}' directory.")
# print(prof_stream.key_averages().table(sort_by="cpu_time_total", row_limit=10))

In [3]:
A = torch.ones(size, size, device="cpu")
C = torch.mm(A, A)
# collected_data[i] = C_gpu

In [8]:
collected_data = [torch.empty(size, size, device="cpu") for _ in range(ITERS)]

In [None]:
collected_data[0].copy_()

tensor([[0.0000e+00, 0.0000e+00, 6.8664e-44,  ..., 2.0000e+03, 2.0000e+03,
         2.0000e+03],
        [2.0000e+03, 2.0000e+03, 2.0000e+03,  ..., 2.0000e+03, 2.0000e+03,
         2.0000e+03],
        [2.0000e+03, 2.0000e+03, 2.0000e+03,  ..., 2.0000e+03, 2.0000e+03,
         2.0000e+03],
        ...,
        [2.0000e+03, 2.0000e+03, 2.0000e+03,  ..., 2.0000e+03, 2.0000e+03,
         2.0000e+03],
        [2.0000e+03, 2.0000e+03, 2.0000e+03,  ..., 2.0000e+03, 2.0000e+03,
         2.0000e+03],
        [2.0000e+03, 2.0000e+03, 2.0000e+03,  ..., 2.0000e+03, 2.0000e+03,
         2.0000e+03]])

## RANDOM 

In [2]:
import torch
import torch.profiler

def func():
    A = torch.ones(size, size, device="cpu")
    A = A.to("cuda", non_blocking=True)
    C = torch.mm(A, A)
    C = C.to("cpu", non_blocking=True)

streams = []
for i in range(10):
    streams.append(torch.cuda.Stream()) 

def run_with_streams(size):
    """Runs creation and matmul concurrently on two separate streams."""
    for i in range(10):
        with torch.cuda.stream(streams[i]):
            func()

# --- Profiling Setup ---
size = 2000
log_dir = "./log"
# --- Profile the Streams Case ---
print("Profiling execution with streams...")
with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,
    ],
    schedule=torch.profiler.schedule(wait=2, warmup=4, active=4, repeat=1),
    # on_trace_ready=torch.profiler.tensorboard_trace_handler(f'{log_dir}/with_streams'),
    record_shapes=True,
    with_stack=True
) as prof_stream:
    for _ in range(10): # 1 wait, 1 warmup, 2 active
        run_with_streams(size)
        prof_stream.step() # Mark the end of an iteration

print(f"Profiling complete. Traces saved in '{log_dir}' directory.")
print(prof_stream.key_averages().table(sort_by="cpu_time_total", row_limit=10))

Profiling execution with streams...


STAGE:2025-07-21 21:31:39 3142936:3142936 ActivityProfilerController.cpp:314] Completed Stage: Warm Up


Profiling complete. Traces saved in './log' directory.
-------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                    ProfilerStep*         2.69%       9.899ms        99.38%     365.838ms      91.460ms       0.000us         0.00%     338.026ms      84.507ms             4  
                                         aten::to         0.14%     527.000us        64.01%     235.613ms       2.945ms       0.000us         0.00%     274.106ms       3.426ms 

STAGE:2025-07-21 21:31:39 3142936:3142936 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2025-07-21 21:31:39 3142936:3142936 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


## Nvidia Nsight Profiling

In [None]:
# import torch

# # Allocate input tensors
# N = 1000000000  # 1 billion elements (adjusted for compatibility)
# A1 = torch.randn(N, device="cuda")
# B1 = torch.randn(N, device="cuda")
# C1 = torch.empty_like(A1)

# A2 = torch.randn(N, device="cuda")
# B2 = torch.randn(N, device="cuda")
# C2 = torch.empty_like(A2)

# # Create two CUDA streams
# stream1 = torch.cuda.Stream()
# stream2 = torch.cuda.Stream()

# # Setup for profiling
# nb_iters = 10
# warmup_iters = 3

# for i in range(nb_iters):
#     # Start profiling after warmup iterations
#     if i == warmup_iters: 
#         torch.cuda.cudart().cudaProfilerStart()
    
#     # Push range for current iteration
#     if i >= warmup_iters: 
#         torch.cuda.nvtx.range_push("iteration{}".format(i))
    
#     # Push range for stream1 operation
#     if i >= warmup_iters: 
#         torch.cuda.nvtx.range_push("stream1_vector_add")
    
#     # Launch vector addition in stream1
#     with torch.cuda.stream(stream1):
#         C1.copy_(A1 + B1)  # PyTorch kernel for element-wise addition
    
#     if i >= warmup_iters: 
#         torch.cuda.nvtx.range_pop()
    
#     # Push range for stream2 operation
#     if i >= warmup_iters: 
#         torch.cuda.nvtx.range_push("stream2_vector_add")
    
#     # Launch vector addition in stream2
#     with torch.cuda.stream(stream2):
#         C2.copy_(A2 + B2)
    
#     if i >= warmup_iters: 
#         torch.cuda.nvtx.range_pop()
    
#     # Push range for synchronization
#     if i >= warmup_iters: 
#         torch.cuda.nvtx.range_push("synchronize")
    
#     # Wait for both kernels to finish
#     torch.cuda.synchronize()
    
#     if i >= warmup_iters: 
#         torch.cuda.nvtx.range_pop()
    
#     # Pop iteration range
#     if i >= warmup_iters: 
#         torch.cuda.nvtx.range_pop()

# # Stop profiling
# torch.cuda.cudart().cudaProfilerStop()

# print("Both vector additions ran on separate streams!")



import torch

input = []
batch_size = 64
shape = (batch_size, 256, 256)
for i in range (0, 4):
    input.append(torch.randn(shape))
input = [in_.pin_memory() for in_ in input]

s1 = torch.cuda.Stream()
s2 = torch.cuda.Stream()
s3 = torch.cuda.Stream()
s4 = torch.cuda.Stream()

torch.cuda.synchronize()
cpu_device = torch.device('cpu')
device = torch.device('cuda:0')

torch.cuda.cudart().cudaProfilerStart()

# Stream 1 operations
torch.cuda.nvtx.range_push("stream1_processing")
with torch.cuda.stream(s1):
    torch.cuda.nvtx.range_push("stream1_h2d_copy")
    curr_batch = input[0].to(device,non_blocking = True)
    torch.cuda.nvtx.range_pop()
    
    torch.cuda.nvtx.range_push("stream1_exp_ops")
    curr_batch = torch.exp (curr_batch)
    curr_batch = torch.exp (curr_batch)
    curr_batch = torch.matmul (curr_batch, curr_batch)
    curr_batch = torch.matmul (curr_batch, curr_batch)
    torch.cuda.nvtx.range_pop()
    
    torch.cuda.nvtx.range_push("stream1_d2h_copy")
    input[0].copy_(curr_batch,non_blocking = True)
    torch.cuda.nvtx.range_pop()
torch.cuda.nvtx.range_pop()

# Stream 2 operations
torch.cuda.nvtx.range_push("stream2_processing")
with torch.cuda.stream(s2):
    torch.cuda.nvtx.range_push("stream2_h2d_copy")
    curr_batch = input[1].to(device,non_blocking = True)
    torch.cuda.nvtx.range_pop()
    
    torch.cuda.nvtx.range_push("stream2_exp_ops")
    curr_batch = torch.exp (curr_batch)
    curr_batch = torch.exp (curr_batch)
    curr_batch = torch.matmul (curr_batch, curr_batch)
    curr_batch = torch.matmul (curr_batch, curr_batch)
    torch.cuda.nvtx.range_pop()
    
    torch.cuda.nvtx.range_push("stream2_d2h_copy")
    input[1].copy_(curr_batch,non_blocking = True)
    torch.cuda.nvtx.range_pop()
torch.cuda.nvtx.range_pop()

# Stream 3 operations
torch.cuda.nvtx.range_push("stream3_processing")
with torch.cuda.stream(s3):
    torch.cuda.nvtx.range_push("stream3_h2d_copy")
    curr_batch = input[2].to(device,non_blocking = True)
    torch.cuda.nvtx.range_pop()
    
    torch.cuda.nvtx.range_push("stream3_exp_ops")
    curr_batch = torch.exp (curr_batch)
    curr_batch = torch.exp (curr_batch)
    curr_batch = torch.matmul (curr_batch, curr_batch)
    curr_batch = torch.matmul (curr_batch, curr_batch)
    torch.cuda.nvtx.range_pop()
    
    torch.cuda.nvtx.range_push("stream3_d2h_copy")
    input[2].copy_(curr_batch,non_blocking = True)
    torch.cuda.nvtx.range_pop()
torch.cuda.nvtx.range_pop()

# Stream 4 operations
torch.cuda.nvtx.range_push("stream4_processing")
with torch.cuda.stream(s4):
    torch.cuda.nvtx.range_push("stream4_h2d_copy")
    curr_batch = input[3].to(device,non_blocking = True)
    torch.cuda.nvtx.range_pop()
    
    torch.cuda.nvtx.range_push("stream4_exp_ops")
    curr_batch = torch.exp (curr_batch)
    curr_batch = torch.exp (curr_batch)
    curr_batch = torch.matmul (curr_batch, curr_batch)
    curr_batch = torch.matmul (curr_batch, curr_batch)
    torch.cuda.nvtx.range_pop()
    
    torch.cuda.nvtx.range_push("stream4_d2h_copy")
    input[3].copy_(curr_batch,non_blocking = True)
    torch.cuda.nvtx.range_pop()
torch.cuda.nvtx.range_pop()

torch.cuda.nvtx.range_push("final_sync")
torch.cuda.synchronize()
torch.cuda.nvtx.range_pop()

torch.cuda.cudart().cudaProfilerStop()


# nsys profile --force-overwrite true  -w true -t cuda,nvtx,osrt,cudnn,cublas -s cpu --capture-range=cudaProfilerApi --cudabacktrace=true -x true -o test python stream_Add.py


In [1]:
import torch

# --- Setup ---
print("Streamed execution profiling...")
size = 256
ITERS = 5

shape = (64, size, size)
# Use pinned memory for faster async copies
# A = torch.ones(64, size, size, device="cpu", pin_memory=True)
input = []
for i in range (ITERS):
    input.append(torch.randn(shape, device="cpu"))
input = [in_.pin_memory() for in_ in input]

collected_data = [torch.empty(64, size, size, device="cpu", pin_memory=True) for _ in range(ITERS)]

# Setup CUDA streams
streams = []
for i in range(ITERS):
    streams.append(torch.cuda.Stream())

def streamed_data_processing():
    """Streamed processing - pipeline host-to-GPU, kernel, GPU-to-host operations"""
    torch.cuda.synchronize()
    # Queue all operations across streams without waiting
    for i in range(ITERS):
        with torch.cuda.stream(streams[i]):
            # Push range for current iteration
            torch.cuda.nvtx.range_push(f"stream_{i}_iteration")
            
            # Host to GPU transfer (non-blocking)
            torch.cuda.nvtx.range_push(f"stream_{i}_host_to_gpu")
            A_gpu = input[i].to("cuda", non_blocking=True)
            torch.cuda.nvtx.range_pop()
            
            # Kernel computation
            torch.cuda.nvtx.range_push(f"stream_{i}_kernel_computation")
            C_gpu = torch.mm(A_gpu, A_gpu) + torch.mm(A_gpu, A_gpu) + torch.mm(A_gpu, A_gpu)
            # for _ in range(10):
            #     C_gpu += torch.mm(A_gpu, A_gpu)
            torch.cuda.nvtx.range_pop()
            
            # GPU to host transfer (non-blocking)
            torch.cuda.nvtx.range_push(f"stream_{i}_gpu_to_host")
            collected_data[i].copy_(C_gpu, non_blocking=True)
            torch.cuda.nvtx.range_pop()
            
            # Pop iteration range
            torch.cuda.nvtx.range_pop()
    
    # Wait for all streams to complete
    torch.cuda.nvtx.range_push("stream_synchronization")
    torch.cuda.synchronize()
    torch.cuda.nvtx.range_pop()

# Warmup iterations (not profiled)
warmup_iters = 2
print("Running warmup iterations...")
for _ in range(warmup_iters):
    streamed_data_processing()

# Start profiling
print("Starting profiled iterations...")
torch.cuda.cudart().cudaProfilerStart()

# Profiled iterations
torch.cuda.nvtx.range_push("streamed_processing")
streamed_data_processing()
torch.cuda.nvtx.range_pop()

# Stop profiling
torch.cuda.cudart().cudaProfilerStop()

print("Streamed profiling complete.")
print("Run with: nsys profile -w true -t cuda,nvtx,osrt,cudnn,cublas -s cpu --capture-range=cudaProfilerApi --stop-on-range-end=true --cudabacktrace=true -x true -o streamed_profile python streamed_main.py")

Streamed execution profiling...


RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

In [1]:
import torch
import torch.profiler

# Allocate input tensors
N = 1000000000  # 3 billion elements
A1 = torch.randn(N, device="cuda")
B1 = torch.randn(N, device="cuda")
C1 = torch.empty_like(A1)

A2 = torch.randn(N, device="cuda")
B2 = torch.randn(N, device="cuda")
C2 = torch.empty_like(A2)

# Create two CUDA streams
stream1 = torch.cuda.Stream()
stream2 = torch.cuda.Stream()

# --- PyTorch Profiler ---
# The profiler context manager wraps the code we want to analyze.
with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,
    ],
    record_shapes=True # Optionally records tensor shapes
) as prof:
    # Launch vector addition in stream1
    with torch.cuda.stream(stream1):
        C1.copy_(A1 + B1)

    # Launch vector addition in stream2
    with torch.cuda.stream(stream2):
        C2.copy_(A2 + B2)

# Note: An explicit torch.cuda.synchronize() is not needed here because
# the profiler context manager automatically synchronizes when it exits.

# Export the trace to a JSON file
prof.export_chrome_trace("vector_add_trace.json")

print("Profiling complete! Trace saved to 'vector_add_trace.json'.")
print("Open chrome://tracing in your browser and load the file to view it.")

Profiling complete! Trace saved to 'vector_add_trace.json'.
Open chrome://tracing in your browser and load the file to view it.
