In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AdamW
from torch.profiler import profile, record_function, ProfilerActivity
import torch
import time

In [2]:
# Load the tokenizer and the causal-LM weights from one shared checkpoint id
# so the two can never drift apart.
MODEL_ID = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)

# The model starts out in evaluation mode.
model.eval()
print("Model loaded")

Model loaded


In [3]:
# Benchmark prompt, split over several source lines via implicit string
# concatenation; the resulting string is a single line of text.
prompt = (
    "On 7 November 2024, the NVIDIA Corporation established itself as the most "
    "valuable company in the world with a market capitalization of $3.6 trillion. "
    "Exactly two years before, NVIDIA was a far cry at 15th with a market cap of "
    "$356.38 billion. How did this happen?"
)

# Tokenize once; the result holds "input_ids" and "attention_mask" as PyTorch tensors.
inputs = tokenizer(prompt, return_tensors="pt")

In [8]:
# Place the model and the tokenized prompt on the first CUDA device.
assert torch.cuda.is_available(), "CUDA is required for the GPU benchmark cells"
device = torch.device('cuda:0')

# Send the inputs to the same explicit device as the model instead of relying
# on whatever the "current" CUDA device happens to be.
input_ids = inputs["input_ids"].to(device)
attention_mask = inputs["attention_mask"].to(device)

# nn.Module.to() moves the parameters in place and returns the module itself,
# so `gpu_model` is an alias of `model`, not a second copy of the weights.
gpu_model = model.to(device)

In [9]:
# Sampling settings for the single profiled generate() call.
generation_kwargs = dict(
    max_length=100,  # Cap on total sequence length (prompt + new tokens)
    temperature=0.7,  # Sampling temperature
    top_k=50,  # Top-k sampling
    top_p=0.95,  # Nucleus sampling
    do_sample=True,
    attention_mask=attention_mask,  # Make sure padding is ignored
    pad_token_id=tokenizer.eos_token_id,  # Explicitly set pad token
    eos_token_id=tokenizer.eos_token_id,  # Explicitly set end-of-sequence token
)

# Record CPU and CUDA activity and write a TensorBoard trace to ./log.
trace_handler = torch.profiler.tensorboard_trace_handler('./log')
with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    on_trace_ready=trace_handler,
    record_shapes=True,
    with_stack=True,
) as prof:
    output = gpu_model.generate(input_ids, **generation_kwargs)

# Turn the first (only) returned sequence back into text and show the
# ten most expensive operators by total CUDA time.
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated text on GPU:")
print(generated_text)
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))

Generated text on GPU:
On 7 November 2024, the NVIDIA Corporation established itself as the most valuable company in the world with a market capitalization of $3.6 trillion. Exactly two years before, NVIDIA was a far cry at 15th with a market cap of $356.38 billion. How did this happen? The answer lies in the company’s relentless pursuit of innovation and its commitment to creating solutions that redefine the boundaries of what’s possible. In this article, we will delve into the reasons
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------

In [10]:
# Measure the mean wall-clock latency of one sampled generate() call on the GPU.
trials = 100

# CUDA work is queued asynchronously, so drain the queue before starting the
# clock and again before stopping it; otherwise pending kernels from earlier
# cells (or unfinished kernels from the last trial) skew the measurement.
torch.cuda.synchronize()
start_time_gpu = time.perf_counter()  # monotonic, high-resolution timer
for _ in range(trials):
    output = gpu_model.generate(
        input_ids,
        max_length=100,  # Cap on total sequence length (prompt + new tokens)
        temperature=0.7,  # Sampling temperature
        top_k=50,  # Top-k sampling
        top_p=0.95,  # Nucleus sampling
        do_sample=True,
        attention_mask=attention_mask,  # Make sure padding is ignored
        pad_token_id=tokenizer.eos_token_id,  # Explicitly set pad token
        eos_token_id=tokenizer.eos_token_id,  # Explicitly set end-of-sequence token
    )
torch.cuda.synchronize()
end_time_gpu = time.perf_counter()

avg_time_inference_gpu = (end_time_gpu - start_time_gpu) / trials
print(f"Average inference time on GPU: {avg_time_inference_gpu:.4f} seconds")

Average inference time on GPU: 1.0179 seconds


In [11]:
# Profile one full training step (forward, backward, optimizer update) on the GPU.
#
# NOTE(review): the recorded run of this cell ended in CUDA OOM on a 7.92 GiB
# GPU. Full fine-tuning with AdamW presumably needs more memory than that for
# this checkpoint - confirm, or reduce precision / trainable parameters.
# NOTE(review): the model is still in eval() mode from load time; call
# .train() first if train-mode layer behaviour should be part of the profile.
optimizer = AdamW(gpu_model.parameters(), lr=1e-5)

# The labels are a copy of the prompt tokens. clone() already yields a tensor
# on the same device as input_ids, so no extra device transfer is needed.
labels = input_ids.clone()

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
             record_shapes=True,
             with_stack=True,
             on_trace_ready=torch.profiler.tensorboard_trace_handler('./log_train')) as prof:

    # Forward pass; passing labels makes the model return a loss.
    outputs = gpu_model(input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss

    # Backward pass and parameter update. No torch.cuda.empty_cache() here:
    # it cannot release tensors that are still referenced for backward, and
    # it would add allocator work to the profiled step.
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

# Rank by CUDA time, matching the GPU inference profile.
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))



OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB. GPU 0 has a total capacity of 7.92 GiB of which 11.00 MiB is free. Process 112923 has 750.00 MiB memory in use. Including non-PyTorch memory, this process has 7.07 GiB memory in use. Of the allocated memory 6.91 GiB is allocated by PyTorch, and 19.77 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [10]:
# Measure the mean wall-clock time of one training step on the GPU.
# Relies on `optimizer` and `labels` created in the GPU training-profile cell.
trials = 100

# Kernel launches return before the GPU finishes, so synchronize on both
# sides of the timed region to measure real execution time rather than
# launch overhead.
torch.cuda.synchronize()
start_time_gpu = time.perf_counter()  # monotonic, high-resolution timer
for _ in range(trials):
    outputs = gpu_model(input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss
    # Backward pass
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
torch.cuda.synchronize()
end_time_gpu = time.perf_counter()

avg_time_train_gpu = (end_time_gpu - start_time_gpu) / trials
print(f"Average training time on GPU: {avg_time_train_gpu:.4f} seconds")

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB. GPU 0 has a total capacity of 7.92 GiB of which 6.81 MiB is free. Process 112923 has 750.00 MiB memory in use. Including non-PyTorch memory, this process has 7.07 GiB memory in use. Of the allocated memory 5.95 GiB is allocated by PyTorch, and 998.22 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [4]:
# Switch the benchmark over to the CPU. Module.to() moves the weights in place,
# so `cpu_model` refers to the same object as `model`.
device = torch.device('cpu')
cpu_model = model.to(device)

# Take the tokenizer outputs as they are, with no .cuda() transfer.
input_ids, attention_mask = inputs["input_ids"], inputs["attention_mask"]


In [15]:

# Profile one sampled generate() call on the CPU.
# Only CPU activity is recorded, since the model and inputs live on the CPU
# here. The trace goes to its own directory so it is not mixed with the GPU
# inference trace written to ./log.
with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
    ],
    on_trace_ready=torch.profiler.tensorboard_trace_handler('./log_cpu'),
    record_shapes=True,
    with_stack=True
) as prof:
    output = cpu_model.generate(
        input_ids,
        max_length=100,  # Cap on total sequence length (prompt + new tokens)
        temperature=0.7,  # Sampling temperature
        top_k=50,  # Top-k sampling
        top_p=0.95,  # Nucleus sampling
        do_sample=True,
        attention_mask=attention_mask,  # Make sure padding is ignored
        pad_token_id=tokenizer.eos_token_id,  # Explicitly set pad token
        eos_token_id=tokenizer.eos_token_id,  # Explicitly set end-of-sequence token
    )

# Decode the first (only) returned sequence and list the ten most expensive
# operators by total CPU time.
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated text on CPU:")
print(generated_text)
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))

Generated text on CPU:
On 7 November 2024, the NVIDIA Corporation established itself as the most valuable company in the world with a market capitalization of $3.6 trillion. Exactly two years before, NVIDIA was a far cry at 15th with a market cap of $356.38 billion. How did this happen? What was the catalyst that catapulted NVIDIA to the top of the list?
While the answer may seem obvious, it is worth delving into the details to understand the underlying factors driving
-----------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                 Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
-----------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                         aten::linear         0.24%      30.676ms        92.19%       12.0

In [16]:
# Mean wall-clock latency of a sampled generate() call on the CPU.
start_time_cpu = time.time()
trials = 5

# Same sampling settings for every trial.
cpu_generation_kwargs = dict(
    max_length=100,  # Cap on total sequence length (prompt + new tokens)
    do_sample=True,
    temperature=0.7,  # Sampling temperature
    top_k=50,  # Top-k sampling
    top_p=0.95,  # Nucleus sampling
    attention_mask=attention_mask,  # Make sure padding is ignored
    pad_token_id=tokenizer.eos_token_id,  # Explicitly set pad token
    eos_token_id=tokenizer.eos_token_id,  # Explicitly set end-of-sequence token
)
for _ in range(trials):
    output = cpu_model.generate(input_ids, **cpu_generation_kwargs)

end_time_cpu = time.time()

avg_time_inference_cpu = (end_time_cpu - start_time_cpu) / trials
print("Average inference time on CPU: {:.4f} seconds".format(avg_time_inference_cpu))

Average inference time on CPU: 13.2967 seconds


In [5]:
# Fresh optimizer over the CPU model's parameters, plus labels that are a
# copy of the prompt tokens placed on `device`.
optimizer = AdamW(cpu_model.parameters(), lr=1e-5)
labels = input_ids.clone().to(device)




In [None]:


# One training step on the CPU (forward, backward, optimizer update) under the
# profiler; the TensorBoard trace is written to ./log_cpu_train.
cpu_train_trace_handler = torch.profiler.tensorboard_trace_handler('./log_cpu_train')
with profile(
    activities=[ProfilerActivity.CPU],
    on_trace_ready=cpu_train_trace_handler,
    with_stack=True,
    record_shapes=True,
) as prof:
    # Forward pass; the labels make the model return a loss.
    outputs = cpu_model(input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss

    # Backward pass, parameter update, then clear the gradients.
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

# Ten most expensive operators by total CPU time.
cpu_train_summary = prof.key_averages().table(sort_by="cpu_time_total", row_limit=10)
print(cpu_train_summary)

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                              Optimizer.step#AdamW.step         1.79%     247.654ms        54.12%        7.471s        7.471s             1  
                                               aten::mm        38.13%        5.264s        38.14%        5.264s      15.529ms           339  
       autograd::engine::evaluate_function: MmBackward0         0.03%       3.672ms        25.61%        3.536s      31.288ms           113  
                                            MmBackward0         0.05%       6.577ms        25.59%        3.532s      31.256ms           113  
      

In [6]:
# Mean wall-clock time of one CPU training step.
start_time_cpu = time.time()
trials = 1  # Kept at one trial because CPU steps are slow

for _ in range(trials):
    # Forward pass with labels so the model returns a loss.
    outputs = cpu_model(input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss
    # Backward pass, update, and gradient reset.
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

end_time_cpu = time.time()
avg_time_train_cpu = (end_time_cpu - start_time_cpu) / trials
print("Average training time on CPU: {:.4f} seconds".format(avg_time_train_cpu))

Average training time on CPU: 13.5160 seconds


In [9]:
# Speedup of the GPU over the CPU.
# The training comparison stays disabled: the recorded GPU training cells
# ended in CUDA OOM, so avg_time_train_gpu was never assigned.
# speedup = avg_time_train_cpu / avg_time_train_gpu
# print(f"Train Speedup: {speedup:.2f}x")

# Ratio of mean CPU latency to mean GPU latency for one generate() call.
speedup = avg_time_inference_cpu / avg_time_inference_gpu
print("Inference Speedup: {:.2f}x".format(speedup))

Inference Speedup: 13.06x
