In [2]:
import torch
from functools import reduce

This notebook is about finding the difference between loss(x)+loss(y) and loss(x+y).

x and y are B×B matrices, where B is the batch size.

loss is a function that takes a BxB matrix and returns a scalar using CE loss. The labels are always torch.arange(B).

    

In [30]:
B = 20             # batch size: every logits matrix is B x B
n = 10             # number of feature sets; n * n pairwise logits matrices are built
feature_dim = 100  # length of each feature vector

# Draw the features from a dedicated, seeded generator so the printed numbers are
# reproducible after a kernel restart without reseeding torch's global RNG.
rng = torch.Generator().manual_seed(0)
vector = torch.rand(n, B, feature_dim, generator=rng)

loss = torch.nn.CrossEntropyLoss()
labels = torch.arange(B)  # row i's target class is column i (the diagonal)

# Add then loss: build the n * n pairwise logits matrices (each B x B), sum them
# element-wise into a single B x B matrix, and take one cross-entropy loss.
pair_logits = [a @ b.T for a in vector for b in vector]
summed_logits = reduce(torch.add, pair_logits)
lossOut = loss(summed_logits, labels)
print("Add then loss = {}".format(lossOut))

# Loss then add: take the cross-entropy loss of every pairwise logits matrix
# separately and sum the n * n resulting scalars.
losses = reduce(torch.add, [loss(a @ b.T, labels) for a in vector for b in vector])
print("Loss then add = {}".format(losses))




Add then loss = 21.46099090576172
Add then loss = 361.396728515625


In [None]:
import torch.profiler

Now let's demonstrate why we want to compute loss(x+y) instead of loss(x)+loss(y), using the PyTorch autograd profiler with CUDA timing enabled.


In [31]:
# Both runs are profiled with exactly the same options so their tables and
# Chrome traces are directly comparable.
profiler_options = dict(
    use_cuda=True,
    profile_memory=True,
    record_shapes=True,
    with_stack=True,
    with_flops=True,
    with_modules=True,
)

# Run 1 - add then loss: sum every pairwise logits matrix, then one CE loss.
with torch.autograd.profiler.profile(**profiler_options) as prof:
    lossOut = loss(
        reduce(torch.add, [left @ right.T for left in vector for right in vector]),
        labels,
    )
print(prof.key_averages())
prof.export_chrome_trace("trace_add_then_loss.json")

# Run 2 - loss then add: one CE loss per pairwise logits matrix, then sum the scalars.
with torch.autograd.profiler.profile(**profiler_options) as prof:
    losses = reduce(
        torch.add,
        [loss(left @ right.T, labels) for left in vector for right in vector],
    )
print(prof.key_averages())
prof.export_chrome_trace("trace_loss_then_add.json")



STAGE:2023-05-25 11:44:02 279564:279564 ActivityProfilerController.cpp:294] Completed Stage: Warm Up
STAGE:2023-05-25 11:44:02 279564:279564 ActivityProfilerController.cpp:300] Completed Stage: Collection
STAGE:2023-05-25 11:44:02 279564:279564 output_json.cpp:417] Completed Stage: Post Processing


----------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                        Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem    # of Calls  Total KFLOPs  
----------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                aten::unbind        11.96%       1.622ms        24.79%       3.363ms     305.727us       1.022ms         6.73%       3.256ms     296.000us           0 b           0 b            11            --  
                aten::select        12.27%       1.665ms        12.83%       1.741ms      15.827us       1.557ms        10.26%       2.234ms      2

STAGE:2023-05-25 11:44:02 279564:279564 ActivityProfilerController.cpp:294] Completed Stage: Warm Up
STAGE:2023-05-25 11:44:02 279564:279564 ActivityProfilerController.cpp:300] Completed Stage: Collection
STAGE:2023-05-25 11:44:02 279564:279564 output_json.cpp:417] Completed Stage: Post Processing


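This is interesting because CE loss is not linear in its input. With labels = arange(B), the loss of a B×B logits matrix $x$ is $\mathrm{CE}(x) = -\frac{1}{B}\sum_{i=1}^{B}\log\frac{e^{x_{ii}}}{\sum_{j=1}^{B} e^{x_{ij}}}$, so in general loss(x+y) ≠ loss(x)+loss(y).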