In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

## loss test

In [69]:
# from train import train_epoch
# oarLoss = train_epoch.OARLoss
def OARLoss(proposals, per_frame_scores, start_point):
    """
    Onlion action recognizer loss = frame loss + start loss

    @param
    proposals: N*T*num_class, proposals of TPG, note that if all num_class are zero at a time point, then at this point, 
    per_frame_scores: N*T*(num_class+1)
    start_point: shape: N*T*2
    """
    # append the backgraound lable in proposals
    tmp,_ = torch.max(proposals,dim=2,keepdim=True)   # shape N*T*1
    inv_tmp = tmp
    inv_tmp = torch.ones_like(tmp,dtype=float) - inv_tmp
    proposals = torch.cat([proposals,inv_tmp],dim=2)  # N*T*(num_class+1)

    # frame_loss
    total_T = proposals.shape[0] * proposals.shape[1]
    frame_loss = -torch.sum(proposals*torch.log(per_frame_scores))/total_T

    # start loss
    start_loss = 0.
    head = torch.zeros(proposals.shape[0],1,1)
    tmp = torch.cat([head,tmp],dim=1)   # shape N*(T+1)*1
        
    start_label = torch.zeros(proposals.shape[0],proposals.shape[1],2)
    for b in range(proposals.shape[0]):
        for t in range(1,proposals.shape[1]):
            # if previous time block has not action, 
            # and current time block has action, then current time block is action start point
            if tmp[b][t-1][0] == 0 and tmp[b][t][0]==1:
                start_label[b][t][0] = 1.
            else:
                start_label[b][t][1] = 1.
    start_loss = -torch.sum(start_label*torch.log(start_point))/total_T
    print(start_loss)
    return frame_loss+start_loss

In [80]:
proposals = torch.Tensor(np.array([[[0,0,0],[1,0,1],[0,0,1],[1,1,1],[0,1,0]],[[0,0,0],[1,0,1],[0,0,1],[1,1,1],[0,1,0]]]))
per_frame_scores = torch.rand(2,5,4)
start_point = torch.rand(2,5,2)

tmp,_ = torch.max(proposals,dim=2,keepdim=True)   # shape N*T*1
inv_tmp = tmp
inv_tmp = torch.ones_like(tmp) - inv_tmp
loss = OARLoss(proposals,per_frame_scores,start_point)
print(loss)

tensor(0.6012)
tensor(1.5199, dtype=torch.float64)


In [17]:
def MILoss(topk_score,labels,binary=True,device='cuda'):
    """
    topk_score has the same shape with labels, (N,num_class), where N means batch size
    for dataset has one action class only, sigmoid should replace log_softmax
    """
    if binary:
        milloss = torch.mean(torch.sum(labels * torch.sigmoid(topk_score), dim=1), dim=0)
    else:
        milloss = -torch.mean(torch.sum(labels * F.log_softmax(topk_score,dim=1), dim=1), dim=0) # topk.shape = (N,1)
    return milloss

In [3]:
def OARLoss(proposals, per_frame_scores, OAR_out):
    """
    Onlion action recognizer loss = frame loss + start loss

    @param
    proposals: N*T*num_class, proposals of TPG, note that if all num_class are zero at a time point, then at this point, 
    per_frame_scores: N*T*(num_class+1)
    OAR_out: output of LSTM in OAR, shape: N*T*hidden_size
    """
    # append the backgraound lable in proposals
    tmp = torch.max(proposals,dim=2,keepdim=True)
    tmp = 1 - tmp
    proposals = torch.cat([proposals,tmp],dim=2)

In [29]:
proposals = torch.Tensor(np.array([[[1,1],[1,0],[0,0]],[[0,1],[1,1],[1,1]]]))
print(proposals)
tmp,_ = torch.max(proposals,dim=2,keepdim=True)
print(tmp)
tmp1 = tmp
tmp = 1 - tmp
proposals = torch.cat([proposals,tmp],dim=2)
print(proposals)
per_frame_scores = torch.rand(2,3,3)
total_T = proposals.shape[0] * proposals.shape[1]
frame_loss = -torch.sum(proposals*torch.log(per_frame_scores))/total_T
print(frame_loss)

head = torch.zeros(proposals.shape[0],1,1)
tmp1 = torch.cat([head,tmp1],dim=1)
print(tmp1)

tensor([[[1., 1.],
         [1., 0.],
         [0., 0.]],

        [[0., 1.],
         [1., 1.],
         [1., 1.]]])
tensor([[[1.],
         [1.],
         [0.]],

        [[1.],
         [1.],
         [1.]]])
tensor([[[1., 1., 0.],
         [1., 0., 0.],
         [0., 0., 1.]],

        [[0., 1., 0.],
         [1., 1., 0.],
         [1., 1., 0.]]])
tensor(1.2130)
tensor([[[0.],
         [1.],
         [1.],
         [0.]],

        [[0.],
         [1.],
         [1.],
         [1.]]])


In [64]:
class OAR(nn.Module):
    """
    online action recognizer, 
    ! For this module, if for real word inference, the input should be x,state = N*C*T,(h_0,c_0),
    ! where N = 1 and T = 1, otherwise, N = batch size, T is the num of time block
    Input: x,state = N*C*T,(h_0,c_0), where h_0 and c_0 of shape (num_layers * num_directions, batch, hidden_size)
    Output: output = N*T*hidden_size, per_frame_socres = N*T*num_class, 
    and state = (hn, cn), where h_n and c_n of shape (num_layers * num_directions, batch, hidden_size)
    here we have num_layers * num_directions = 1

    param input_size(int): The number of expected features in the input x
    param hidden_size(int): The number of features in the hidden state h
    param num_class(int): action classes num and a background class
    parma num_layers(int): Number of recurrent layers
    """
    def __init__(self,input_size, hidden_size, num_class, 
                num_layers=1,M=5, dropout=False, batch_first=True):
        super().__init__()
        self.M = M
        self.rnn = nn.LSTM(input_size,hidden_size,num_layers,dropout,batch_first)
        self.linear_scores = nn.Linear(hidden_size,num_class)
        self.linear_start = nn.Linear(hidden_size,2)  # TODO: may append layers here
        self.softmax = nn.Softmax(dim=2)
        self.maxpooling = nn.MaxPool2d(kernel_size=(M,1),stride=1)
    

    def forward(self,x,state):
        # x: N*C*T
#         x = x.permute(0,2,1).contiguous()
        output, (hn, cn) = self.rnn(x,state)
        state = (hn, cn)
        output_maxp = output
        T1 = output[:,0:1,:]
        for i in range(self.M-1):
            output_maxp = torch.cat([T1,output_maxp],dim=1)
        
        start_scores = self.softmax(self.linear_start(self.maxpooling(output_maxp)))
        
        per_frame_socres = self.softmax(self.linear_scores(output)) # per-frame action scores over classes and background class, shape N*T*num_class
        return start_scores
oar = OAR(5,6,3)
x = torch.randn(2,10,5)
start_label = torch.rand(2,10,2)
state = (torch.randn(1,2,6),torch.randn(1,2,6))

In [28]:


start_scores,state,per_frame_scores = oar(x,state)
print(start_scores.shape)
print(per_frame_scores.shape)
start_label = torch.rand(2,6,2)
print("start_label=",start_label)

torch.Size([2, 6, 2])
torch.Size([2, 10, 3])
start_label= tensor([[[0.9605, 0.6000],
         [0.3181, 0.0513],
         [0.6731, 0.9493],
         [0.7099, 0.0670],
         [0.5650, 0.4180],
         [0.0447, 0.6420]],

        [[0.5821, 0.2074],
         [0.5140, 0.3628],
         [0.3612, 0.2710],
         [0.9077, 0.3066],
         [0.3751, 0.5477],
         [0.7255, 0.9451]]])


In [65]:
gd = torch.optim.Adam(oar.parameters())
loss_fn = nn.MSELoss()
for i in range(10):
    start_scores= oar(x,state)
    print(start_scores)
#     loss = loss_fn(start_scores,start_label)
    loss = -torch.sum(start_label*torch.log(start_scores))
    print(loss)
    #清空前面的导数缓存
    gd.zero_grad()
    loss.backward(retain_graph=True)
    gd.step()
    print(loss)

tensor([[[0.5400, 0.4600],
         [0.5402, 0.4598],
         [0.5378, 0.4622],
         [0.5385, 0.4615],
         [0.5428, 0.4572],
         [0.5443, 0.4557],
         [0.5500, 0.4500],
         [0.5560, 0.4440],
         [0.5564, 0.4436],
         [0.5564, 0.4436]],

        [[0.6196, 0.3804],
         [0.5841, 0.4159],
         [0.5718, 0.4282],
         [0.5549, 0.4451],
         [0.5586, 0.4414],
         [0.5588, 0.4412],
         [0.5468, 0.4532],
         [0.5427, 0.4573],
         [0.5414, 0.4586],
         [0.5388, 0.4612]]], grad_fn=<SoftmaxBackward>)
tensor(15.4635, grad_fn=<NegBackward>)
tensor(15.4635, grad_fn=<NegBackward>)
tensor([[[0.5389, 0.4611],
         [0.5389, 0.4611],
         [0.5366, 0.4634],
         [0.5373, 0.4627],
         [0.5417, 0.4583],
         [0.5432, 0.4568],
         [0.5492, 0.4508],
         [0.5549, 0.4451],
         [0.5554, 0.4446],
         [0.5554, 0.4446]],

        [[0.6181, 0.3819],
         [0.5829, 0.4171],
         [0.5709, 0.4291]

In [19]:
scores = torch.rand(3,5,3)
video_level_score = torch.rand(3,3)
labels = torch.Tensor(np.array([[1,1,0],[0,1,1],[1,0,0]]))

In [20]:
MILoss(video_level_score,labels)

tensor(0.9457)

In [21]:
def CASLoss(features, scores, labels, num_similar,device='cpu'):
    """
    Co-Activity Similarity loss
    # TODO: this loss may be refactored to data with different time length in one batch

    @param
    """
    attention = F.softmax(scores,dim=1)         #N*T*num_class
    features_T = torch.transpose(features,1,2)  #N*f*T
    print("attention.shape=",attention.shape)
    print("features_T.shape=",features_T.shape)
    agg_act = torch.bmm(features_T,attention)    # N*f*num_class
    agg_back = torch.bmm(features_T,(1-attention)/attention.shape[1])    #N

    co_loss = 0.
    n_tmp = 0.       # num of classes in all pair
    for i in range(0,num_similar*2,2):      # num_similar*2 <= T
        Hf1 = agg_act[i]
        Hf2 = agg_act[i+1]
        Lf1 = agg_back[i]
        Lf2 = agg_back[i+1]

        d1 = 1 - torch.sum(Hf1*Hf2, dim=0) / (torch.norm(Hf1, 2, dim=0) * torch.norm(Hf2, 2, dim=0))
        d2 = 1 - torch.sum(Hf1*Lf2, dim=0) / (torch.norm(Hf1, 2, dim=0) * torch.norm(Lf2, 2, dim=0))
        d3 = 1 - torch.sum(Hf2*Lf1, dim=0) / (torch.norm(Hf2, 2, dim=0) * torch.norm(Lf1, 2, dim=0))
        co_loss = co_loss + 0.5*torch.sum(torch.max(d1-d2+0.5, torch.FloatTensor([0.]).to(device))*labels[i,:]*labels[i+1,:])
        co_loss = co_loss + 0.5*torch.sum(torch.max(d1-d3+0.5, torch.FloatTensor([0.]).to(device))*labels[i,:]*labels[i+1,:])
        n_tmp = n_tmp + torch.sum(labels[i,:]*labels[i+1,:])
    co_loss = co_loss / n_tmp
    print(n_tmp)
    return co_loss

In [24]:
features = torch.randn(3,5,10)
scores = torch.rand(3,5,2)
# labels = np.array([[[0,1],[1,1],[0,0],[0,1],[1,0]],[[1,1],[0,0],[0,1],[1,0],[1,1]],[[0,0],[0,1],[0,1],[1,0],[0,1]]])
labels = np.array([[1,1],[1,1],[0,1]])
labels = torch.Tensor(labels)

In [25]:
print("features=",features)
print("scores=",scores)
print("labels=",labels)

features= tensor([[[-1.0516e+00,  4.6353e-01,  7.2436e-01, -8.3697e-01, -4.6175e-01,
           5.2051e-01,  1.2845e-01,  8.2604e-01, -4.9437e-01, -4.2670e-01],
         [ 1.6683e+00,  2.7589e-02,  9.5760e-01, -5.2163e-01,  1.7666e-01,
          -1.0541e+00,  6.0088e-02, -2.9701e+00, -2.0242e-01,  1.4610e-01],
         [-6.6417e-01, -2.2485e+00,  6.0095e-01,  1.3202e+00, -6.0029e-01,
          -2.3096e-01,  6.8689e-01, -1.3064e+00, -1.1482e+00,  6.0157e-01],
         [ 4.7008e-01, -5.5981e-02,  6.8805e-02,  4.2870e-01, -8.1903e-01,
           1.7120e+00,  1.7539e+00,  6.6528e-03,  1.0895e+00,  3.8090e-02],
         [-4.7366e-01, -1.0655e+00,  5.4189e-01, -1.3752e-01, -1.9368e-01,
          -8.2747e-02,  4.8464e-01,  1.9683e-01, -1.4842e-01,  3.4106e-01]],

        [[-1.2701e+00, -1.0254e+00, -4.4687e-01, -2.2656e-01,  2.8984e-01,
           2.8234e-01, -2.1107e+00, -1.0679e-04, -3.1907e-01,  7.8770e-01],
         [ 3.5461e-01,  6.5225e-01,  4.0951e-01,  4.3840e-01,  3.4791e-01,
       

In [26]:
loss = CASLoss(features,scores,labels,1)
loss

attention.shape= torch.Size([3, 5, 2])
features_T.shape= torch.Size([3, 10, 5])
tensor(2.)


tensor(0.4720)

In [14]:
attention = F.softmax(scores,dim=1)         #N*T*num_class
features_T = torch.transpose(features,1,2)  #N*f*T
print("attention.shape=",attention.shape)
print("features_T.shape=",features_T.shape)
agg_act = torch.bmm(features_T,attention)    # N*f*num_class
agg_act

attention.shape= torch.Size([3, 5, 2])
features_T.shape= torch.Size([3, 10, 5])


tensor([[[ 0.5725,  0.6799],
         [-0.3605, -0.3147],
         [ 0.1487,  0.1362],
         [ 0.2501,  0.2930],
         [ 0.1865, -0.0453],
         [ 0.7697,  0.7118],
         [-0.2322, -0.3537],
         [-0.5252, -0.5390],
         [-0.8519, -0.9587],
         [ 0.3319,  0.3037]],

        [[-0.0715,  0.1059],
         [ 0.0504,  0.0684],
         [ 1.0954,  0.9466],
         [ 0.6079,  0.6809],
         [-0.0187, -0.0391],
         [ 0.0647, -0.1531],
         [ 0.6927,  0.6361],
         [-0.2934, -0.0114],
         [-0.1829, -0.2319],
         [ 0.0480,  0.0493]],

        [[ 0.0166,  0.0081],
         [-0.4705, -0.5331],
         [ 0.0952,  0.1041],
         [-0.0042,  0.0399],
         [-0.8270, -0.5941],
         [ 0.0385,  0.2045],
         [ 0.2506,  0.1382],
         [-0.5096, -0.2524],
         [ 0.3008,  0.2737],
         [ 0.2848,  0.4801]]])

In [30]:
x = torch.rand(5,3)
print(x)
mask = x.ge(0.5)
mask

tensor([[0.8086, 0.2444, 0.7037],
        [0.1214, 0.8210, 0.1594],
        [0.4395, 0.6299, 0.2168],
        [0.8906, 0.0974, 0.1860],
        [0.2650, 0.7115, 0.3104]])


tensor([[ True, False,  True],
        [False,  True, False],
        [False,  True, False],
        [ True, False, False],
        [False,  True, False]])

In [32]:
mask.unsqueeze(1).shape
scores = torch.rand(5,4,3)
scores

tensor([[[0.5525, 0.5471, 0.0743],
         [0.9454, 0.7712, 0.3943],
         [0.9130, 0.3346, 0.7402],
         [0.5996, 0.8005, 0.2182]],

        [[0.7448, 0.6191, 0.1369],
         [0.9466, 0.0271, 0.8626],
         [0.8952, 0.5816, 0.0261],
         [0.8032, 0.3378, 0.0813]],

        [[0.5613, 0.7013, 0.7648],
         [0.4102, 0.3060, 0.1338],
         [0.1767, 0.0102, 0.2986],
         [0.7390, 0.2893, 0.8552]],

        [[0.9061, 0.7105, 0.9145],
         [0.4061, 0.1375, 0.2803],
         [0.1462, 0.0365, 0.6566],
         [0.3384, 0.1836, 0.8128]],

        [[0.0223, 0.4694, 0.7929],
         [0.7790, 0.1117, 0.5057],
         [0.6244, 0.7990, 0.4335],
         [0.8096, 0.0447, 0.1649]]])

In [35]:
mask.unsqueeze(1)*scores
mask.unsqueeze(1)*scores.ge(0.5)

tensor([[[ True, False, False],
         [ True, False, False],
         [ True, False,  True],
         [ True, False, False]],

        [[False,  True, False],
         [False, False, False],
         [False,  True, False],
         [False, False, False]],

        [[False,  True, False],
         [False, False, False],
         [False, False, False],
         [False, False, False]],

        [[ True, False, False],
         [False, False, False],
         [False, False, False],
         [False, False, False]],

        [[False, False, False],
         [False, False, False],
         [False,  True, False],
         [False, False, False]]])