In [1]:
import os

import numpy as np
import scipy.io as scio
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from torchvision.io import ImageReadMode, read_image, write_jpeg
from torchvision.transforms.functional import crop

In [2]:
# The 26 emotion category names; the position of a name in this array is its
# class index in every label / prediction vector used by the notebook.
emotions = np.array([
    'Affection', 'Anger', 'Annoyance', 'Anticipation', 'Aversion', 'Confidence',
    'Disapproval', 'Disconnection', 'Disquietment', 'Doubt/Confusion',
    'Embarrassment', 'Engagement', 'Esteem', 'Excitement', 'Fatigue', 'Fear',
    'Happiness', 'Pain', 'Peace', 'Pleasure', 'Sadness', 'Sensitivity',
    'Suffering', 'Surprise', 'Sympathy', 'Yearning',
])

# name -> class index
emotion_encode = dict(zip(emotions, range(len(emotions))))
# class index -> name
emotion_decode = dict(enumerate(emotions))

In [3]:
class EmoticDataset(Dataset):
    """Dataset yielding (subject crop, context image, multi-hot emotion label).

    Annotations are read from the "train" entry of a MATLAB ``.mat`` file.
    Records whose ``folder`` equals ``"framesdb/images"`` are dropped.
    NOTE(review): only the ``[0][0]`` entry of each record's ``person`` field
    is read, which looks like "first annotated person only" -- confirm against
    the annotation schema.
    """

    def __init__(self, subject_size, context_size, anns_dir, img_dir):
        """
        Args:
            subject_size: target size passed to ``transforms.Resize`` for the
                cropped subject image.
            context_size: target size passed to ``transforms.Resize`` for the
                full image.
            anns_dir: path of the annotation ``.mat`` file.
            img_dir: root directory that each record's ``folder``/``filename``
                is resolved against.
        """
        anns = scio.loadmat(anns_dir)["train"]
        records = anns[0]
        # Boolean-mask selection instead of np.fromiter over the structured
        # dtype: same resulting 1-D record array, but it does not rely on
        # np.fromiter supporting object fields (added in NumPy 1.23).
        keep = np.fromiter(
            (rec["folder"].item() != "framesdb/images" for rec in records),
            dtype=bool,
            count=len(records),
        )
        self.anns = records[keep]
        self.img_dir = img_dir
        self.subject_transform = transforms.Resize(subject_size)
        self.context_transform = transforms.Resize(context_size)

    def __len__(self):
        """Number of annotation records kept after filtering."""
        return self.anns.size

    def __getitem__(self, idx):
        """Load one sample.

        Returns:
            (subject_img, context_img, label): two float image tensors resized
            to ``subject_size`` / ``context_size`` and a float32 NumPy vector
            of length ``len(emotions)`` with 1.0 at every annotated category.
        """
        ann = self.anns[idx]

        # Resolve against the configured image root; previously the root was
        # hard-coded here and the ``img_dir`` argument was silently ignored.
        img_loc = os.path.join(self.img_dir, ann["folder"].item(), ann["filename"].item())
        # Force 3 channels: grayscale or RGBA files would otherwise yield
        # tensors that cannot be batched together or fed to Conv2d(3, ...).
        context_img = read_image(img_loc, mode=ImageReadMode.RGB)

        # The box is consumed as (left, top, right, bottom); ``crop`` takes
        # (top, left, height, width).
        bbox = ann["person"]["body_bbox"][0][0][0].astype(int)
        left, top, right, bottom = (int(v) for v in bbox[:4])
        subject_img = crop(context_img, top, left, bottom - top, right - left)

        # Multi-hot target over the emotion vocabulary.
        label = np.zeros(len(emotions), dtype=np.float32)
        # Walks the nested MATLAB struct down to the category name strings.
        ems = [i.item() for i in ann["person"]["annotations_categories"][0][0].item()[0][0]]
        for e in ems:
            label[emotion_encode[e]] = 1.

        # Resize operates on float tensors here; pixel values are not rescaled.
        subject_img = self.subject_transform(subject_img.float())
        context_img = self.context_transform(context_img.float())

        return subject_img, context_img, label

In [4]:
# Resize targets handed to EmoticDataset: the person crop is kept small,
# the full-scene image larger.
subject_size, context_size = (50, 50), (200, 200)

In [10]:
batch_size = 16

# Training split: annotation file plus the image root the records are resolved against.
train_data = EmoticDataset(subject_size, context_size, "../data/Annotations/Annotations.mat", "../data/cvpr_emotic/")
# Shuffle every epoch: without it the batches follow the annotation file's
# order, so consecutive SGD steps see highly correlated samples.
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

In [6]:
def net_branch():
    """Build one convolutional feature branch as an ``nn.Sequential``.

    Every stage is a factorised convolution pair -- a (1, k) convolution
    followed by a (k, 1) convolution, each with "same" padding and each
    followed by ReLU and BatchNorm2d.  The first and last stage end with a
    3x3 / stride-2 max-pool, so the spatial size is reduced exactly twice.
    The branch takes 3-channel input and emits 256 channels.
    """
    # (in_channels, out_channels, kernel_length, pool_after_stage)
    stages = (
        (3, 256, 5, True),
        (256, 384, 3, False),
        (384, 384, 3, False),
        (384, 256, 3, True),
    )

    layers = []
    for c_in, c_out, k, pool_after in stages:
        layers += [
            nn.Conv2d(c_in, c_out, (1, k), padding="same"),
            nn.ReLU(),
            nn.BatchNorm2d(c_out),
            nn.Conv2d(c_out, c_out, (k, 1), padding="same"),
            nn.ReLU(),
            nn.BatchNorm2d(c_out),
        ]
        if pool_after:
            layers.append(nn.MaxPool2d(3, stride=2))

    return nn.Sequential(*layers)

In [7]:
class Net(nn.Module):
    """Two-branch emotion classifier.

    A subject branch and a context branch (both built by ``net_branch``) are
    applied to their respective images; the flattened features are
    concatenated and passed through a small fully connected head that ends in
    a sigmoid, giving one independent probability per emotion category.
    """

    def __init__(self, subject_size=(50, 50), context_size=(200, 200)):
        """
        Args:
            subject_size: (height, width) of the subject images fed to
                ``forward``.
            context_size: (height, width) of the context images fed to
                ``forward``.

        The first fusion layer's input width is derived from these sizes.
        With the defaults it equals 645632, the value that used to be
        hard-coded, so ``Net()`` builds the same architecture as before.
        """
        super().__init__()

        self.subject = net_branch()
        self.context = net_branch()

        fused_features = (
            self._flat_features(self.subject, subject_size)
            + self._flat_features(self.context, context_size)
        )

        self.fusion = nn.Sequential(
            nn.Linear(fused_features, 256),
            nn.ReLU(),
            nn.Linear(256, len(emotions)),
            nn.Sigmoid()
        )

    @staticmethod
    def _flat_features(branch, size):
        """Return the flattened per-sample feature count ``branch`` yields for ``size`` inputs."""
        was_training = branch.training
        # Probe in eval mode so BatchNorm running statistics are not touched.
        branch.eval()
        with torch.no_grad():
            probe = torch.zeros(1, 3, *size)
            n_features = torch.flatten(branch(probe), start_dim=1).shape[1]
        branch.train(was_training)
        return n_features

    def forward(self, s, c):
        """Map a batch of subject images ``s`` and context images ``c`` to
        per-emotion probabilities of shape (batch, len(emotions))."""
        s = self.subject(s)
        s = torch.flatten(s, start_dim=1)

        c = self.context(c)
        c = torch.flatten(c, start_dim=1)

        # Late fusion: concatenate both feature vectors along the feature axis.
        x = torch.cat((s, c), dim=1)
        x = self.fusion(x)
        return x

In [8]:
# Model instance used by the training cell below.
emotic_net = Net()

In [12]:
# Binary cross-entropy on the sigmoid outputs, optimised with plain SGD.
loss_fn = nn.BCELoss()
optimizer = torch.optim.SGD(emotic_net.parameters(), lr=.01)
size = len(train_dataloader.dataset)

# One pass over the training loader, reporting the loss after every batch.
emotic_net.train()
for batch, (S, C, y) in enumerate(train_dataloader):
    # Forward pass and loss for this batch.
    loss = loss_fn(emotic_net(S, C), y)

    # Backward pass, parameter update, then clear gradients for the next step.
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    # Samples consumed before this batch, and the batch loss as a plain float.
    current = batch * batch_size
    loss = loss.item()
    print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

loss: 0.399846  [    0/12915]
loss: 0.315113  [   16/12915]
loss: 0.405735  [   32/12915]
loss: 0.824355  [   48/12915]
loss: 0.539643  [   64/12915]
loss: 0.383517  [   80/12915]
loss: 0.489162  [   96/12915]
loss: 0.273486  [  112/12915]
loss: 0.449190  [  128/12915]


KeyboardInterrupt: 

In [9]:
# Pull a single batch (subject images, context images, labels) for inspection.
data = next(iter(train_dataloader))

In [10]:
# Forward the probe batch through the model. The original call used `n`,
# a name no cell in this notebook defines (leftover kernel state), so it
# raised NameError on a fresh run.
out = emotic_net(data[0], data[1])

In [12]:
# Shape of the context-image batch (second element of each dataset sample).
data[1].shape

torch.Size([16, 3, 200, 200])

In [13]:
# Shape of the model output for the probe batch.
out.shape

torch.Size([16, 26])

In [14]:
# Un-reduced BCE so the loss of each individual output element can be inspected.
# Note: this rebinds `loss`, which the training cell also uses for its batch loss.
loss = nn.BCELoss(reduction='none')

In [12]:
# dtype of the label batch (third element of each dataset sample).
data[2].dtype

torch.float32

In [15]:
# Element-wise loss between the predictions and the labels of the probe batch.
loss(out, data[2])

tensor([[0.5698, 0.7862, 0.7664, 0.6679, 0.5959, 0.7587, 0.7810, 0.5231, 0.5032,
         0.6645, 0.7654, 0.5479, 0.6572, 0.7612, 0.7904, 0.7040, 0.7348, 0.6104,
         0.5603, 0.6832, 0.6952, 0.9693, 0.5510, 0.5029, 0.7823, 0.6094],
        [0.5439, 0.5876, 0.2667, 1.1274, 0.3508, 0.7186, 1.2768, 1.1940, 0.8091,
         0.8100, 0.5317, 0.5784, 0.4584, 0.8248, 0.5248, 0.4268, 0.8264, 0.4869,
         0.3836, 1.1287, 0.8039, 0.8637, 0.5582, 0.3649, 0.9937, 0.5323],
        [0.5015, 0.4972, 0.6327, 0.6324, 0.5582, 0.7617, 0.6780, 0.7400, 0.4364,
         0.7839, 0.6695, 0.9016, 0.5458, 0.8100, 0.7402, 0.6117, 0.6556, 0.7592,
         0.4259, 0.6794, 0.7276, 0.7482, 0.5616, 0.4885, 1.0681, 0.6824],
        [0.6377, 0.4700, 0.4240, 0.6734, 0.8766, 0.8690, 0.7802, 0.6464, 0.5150,
         0.4866, 0.6999, 0.5250, 0.6585, 0.7404, 0.8748, 0.6310, 0.5452, 0.5402,
         0.6032, 0.7431, 0.7918, 0.9087, 0.7571, 0.4751, 0.8918, 0.8567],
        [0.4619, 0.4957, 0.6861, 0.7400, 0.6055, 0.6578,

In [27]:
# Names of the categories whose predicted score for sample 0 exceeds 0.05.
emotions[out[0] > 0.05]

array(['Affection', 'Fatigue', 'Happiness', 'Sensitivity'], dtype='<U15')

In [18]:
# Repeat the category names along the batch axis so they line up with `out`.
np.broadcast_to(emotions, out.shape)

array([['Affection', 'Anger', 'Annoyance', 'Anticipation', 'Aversion',
        'Confidence', 'Disapproval', 'Disconnection', 'Disquietment',
        'Doubt/Confusion', 'Embarrassment', 'Engagement', 'Esteem',
        'Excitement', 'Fatigue', 'Fear', 'Happiness', 'Pain', 'Peace',
        'Pleasure', 'Sadness', 'Sensitivity', 'Suffering', 'Surprise',
        'Sympathy', 'Yearning'],
       ['Affection', 'Anger', 'Annoyance', 'Anticipation', 'Aversion',
        'Confidence', 'Disapproval', 'Disconnection', 'Disquietment',
        'Doubt/Confusion', 'Embarrassment', 'Engagement', 'Esteem',
        'Excitement', 'Fatigue', 'Fear', 'Happiness', 'Pain', 'Peace',
        'Pleasure', 'Sadness', 'Sensitivity', 'Suffering', 'Surprise',
        'Sympathy', 'Yearning'],
       ['Affection', 'Anger', 'Annoyance', 'Anticipation', 'Aversion',
        'Confidence', 'Disapproval', 'Disconnection', 'Disquietment',
        'Doubt/Confusion', 'Embarrassment', 'Engagement', 'Esteem',
        'Excitement', '

In [33]:
# Sample index to inspect and the score cut-off applied to its predictions.
idx, thres = 9, 0.7

# Category names whose predicted score for that sample is above the cut-off.
emotions[out[idx] > thres]

array([], dtype='<U15')