In [1]:
import torch
import numpy as np
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision.io import read_image, write_jpeg
import torchvision.models as models
import torchvision.transforms as transforms
from torchvision.transforms.functional import crop
from torcheval import metrics

import scipy.io as scio

In [2]:
emotions = np.array(['Affection', 'Anger', 'Annoyance', 'Anticipation', 'Aversion', 'Confidence', 'Disapproval', 'Disconnection',
                     'Disquietment', 'Doubt/Confusion', 'Embarrassment', 'Engagement', 'Esteem', 'Excitement', 'Fatigue', 'Fear',
                     'Happiness', 'Pain', 'Peace', 'Pleasure', 'Sadness', 'Sensitivity', 'Suffering', 'Surprise', 'Sympathy', 'Yearning'])

emotion_encode = {e: i for i, e in enumerate(emotions)}
emotion_decode = {i: e for e, i in emotion_encode.items()}

In [3]:
class EmoticDataset(Dataset):
    def __init__(self, data_type, subject_size, context_size, anns_dir, img_dir, alex=False):
        anns = scio.loadmat(anns_dir)[data_type]
        self.anns = np.fromiter(filter(lambda x: x["folder"].item() != "framesdb/images", iter(anns[0])), dtype=anns.dtype)
        self.img_dir = img_dir
        self.subject_transform = transforms.Resize(subject_size)

        if alex:
            self.context_transform = transforms.Compose([
                transforms.Resize((256,256)),
                transforms.CenterCrop(224),
                transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
            ])
            
        else:
            self.context_transform = transforms.Resize(context_size)
            

    def __len__(self):
        return self.anns.size

    def __getitem__(self, idx):
        ann = self.anns[idx]

        img_loc = "../data/cvpr_emotic/" + ann["folder"].item() + '/' + ann["filename"].item()
        context_img = read_image(img_loc)
        
        bbox =  ann["person"]["body_bbox"][0][0][0].astype(int)
        subject_img = crop(context_img, bbox[1], bbox[0], bbox[3] - bbox[1], bbox[2] - bbox[0])

        label = np.zeros(len(emotions), dtype=np.float32)
        ems = [i.item() for i in ann["person"]["annotations_categories"][0][0][0][0][0][0]]
        for e in ems:
            label[emotion_encode[e]] = 1.
            
        if subject_img.shape[0] == 1:
            subject_img = subject_img.repeat(3, 1, 1)
            
        if context_img.shape[0] == 1:
            context_img = context_img.repeat(3, 1, 1)

        subject_img = self.subject_transform(subject_img.float())
        context_img = self.context_transform(context_img.float())

        return subject_img, context_img, label

In [4]:
def net_branch():
    return nn.Sequential(
        nn.Conv2d(3, 96, (11, 1), stride=(4, 1)),
        nn.ReLU(),
        nn.BatchNorm2d(96),
        nn.Conv2d(96, 96, (1, 11), stride=(1, 4)),
        nn.ReLU(),
        nn.BatchNorm2d(96),
        nn.MaxPool2d(3, stride=2),
        
        nn.Conv2d(96, 256, (1, 5), padding="same"),
        nn.ReLU(),
        nn.BatchNorm2d(256),
        nn.Conv2d(256, 256, (5, 1), padding="same"),
        nn.ReLU(),
        nn.BatchNorm2d(256),
        nn.MaxPool2d(3, stride=2),
        
        nn.Conv2d(256, 384, (1, 3), padding="same"),
        nn.ReLU(),
        nn.BatchNorm2d(384),
        nn.Conv2d(384, 384, (3, 1), padding="same"),
        nn.ReLU(),
        nn.BatchNorm2d(384),
        
        nn.Conv2d(384, 384, (1, 3), padding="same"),
        nn.ReLU(),
        nn.BatchNorm2d(384),
        nn.Conv2d(384, 384, (3, 1), padding="same"),
        nn.ReLU(),
        nn.BatchNorm2d(384),
        
        nn.Conv2d(384, 256, (1, 3), padding="same"),
        nn.ReLU(),
        nn.BatchNorm2d(256),
        nn.Conv2d(256, 256, (3, 1), padding="same"),
        nn.ReLU(),
        nn.BatchNorm2d(256),
        nn.MaxPool2d(3, stride=2)
    )

In [46]:
class Net(nn.Module):
    def __init__(self):
        super().__init__()

        self.subject = net_branch()
        self.context = net_branch()
        
        self.fusion = nn.Sequential(
            nn.Linear(12800, 256),
            nn.ReLU(),
            nn.Linear(256, 1024),
            nn.ReLU(),
            nn.Linear(1024, len(emotions)),
            nn.Sigmoid()
        )

    def forward(self, s, c):
        s = self.subject(s)
        s = torch.flatten(s, start_dim=1)
        
        c = self.context(c)
        c = torch.flatten(c, start_dim=1)

        x = torch.cat((s, c), dim=1)
        x = self.fusion(x)
        return x

In [6]:
class PretrainedNet(nn.Module):
    def __init__(self):
        super().__init__()

        self.subject = net_branch()

        alexnet_file = "../models/alexnet_places365.pth.tar"
        checkpoint = torch.load(alexnet_file, map_location=lambda storage, loc: storage, weights_only=False)
        state_dict = {str.replace(k,'module.',''): v for k,v in checkpoint['state_dict'].items()}

        self.context = models.__dict__["alexnet"](num_classes=365)
        self.context.load_state_dict(state_dict)

        for param in self.context.parameters():
            param.requires_grad = False
        
        self.fusion = nn.Sequential(
            nn.Linear(15616, 256),
            nn.ReLU(),
            nn.Linear(256, 1024),
            nn.ReLU(),
            nn.Linear(1024, len(emotions)),
            nn.Sigmoid()
        )

    def forward(self, s, c):
        s = self.subject(s)
        s = torch.flatten(s, start_dim=1)
        
        c = self.context.features(c)
        c = torch.flatten(c, start_dim=1)

        x = torch.cat((s, c), dim=1)
        x = self.fusion(x)
        return x

In [7]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "cpu"
)

In [8]:
subject_size = (224, 224)
context_size = (224, 224)

train_batch_size = 52
val_batch_size = 52
test_batch_size = 52
num_epochs = 500
learning_rate = .01

In [37]:
alex_data = False

train_data = EmoticDataset("train", subject_size, context_size, "../data/Annotations/Annotations.mat", "../data/cvpr_emotic/", alex=alex_data)
train_dataloader = DataLoader(train_data, batch_size=train_batch_size)

val_data = EmoticDataset("val", subject_size, context_size, "../data/Annotations/Annotations.mat", "../data/cvpr_emotic/", alex=alex_data)
val_dataloader = DataLoader(val_data, batch_size=val_batch_size)

test_data = EmoticDataset("test", subject_size, context_size, "../data/Annotations/Annotations.mat", "../data/cvpr_emotic/", alex=alex_data)
test_dataloader = DataLoader(test_data, batch_size=test_batch_size)

train_size = len(train_dataloader.dataset)
val_size = len(val_dataloader.dataset)
test_size = len(test_dataloader.dataset)

In [36]:
loss_fn = nn.BCELoss()
metric = metrics.MultilabelAUPRC(num_labels=len(emotions), device=device)

In [28]:
emotic_net = Net().to(device)

In [191]:
emotic_net = PretrainedNet().to(device)

In [26]:
sum(np.prod(i.data.shape) for i in emotic_net.parameters() if i.requires_grad)

np.int64(8920666)

In [192]:
optimizer = torch.optim.SGD(emotic_net.parameters(), lr=learning_rate)

In [None]:
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")

    emotic_net.train()
    for batch, (S, C, y) in enumerate(train_dataloader):
        S, C, y = S.to(device), C.to(device), y.to(device)
        pred = emotic_net(S, C)
        loss = loss_fn(pred, y)
    
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    
        loss, current = loss.item(), min((batch + 1) * train_batch_size, train_size)
        print(f"loss: {loss:>7f}  [{current:>5d}/{train_size:>5d}]", end="\r")
        
    print()

    torch.save(emotic_net.state_dict(), "../models/basic.pth")
    
    metric.reset()
    emotic_net.eval()
    for batch, (S, C, y) in enumerate(val_dataloader):
        with torch.no_grad():
            S, C, y = S.to(device), C.to(device), y.to(device)
            pred = emotic_net(S, C)
            metric.update(pred, y)
    
            current = min((batch + 1) * val_batch_size, val_size)
            print(f"calculating metric for:  [{current:>4d}/{val_size:>4d}]", end="\r")
    
    print()
    print(f"metric: {metric.compute() * 100:>7f} %\n")

In [47]:
load_emotic_net = Net().to(device)
load_emotic_net.load_state_dict(torch.load("../models/basic_trained.pth", weights_only=True))

<All keys matched successfully>

In [53]:
def compute_metrics(emotic_net, metric, dataloader, batch_size, size):
    metric.reset()
    emotic_net.eval()
    for batch, (S, C, y) in enumerate(dataloader):
        with torch.no_grad():
            S, C, y = S.to(device), C.to(device), y.to(device)
            pred = emotic_net(S, C)
            metric.update(pred, y)
    
            current = min((batch + 1) * batch_size, size)
            print(f"calculating metric for:  [{current:>4d}/{size:>4d}]", end="\r")
    
    print()
    return metric.compute()

In [91]:
mets = compute_metrics(load_emotic_net, metric, test_dataloader, test_batch_size, test_size)
print(f"metric: {mets * 100} %")

calculating metric for:  [3682/3682]
metric: 13.611191749572754 %



In [96]:
emotic_net

Net(
  (subject): Sequential(
    (0): Conv2d(3, 96, kernel_size=(11, 1), stride=(4, 1))
    (1): ReLU()
    (2): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Conv2d(96, 96, kernel_size=(1, 11), stride=(1, 4))
    (4): ReLU()
    (5): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (7): Conv2d(96, 256, kernel_size=(1, 5), stride=(1, 1), padding=same)
    (8): ReLU()
    (9): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): Conv2d(256, 256, kernel_size=(5, 1), stride=(1, 1), padding=same)
    (11): ReLU()
    (12): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (13): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (14): Conv2d(256, 384, kernel_size=(1, 3), stride=(1, 1), padding=same)
    (15): ReLU()
    (16): Batch

In [95]:
class_metrics = compute_metrics(load_emotic_net, metrics.MultilabelAUPRC(num_labels=len(emotions), device=device, average=None), test_dataloader, test_batch_size, test_size)

for e, m in zip(emotions, class_metrics):
    print(f"{e:16}: {m*100:>6.3f} %")

calculating metric for:  [3682/3682]
Affection       : 10.691 %
Anger           : 15.148 %
Annoyance       : 10.015 %
Anticipation    : 37.276 %
Aversion        :  3.424 %
Confidence      : 40.458 %
Disapproval     :  6.154 %
Disconnection   :  6.613 %
Disquietment    :  5.648 %
Doubt/Confusion :  6.024 %
Embarrassment   :  1.495 %
Engagement      : 56.543 %
Esteem          :  5.682 %
Excitement      : 40.755 %
Fatigue         :  4.948 %
Fear            :  2.251 %
Happiness       : 36.746 %
Pain            :  3.882 %
Peace           : 11.179 %
Pleasure        : 17.286 %
Sadness         :  7.964 %
Sensitivity     :  3.869 %
Suffering       :  7.226 %
Surprise        :  3.193 %
Sympathy        :  6.441 %
Yearning        :  2.981 %


In [94]:
for e, m in sorted(zip(emotions, class_metrics), key=lambda x: -x[1]):
    print(f"{e:16}: {m*100:>6.3f} %")

Engagement      : 56.543 %
Excitement      : 40.755 %
Confidence      : 40.458 %
Anticipation    : 37.276 %
Happiness       : 36.746 %
Pleasure        : 17.286 %
Anger           : 15.148 %
Peace           : 11.179 %
Affection       : 10.691 %
Annoyance       : 10.015 %
Sadness         :  7.964 %
Suffering       :  7.226 %
Disconnection   :  6.613 %
Sympathy        :  6.441 %
Disapproval     :  6.154 %
Doubt/Confusion :  6.024 %
Esteem          :  5.682 %
Disquietment    :  5.648 %
Fatigue         :  4.948 %
Pain            :  3.882 %
Sensitivity     :  3.869 %
Aversion        :  3.424 %
Surprise        :  3.193 %
Yearning        :  2.981 %
Fear            :  2.251 %
Embarrassment   :  1.495 %


In [39]:
idx = 1325
data = train_data[idx]
data = (data[0].unsqueeze(0), data[1].unsqueeze(0), torch.tensor(data[2]).unsqueeze(0))

In [42]:
out = load_emotic_net(data[0].to(device), data[1].to(device))

In [45]:
out.shape

AttributeError: 'tuple' object has no attribute 'shape'

In [96]:
out.shape

torch.Size([1, 26])

In [97]:
loss_fn(out, data[2].to(device))

tensor(24.7358, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)

In [98]:
emotions[out.cpu()[0] > 0.5]

array(['Anticipation', 'Aversion', 'Disapproval', 'Disconnection',
       'Doubt/Confusion', 'Embarrassment', 'Esteem', 'Fatigue', 'Fear',
       'Happiness', 'Suffering'], dtype='<U15')

In [99]:
emotions[data[2][0] > 0.5]

array(['Excitement', 'Pleasure'], dtype='<U15')