In [1]:
import imageio
import matplotlib.pyplot as plt
from mlxtend.image import extract_face_landmarks
import cv2

In [2]:
import glob
import numpy as np
import torch
import os
import tqdm
import re
import pickle
import random


class CustomDataSet(torch.utils.data.Dataset):

    def __init__(self, type_="train"):
        self.type_ = type_
        
        all_files = list(set([re.findall(r"(.+)_(\d+)_(.+)_(\d+)_(\d+)_.+", filename)[0] for filename in os.listdir(f"/media/soroushh/Storage2/matrices/evaluation")]))
        if self.type_ == "train":
            all_files = all_files[:int((2/3) * len(all_files))]
        else:
            all_files = all_files[int((2/3) * len(all_files)):]
        
        self.final_database = []
        for person1_name, triple_to_reconstruct_index, person2_name, triple_to_change_input_index, triple_to_change_output_index in all_files:
            triple_to_reconstruct_index = int(triple_to_reconstruct_index)
            triple_to_change_input_index = int(triple_to_change_input_index)
            triple_to_change_output_index = int(triple_to_change_output_index)
            self.final_database.append((person1_name, triple_to_reconstruct_index, person2_name, triple_to_change_input_index, triple_to_change_output_index))
            
        with open(f'/media/soroushh/Storage2/database_evaluation.pickle', 'rb') as handle:
            self.database = pickle.load(handle)
        
        ids = list(self.database.keys())
        self.ids_dict = {id:idx for idx, id in enumerate(list(self.database.keys()))}
        self.emotions_dict = {emo:idx for idx, emo in enumerate(np.unique([triplet[0] for id in ids for triplet in self.database[id]]).tolist())}
        self.poses_dict = {pose:idx for idx, pose in enumerate(np.unique([triplet[1] for id in ids for triplet in self.database[id]]).tolist())}

    def __len__(self):
        return len(self.final_database)
            
    def __getitem__(self, idx):
        tup = self.final_database[idx]
        
        with open(f'/media/soroushh/Storage2/matrices/evaluation/{tup[0]}_{tup[1]}_{tup[2]}_{tup[3]}_{tup[4]}_pose_img_to_reconstruct.npy', 'rb') as f:
            pose_img_to_reconstruct = np.load(f)
            
        with open(f'/media/soroushh/Storage2/matrices/evaluation/{tup[0]}_{tup[1]}_{tup[2]}_{tup[3]}_{tup[4]}_embedding_input.npy', 'rb') as f:
            embedding_input = np.load(f)
            
        with open(f'/media/soroushh/Storage2/matrices/evaluation/{tup[0]}_{tup[1]}_{tup[2]}_{tup[3]}_{tup[4]}_embedding_output.npy', 'rb') as f:
            embedding_output = np.load(f)
            
        expected_pose_label = self.database[tup[2]][tup[4]][1]
        input_pose_label = self.database[tup[2]][tup[3]][1]
        input_id = tup[2]
        input_emo = self.database[tup[2]][tup[3]][0]
        
        while True:
            available_negative_choices = list(filter(lambda item: item[2] != input_id, self.final_database))
            if len(available_negative_choices) == 0:
                continue
                
            neg_tup = random.choice(available_negative_choices)
            break
            
        with open(f'/media/soroushh/Storage2/matrices/evaluation/{neg_tup[0]}_{neg_tup[1]}_{neg_tup[2]}_{neg_tup[3]}_{neg_tup[4]}_embedding_input.npy', 'rb') as f:
            negative_embedding_input = np.load(f)
            
        expected_pose_label = self.poses_dict[expected_pose_label]
        input_pose_label = self.poses_dict[input_pose_label]
        input_id = self.ids_dict[input_id]
        input_emo = self.emotions_dict[input_emo]
            
        return pose_img_to_reconstruct, embedding_input, embedding_output, expected_pose_label, input_pose_label, input_id, input_emo

In [3]:
image_paths = glob.glob(os.path.join("./KDEF/*/*.JPG"))
image_paths = np.array(image_paths)
np.random.shuffle(image_paths)
image_paths = image_paths.tolist()

index = int(0.9 * len(image_paths))
# trainset = image_paths[:index]
valset = image_paths[index:]

In [4]:
# train_dataset = CustomDataSet(trainset, type_="train", precomputerd=True)
val_dataset = CustomDataSet(type_="val")

In [5]:
# len(train_dataset), 
len(val_dataset)

10001

In [6]:
from torch import nn


def img_to_patch(x, patch_size, flatten_channels=True):
    """
    Inputs:
        x - torch.Tensor representing the image of shape [B, C, H, W]
        patch_size - Number of pixels per dimension of the patches (integer)
        flatten_channels - If True, the patches will be returned in a flattened format
                           as a feature vector instead of a image grid.
    """
    B, C, H, W = x.shape
    x = x.reshape(B, C, H//patch_size, patch_size, W//patch_size, patch_size)
    x = x.permute(0, 2, 4, 1, 3, 5) # [B, H', W', C, p_H, p_W]
    x = x.flatten(1,2)              # [B, H'*W', C, p_H, p_W]
    if flatten_channels:
        x = x.flatten(2,4)          # [B, H'*W', C*p_H*p_W]
    return x


class AttentionBlock(nn.Module):

    def __init__(self, embed_dim, hidden_dim, num_heads, dropout=0.0):
        """
        Inputs:
            embed_dim - Dimensionality of input and attention feature vectors
            hidden_dim - Dimensionality of hidden layer in feed-forward network
                         (usually 2-4x larger than embed_dim)
            num_heads - Number of heads to use in the Multi-Head Attention block
            dropout - Amount of dropout to apply in the feed-forward network
        """
        super().__init__()

        self.layer_norm_1 = nn.LayerNorm(embed_dim)
        self.attn = nn.MultiheadAttention(embed_dim, num_heads)
        self.layer_norm_2 = nn.LayerNorm(embed_dim)
        self.linear = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, embed_dim),
            nn.Dropout(dropout)
        )


    def forward(self, x):
        inp_x = self.layer_norm_1(x)
        x = x + self.attn(inp_x, inp_x, inp_x)[0]
        x = x + self.linear(self.layer_norm_2(x))
        return x


class VisionTransformer(nn.Module):

    def __init__(self, embed_dim, hidden_dim, num_channels, num_heads, num_layers, num_classes, patch_size, num_patches, dropout=0.0):
        """
        Inputs:
            embed_dim - Dimensionality of the input feature vectors to the Transformer
            hidden_dim - Dimensionality of the hidden layer in the feed-forward networks
                         within the Transformer
            num_channels - Number of channels of the input (3 for RGB)
            num_heads - Number of heads to use in the Multi-Head Attention block
            num_layers - Number of layers to use in the Transformer
            num_classes - Number of classes to predict
            patch_size - Number of pixels that the patches have per dimension
            num_patches - Maximum number of patches an image can have
            dropout - Amount of dropout to apply in the feed-forward network and
                      on the input encoding
        """
        super().__init__()

        self.patch_size = patch_size

        # Layers/Networks
        self.input_layer = nn.Linear(num_channels*(patch_size**2), embed_dim)
        self.transformer = nn.Sequential(*[AttentionBlock(embed_dim, hidden_dim, num_heads, dropout=dropout) for _ in range(num_layers)])
        self.mlp_head = nn.Sequential(
            nn.LayerNorm(embed_dim),
            nn.Linear(embed_dim, num_classes)
        )
        self.dropout = nn.Dropout(dropout)

        # Parameters/Embeddings
        self.cls_token = nn.Parameter(torch.randn(1,1,embed_dim))
        self.pos_embedding = nn.Parameter(torch.randn(1,1+num_patches,embed_dim))


    def forward(self, x):
        # Preprocess input
        x = img_to_patch(x, self.patch_size)
        B, T, _ = x.shape
        x = self.input_layer(x)

        # Add CLS token and positional encoding
        cls_token = self.cls_token.repeat(B, 1, 1)
        x = torch.cat([cls_token, x], dim=1)
        x = x + self.pos_embedding[:,:T+1]

        # Apply Transforrmer
        x = self.dropout(x)
        x = x.transpose(0, 1)
        x = self.transformer(x)

        # Perform classification prediction
        cls = x[0]
        out = self.mlp_head(cls)
        # out = cls
        
        return out

In [7]:
import math
import torch.nn.functional as F


# class Reshape(torch.nn.Module):
    
#     def __init__(self, *args):
#         super(Reshape, self).__init__()
#         self.shape = args

#     def forward(self, x):
#         return x.view(self.shape)

    
# def scaled_dot_product(q, k, v, mask=None):
#     d_k = q.size()[-1]
#     attn_logits = torch.matmul(q, k.transpose(-2, -1))
#     attn_logits = attn_logits / math.sqrt(d_k)
#     if mask is not None:
#         attn_logits = attn_logits.masked_fill(mask == 0, -9e15)
        
#     attention = F.softmax(attn_logits, dim=-1)
#     values = torch.matmul(attention, v)
    
#     return values, attention


# class MultiheadAttention(torch.nn.Module):

#     def __init__(self, input_dim, embed_dim, num_heads):
#         super().__init__()
#         assert embed_dim % num_heads == 0, "Embedding dimension must be 0 modulo number of heads."

#         self.embed_dim = embed_dim
#         self.num_heads = num_heads
#         self.head_dim = embed_dim // num_heads

#         # Stack all weight matrices 1...h together for efficiency
#         # Note that in many implementations you see "bias=False" which is optional
#         self.qkv_proj = torch.nn.Linear(input_dim, 3*embed_dim)
#         self.o_proj = torch.nn.Linear(embed_dim, embed_dim)

#         self._reset_parameters()

#     def _reset_parameters(self):
#         # Original Transformer initialization, see PyTorch documentation
#         torch.nn.init.xavier_uniform_(self.qkv_proj.weight)
#         self.qkv_proj.bias.data.fill_(0)
#         torch.nn.init.xavier_uniform_(self.o_proj.weight)
#         self.o_proj.bias.data.fill_(0)

#     def forward(self, x, mask=None, return_attention=False):
#         batch_size, seq_length, embed_dim = x.size()
#         qkv = self.qkv_proj(x)

#         # Separate Q, K, V from linear output
#         qkv = qkv.reshape(batch_size, seq_length, self.num_heads, 3*self.head_dim)
#         qkv = qkv.permute(0, 2, 1, 3) # [Batch, Head, SeqLen, Dims]
#         q, k, v = qkv.chunk(3, dim=-1)

#         # Determine value outputs
#         values, attention = scaled_dot_product(q, k, v, mask=mask)
#         values = values.permute(0, 2, 1, 3) # [Batch, SeqLen, Head, Dims]
#         values = values.reshape(batch_size, seq_length, embed_dim)
#         o = self.o_proj(values)

#         if return_attention:
#             return o, attention
#         else:
#             return o


class EmbeddingGeneratorDecoder(torch.nn.Module):
    
    def __init__(self):
        super(EmbeddingGeneratorDecoder, self).__init__()

        # self.downsample1 = torch.nn.Conv1d(2, 8, 3, stride=1, padding=1)
        # self.downsample2 = torch.nn.Conv1d(8, 64, 3, stride=1, padding=1)
        
        # self.multihead_attention = MultiheadAttention(input_dim=128, embed_dim=128, num_heads=4)
        
        self.vit = VisionTransformer(**{
                                        'embed_dim': 128,
                                        'hidden_dim': 256,
                                        'num_heads': 4,
                                        'num_layers': 4,
                                        'patch_size': 8,
                                        'num_channels': 1,
                                        'num_patches': 32,
                                        'num_classes': 512,
                                        'dropout': 0.2
                                    })
        
        self.normalizer = nn.LayerNorm(1024)
        
#         self.upsample1 = torch.nn.Conv1d(64, 8, 3, stride=1, padding=1)
#         self.upsample2 = torch.nn.Conv1d(8, 1, 3, stride=1, padding=1)
        
#         self.upsample_block = torch.nn.Upsample(scale_factor=2)
        
        # self.batchnorm1 = torch.nn.BatchNorm1d(1)
        # self.batchnorm8 = torch.nn.BatchNorm1d(8)
        # self.batchnorm64 = torch.nn.BatchNorm1d(64)
        
        # self.features = torch.nn.Sequential(
        #     torch.nn.Linear(512, 512), 
        #     torch.nn.BatchNorm1d(512),
        # )
        
#     def feature_extraction_downsample(self, x): # n, 2, 512
#         skip_connections = []
        
#         x = self.downsample1(x) # n, 8, 512
#         # x = self.batchnorm8(x)
#         x = F.dropout(x, p=0.4)
#         x = F.relu(x)
#         skip_connections.append(x)
#         x = F.max_pool1d(x, kernel_size=2) # n, 8, 256 
#         # x = F.avg_pool1d(x, kernel_size=2) # n, 8, 256 
        
#         x = self.downsample2(x) # n, 64, 256
#         # x = self.batchnorm64(x)
#         x = F.dropout(x, p=0.4)
#         x = F.relu(x)
#         skip_connections.append(x)
#         x = F.max_pool1d(x, kernel_size=2) # n, 64, 128
#         # x = F.avg_pool1d(x, kernel_size=2) # n, 8, 256
        
#         return x, skip_connections
    
#     def feature_extraction_upsample(self, x, skip_connections): # n, 64, 128
#         skip_connections = list(reversed(skip_connections))
        
#         x = self.upsample_block(x) # n, 64, 256
#         x = x + skip_connections[0]
#         x = self.upsample1(x) # n, 8, 256
#         # x = self.batchnorm8(x)
#         x = F.dropout(x, p=0.4)
#         x = F.relu(x)
        
#         x = self.upsample_block(x) # n, 8, 512
#         x = x + skip_connections[1]
#         x = self.upsample2(x) # n, 1, 512
#         # x = self.batchnorm1(x)
#         x = F.dropout(x, p=0.4)
#         x = F.relu(x)
        
#         return x
    
    def forward(self, emb1, emb2):
#         emb1 = emb1.unsqueeze(1)
#         emb2 = emb2.unsqueeze(1)
        
#         comb = torch.cat([emb1, emb2], 1)
#         # comb = comb.unsqueeze(1)
#         comb, skip_connections = self.feature_extraction_downsample(comb)
#         # att_comb = self.multihead_attention(comb)
#         att_comb = comb
#         att_comb = self.feature_extraction_upsample(att_comb, skip_connections)
#         att_comb = att_comb.squeeze(1)
#         embedding = self.features(att_comb)
        
        comb = torch.cat([emb1, emb2], 1)
        comb = self.normalizer(comb)
        comb = torch.reshape(comb, (-1, 32, 32))
        comb = comb.unsqueeze(1)
        embedding = self.vit(comb)

        # comb = comb.unsqueeze(1)
        # comb, skip_connections = self.feature_extraction_downsample(comb)
        # # att_comb = self.multihead_attention(comb)
        # att_comb = comb
        # att_comb = self.feature_extraction_upsample(att_comb, skip_connections)
        # att_comb = att_comb.squeeze(1)
        
        # embedding = self.features(att_comb)
        
        return embedding


class ConvAutoencoder(torch.nn.Module):
    
    def __init__(self):
        super(ConvAutoencoder, self).__init__()
        
        self.decoder_emb_generator = EmbeddingGeneratorDecoder()
        
        self.encoder_layer1 = torch.nn.Conv2d(1, 4, 3, stride=1, padding=1)
        self.encoder_layer2 = torch.nn.Conv2d(4, 16, 3, stride=1, padding=1)
        self.encoder_layer3 = torch.nn.Conv2d(16, 64, 3, stride=1, padding=1)
        self.encoder_layer4 = torch.nn.Conv2d(64, 128, 3, stride=1, padding=1)
        self.encoder_layer5 = torch.nn.Conv2d(128, 256, 3, stride=1, padding=1)
        self.encoder_layer6 = torch.nn.Conv2d(256, 512, 3, stride=1, padding=1)
        
        self.upsample2 = torch.nn.Upsample(scale_factor=2, mode='nearest')
        self.upsample3 = torch.nn.Upsample(scale_factor=3, mode='nearest')
        self.upsample4 = torch.nn.Upsample(scale_factor=4, mode='nearest')
        
        self.decoder_layer1 = torch.nn.Conv2d(512, 256, 2, stride=3, padding=2)
        self.decoder_layer2 = torch.nn.Conv2d(256, 128, 3, stride=1, padding=0)
        self.decoder_layer3 = torch.nn.Conv2d(128, 64, 3, stride=1, padding=1)
        self.decoder_layer4 = torch.nn.Conv2d(64, 16, 3, stride=1, padding=1)
        self.decoder_layer5 = torch.nn.Conv2d(16, 4, 3, stride=1, padding=1)
        self.decoder_layer6 = torch.nn.Conv2d(4, 1, 3, stride=1, padding=1)
        
        # self.bn8 = torch.nn.BatchNorm2d(8)
        # self.bn64 = torch.nn.BatchNorm2d(64)
        # self.bn256 = torch.nn.BatchNorm2d(256)
        # self.bn512 = torch.nn.BatchNorm2d(512)
        
    def encoder(self, x): # 1, 112, 112
        x = self.encoder_layer1(x) # 4, 112, 112
        x = F.dropout(x, p=0.4)
        # x = self.bn8(x)
        x = F.relu(x)
        x = F.max_pool2d(x, kernel_size=2) # 4, 56, 56
        
        x = self.encoder_layer2(x) # 16, 56, 56
        x = F.dropout(x, p=0.4)
        # x = self.bn64(x)
        x = F.relu(x)
        x = F.max_pool2d(x, kernel_size=2) # 16, 28, 28

        x = self.encoder_layer3(x) # 64, 28, 28
        x = F.dropout(x, p=0.4)
        # x = self.bn256(x)
        x = F.relu(x)
        x = F.max_pool2d(x, kernel_size=2) # 64, 14, 14
        
        x = self.encoder_layer4(x) #128, 14, 14
        x = F.dropout(x, p=0.4)
        # x = self.bn512(x)
        x = F.relu(x)
        x = F.max_pool2d(x, kernel_size=2) # 128, 7, 7
        
        x = self.encoder_layer5(x) #256, 7, 7
        x = F.dropout(x, p=0.4)
        # x = self.bn512(x)
        x = F.relu(x)
        x = F.max_pool2d(x, kernel_size=2) # 256, 3, 3
        
        x = self.encoder_layer6(x) #512, 3, 3
        x = F.dropout(x, p=0.4)
        # x = self.bn512(x)
        x = F.relu(x)
        x = F.max_pool2d(x, kernel_size=2) # 512, 1, 1
        
        x = torch.reshape(x, (-1, 512))
        
        return x
        
    def decoder_pose_reconstructor(self, x):
        x = torch.reshape(x, (-1, 512, 1, 1))
        
        x = self.upsample4(x) # 512, 4, 4
        x = self.decoder_layer1(x) # 256, 3, 3
        x = F.dropout(x, p=0.4)
        # x = self.bn256(x)
        x = F.relu(x)
        
        x = self.upsample3(x) # 256, 9, 9
        x = self.decoder_layer2(x) # 128, 7, 7
        x = F.dropout(x, p=0.4)
        # x = self.bn256(x)
        x = F.relu(x)
        
        x = self.upsample2(x) # 128, 14, 14
        x = self.decoder_layer3(x) # 64, 14, 14
        x = F.dropout(x, p=0.4)
        # x = self.bn256(x)
        x = F.relu(x)
        
        x = self.upsample2(x) # 64, 28, 28
        x = self.decoder_layer4(x) # 16, 28, 28
        x = F.dropout(x, p=0.4)
        # x = self.bn64(x)
        x = F.relu(x)
        
        x = self.upsample2(x) # 16, 56, 56
        x = self.decoder_layer5(x) # 4, 56, 56
        x = F.dropout(x, p=0.4)
        # x = self.bn8(x)
        x = F.relu(x)
        
        x = self.upsample2(x) # 4, 112, 112
        x = self.decoder_layer6(x) # 1, 112, 112
        x = torch.sigmoid(x)
        
        return x
        
    def forward(self, pose_img, magface_embedding):
        coded = self.encoder(pose_img)
        reconstructed_pose = self.decoder_pose_reconstructor(coded)
        
        # Feature fusion
        generated_embedding = self.decoder_emb_generator(coded, magface_embedding)
        
        # fc_out = self.fc_head(generated_embedding)
        
#         id_predictions = self.id_head(fc_out)
#         id_predictions = F.softmax(id_predictions, dim=1)
        id_predictions = None
        
#         emo_predictions = self.emotion_head(fc_out)
#         emo_predictions = F.softmax(emo_predictions, dim=1)
        emo_predictions = None
        
        return reconstructed_pose, generated_embedding, (id_predictions, emo_predictions)

In [8]:
net = ConvAutoencoder()
net = net.to("cuda")
state_dict = torch.load("./augmentor.pt")
net.load_state_dict(state_dict)
net.eval()

ConvAutoencoder(
  (decoder_emb_generator): EmbeddingGeneratorDecoder(
    (vit): VisionTransformer(
      (input_layer): Linear(in_features=64, out_features=128, bias=True)
      (transformer): Sequential(
        (0): AttentionBlock(
          (layer_norm_1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
          )
          (layer_norm_2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
          (linear): Sequential(
            (0): Linear(in_features=128, out_features=256, bias=True)
            (1): GELU()
            (2): Dropout(p=0.2, inplace=False)
            (3): Linear(in_features=256, out_features=128, bias=True)
            (4): Dropout(p=0.2, inplace=False)
          )
        )
        (1): AttentionBlock(
          (layer_norm_1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAtte

In [9]:
from sklearn.preprocessing import LabelEncoder
import pickle
from sklearn.preprocessing import StandardScaler


objective1_traindata_embs = []
objective1_traindata_labels_pose = []
objective1_traindata_labels_id = []
objective1_traindata_labels_emo = []
objective1_valdata_embs = []
objective1_valdata_labels_pose = []
objective1_valdata_labels_id = []
objective1_valdata_labels_emo = []

objective2_traindata_embs = []
objective2_traindata_labels_pose = []
objective2_traindata_labels_id = []
objective2_traindata_labels_emo = []
objective2_valdata_embs = []
objective2_valdata_labels_pose = []
objective2_valdata_labels_id = []
objective2_valdata_labels_emo = []

objective3_traindata_embs = []
objective3_traindata_labels_pose = []
objective3_traindata_labels_id = []
objective3_traindata_labels_emo = []
objective3_valdata_embs = []
objective3_valdata_labels_pose = []
objective3_valdata_labels_id = []
objective3_valdata_labels_emo = []

for data in tqdm.tqdm(val_dataset):
    pose_img = data[0]
    pose_img = torch.Tensor(pose_img)
    pose_img = pose_img.unsqueeze(0)
    pose_img = pose_img.permute(0, 3, 1, 2)
    pose_img = pose_img.to("cuda").to(torch.float32)
    
    input_emb = data[1]
    objective1_traindata_embs.append(input_emb)
    objective2_traindata_embs.append(input_emb)
    objective3_traindata_embs.append(input_emb)
    input_emb = torch.Tensor(input_emb)
    input_emb = input_emb.unsqueeze(0)
    input_emb = input_emb.to("cuda").to(torch.float32)
    
    output_emb = data[2]
    output_emb = torch.Tensor(output_emb)
    output_emb = output_emb.unsqueeze(0)
    output_emb = output_emb.to("cuda").to(torch.float32)
    
    expected_pose_label = data[3]
    input_pose_label = data[4]
    objective1_traindata_labels_pose.append(input_pose_label)
    objective2_traindata_labels_pose.append(input_pose_label)
    objective3_traindata_labels_pose.append(input_pose_label)
    
    expected_id_label = data[5]
    objective1_traindata_labels_id.append(expected_id_label)
    objective2_traindata_labels_id.append(expected_id_label)
    objective3_traindata_labels_id.append(expected_id_label)
    
    expected_emo_label = data[6]
    objective1_traindata_labels_emo.append(expected_emo_label)
    objective2_traindata_labels_emo.append(expected_emo_label)
    objective3_traindata_labels_emo.append(expected_emo_label)
    
    # ensure to not mixing the train and validation sets
    output_emb = output_emb.detach().cpu().numpy()
    
    features = net.encoder(pose_img)
    generated_embedding = net.decoder_emb_generator(features, input_emb)
    
    generated_embedding = generated_embedding.detach().cpu().numpy()
    
    # objective 1: measuring how good separable are the representations
    objective1_valdata_embs.append(generated_embedding[0]) 
    objective1_valdata_labels_pose.append(expected_pose_label) 
    objective1_valdata_labels_id.append(expected_id_label) 
    objective1_valdata_labels_emo.append(expected_emo_label)
    
    # objective1_traindata_embs.append(output_emb[0])
    # objective1_traindata_labels_pose.append(expected_pose_label)
    # objective1_traindata_labels_id.append(expected_id_label)
    # objective1_traindata_labels_emo.append(expected_emo_label)
    
    # objective 2 and 3: measuring how good the generated representations improve our accuracy
    objective2_valdata_embs.append(output_emb[0])
    objective2_valdata_labels_pose.append(expected_pose_label)
    objective2_valdata_labels_id.append(expected_id_label)
    objective2_valdata_labels_emo.append(expected_emo_label)
    
    objective3_traindata_embs.append(generated_embedding[0])
    objective3_traindata_labels_pose.append(expected_pose_label)
    objective3_traindata_labels_id.append(expected_id_label)
    objective3_traindata_labels_emo.append(expected_emo_label)
    
    objective3_valdata_embs.append(output_emb[0])
    objective3_valdata_labels_pose.append(expected_pose_label)
    objective3_valdata_labels_id.append(expected_id_label)
    objective3_valdata_labels_emo.append(expected_emo_label)

100%|██████████| 10001/10001 [12:54<00:00, 12.91it/s]


In [10]:
print("objective1_traindata_labels_pose", np.unique(objective1_traindata_labels_pose, return_counts=True))
print("objective1_traindata_labels_id", np.unique(objective1_traindata_labels_id, return_counts=True))
print("objective1_traindata_labels_emo", np.unique(objective1_traindata_labels_emo, return_counts=True))

print("objective1_valdata_labels_pose", np.unique(objective1_valdata_labels_pose, return_counts=True))
print("objective1_valdata_labels_id", np.unique(objective1_valdata_labels_id, return_counts=True))
print("objective1_valdata_labels_emo", np.unique(objective1_valdata_labels_emo, return_counts=True))

objective1_traindata_labels_pose (array([0, 1, 2, 3, 4]), array([  54,   90, 6576, 6703, 6579]))
objective1_traindata_labels_id (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]), array([708, 754, 734, 682, 686, 692, 672, 646, 752, 742, 810, 704, 700,
       740, 698, 678, 758, 700, 688, 748, 674, 736, 660, 694, 734, 732,
       700, 780]))
objective1_traindata_labels_emo (array([0, 1, 2, 3, 4, 5, 6]), array([2852, 2748, 2888, 2946, 2974, 2694, 2900]))
objective1_valdata_labels_pose (array([0, 1, 2, 3, 4]), array([   3,    7, 3244, 3434, 3313]))
objective1_valdata_labels_id (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]), array([354, 377, 367, 341, 343, 346, 336, 323, 376, 371, 405, 352, 350,
       370, 349, 339, 379, 350, 344, 374, 337, 368, 330, 347, 367, 366,
       350, 390]))
objective1_valdata_labels_emo (array([0, 1, 2, 3, 

In [11]:
objective1_traindata_embs = np.array(objective1_traindata_embs)
objective1_valdata_embs = np.array(objective1_valdata_embs)

objective2_traindata_embs = np.array(objective2_traindata_embs)
objective2_valdata_embs = np.array(objective2_valdata_embs)

objective3_traindata_embs = np.array(objective3_traindata_embs)
objective3_valdata_embs = np.array(objective3_valdata_embs)

In [12]:
from sklearn.preprocessing import LabelEncoder


scaler = StandardScaler()
scaler.fit(objective1_traindata_embs)
objective1_traindata_embs = scaler.transform(objective1_traindata_embs)
scaler.fit(objective2_traindata_embs)
objective2_traindata_embs = scaler.transform(objective2_traindata_embs)
scaler.fit(objective3_traindata_embs)
objective3_traindata_embs = scaler.transform(objective3_traindata_embs)

scaler2 = StandardScaler()
scaler2.fit(objective1_valdata_embs)
objective1_valdata_embs = scaler2.transform(objective1_valdata_embs)
scaler2.fit(objective2_valdata_embs)
objective2_valdata_embs = scaler2.transform(objective2_valdata_embs)
scaler2.fit(objective3_valdata_embs)
objective3_valdata_embs = scaler2.transform(objective3_valdata_embs)

le_pose = LabelEncoder()
le_id = LabelEncoder()
le_emo = LabelEncoder()
objective1_traindata_labels_encoded_pose = le_pose.fit_transform(objective1_traindata_labels_pose)
objective1_traindata_labels_encoded_id = le_id.fit_transform(objective1_traindata_labels_id)
objective1_traindata_labels_encoded_emo = le_emo.fit_transform(objective1_traindata_labels_emo)
objective1_valdata_labels_encoded_pose = le_pose.transform(objective1_valdata_labels_pose)
objective1_valdata_labels_encoded_id = le_id.transform(objective1_valdata_labels_id)
objective1_valdata_labels_encoded_emo = le_emo.transform(objective1_valdata_labels_emo)

le_pose = LabelEncoder()
le_id = LabelEncoder()
le_emo = LabelEncoder()
objective2_traindata_labels_encoded_pose = le_pose.fit_transform(objective2_traindata_labels_pose)
objective2_traindata_labels_encoded_id = le_id.fit_transform(objective2_traindata_labels_id)
objective2_traindata_labels_encoded_emo = le_emo.fit_transform(objective2_traindata_labels_emo)
objective2_valdata_labels_encoded_pose = le_pose.transform(objective2_valdata_labels_pose)
objective2_valdata_labels_encoded_id = le_id.transform(objective2_valdata_labels_id)
objective2_valdata_labels_encoded_emo = le_emo.transform(objective2_valdata_labels_emo)

le_pose = LabelEncoder()
le_id = LabelEncoder()
le_emo = LabelEncoder()
objective3_traindata_labels_encoded_pose = le_pose.fit_transform(objective3_traindata_labels_pose)
objective3_traindata_labels_encoded_id = le_id.fit_transform(objective3_traindata_labels_id)
objective3_traindata_labels_encoded_emo = le_emo.fit_transform(objective3_traindata_labels_emo)
objective3_valdata_labels_encoded_pose = le_pose.transform(objective3_valdata_labels_pose)
objective3_valdata_labels_encoded_id = le_id.transform(objective3_valdata_labels_id)
objective3_valdata_labels_encoded_emo = le_emo.transform(objective3_valdata_labels_emo)

In [13]:
from sklearn import linear_model
from sklearn.svm import SVC


accuracy_pose = []
accuracy_id = []
accuracy_emotion = []
val_accuracy_pose = []
val_accuracy_id = []
val_accuracy_emotion = []

clf_pose = SVC()
clf_pose.fit(objective1_traindata_embs, objective1_traindata_labels_encoded_pose)
accuracy_pose.append(np.mean(clf_pose.predict(objective1_traindata_embs) == objective1_traindata_labels_encoded_pose))
val_accuracy_pose.append(np.mean(clf_pose.predict(objective1_valdata_embs) == objective1_valdata_labels_encoded_pose))

clf_id = SVC()
clf_id.fit(objective1_traindata_embs, objective1_traindata_labels_encoded_id)
accuracy_id.append(np.mean(clf_id.predict(objective1_traindata_embs) == objective1_traindata_labels_encoded_id))
val_accuracy_id.append(np.mean(clf_id.predict(objective1_valdata_embs) == objective1_valdata_labels_encoded_id))

clf_emo = SVC()
clf_emo.fit(objective1_traindata_embs, objective1_traindata_labels_encoded_emo)
accuracy_emotion.append(np.mean(clf_emo.predict(objective1_traindata_embs) == objective1_traindata_labels_encoded_emo))
val_accuracy_emotion.append(np.mean(clf_emo.predict(objective1_valdata_embs) == objective1_valdata_labels_encoded_emo))

print("objective 1")
print(np.mean(accuracy_pose), np.mean(accuracy_id), np.mean(accuracy_emotion))
print(np.mean(val_accuracy_pose), np.mean(val_accuracy_id), np.mean(val_accuracy_emotion))

objective 1
1.0 1.0 1.0
0.6073392660733926 0.7758224177582241 0.40135986401359863


In [14]:
from sklearn import linear_model
from sklearn.svm import SVC


accuracy_pose = []
accuracy_id = []
accuracy_emotion = []
val_accuracy_pose = []
val_accuracy_id = []
val_accuracy_emotion = []

clf_pose = SVC()
clf_pose.fit(objective2_traindata_embs, objective2_traindata_labels_encoded_pose)
accuracy_pose.append(np.mean(clf_pose.predict(objective2_traindata_embs) == objective2_traindata_labels_encoded_pose))
val_accuracy_pose.append(np.mean(clf_pose.predict(objective2_valdata_embs) == objective2_valdata_labels_encoded_pose))

clf_id = SVC()
clf_id.fit(objective2_traindata_embs, objective2_traindata_labels_encoded_id)
accuracy_id.append(np.mean(clf_id.predict(objective2_traindata_embs) == objective2_traindata_labels_encoded_id))
val_accuracy_id.append(np.mean(clf_id.predict(objective2_valdata_embs) == objective2_valdata_labels_encoded_id))

clf_emo = SVC()
clf_emo.fit(objective2_traindata_embs, objective2_traindata_labels_encoded_emo)
accuracy_emotion.append(np.mean(clf_emo.predict(objective2_traindata_embs) == objective2_traindata_labels_encoded_emo))
val_accuracy_emotion.append(np.mean(clf_emo.predict(objective2_valdata_embs) == objective2_valdata_labels_encoded_emo))

print("objective 2")
print(np.mean(accuracy_pose), np.mean(accuracy_id), np.mean(accuracy_emotion))
print(np.mean(val_accuracy_pose), np.mean(val_accuracy_id), np.mean(val_accuracy_emotion))

objective 2
1.0 1.0 1.0
1.0 1.0 1.0


In [15]:
from sklearn import linear_model
from sklearn.svm import SVC


accuracy_pose = []
accuracy_id = []
accuracy_emotion = []
val_accuracy_pose = []
val_accuracy_id = []
val_accuracy_emotion = []

clf_pose = SVC()
clf_pose.fit(objective3_traindata_embs, objective3_traindata_labels_encoded_pose)
accuracy_pose.append(np.mean(clf_pose.predict(objective3_traindata_embs) == objective3_traindata_labels_encoded_pose))
val_accuracy_pose.append(np.mean(clf_pose.predict(objective3_valdata_embs) == objective3_valdata_labels_encoded_pose))

clf_id = SVC()
clf_id.fit(objective3_traindata_embs, objective3_traindata_labels_encoded_id)
accuracy_id.append(np.mean(clf_id.predict(objective3_traindata_embs) == objective3_traindata_labels_encoded_id))
val_accuracy_id.append(np.mean(clf_id.predict(objective3_valdata_embs) == objective3_valdata_labels_encoded_id))

clf_emo = SVC()
clf_emo.fit(objective3_traindata_embs, objective3_traindata_labels_encoded_emo)
accuracy_emotion.append(np.mean(clf_emo.predict(objective3_traindata_embs) == objective3_traindata_labels_encoded_emo))
val_accuracy_emotion.append(np.mean(clf_emo.predict(objective3_valdata_embs) == objective3_valdata_labels_encoded_emo))

print("objective 3")
print(np.mean(accuracy_pose), np.mean(accuracy_id), np.mean(accuracy_emotion))
print(np.mean(val_accuracy_pose), np.mean(val_accuracy_id), np.mean(val_accuracy_emotion))

objective 3
0.9295070492950704 0.978952104789521 0.9911508849115088
1.0 0.9981001899810019 1.0
