In [1]:
import imageio
import matplotlib.pyplot as plt
from mlxtend.image import extract_face_landmarks
import cv2

In [2]:
import glob
import numpy as np
import torch
import os
import tqdm
import re
import pickle
import random


class CustomDataSet(torch.utils.data.Dataset):

    def __init__(self, type_="train"):
        self.type_ = type_
        
        all_files = list(set([re.findall(r"(.+)_(\d+)_(.+)_(\d+)_(\d+)_.+", filename)[0] for filename in os.listdir(f"/media/soroushh/Storage2/matrices/evaluation")]))
        if self.type_ == "train":
            all_files = all_files[:int((2/3) * len(all_files))]
        else:
            all_files = all_files[int((2/3) * len(all_files)):]
        
        self.final_database = []
        for person1_name, triple_to_reconstruct_index, person2_name, triple_to_change_input_index, triple_to_change_output_index in all_files:
            triple_to_reconstruct_index = int(triple_to_reconstruct_index)
            triple_to_change_input_index = int(triple_to_change_input_index)
            triple_to_change_output_index = int(triple_to_change_output_index)
            self.final_database.append((person1_name, triple_to_reconstruct_index, person2_name, triple_to_change_input_index, triple_to_change_output_index))
            
        with open(f'/media/soroushh/Storage2/database_evaluation.pickle', 'rb') as handle:
            self.database = pickle.load(handle)
        
        ids = list(self.database.keys())
        self.ids_dict = {id:idx for idx, id in enumerate(list(self.database.keys()))}
        self.emotions_dict = {emo:idx for idx, emo in enumerate(np.unique([triplet[0] for id in ids for triplet in self.database[id]]).tolist())}
        self.poses_dict = {pose:idx for idx, pose in enumerate(np.unique([triplet[1] for id in ids for triplet in self.database[id]]).tolist())}

    def __len__(self):
        return len(self.final_database)
            
    def __getitem__(self, idx):
        tup = self.final_database[idx]
        
        with open(f'/media/soroushh/Storage2/matrices/evaluation/{tup[0]}_{tup[1]}_{tup[2]}_{tup[3]}_{tup[4]}_pose_img_to_reconstruct.npy', 'rb') as f:
            pose_img_to_reconstruct = np.load(f)
            
        with open(f'/media/soroushh/Storage2/matrices/evaluation/{tup[0]}_{tup[1]}_{tup[2]}_{tup[3]}_{tup[4]}_embedding_input.npy', 'rb') as f:
            embedding_input = np.load(f)
            
        with open(f'/media/soroushh/Storage2/matrices/evaluation/{tup[0]}_{tup[1]}_{tup[2]}_{tup[3]}_{tup[4]}_embedding_output.npy', 'rb') as f:
            embedding_output = np.load(f)
            
        expected_pose_label = self.database[tup[2]][tup[4]][1]
        input_pose_label = self.database[tup[2]][tup[3]][1]
        input_id = tup[2]
        input_emo = self.database[tup[2]][tup[3]][0]
        
        while True:
            available_negative_choices = list(filter(lambda item: item[2] != input_id, self.final_database))
            if len(available_negative_choices) == 0:
                continue
                
            neg_tup = random.choice(available_negative_choices)
            break
            
        with open(f'/media/soroushh/Storage2/matrices/evaluation/{neg_tup[0]}_{neg_tup[1]}_{neg_tup[2]}_{neg_tup[3]}_{neg_tup[4]}_embedding_input.npy', 'rb') as f:
            negative_embedding_input = np.load(f)
            
        expected_pose_label = self.poses_dict[expected_pose_label]
        input_pose_label = self.poses_dict[input_pose_label]
        input_id = self.ids_dict[input_id]
        input_emo = self.emotions_dict[input_emo]
            
        return pose_img_to_reconstruct, embedding_input, embedding_output, expected_pose_label, input_pose_label, input_id, input_emo

In [3]:
image_paths = glob.glob(os.path.join("./KDEF/*/*.JPG"))
image_paths = np.array(image_paths)
np.random.shuffle(image_paths)
image_paths = image_paths.tolist()

index = int(0.9 * len(image_paths))
# trainset = image_paths[:index]
valset = image_paths[index:]

In [4]:
# train_dataset = CustomDataSet(trainset, type_="train", precomputerd=True)
val_dataset = CustomDataSet(type_="val")

In [5]:
# len(train_dataset), 
len(val_dataset)

10001

In [8]:
net = ConvAutoencoder()
net = net.to("cuda")
state_dict = torch.load("./augmentor.pt")
net.load_state_dict(state_dict)
net.eval()

ConvAutoencoder(
  (decoder_emb_generator): EmbeddingGeneratorDecoder(
    (vit): VisionTransformer(
      (input_layer): Linear(in_features=64, out_features=128, bias=True)
      (transformer): Sequential(
        (0): AttentionBlock(
          (layer_norm_1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
          )
          (layer_norm_2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
          (linear): Sequential(
            (0): Linear(in_features=128, out_features=256, bias=True)
            (1): GELU()
            (2): Dropout(p=0.2, inplace=False)
            (3): Linear(in_features=256, out_features=128, bias=True)
            (4): Dropout(p=0.2, inplace=False)
          )
        )
        (1): AttentionBlock(
          (layer_norm_1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAtte

# Visualize AE reconstructions

In [None]:
with torch.no_grad():
    for data in train_dataloader:
        pose_img = data[0]
        pose_img = pose_img.permute(0, 3, 1, 2)
        pose_img = pose_img.to("cuda").to(torch.float32)
        input_emb = data[1]
        input_emb = input_emb.to("cuda").to(torch.float32)
        output_emb = data[2]
        output_emb = output_emb.to("cuda").to(torch.float32)
        
        features = net.encoder(pose_img)
        
        reconstructed_data = net.decoder_pose_reconstructor(features)
        
        reconstructed_data = reconstructed_data.permute(0, 2, 3, 1).cpu().numpy()
        orig_data = pose_img.permute(0, 2, 3, 1).cpu().numpy()
        
        for i in range(reconstructed_data.shape[0]):
            plt.imshow(orig_data[i, :, :, 0])
            plt.show()
            plt.imshow(reconstructed_data[i, :, :, 0])
            plt.show()
            
        break

In [None]:
with torch.no_grad():
    for data in val_dataloader:
        pose_img = data[0]
        pose_img = pose_img.permute(0, 3, 1, 2)
        pose_img = pose_img.to("cuda").to(torch.float32)
        input_emb = data[1]
        input_emb = input_emb.to("cuda").to(torch.float32)
        output_emb = data[2]
        output_emb = output_emb.to("cuda").to(torch.float32)
        
        features, _ = net.encoder(pose_img)
        
        reconstructed_data = net.decoder_pose_reconstructor(features)
        
        reconstructed_data = reconstructed_data.permute(0, 2, 3, 1).cpu().numpy()
        orig_data = pose_img.permute(0, 2, 3, 1).cpu().numpy()
        
        for i in range(reconstructed_data.shape[0]):
            plt.imshow(orig_data[i, :, :, 0])
            plt.show()
            plt.imshow(reconstructed_data[i, :, :, 0])
            plt.show()
            
        break

# Evaluate quality of generated representations

In [9]:
from sklearn.preprocessing import LabelEncoder
import pickle
from sklearn.preprocessing import StandardScaler


objective1_traindata_embs = []
objective1_traindata_labels_pose = []
objective1_traindata_labels_id = []
objective1_traindata_labels_emo = []
objective1_valdata_embs = []
objective1_valdata_labels_pose = []
objective1_valdata_labels_id = []
objective1_valdata_labels_emo = []

objective2_traindata_embs = []
objective2_traindata_labels_pose = []
objective2_traindata_labels_id = []
objective2_traindata_labels_emo = []
objective2_valdata_embs = []
objective2_valdata_labels_pose = []
objective2_valdata_labels_id = []
objective2_valdata_labels_emo = []

objective3_traindata_embs = []
objective3_traindata_labels_pose = []
objective3_traindata_labels_id = []
objective3_traindata_labels_emo = []
objective3_valdata_embs = []
objective3_valdata_labels_pose = []
objective3_valdata_labels_id = []
objective3_valdata_labels_emo = []

for data in tqdm.tqdm(val_dataset):
    pose_img = data[0]
    pose_img = torch.Tensor(pose_img)
    pose_img = pose_img.unsqueeze(0)
    pose_img = pose_img.permute(0, 3, 1, 2)
    pose_img = pose_img.to("cuda").to(torch.float32)
    
    input_emb = data[1]
    objective1_traindata_embs.append(input_emb)
    objective2_traindata_embs.append(input_emb)
    objective3_traindata_embs.append(input_emb)
    input_emb = torch.Tensor(input_emb)
    input_emb = input_emb.unsqueeze(0)
    input_emb = input_emb.to("cuda").to(torch.float32)
    
    output_emb = data[2]
    output_emb = torch.Tensor(output_emb)
    output_emb = output_emb.unsqueeze(0)
    output_emb = output_emb.to("cuda").to(torch.float32)
    
    expected_pose_label = data[3]
    input_pose_label = data[4]
    objective1_traindata_labels_pose.append(input_pose_label)
    objective2_traindata_labels_pose.append(input_pose_label)
    objective3_traindata_labels_pose.append(input_pose_label)
    
    expected_id_label = data[5]
    objective1_traindata_labels_id.append(expected_id_label)
    objective2_traindata_labels_id.append(expected_id_label)
    objective3_traindata_labels_id.append(expected_id_label)
    
    expected_emo_label = data[6]
    objective1_traindata_labels_emo.append(expected_emo_label)
    objective2_traindata_labels_emo.append(expected_emo_label)
    objective3_traindata_labels_emo.append(expected_emo_label)
    
    # ensure to not mixing the train and validation sets
    output_emb = output_emb.detach().cpu().numpy()
    
    features = net.encoder(pose_img)
    generated_embedding = net.decoder_emb_generator(features, input_emb)
    
    generated_embedding = generated_embedding.detach().cpu().numpy()
    
    # objective 1: measuring how linear separable are the representations
    objective1_valdata_embs.append(generated_embedding[0]) 
    objective1_valdata_labels_pose.append(expected_pose_label) 
    objective1_valdata_labels_id.append(expected_id_label) 
    objective1_valdata_labels_emo.append(expected_emo_label)
    
    # objective 2 and 3: measuring how good the generated representations improve our accuracy
    objective2_valdata_embs.append(output_emb[0])
    objective2_valdata_labels_pose.append(expected_pose_label)
    objective2_valdata_labels_id.append(expected_id_label)
    objective2_valdata_labels_emo.append(expected_emo_label)
    
    objective3_traindata_embs.append(generated_embedding[0])
    objective3_traindata_labels_pose.append(expected_pose_label)
    objective3_traindata_labels_id.append(expected_id_label)
    objective3_traindata_labels_emo.append(expected_emo_label)
    
    objective3_valdata_embs.append(output_emb[0])
    objective3_valdata_labels_pose.append(expected_pose_label)
    objective3_valdata_labels_id.append(expected_id_label)
    objective3_valdata_labels_emo.append(expected_emo_label)

100%|██████████| 10001/10001 [16:51<00:00,  9.89it/s]


In [10]:
print("objective1_traindata_labels_pose", np.unique(objective1_traindata_labels_pose, return_counts=True))
print("objective1_traindata_labels_id", np.unique(objective1_traindata_labels_id, return_counts=True))
print("objective1_traindata_labels_emo", np.unique(objective1_traindata_labels_emo, return_counts=True))

print("objective1_valdata_labels_pose", np.unique(objective1_valdata_labels_pose, return_counts=True))
print("objective1_valdata_labels_id", np.unique(objective1_valdata_labels_id, return_counts=True))
print("objective1_valdata_labels_emo", np.unique(objective1_valdata_labels_emo, return_counts=True))

objective1_traindata_labels_pose (array([0, 1, 2, 3, 4]), array([  51,   83, 3306, 3297, 3264]))
objective1_traindata_labels_id (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]), array([341, 407, 367, 350, 359, 354, 360, 339, 334, 360, 399, 356, 325,
       366, 342, 383, 370, 337, 328, 359, 346, 334, 346, 386, 383, 353,
       320, 397]))
objective1_traindata_labels_emo (array([0, 1, 2, 3, 4, 5, 6]), array([1429, 1345, 1487, 1439, 1472, 1388, 1441]))
objective1_valdata_labels_pose (array([0, 1, 2, 3, 4]), array([   5,    7, 3219, 3401, 3369]))
objective1_valdata_labels_id (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]), array([341, 407, 367, 350, 359, 354, 360, 339, 334, 360, 399, 356, 325,
       366, 342, 383, 370, 337, 328, 359, 346, 334, 346, 386, 383, 353,
       320, 397]))
objective1_valdata_labels_emo (array([0, 1, 2, 3, 

In [11]:
objective1_traindata_embs = np.array(objective1_traindata_embs)
objective1_valdata_embs = np.array(objective1_valdata_embs)

objective2_traindata_embs = np.array(objective2_traindata_embs)
objective2_valdata_embs = np.array(objective2_valdata_embs)

objective3_traindata_embs = np.array(objective3_traindata_embs)
objective3_valdata_embs = np.array(objective3_valdata_embs)

In [12]:
from sklearn.preprocessing import LabelEncoder


scaler = StandardScaler()
scaler.fit(objective1_traindata_embs)
objective1_traindata_embs = scaler.transform(objective1_traindata_embs)
scaler.fit(objective2_traindata_embs)
objective2_traindata_embs = scaler.transform(objective2_traindata_embs)
scaler.fit(objective3_traindata_embs)
objective3_traindata_embs = scaler.transform(objective3_traindata_embs)

scaler2 = StandardScaler()
scaler2.fit(objective1_valdata_embs)
objective1_valdata_embs = scaler2.transform(objective1_valdata_embs)
scaler2.fit(objective2_valdata_embs)
objective2_valdata_embs = scaler2.transform(objective2_valdata_embs)
scaler2.fit(objective3_valdata_embs)
objective3_valdata_embs = scaler2.transform(objective3_valdata_embs)

le_pose = LabelEncoder()
le_id = LabelEncoder()
le_emo = LabelEncoder()
objective1_traindata_labels_encoded_pose = le_pose.fit_transform(objective1_traindata_labels_pose)
objective1_traindata_labels_encoded_id = le_id.fit_transform(objective1_traindata_labels_id)
objective1_traindata_labels_encoded_emo = le_emo.fit_transform(objective1_traindata_labels_emo)
objective1_valdata_labels_encoded_pose = le_pose.transform(objective1_valdata_labels_pose)
objective1_valdata_labels_encoded_id = le_id.transform(objective1_valdata_labels_id)
objective1_valdata_labels_encoded_emo = le_emo.transform(objective1_valdata_labels_emo)

le_pose = LabelEncoder()
le_id = LabelEncoder()
le_emo = LabelEncoder()
objective2_traindata_labels_encoded_pose = le_pose.fit_transform(objective2_traindata_labels_pose)
objective2_traindata_labels_encoded_id = le_id.fit_transform(objective2_traindata_labels_id)
objective2_traindata_labels_encoded_emo = le_emo.fit_transform(objective2_traindata_labels_emo)
objective2_valdata_labels_encoded_pose = le_pose.transform(objective2_valdata_labels_pose)
objective2_valdata_labels_encoded_id = le_id.transform(objective2_valdata_labels_id)
objective2_valdata_labels_encoded_emo = le_emo.transform(objective2_valdata_labels_emo)

le_pose = LabelEncoder()
le_id = LabelEncoder()
le_emo = LabelEncoder()
objective3_traindata_labels_encoded_pose = le_pose.fit_transform(objective3_traindata_labels_pose)
objective3_traindata_labels_encoded_id = le_id.fit_transform(objective3_traindata_labels_id)
objective3_traindata_labels_encoded_emo = le_emo.fit_transform(objective3_traindata_labels_emo)
objective3_valdata_labels_encoded_pose = le_pose.transform(objective3_valdata_labels_pose)
objective3_valdata_labels_encoded_id = le_id.transform(objective3_valdata_labels_id)
objective3_valdata_labels_encoded_emo = le_emo.transform(objective3_valdata_labels_emo)

In [13]:
from sklearn import linear_model
from sklearn.svm import SVC


accuracy_pose = []
accuracy_id = []
accuracy_emotion = []
val_accuracy_pose = []
val_accuracy_id = []
val_accuracy_emotion = []

clf_pose = SVC()
clf_pose.fit(objective1_traindata_embs, objective1_traindata_labels_encoded_pose)
accuracy_pose.append(np.mean(clf_pose.predict(objective1_traindata_embs) == objective1_traindata_labels_encoded_pose))
val_accuracy_pose.append(np.mean(clf_pose.predict(objective1_valdata_embs) == objective1_valdata_labels_encoded_pose))

clf_id = SVC()
clf_id.fit(objective1_traindata_embs, objective1_traindata_labels_encoded_id)
accuracy_id.append(np.mean(clf_id.predict(objective1_traindata_embs) == objective1_traindata_labels_encoded_id))
val_accuracy_id.append(np.mean(clf_id.predict(objective1_valdata_embs) == objective1_valdata_labels_encoded_id))

clf_emo = SVC()
clf_emo.fit(objective1_traindata_embs, objective1_traindata_labels_encoded_emo)
accuracy_emotion.append(np.mean(clf_emo.predict(objective1_traindata_embs) == objective1_traindata_labels_encoded_emo))
val_accuracy_emotion.append(np.mean(clf_emo.predict(objective1_valdata_embs) == objective1_valdata_labels_encoded_emo))

print("objective 1")
print(np.mean(accuracy_pose), np.mean(accuracy_id), np.mean(accuracy_emotion))
print(np.mean(val_accuracy_pose), np.mean(val_accuracy_id), np.mean(val_accuracy_emotion))

objective 1
1.0 0.9987001299870013 1.0
0.9976002399760024 0.7401259874012599 0.5612438756124387


In [14]:
from sklearn import linear_model
from sklearn.svm import SVC


accuracy_pose = []
accuracy_id = []
accuracy_emotion = []
val_accuracy_pose = []
val_accuracy_id = []
val_accuracy_emotion = []

clf_pose = SVC()
clf_pose.fit(objective2_traindata_embs, objective2_traindata_labels_encoded_pose)
accuracy_pose.append(np.mean(clf_pose.predict(objective2_traindata_embs) == objective2_traindata_labels_encoded_pose))
val_accuracy_pose.append(np.mean(clf_pose.predict(objective2_valdata_embs) == objective2_valdata_labels_encoded_pose))

clf_id = SVC()
clf_id.fit(objective2_traindata_embs, objective2_traindata_labels_encoded_id)
accuracy_id.append(np.mean(clf_id.predict(objective2_traindata_embs) == objective2_traindata_labels_encoded_id))
val_accuracy_id.append(np.mean(clf_id.predict(objective2_valdata_embs) == objective2_valdata_labels_encoded_id))

clf_emo = SVC()
clf_emo.fit(objective2_traindata_embs, objective2_traindata_labels_encoded_emo)
accuracy_emotion.append(np.mean(clf_emo.predict(objective2_traindata_embs) == objective2_traindata_labels_encoded_emo))
val_accuracy_emotion.append(np.mean(clf_emo.predict(objective2_valdata_embs) == objective2_valdata_labels_encoded_emo))

print("objective 2")
print(np.mean(accuracy_pose), np.mean(accuracy_id), np.mean(accuracy_emotion))
print(np.mean(val_accuracy_pose), np.mean(val_accuracy_id), np.mean(val_accuracy_emotion))

objective 2
1.0 0.9987001299870013 1.0
1.0 0.9986001399860014 1.0


In [15]:
from sklearn import linear_model
from sklearn.svm import SVC


accuracy_pose = []
accuracy_id = []
accuracy_emotion = []
val_accuracy_pose = []
val_accuracy_id = []
val_accuracy_emotion = []

clf_pose = SVC()
clf_pose.fit(objective3_traindata_embs, objective3_traindata_labels_encoded_pose)
accuracy_pose.append(np.mean(clf_pose.predict(objective3_traindata_embs) == objective3_traindata_labels_encoded_pose))
val_accuracy_pose.append(np.mean(clf_pose.predict(objective3_valdata_embs) == objective3_valdata_labels_encoded_pose))

clf_id = SVC()
clf_id.fit(objective3_traindata_embs, objective3_traindata_labels_encoded_id)
accuracy_id.append(np.mean(clf_id.predict(objective3_traindata_embs) == objective3_traindata_labels_encoded_id))
val_accuracy_id.append(np.mean(clf_id.predict(objective3_valdata_embs) == objective3_valdata_labels_encoded_id))

clf_emo = SVC()
clf_emo.fit(objective3_traindata_embs, objective3_traindata_labels_encoded_emo)
accuracy_emotion.append(np.mean(clf_emo.predict(objective3_traindata_embs) == objective3_traindata_labels_encoded_emo))
val_accuracy_emotion.append(np.mean(clf_emo.predict(objective3_valdata_embs) == objective3_valdata_labels_encoded_emo))

print("objective 3")
print(np.mean(accuracy_pose), np.mean(accuracy_id), np.mean(accuracy_emotion))
print(np.mean(val_accuracy_pose), np.mean(val_accuracy_id), np.mean(val_accuracy_emotion))

objective 3
0.9994000599940006 0.9795020497950205 0.9828017198280172
1.0 0.9986001399860014 1.0
