In [2]:
# Required imports

from argparse import Namespace
import time
import sys
import pprint
import numpy as np
import os
from PIL import Image
import torch
import torchvision.transforms as transforms
from torchvision import utils
import matplotlib.pyplot as plt
import cv2
import glob
from tqdm import tqdm

from datasets import augmentations
from utils.common import tensor2im, log_input_image
from models.psp import pSp

from models.stylegan2.model import Generator

%matplotlib inline

In [3]:
# Define the StyleGANv2 decoder and load pretrained weights

# Positional args are presumably (output size, style dim, mapping layers) - TODO confirm
style2gan_decoder = Generator(1024, 512, 8) # 1024x1024 output

# Load the checkpoint onto the CPU: without map_location the tensors are restored
# to the device they were saved from, and because `ckpt` stays alive in the
# notebook namespace they would keep occupying that device (possibly the GPU).
# NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.
ckpt = torch.load('pretrained_models/stylegan2-ffhq-config-f.pt', map_location='cpu')

# strict=False: mismatched keys between checkpoint and model are not reported as errors
style2gan_decoder.load_state_dict(ckpt['g_ema'], strict=False)

device = 'cuda'
style2gan_decoder.to(device)
style2gan_decoder.eval();

In [4]:
# Face detector (MTCNN), used to locate the face to crop in each generated image
# Implementation: https://github.com/timesler/facenet-pytorch
# Paper: https://www.semanticscholar.org/paper/Joint-Face-Detection-and-Alignment-Using-Multitask-Zhang-Zhang/9e60942aa15670ed9ee03af3c0ae011fa4966b7c

from facenet_pytorch import MTCNN
mtcnn = MTCNN(
    keep_all=False,
    select_largest=False,
    post_process=False,
    min_face_size=50,
    device=device,
)

# Emotion recognition network (HSEmotion), applied to the cropped faces
# Implementation: https://github.com/av-savchenko/face-emotion-recognition
# Paper: https://www.semanticscholar.org/paper/Classifying-Emotions-and-Engagement-in-Online-Based-Savchenko-Savchenko/260d7f95ab8a562f4ff590684ef6a509b8fed316

from hsemotion.facial_emotions import HSEmotionRecognizer
model_name = 'enet_b0_8_best_afew'
fer = HSEmotionRecognizer(model_name=model_name, device=device)
# Number of emotion classes; presumably matches the "8" in the model name - TODO confirm
emotion_class_n = 8

C:\Users\Stamina\.hsemotion\enet_b0_8_best_afew.pt Compose(
    Resize(size=(224, 224), interpolation=bilinear, max_size=None, antialias=None)
    ToTensor()
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
)


In [5]:
# Define the function that generates images for a given batch

# The emotion recognition net only needs 224x224 inputs, and normalizing / saving
# full 1024x1024 outputs is too slow (floating point arithmetic), so the generator
# output is downscaled first.
# Adaptive average pooling works directly on the raw, not-yet-normalized output
# tensor, where ordinary image resizers cannot be used.
pool_logit = torch.nn.AdaptiveAvgPool2d(output_size=(224, 224))

def generate_images(generator, n_images):
    """Sample `n_images` images from `generator`, one random latent at a time.

    Each iteration draws a fresh (1, 512) standard-normal vector on the
    module-level `device`, runs the generator without gradient tracking and
    downsizes the output with the module-level `pool_logit`.

    Args:
        generator: Callable invoked as
            ``generator([z], truncation=1, truncation_latent=None, return_latents=True)``
            and returning an ``(image, latent)`` pair.
        n_images: Number of images to sample.

    Returns:
        Tuple ``(gen_imgs, latents)``: a list with the first (only) element of
        each pooled generator output, still as tensors, and a list with the
        first element of each returned latent converted to a CPU numpy array.
    """
    pooled_images, latent_codes = [], []
    with torch.no_grad():
        for _ in range(n_images):
            noise = torch.randn(1, 512, device=device)
            full_res_img, latent = generator(
                [noise],
                truncation=1,
                truncation_latent=None,
                return_latents=True,
            )
            # Batch size is 1, so index 0 drops the batch dimension
            pooled_images.append(pool_logit(full_res_img)[0])
            latent_codes.append(latent[0].to('cpu').numpy())
    return pooled_images, latent_codes

In [6]:
# Define the function to normalize images and convert to numpy

# This normalization block is taken from the original torch repository:
# https://github.com/pytorch/vision/blob/89d2b38cbc3254ed7ed7b43393e4635979ac12eb/torchvision/utils.py

def norm_ip(img, low, high):
    """Clamp `img` to [low, high] and rescale it to [0, 1], modifying it in place.

    The divisor is floored at 1e-5 so that `low == high` does not divide by zero.
    Returns None; the result is left in `img`.
    """
    span = max(high - low, 1e-5)
    img.clamp_(min=low, max=high)
    img.sub_(low)
    img.div_(span)

def norm_range(t, value_range):
    """Normalize tensor `t` in place to [0, 1] via `norm_ip`.

    Args:
        t: Tensor to normalize (modified in place).
        value_range: ``(low, high)`` bounds to normalize against, or None to
            use the tensor's own minimum and maximum.
    """
    if value_range is None:
        low, high = float(t.min()), float(t.max())
    else:
        low, high = value_range[0], value_range[1]
    norm_ip(t, low, high)

def normalize_images_and_convert_to_numpy(images): # inplace
    """Replace every tensor in `images` with its uint8 numpy image, in place.

    Each tensor is normalized from the (-1, 1) range to [0, 1], scaled to
    [0, 255] with rounding (the +0.5 before the uint8 cast), and its axes are
    reordered with ``permute(1, 2, 0)`` (presumably channel-first to
    channel-last - confirm against the generator output).

    Args:
        images: Mutable list of 3-D tensors; both the list entries and the
            tensors themselves are modified. Returns None.
    """
    for idx in range(len(images)):
        tensor = images[idx]
        norm_range(tensor, (-1, 1))
        scaled = tensor.mul(255).add_(0.5).clamp_(0, 255)
        images[idx] = scaled.permute(1, 2, 0).to("cpu", torch.uint8).numpy()

In [8]:
# Define the function that preprocesses images and predicts emotions for a given batch

def extract_emotion_scores_from_images(mtcnn_net, images):
    """Crop the first detected face from each image and score its emotions.

    Args:
        mtcnn_net: Face detector exposing ``detect(images, landmarks=False)``.
            Its per-image detections are either None (no face) or a sequence
            of boxes, each starting with ``(x1, y1, x2, y2)``.
        images: Sequence of images indexable as ``image[y, x, channel]``.
            Box coordinates are clipped to [0, 223], so 224x224 inputs are
            assumed.

    Returns:
        The scores returned by the module-level ``fer.predict_multi_emotions``
        (called with ``logits=False``), indexed by image position. Entries for
        images without a usable face are overwritten with -1.
    """
    # Finding facial bounding box
    detections, _ = mtcnn_net.detect(images, landmarks=False)

    # Keep only the first box of every image. A missing detection becomes an
    # all-negative box, which collapses to (0, 0, 0, 0) after clipping.
    boxes = np.clip(
        np.array([
            detection[0] if detection is not None else [-1, -1, -1, -1]
            for detection in detections
        ]),
        0,
        223,
    ).astype(int)

    invalid_images = []
    face_images = []

    for i, image in enumerate(images):
        x1, y1, x2, y2 = boxes[i][0:4]
        # An empty box means either "no face" (the collapsed sentinel) or a real
        # detection that lost all width/height through clipping and int
        # truncation. Cropping it would hand an empty image to the recognizer,
        # so both cases get a blank placeholder and are marked invalid.
        if x2 <= x1 or y2 <= y1:
            face_images.append(np.zeros((224, 224, 3), np.uint8))
            invalid_images.append(i)
            continue
        face_images.append(image[y1:y2, x1:x2, :])

    _, all_scores = fer.predict_multi_emotions(face_images, logits=False)

    # Placeholder crops still produce scores; overwrite them with -1 so that
    # downstream code can tell "no face" apart from a real prediction.
    for invalid_image_idx in invalid_images:
        all_scores[invalid_image_idx] = np.zeros_like(all_scores[invalid_image_idx]) - 1

    return all_scores

In [9]:
# Define the function that generates images and extracts emotion predictions for a given batch

# {0: 'Anger', 1: 'Contempt', 2: 'Disgust', 3: 'Fear', 4: 'Happiness', 5: 'Neutral', 6: 'Sadness', 7: 'Surprise'}

def generate_images_extract_emotions(
    generator,
    mtcnn_net,
    n_images,
    start_idx = 0,
    img_save_path = 'main_generatedimages',
    score_save_path = 'main_emotionscores',
    latent_save_path = 'main_wplusses',
):
    """Generate one batch of images, score their emotions and save everything.

    Args:
        generator: Generator passed through to `generate_images`.
        mtcnn_net: Face detector passed through to
            `extract_emotion_scores_from_images`.
        n_images: Number of images in the batch (callers choose it to fit
            their GPU memory).
        start_idx: Global index of the first image in this batch; used to name
            the output files.
        img_save_path: Directory receiving one ``{start_idx + i}.png`` per image.
        score_save_path: Directory receiving the batch scores as ``{start_idx}.npy``.
        latent_save_path: Directory receiving the batch latents as ``{start_idx}.npy``.

    Returns:
        None. All results are written to disk.
    """
    # Create the output directories up front so the first save does not fail on
    # a fresh checkout. An empty path means "current directory" and is skipped,
    # because os.makedirs('') raises.
    for directory in (img_save_path, score_save_path, latent_save_path):
        if directory:
            os.makedirs(directory, exist_ok=True)

    generated_images, latents = generate_images(generator, n_images)
    latents = np.array(latents)
    # Converts the tensors in `generated_images` to uint8 numpy images, in place
    normalize_images_and_convert_to_numpy(generated_images)
    scores = extract_emotion_scores_from_images(mtcnn_net, generated_images)

    # Save generated images
    for i, img in enumerate(generated_images):
        img_path = os.path.join(img_save_path, f'{start_idx + i}.png')
        Image.fromarray(img).save(img_path)

    # Save scores and latents
    np.save(os.path.join(score_save_path, f'{start_idx}.npy'), scores)
    np.save(os.path.join(latent_save_path, f'{start_idx}.npy'), latents)

In [13]:
# Generation schedule: `imgs_to_generate` is the number of images produced by
# THIS run (the loop below always executes `iterations` batches).
batch_size = 32
imgs_to_generate = 312832
iterations = imgs_to_generate // batch_size

# This is for continuing previous runs: `start_num` images already exist, so
# file numbering resumes at the corresponding batch offset.
start_num = 7168
start_batch = start_num // batch_size

for i in tqdm(range(iterations)):
    generate_images_extract_emotions(
        style2gan_decoder,
        mtcnn,
        batch_size,
        start_idx=(start_batch + i) * batch_size,
    )

100%|██████████| 9776/9776 [8:48:44<00:00,  3.25s/it]  
