In [None]:
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import os
import cv2
from sklearn.model_selection import train_test_split
from align import AlignDlib
%matplotlib inline

# Shared face aligner, constructed once from the landmark model file and used
# by the data-preparation cell to detect and align faces.
# NOTE(review): AlignDlib comes from the project-local `align` module --
# presumably a dlib facial-landmark wrapper; confirm against that module.
alignment = AlignDlib('models/landmarks.dat')
def load_image(path):
    """Read an image file and return it with the channel axis reversed.

    The file is decoded with ``cv2.imread(path, 1)`` (3-channel colour).
    OpenCV delivers the channels in BGR order, so reversing the last axis
    yields an RGB view of the same pixel data.

    Args:
        path: location of the image file on disk.

    Returns:
        The decoded image array with its last (channel) axis reversed.

    Raises:
        IOError: if OpenCV could not read/decode the file. ``cv2.imread``
            signals that case by returning ``None`` instead of raising, which
            previously surfaced as an opaque ``TypeError`` on the slice below.
    """
    img = cv2.imread(path, 1)
    if img is None:
        raise IOError("Could not read image file: %r" % (path,))
    return img[..., ::-1]

In [None]:
# ---------------------------------------------------------------------------
# Data preparation: pair voice embeddings with an aligned face image of the
# same class, then split the pairs into train / test sets.
# ---------------------------------------------------------------------------
face_embeddings = list(np.load('embedded_face.npy'))
face_emb_labels = list(np.load('embedded_face_labels.npy'))
voice_embeddings = list(np.load('voice_embeddings_input.npy'))
voice_emb_labels = list(np.load('voice_embeddings_labels.npy'))

# Class ids that are processed; each id is also the name of a sub-directory
# of both image directories below.
class_ids = [0, 1, 2, 3, 4, 5, 6, 7, 8]

# For every class keep at most as many voice embeddings as there are face
# embeddings carrying that label, preserving the original sample order.
voice_embedd = []
voice_embedd_labels = []
for k in class_ids:
    limit = face_emb_labels.count(k)
    count = 0
    for i in range(len(voice_emb_labels)):
        if count == limit:
            break
        if voice_emb_labels[i] == k:
            voice_embedd.append(voice_embeddings[i])
            voice_embedd_labels.append(voice_emb_labels[i])
            count += 1

directory_path = 'Dataset'
directory_path1 = 'Dataset1'
face_npy = []
for k in class_ids:
    limit = face_emb_labels.count(k)
    print(k)

    # One face entry is emitted per file found in Dataset/<k>, capped at the
    # number of face embeddings of that class.
    n_faces = min(limit, len(os.listdir(os.path.join(directory_path, str(k)))))
    if n_faces == 0:
        continue

    # Build the class's reference face from Dataset1/<k>. It is reset for each
    # class so that a face from the previous class can never leak into this one.
    # NOTE(review): every file in the directory is aligned but only the last
    # one is kept (this mirrors the previous behaviour) -- confirm that a
    # single reference face per class is what is intended.
    jc_aligned = None
    reference_dir = os.path.join(directory_path1, str(k))
    for file1 in os.listdir(reference_dir):
        file_path = os.path.join(reference_dir, file1)
        jc_orig = load_image(file_path)
        bb = alignment.getLargestFaceBoundingBox(jc_orig)
        aligned = alignment.align(64, jc_orig, bb, landmarkIndices=AlignDlib.OUTER_EYES_AND_NOSE)
        # NOTE(review): guards against align() yielding None when no face is
        # found -- presumably possible; confirm against AlignDlib.
        if aligned is None:
            continue
        jc_aligned = aligned / 255.0  # scale pixel values by 1/255

    if jc_aligned is None:
        raise ValueError("No usable reference face found in %r" % (reference_dir,))

    # The same aligned image object is repeated n_faces times.
    face_npy.extend([jc_aligned] * n_faces)

# Faces and voices must pair up one-to-one before they are split together.
assert len(face_npy) == len(voice_embedd_labels), (
    "face/voice sample count mismatch: %d faces vs %d voices"
    % (len(face_npy), len(voice_embedd_labels))
)

train_face, test_face, train_voice, test_voice = train_test_split(face_npy, voice_embedd, test_size=0.2, random_state=42)

In [None]:
# Size of the training split and the number of whole batches it contains.
dataset_sz = len(train_face)
num_examples = dataset_sz

# Hyper-parameters.
# NOTE(review): the optimizers in the training cell are constructed with
# literal learning rates rather than lr_generator / lr_discriminator.
num_steps = 10000
lr_generator = 0.002
lr_discriminator = 0.002
sound_vector = 128
NOISE_SIZE = 128
BATCH_SIZE = 64

# Only complete batches are counted; a trailing partial batch is ignored.
n_batches = dataset_sz // BATCH_SIZE

In [None]:
# Channel counts used in the `features=[...]` arguments of the conv / deconv
# layers; f1 is also the channel dimension of the image placeholder.
f1 = 3
f2 = 64
f3 = 128
f4 = 256
f5 = 512

# Spatial sizes used in the deconv `output_shape` arguments; s1 is also the
# height/width of the image placeholder.
s1 = 64
s2 = 32
s3 = 16
s4 = 8
s5 = 4

In [None]:
# Module-level bookkeeping read and updated by next_batch().
# Number of times next_batch() has wrapped around the training data.
epochs_completed = 0
# Offset into the training lists at which the next batch starts.
index_in_epoch = 0
# Total number of training pairs; next_batch() wraps once this is exceeded.
num_examples = len(train_face)

def next_batch(batch_size):
    """Return the next ``batch_size`` (face, voice) training pairs.

    Batches are consecutive slices of the global ``train_face`` /
    ``train_voice`` lists. When the requested slice would run past
    ``num_examples``, the epoch counter is incremented, both lists are
    re-ordered with one shared random permutation, and the batch is taken
    from the start of the re-ordered data (the unread tail of the previous
    order is dropped).

    Args:
        batch_size: number of pairs to return; must not exceed
            ``num_examples`` (asserted when an epoch wraps).

    Returns:
        A tuple ``(faces, voices)`` of two list slices of equal length.
    """
    global train_face, train_voice, index_in_epoch, epochs_completed

    start = index_in_epoch
    index_in_epoch = start + batch_size

    if index_in_epoch > num_examples:
        # The current ordering is exhausted: count the epoch and reshuffle
        # faces and voices with the same permutation so pairs stay aligned.
        epochs_completed += 1
        order = np.arange(num_examples)
        np.random.shuffle(order)
        train_face = [train_face[j] for j in order]
        train_voice = [train_voice[j] for j in order]
        # Restart from the beginning of the new ordering.
        start, index_in_epoch = 0, batch_size
        assert batch_size <= num_examples

    stop = index_in_epoch
    return train_face[start:stop], train_voice[start:stop]


In [None]:
def denormalize4gan(im):
    """Convert an image array from the [-1, 1] range to uint8 pixels in [0, 255].

    Differences from the previous implementation:
      * the input array is no longer modified in place -- a new array is
        returned and the caller's data is left untouched;
      * the scale factor is 127.5, so that 1.0 maps to 255 (a factor of 127
        topped out at 254);
      * values are clipped to [0, 255] before the cast, so inputs slightly
        outside [-1, 1] saturate instead of wrapping around in uint8.

    Args:
        im: array-like of values nominally in [-1, 1].

    Returns:
        A new ``numpy.uint8`` array with the same shape as ``im``.
    """
    scaled = (np.asarray(im, dtype=np.float32) + 1.0) * 127.5  # in [0, 255]
    return np.clip(scaled, 0.0, 255.0).astype(np.uint8)

def lrelu(x, alpha=0.2):
    """Leaky ReLU: the element-wise maximum of ``x`` and ``alpha * x``.

    Args:
        x: input tensor.
        alpha: multiplier applied to ``x`` for the second operand.

    Returns:
        ``tf.maximum(x, alpha * x)``.
    """
    leaked = alpha * x
    return tf.maximum(x, leaked)

def conv2d(x, features, kernel=[4,4], strides=[1,2,2,1], name="conv_layer"):
    """2-D convolution with 'SAME' padding followed by a bias add.

    Args:
        x: input tensor passed to ``tf.nn.conv2d``.
        features: ``[in_channels, out_channels]``; appended to ``kernel`` to
            form the filter shape, and its last entry sizes the bias.
        kernel: ``[height, width]`` of the filter.
        strides: strides passed through to ``tf.nn.conv2d``.
        name: variable scope under which "weights" and "bias" are created.

    Returns:
        The convolution output with the bias added.
    """
    with tf.variable_scope(name):
        filt = weight(shape=kernel + features, name="weights")
        offset = bias(shape=[features[-1]], name="bias")
        convolved = tf.nn.conv2d(x, filt, strides=strides, padding='SAME')
        return tf.nn.bias_add(convolved, offset)

def deconv2d(x, features, output_shape, kernel=[4,4], strides=[1,2,2,1], name="deconv_layer"):
    """Transposed 2-D convolution with 'SAME' padding followed by a bias add.

    Args:
        x: input tensor passed to ``tf.nn.conv2d_transpose``.
        features: appended to ``kernel`` to form the filter shape; its first
            entry sizes the bias.
        output_shape: output shape handed to ``tf.nn.conv2d_transpose``.
        kernel: ``[height, width]`` of the filter.
        strides: strides passed through to ``tf.nn.conv2d_transpose``.
        name: variable scope under which "weights" and "bias" are created.

    Returns:
        The biased result, reshaped to the shape reported by the transposed
        convolution output's ``get_shape()``.
    """
    with tf.variable_scope(name):
        filt = weight(shape=kernel + features, name="weights")
        offset = bias(shape=[features[0]], name="bias")
        upsampled = tf.nn.conv2d_transpose(x, filt, output_shape=output_shape, strides=strides, padding='SAME')
        shifted = tf.nn.bias_add(upsampled, offset)
        return tf.reshape(shifted, upsampled.get_shape())

def bias(shape, name):
    """Get (or create) the variable ``name`` of ``shape``, initialised to 0.0."""
    zero_init = tf.constant_initializer(0.0)
    return tf.get_variable(name, shape, initializer=zero_init)

def weight(shape, name):
    """Get (or create) the variable ``name`` of ``shape`` with a Glorot-uniform initializer."""
    glorot_init = tf.glorot_uniform_initializer()
    return tf.get_variable(name, shape, initializer=glorot_init)

def dense(x, shape, name):
    """Fully connected layer: ``x @ weights + bias``.

    Args:
        x: input tensor for ``tf.matmul``.
        shape: shape of the weight matrix; its last entry sizes the bias.
        name: variable scope under which "weights" and "bias" are created.

    Returns:
        ``tf.matmul(x, weights) + bias``.
    """
    with tf.variable_scope(name):
        kernel = weight(shape, name="weights")
        offset = bias([shape[-1]], name="bias")
        projected = tf.matmul(x, kernel)
        return projected + offset

def batch_norm(inputs, decay=0.9, epsilon=0.00001, scale=True, isTrain=True, name="batch_norm"):
    """Thin wrapper around ``tf.contrib.layers.batch_norm``.

    ``isTrain`` is forwarded as ``is_training`` and ``name`` as ``scope``;
    ``updates_collections`` is always ``None``.

    Args:
        inputs: tensor to normalise.
        decay: forwarded as ``decay``.
        epsilon: forwarded as ``epsilon``.
        scale: forwarded as ``scale``.
        isTrain: forwarded as ``is_training``.
        name: forwarded as ``scope``.

    Returns:
        The result of ``tf.contrib.layers.batch_norm``.
    """
    options = dict(
        decay=decay,
        scale=scale,
        epsilon=epsilon,
        updates_collections=None,
        is_training=isTrain,
        scope=name,
    )
    return tf.contrib.layers.batch_norm(inputs, **options)


In [None]:
def discriminator(images, voices, batch_size, reuse):
    """Build the voice-conditioned discriminator inside scope "discriminator".

    Four stride-2 convolutions reduce the image; the voice vector is passed
    through a dense layer, batch-normalised, tiled to a 4x4 grid and
    concatenated onto the image features along the channel axis before a
    final stride-1 convolution and a dense layer with a single output.

    Args:
        images: image tensor fed to the first convolution.
        voices: voice tensor fed to a ``[128, 128]`` dense layer.
        batch_size: first dimension used when flattening the conv features.
        reuse: when truthy, the scope's existing variables are reused.

    Returns:
        ``(logits, tf.nn.sigmoid(logits))``.
    """
    with tf.variable_scope("discriminator") as scope:
        if reuse:
            scope.reuse_variables()

        # First conv layer has no batch norm.
        output = lrelu(conv2d(images, features=[f1, f2], name="d_conv_layer_1"))

        # Conv layers 2-4: conv -> batch norm -> leaky ReLU.
        conv_channels = [(f2, f3), (f3, f4), (f4, f5)]
        for layer_no, (c_in, c_out) in enumerate(conv_channels, start=2):
            output = conv2d(output, features=[c_in, c_out], name="d_conv_layer_%d" % layer_no)
            output = batch_norm(output, isTrain=True, name="d_batch_norm_%d" % layer_no)
            output = lrelu(output)

        # Voice branch: dense -> batch norm -> leaky ReLU, then insert two
        # singleton axes and tile over a 4x4 grid.
        voice_code = dense(voices, shape=[128,128], name="d_dense_voice")
        voice_code = batch_norm(voice_code, isTrain=True, name='d_batch_norm_5')
        voice_code = lrelu(voice_code)
        voice_code = tf.expand_dims(voice_code, 1)
        voice_code = tf.expand_dims(voice_code, 2)
        tiled_voice = tf.tile(voice_code, [1,4,4,1], name='d_tiled_voice_embeddings')

        # Fuse image and voice features along the channel axis.
        output = tf.concat([output, tiled_voice], 3, name='d_concat')
        output = conv2d(output, features=[f5+128, f5], strides=[1,1,1,1], name="d_conv_layer_5")
        output = batch_norm(output, isTrain=True, name="d_batch_norm_8")
        output = lrelu(output)

        flat = tf.reshape(output, [batch_size, -1])
        logits = dense(flat, [s5*s5*f5, 1], name="d_dense_2")
        return logits, tf.nn.sigmoid(logits)


In [None]:
def sampler(voice_embeddings, batch_size):
    """Build an inference-mode copy of the generator network.

    The layers are created inside the "generator" variable scope with reuse
    switched on, so the variables must already exist in that scope. Every
    batch-norm layer is built with ``isTrain=False``.

    Args:
        voice_embeddings: voice tensor fed to a ``[128, 128]`` dense layer.
        batch_size: first dimension of every deconv ``output_shape``.

    Returns:
        The tanh-activated output of the last deconv layer, whose
        ``output_shape`` is ``[batch_size, s1, s1, f1]``.
    """
    with tf.variable_scope("generator") as scope:
        scope.reuse_variables()

        projected = dense(voice_embeddings, shape=[128,128], name="g_dense_voice")

        # Project to s5*s5*f5 values and fold them into an s5 x s5 x f5 map.
        output = dense(projected, shape=[128, s5*s5*f5], name="g_dense_1")
        output = tf.nn.relu(batch_norm(output, isTrain=False, name="g_batch_norm_0"))
        output = tf.reshape(output, [-1, s5, s5, f5])

        # Deconv layers 1-3: deconv -> batch norm -> ReLU.
        # Each entry is (layer number, out channels, in channels, output size).
        stages = [(1, f4, f5, s4), (2, f3, f4, s3), (3, f2, f3, s2)]
        for layer_no, c_out, c_in, size in stages:
            output = deconv2d(output, features=[c_out, c_in],
                              output_shape=[batch_size, size, size, c_out],
                              name="g_deconv_layer_%d" % layer_no)
            output = batch_norm(output, isTrain=False, name="g_batch_norm_%d" % layer_no)
            output = tf.nn.relu(output)

        # Final deconv to the image shape, squashed with tanh.
        output = deconv2d(output, features=[f1, f2], output_shape=[batch_size,s1,s1,f1], name="g_deconv_layer_4")
        return tf.nn.tanh(output)


In [None]:
def generator(voice_embeddings, batch_size):
    """Build the training-mode generator inside variable scope "generator".

    The voice vector goes through two dense layers, is folded into an
    s5 x s5 x f5 feature map and then passed through four transposed
    convolutions; the last one has ``output_shape=[batch_size, s1, s1, f1]``
    and is followed by tanh.

    Every batch-norm layer, including ``g_batch_norm_0``, is built with
    ``isTrain=True``. Previously ``g_batch_norm_0`` alone used
    ``isTrain=False`` while the other generator batch-norm layers used
    ``True``; that inconsistency meant the first layer never normalised with
    batch statistics during training.

    Args:
        voice_embeddings: voice tensor fed to a ``[128, 128]`` dense layer.
        batch_size: first dimension of every deconv ``output_shape``; the
            graph is therefore tied to this batch size.

    Returns:
        The tanh-activated output of the last deconv layer.
    """
    with tf.variable_scope("generator"):
        projected = dense(voice_embeddings, shape=[128,128], name="g_dense_voice")

        # Project to s5*s5*f5 values and fold them into an s5 x s5 x f5 map.
        output = dense(projected, shape=[128, s5*s5*f5], name="g_dense_1")
        output = batch_norm(output, isTrain=True, name="g_batch_norm_0")
        output = tf.nn.relu(output)
        output = tf.reshape(output, [-1, s5, s5, f5])

        # Deconv layers 1-3: deconv -> batch norm -> ReLU.
        # Each entry is (layer number, out channels, in channels, output size).
        stages = [(1, f4, f5, s4), (2, f3, f4, s3), (3, f2, f3, s2)]
        for layer_no, c_out, c_in, size in stages:
            output = deconv2d(output, features=[c_out, c_in],
                              output_shape=[batch_size, size, size, c_out],
                              name="g_deconv_layer_%d" % layer_no)
            output = batch_norm(output, isTrain=True, name="g_batch_norm_%d" % layer_no)
            output = tf.nn.relu(output)

        # Final deconv to the image shape, squashed with tanh.
        output = deconv2d(output, features=[f1, f2], output_shape=[batch_size,s1,s1,f1], name="g_deconv_layer_4")
        return tf.nn.tanh(output)


In [None]:
# ---------------------------------------------------------------------------
# Graph construction
# ---------------------------------------------------------------------------
tf.reset_default_graph()

# NOTE(review): this placeholder rebinds the name `voice_embeddings`, which the
# data-preparation cell uses for the list of loaded embeddings; re-running that
# cell after this one swaps the binding back.
voice_embeddings = tf.placeholder(tf.float32, shape=[None, 128], name="voice_embeddings")
real_images = tf.placeholder(tf.float32, shape=[None, s1, s1, f1], name="real_input")
real_images_flat = tf.reshape(real_images, [-1, s1*s1*f1])

fake_images = generator(voice_embeddings, batch_size=BATCH_SIZE)
fake_images_flat = tf.reshape(fake_images, [-1, s1*s1*f1])

real_img_real_label_disc_logits, real_disc_real = discriminator(real_images, voice_embeddings, batch_size=BATCH_SIZE, reuse=False)
# NOTE(review): this branch receives exactly the same (image, voice) pair as
# the one above but is trained towards the opposite label further down. The
# name suggests it was meant to see real images paired with a *mismatched*
# voice embedding -- confirm and feed a separate placeholder if so.
real_img_fake_label_disc_logits, real_disc_fake = discriminator(real_images, voice_embeddings, batch_size=BATCH_SIZE, reuse=True)
sample = sampler(voice_embeddings, batch_size=BATCH_SIZE)
fake_disc_logits, fake_disc = discriminator(fake_images, voice_embeddings, batch_size=BATCH_SIZE, reuse=True)


def _soft_zeros(reference):
    """Labels of 0 plus uniform noise in [0, 0.3), shaped like ``reference``."""
    return tf.zeros_like(reference) + tf.random_uniform(minval=0, maxval=0.3, shape=tf.shape(reference))


def _soft_ones(reference):
    """Labels of 1 minus uniform noise in [0, 0.3), shaped like ``reference``."""
    return tf.ones_like(reference) - tf.random_uniform(minval=0, maxval=0.3, shape=tf.shape(reference))


# Per-example reconstruction term: sigmoid cross-entropy that treats the
# generated pixels as logits and the real pixels as targets, summed over all
# pixels. This is the library form of the expression
#   max(x, 0) - x * z + log(1 + exp(-|x|)).
generation_loss = tf.reduce_sum(
    tf.nn.sigmoid_cross_entropy_with_logits(logits=fake_images_flat, labels=real_images_flat), 1)

# Label convention used throughout: ~0 for "real", ~1 for "fake".
g_loss_1 = tf.nn.sigmoid_cross_entropy_with_logits(logits=fake_disc_logits, labels=_soft_zeros(fake_disc))
# Sum of the two means. The earlier form added a [B, 1] tensor to a [B]
# tensor, which broadcast to [B, B] before averaging; the value is the same.
g_loss = tf.reduce_mean(g_loss_1) + tf.reduce_mean(generation_loss)

d_loss_real_real = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
    logits=real_img_real_label_disc_logits, labels=_soft_zeros(real_disc_real)))
d_loss_real_fake = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
    logits=real_img_fake_label_disc_logits, labels=_soft_ones(real_disc_fake)))
d_loss_fake = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
    logits=fake_disc_logits, labels=_soft_ones(fake_disc)))

d_loss = d_loss_fake + (d_loss_real_real + d_loss_real_fake)/2

# Split the trainable variables by the top-level scope they were created in,
# rather than by a substring match on the layer names.
t_vars = tf.trainable_variables()
d_vars = [var for var in t_vars if var.name.startswith('discriminator/')]
g_vars = [var for var in t_vars if var.name.startswith('generator/')]

d_optim = tf.train.AdamOptimizer(0.0002, beta1=0.5).minimize(d_loss, var_list=d_vars)
g_optim = tf.train.AdamOptimizer(0.0001, beta1=0.5).minimize(g_loss, var_list=g_vars)

# ---------------------------------------------------------------------------
# Training and sampling
# ---------------------------------------------------------------------------
NUM_EPOCHS = 20
SAMPLE_DIR = 'test_gan_sample_1/'

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(NUM_EPOCHS):
        dl, gl = [], []
        for i in range(n_batches):
            images_embedding, voice_embedding = next_batch(BATCH_SIZE)
            batch_feed = {real_images: images_embedding, voice_embeddings: voice_embedding}

            # Update D network once.
            _, DLOSS = sess.run([d_optim, d_loss], feed_dict=batch_feed)

            # Update G network twice per discriminator update; the loss of the
            # second update is the one that gets logged.
            for _ in range(2):
                _, GLOSS = sess.run([g_optim, g_loss], feed_dict=batch_feed)

            dl.append(DLOSS)
            gl.append(GLOSS)
        print('discriminator_loss / generator_loss => %.2f / %.2f for step %d'%(np.mean(dl), np.mean(gl), epoch))

    # Generate one face per test voice embedding, in full batches only: the
    # graph is built for exactly BATCH_SIZE examples.
    if not os.path.isdir(SAMPLE_DIR):
        os.makedirs(SAMPLE_DIR)
    j = 0
    # The "+ 1" includes the final full batch when the test-set size is an
    # exact multiple of BATCH_SIZE.
    for k in range(0, len(test_voice) - BATCH_SIZE + 1, BATCH_SIZE):
        voice_embedding = test_voice[k:k + BATCH_SIZE]
        # `sample` depends only on the voice placeholder, so no images are fed.
        fake_image = sess.run(sample, feed_dict={voice_embeddings: voice_embedding})
        fake_image = denormalize4gan(fake_image)
        for image in fake_image:
            plt.imsave(SAMPLE_DIR + str(j) + '.png', image)
            j += 1