# Neural Style Transfer 

In [1]:
import os
import sys
import scipy.io
import scipy.misc
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow
#from nst_utils import *
import numpy as np
import tensorflow as tf
from pydub import AudioSegment

ImportError: No module named tensorflow

## Computing the content cost

In [None]:
def compute_content_cost(a_C, a_G):
    """
    Computes the content cost between two sets of layer activations.

    Arguments:
    a_C -- tensor of dimension (1, n_H, n_W, n_C), hidden layer activations representing content of the image C
    a_G -- tensor of dimension (1, n_H, n_W, n_C), hidden layer activations representing content of the image G

    Returns:
    J_content -- scalar tensor equal to sum((a_C - a_G)^2) / (4 * n_H * n_W * n_C)
    """

    # Static shape of a_G. `shape` is a property on a tensor, so it has to be
    # read through get_shape(); calling it (`a_G.shape()`) raises a TypeError.
    _, n_H, n_W, n_C = a_G.get_shape().as_list()

    # Unroll the spatial dimensions and put channels first: (m, n_C, n_H*n_W).
    # -1 lets TensorFlow infer the batch dimension.
    unrolled_shape = [-1, n_H * n_W, n_C]
    a_C_unrolled = tf.transpose(tf.reshape(a_C, unrolled_shape), perm=[0, 2, 1])
    a_G_unrolled = tf.transpose(tf.reshape(a_G, unrolled_shape), perm=[0, 2, 1])

    # The dimensions are plain Python ints, so the normalisation constant is
    # computed in Python rather than with int32 tensors and dtype casts.
    scale = 1.0 / (4.0 * n_H * n_W * n_C)

    squared_error = tf.reduce_sum(tf.square(tf.subtract(a_C_unrolled, a_G_unrolled)))
    J_content = scale * squared_error

    return J_content

## Computing the style cost

#### Style matrix (Gram matrix)

In [None]:
def gram_matrix(A):
    """
    Builds the Gram (style) matrix of a set of unrolled activations.

    Argument:
    A -- matrix of shape (n_C, n_H*n_W)

    Returns:
    GA -- Gram matrix of A, i.e. A multiplied by its transpose, of shape (n_C, n_C)
    """

    # Entry (i, j) of the product is the dot product of rows i and j of A.
    A_transposed = tf.transpose(A)

    return tf.matmul(A, A_transposed)

#### Style cost

In [None]:
def compute_layer_style_cost(a_S, a_G):
    """
    Computes the style cost of a single layer from the Gram matrices of the
    style and generated activations.

    Arguments:
    a_S -- tensor of dimension (1, n_H, n_W, n_C), hidden layer activations representing style of the image S
    a_G -- tensor of dimension (1, n_H, n_W, n_C), hidden layer activations representing style of the image G

    Returns:
    J_style_layer -- tensor representing a scalar value,
                     sum((GS - GG)^2) / (4 * n_C^2 * (n_H*n_W)^2)
    """

    # Retrieve the static dimensions from a_G
    _, n_H, n_W, n_C = a_G.get_shape().as_list()

    # Unroll to (m, n_C, n_H*n_W); -1 lets TensorFlow infer the batch dimension
    unrolled_shape = [-1, n_H * n_W, n_C]
    a_S = tf.transpose(tf.reshape(a_S, unrolled_shape), perm=[0, 2, 1])
    a_G = tf.transpose(tf.reshape(a_G, unrolled_shape), perm=[0, 2, 1])

    # Gram matrices of the first (only) example of each batch
    GS = gram_matrix(a_S[0])
    GG = gram_matrix(a_G[0])

    # The normalisation constant is computed with Python numbers. Building it
    # from int32 tensors silently overflows once 4 * (n_H*n_W)^2 * n_C^2
    # exceeds 2^31 - 1, which happens for large feature maps.
    scale = 1.0 / (4.0 * (n_H * n_W) ** 2 * n_C ** 2)

    J_style_layer = scale * tf.reduce_sum(tf.square(tf.subtract(GS, GG)))

    return J_style_layer

#### Compute style cost of several layers

In [None]:
def compute_style_cost(model, STYLE_LAYERS):
    """
    Accumulates the weighted style cost over a set of layers.

    Note: this function reads the module-level TensorFlow session `sess`;
    it must exist before the function is called.

    Arguments:
    model -- dictionary mapping layer names to their output tensors
    STYLE_LAYERS -- A python list of (layer name, coefficient) pairs:
                        - the names of the layers we would like to extract style from
                        - a coefficient for each of them

    Returns:
    J_style -- tensor representing a scalar value, the sum over the layers of
               coefficient * layer style cost
    """

    total_style_cost = 0

    for name, weight in STYLE_LAYERS:

        layer_output = model[name]

        # Evaluating the layer now freezes the activations produced by
        # whatever is currently assigned to the model input (the style input).
        style_activations = sess.run(layer_output)

        # The same tensor is passed unevaluated as the "generated" activations:
        # it is only evaluated later, once the generated input has been
        # assigned to the model.
        layer_cost = compute_layer_style_cost(style_activations, layer_output)

        total_style_cost += weight * layer_cost

    return total_style_cost

## Total cost to optimize
Cost function that minimizes both content and style

In [None]:
def total_cost(J_content, J_style, alpha = 10, beta = 40):
    """
    Combines the content and style costs into a single weighted objective.

    Arguments:
    J_content -- content cost
    J_style -- style cost
    alpha -- weight applied to the content cost
    beta -- weight applied to the style cost

    Returns:
    J -- alpha * J_content + beta * J_style
    """

    weighted_content = alpha * J_content
    weighted_style = beta * J_style

    return weighted_content + weighted_style

## Optimizing the Neural Style Transfer
1. Create an Interactive Session
2. Load the content image
3. Load the style image
4. Randomly initialize the image to be generated
5. Load the pretrained CNN model
6. Build the TensorFlow graph:
7. Run the content spectrogram through the CNN model and compute the content cost
8. Run the style spectrogram through the CNN model and compute the style cost
9. Compute the total cost
10. Define the optimizer and the learning rate
11. Initialize the TensorFlow graph and run it for a large number of iterations, updating the generated image at every step.

#### Model NN
Function initializes the variables of the tensorflow graph, assigns the input image (initial generated image) as the input of the model, and runs the train_step for a large number of steps.

In [None]:
def model_nn(sess, input_spectrogram, num_iterations = 200):
    """
    Runs the optimisation loop and saves the resulting spectrogram.

    Besides its arguments, this function reads the module-level names
    `model`, `train_step`, `J`, `J_content` and `J_style`; they must be
    defined before it is called.

    Arguments:
    sess -- TensorFlow session used to run the graph
    input_spectrogram -- initial value assigned to model['input']
    num_iterations -- number of times train_step is run

    Returns:
    generated_spectrogram -- value of model['input'] after the last iteration;
                             also written to output/generated_spectrogram.npy
    """

    # Initialize global variables (you need to run the session on the initializer)
    sess.run(tf.global_variables_initializer())

    # Start the optimisation from the noisy input spectrogram
    sess.run(model['input'].assign(input_spectrogram))

    for i in range(num_iterations):

        # Run the session on the train_step to minimize the total cost
        sess.run(train_step)

        # Report the costs every 20 iterations
        if i % 20 == 0:
            Jt, Jc, Js = sess.run([J, J_content, J_style])
            print("Iteration " + str(i) + " :")
            print("total cost = " + str(Jt))
            print("content cost = " + str(Jc))
            print("style cost = " + str(Js))

    # Read the result once, after the loop: only the final value is used, and
    # this keeps the name bound even when num_iterations is 0.
    generated_spectrogram = sess.run(model['input'])

    # The result is a raw array saved under a .npy name, so it is written
    # with np.save (the previously called save_image is not defined in this file).
    output_dir = 'output'
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    np.save(os.path.join(output_dir, 'generated_spectrogram.npy'), generated_spectrogram)

    return generated_spectrogram

#### Load saved model
Returns the CNN model stored in a python dictionary where each variable name is the key and the corresponding value is a tensor containing that variable's value.

In [None]:
def load_cnn_model(path, spectrogram_shape):
    """
    Builds the convolutional part of the pretrained CNN as a TensorFlow graph.

    Layout of the stored model (conv<k>_1 sits at layer index 3 * (k - 1)):
    0 is conv1_1 (5, 5, 1, 32)
    1 is relu
    2 is maxpool
    3 is conv2_1 (3, 3, 32, 64)
    4 is relu
    5 is maxpool
    6 is conv3_1
    7 is relu
    8 is maxpool
    9 is conv4_1
    10 is relu
    11 is maxpool
    12 is conv5_1
    13 is relu
    14 is maxpool
    15 is conv6_1
    16 is relu
    17 is maxpool
    18 is conv7_1
    19 is relu
    20 is maxpool
    21 is conv8_1
    22 is relu
    23 is maxpool
    24 is conv9_1
    25 is relu
    26 is maxpool
    27 is conv10_1
    28 is relu
    29 is maxpool
    30 is flatten
    31 is fullyconnected
    32 is softmax

    NOTE(review): the filter shapes of conv3_1..conv10_1 were all listed as
    (3, 3, 32, 64), which cannot chain after a 64-channel conv2_1; they are
    omitted here -- TODO confirm against the .mat file.

    Arguments:
    path -- path of the .mat file holding the pretrained model
    spectrogram_shape -- (height, width) of the input spectrogram

    Returns:
    graph -- dictionary with the keys 'input', 'conv<k>_1' and 'maxpool<k>'
             (k = 1..10) mapped to the corresponding tensors. 'input' is a
             float32 variable of shape (1, height, width).
    """

    cnn = scipy.io.loadmat(path)
    cnn_layers = cnn['layers']

    def _weights(layer, expected_layer_name):
        """
        Return the weights and bias from the CNN model for a given layer.
        """
        wb = cnn_layers[0][layer][0][0][2]
        W = wb[0][0]
        b = wb[0][1]
        layer_name = cnn_layers[0][layer][0][0][0][0]
        # Guards against reading the weights of the wrong layer
        assert layer_name == expected_layer_name, \
            "layer %d is %r, expected %r" % (layer, layer_name, expected_layer_name)
        return W, b

    def _relu(conv2d_layer):
        """
        Return the RELU function wrapped over a TensorFlow layer. Expects a
        Conv2d layer input.
        """
        return tf.nn.relu(conv2d_layer)

    def _conv2d(prev_layer, layer, layer_name):
        """
        Return the Conv2D layer using the weights, biases from the CNN
        model at 'layer'.
        """
        W, b = _weights(layer, layer_name)
        W = tf.constant(W)
        b = tf.constant(np.reshape(b, (b.size)))
        return tf.nn.conv2d(prev_layer, filter=W, strides=[1, 1, 1, 1], padding='SAME') + b

    def _conv2d_relu(prev_layer, layer, layer_name):
        """
        Return the Conv2D + RELU layer using the weights, biases from the CNN
        model at 'layer'.
        """
        return _relu(_conv2d(prev_layer, layer, layer_name))

    def _maxpool(prev_layer):
        """
        Return the MaxPooling layer.
        """
        return tf.nn.max_pool(prev_layer, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

    # The model is stored in a python dictionary where each variable name is the key
    # and the corresponding value is a tensor containing that variable's value.
    graph = {}
    graph['input'] = tf.Variable(np.zeros((1, spectrogram_shape[0], spectrogram_shape[1])), dtype = 'float32')

    # tf.nn.conv2d needs a rank-4 (batch, height, width, channels) input, so a
    # single-channel axis is appended. The variable itself stays rank-3 so that
    # callers keep assigning (1, height, width) arrays to it.
    prev_layer = tf.expand_dims(graph['input'], -1)

    # Ten identical conv + relu + maxpool blocks. Each block occupies three
    # consecutive entries of the stored model, hence the stride of 3.
    for block in range(1, 11):
        conv_name = 'conv%d_1' % block
        pool_name = 'maxpool%d' % block
        graph[conv_name] = _conv2d_relu(prev_layer, 3 * (block - 1), conv_name)
        graph[pool_name] = _maxpool(graph[conv_name])
        prev_layer = graph[pool_name]

    return graph

#### Generate noise spectrogram

In [None]:
# Fallback used when no CONFIG object is available.
# NOTE(review): 0.6 is presumably the NOISE_RATIO of the nst_utils module whose
# import is commented out at the top of this file -- TODO confirm.
_DEFAULT_NOISE_RATIO = 0.6


def generate_noise_spectrogram(content_spectrogram, noise_ratio = None):
    """
    Generates a noisy spectrogram by blending uniform random noise with the
    content_spectrogram.

    Arguments:
    content_spectrogram -- array whose first two dimensions are used as (height, width)
    noise_ratio -- weight of the noise in the blend. When None, CONFIG.NOISE_RATIO
                   is used if a CONFIG object is defined, otherwise 0.6. The
                   default is resolved at call time, so defining this function
                   no longer fails when CONFIG does not exist.

    Returns:
    input_spectrogram -- noise * noise_ratio + content_spectrogram * (1 - noise_ratio),
                         with a leading batch dimension of 1 coming from the noise
    """
    if noise_ratio is None:
        try:
            noise_ratio = CONFIG.NOISE_RATIO
        except NameError:
            noise_ratio = _DEFAULT_NOISE_RATIO

    shape_spectrogram = content_spectrogram.shape

    # Uniform noise in [-20, 20) with a leading batch dimension
    noise_spectrogram = np.random.uniform(
        -20, 20, (1, shape_spectrogram[0], shape_spectrogram[1])).astype('float32')

    # Weighted average of the noise and the content spectrogram
    input_spectrogram = noise_spectrogram * noise_ratio + content_spectrogram * (1 - noise_ratio)

    return input_spectrogram


#### Convert audio to spectrogram

In [None]:
def parse_out_segment(audio_clip, t1, t2):
    """
    Returns the part of an audio clip between two positions.

    Arguments:
    audio_clip -- sliceable audio object (the callers in this file pass a pydub
                  AudioSegment, which is sliced in milliseconds)
    t1 -- start of the segment
    t2 -- end of the segment (exclusive)

    Returns:
    audio_segment -- audio_clip[t1:t2]
    """

    # A direct slice replaces the former "first t2, then last t2 - t1" two-step.
    # That approach returned the wrong window whenever t2 ran past the end of
    # the clip, and the whole prefix when t1 == t2 (a slice of [-0:]).
    return audio_clip[t1:t2]

In [None]:
def segment_audio_clip(audio_file_name, wav_name):
    """
    Splits a WAV file into consecutive, non-overlapping 500 ms segments.

    Arguments:
    audio_file_name -- path of the WAV file to read
    wav_name -- prefix used to build the key of every segment

    Returns:
    segment_dict -- dictionary mapping '<wav_name>_<i>' to the i-th segment.
                    A trailing remainder shorter than one segment is dropped.
    """

    # Segment length in milliseconds
    segment_length = 500

    # Read the audio file
    audio_clip = AudioSegment.from_wav(audio_file_name)

    # Number of whole segments that fit in the clip
    audio_duration_ms = (audio_clip.duration_seconds) * 1000
    num_segments = int(audio_duration_ms / segment_length)

    segment_dict = {}

    for i in range(num_segments):
        key = wav_name + '_' + str(i)
        # Segments are laid end to end. Stepping by a fixed 1000 ms while
        # counting 500 ms segments pushed half of the windows past the end of
        # the clip.
        start = i * segment_length
        segment_dict[key] = parse_out_segment(audio_clip, start, start + segment_length)

    return segment_dict

In [None]:
def export_audio_segments(audio_segments_dict, wav_file_name, accent_id):
    """
    Writes every audio segment of a dictionary to its own WAV file.

    Arguments:
    audio_segments_dict -- dictionary mapping a segment key to an audio segment
                           exposing export(file_name, format=...)
    wav_file_name -- unused
    accent_id -- prefix of the '<accent_id>_spectrograms' export location

    Returns:
    nothing
    """

    for segment_key, segment in audio_segments_dict.items():
        # The backslash separator is what convert_audio_to_spectrograms looks
        # for when it globs the exported files; keep the two in sync.
        target = accent_id + '_spectrograms\\' + segment_key + '.wav'

        segment.export(target, format="wav")

In [None]:
def convert_audio_to_spectrograms(accent_id, flag_pitch_shift=False):
    """
    Converts every exported WAV segment of an accent into a magnitude spectrogram.

    Arguments:
    accent_id -- prefix of the '<accent_id>_spectrograms' location to scan
    flag_pitch_shift -- unused; optional so that callers passing only
                        accent_id keep working

    Returns:
    spectrogram_list -- list of arrays of shape (205, 110), one per WAV file,
                        ordered by file name
    """
    # Neither module is imported at the top of the file, so they are imported here.
    import glob
    import librosa

    # Raw string: the backslash is the literal separator written by
    # export_audio_segments, not an escape sequence.
    wav_pattern = accent_id + r'_spectrograms\*.wav'

    spectrogram_list = []

    # sorted() makes the order of the result deterministic
    for wav_file in sorted(glob.glob(wav_pattern)):

        # Load the samples; the sampling rate is not needed
        samples, _ = librosa.load(wav_file)

        # Magnitude of the short-time Fourier transform
        D_magnitude = np.abs(librosa.stft(samples))

        # NOTE(review): the fixed (205, 110) target only works when the STFT has
        # exactly 22550 values -- presumably a 500 ms segment with librosa's
        # default settings (1025 bins x 22 frames); TODO confirm.
        spectrogram_list.append(np.reshape(D_magnitude, (205, 110)))

    return spectrogram_list

In [None]:
def convert_spectrogram(input_wav_file, wav_name=None, accent_id=None):
    """
    Converts a WAV file into a list of spectrograms, one per audio segment.

    The file is segmented, every segment is exported as its own WAV file, and
    the exported files are converted to spectrograms.

    Arguments:
    input_wav_file -- path of the WAV file to convert
    wav_name -- prefix of the segment names; defaults to the file name of
                input_wav_file without its extension
    accent_id -- prefix of the '<accent_id>_spectrograms' export location;
                 defaults to wav_name

    Returns:
    spectrogram_list -- list of spectrogram arrays
    """
    # These names used to be read from undefined variables; they are now
    # derived from the input path unless given explicitly.
    if wav_name is None:
        wav_name = os.path.splitext(os.path.basename(input_wav_file))[0]
    if accent_id is None:
        accent_id = wav_name

    #--------------------------------------------------
    # SEGMENT WAV FILE
    #--------------------------------------------------
    audio_segments_dict = segment_audio_clip(input_wav_file, wav_name)

    #--------------------------------------------------
    # EXPORT AS WAV FILES
    #--------------------------------------------------
    export_audio_segments(audio_segments_dict, wav_name, accent_id)

    #--------------------------------------------------
    # CONVERT TO SPECTROGRAM
    #--------------------------------------------------
    # The pitch-shift flag is passed explicitly because it is a required
    # positional parameter of convert_audio_to_spectrograms.
    spectrogram_list = convert_audio_to_spectrograms(accent_id, False)

    return spectrogram_list

## Main function

In [None]:
if __name__ == "__main__":

    # Everything below stays at module level on purpose: compute_style_cost
    # reads the global `sess`, and model_nn reads the globals `model`,
    # `train_step`, `J`, `J_content` and `J_style`.

    # Layers used for the style cost, with their weights
    STYLE_LAYERS = [
        ('conv5_1', 0.05),
        ('conv6_1', 0.30),
        ('conv7_1', 0.30),
        ('conv8_1', 0.30),
        ('conv9_1', 0.05)]

    # Layer used for the content cost. The graph built by load_cnn_model only
    # has 'conv<k>_1' layers, so the former 'conv3_2' key could not be found.
    CONTENT_LAYER = 'conv3_1'

    # Reset the graph
    tf.reset_default_graph()

    # Start interactive session
    sess = tf.InteractiveSession()

    # Load the "content" spectrogram. convert_spectrogram returns one
    # spectrogram per audio segment; the first segment is used.
    # TODO: fill in the path of the content WAV file
    content_spectrogram = convert_spectrogram('')[0]

    # Load the "style" spectrogram
    # TODO: fill in the path of the style WAV file
    style_spectrogram = convert_spectrogram('')[0]

    # Generate noisy spectrogram
    generated_spectrogram = generate_noise_spectrogram(content_spectrogram)

    # Load the model. This has to happen before model['input'] is used.
    # TODO : change this
    model = load_cnn_model("pretrained_cnn_model.mat", content_spectrogram.shape)

    # Assign the content spectrogram to be the input of the CNN model.
    # model['input'] has a leading batch dimension of 1, so one is added.
    sess.run(model['input'].assign(content_spectrogram[np.newaxis, ...]))

    # Select the output tensor of the content layer
    out = model[CONTENT_LAYER]

    # Set a_C to be the hidden layer activation from the layer we have selected
    a_C = sess.run(out)

    # Set a_G to be the hidden layer activation from same layer. Here, a_G references model[CONTENT_LAYER]
    # and isn't evaluated yet. Later in the code, we'll assign the generated spectrogram as the model input,
    # so that when we run the session, this will be the activations drawn from the appropriate layer.
    a_G = out

    # Compute the content cost
    J_content = compute_content_cost(a_C, a_G)

    # Assign the input of the model to be the "style" spectrogram
    sess.run(model['input'].assign(style_spectrogram[np.newaxis, ...]))

    # Compute the style cost
    J_style = compute_style_cost(model, STYLE_LAYERS)

    J = total_cost(J_content, J_style, 10, 40)

    # Define optimizer
    optimizer = tf.train.AdamOptimizer(2.0)

    # Define train_step
    train_step = optimizer.minimize(J)

    model_nn(sess, generated_spectrogram)