# NEURAL STYLE TRANSFER WITH UNEVEN MODELLING
In other words, we shall use one CNN model's layers (melspectrogram-based) for content loss, and another CNN model's layers (MFCC based) for style loss and try to combine them to produce an overall gradient fo the target melspectrogram.

In [8]:
import import_ipynb
N = import_ipynb.NotebookLoader(path=['.'])
N.load_module("ImportsForNeuralStyleTransfer")
from ImportsForNeuralStyleTransfer import *

importing Jupyter notebook from .\ImportsForNeuralStyleTransfer.ipynb
importing Jupyter notebook from genreClassification\ImportsForAudioHandling.ipynb
importing Jupyter notebook from genreClassification\ImportsForModelHandling.ipynb


# Initialising audio data handler

In [3]:
audioDataHandler = AudioDataHandler(
    content_path='AUDIO',
    style_path='AUDIO',
    sr=22050,
    n_fft=1024,
    hop_length=256,
    segment_size=431,
    n_mels=384,
    n_mfcc=40)

# Initialising models

In [4]:
# Initialising models:
model_1 = get_cnn_5_sec_melspectrogram(input_shape=(audioDataHandler.n_mels, audioDataHandler.segment_size, 1))
model_2 = get_cnn_5_sec_mfcc(input_shape=(audioDataHandler.n_mfcc, audioDataHandler.segment_size, 1))

# Viewing model architectures:
model_1.summary()
model_2.summary()

# Loading pretrained weights:
load_model(model_1, 'genreClassification/storage/trainedModelWeights/genreClassificationTrainedModelWeights_sr-22050_n_fft-1024_hop_length-256_n_mels-384_bestValidationAccuracy.npy')
load_model(model_2, 'genreClassification/storage/trainedModelWeights/genreClassificationTrainedModelWeights_sr-22050_n_fft-1024_hop_length-256_n_mfcc-40_bestValidationAccuracy.npy')

In [5]:
content_layers, style_layers = [0, 2, 4, 6], [8, 10]

# Instantiating NST class to access NST functions:
nst = NST(content_layers=content_layers,
          style_layers=style_layers,
          model=model_1,
          content_weight=float(0.1) / len(content_layers),
          style_weight=float(10) / len(style_layers))

# Displaying parameters:
nst.display_params()

content_weight: 0.025
style_weight: 5.0
feature_extractor: <Functional name=functional_3, built=True>
content_layers: ['identity', 'conv2d', 'conv2d_1', 'conv2d_2']
style_layers: ['conv2d_3', 'conv2d_4']


# Neural style transfer

Defining customised loss functions...

In [6]:
def get_total_content_loss(nst, content, target_for_content):
    # Getting content, style and target features:
    content_features = nst.content_feature_extractor(content)
    target_features = nst.content_feature_extractor(target_for_content)

    
    # Initialising loss value:
    loss = tf.zeros(shape=())

    for layer in nst.content_layers:
        layer_output_for_content = content_features[layer]
        layer_output_for_target = target_features[layer]
        loss += nst.content_weight * nst.get_content_loss(layer_output_for_content, layer_output_for_target)

    return loss

#================================================
def get_total_style_loss(nst, style, target_for_style):
    # Getting content, style and target features:
    style_features = nst.style_feature_extractor(style)
    target_features = nst.style_feature_extractor(target_for_style)
    
    # Initialising loss value:
    loss = tf.zeros(shape=())

    for layer in nst.style_layers:
        layer_output_for_style = style_features[layer]
        layer_output_for_target = target_features[layer]
        loss += nst.style_weight * nst.get_style_loss(layer_output_for_style, layer_output_for_target)

    return loss

#================================================
def get_loss_and_grads_customised(nst, audioDataHandler, content, style, target):
    # NOTE: Each of target, content and style are arrays of segments of the melspectrogram of a particular audio file

    with tf.GradientTape() as tape:
        content_loss = get_total_content_loss(nst, content, target)
    # Obtaining the gradients of content loss:
    grads_for_content = tape.gradient(content_loss, target)

    target = tf.Variable(audioDataHandler.melspectrogram_to_mfccs(target))
    with tf.GradientTape() as tape:
        style_loss = get_total_style_loss(nst, style, target)
    # Obtaining the gradients of style loss:
    grads_for_style = tape.gradient(style_loss, target)
    grads_for_style = audioDataHandler.mfccs_to_melspectrogram(grads_for_style)
    
    return content_loss + style_loss, grads_for_content + grads_for_style

## Initialising data

Getting audio name references...

In [9]:
get_audio_file_references('AUDIO')

ID	 | Content name
------------------------------------------------
0	 | CONTENT - Bach - Aria Variata BWV 989 Variation 1 - Brendan Kinsella - Chosic.mp3
1	 | CONTENT - Bach - Goldberg Variations BWV 988 - Aria - Aaron Dunn - Chosic.mp3
2	 | CONTENT - Bach - Minuet - Notebook for Anna Magdalena - Aaron Dunn - Chosic.mp3
3	 | CONTENT - Mozart - Alla Turca - Markus Staab - Chosic.mp3
4	 | CONTENT - Mozart - Piano Concerto 21 in C Major K467 - II-Andante - Markus Staab - Chosic.mp3
5	 | CONTENT - Mozart - Sonata 13 in B Flat Major K333 - I-Allegro - Brendan Kinsella - Chosic.mp3
6	 | CONTENT - Mozart - Sonata 13 in B Flat Major K333 - II-Andante Cantabile - Brendan Kinsella - Chosic.mp3
7	 | CONTENT - Summer Sport - AudioCoffee - Chosic.mp3
8	 | CONTENT - Warm Duck Shuffle - arnebhus - Chosic.mp3
9	 | CONTENT- Slow Burn - Kevin MacLeod - Chosic.mp3


ID	 | Style name
------------------------------------------------
0	 | STYLE - Alien Technology - Mixkit.wav
1	 | STYLE - Arabic Vocal Ambi

In [None]:
content_name = 'CONTENT - Mozart - Alla Turca - Markus Staab - Chosic.mp3'
style_name = 'STYLE - Dark Choir Singing - Pixabay.mp3'
content_segments, style_segments, content_signal, style_signal = audioDataHandler.get_segments(content_name, style_name, return_signals=True, specifications='style mfcc')
content, style, target = nst.get_data_for_nst(content_segments[:5], style_segments[:5], target='content')

Setting NST parameters and instantiating NST class to access NST functions...

In [15]:
content_layers, style_layers = [0, 2, 4, 6], [8, 10]

# Instantiating NST class to access NST functions:
nst = NST(content_layers=content_layers,
          style_layers=style_layers,
          content_model=model_1,
          style_model=model_2,
          content_weight=float(0.1) / len(content_layers),
          style_weight=float(10) / len(style_layers))

# Displaying parameters:
nst.display_params()

content_weight: 0.025
style_weight: 5.0
content_feature_extractor: <Functional name=functional_13, built=True>
style_feature_extractor: <Functional name=functional_15, built=True>
content_layers: ['identity_2', 'conv2d_13', 'conv2d_14', 'conv2d_15']
style_layers: ['conv2d_24', 'conv2d_25']


In [18]:
optimizer = keras.optimizers.SGD(learning_rate=float(1e-3))
# NOTE: Reassigning the optimizer resets its gradient

iterations = 100
retry_count = 0
prev_targets = [target.numpy()]
for iteration in range(iterations):
    # Calculate total loss and gradient:
    loss, grads = get_loss_and_grads_customised(nst, audioDataHandler, content, style, target)
    if tf.math.is_nan(loss) and retry_count < 20:
        try:
            target = tf.Variable(prev_targets[-2])
        except:
            target =  tf.Variable(prev_targets[-1])
        optimizer = keras.optimizers.SGD(learning_rate=float(optimizer.learning_rate/10))
        retry_count += 1
        continue
    retry_count = 0
    
    # Storing current target:
    prev_targets.append(target.numpy())
    
    # Applying gradients:
    optimizer.apply_gradients([(grads, target)])
    
    # Displaying process:
    if iteration % 10 == 0 or (iteration == iterations - 1):
        #visualise(stitch_segments(target))
        print(f'i={iteration}\t | loss={float(loss):.5f}\t |lr={float(optimizer.learning_rate.numpy()):.5e}')

i=0	 | loss=42.11766	 |lr=1.00000e-03
i=10	 | loss=41.86314	 |lr=1.00000e-07
i=20	 | loss=40.81185	 |lr=1.00000e-08
i=30	 | loss=40.13475	 |lr=1.00000e-08
i=40	 | loss=39.16708	 |lr=1.00000e-09
i=50	 | loss=38.49911	 |lr=1.00000e-09
i=60	 | loss=38.18476	 |lr=1.00000e-10
i=70	 | loss=37.82134	 |lr=1.00000e-11
i=80	 | loss=37.32160	 |lr=1.00000e-11
i=90	 | loss=37.36431	 |lr=1.00000e-12
i=99	 | loss=37.24249	 |lr=1.00000e-13


In [32]:
# Reshaping target for melspectrogram conversion (which was reshaped before NST by `get_data_for_nst`:
target_reshaped = np.reshape(target.numpy(), newshape=list(target.shape)[:-1])
# Getting the melspectrogram and signal
target_signal = audioDataHandler.reconstruct_signal_from_melspectrogram_segments_and_play(target, return_value=True)