# IMPORTS FOR NEURAL STYLE TRANSFER
Drawing from the model building functions...

In [2]:
import import_ipynb
N = import_ipynb.NotebookLoader(path=['genreClassification'])
N.load_module("ImportsForAudioHandling")
N.load_module("ImportsForModelHandling")
from ImportsForAudioHandling import *
from ImportsForModelHandling import *

# For handling tensors:
import tensorflow as tf

# For handling graphical display:
import IPython.display as ipd

importing Jupyter notebook from genreClassification\ImportsForAudioHandling.ipynb
importing Jupyter notebook from genreClassification\ImportsForModelHandling.ipynb


# Miscellaneous helper function(s)

Function to show and return all content and style audio names separately based on given tags...

In [None]:
def get_audio_file_references(path='AUDIO', return_value=False):
    """Print ID tables of the content and style audio files found under `path`.

    File names are obtained from `get_file_names`
    (defined in `genreClassification/ImportsForAudioHandling.ipynb`).
    A name containing 'CONTENT' is listed as content; otherwise, a name
    containing 'STYLE' is listed as style; any other name is ignored.

    Parameters:
        path: location handed to `get_file_names`.
        return_value: when truthy, the two name lists are also returned.

    Returns:
        `(content_names, style_names)` if `return_value` is truthy, else None.
    """
    names = list(get_file_names(path))

    # The content tag takes precedence: a name carrying both tags counts as content only.
    content_names = [name for name in names if 'CONTENT' in name]
    style_names = [name for name in names if 'CONTENT' not in name and 'STYLE' in name]

    # One (header, rows) pair per table; the ID shown is the index within that table.
    tables = (
        ('ID\t | Content name\n------------------------------------------------', content_names),
        ('\n\nID\t | Style name\n------------------------------------------------', style_names),
    )
    for header, rows in tables:
        print(header)
        for index, name in enumerate(rows):
            print(f'{index}\t | {name}')

    if not return_value:
        return None
    return content_names, style_names

# Audio handling

**IMPLEMENTATION NOTE**: To make function calls using customisable parameters easier, I designed the functions such that all audio parameters that may be relevant are assigned to an object that can access the relevant functions. Extra data may be passed as arguments if necessary, but such a design makes it very convenient to test different audio parameters and makes the code less messy.

In [1]:
class AudioDataHandler:
    """Bundles the audio parameters with the audio helpers that read them.

    An instance stores the content/style directories and the analysis
    parameters once, so that loading, feature extraction, segmentation,
    reconstruction and playback can be called without repeating them.
    """

    def __init__(self, content_path, style_path, sr, n_fft, hop_length, segment_size, n_mels, n_mfcc):
        # Audio path parameters:
        self.content_path = content_path
        self.style_path = style_path

        # Audio parameters:
        self.sr = sr                     # Sampling rate
        self.n_fft = n_fft               # FFT window size
        self.hop_length = hop_length     # Hop length
        self.segment_size = segment_size # Number of frames per melspectrogram segment
        self.n_mels = n_mels             # Number of mel bands
        self.n_mfcc = n_mfcc             # Number of MFCCs per audio file

    #================================================
    # PARAMETER DISPLAY

    def display_params(self):
        """Print every instance attribute as `name: value`, one per line."""
        for key, value in vars(self).items():
            print(f'{key}: {value}')

    #############################################################
    # Basic audio-handling functions...

    #================================================
    # Function to obtain time-domain signal:
    def get_signal(self, audio_path, audio_name, sr=None):
        """Load an audio file and return its time-domain signal.

        Parameters:
            audio_path: directory (or path prefix) of the file.
            audio_name: file name.
            sr: sampling rate handed to `librosa.load`; defaults to `self.sr`.

        Returns:
            The signal returned by `librosa.load` (the returned rate is discarded).
        """
        if sr is None:
            sr = self.sr

        # First try `<path>/<name>`; if that cannot be loaded, fall back to plain
        # concatenation (covers an empty path or a path used as a raw prefix).
        # `except Exception` rather than a bare `except`, so that KeyboardInterrupt
        # and SystemExit are not swallowed.
        try:
            signal, _ = librosa.load(audio_path + '/' + audio_name, sr=sr)
        except Exception:
            signal, _ = librosa.load(audio_path + audio_name, sr=sr)
        return signal

    #================================================
    # Function to obtain melspectrogram:
    def get_melspectrogram(self, signal=None, audio_path=None, audio_name=None):
        """Return the mel spectrogram of `signal`, loading the signal from
        `audio_path`/`audio_name` when `signal` is not given."""
        # Get signal if not given:
        if signal is None:
            # FIX: `audio_path` was previously not forwarded, which raised a TypeError.
            signal = self.get_signal(audio_path, audio_name)

        # Get melspectrogram:
        melspectrogram = librosa.feature.melspectrogram(
            y=signal, sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length, n_mels=self.n_mels
        )
        return melspectrogram

    #================================================
    # Function to obtain MFCCs:
    def get_mfccs(self, signal=None, audio_path=None, audio_name=None):
        """Return the first `self.n_mfcc` MFCCs of `signal`, loading the signal
        from `audio_path`/`audio_name` when `signal` is not given."""
        # Get signal if not given:
        if signal is None:
            signal = self.get_signal(audio_path, audio_name)

        # Get MFCCs:
        mfccs = librosa.feature.mfcc(
            y=signal, sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length, n_mfcc=self.n_mfcc
        )
        return mfccs

    #================================================
    # Function to convert melspectrogram to MFCCs:
    # NOTE: The MFCC calculation will depend on the peak loudness in decibels
    def melspectrogram_to_mfccs(self, melspectrogram):
        """Convert a power mel spectrogram (2-D, or 3-D with a leading batch
        axis) to its first `self.n_mfcc` MFCCs using TensorFlow operations.

        The input axes are swapped before the TensorFlow call and swapped back
        afterwards, so the result has the same axis order as the input.
        """
        # NOTE: `melspectrogram` is not intended to be log-power (i.e. decibel-scaled), only power

        # Log-power melspectrogram (the small offset avoids log(0)); the last two
        # axes are swapped so the mel-band axis comes last for the TensorFlow call:
        if len(melspectrogram.shape) == 3:
            melspectrogram = tf.math.log(tf.transpose(melspectrogram + 1e-6, perm=(0, 2, 1)))
        else:
            melspectrogram = tf.math.log(tf.transpose(melspectrogram + 1e-6, perm=(1, 0)))

        # Calculating MFCCs on log-power melspectrogram:
        mfccs = tf.signal.mfccs_from_log_mel_spectrograms(melspectrogram)

        # Transposing back (for convenience) then truncating as specified:
        if len(mfccs.shape) == 3:
            mfccs = tf.transpose(mfccs, perm=(0, 2, 1))[:, :self.n_mfcc, :]
        else:
            mfccs = tf.transpose(mfccs, perm=(1, 0))[:self.n_mfcc, :]

        return mfccs

    #================================================
    # Function to convert MFCCs to melspectrogram:
    # NOTE: The MFCC calculation will depend on the peak loudness in decibels
    def mfccs_to_melspectrogram(self, mfccs):
        """Approximately invert MFCCs back to a mel spectrogram with `self.n_mels` bands."""
        # Inverting the MFCCs to a mel-domain representation:
        melspectrogram = librosa.feature.inverse.mfcc_to_mel(mfcc=np.array(mfccs), n_mels=self.n_mels)

        # Converting log scale to linear scale:
        # NOTE(review): librosa documents `mfcc_to_mel` as already returning a mel
        # *power* spectrogram, so this extra `db_to_power` may be a double
        # conversion -- confirm against the librosa version in use before removing.
        melspectrogram = librosa.db_to_power(melspectrogram)
        # NOTE: `melspectrogram` is not intended to be log-power (i.e. decibel-scaled), only power
        return melspectrogram

    #================================================
    # Function to play audio signal:
    def play_signal(self, signal, sr=None):
        """Display an audio player for `signal` (rate defaults to `self.sr`)."""
        if sr is None:
            sr = self.sr

        audio_element_url = ipd.Audio(signal, rate=sr)
        ipd.display(audio_element_url)

    #================================================
    # Small wrapper function to play audio file by first obtaining the audio file's signal:
    def play_audio_file(self, audio_path, audio_file, sr=None):
        """Load `audio_file` from `audio_path` at rate `sr` (default `self.sr`) and play it."""
        if sr is None:
            sr = self.sr

        # FIX: `get_signal` previously had no `sr` parameter, so this call raised a TypeError.
        signal = self.get_signal(audio_path, audio_file, sr=sr)
        self.play_signal(signal, sr=sr)

    #================================================
    # Small wrapper function to display the waveform of a given signal and then play it:
    def display_and_play_signal(self, signal, sr=None, title=None):
        """Plot the waveform of `signal` with `title`, then display an audio player for it."""
        if sr is None:
            sr = self.sr

        plt.plot(signal)
        plt.title(title)
        plt.show()
        self.play_signal(signal, sr=sr)

    #================================================
    # Function to visualise audio data:
    def visualise(self, spectrogram, title=None, xlabel='frame', ylabel=None):
        """Show a 2-D array (spectrogram, melspectrogram or MFCCs) as an image with a colorbar."""
        # NOTE: Audio may be visualised as either a spectrogram (including melspectrogram) or MFCCs

        fig, ax = plt.subplots()
        img = ax.imshow(spectrogram, aspect='auto', origin='lower')
        fig.colorbar(img)  # The colorbar indicates amplitude
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)

        plt.show()

    #############################################################
    # Segmentation and de-segmentation of melspectrograms...

    #================================================
    # Helper that cuts a (features, frames) array into equally sized segments:
    def _split_into_segments(self, data):
        """Cut `data` along axis 1 into consecutive `self.segment_size`-frame
        pieces; trailing frames that do not fill a whole segment are dropped."""
        segment_size = self.segment_size # Renaming for convenience
        n_segments = data.shape[1] // segment_size
        segments = [data[:, i*segment_size:(i+1)*segment_size] for i in range(n_segments)]
        return np.array(segments) # Converting to an array for ease of handling and storing

    #================================================
    # The segmentation function that represents the full preprocessing pipeline for audio:
    def get_segments(self, content_name=None, style_name=None, return_signals=False, specifications=''):
        """Load the content and style files, extract features and segment them.

        Parameters:
            content_name: file name inside `self.content_path`.
            style_name: file name inside `self.style_path`.
            return_signals: when truthy, the raw signals are returned as well.
            specifications: if it contains 'content mfcc' / 'style mfcc', MFCCs are
                used for that input; otherwise melspectrograms are used.

        Returns:
            `(content_segments, style_segments)`, followed by
            `content_signal, style_signal` when `return_signals` is truthy.
        """
        # NOTE: Segment size is given in number of frames

        # Getting signals...
        content_signal = self.get_signal(self.content_path, content_name)
        style_signal = self.get_signal(self.style_path, style_name)

        # Getting the audio data (melspectrograms or MFCCs, as specified):
        if 'content mfcc' in specifications:
            # FIX: previously read the non-existent attribute `self.content_signal`.
            content_data = self.get_mfccs(signal=content_signal)
        else: # Default to gathering melspectrograms
            content_data = self.get_melspectrogram(signal=content_signal)
        if 'style mfcc' in specifications:
            style_data = self.get_mfccs(signal=style_signal)
        else: # Default to gathering melspectrograms
            style_data = self.get_melspectrogram(signal=style_signal)

        # Getting segments...
        content_segments = self._split_into_segments(content_data)
        style_segments = self._split_into_segments(style_data)

        if return_signals:
            return content_segments, style_segments, content_signal, style_signal
        return content_segments, style_segments

    #================================================
    # De-segmentation, i.e. stitching segments together...
    def stitch_segments(self, segments):
        """Concatenate the segments along axis 1 (the axis `get_segments` cut along)."""
        return np.concatenate(list(segments), axis=1)

    #############################################################
    # Signal reconstruction from melspectrogram(s)...

    #================================================
    def reconstruct_signal_from_melspectrogram(self, melspectrogram):
        """Approximate a time-domain signal from a melspectrogram via `librosa`'s `mel_to_audio`."""
        reconstructed_signal = librosa.feature.inverse.mel_to_audio(
            melspectrogram, sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length
        )
        return reconstructed_signal

    #================================================
    def reconstruct_signal_from_melspectrogram_segments(self, segments):
        """Stitch the segments back together and reconstruct the signal from the result."""
        melspectrogram = self.stitch_segments(segments)
        return self.reconstruct_signal_from_melspectrogram(melspectrogram)

    #================================================
    # Small wrapper that enables playing signal reconstructed from melspectrogram:
    def reconstruct_signal_from_melspectrogram_and_play(self, melspectrogram, return_value=True):
        """Reconstruct the signal, play it, and return it when `return_value` is truthy."""
        reconstructed_signal = self.reconstruct_signal_from_melspectrogram(melspectrogram)
        self.play_signal(reconstructed_signal, sr=self.sr)
        if return_value:
            return reconstructed_signal

    #================================================
    # Small wrapper that enables playing signal reconstructed from melspectrogram segments:
    def reconstruct_signal_from_melspectrogram_segments_and_play(self, segments, return_value=True):
        """Reconstruct the signal from segments, play it, and return it when `return_value` is truthy."""
        reconstructed_signal = self.reconstruct_signal_from_melspectrogram_segments(segments)
        self.play_signal(reconstructed_signal, sr=self.sr)
        if return_value:
            return reconstructed_signal

**NOTE ON** `tensorflow.signal.mfccs_from_log_mel_spectrograms`:

MFCCs are returned for every mel band in the melspectrogram, and it is up to the caller to select a subset of the MFCCs based on their application. For example, it is typical to only use the first few for speech recognition, as this results in an approximately pitch-invariant representation of the signal. For my purposes, I shall be using the first 40 MFCCs.

> REFERENCE: https://docs.w3cub.com/tensorflow~1.15/signal/mfccs_from_log_mel_spectrograms.html

---

**ALTERNATE CODE FOR MELSPECTROGRAM TO MFCCS USING** `librosa`:

```python
def melspectrogram_to_mfccs(self, melspectrogram):
    # NOTE: `melspectrogram` is not intended to be log-power (i.e. decibel-scaled), only power
    
    # Log-power melspectrogram:
    melspectrogram = librosa.power_to_db(melspectrogram)

    # Calculating MFCCs:
    mfccs = librosa.feature.mfcc(S=melspectrogram, n_mfcc=self.n_mfcc)
    return mfccs
```

I have used the TensorFlow version since it allows for gradient calculation with respect to TensorFlow variables, which I needed for neural style transfer using uneven modelling, which involved using the MFCCs of a target melspectrogram to evaluate style cost.

# Neural style transfer (NST)
For explanations of each of the NST-specific functions, check `DemoForNeuralStyleTransfer.ipynb`. I have used a class to encapsulate the primary NST functionalities to ensure abstraction and hence greater convenience during testing.

In [7]:
class NST:
    """Encapsulates the neural style transfer (NST) loss functions and loop.

    Either one `model` is used for both content and style features, or a
    separate `content_model` and `style_model` are given.
    """

    def __init__(self, content_layers, style_layers, content_weight, style_weight, model=None, content_model=None, style_model=None):
        """Build the feature extractor(s) and resolve the layer indices to names.

        Parameters:
            content_layers: indices (into the model's layer list) of content layers.
            style_layers: indices (into the model's layer list) of style layers.
            content_weight: multiplier applied to every content loss term.
            style_weight: multiplier applied to every style loss term.
            model: model used for both content and style (default case).
            content_model, style_model: separate models (special case), used
                only when `model` is not given.

        Raises:
            ValueError: if neither `model` nor both of `content_model` and
                `style_model` are given.
        """
        self.content_weight = content_weight
        self.style_weight = style_weight

        #------------------------------------
        # Obtaining feature extractor(s) along with content and style layers...
        # NOTE: Explicit branching instead of a bare try/except, so that genuine
        # errors (e.g. an out-of-range layer index) are no longer masked.

        if model is not None: # Default
            self.feature_extractor, layer_labels = self.get_feature_extractor(model)
            self.content_layers = [layer_labels[i] for i in content_layers]
            self.style_layers = [layer_labels[i] for i in style_layers]
        elif content_model is not None and style_model is not None: # Special case
            self.content_feature_extractor, layer_labels_1 = self.get_feature_extractor(content_model)
            self.style_feature_extractor, layer_labels_2 = self.get_feature_extractor(style_model)
            self.content_layers = [layer_labels_1[i] for i in content_layers]
            self.style_layers = [layer_labels_2[i] for i in style_layers]
        else:
            raise ValueError('Model for either content or style is not given!')

    #================================================
    # PARAMETER DISPLAY

    def display_params(self):
        """Print every instance attribute as `name: value`, one per line."""
        for key, value in vars(self).items():
            print(f'{key}: {value}')

    #================================================
    # FEATURE EXTRACTOR

    def get_feature_extractor(self, model, return_layer_labels=True):
        """Wrap `model` in a Keras model that outputs every layer's output, keyed by layer name.

        Returns:
            `(feature_extractor, layer_names)` if `return_layer_labels` is
            truthy, else only `feature_extractor`.
        """
        outputs_dict = {layer.name: layer.output for layer in model.layers}
        feature_extractor = keras.Model(inputs=model.inputs, outputs=outputs_dict)

        if return_layer_labels:
            return feature_extractor, list(outputs_dict.keys())
        return feature_extractor

    #================================================
    # DATA PACKAGING FOR NST

    def get_data_for_nst(self, content_segments, style_segments, target_type=None, seed=None):
        """Package segment arrays as TensorFlow variables for the NST loop.

        Parameters:
            content_segments, style_segments: arrays of segments; a trailing
                axis of size 1 (the "channel" axis) is appended to each.
            target_type: how the target is initialised -- None or 'random'
                (standard normal), 'zero', 'content' or 'style'.
            seed: pRNG seed for the random initialisation of the target.

        Returns:
            `(content, style, target)` as `tf.Variable`s.

        Raises:
            ValueError: if `target_type` is not one of the supported values
                (previously this surfaced as an UnboundLocalError).
        """
        if target_type not in (None, 'random', 'zero', 'content', 'style'):
            raise ValueError(
                f"Unknown target_type {target_type!r}; expected None, 'random', 'zero', 'content' or 'style'"
            )

        # Appending an extra dimension of magnitude 1 (the "channel" dimension in the CNN's layers):
        # NOTE 1: `np.expand_dims` is used because the `newshape` keyword of `np.reshape` is deprecated in recent NumPy
        # NOTE 2: Such reshaping was also done when supplying training data to the model during training
        content_segments = np.expand_dims(content_segments, axis=-1)
        style_segments = np.expand_dims(style_segments, axis=-1)

        content = tf.Variable(content_segments)
        style = tf.Variable(style_segments)

        # Initialising the target (it always has the shape of the content, except for 'style'):
        if target_type is None or target_type == 'random':
            target = tf.Variable(tf.random_normal_initializer(mean=0.0, stddev=1.0, seed=seed)(shape=content_segments.shape))
        elif target_type == 'zero':
            target = tf.Variable(tf.zeros(shape=content_segments.shape))
        elif target_type == 'content':
            target = tf.Variable(content_segments)
        else: # target_type == 'style'
            target = tf.Variable(style_segments)

        return content, style, target

    #================================================
    # GRAM MATRIX CALCULATION

    def get_gram_matrix(self, layer_outputs):
        """Return the Gram matrix of a 4-D layer output, with one row/column per index of its last axis."""
        # 1. Flatten each layer output to obtain the feature vectors:
        # The last axis is moved to the front so that, after the reshape, each row
        # holds all values belonging to one index of that axis.
        layer_outputs = tf.transpose(layer_outputs, (3, 0, 1, 2))
        layer_outputs = tf.reshape(layer_outputs, (tf.shape(layer_outputs)[0], -1))

        # 2. Defining a tensor A containing each feature vector:
        # We already have `layer_outputs`

        # 3. Define a tensor B containing the transpose of each feature vector (in matching order):
        # For this, we can simply do `tf.transpose(layer_outputs)`, which swaps
        # the two axes of the (now 2-D) `layer_outputs`. Hence, this obtains B^T.

        # 4. Obtain AB^T (see previous comment):
        gram_matrix = tf.matmul(layer_outputs, tf.transpose(layer_outputs))
        # NOTE: `tf.matmul` performs matrix multiplication
        return gram_matrix

    #================================================
    # CONTENT LOSS CALCULATION

    def get_content_loss(self, layer_output_for_content, layer_output_for_target):
        """Return the squared error between two 4-D layer outputs, divided by the target's element count."""
        n1, n2, n3, n4 = layer_output_for_target.shape

        # Returning sum of squared errors between layer outputs, normalised by the number of elements:
        return tf.reduce_sum(tf.square(layer_output_for_content-layer_output_for_target))/(n1*n2*n3*n4)
        # NOTE: We want to preserve the `tf.tensor` datatype

    #================================================
    # STYLE LOSS CALCULATION

    def get_style_loss(self, layer_output_for_style, layer_output_for_target):
        """Return the squared error between the Gram matrices of two 4-D layer
        outputs, divided by the target layer output's element count."""
        n1, n2, n3, n4 = layer_output_for_target.shape

        # Obtaining Gram matrices:
        style_gram_matrix = self.get_gram_matrix(layer_output_for_style) # Gram matrix for the style input
        target_gram_matrix = self.get_gram_matrix(layer_output_for_target) # Gram matrix for the target

        # Returning a constant multiple of mean-squared error:
        return tf.reduce_sum(tf.square(style_gram_matrix-target_gram_matrix))/(n1*n2*n3*n4)
        # NOTE 1: `tf.reduce_sum` computes the sum of elements across dimensions of a tensor
        # NOTE 2: We want to preserve the `tf.tensor` datatype, hence we apply operations from Tensorflow and not NumPy

    #================================================
    # TOTAL LOSS CALCULATION

    def get_total_loss(self, content, style, target):
        """Return the weighted sum of the content and style losses over the selected layers."""
        # Getting content, style and target features:
        shared_extractor = getattr(self, 'feature_extractor', None)
        if shared_extractor is not None: # Default: one model for everything
            content_features = shared_extractor(content)
            style_features = shared_extractor(style)
            target_features_for_content = target_features_for_style = shared_extractor(target)
        else: # Special case: separate content and style models
            # NOTE(review): this assumes `target` is a valid input for BOTH models -- confirm.
            # If the style model expects a transformed target (e.g. MFCCs), override this method.
            content_features = self.content_feature_extractor(content)
            style_features = self.style_feature_extractor(style)
            target_features_for_content = self.content_feature_extractor(target)
            target_features_for_style = self.style_feature_extractor(target)

        # Initialising loss value:
        loss = tf.zeros(shape=())

        for layer in self.content_layers:
            layer_output_for_content = content_features[layer]
            layer_output_for_target = target_features_for_content[layer]
            loss += self.content_weight * self.get_content_loss(layer_output_for_content, layer_output_for_target)

        for layer in self.style_layers:
            layer_output_for_style = style_features[layer]
            layer_output_for_target = target_features_for_style[layer]
            loss += self.style_weight * self.get_style_loss(layer_output_for_style, layer_output_for_target)

        return loss

    #================================================
    # GRADIENT AND TOTAL LOSS CALCULATION

    def get_loss_and_grads(self, content, style, target):
        """Return the total loss and its gradient with respect to `target`."""
        # NOTE: Each of target, content and style are arrays of segments of the melspectrogram of a particular audio file

        with tf.GradientTape() as tape:
            loss = self.get_total_loss(content, style, target)
        # Obtaining the gradient of loss w.r.t. `target`:
        grads = tape.gradient(loss, target)
        return loss, grads

    #================================================
    # BASIC NST LOOP

    def nst_loop(self, content, style, target, n_iter, initial_learning_rate=0.01, decay_steps=10, decay_rate=0.8, print_frequency=10):
        """Run `n_iter` SGD steps on `target` (updated in place) with an
        exponentially decaying learning rate, printing the progress every
        `print_frequency` iterations and on the last iteration."""
        # NOTE: A fresh optimizer (and decay schedule) is created on every call
        optimizer = keras.optimizers.SGD(tf.keras.optimizers.schedules.ExponentialDecay(initial_learning_rate=initial_learning_rate, decay_steps=decay_steps, decay_rate=decay_rate))

        for i in range(n_iter):
            # Calculating total loss and gradient:
            loss, grads = self.get_loss_and_grads(content, style, target)

            # Applying gradients:
            optimizer.apply_gradients([(grads, target)])

            # Displaying process:
            if i % print_frequency == 0 or (i == n_iter - 1):
                print(f'i={i}\t | loss={float(loss):.5e} \t | lr={float(optimizer.learning_rate.numpy()):.5e}')

**IMPLEMENTATION NOTE: Effect of reshaping by appending an extra dimension of magnitude 1**:

Here is an example:

```python
# Original array:
a = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])
print('Original array:')
print(a.shape)
print(a)
# Reshaped array
print('\nReshaped array:')
b = np.reshape(a, newshape=list(a.shape) + [1])
print(b.shape)
print(b)
```

Output:

```
Original array:
(2, 4)
[[1 2 3 4]
 [5 6 7 8]]

Reshaped array:
(2, 4, 1)
[[[1]
  [2]
  [3]
  [4]]

 [[5]
  [6]
  [7]
  [8]]]
```

We observe that we convert an array of row vectors to an array of column vectors with the same values, i.e. both are of the form `[v1, v2]`, where each of `v1` and `v2` are vectors with a constant set of values; in the first case they are row vectors, in the second (reshaped) case they are column vectors. Hence, we see that the 4th dimension in the CNN model's layers, i.e. the channel dimension, is such that each channel is a separate column in the last dimension; in this way, the same set of inputs is handled across multiple channels (multiple filters, in convolutional layers) within the same 4-dimensional matrix. Very neat!

**NOTE**: Such reshaping was also done when supplying training data to the model during training
