<a href="https://colab.research.google.com/github/pranigopu/ambience-to-music-neuralStyleTransfer/blob/main/InterfaceViaGoogleGolab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NEURAL STYLE TRANSFER (BASIC)

In [None]:
#@title STARTUP AND SETUP

# Importing dependencies
import numpy as np              # For handling NumPy arrays
import matplotlib.pyplot as plt # For handling plotting
import tensorflow as tf         # For handling tensors
import IPython.display as ipd   # For handling graphical display
import librosa                  # For handling audio
import pandas as pd             # For handling data presentation
from google.colab import drive  # For mounting Google Drive
import os                       # For handling mounted file systems
import keras                    # For handling neural networks
# Suppressing all warnings because I want to live on the edge:
from warnings import filterwarnings
filterwarnings('ignore')

#================================================

#@markdown _Specifying present working directory_ :
#@markdown - `pwd` must be put as `/content/`  to access files in your Colab session storage.
#@markdown - To access a directory in your Google Drive, change `pwd` accordingly.
#@markdown - Note that Google Drive is mounted by default when running this cell.
#@markdown - Note also that you can choose below to not mount Google Drive.
mount_drive = True #@param {type:"boolean"}
if mount_drive:
  drive.mount('/content/drive', force_remount=True)
pwd = 'drive/MyDrive/Labwork/CC-Labs/Assessment' #@param {type:"string"}
# Add a trailing forward-slash if not present, so file names can be appended directly.
# An empty `pwd` is left empty (i.e. paths stay relative to the current directory)
# rather than raising IndexError on `pwd[-1]`.
if pwd and not pwd.endswith('/'):
    pwd = pwd + '/'

---

# DEFINITIONS

---

In [None]:
#@title DEFINITION 1: Audio handler class
class AudioDataHandler:
    '''
    Helper for loading audio, converting it to melspectrograms/MFCCs, cutting
    those into fixed-size segments and reconstructing/playing audio from them.

    Constructor parameters:
    - `content_path`, `style_path`: Path prefixes used by `get_segments` when loading content/style audio
    - `sr`: Sampling rate
    - `n_fft`: FFT window size
    - `hop_length`: Hop length
    - `segment_size`: Number of frames per segment
    - `n_mels`: Number of mel bands
    - `n_mfcc`: Number of MFCCs kept per frame
    '''

    def __init__(self, content_path, style_path, sr, n_fft, hop_length, segment_size, n_mels, n_mfcc):
        # Audio path parameters:
        self.content_path = content_path
        self.style_path = style_path

        # Audio parameters:
        self.sr = sr                     # Sampling rate
        self.n_fft = n_fft               # FFT window size
        self.hop_length = hop_length     # Hop length
        self.segment_size = segment_size # Number of frames per melspectrogram segment
        self.n_mels = n_mels             # Number of mel bands
        self.n_mfcc = n_mfcc             # Number of MFCCs per audio file

    #================================================
    # PARAMETER DISPLAY

    def display_params(self):
        '''Print every instance attribute as `name: value`, one per line.'''
        for key, value in vars(self).items():
            print(f'{key}: {value}')

    #############################################################
    # Basic audio-handling functions...

    #================================================
    # Function to obtain time-domain signal:
    def get_signal(self, audio_path, audio_name, sr=None):
        '''
        Load an audio file and return its time-domain signal.

        Input parameters:
        - `audio_path`: Path prefix of the file
        - `audio_name`: File name (or the remainder of the path)
        - `sr`: Sampling rate to load at; defaults to `self.sr`

        The file is first looked up at `audio_path + '/' + audio_name`; if that
        fails for any reason, `audio_path + audio_name` is tried instead, and an
        error from this second attempt is propagated to the caller.
        '''
        if sr is None:
            sr = self.sr

        # NOTE: The two attempts make the path handling tolerant of a missing or
        # already-present separator (and of an empty `audio_path`).
        try:
            signal, _ = librosa.load(audio_path + '/' + audio_name, sr=sr)
        except Exception:
            signal, _ = librosa.load(audio_path + audio_name, sr=sr)
        return signal

    #================================================
    # Function to obtain melspectrogram:
    def get_melspectrogram(self, signal=None, audio_path=None, audio_name=None):
        '''
        Return the melspectrogram of `signal`, computed with the handler's
        `sr`, `n_fft`, `hop_length` and `n_mels`.
        If `signal` is None, it is loaded from `audio_path` and `audio_name`.
        '''
        # Get signal if not given:
        if signal is None:
            signal = self.get_signal(audio_path, audio_name)

        # Get melspectrogram:
        melspectrogram = librosa.feature.melspectrogram(y=signal, sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length, n_mels=self.n_mels)
        return melspectrogram

    #================================================
    # Function to obtain MFCCs:
    def get_mfccs(self, signal=None, audio_path=None, audio_name=None):
        '''
        Return the MFCCs of `signal`, computed with the handler's
        `sr`, `n_fft`, `hop_length` and `n_mfcc`.
        If `signal` is None, it is loaded from `audio_path` and `audio_name`.
        '''
        # Get signal if not given:
        if signal is None:
            signal = self.get_signal(audio_path, audio_name)

        # Get MFCCs:
        mfccs = librosa.feature.mfcc(y=signal, sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length, n_mfcc=self.n_mfcc)
        return mfccs

    #================================================
    # Function to convert melspectrogram to MFCCs:
    def melspectrogram_to_mfccs(self, melspectrogram):
        '''
        Convert a melspectrogram (rank 2) or a batch of them (rank 3) to MFCCs
        using TensorFlow operations.

        NOTE: `melspectrogram` is not intended to be log-power (i.e. decibel-scaled), only power.

        The last two axes are swapped before and after the MFCC computation,
        and only the first `self.n_mfcc` coefficients are kept.
        '''
        # Log-power melspectrogram (1e-6 is added before the log to avoid log(0)):
        if len(melspectrogram.shape) == 3:
            melspectrogram = tf.math.log(tf.transpose(melspectrogram + 1e-6, perm=(0, 2, 1)))
        else:
            melspectrogram = tf.math.log(tf.transpose(melspectrogram + 1e-6, perm=(1, 0)))

        # Calculating MFCCs on log-power melspectrogram:
        mfccs = tf.signal.mfccs_from_log_mel_spectrograms(melspectrogram)

        # Transposing back (for convenience) then truncating as specified:
        if len(mfccs.shape) == 3:
            mfccs = tf.transpose(mfccs, perm=(0, 2, 1))[:, :self.n_mfcc, :]
        else:
            mfccs = tf.transpose(mfccs, perm=(1, 0))[:self.n_mfcc, :]

        return mfccs

    #================================================
    # Function to convert MFCCs to melspectrogram:
    def mfccs_to_melspectrogram(self, mfccs):
        '''
        Approximate a melspectrogram with `self.n_mels` bands from `mfccs`.

        NOTE: The result is not intended to be log-power (i.e. decibel-scaled), only power.
        '''
        # Inverting the MFCCs to a melspectrogram:
        melspectrogram = librosa.feature.inverse.mfcc_to_mel(mfcc=np.array(mfccs), n_mels=self.n_mels)

        # Converting log scale to linear scale:
        # NOTE(review): librosa documents `mfcc_to_mel` as already returning a power
        # melspectrogram, so this may convert twice -- confirm that it is intended.
        melspectrogram = librosa.db_to_power(melspectrogram)
        return melspectrogram

    #================================================
    # Function to play audio signal:
    def play_signal(self, signal, sr=None):
        '''Display an audio player for `signal` at rate `sr` (defaults to `self.sr`).'''
        if sr is None:
            sr = self.sr

        audio_element_url = ipd.Audio(signal, rate=sr)
        ipd.display(audio_element_url)

    #================================================
    # Small wrapper function to play audio file by first obtaining the audio file's signal:
    def play_audio_file(self, audio_path, audio_file, sr=None):
        '''Load `audio_file` from `audio_path` at rate `sr` (defaults to `self.sr`) and play it.'''
        if sr is None:
            sr = self.sr

        signal = self.get_signal(audio_path, audio_file, sr=sr)
        self.play_signal(signal, sr=sr)

    #================================================
    # Small wrapper function to display the waveform of a given signal and then play it:
    def display_and_play_signal(self, signal, sr=None, title=None):
        '''Plot `signal` as a waveform titled `title`, then play it at rate `sr` (defaults to `self.sr`).'''
        if sr is None:
            sr = self.sr

        plt.plot(signal)
        plt.title(title)
        plt.show()
        self.play_signal(signal, sr=sr)

    #================================================
    # Function to visualise audio data:
    def visualise(self, spectrogram, title=None, xlabel='frame', ylabel=None):
        '''
        Show a 2D array as an image with a colorbar.

        NOTE: Audio may be visualised as either a spectrogram (including melspectrogram) or MFCCs.
        '''
        fig, ax = plt.subplots()
        img = ax.imshow(spectrogram, aspect='auto', origin='lower')
        fig.colorbar(img) # The colorbar maps colours to the plotted values
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)

        plt.show()

    #############################################################
    # Segmentation and de-segmentation of melspectrograms...

    #================================================
    # Helper that cuts a 2D array into consecutive fixed-size segments:
    def _split_into_segments(self, data):
        '''
        Cut `data` along axis 1 into consecutive chunks of `self.segment_size`
        frames and return them as an array; trailing frames that do not fill a
        whole segment are dropped.
        '''
        segment_size = self.segment_size # Renaming for convenience
        n_segments = data.shape[1] // segment_size
        segments = [data[:, i*segment_size:(i+1)*segment_size] for i in range(n_segments)]
        return np.array(segments) # Converting to an array for ease of handling and storing

    #================================================
    # The segmentation function that represents the full preprocessing pipeline for audio:
    def get_segments(self, content_name=None, style_name=None, return_signals=False, specifications='', segments_to_include=None):
        '''
        Load the content and style audio and return their data cut into segments.

        Input parameters:
        - `content_name`, `style_name`: File names, loaded relative to `self.content_path` and `self.style_path`
        - `return_signals`: If True, the loaded time-domain signals are returned as well
        - `specifications`: If it contains `'content mfcc'` / `'style mfcc'`, MFCCs are used for the
          content / style data; otherwise melspectrograms are used
        - `segments_to_include`: If not None, only this many leading segments are kept for each audio

        Return values:
        - `content_segments, style_segments` and, if `return_signals`, also `content_signal, style_signal`

        NOTE: Segment size (`self.segment_size`) is given in number of frames.
        '''
        # Getting signals...
        content_signal = self.get_signal(self.content_path, content_name)
        style_signal = self.get_signal(self.style_path, style_name)

        # Getting the audio data (melspectrograms or MFCCs, as specified):
        if 'content mfcc' in specifications:
            content_data = self.get_mfccs(signal=content_signal)
        else: # Default to gathering melspectrograms
            content_data = self.get_melspectrogram(signal=content_signal)
        if 'style mfcc' in specifications:
            style_data = self.get_mfccs(signal=style_signal)
        else: # Default to gathering melspectrograms
            style_data = self.get_melspectrogram(signal=style_signal)

        #------------------------------------
        # Getting segments...

        content_segments = self._split_into_segments(content_data)
        style_segments = self._split_into_segments(style_data)

        # Limiting the number of segments included:
        if segments_to_include is not None:
            content_segments = content_segments[:segments_to_include]
            style_segments = style_segments[:segments_to_include]

        if return_signals:
            return content_segments, style_segments, content_signal, style_signal
        return content_segments, style_segments

    #================================================
    # De-segmentation, i.e. stitching segments together...
    def stitch_segments(self, segments):
        '''Concatenate the given segments along axis 1 (the frame axis of each segment).'''
        return np.concatenate(list(segments), axis=1)

    #############################################################
    # Signal reconstruction from melspectrogram(s)...

    #================================================
    def reconstruct_signal_from_melspectrogram(self, melspectrogram):
        '''Return a time-domain signal reconstructed from `melspectrogram` with the handler's `sr`, `n_fft` and `hop_length`.'''
        reconstructed_signal = librosa.feature.inverse.mel_to_audio(melspectrogram, sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length)
        return reconstructed_signal

    #================================================
    def reconstruct_signal_from_melspectrogram_segments(self, segments):
        '''Stitch `segments` into one melspectrogram and reconstruct a signal from it.'''
        melspectrogram = self.stitch_segments(segments)
        return self.reconstruct_signal_from_melspectrogram(melspectrogram)

    #================================================
    # Small wrapper that enables playing signal reconstructed from melspectrogram:
    def reconstruct_signal_from_melspectrogram_and_play(self, melspectrogram, return_value=True):
        '''Reconstruct a signal from `melspectrogram`, play it, and return it if `return_value` is True.'''
        reconstructed_signal = self.reconstruct_signal_from_melspectrogram(melspectrogram)
        self.play_signal(reconstructed_signal)
        if return_value:
            return reconstructed_signal

    #================================================
    # Small wrapper that enables playing signal reconstructed from melspectrogram segments:
    def reconstruct_signal_from_melspectrogram_segments_and_play(self, segments, return_value=True):
        '''Reconstruct a signal from melspectrogram `segments`, play it, and return it if `return_value` is True.'''
        reconstructed_signal = self.reconstruct_signal_from_melspectrogram_segments(segments)
        self.play_signal(reconstructed_signal)
        if return_value:
            return reconstructed_signal

In [None]:
#@title DEFINITION 2: CNN model to be used for NST
def get_cnn_5_sec_melspectrogram(input_shape, n_classes=10, lr=0.0001): # There are 10 genres, so 10 classes
    '''
    Build and compile the CNN classifier used as the feature network for NST.

    Input parameters:
    - `input_shape (tuple)`: Shape of input data
    - `n_classes`: Number of output classes
    - `lr`: Learning rate of the Adam optimizer

    Return values:
    - `model`: Compiled CNN model
    '''

    # MODEL TOPOLOGY

    # Pooling strides of the eight convolution + pooling stages, in order:
    # the first six stages pool with stride 2, the last two with stride 1.
    pooling_strides = [(2, 2)] * 6 + [(1, 1)] * 2

    # Input stage:
    layers = [
        keras.layers.Input(input_shape),
        keras.layers.Identity(),
        keras.layers.BatchNormalization()]

    # Convolutional stages:
    # NOTE: Layers are created in the same order as they are stacked, so the
    # automatically generated layer names follow the stacking order.
    for strides in pooling_strides:
        layers.append(keras.layers.Conv2D(32, (2, 2), activation='relu'))
        layers.append(keras.layers.MaxPooling2D(pool_size=(2, 2), strides=strides, padding='same'))

    # Regularisation after the last convolutional stage:
    layers.append(keras.layers.BatchNormalization())
    layers.append(keras.layers.Dropout(0.3))

    # Dense stages (the flattened feature maps feed a hidden layer, then the output layer):
    layers.append(keras.layers.Flatten())
    layers.append(keras.layers.Dense(64, activation='relu'))
    layers.append(keras.layers.Dense(n_classes, activation='softmax'))

    model = keras.Sequential(layers)

    #------------------------------------
    # COMPILATION WITH LOSS AND OPTIMIZER

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=lr),
        loss=keras.losses.CategoricalCrossentropy(),
        metrics=['accuracy'])

    return model

In [None]:
#@title DEFINITION 3: NST handler class
class NST:
    '''
    Handler for neural style transfer (NST) on segmented audio data.

    Constructor parameters:
    - `content_layers`, `style_layers`: Indices (into the model's layer list) of the layers
      whose outputs are used for the content loss and the style loss respectively
    - `content_weight`, `style_weight`: Multipliers of the content and style losses
    - `model`: Model used for both content and style features (default case)
    - `content_model`, `style_model`: Separate models for content and style features,
      used only when `model` is not given (special case)

    Raises `ValueError` if neither `model` nor both of `content_model` and `style_model` are given.
    '''

    def __init__(self, content_layers, style_layers, content_weight, style_weight, model=None, content_model=None, style_model=None):
        self.content_weight = content_weight
        self.style_weight = style_weight

        #------------------------------------
        # Obtaining feature extractor(s) along with content and style layers...

        if model is not None: # Default: one model for both content and style
            self.feature_extractor, layer_labels = self.get_feature_extractor(model)
            self.content_layers = [layer_labels[i] for i in content_layers]
            self.style_layers = [layer_labels[i] for i in style_layers]
        elif content_model is not None and style_model is not None: # Special case: separate models
            self.content_feature_extractor, layer_labels_1 = self.get_feature_extractor(content_model)
            self.style_feature_extractor, layer_labels_2 = self.get_feature_extractor(style_model)
            self.content_layers = [layer_labels_1[i] for i in content_layers]
            self.style_layers = [layer_labels_2[i] for i in style_layers]
        else:
            raise ValueError('Model for either content or style is not given!')

    #================================================
    # PARAMETER DISPLAY

    def display_params(self):
        '''Return a DataFrame listing every instance attribute name with its value.'''
        params = vars(self)
        return pd.DataFrame(data={'Parameter name': list(params.keys()), 'Parameter value': list(params.values())})

    #================================================
    # FEATURE EXTRACTOR

    def get_feature_extractor(self, model, return_layer_labels=True):
        '''
        Build a model that maps `model`'s inputs to a dictionary of the outputs
        of all of its layers, keyed by layer name.

        Return values:
        - The feature extractor and, if `return_layer_labels`, also the list of layer names
          in the order of `model.layers`
        '''
        outputs_dict = dict([(layer.name, layer.output) for layer in model.layers])
        feature_extractor = keras.Model(inputs=model.inputs, outputs=outputs_dict)

        if return_layer_labels:
            return feature_extractor, list(outputs_dict.keys())
        return feature_extractor

    #================================================
    # DATA PACKAGING FOR NST

    def get_data_for_nst(self, content_segments, style_segments, target_type=None, seed=None):
        '''
        Package content and style segments as `tf.Variable`s and create the target variable.

        Input parameters:
        - `content_segments`, `style_segments`: Arrays of segments
        - `target_type`: How to initialise the target; one of `None` or `'random'` (Gaussian noise),
          `'zero'`, `'content'` or `'style'`
        - `seed`: pRNG seed for initialising a random target

        Return values:
        - `content, style, target`

        Raises `ValueError` for an unknown `target_type`.
        '''
        if target_type not in (None, 'random', 'zero', 'content', 'style'):
            raise ValueError(f"Unknown target type {target_type!r}; expected 'random', 'zero', 'content' or 'style'")

        # Reshaping `content_segments` and `style_segments` to fit the desired model's layers' input shape:
        # NOTE 1: Reshaping here just involves appending an extra dimension of magnitude 1 (corresponding to the "channel" dimension in the CNN's layers)
        # NOTE 2: Such reshaping was also done when supplying training data to the model during training
        content_segments = np.expand_dims(content_segments, axis=-1)
        style_segments = np.expand_dims(style_segments, axis=-1)

        content = tf.Variable(content_segments)
        style = tf.Variable(style_segments)

        if target_type is None or target_type == 'random':
            target = tf.Variable(tf.random_normal_initializer(mean=0.0, stddev=1.0, seed=seed)(shape=content_segments.shape))
        elif target_type == 'zero':
            target = tf.Variable(tf.zeros(shape=content_segments.shape))
        elif target_type == 'content':
            target = tf.Variable(content_segments)
        else: # target_type == 'style'
            target = tf.Variable(style_segments)

        return content, style, target

    #================================================
    # GRAM MATRIX CALCULATION

    def get_gram_matrix(self, layer_outputs):
        '''
        Return the Gram matrix of a rank-4 layer output.

        The last axis is moved to the front and every slice along it is flattened
        into one feature vector; the result holds the dot product of every pair of
        these feature vectors.
        '''
        # 1. Flatten each layer output to obtain the feature vectors (one row per index of the last axis):
        layer_outputs = tf.transpose(layer_outputs, (3, 0, 1, 2))
        layer_outputs = tf.reshape(layer_outputs, (tf.shape(layer_outputs)[0], -1))

        # 2. With A being the matrix of feature vectors, obtain A A^T:
        # NOTE: `tf.matmul` performs matrix multiplication
        gram_matrix = tf.matmul(layer_outputs, tf.transpose(layer_outputs))
        return gram_matrix

    #================================================
    # CONTENT LOSS CALCULATION

    def get_content_loss(self, layer_output_for_content, layer_output_for_target):
        '''
        Return the sum of squared differences between the two layer outputs,
        divided by the number of elements in the target's layer output.
        '''
        squared_error = tf.square(layer_output_for_content - layer_output_for_target)
        # NOTE: `tf.size` is used (rather than unpacking a rank-4 shape) so that layer outputs of any rank are accepted
        n_elements = tf.cast(tf.size(layer_output_for_target), squared_error.dtype)
        # NOTE: We want to preserve the `tf.tensor` datatype
        return tf.reduce_sum(squared_error) / n_elements

    #================================================
    # STYLE LOSS CALCULATION

    def get_style_loss(self, layer_output_for_style, layer_output_for_target):
        '''
        Return the sum of squared differences between the Gram matrices of the two
        layer outputs, divided by the number of elements in the target's layer output.
        '''
        # Obtaining Gram matrices:
        style_gram_matrix = self.get_gram_matrix(layer_output_for_style) # Gram matrix for the style data
        target_gram_matrix = self.get_gram_matrix(layer_output_for_target) # Gram matrix for the target data

        squared_error = tf.square(style_gram_matrix - target_gram_matrix)
        n_elements = tf.cast(tf.size(layer_output_for_target), squared_error.dtype)
        # NOTE: We want to preserve the `tf.tensor` datatype, hence we apply operations from Tensorflow and not NumPy
        return tf.reduce_sum(squared_error) / n_elements

    #================================================
    # TOTAL LOSS CALCULATION

    def get_total_loss(self, content, style, target):
        '''
        Return the weighted sum of the content losses over `self.content_layers`
        and the style losses over `self.style_layers`.
        '''
        # Choosing the feature extractor(s):
        # NOTE: Only one of the two configurations is set up by the constructor
        shared_extractor = getattr(self, 'feature_extractor', None)
        if shared_extractor is not None:
            content_features = shared_extractor(content)
            style_features = shared_extractor(style)
            target_features_for_content = target_features_for_style = shared_extractor(target)
        else:
            content_features = self.content_feature_extractor(content)
            style_features = self.style_feature_extractor(style)
            target_features_for_content = self.content_feature_extractor(target)
            target_features_for_style = self.style_feature_extractor(target)

        # Initialising loss value:
        loss = tf.zeros(shape=())

        for layer in self.content_layers:
            loss += self.content_weight * self.get_content_loss(content_features[layer], target_features_for_content[layer])

        for layer in self.style_layers:
            loss += self.style_weight * self.get_style_loss(style_features[layer], target_features_for_style[layer])

        return loss

    #================================================
    # GRADIENT AND TOTAL LOSS CALCULATION

    def get_loss_and_grads(self, content, style, target):
        '''
        Return the total loss and its gradient with respect to `target`.

        NOTE: Each of target, content and style are arrays of segments of the melspectrogram of a particular audio file.
        '''
        with tf.GradientTape() as tape:
            loss = self.get_total_loss(content, style, target)
        # Obtaining the gradient of loss w.r.t. `target`:
        grads = tape.gradient(loss, target)
        return loss, grads

    #================================================
    # BASIC NST LOOP

    def nst_loop(self, content, style, target, n_iter, initial_learning_rate=0.01, decay_steps=10, decay_rate=0.8, print_frequency=10, decay_on_excess=0.1, rollback_on_excess=2, max_retries_on_excess=20, history_buffer_size=10):
        '''
        Run gradient descent (SGD with an exponentially decaying learning rate) on
        `target` for at most `n_iter` iterations. `target` is updated in place and
        also returned.

        EXPLAINING SOME ARGUMENTS:
        `print_frequency`: Progress is printed every `print_frequency` iterations (and on the last one)
        `decay_on_excess`: Upon getting loss as NaN or `inf`, the factor the learning rate is multiplied by
        `rollback_on_excess`: Upon getting loss as NaN or `inf`, how many steps to back to go to get a past target value
                              (if that many are not stored, the most recently stored target is used)
        `max_retries_on_excess`: How many times in a row to retry upon getting NaN or `inf` before stopping
        `history_buffer_size`: Number of past targets to store
        '''

        optimizer = keras.optimizers.SGD(keras.optimizers.schedules.ExponentialDecay(initial_learning_rate=initial_learning_rate, decay_steps=decay_steps, decay_rate=decay_rate))

        retries = 0
        past_targets = [target.numpy()]
        for i in range(n_iter):
            # Calculating total loss and gradient:
            loss, grads = self.get_loss_and_grads(content, style, target)

            # Upon encountering `loss` as NaN or `inf`:
            if tf.math.is_nan(loss) or tf.math.is_inf(loss):
                if retries >= max_retries_on_excess:
                    # Applying gradients of a non-finite loss would only corrupt `target`, so stop instead:
                    print(f'i={i}\t | loss is not finite after {retries} consecutive retries; stopping.')
                    break

                # Rolling back to a past target:
                # NOTE: The value is assigned in place so that the caller's `target` variable sees the rollback too
                if 1 <= rollback_on_excess <= len(past_targets):
                    target.assign(past_targets[-rollback_on_excess])
                else:
                    target.assign(past_targets[-1])

                # Restarting with a constant, reduced learning rate:
                optimizer = keras.optimizers.SGD(learning_rate=float(optimizer.learning_rate*decay_on_excess))
                retries += 1
                continue
            retries = 0

            #________________________
            # Handling history...

            # Storing current target (i.e. the value before this iteration's update):
            past_targets.append(target.numpy())

            # Limiting the size of past target storage list (if necessary) by removing the oldest entry:
            if len(past_targets) > history_buffer_size:
                del past_targets[0]
            #________________________

            # Applying gradients:
            optimizer.apply_gradients([(grads, target)])

            # Displaying process:
            if i % print_frequency == 0 or (i == n_iter - 1):
                print(f'i={i}\t | loss={float(loss):.5e} \t | lr={float(optimizer.learning_rate.numpy()):.5e}')

        return target

---

# INSTANTIATIONS

---

In [None]:
#@title INSTANTIATION 1: Audio data handler object
#@markdown _The parameters for audio data are not supposed to be subject to change!_
# Fixed audio settings, gathered in one place and unpacked into the handler.
# The paths are left empty; file locations are supplied with the file names later on.
AUDIO_PARAMS = dict(
    content_path='',
    style_path='',
    sr=22050,         # Sampling rate
    n_fft=1024,       # FFT window size
    hop_length=256,   # Hop length
    segment_size=431, # Number of frames per segment
    n_mels=384,       # Number of mel bands
    n_mfcc=40)        # Number of MFCCs
audioDataHandler = AudioDataHandler(**AUDIO_PARAMS)

In [None]:
#@title INSTANTIATION 2: CNN model object
model = get_cnn_5_sec_melspectrogram(input_shape=(audioDataHandler.n_mels, audioDataHandler.segment_size, 1))
# Loading weights
trained_weights_file_name = 'GenreClassificationTrainedModelWeights.npy' #@param {type:"string"}
# WARNING: `allow_pickle=True` unpickles the file's contents, which can execute
# arbitrary code; only load weight files that come from a trusted source.
V = np.load(pwd + trained_weights_file_name, allow_pickle=True).tolist()
# The stored object is expected to be the list of weight arrays in the order `model.set_weights` takes them:
W = list(V)
model.set_weights(W)
#@markdown _The weights were obtained through a customised training process._

#@markdown <br> Model layer names with indices for reference...
layer_names = [layer.name for layer in model.layers]
# NOTE: Both columns are moved into the index, so the table shows only the number-name pairs
df = pd.DataFrame(data={'Layer number': list(range(len(layer_names))), 'Layer name': layer_names}).set_index(['Layer number', 'Layer name'])
ipd.display(df)

Layer number,Layer name
0,identity_2
1,batch_normalization_4
2,conv2d_16
3,max_pooling2d_16
4,conv2d_17
5,max_pooling2d_17
6,conv2d_18
7,max_pooling2d_18
8,conv2d_19
9,max_pooling2d_19


---

**_The NST handler shall be instantiated just before the NST loop, for convenience._**

---

# PERFORMING NST

---

**NOTE**: For audio, make sure to upload the desired audio files to the session storage and enter their names in the respective inputs.

In [None]:
#@title REFERENCE FOR DEFAULT AUDIO DATA
#@markdown **_If default audio is used._**
# Listing the default audio files found in the `AUDIO/` directory under the present working directory:
# NOTE: The lists are created before the directory is read, so they exist (empty) even if it is missing.
content_list, style_list = [], []
try:
    # Iterating through all files in the directory in sorted order:
    # NOTE: `os.listdir` returns names in arbitrary order; sorting keeps the IDs printed below stable between runs.
    for file in sorted(os.listdir(pwd + 'AUDIO/')):
        # Only appending audio file names:
        if 'CONTENT' in file:
            content_list.append(file)
        elif 'STYLE' in file:
            style_list.append(file)

    print('ID\t | Content name\n------------------------------------------------')
    for i, j in enumerate(content_list):
        print(f'{i}\t | {j}')
    print('\n\nID\t | Style name\n------------------------------------------------')
    for i, j in enumerate(style_list):
        print(f'{i}\t | {j}')
except OSError:
    # The directory is missing or unreadable:
    print("Default settings and data not used, I guess you're going custom.")

ID	 | Content name
------------------------------------------------
0	 | CONTENT - Bach - Aria Variata BWV 989 Variation 1 - Brendan Kinsella - Chosic.mp3
1	 | CONTENT - Bach - Goldberg Variations BWV 988 - Aria - Aaron Dunn - Chosic.mp3
2	 | CONTENT - Bach - Minuet - Notebook for Anna Magdalena - Aaron Dunn - Chosic.mp3
3	 | CONTENT - Mozart - Alla Turca - Markus Staab - Chosic.mp3
4	 | CONTENT - Mozart - Piano Concerto 21 in C Major K467 - II-Andante - Markus Staab - Chosic.mp3
5	 | CONTENT - Mozart - Sonata 13 in B Flat Major K333 - I-Allegro - Brendan Kinsella - Chosic.mp3
6	 | CONTENT - Mozart - Sonata 13 in B Flat Major K333 - II-Andante Cantabile - Brendan Kinsella - Chosic.mp3
7	 | CONTENT - Summer Sport - AudioCoffee - Chosic.mp3
8	 | CONTENT - Warm Duck Shuffle - arnebhus - Chosic.mp3
9	 | CONTENT- Slow Burn - Kevin MacLeod - Chosic.mp3


ID	 | Style name
------------------------------------------------
0	 | STYLE - Alien Technology - Mixkit.wav
1	 | STYLE - Arabic Vocal Ambi

In [None]:
#@title INITIALISATION OF AUDIO DATA
#@markdown **CONTENT AND STYLE AUDIO FILE NAMES**:

def _resolve_audio_name(name, candidates):
    '''Return `candidates[int(name)]` if `name` is a valid ID into `candidates`, else `name` unchanged.'''
    try:
        return candidates[int(name)]
    except (ValueError, IndexError, TypeError):
        return name

try:
    #@markdown _For content and style names, only include either the file name with the extension or the ID as given above..._
    directory_path = 'AUDIO' #@param {type:"string"}
    if directory_path == '!content':
        directory_path = '/content/'
    else:
        # Removing extraneous beginning forward-slash if present:
        if directory_path.startswith('/'):
            directory_path = directory_path[1:]
        # Adding forward-slash at the end if not present:
        # NOTE: An empty path is left empty, so that it refers to the present working directory itself
        if directory_path and not directory_path.endswith('/'):
            directory_path = directory_path + '/'
        directory_path = pwd + directory_path
    content = 'CONTENT - Warm Duck Shuffle - arnebhus - Chosic.mp3' #@param {type:"string"}
    style = 'STYLE - Space Traveller - Pixabay.mp3' #@param {type:"string"}

    # Replacing IDs by file names (content and style are resolved independently):
    # NOTE: `content_list` and `style_list` only exist if the reference cell above was run
    resolved_content = _resolve_audio_name(content, globals().get('content_list', []))
    resolved_style = _resolve_audio_name(style, globals().get('style_list', []))
    if (resolved_content, resolved_style) != (content, style):
        content, style = resolved_content, resolved_style
        print('Content name :', content)
        print('Style name   :', style)
    #@markdown _Specifying directory path_ :
    #@markdown - `directory_path` must be put as `!content` to access files in your Colab session storage.
    #@markdown - In every other case, the given path will be taken relative to the present working directory.
    #@markdown - Do not worry about putting forward-slashes however you want in the path name.
    #@markdown - To access a directory in your Google Drive, change `directory_path` accordingly.
    #@markdown - Note that Google Drive is or can be mounted in the first cell of this notebook.
    #@markdown - If audio files are in separate directories, give full paths for names <br> _Remember to set_ `directory_path` _to the common directory (empty if needed)._

    #@markdown <br> _For limiting the number of segments of each audio file to include..._
    segments_to_include = 5 #@param {type:"integer"}

    content_segments, style_segments, content_signal, style_signal = audioDataHandler.get_segments(directory_path + content, directory_path + style, return_signals=True, segments_to_include=segments_to_include)
except Exception as error:
    print('ERROR: Files not found! Check the directory path, audio names and their availability in the intended location.')
    # Showing the underlying error as well, since causes other than a missing file end up here too:
    print(f'Details: {type(error).__name__}: {error}')

In [None]:
#@title RUNNING NST ON CHOSEN AUDIO
#@markdown **NST HANDLER PARAMETERS**:
content_layers = '0, 2' #@param {type:"string"}
style_layers = '4, 8, 12, 16' #@param {type:"string"}
content_weight = 50 #@param
style_weight = 10 #@param
# Ensuring the weights are float values:
content_weight, style_weight = float(content_weight), float(style_weight)

# Parsing the comma-separated layer indices (empty entries, e.g. from a trailing comma, are skipped):
content_layers = [int(i) for i in content_layers.split(',') if i.strip()]
style_layers = [int(i) for i in style_layers.split(',') if i.strip()]
target_type = 'content' #@param ['content', 'style', 'zero', 'random']
#@markdown _Options for target type..._
#@markdown - `content` $\implies$ Target is initialised with content data
#@markdown - `style` $\implies$ Target is initialised with style data
#@markdown - `zero` $\implies$ Target is initialised with zeros
#@markdown - `random` $\implies$ Target is initialised with Gaussian noise

# Instantiating NST class to access NST functions:
nst = NST(
    content_layers=content_layers,
    style_layers=style_layers,
    model=model,
    content_weight=content_weight,
    style_weight=style_weight)

#------------------------------------

# Obtaining content, style and target fit for NST based on obtained segments:
# NOTE: There is the option of using the previous inputs, which can also mean the previous target; this allows you to carry on from where you left off last time
# NOTE: `content` and `style` hold file names after the initialisation cell and NST inputs after this cell,
#       so reusing previous inputs only works if the initialisation cell was not re-run in between
#@markdown _Reset NST inputs, including target (ignoring_ `target type` _argument)?_
reset_nst_inputs = True #@param {type:"boolean"}
if reset_nst_inputs or 'target' not in globals():
    content, style, target = nst.get_data_for_nst(content_segments, style_segments, target_type=target_type)

#------------------------------------

#@markdown _Display parameters stored in the NST handler?_
display_nst_handler_params = True #@param {type:"boolean"}
if display_nst_handler_params:
    print('NST HANDLER PARAMETERS')
    ipd.display(nst.display_params())
    print('\n================================================\n')

#================================================

#@markdown <br> **NST LOOP PARAMETERS**:
iterations_for_nst = 100 #@param
initial_learning_rate = 0.001 #@param
decay_steps = 10 #@param
decay_rate = 1.0 #@param
print_frequency = 10 #@param
decay_on_excess = 0.5 #@param
rollback_on_excess = 2 #@param
max_retries_on_excess = 20 #@param
history_buffer_size = 10 #@param

#@markdown **NOTE**: _Explaining the above parameters..._
#@markdown - `iterations_for_nst`: How many iterations to perform NST for
#@markdown - `initial_learning_rate`: Initial learning rate of NST
#@markdown - `decay_steps`: Number of iterations/steps after which learning rate must decay
#@markdown - `decay_rate`: Rate of learning rate decay
#@markdown - `print_frequency`: Number of iterations between two printed progress lines
#@markdown - `decay_on_excess`: Upon getting loss as NaN or `inf`, the factor by which the learning rate is multiplied
#@markdown - `rollback_on_excess`: Upon getting loss as NaN or `inf`, how many steps to back to go to get a past target value
#@markdown - `max_retries_on_excess`: How many times in a row to retry upon getting NaN or `inf`
#@markdown - `history_buffer_size`: Number of past targets to store

nst.nst_loop(
    content=content,
    style=style,
    target=target,
    n_iter=iterations_for_nst,
    initial_learning_rate=initial_learning_rate,
    decay_steps=decay_steps,
    decay_rate=decay_rate,
    print_frequency=print_frequency,
    decay_on_excess=decay_on_excess,
    rollback_on_excess=rollback_on_excess,
    max_retries_on_excess=max_retries_on_excess,
    history_buffer_size=history_buffer_size)

NST HANDLER PARAMETERS


Unnamed: 0,Parameter name,Parameter value
0,content_weight,50.0
1,style_weight,10.0
2,feature_extractor,"<Functional name=functional_28, built=True>"
3,content_layers,"[identity_2, conv2d_16]"
4,style_layers,"[conv2d_17, conv2d_19, conv2d_21, conv2d_23]"




i=0	 | loss=4.94727e+10 	 | lr=1.00000e-03
i=20	 | loss=8.25954e+15 	 | lr=1.56250e-05
i=30	 | loss=2.66836e+19 	 | lr=1.95313e-06
i=40	 | loss=3.25571e+25 	 | lr=2.44141e-07
i=50	 | loss=1.82337e+28 	 | lr=3.05176e-08
i=70	 | loss=8.19101e+28 	 | lr=4.76837e-10
i=80	 | loss=2.32007e+31 	 | lr=5.96046e-11
i=90	 | loss=1.65142e+30 	 | lr=7.45058e-12
i=99	 | loss=7.21643e+24 	 | lr=1.86265e-12


In [None]:
#@title PLAY AUDIO
print('\nTarget signal:')
# `target_signal` stays None if the reconstruction below fails:
target_signal = None
# Dropping the trailing channel axis (of size 1) to get back the melspectrogram segments:
# NOTE: `np.squeeze` is used because the `newshape` keyword of `np.reshape` is deprecated in recent NumPy versions
target_reshaped = np.squeeze(np.array(target), axis=-1)
# You can limit the number of target segments to reconstruct, if you don't want the whole audio:
target_segments_to_reconstruct = 5 #@param {type:"integer"}
target_signal = audioDataHandler.reconstruct_signal_from_melspectrogram_segments_and_play(target_reshaped[:target_segments_to_reconstruct], return_value=True)

# Play portions of the content and style audio for reference:
play_content_and_style = False #@param {type:"boolean"}
if play_content_and_style:
    # Maximum number of samples of each reference signal to play:
    reference_samples = 1000000
    print('Content signal:')
    audioDataHandler.play_signal(content_signal[:reference_samples])
    print('\nStyle signal:')
    audioDataHandler.play_signal(style_signal[:reference_samples])


Target signal:
