# 1. Setup Environment

In [None]:
%%capture --no-display
%load_ext autoreload
%autoreload 1
RAND_STATE_GLOB = 1291

import cv2
import scipy
import numpy as np
import pandas as pd
from numpy import savetxt

import os
import sys
import time
import joblib
import inspect
# Expose only GPU 0 to CUDA-based libraries (must be set before they initialise)
os.environ["CUDA_VISIBLE_DEVICES"]="0"
# Resolve the directory this code runs from and put its parent on sys.path so
# that the project-local "Utils" package can be imported below.
# NOTE(review): inside a notebook the current frame has no real source file, so
# abspath() presumably resolves against the working directory -- verify that
# currentdir ends up being the notebook's folder.
abspathdir = os.path.abspath(inspect.getfile(inspect.currentframe()))
currentdir = os.path.dirname(abspathdir)
parentdir  = os.path.dirname(currentdir)
sys.path.insert(0, parentdir) 

from Utils._fe_utils import get_id

import librosa as librosa
import librosa.display as display

import IPython.display as ipd
from IPython.display import clear_output

import matplotlib.pyplot as plt
%matplotlib inline 

# Deep Learning (Keras Setups)
from keras.utils  import Sequence
from keras.layers import Input, Dense, Lambda, Flatten, Reshape, BatchNormalization
from keras.layers import Conv2D, AveragePooling2D, MaxPooling2D, Dropout, SpatialDropout2D
from keras.layers import LeakyReLU
from keras.models import Model, load_model, model_from_json
from keras.losses import mse
from keras import backend as K
from keras import optimizers
from keras.callbacks import ModelCheckpoint

#### ii. Helper Functions 

In [None]:
def get_sampleRate(audio_database):
    '''Return the "SampleRate" stored on the first entry of the audio database.

    Only the first value (in the mapping's iteration order) is inspected.

    Args:
        audio_database: mapping whose values are dict-like entries that
            carry a "SampleRate" key.

    Returns:
        The "SampleRate" value of the first entry.

    Raises:
        IndexError: if the database has no entries.
        KeyError: if the first entry has no "SampleRate" key.
    '''
    try:
        # Peek at the first value without copying all values into a list
        first_entry = next(iter(audio_database.values()))
    except StopIteration:
        raise IndexError(
            "audio_database is empty; cannot determine the sample rate"
        ) from None
    return first_entry["SampleRate"]

### 1.1 Load semi-structured features from database file (mgdb.pkl)

In [None]:
# The database file is read from <currentdir>/tmpdata/mgdb.pkl. dbname carries
# its own leading slash, so the two parts are joined by plain concatenation.
db_dir = currentdir + '/tmpdata'
dbname = "/mgdb.pkl"
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load database files that come from a trusted source.
db = joblib.load(db_dir + dbname)

# Extract the video data (a nested database within the mgdb)
video_db = db["Video"]

# Extract the audio data
audio_db = db["Audio"]

#### 1.1.1 Perform some filtering on the databases and simplify the datastructures  

In [None]:
# Samples whose duration (truncated to whole seconds) is below this many
# seconds are dropped from both databases.
MIN_DURATION_SECS = 4

# Video: keep only the fields needed downstream, keyed by the original video key.
# NOTE(review): a video kept here may reference a MusicID whose audio entry is
# filtered out below -- confirm that cannot happen for this dataset.
video_db_simple = {}
for k, old_entry in video_db.items():
    # Remove a sample if the duration is too short
    if int(old_entry["Duration"]) < MIN_DURATION_SECS:
        continue

    video_db_simple[k] = {
        "MusicID": old_entry["MetaInfo"]["MusicID"],
        "MusicEncoding": old_entry["MusicEncoding"],
        "MotiongramX": old_entry["MotiongramX"],
        "MotiongramY": old_entry["MotiongramY"],
    }

# Audio: map each MusicID directly to its raw audio samples.
audio_db_simple = {}
for old_entry in audio_db.values():
    # Remove a sample if the duration is too short
    if int(old_entry["Duration"]) < MIN_DURATION_SECS:
        continue

    audio_db_simple[old_entry["MusicID"]] = old_entry["RawAudio"]

#### 1.1.2 Convert the simplified dictionaries to pandas dataframes for easier access

In [None]:
# Convert dictionaries to pandas dataframes for easier access.
# The dictionary keys become the columns: one column per video key in video_df
# (rows are the fields "MusicID", "MusicEncoding", "MotiongramX", "MotiongramY")
# and one column per MusicID in audio_df.
video_df = pd.DataFrame(video_db_simple)
# NOTE(review): presumably every RawAudio entry is an equal-length 1-D sequence,
# otherwise this construction cannot produce one column per track -- TODO confirm
audio_df = pd.DataFrame(audio_db_simple)

# 2. Feature Processing

#### 2.1 Setup class for handling spectrogram generation

In [None]:
class SpectrogramGenerator:
    '''Computes dB-scaled mel spectrograms for audio tracks and caches them.

    Args:
        audio_df: table indexed by music id; audio_df[music_id] must offer
            to_numpy() returning the track's raw samples.
        sr: sampling rate of the audio.
        fmin: lowest frequency of the mel filter bank.
        fmax: highest frequency of the mel filter bank.
        n_fft: number of samples per FFT window.
        hop_length: number of samples between successive frames.
        n_bins: number of mel bands (rows of the spectrogram).
    '''

    def __init__(self, audio_df, sr, fmin, fmax, n_fft, hop_length, n_bins):
        self.sr = sr
        self.fmin = fmin
        self.fmax = fmax
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.n_bins = n_bins

        self.audio_df = audio_df
        # Cache: music id -> spectrogram already computed for it
        self.existing_spectros = {}

    def get_spectrogram(self, music_id):
        '''Return the dB-scaled mel spectrogram for music_id.

        The result is computed on first request and served from the cache
        afterwards, so repeated calls return the very same array object.
        '''
        if music_id in self.existing_spectros:
            return self.existing_spectros[music_id]

        # scipy.signal.hamming was removed from recent SciPy releases;
        # scipy.signal.windows.hamming is the same window function.
        from scipy.signal.windows import hamming

        y = self.audio_df[music_id].to_numpy(dtype=np.float32)
        # Pass the signal by keyword: newer librosa releases reject
        # positional arguments here, older ones accept both forms.
        mel_spectrogram = librosa.feature.melspectrogram(
            y=y, sr=self.sr, n_mels=self.n_bins,
            n_fft=self.n_fft, hop_length=self.hop_length,
            window=hamming,
            fmin=self.fmin, fmax=self.fmax
        )
        # Convert power to decibels relative to the spectrogram's own maximum
        mel_db = librosa.power_to_db(mel_spectrogram, ref=np.max)
        self.existing_spectros[music_id] = mel_db
        return mel_db

#### 2.2 Setup a class for generating data

In [None]:
class RegressionDataGenerator(Sequence):
    '''Keras Sequence that yields (motiongram, mel spectrogram) batches.

    Each sample id is looked up in input_df. The sample's "MotiongramY" image
    becomes the model input and the spectrogram of its "MusicID" (obtained
    from target_generator) becomes the regression target.

    Args:
        input_ids: list of sample ids used to index input_df.
        input_df: table where input_df[id] gives access to the
            "MotiongramY" and "MusicID" fields of a sample.
        target_generator: object exposing get_spectrogram(music_id).
        batch_size: number of samples per batch.
        dim: (rows, cols) of one sample; each spectrogram must already
            hold rows * cols values.
        n_channels: stored only; not used when building batches.
        shuffle: reshuffle the sample order at the end of every epoch.
        flatten: if true, samples are emitted as flat vectors of
            rows * cols values, otherwise as (rows, cols) arrays.
    '''

    def __init__(self, input_ids, input_df, target_generator, batch_size=32,
                 dim=(128, 128), n_channels=1, shuffle=True, flatten=True):
        '''Initialization'''
        super().__init__()
        self.dim = dim
        self.batch_size = batch_size
        self.input_ids = input_ids
        self.input_df = input_df
        self.target_generator = target_generator
        self.n_channels = n_channels
        self.shuffle = shuffle
        self.flatten = flatten
        # Build the initial (optionally shuffled) sample order
        self.on_epoch_end()

    def __len__(self):
        '''Denotes number of batches per epoch (a trailing partial batch is dropped)'''
        return int(np.floor(len(self.input_ids) / self.batch_size))

    def __getitem__(self, index):
        '''Generate one batch of data'''
        # Positions (into input_ids) that belong to this batch
        start = index * self.batch_size
        batch_indexes = self.indexes[start:start + self.batch_size]

        # Translate positions into sample ids
        input_ids_temp = [self.input_ids[k] for k in batch_indexes]

        return self.__data_generation(input_ids_temp)

    def __data_generation(self, input_ids_temp):
        '''Builds the (X, y) arrays for the given sample ids'''
        rows, cols = self.dim
        sample_shape = (rows * cols,) if self.flatten else (rows, cols)

        # Size the arrays by the ids actually given so no row is left
        # uninitialised.
        n_samples = len(input_ids_temp)
        X = np.empty((n_samples, *sample_shape))
        y = np.empty((n_samples, *sample_shape))

        for i, id_ in enumerate(input_ids_temp):
            # Fetch example from the input dataframe
            example = self.input_df[id_]

            # Get the motiongram: grayscale, resized, scaled to [0, 1]
            mgy = cv2.cvtColor(example["MotiongramY"], cv2.COLOR_RGB2GRAY)
            # cv2.resize expects dsize as (width, height), i.e. (cols, rows)
            mgy = cv2.resize(mgy, (cols, rows))
            mgy = mgy.astype(np.float32) / 255.
            X[i] = mgy.flatten() if self.flatten else mgy

            # Fetch the spectrogram of the music that belongs to this sample
            mel = self.target_generator.get_spectrogram(example["MusicID"])
            y[i] = mel.flatten() if self.flatten else mel

        # Return only once every sample of the batch has been filled in
        return X, y

    def on_epoch_end(self):
        '''Updates indexes after each epoch'''
        self.indexes = np.arange(len(self.input_ids))
        if self.shuffle:
            np.random.shuffle(self.indexes)

# 3. Model Preparation

#### 3.1 Setup a class that delivers a ML Model (CVAE)

In [None]:
class ConvolutionalVariationalAutoEncoder:
    '''Builds a variational autoencoder with a convolutional encoder and a dense decoder.

    Construction creates three Keras models: the encoder, the decoder and the
    full (encoder -> decoder) model, which is compiled with a custom loss.
    They are returned by get_encoder(), get_decoder() and get_full_model().

    Args:
        input_dims: pair of spatial sizes; the model consumes and produces
            flat vectors of input_dims[0] * input_dims[1] values.
        latent_dim: size of the latent vector z.
        lr: learning rate handed to the optimizer.
    '''
    
    def __init__(self, input_dims, latent_dim, lr):
        self.input_dims = input_dims
        self.latent_dim = latent_dim
        self.lr         = lr        
        self.kernel     = (3,3)  
        # One LeakyReLU layer instance, passed as the `activation` argument of
        # every hidden Conv2D/Dense layer below.
        self.act_layer  = LeakyReLU()
        # NOTE(review): the lowercase `optimizers.adam` name presumably only exists
        # in older Keras releases -- verify against the installed version.
        self.optimizer  = optimizers.adam(lr=self.lr)
        # Not referenced anywhere else in this class; the compiled loss is the
        # closure returned by vae_loss_carrier().
        self.loss       = 'mse'
        self.act_last_layer = 'sigmoid'
        
        # Order matters: the full model is assembled from encoder and decoder.
        self._make_encoder()
        self._make_decoder()
        self._make_VAE()
    
    def get_full_model(self):
        '''Returns the compiled encoder -> decoder model'''
        return self.full_vae
    
    def get_encoder(self):
        '''Returns the encoder model (outputs z_mean, z_log_var and sampled z)'''
        return self.encoder_vae
    
    def get_decoder(self):
        '''Returns the decoder model (latent vector -> flat output vector)'''
        return self.decoder_vae
    
    def _make_encoder(self):
        '''Creates a field in the class representing the encoder model'''
        
        # Setup the input layer (a flat vector of all input values)
        input_dim_sca = self.input_dims[0] * self.input_dims[1]
        self.input_encoder = Input(shape=(input_dim_sca,), name='input_encoder')
        
        # Setup hidden layers (Convolutional Layers)
        # The flat input is reshaped to a 2-D map; the -1 lets Keras infer the
        # channel axis, which is 1 because the input holds exactly
        # input_dims[0] * input_dims[1] values.
        # Six Conv2D + MaxPooling2D stages follow, doubling the filters from 32
        # to 1024. With 'same' padding the pools shrink each spatial side by
        # 2, 2, 2, 2, 2 and 4, so a 128 x 128 input ends at 1 x 1 before Flatten.
        encoder     = Reshape((self.input_dims[0], self.input_dims[1],-1))(self.input_encoder)
        encoder     = Conv2D(32, self.kernel, activation=self.act_layer, padding='same')(encoder)
        encoder     = MaxPooling2D((2, 2), padding='same')(encoder)
        encoder     = Conv2D(64, self.kernel, activation=self.act_layer, padding='same')(encoder)
        encoder     = MaxPooling2D((2, 2), padding='same')(encoder)
        encoder     = Conv2D(128, self.kernel, activation=self.act_layer, padding='same')(encoder)
        encoder     = MaxPooling2D((2, 2), padding='same')(encoder)
        encoder     = Conv2D(256, self.kernel, activation=self.act_layer, padding='same')(encoder)
        encoder     = MaxPooling2D((2, 2), padding='same')(encoder)
        encoder     = Conv2D(512, self.kernel, activation=self.act_layer, padding='same')(encoder)
        encoder     = MaxPooling2D((2, 2), padding='same')(encoder)
        encoder     = Conv2D(1024, self.kernel, activation=self.act_layer, padding='same')(encoder)
        encoder     = MaxPooling2D((4, 4), padding='same')(encoder)
        encoder     = Flatten()(encoder)

        # Setup special layers required for a VAE: two linear heads give the
        # mean and log-variance of the latent Gaussian, and z is sampled from
        # them via sampling().
        self.z_mean    = Dense(self.latent_dim, name='z_mean')(encoder)
        self.z_log_var = Dense(self.latent_dim, name='z_log_var')(encoder)  
        self.z         = Lambda(ConvolutionalVariationalAutoEncoder.sampling, 
                        output_shape=(self.latent_dim,), name='z')([self.z_mean, self.z_log_var])
        
        # Full encoder model
        self.encoder_vae = Model(self.input_encoder, 
                 [self.z_mean, self.z_log_var, self.z], name='encoder_vae')
    
    def _make_decoder(self):
        '''Creates a field in the class representing the decoder model'''
        # Setup the "input" layer for the decoder
        input_dim_sca = self.input_dims[0] * self.input_dims[1]
        input_latent = Input(shape=(self.latent_dim,), name='z_sampling')
        
        # Setup hidden layers (fully connected dense layers)
        # Widths grow from 1/32 over 1/8 and 1/4 of the output size up to the
        # full output size; the last layer uses the sigmoid activation.
        decoder = Dense(input_dim_sca//32, activation=self.act_layer)(input_latent)
        decoder = Dense(input_dim_sca//8, activation=self.act_layer)(decoder)
        decoder = Dense(input_dim_sca//4, activation=self.act_layer)(decoder)
        decoder = Dense(input_dim_sca, activation=self.act_last_layer)(decoder)
        
        # Full decoder model
        self.decoder_vae = Model(input_latent, decoder, name='decoder_vae')
    
    def _make_VAE(self):
        '''Creates a full Variational AutoEncoder model'''
        # Create the "output" layer (in this case our decoder model),
        # fed with the sampled z (third output of the encoder)
        output_decoder = self.decoder_vae(
            self.encoder_vae(self.input_encoder)[2])
                
        # Create the full model
        self.full_vae = Model(
            self.input_encoder, output_decoder, name='full_vae')
        
        # Setup special loss
        # The reconstruction term is the MSE between the model's own input
        # tensor and its output tensor, scaled by the number of input values.
        input_dim_sca        = self.input_dims[0] * self.input_dims[1]
        reconstruction_loss  = mse(self.input_encoder, output_decoder)
        reconstruction_loss *= input_dim_sca 
        
        #self.full_vae.add_loss(vae_loss)
        self.full_vae.compile(optimizer=self.optimizer, 
                  loss=self.vae_loss_carrier(reconstruction_loss))
    
    def vae_loss_carrier(self, reconstruction_loss):
        '''Wraps the given reconstruction term into a Keras-style loss function.

        The returned function has the (y_true, y_pred) signature Keras expects,
        but it does not read either argument. It combines the captured
        reconstruction_loss tensor with the KL term computed from z_mean and
        z_log_var.

        NOTE(review): because y_true is ignored, the targets supplied by a data
        generator do not influence training -- confirm this is intended.
        '''
        def vae_loss_fn(y_true, y_pred):
            # KL divergence of N(z_mean, exp(z_log_var)) from a standard normal,
            # summed over the latent dimensions
            kl_loss  = 1 + self.z_log_var - K.square(self.z_mean) - K.exp(self.z_log_var)
            kl_loss  = K.sum(kl_loss, axis=-1)
            kl_loss *= -0.5
            # Average the combined per-sample loss over the batch
            vae_loss = K.mean(reconstruction_loss + kl_loss)
            return vae_loss
        return vae_loss_fn
        
    @staticmethod
    def sampling(args):
        '''Samples z = z_mean + exp(0.5 * z_log_var) * epsilon with epsilon ~ N(0, 1)

        Args:
            args: pair (z_mean, z_log_var) of tensors.
        '''
        z_mean, z_log_var = args
        # The batch size is only known at run time, the latent size is static
        batch = K.shape(z_mean)[0]
        dim = K.int_shape(z_mean)[1]
        epsilon = K.random_normal(shape=(batch, dim))
        # exp(0.5 * log_var) is the standard deviation
        return z_mean + K.exp(0.5 * z_log_var) * epsilon

# 4. Training

#### 4.1 Setup global variables for spectrogram generation

In [None]:
# Sampling rate, read from the first entry of the audio database
sr = get_sampleRate(audio_db)

# Min/max frequency of the mel filter bank; the maximum is half the sampling rate
fmin, fmax = 20, sr / 2 

# Number of samples for each FFT window.
# For music 2048 is recommended, but with 4096 we are getting better results
n_fft = 4096

# Hop between successive frames (spectrogram columns) - so we can get 128 frames
# NOTE(review): the frame count also depends on clip length and sampling rate --
# confirm that every clip really yields 128 frames.
hop_length = 690

# Number of mel bands (spectrogram rows) - with this, we get nice 128 x 128 spectrograms
n_mels = 128

##### 4.1.1 Create the SpectrogramGenerator instance

In [None]:
# Shared spectrogram source for all data generators; it caches every
# spectrogram it computes, keyed by music id.
target_generator = SpectrogramGenerator(
    audio_df=audio_df, sr=sr, 
    fmin=fmin, fmax=fmax, n_fft=n_fft,
    hop_length=hop_length, n_bins=n_mels
)

#### 4.2 Setup global variables for NN (hyperparameters)

In [None]:
# Size of one motiongram / spectrogram sample
input_dims    = (128, 128)
# Size of the latent vector of the autoencoder
latent_dim    = 512
# Number of samples per training / validation batch
batch_size    = 128
# NOTE(review): the training cell passes epochs=1 to fit_generator, so this
# value is currently not used there -- confirm which one is intended.
epochs        = 1500
# Learning rate handed to the optimizer
learning_rate = 0.00025

#### 4.3 Setup KFolds 

In [None]:
from sklearn.model_selection import StratifiedKFold
# Same seed value as assigned in the setup cell; repeated here so this cell
# can be run on its own.
RAND_STATE_GLOB = 1291

# Create an instance of "StratifiedKFold" class
# This class will perform the splits for us
# (4 shuffled folds, reproducible through the fixed random state)
skf = StratifiedKFold(
    n_splits=4, shuffle=True, random_state=RAND_STATE_GLOB
)

# Create a list of integer labels on which the SKF can
# perform its splits (one "MusicEncoding" value per video column)
labels = list(video_df.loc["MusicEncoding", :])

# Create a dummy array to represent features.
# We are only interested in the indexes provided,
# so one all-zero row per video column is enough
dummy_features = np.zeros((len(video_df.columns), 1))

#### 4.4 Begin Actual Training 

In [None]:
# Perform the K splits; fold_var labels the current fold (starting at 1)
for fold_var, (train, test) in enumerate(skf.split(dummy_features, labels), start=1):
    
    # Print the fold label and the number of samples in the train-test split
    print("Fold", fold_var, train.shape, test.shape)
    
    # Create our partition dictionary (positions -> video ids)
    partition = {
        "train": list(video_df.columns[train]),
        "validation": list(video_df.columns[test]) 
    }
    
    # Setup our data generators
    training_generator = RegressionDataGenerator(
        input_ids=partition["train"], input_df=video_df, 
        target_generator=target_generator, batch_size=batch_size,
        dim=input_dims, n_channels=1, shuffle=False
    )
    
    # NOTE(review): the generator drops a trailing partial batch, so a
    # validation fold with fewer than batch_size samples yields zero
    # batches -- confirm the fold sizes.
    validation_generator = RegressionDataGenerator(
        input_ids=partition["validation"], input_df=video_df, 
        target_generator=target_generator, batch_size=batch_size,
        dim=input_dims, n_channels=1, shuffle=False
    )
    
    # Get a fresh instance of the CVAE model for every fold
    cvae  = ConvolutionalVariationalAutoEncoder(
        input_dims, latent_dim, learning_rate)
    model = cvae.get_full_model()
        
    # Train the model
    # steps_per_epoch follows the generator's own batch count (the same value
    # as train.shape[0] // batch_size).
    # NOTE(review): epochs is fixed to 1 here although an `epochs`
    # hyperparameter is defined above -- confirm which one is intended.
    model.fit_generator(
        generator=training_generator,
        steps_per_epoch=len(training_generator),
        validation_data=validation_generator,
        use_multiprocessing=True,
        workers=6,
        epochs=1,
        verbose=1
    )