# Dynamical Variational Autoencoders

Run the following cells to install the DVAE package, import some libraries and define spectrogram visualization parameters.

In [1]:
# Install the DVAE package (code_release branch) into the kernel's environment.
# The explicit %pip magic does not depend on IPython automagic, which is only
# applied to single-line cells; all output is discarded.
%pip install git+https://github.com/sleglaive/DVAE-speech@code_release &> /dev/null

In [2]:
# Clone the repository into ./DVAE-speech (output discarded); the model chooser
# further down looks for pretrained models under ./DVAE-speech/saved_model.
!git clone https://github.com/sleglaive/DVAE-speech.git &> /dev/null

In [3]:
#@title
from ipywidgets import interact, interactive
from google.colab import files
import os
import matplotlib.pyplot as plt
import numpy as np
import librosa
import librosa.display
from IPython.display import Audio
import dvae
from dvae import LearningAlgorithm

sr = 16000  # sampling rate in Hz
wlen_sec = 64e-3  # STFT window length in seconds
hop_percent = 0.25  # hop size as a fraction of the window length
wlen = wlen_sec*sr  # window length in samples (64 ms at the sampling rate above)
# np.int was deprecated in NumPy 1.20 and removed in 1.24; the builtin int is
# the drop-in replacement and yields plain Python integers.
wlen = int(np.power(2, np.ceil(np.log2(wlen))))  # round up to the next power of 2
hop = int(hop_percent*wlen)  # hop size in samples
win = np.sin(np.arange(.5, wlen-.5+1)/wlen*np.pi)  # sine analysis window

### Choose one of the available speech files, or upload your own

In [4]:
#@title
def choose_file(file):
  """Download the selected example recording to ``speech_orig.wav``.

  Args:
    file: ``'male'`` or ``'female'``; selects which demo URL is fetched.
      Any other value leaves the working directory untouched.

  The download is done with ``wget`` in quiet mode (``-q``) and always writes
  to ``speech_orig.wav`` (``-O``), so a previous file of that name is replaced.
  """
  if file == 'male':
    !wget -q -O speech_orig.wav https://sleglaive.github.io/demos/icassp2020/HOME-LIVINGB-2_-5/speech_orig.wav
  elif file == 'female':
    !wget -q -O speech_orig.wav https://sleglaive.github.io/demos/icassp2020/CAFE-CAFE-2_-5/speech_orig.wav

# Dropdown entries as (label, value) pairs; the value is what choose_file receives.
options = [(speaker + ' speaker', speaker) for speaker in ('male', 'female')]
file_widget = interactive(choose_file, file=options)
display(file_widget)

interactive(children=(Dropdown(description='file', options=(('male speaker', 'male'), ('female speaker', 'fema…

Run the next cell if you want to upload a file, otherwise just move on.

In [5]:
# Open the Colab upload dialog; if at least one file was sent, the first one
# is renamed so the rest of the notebook can read it as speech_orig.wav.
uploaded = files.upload()
if uploaded:
  os.rename(next(iter(uploaded)), 'speech_orig.wav')

### Choose one of the DVAE models

In [20]:
#@title
def choose_model(model):
  """Locate and load the pretrained DVAE selected in the dropdown.

  The model folder is the first directory under ``./DVAE-speech/saved_model``
  whose path contains both ``'_' + model + '_'`` and ``'_F'``. Its
  ``config.ini`` and one of its ``.pt`` files are used to build the
  ``LearningAlgorithm``. The config and checkpoint paths are printed.

  Args:
    model: model tag as it appears in the saved-model folder name.

  Returns:
    The ``LearningAlgorithm`` instance with the checkpoint loaded.

  Raises:
    FileNotFoundError: if no matching model folder exists, or the folder
      contains no ``.pt`` checkpoint.
  """
  saved_root = './DVAE-speech/saved_model'
  candidates = [f[0] for f in os.walk(saved_root)
                if '_'+model+'_' in f[0] and '_F' in f[0]]
  if not candidates:
    # Previously surfaced as a bare IndexError with no hint about the cause.
    raise FileNotFoundError(
        "no saved model matching '" + model + "' found under " + saved_root
        + " (has the repository been cloned?)")
  model_path = candidates[0]

  cfg_file = os.path.join(model_path, 'config.ini')
  print(cfg_file)

  model_state = [f for f in os.listdir(model_path) if f.endswith('.pt')]
  if not model_state:
    raise FileNotFoundError("no .pt checkpoint found in " + model_path)
  if len(model_state)==1:
    model_state = model_state[0]
  else:
    # NOTE(review): os.listdir returns entries in arbitrary order, so which
    # checkpoint sits at index 1 is not guaranteed -- confirm the intended file.
    model_state = model_state[1]
  model_state = os.path.join(model_path, model_state)
  print(model_state)

  learning_algo = LearningAlgorithm(config_file=cfg_file)
  learning_algo.load_state_dict(state_dict_file=model_state)
  return learning_algo

# Dropdown entries as (label, value) pairs. The value is passed to choose_model,
# which looks for '_' + value + '_' in the saved-model folder paths.
options = [('VAE (complexified)', 'VAE'), 
           ('VAE (original)', 'origVAE'), 
           ('DKF', 'DKF'), 
           ('DSAE', 'DSAE'), 
           ('causal RVAE (complexified)', 'RVAE-Causal'), 
           ('non-causal RVAE (complexified)', 'RVAE-NonCausal'),
           ('causal RVAE (original)', 'origRVAE-Causal'), 
           # NOTE(review): this value uses an underscore where the sibling RVAE
           # entries use a hyphen -- confirm it matches the saved-model folder name.
           ('non-causal RVAE (original)', 'origRVAE_NonCausal'),
           ('STORN', 'STORN'), 
           ('SRNN', 'SRNN'), 
           ('VRNN', 'VRNN')
           ]

# model_widget.result holds the value returned by the latest choose_model call;
# the analysis-resynthesis cell reads the loaded model from there.
model_widget = interactive(choose_model, model=options)

display(model_widget)

interactive(children=(Dropdown(description='model', options=(('VAE (complexified)', 'VAE'), ('VAE (original)',…

### Analysis-resynthesis

Run the following cell to perform analysis-resynthesis through the DVAE encoder-decoder.


In [32]:
wav_file = 'speech_orig.wav'
x, sr = librosa.load(wav_file, sr=sr)

# Derive the output name from the input stem; splitext handles any extension
# length instead of assuming exactly four trailing characters.
rec_wav_file = './' + os.path.splitext(wav_file)[0] + '_recon.wav'

# The widget's result stays None when the model chooser has not been run or
# when loading failed, so fail here with an actionable message.
learning_algo = model_widget.result
if learning_algo is None:
  raise RuntimeError('No model is loaded: run the model selection cell and '
                     'check that it finished without errors.')
learning_algo.generate(audio_orig=wav_file, audio_recon=rec_wav_file)

# Read the reconstruction back at the same sampling rate as the original.
x_rec, sr = librosa.load(rec_wav_file, sr=sr)

Choose between the original and the reconstructed signal.

In [33]:
#@title
def _show_signal(signal, label):
  """Plot the waveform and dB spectrogram of ``signal`` and embed an audio player.

  Args:
    signal: 1-D array of audio samples at the notebook-level sampling rate ``sr``.
    label: prefix used in the two figure titles (e.g. ``'original'``).

  Uses the notebook-level STFT settings ``wlen``, ``hop`` and ``win``.
  """
  time = np.arange(signal.shape[0])/sr
  fig, ax = plt.subplots(figsize=(8,2))
  ax.plot(time, signal)
  ax.set_title(label + ' waveform')
  ax.set_xlabel('time (s)')
  ax.set_ylabel('amplitude')

  spec = librosa.stft(signal, n_fft=wlen, hop_length=hop, win_length=wlen,
                      window=win)

  fig, ax = plt.subplots(figsize=(10,6))
  img = librosa.display.specshow(librosa.amplitude_to_db(np.abs(spec)), hop_length=hop, sr=sr,
                                 y_axis='linear', x_axis='time', ax=ax, cmap='plasma')
  ax.set_title(label + ' power spectrogram')
  fig.colorbar(img, ax=ax, format="%+2.0f dB")
  ax.set_ylim(0, 4000)  # only the 0-4 kHz band is displayed

  display(Audio(signal, rate=sr))


def show_results(file):
  """Display the original or the reconstructed signal.

  Args:
    file: ``'original'`` shows the notebook-level signal ``x``; any other
      value shows the reconstruction ``x_rec``.
  """
  if file=='original':
    _show_signal(x, 'original')
  else:
    _show_signal(x_rec, 'reconstructed')


# Dropdown that calls show_results with the selected value; the trailing ';'
# keeps the call's return value from being echoed as cell output.
interact(show_results, file=[('original', 'original'), ('reconstructed', 'reconstructed')]);


interactive(children=(Dropdown(description='file', options=(('original', 'original'), ('reconstructed', 'recon…