# Text-to-speech demo

- Tacotron2 (mel-spectrogram prediction part): https://github.com/Rayhane-mamah/Tacotron-2
- WaveNet: https://github.com/r9y9/wavenet_vocoder

This is a proof of concept for Tacotron2 text-to-speech synthesis. The models used here were trained on the [LJSpeech dataset](https://keithito.com/LJ-Speech-Dataset/).

**Notice**: Waveform generation is very slow because it uses naive autoregressive generation. It does not use the parallel generation method described in [Parallel WaveNet](https://arxiv.org/abs/1711.10433).

**Estimated time to complete**: 2 ~ 3 hours.

In [0]:
import os
from os.path import exists, join, expanduser

# Work from the home directory so both repositories are cloned side by side.
os.chdir(expanduser("~"))

# WaveNet vocoder: cloned only when the directory does not exist yet.
wavenet_dir = "wavenet_vocoder"
if not exists(wavenet_dir):
  ! git clone https://github.com/r9y9/$wavenet_dir
    
# Tacotron-2 from the r9y9 GitHub account: a fresh clone is switched to the
# wavenet3 branch (git checkout -B creates or resets the local branch).
taco2_dir = "Tacotron-2"
if not exists(taco2_dir):
  ! git clone https://github.com/r9y9/$taco2_dir
  ! cd $taco2_dir && git checkout -B wavenet3 origin/wavenet3

## Setup

### Install dependencies

In [0]:
# Install dependencies
# TensorFlow is capped at 1.9.0; librosa and lws are imported by the
# spectrogram-inversion cell further down.
! pip install -q --upgrade "tensorflow<=1.9.0"
! pip install librosa
! pip install lws
# Tacotron-2's own requirements, installed from inside its checkout.
os.chdir(join(expanduser("~"), taco2_dir))
! pip install -q -r requirements.txt

# Editable install of wavenet_vocoder together with its "train" extras.
os.chdir(join(expanduser("~"), wavenet_dir))
! pip install -q -e '.[train]'

In [0]:
# Confirm that PyTorch and TensorFlow import after installation; the final
# expression shows the installed TensorFlow version as the cell output.
import torch
import tensorflow
tensorflow.__version__

### Download pretrained models

#### Tacotron2 (mel-spectrogram prediction part)

In [0]:
os.chdir(join(expanduser("~"), taco2_dir))
# The pretrained model ends up in logs-Tacotron/pretrained.
! mkdir -p logs-Tacotron
# Skip download and extraction when the extracted directory is already there.
if not exists("logs-Tacotron/pretrained"):
  ! curl -O -L "https://www.dropbox.com/s/vx7y4qqs732sqgg/pretrained.tar.gz"
  ! tar xzvf pretrained.tar.gz
  ! mv pretrained logs-Tacotron

#### WaveNet

In [0]:
os.chdir(join(expanduser("~"), wavenet_dir))
# File names of the hparams preset (JSON) and the model checkpoint. Later cells
# open them by these relative names from inside the wavenet_vocoder directory.
wn_preset = "20180510_mixture_lj_checkpoint_step000320000_ema.json"
wn_checkpoint_path = "20180510_mixture_lj_checkpoint_step000320000_ema.pth"

# Download each file only if it is not already present.
if not exists(wn_preset):
  !curl -O -L "https://www.dropbox.com/s/0vsd7973w20eskz/20180510_mixture_lj_checkpoint_step000320000_ema.json"
if not exists(wn_checkpoint_path):
  !curl -O -L "https://www.dropbox.com/s/zdbfprugbagfp2w/20180510_mixture_lj_checkpoint_step000320000_ema.pth"

## Input texts to be synthesized

Choose your favorite sentences :)

In [0]:
# Switch to the Tacotron-2 checkout so that text_list.txt, written by the
# following cell, is created where synthesize.py is run.
os.chdir(join(expanduser("~"), taco2_dir))

In [0]:
%%bash
# Write the sentences to synthesize, one per line, to text_list.txt.
cat << EOS > text_list.txt
I cannot believe that we have end to end text to speech setup.
This is really awesome!
This is text-to-speech online demonstration by Tacotron 2, Mel Super Resolution and Wavenet.
Thanks for your patience.
EOS

# Print the file back to confirm its contents.
cat text_list.txt

## Mel-spectrogram prediction by Tacotron2

In [0]:
# Remove old files if exist
! rm -rf tacotron_output
# Run Tacotron-2's synthesize.py in eval mode over text_list.txt. Later cells
# read the results through tacotron_output/eval/map.txt.
! python synthesize.py --model='Tacotron' --mode='eval' \
  --hparams='symmetric_mels=False,max_abs_value=4.0,power=1.1,outputs_per_step=1' \
  --text_list=./text_list.txt

# Spectrogram Inversion by MelSrez

In [0]:
import librosa
import numpy as np
import lws

def create_mel_filterbank(*args, **kwargs):
  """Build a mel filterbank by forwarding every argument to `librosa.filters.mel`."""
  mel_basis = librosa.filters.mel(*args, **kwargs)
  return mel_basis

def create_inverse_mel_filterbank(*args, **kwargs):
  """Return the Moore-Penrose pseudo-inverse of the mel filterbank.

  All arguments are passed unchanged to `create_mel_filterbank`.
  """
  return np.linalg.pinv(create_mel_filterbank(*args, **kwargs))

class SpectralUtil(object):
  """Helpers for moving between mel spectrograms, magnitude spectrograms and audio.

  The constructor builds a mel filterbank and its pseudo-inverse (kept both as
  NumPy arrays and as TensorFlow constants) and an `lws` processor that is used
  to turn magnitude spectrograms back into waveforms.

  The TensorFlow-based members use the module-level name `tf`, so
  `import tensorflow as tf` must have been executed before instantiation.
  """
  NFFT = 1024    # FFT size used for the mel filterbank and the lws processor
  NHOP = 256     # second positional argument of lws.lws (presumably the hop size)
  FMIN = 125.    # fmin of the mel filterbank
  FMAX = 7600.   # fmax of the mel filterbank
  NMELS = 80     # number of mel bands (n_mels)
  fs = 22050     # sampling rate passed to the filterbank as sr

  def __init__(self):
    """Create the filterbank matrices and the lws processor."""
    # sr / n_fft are passed by keyword: that works with old librosa releases
    # and is required by newer ones, where they are keyword-only.
    filterbank_kwargs = dict(
            sr=self.fs, n_fft=self.NFFT,
            fmin=self.FMIN, fmax=self.FMAX, n_mels=self.NMELS)
    meltrans = create_mel_filterbank(**filterbank_kwargs)
    invmeltrans = create_inverse_mel_filterbank(**filterbank_kwargs)

    self.meltrans = tf.constant(meltrans, dtype = 'float32')
    self.invmeltrans = tf.constant(invmeltrans, dtype = 'float32')
    self.invmeltrans_np = invmeltrans
    self.meltrans_np = meltrans
    self.lws_processor = lws.lws(self.NFFT, self.NHOP, mode='speech', perfectrec=False)

  def mag_to_mel_linear_spec(self, mag_spec):
    """Project a magnitude spectrogram tensor onto the mel filterbank.

    Args:
      mag_spec: 4-D TensorFlow tensor; only index 0 of its last axis is used.

    Returns:
      The (linear-scale) mel spectrogram with a trailing axis of size 1.
    """
    linear_mel =  tf.expand_dims(
      tf.tensordot(mag_spec[:,:,:,0], tf.transpose(self.meltrans), axes = 1 ), -1)
    return linear_mel

  def mel_linear_to_mag_spec(self, mel_spec, transform = 'inverse'):
    """Map a mel spectrogram tensor back to a magnitude spectrogram.

    Args:
      mel_spec: 4-D TensorFlow tensor; only index 0 of its last axis is used.
      transform: 'inverse' applies the pseudo-inverse of the mel filterbank;
        'transposed' applies the transpose of the forward projection, which is
        the filterbank matrix itself.

    Returns:
      The magnitude spectrogram estimate with a trailing axis of size 1.

    Raises:
      NotImplementedError: if `transform` is neither of the two modes.
    """
    if transform == 'inverse':
      transform_mat = tf.transpose(self.invmeltrans)
    elif transform == 'transposed':
      # Fixed: the bare name `meltrans` was undefined here and raised NameError.
      transform_mat = self.meltrans
    else:
      raise NotImplementedError()
    mag_spec =  tf.expand_dims(
      tf.tensordot(mel_spec[:,:,:,0], transform_mat, axes = 1 ), -1)
    return mag_spec

  def audio_from_mag_spec(self, mag_spec):
    """Reconstruct a waveform from a magnitude spectrogram using lws.

    Args:
      mag_spec: 3-D NumPy array; only index 0 of its last axis is used.

    Returns:
      float32 NumPy array holding the waveform, with two trailing axes of
      size 1.
    """
    # The array is cast to float64 before being handed to lws.
    mag_spec = mag_spec.astype('float64')
    spec_lws = self.lws_processor.run_lws(mag_spec[:,:,0])
    magspec_inv = self.lws_processor.istft(spec_lws)[:, np.newaxis, np.newaxis]
    magspec_inv = magspec_inv.astype('float32')
    return magspec_inv
  
  def tacotron_mel_to_mag(self, tacotron_mel):
    """Convert a Tacotron mel output to a magnitude spectrogram (NumPy only).

    Args:
      tacotron_mel: NumPy array of normalized mel values whose last axis
        matches the rows of `self.invmeltrans_np.T`. Values are mapped from
        [0, 4] to [0, 1]; `np.interp` clips anything outside that range.

    Returns:
      NumPy array: the de-normalized mel amplitudes multiplied by the
      transposed pseudo-inverse filterbank.
    """
    norm_min_level_db = -100
    norm_ref_level_db = 20

    X_mel_dbnorm = np.interp(tacotron_mel, (0, 4), (0, 1))
    # [0, 1] -> [norm_min_level_db, 0] dB
    X_mel_db = (X_mel_dbnorm * -norm_min_level_db) + norm_min_level_db
    # Add the reference level back, then convert dB to linear amplitude.
    X_mel = np.power(10, (X_mel_db + norm_ref_level_db) / 20)
    X_mag = np.dot(X_mel, self.invmeltrans_np.T)

    return X_mag

In [0]:
import IPython
from IPython.display import Audio
import tensorflow as tf
os.chdir(expanduser("~"))
vocoder_ckpt_dir = join(expanduser("~"), "srezvocodercpts")
if not os.path.isdir(vocoder_ckpt_dir):
  os.makedirs(vocoder_ckpt_dir)
  os.chdir(vocoder_ckpt_dir)
  ! wget deepyeti.ucsd.edu/paarth/srezvocoderckpts/best_gen_loss_l1-50001.index;
  ! wget deepyeti.ucsd.edu/paarth/srezvocoderckpts/best_gen_loss_l1-50001.meta;
  ! wget deepyeti.ucsd.edu/paarth/srezvocoderckpts/best_gen_loss_l1-50001.data-00000-of-00001

os.chdir(expanduser("~"))
gen_graph = tf.Graph()
with gen_graph.as_default():
  gan_saver = tf.train.import_meta_graph(join(expanduser("~"), "srezvocodercpts/best_gen_loss_l1-50001.meta"))
gen_sess = tf.Session(graph=gen_graph)
print("Restoring")
gan_saver.restore(gen_sess, join(expanduser("~"), "srezvocodercpts/best_gen_loss_l1-50001"))
gen_mag_spec = gen_graph.get_tensor_by_name('generator/decoder_1/strided_slice_1:0')
x_mag_input = gen_graph.get_tensor_by_name('ExpandDims_1:0')

su = SpectralUtil()

with open("Tacotron-2/tacotron_output/eval/map.txt") as f:
  maps = f.readlines()
maps = list(map(lambda x:x[:-1].split("|"), maps))
# filter out invalid ones
maps = list(filter(lambda x:len(x) == 2, maps))

print("List of texts to be synthesized")
for idx, (text,_) in enumerate(maps):
  print(idx, text)

for idx, (text, mel) in enumerate(maps):
  mel_path = join("Tacotron-2", mel)
  c = np.load(mel_path)
  
  X_mag = su.tacotron_mel_to_mag(c)
  x_mag_target_length = int(X_mag.shape[0] / 64 ) * 64 + 64
  X_mag = np.pad(X_mag, ([0,x_mag_target_length - X_mag.shape[0]], [0,0]), 'constant')
  num_examples = int(x_mag_target_length/64)
  X_mag = np.reshape(X_mag, [num_examples, 64, 513, 1])
  
  gen_mags = []
  heuristic_mags = []
  for n in range(num_examples):
    _gen, _heur = gen_sess.run([gen_mag_spec, x_mag_input], feed_dict = {
        x_mag_input : X_mag[n:n+1]
        })
    gen_mags.append(_gen[0])
    heuristic_mags.append(_heur[0])
  
  gen_mag = np.concatenate(gen_mags, axis = 0)
  heur_mag = np.concatenate(heuristic_mags, axis = 0)
  
  _gen_audio = su.audio_from_mag_spec(gen_mag)
  _heur_audio = su.audio_from_mag_spec(heur_mag)
  print(idx, text)
  print("generated melsrez")
  IPython.display.display(Audio(_gen_audio[:,0,0], rate=22050))
  print("heuristic inversion")
  IPython.display.display(Audio(_heur_audio[:,0,0], rate=22050))

## Waveform synthesis by WaveNet

In [0]:
import librosa.display
import IPython
from IPython.display import Audio
import numpy as np
import torch

In [0]:
# Work inside the wavenet_vocoder checkout: wn_preset and wn_checkpoint_path
# are relative file names, and the hparams / train / synthesis modules imported
# below presumably resolve from this directory.
os.chdir(join(expanduser("~"), wavenet_dir))

# Setup WaveNet vocoder hparams
from hparams import hparams
with open(wn_preset) as f:
    hparams.parse_json(f.read())

# Setup WaveNet vocoder
from train import build_model
from synthesis import wavegen
import torch

# Use the GPU when one is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

model = build_model().to(device)

print("Load checkpoint from {}".format(wn_checkpoint_path))
# map_location places the stored tensors on the selected device, so a
# checkpoint holding CUDA tensors can also be loaded on a CPU-only machine.
checkpoint = torch.load(wn_checkpoint_path, map_location=device)
model.load_state_dict(checkpoint["state_dict"])

In [0]:
from glob import glob
from tqdm import tqdm

# Each usable line of map.txt is "<text>|<mel path>". Only the line terminator
# is removed, so a final line without a trailing newline keeps its last
# character (slicing with [:-1] would cut into the mel path).
with open("../Tacotron-2/tacotron_output/eval/map.txt") as f:
  maps = [line.rstrip("\n").split("|") for line in f]
# filter out invalid ones
maps = [entry for entry in maps if len(entry) == 2]

print("List of texts to be synthesized")
for idx, (text, _) in enumerate(maps):
  print(idx, text)

### Waveform generation

**Note**: This will take hours to finish, depending on the number and length of the texts. Try short sentences first if you would like to see samples quickly.

In [0]:
# Generated waveforms, in the same order as `maps`.
waveforms = []

for idx, (text, mel) in enumerate(maps):
  print("\n", idx, text)
  mel_path = join("../Tacotron-2", mel)
  c = np.load(mel_path)
  # Make axis 1 the mel axis. np.swapaxes returns a new view and does not
  # modify its argument, so the result must be assigned back (it used to be
  # discarded, which left `c` unchanged).
  if c.shape[1] != hparams.num_mels:
    c = np.swapaxes(c, 0, 1)
  # Range [0, 4] was used for training Tacotron2 but WaveNet vocoder assumes [0, 1]
  c = np.interp(c, (0, 4), (0, 1))

  # Generate
  waveform = wavegen(model, c=c, fast=True, tqdm=tqdm)

  waveforms.append(waveform)

  # Audio
  IPython.display.display(Audio(waveform, rate=hparams.sample_rate))

## Summary: audio samples

In [0]:
# Replay each sentence together with the waveform generated for it.
for idx, item in enumerate(maps):
  text, mel = item
  print(idx, text)
  clip = Audio(waveforms[idx], rate=hparams.sample_rate)
  IPython.display.display(clip)

For more information, please visit https://github.com/r9y9/wavenet_vocoder. More samples can be found at https://r9y9.github.io/wavenet_vocoder/.