# Tacotron2: WaveNet-basd text-to-speech demo

- Tacotron2 (mel-spectrogram prediction part): https://github.com/Rayhane-mamah/Tacotron-2
- WaveNet: https://github.com/r9y9/wavenet_vocoder

This is a proof of concept for Tacotron2 text-to-speech synthesis. Models used here were trained on [LJSpeech dataset](https://keithito.com/LJ-Speech-Dataset/).

**Notice**: The waveform generation is super slow since it implements naive autoregressive generation. It doesn't use parallel generation method described in [Parallel WaveNet](https://arxiv.org/abs/1711.10433). 

**Estimated time to complete**: 2 ~ 3 hours.

In [0]:
# !pip install -U -q PyDrive
# import os
# from pydrive.auth import GoogleAuth
# from pydrive.drive import GoogleDrive
# from google.colab import auth
# from oauth2client.client import GoogleCredentials

# # 1. Authenticate and create the PyDrive client.
# auth.authenticate_user()
# gauth = GoogleAuth()
# gauth.credentials = GoogleCredentials.get_application_default()
# drive = GoogleDrive(gauth)

# # choose a local (colab) directory to store the data.
# local_download_path = os.path.expanduser('/content/sounds')
# try:
#   os.makedirs(local_download_path)
# except: pass

# # obtain the training data from google drive
# download = drive.CreateFile({'id': '1lKb0ORrlnTkFG9PXRvU7LYW2svy6K9w_'})
# name = os.path.join(local_download_path, 'split-mp3.zip')
# download.GetContentFile(name)


[?25l[K    1% |▎                               | 10kB 21.0MB/s eta 0:00:01[K    2% |▋                               | 20kB 2.3MB/s eta 0:00:01[K    3% |█                               | 30kB 3.3MB/s eta 0:00:01[K    4% |█▎                              | 40kB 2.1MB/s eta 0:00:01[K    5% |█▋                              | 51kB 2.6MB/s eta 0:00:01[K    6% |██                              | 61kB 3.1MB/s eta 0:00:01[K    7% |██▎                             | 71kB 3.6MB/s eta 0:00:01[K    8% |██▋                             | 81kB 4.1MB/s eta 0:00:01[K    9% |███                             | 92kB 4.5MB/s eta 0:00:01[K    10% |███▎                            | 102kB 3.5MB/s eta 0:00:01[K    11% |███▋                            | 112kB 3.5MB/s eta 0:00:01[K    12% |████                            | 122kB 5.0MB/s eta 0:00:01[K    13% |████▎                           | 133kB 5.0MB/s eta 0:00:01[K    14% |████▋                           | 143kB 9.2MB/s eta 0:00:01[

In [0]:
# !unzip -q ./sounds/split-mp3.zip

## Setup

### Install dependencies

In [0]:
import os
from os.path import exists, join, expanduser

os.chdir(expanduser("~"))

wavenet_dir = "DTT_wavenet_pytorch"
if not exists(wavenet_dir):
  ! git clone https://github.com/popo0293/$wavenet_dir
    
taco2_dir = "Tacotron-2"
if not exists(taco2_dir):
  ! git clone https://github.com/r9y9/$taco2_dir
  ! cd $taco2_dir && git checkout -B wavenet3 origin/wavenet3

In [0]:
# Install dependencies
! pip install -q --upgrade "tensorflow<=1.9.0"

os.chdir(join(expanduser("~"), taco2_dir))
! pip install -q -r requirements.txt

os.chdir(join(expanduser("~"), wavenet_dir))
! pip install -q -e '.[train]'

In [0]:
!pip install -q cadl

[?25l[K    6% |██▏                             | 10kB 17.9MB/s eta 0:00:01[K    13% |████▍                           | 20kB 2.3MB/s eta 0:00:01[K    20% |██████▋                         | 30kB 3.3MB/s eta 0:00:01[K    27% |████████▉                       | 40kB 2.1MB/s eta 0:00:01[K    34% |███████████                     | 51kB 2.6MB/s eta 0:00:01[K    41% |█████████████▎                  | 61kB 3.1MB/s eta 0:00:01[K    48% |███████████████▌                | 71kB 3.6MB/s eta 0:00:01[K    55% |█████████████████▋              | 81kB 4.1MB/s eta 0:00:01[K    62% |███████████████████▉            | 92kB 4.5MB/s eta 0:00:01[K    69% |██████████████████████          | 102kB 3.5MB/s eta 0:00:01[K    75% |████████████████████████▎       | 112kB 3.6MB/s eta 0:00:01[K    82% |██████████████████████████▌     | 122kB 5.0MB/s eta 0:00:01[K    89% |████████████████████████████▊   | 133kB 5.0MB/s eta 0:00:01[K    96% |███████████████████████████████ | 143kB 9.2MB/s eta 0:

In [0]:
import os
import sys
import subprocess
from glob import glob
import numpy as np
import tensorflow as tf
from cadl import wavenet, vctk
from cadl import wavenet_utils as wnu
from cadl.utils import sample_categorical
from scipy.io import wavfile

In [0]:
def get_dataset(saveto='/content/split-mp3', convert_mp3_to_16khzwav=False):
    """Convert MP3 files in 'saveto' directory to wav files.
    subfolders under the 'saveto' directory are considered chapters
    Each file name should be formatted CHAPTERNAME-UTTERANCE-DESCRIPTION.mp3
    ffmpeg must be installed to convert the files.
    Parameters
    ----------
    saveto : str, optional
        Directory to save the resulting dataset ['sounds']
    convert_to_16khz : bool, optional
        Description
    Returns
    -------
        dataset
    """
    if not os.path.exists(saveto):
        sys.exit("Error: '" + saveto + "' folder does not exist")

    wavs = glob('{}/**/*.wav'.format(saveto), recursive=True)
    if not wavs and convert_mp3_to_16khzwav:
        wavs = glob('{}/**/*.mp3'.format(saveto), recursive=True)
        for wav_i in wavs:
            subprocess.check_call(
                ['ffmpeg', '-i', wav_i, '-f', 'wav', '-ac', '1', '-ar', '16000', '-y', '%s.16khz.wav' % wav_i])

    wavs = glob('{}/**/*.wav'.format(saveto), recursive=True)

    if not wavs:
        sys.exit("Error: No 16khz wav files were found in '" + saveto + "'")        

    dataset = []
    for wav_i in wavs:
        chapter_i, utter_i = wav_i.split('/')[-2:]
        dataset.append({
            'name': wav_i,
            'chapter': chapter_i,
            'utterance': utter_i.split('-')[-2].strip('.wav')})
    return dataset

In [0]:
# get_dataset(convert_mp3_to_16khzwav=True)

In [0]:
# !find /content/split-mp3/ -name '*.mp3' -delete

In [0]:
# os.chdir(join('/content/split-mp3/'))
# !ls -1 > ../allfiles.txt

In [0]:
os.chdir(join('/content'))
!mkdir checkpoint

mkdir: cannot create directory ‘checkpoint’: File exists


In [0]:
os.chdir("/content/")
if not exists("preprocessed"):
  ! curl -O -L "https://www.dropbox.com/s/u2y4s00c40lhrpk/preprocessed.zip?dl=0"
#   ! unzip -q preprocessed.zip

In [0]:
# !python preprocess.py dtt /content /content/preprocessed --preset ./presets/dtt_speech.json

Sampling frequency: 16000
100% 1147/1147 [00:28<00:00, 36.96it/s]
Wrote 1147 utterances, 46877696 time steps (0.81 hours)
Max input length:  20
Max output length: 204800


In [0]:
# !mv /dtt_speech.json ./presets/

In [0]:
os.chdir(join(expanduser("~"), wavenet_dir))
!python train.py --data-root /content/preprocessed --checkpoint /content/checkpoint/checkpoint_step000001441.pth --checkpoint-dir /content/checkpoint --preset ./presets/dtt_speech.json

Using TensorFlow backend.
Command line args:
 {'--checkpoint': '/content/checkpoint/checkpoint_step000001441.pth',
 '--checkpoint-dir': '/content/checkpoint',
 '--data-root': '/content/preprocessed',
 '--help': False,
 '--hparams': '',
 '--log-event-path': None,
 '--preset': './presets/dtt_speech.json',
 '--reset-optimizer': False,
 '--restore-parts': None,
 '--speaker-id': None}
Hyperparameters:
  adam_beta1: 0.9
  adam_beta2: 0.999
  adam_eps: 1e-08
  allow_clipping_in_normalization: True
  amsgrad: False
  batch_size: 2
  builder: wavenet
  checkpoint_interval: 10000
  cin_channels: 80
  clip_thresh: -1
  dropout: 0.050000000000000044
  ema_decay: 0.9999
  exponential_moving_average: True
  fft_size: 1024
  fmax: 7600
  fmin: 125
  frame_shift_ms: None
  freq_axis_kernel_size: 3
  gate_channels: 512
  gin_channels: -1
  hop_size: 256
  initial_learning_rate: 0.001
  input_type: raw
  kernel_size: 3
  layers: 24
  legacy: True
  log_scale_min: -32.23619130191664
  lr_schedule: noam_l

In [0]:
# !zip /content/DTTpytorch.zip -r /content/checkpoint 

  adding: content/checkpoint/ (stored 0%)
  adding: content/checkpoint/checkpoint_step000000037.pth (deflated 7%)
  adding: content/checkpoint/test_eval/ (stored 0%)
  adding: content/checkpoint/test_eval/step000000638_predicted.wav (deflated 10%)
  adding: content/checkpoint/test_eval/step000000638_target.wav (deflated 38%)
  adding: content/checkpoint/test_eval/step000000638_waveplots.png (deflated 3%)
  adding: content/checkpoint/checkpoint_step000000029.pth (deflated 7%)
  adding: content/checkpoint/checkpoint_step000001296.pth (deflated 7%)
  adding: content/checkpoint/checkpoint_step000000090.pth (deflated 7%)


In [0]:
# !zip /content/preprocessed.zip -r /content/preprocessed

  adding: content/preprocessed/ (stored 0%)
  adding: content/preprocessed/dtt-audio-00750.npy (deflated 43%)
  adding: content/preprocessed/dtt-mel-00782.npy (deflated 13%)
  adding: content/preprocessed/dtt-mel-00534.npy (deflated 13%)
  adding: content/preprocessed/dtt-audio-00388.npy (deflated 43%)
  adding: content/preprocessed/dtt-audio-00744.npy (deflated 40%)
  adding: content/preprocessed/dtt-audio-00657.npy (deflated 40%)
  adding: content/preprocessed/dtt-mel-00173.npy (deflated 12%)
  adding: content/preprocessed/dtt-audio-00614.npy (deflated 39%)
  adding: content/preprocessed/dtt-mel-00479.npy (deflated 15%)
  adding: content/preprocessed/dtt-mel-00163.npy (deflated 13%)
  adding: content/preprocessed/dtt-mel-00004.npy (deflated 14%)
  adding: content/preprocessed/dtt-audio-00240.npy (deflated 42%)
  adding: content/preprocessed/dtt-mel-00043.npy (deflated 16%)
  adding: content/preprocessed/dtt-audio-00784.npy (deflated 39%)
  adding: content/preprocessed/dtt-audio-00971

In [0]:
import torch
import tensorflow
tensorflow.__version__

'1.9.0'

### Download pretrained models

#### Tacotron2 (mel-spectrogram prediction part)

In [0]:
os.chdir(join(expanduser("~"), taco2_dir))
! mkdir -p logs-Tacotron
if not exists("logs-Tacotron/pretrained"):
  ! curl -O -L "https://www.dropbox.com/s/vx7y4qqs732sqgg/pretrained.tar.gz"
  ! tar xzvf pretrained.tar.gz
  ! mv pretrained logs-Tacotron

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  1034    0  1034    0     0    844      0 --:--:--  0:00:01 --:--:--   844
100  288M  100  288M    0     0  34.8M      0  0:00:08  0:00:08 --:--:-- 43.2M
pretrained/
pretrained/checkpoint
pretrained/model.ckpt-189500.meta
pretrained/model.ckpt-189500.data-00000-of-00001
pretrained/model.ckpt-189500.index


#### WaveNet

In [0]:
# os.chdir('/content/checkpoint')
wn_preset = "/root/DTT_wavenet_pytorch/presets/dtt_speech.json"
wn_checkpoint_path = "/content/checkpoint/checkpoint_step000001441.pth"

# if not exists(wn_preset):
#   !curl -O -L "https://www.dropbox.com/s/0vsd7973w20eskz/20180510_mixture_lj_checkpoint_step000320000_ema.json"
# if not exists(wn_checkpoint_path):
#   !curl -O -L "https://www.dropbox.com/s/zdbfprugbagfp2w/20180510_mixture_lj_checkpoint_step000320000_ema.pth"

## Input texts to be synthesized

Choose your favorite sentences :)

In [0]:
os.chdir(join(expanduser("~"), taco2_dir))

In [0]:
%%bash
cat << EOS > text_list.txt
This is really awesome!
Thanks for your patience.
EOS

cat text_list.txt

This is really awesome!
Thanks for your patience.


## Mel-spectrogram prediction by Tacoron2

In [0]:
# Remove old files if exist
! rm -rf tacotron_output
! python synthesize.py --model='Tacotron' --mode='eval' \
  --hparams='symmetric_mels=False,max_abs_value=4.0,power=1.1,outputs_per_step=1' \
  --text_list=./text_list.txt

loaded model at logs-Tacotron/pretrained/model.ckpt-189500
Hyperparameters:
  allow_clipping_in_normalization: True
  attention_dim: 128
  attention_filters: 32
  attention_kernel: (31,)
  cleaners: english_cleaners
  cumulative_weights: True
  decoder_layers: 2
  decoder_lstm_units: 1024
  embedding_dim: 512
  enc_conv_channels: 512
  enc_conv_kernel_size: (5,)
  enc_conv_num_layers: 3
  encoder_lstm_units: 256
  fft_size: 1024
  fmax: 7600
  fmin: 125
  frame_shift_ms: None
  griffin_lim_iters: 60
  hop_size: 256
  impute_finished: False
  input_type: raw
  log_scale_min: -32.23619130191664
  mask_encoder: False
  mask_finished: False
  max_abs_value: 4.0
  max_iters: 2500
  min_level_db: -100
  num_freq: 513
  num_mels: 80
  outputs_per_step: 1
  postnet_channels: 512
  postnet_kernel_size: (5,)
  postnet_num_layers: 5
  power: 1.1
  predict_linear: False
  prenet_layers: [256, 256]
  quantize_channels: 65536
  ref_level_db: 20
  rescale: True
  rescaling_max: 0.999
  sample_rate: 2

## Waveform synthesis by WaveNet

In [0]:
import librosa.display
import IPython
from IPython.display import Audio
import numpy as np
import torch

In [0]:
os.chdir(join(expanduser("~"), wavenet_dir))

# Setup WaveNet vocoder hparams
from hparams import hparams
with open(wn_preset) as f:
    hparams.parse_json(f.read())

# Setup WaveNet vocoder
from train import build_model
from synthesis import wavegen
import torch

use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

model = build_model().to(device)

print("Load checkpoint from {}".format(wn_checkpoint_path))
checkpoint = torch.load(wn_checkpoint_path)
model.load_state_dict(checkpoint["state_dict"])

Load checkpoint from /content/checkpoint/checkpoint_step000001441.pth


In [0]:
from glob import glob
from tqdm import tqdm

with open("../Tacotron-2/tacotron_output/eval/map.txt") as f:
  maps = f.readlines()
maps = list(map(lambda x:x[:-1].split("|"), maps))
# filter out invalid ones
maps = list(filter(lambda x:len(x) == 2, maps))

print("List of texts to be synthesized")
for idx, (text,_) in enumerate(maps):
  print(idx, text)

List of texts to be synthesized
0 This is really awesome!
1 Thanks for your patience.


### Waveform generation

**Note**: This will takes hours to finish depending on the number and lenght of texts. Try short sentences first if you would like to see samples quickly.

In [0]:
waveforms = []

for idx, (text, mel) in enumerate(maps):
  print("\n", idx, text)
  mel_path = join("../Tacotron-2", mel)
  c = np.load(mel_path)
  if c.shape[1] != hparams.num_mels:
    np.swapaxes(c, 0, 1)
  # Range [0, 4] was used for training Tacotron2 but WaveNet vocoder assumes [0, 1]
  c = np.interp(c, (0, 4), (0, 1))
 
  # Generate
  waveform = wavegen(model, c=c, fast=True, tqdm=tqdm)
  
  waveforms.append(waveform)

  # Audio
  IPython.display.display(Audio(waveform, rate=hparams.sample_rate))

  0%|          | 6/28928 [00:00<08:52, 54.26it/s]


 0 This is really awesome!


100%|██████████| 28928/28928 [08:28<00:00, 56.85it/s]


  0%|          | 6/28928 [00:00<09:10, 52.52it/s]


 1 Thanks for your patience.


 38%|███▊      | 10909/28928 [03:13<05:19, 56.39it/s]

KeyboardInterrupt: ignored

## Summary: audio samples

In [0]:
for idx, (text, mel) in enumerate(maps):
  print(idx, text)
  IPython.display.display(Audio(waveforms[idx], rate=hparams.sample_rate))

For more information, please visit https://github.com/r9y9/wavenet_vocoder. More samples can  be  found at https://r9y9.github.io/wavenet_vocoder/. 